Using logic that filter
can interpret,
compare_conditions()
will summarize the data by aggregating the rows that satisfy condition x
and
the rows that satisfy condition y
Arguments
- df
data frame
- x
condition for comparison, same criteria you would use in 'dplyr::filter', used in contrast to the reference group 'y'
- y
condition for comparison, same criteria you would use in 'dplyr::filter'; this is the reference group that 'x' is contrasted against
- .cols
columns to use in comparison
- .fns
named list of the functions to use, e.g. list(avg = mean, sd = sd). 'purrr'-style lambdas are also supported, like list(mean = ~mean(.x, na.rm = TRUE), sd = sd), and dplyr::lst(mean, sd) will create list(mean = mean, sd = sd)
Details
compare_conditions()
passes its arguments to
across
. The .cols
and .fns
arguments work the same way as they do there. For
clarity, it is helpful to use the lst
function for the
.fns
parameter. Using
compare_conditions(..., .cols = my_var, .fns = lst(mean, sd))
will return
the values mean_my_var_x
, mean_my_var_y
, sd_my_var_x
and sd_my_var_y
Examples
# compare_conditions works similarly to dplyr::across()
pixar_films |>
compare_conditions(
x = (rating == "G"),
y = (rating == "PG"),
.cols = rotten_tomatoes
)
#> # A tibble: 1 × 2
#> mean_rotten_tomatoes_x mean_rotten_tomatoes_y
#> <dbl> <dbl>
#> 1 87.4 91
# because data frames are just fancy lists, you can pass the result to headline_list()
pixar_films |>
compare_conditions(
x = (rating == "G"),
y = (rating == "PG"),
.cols = rotten_tomatoes
) |>
headline_list("a difference of {delta} points")
#> a difference of 3.6 points
# you can return multiple objects to compare
# 'view_list()' is a helper to see list objects in a compact way
pixar_films |>
compare_conditions(
x = (rating == "G"),
y = (rating == "PG"),
.cols = c(rotten_tomatoes, metacritic),
.fns = dplyr::lst(mean, sd)
) |>
view_list()
#> value
#> mean_metacritic_x 80.769231
#> mean_metacritic_y 78.444444
#> mean_rotten_tomatoes_x 87.384615
#> mean_rotten_tomatoes_y 91.000000
#> sd_metacritic_x 13.772883
#> sd_metacritic_y 11.325978
#> sd_rotten_tomatoes_x 17.566723
#> sd_rotten_tomatoes_y 8.558621
# you can use any of the `tidyselect` helpers
pixar_films |>
compare_conditions(
x = (rating == "G"),
y = (rating == "PG"),
.cols = dplyr::starts_with("bo_")
)
#> # A tibble: 1 × 4
#> mean_bo_domestic_x mean_bo_domestic_y mean_bo_intl_x mean_bo_intl_y
#> <dbl> <dbl> <dbl> <dbl>
#> 1 259. 293. 370. 409.
# if you want to compare x to the overall average, use y = TRUE
pixar_films |>
compare_conditions(
x = (rating == "G"),
y = TRUE,
.cols = rotten_tomatoes
)
#> # A tibble: 1 × 2
#> mean_rotten_tomatoes_x mean_rotten_tomatoes_y
#> <dbl> <dbl>
#> 1 87.4 88.9
# to get the # of observations use length() instead of n()
# note: don't pass the parentheses
pixar_films |>
compare_conditions(
x = (rating == "G"),
y = (rating == "PG"),
.cols = rotten_tomatoes, # can put anything here really
.fns = list(n = length)
)
#> # A tibble: 1 × 2
#> n_rotten_tomatoes_x n_rotten_tomatoes_y
#> <int> <int>
#> 1 13 9
# you can also use purrr-style lambdas
pixar_films |>
compare_conditions(
x = (rating == "G"),
y = (rating == "PG"),
.cols = rotten_tomatoes,
.fns = list(avg = ~ sum(.x) / length(.x))
)
#> # A tibble: 1 × 2
#> avg_rotten_tomatoes_x avg_rotten_tomatoes_y
#> <dbl> <dbl>
#> 1 87.4 91
# you can compare categorical data with functions like dplyr::n_distinct()
pixar_films |>
compare_conditions(
x = (rating == "G"),
y = (rating == "PG"),
.cols = film,
.fns = list(distinct = dplyr::n_distinct)
)
#> # A tibble: 1 × 2
#> distinct_film_x distinct_film_y
#> <int> <int>
#> 1 13 9