Compare two conditions within a data frame — compare_conditions

Using logic that filter can interpret, compare_conditions() will summarize the data, aggregating it for condition x and for condition y.

Usage

compare_conditions(df, x, y, .cols = everything(), .fns = lst(mean))

Arguments

df: data frame
x: condition for comparison, same criteria you would use in 'dplyr::filter', used in contrast to the reference group 'y'
y: condition for comparison, same criteria you would use in 'dplyr::filter', used in contrast to the reference group 'x'
.cols: columns to use in comparison
.fns: named list of the functions to use, e.g. list(avg = mean, sd = sd). 'purrr'-style lambdas are also supported, e.g. list(mean = ~mean(.x, na.rm = TRUE), sd = sd), and dplyr::lst(mean, sd) will create list(mean = mean, sd = sd).

Value

Returns a data frame that is either 1 row, or if grouped, 1 row per group.

Details

compare_conditions() passes its arguments to across. The .cols and .fns work the same. For clarity, it is helpful to use the lst function for the .fns parameter. Using compare_conditions(..., .cols = my_var, .fns = lst(mean, sd)) will return the values mean_my_var_x, mean_my_var_y, sd_my_var_x and sd_my_var_y.

Examples


# compare_conditions works similarly to dplyr::across()
pixar_films |>
  compare_conditions(
    x = (rating == "G"),
    y = (rating == "PG"),
    .cols = rotten_tomatoes
  )
#> # A tibble: 1 × 2
#>   mean_rotten_tomatoes_x mean_rotten_tomatoes_y
#>                    <dbl>                  <dbl>
#> 1                   87.4                     91


# because data frames are just fancy lists, you can pass the result to headline_list()
pixar_films |>
  compare_conditions(
    x = (rating == "G"),
    y = (rating == "PG"),
    .cols = rotten_tomatoes
  ) |>
 headline_list("a difference of {delta} points")
#> a difference of 3.6 points


 # you can return multiple objects to compare
 # 'view_list()' is a helper to see list objects in a compact way
 pixar_films |>
  compare_conditions(
    x = (rating == "G"),
    y = (rating == "PG"),
    .cols = c(rotten_tomatoes, metacritic),
    .fns = dplyr::lst(mean, sd)
  ) |>
  view_list()
#>                            value
#> mean_metacritic_x      80.769231
#> mean_metacritic_y      78.444444
#> mean_rotten_tomatoes_x 87.384615
#> mean_rotten_tomatoes_y 91.000000
#> sd_metacritic_x        13.772883
#> sd_metacritic_y        11.325978
#> sd_rotten_tomatoes_x   17.566723
#> sd_rotten_tomatoes_y    8.558621


# you can use any of the `tidyselect` helpers
pixar_films |>
  compare_conditions(
    x = (rating == "G"),
    y = (rating == "PG"),
    .cols = dplyr::starts_with("bo_")
  )
#> # A tibble: 1 × 4
#>   mean_bo_domestic_x mean_bo_domestic_y mean_bo_intl_x mean_bo_intl_y
#>                <dbl>              <dbl>          <dbl>          <dbl>
#> 1               259.               293.           370.           409.


# if you want to compare x to the overall average, use y = TRUE
pixar_films |>
  compare_conditions(
    x = (rating == "G"),
    y = TRUE,
    .cols = rotten_tomatoes
  )
#> # A tibble: 1 × 2
#>   mean_rotten_tomatoes_x mean_rotten_tomatoes_y
#>                    <dbl>                  <dbl>
#> 1                   87.4                   88.9


# to get the # of observations use length() instead of n()
# note: don't pass the parentheses
pixar_films |>
  compare_conditions(
    x = (rating == "G"),
    y = (rating == "PG"),
    .cols = rotten_tomatoes, # can put anything here really
    .fns = list(n = length)
  )
#> # A tibble: 1 × 2
#>   n_rotten_tomatoes_x n_rotten_tomatoes_y
#>                 <int>               <int>
#> 1                  13                   9


# you can also use purrr-style lambdas
pixar_films |>
  compare_conditions(
    x = (rating == "G"),
    y = (rating == "PG"),
    .cols = rotten_tomatoes,
    .fns = list(avg = ~ sum(.x) / length(.x))
  )
#> # A tibble: 1 × 2
#>   avg_rotten_tomatoes_x avg_rotten_tomatoes_y
#>                   <dbl>                 <dbl>
#> 1                  87.4                    91

# you can compare categorical data with functions like dplyr::n_distinct()
pixar_films |>
  compare_conditions(
    x = (rating == "G"),
    y = (rating == "PG"),
    .cols = film,
    .fns = list(distinct = dplyr::n_distinct)
  )
#> # A tibble: 1 × 2
#>   distinct_film_x distinct_film_y
#>             <int>           <int>
#> 1              13               9