feat: supply tidyselection for sum_groups_epi_df (#657)

dajmcdon · brookslogan · web-flow · commit e08a6937a9f3 · 2025-04-09T23:01:54.000-07:00
Co-authored-by: Logan C. Brooks &lt;lcbrooks+github@andrew.cmu.edu&gt;
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: epiprocess
 Title: Tools for basic signal processing in epidemiology
-Version: 0.11.3
+Version: 0.11.4
 Authors@R: c(
     person("Jacob", "Bien", role = "ctb"),
     person("Logan", "Brooks", , "lcbrooks+github@andrew.cmu.edu", role = c("aut", "cre")),
@@ -105,6 +105,7 @@ Collate:
     'methods-epi_archive.R'
     'grouped_epi_archive.R'
     'growth_rate.R'
+    'inline-roxygen.R'
     'key_colnames.R'
     'methods-epi_df.R'
     'outliers.R'
diff --git a/NEWS.md b/NEWS.md
@@ -58,6 +58,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat
 - Various functions are now faster, using faster variants of core operations and
   avoiding reconstructing grouped `epi_df`s when unnecessary.
 - Add `autoplot.epi_archive()` to display revision patterns.
+- `sum_groups_epi_df()` now supports tidyselect syntax in it's second argument (#655).
 
 ## Bug fixes
 
diff --git a/R/inline-roxygen.R b/R/inline-roxygen.R
@@ -0,0 +1,16 @@
+# Helpers here are meant to be used inside inline R expressions within roxygen2
+# documentation when @template is inappropriate.
+
+#' Description of a single arg that tidyselects value variables
+#'
+#' Not meant for when describing tidyselect `...`.
+#'
+#' @keywords internal
+tidyselect_arg_roxygen <- '
+  <[`tidy-select`][dplyr_tidy_select]> An unquoted column
+  name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`),
+  [other tidy-select expression][tidyselect::language], or a vector of
+  characters (e.g. `c("cases", "deaths")`). Variable names can be used as if
+  they were positions in the data frame, so expressions like `x:y` can be
+  used to select a range of variables.
+'
diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R
@@ -499,34 +499,40 @@ group_epi_df <- function(x, exclude = character()) {
 #' the resulting `epi_df` will have `geo_value` set to `"total"`.
 #'
 #' @param .x an `epi_df`
-#' @param sum_cols character vector of the columns to aggregate
+#' @param sum_cols `r tidyselect_arg_roxygen`
 #' @param group_cols character vector of column names to group by. "time_value" is
-#' included by default.
+#'   included by default.
 #' @return an `epi_df` object
 #'
+#' @examples
+#' # This data has other_keys age_group and edu_qual:
+#' grad_employ_subset
+#'
+#' # Aggregate num_graduates within each geo_value (and time_value):
+#' grad_employ_subset %>%
+#'   sum_groups_epi_df(num_graduates, group_cols = "geo_value")
+#'
 #' @export
-sum_groups_epi_df <- function(.x, sum_cols = "value", group_cols = character()) {
+sum_groups_epi_df <- function(.x, sum_cols, group_cols = "time_value") {
   assert_class(.x, "epi_df")
-  assert_character(sum_cols)
   assert_character(group_cols)
-  checkmate::assert_subset(sum_cols, setdiff(names(.x), key_colnames(.x)))
   checkmate::assert_subset(group_cols, key_colnames(.x))
   if (!"time_value" %in% group_cols) {
     group_cols <- c("time_value", group_cols)
   }
-
-  out <- .x %>%
-    group_by(across(all_of(group_cols))) %>%
-    dplyr::summarize(across(all_of(sum_cols), sum), .groups = "drop")
+  # Attempt tidyselection ourselves to get "Error in `sum_groups_epi_df()`"
+  # rather than "in `dplyr::summarize()`", before forwarding:
+  sum_cols <- rlang::enquo(sum_cols)
+  tidyselect::eval_select(sum_cols, .x)
+  out <- group_by(.x, across(all_of(group_cols))) %>%
+    dplyr::summarize(across(!!sum_cols, sum), .groups = "drop")
 
   # To preserve epi_df-ness, we need to ensure that the `geo_value` column is
   # present.
-  out <- if (!"geo_value" %in% group_cols) {
-    out %>%
+  if (!"geo_value" %in% group_cols) {
+    out <- out %>%
       mutate(geo_value = "total") %>%
-      relocate(geo_value, .before = 1)
-  } else {
-    out
+      relocate(.data$geo_value, .before = 1)
   }
 
   # The `geo_type` will be correctly inherited here by the following logic:
@@ -535,10 +541,10 @@ sum_groups_epi_df <- function(.x, sum_cols = "value", group_cols = character())
   # - if `geo_value` is not in `group_cols`, then the constructor will see
   #   the unrecognizeable "total" value and will correctly infer the "custom"
   #   geo_type.
-  out %>%
-    as_epi_df(
-      as_of = attr(.x, "metadata")$as_of,
-      other_keys = intersect(attr(.x, "metadata")$other_keys, group_cols)
-    ) %>%
+  as_epi_df(
+    out,
+    as_of = attr(.x, "metadata")$as_of,
+    other_keys = intersect(attr(.x, "metadata")$other_keys, group_cols)
+  ) %>%
     arrange_canonical()
 }
diff --git a/R/slide.R b/R/slide.R
@@ -557,12 +557,7 @@ get_before_after_from_window <- function(window_size, align, time_type) {
 #' `vignette("epi_df")` for more examples.
 #'
 #' @template basic-slide-params
-#' @param .col_names <[`tidy-select`][dplyr_tidy_select]> An unquoted column
-#'   name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`),
-#'   [other tidy-select expression][tidyselect::language], or a vector of
-#'   characters (e.g. `c("cases", "deaths")`). Variable names can be used as if
-#'   they were positions in the data frame, so expressions like `x:y` can be
-#'   used to select a range of variables.
+#' @param .col_names `r tidyselect_arg_roxygen`
 #'
 #'   The tidy-selection renaming interface is not supported, and cannot be used
 #'   to provide output column names; if you want to customize the output column
diff --git a/man/sum_groups_epi_df.Rd b/man/sum_groups_epi_df.Rd
diff --git a/man/tidyselect_arg_roxygen.Rd b/man/tidyselect_arg_roxygen.Rd
diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R
@@ -311,20 +311,25 @@ test_that("complete.epi_df works", {
 })
 
 test_that("sum_groups_epi_df works", {
-  out <- toy_epi_df %>% sum_groups_epi_df(sum_cols = "x")
+  out <- toy_epi_df %>% sum_groups_epi_df("x")
   expected_out <- toy_epi_df %>%
     group_by(time_value) %>%
     summarize(x = sum(x)) %>%
     mutate(geo_value = "total") %>%
     as_epi_df(as_of = attr(toy_epi_df, "metadata")$as_of)
   expect_equal(out, expected_out)
+  out <- toy_epi_df %>% sum_groups_epi_df(x)
+  expect_equal(out, expected_out)
 
   out <- toy_epi_df %>%
-    sum_groups_epi_df(sum_cols = c("x", "y"), group_cols = c("time_value", "geo_value", "indic_var1"))
+    sum_groups_epi_df(c(x, y), group_cols = c("time_value", "geo_value", "indic_var1"))
   expected_out <- toy_epi_df %>%
     group_by(time_value, geo_value, indic_var1) %>%
     summarize(x = sum(x), y = sum(y), .groups = "drop") %>%
     as_epi_df(as_of = attr(toy_epi_df, "metadata")$as_of, other_keys = "indic_var1") %>%
     arrange_canonical()
   expect_equal(out, expected_out)
+  out <- toy_epi_df %>%
+    sum_groups_epi_df(x:y, group_cols = c("time_value", "geo_value", "indic_var1"))
+  expect_equal(out, expected_out)
 })