diff --git a/DESCRIPTION b/DESCRIPTION index b008956d..183097a8 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.11.6 +Version: 0.11.7 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", , "lcbrooks+github@andrew.cmu.edu", role = c("aut", "cre")), diff --git a/NEWS.md b/NEWS.md index 58cd16cc..ba277bb4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,8 +10,20 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicat - `epix_as_of_current()` introduced as an alias for `epix_as_of(.$versions_end)`. - Added `dplyr::filter` implementation for `epi_archive`s. +## Improvements + +- Various documentation has been updated, simplified, and improved with better + examples. + # epiprocess 0.11 +## Highlights + +`{epiprocess}` should once again not require Rtools or a compiler to be able to +install! We've also updated some function interfaces to be more consistent +throughout the package & with tidyverse, and improved the generality of and +fixed bugs in various functions and documentation. + ## Breaking changes - `growth_rate()` argument order and names have changed. You will need to diff --git a/R/archive.R b/R/archive.R index 88f8d704..44058f8b 100644 --- a/R/archive.R +++ b/R/archive.R @@ -322,6 +322,9 @@ new_epi_archive <- function( #' Perform second (costly) round of validation that `x` is a proper `epi_archive` #' +#' Does not validate `geo_type` or `time_type` against `geo_value` and +#' `time_value` columns. These are assumed to have been set to compatibly. +#' #' @return * Of `validate_epi_archive`: an `epi_archive`, #' [invisibly][base::invisible] (or raises an error if `x` was invalid) #' diff --git a/R/autoplot.R b/R/autoplot.R index 15d577ec..2644f3ba 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -1,7 +1,7 @@ #' Automatically plot an epi_df or epi_archive #' #' @param object,x An `epi_df` or `epi_archive` -#' @param ... <[`tidy-select`][dplyr_tidy_select]> One or more unquoted +#' @param ... <[`tidy-select`][dplyr::dplyr_tidy_select]> One or more unquoted #' expressions separated by commas. Variable names can be used as if they #' were positions in the data frame, so expressions like `x:y` can #' be used to select a range of variables. @@ -22,7 +22,7 @@ #' @param .max_facets `r lifecycle::badge("deprecated")` #' @param .facet_filter Select which facets will be displayed. Especially #' useful for when there are many `geo_value`'s or keys. This is a -#' <[`rlang`][args_data_masking]> expression along the lines of [dplyr::filter()]. +#' <[`rlang`][rlang::args_data_masking]> expression along the lines of [dplyr::filter()]. #' However, it must be a single expression combined with the `&` operator. This #' contrasts to the typical use case which allows multiple comma-separated expressions #' which are implicitly combined with `&`. When multiple variables are selected diff --git a/R/epi_df.R b/R/epi_df.R index 4955ab08..b9d999d9 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -6,7 +6,8 @@ #' which can be seen as measured variables at each key. In brief, an `epi_df` #' represents a snapshot of an epidemiological data set at a point in time. #' -#' @details An `epi_df` is a tibble with (at least) the following columns: +#' @details An `epi_df` is a kind of tibble with (at least) the following +#' columns: #' #' - `geo_value`: A character vector representing the geographical unit of #' observation. This could be a country code, a state name, a county code, @@ -14,10 +15,11 @@ #' - `time_value`: A date or integer vector representing the time of observation. #' #' Other columns can be considered as measured variables, which we also refer to -#' as signal variables. An `epi_df` object also has metadata with (at least) -#' the following fields: +#' as indicators or signals. An `epi_df` object also has metadata with (at +#' least) the following fields: #' #' * `geo_type`: the type for the geo values. +#' * `time_type`: the type for the time values. #' * `as_of`: the time value at which the given data were available. #' #' Most users should use `as_epi_df`. The input tibble `x` to the constructor diff --git a/R/grouped_epi_archive.R b/R/grouped_epi_archive.R index 5fb9d081..5a64e3ff 100644 --- a/R/grouped_epi_archive.R +++ b/R/grouped_epi_archive.R @@ -203,13 +203,12 @@ ungroup.grouped_epi_archive <- function(x, ...) { } -#' @rdname epix_slide -#' #' @importFrom data.table key address rbindlist setDF copy #' @importFrom tibble as_tibble new_tibble validate_tibble #' @importFrom dplyr group_by groups #' @importFrom rlang !! !!! enquo quo_is_missing enquos is_quosure sym syms #' env missing_arg +#' #' @export epix_slide.grouped_epi_archive <- function( .x, diff --git a/R/inline-roxygen.R b/R/inline-roxygen.R index ae2ce66c..e6fb4cd1 100644 --- a/R/inline-roxygen.R +++ b/R/inline-roxygen.R @@ -7,7 +7,7 @@ #' #' @keywords internal tidyselect_arg_roxygen <- ' - <[`tidy-select`][dplyr_tidy_select]> An unquoted column + <[`tidy-select`][dplyr::dplyr_tidy_select]> An unquoted column name (e.g., `cases`), multiple column names (e.g., `c(cases, deaths)`), [other tidy-select expression][tidyselect::language], or a vector of characters (e.g. `c("cases", "deaths")`). Variable names can be used as if diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index faf3c128..f3a8d031 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -650,72 +650,89 @@ epix_detailed_restricted_mutate <- function(.data, ...) { } -#' Slide a function over variables in an `epi_archive` or `grouped_epi_archive` +#' Take each requested (group and) version in an archive, run a computation (e.g., forecast) #' -#' Slides a given function over variables in an `epi_archive` object. This -#' behaves similarly to `epi_slide()`, with the key exception that it is -#' version-aware: the sliding computation at any given reference time t is -#' performed on **data that would have been available as of t**. This function -#' is intended for use in accurate backtesting of models; see -#' `vignette("backtesting", package="epipredict")` for a walkthrough. +#' ... and collect the results. This is useful for more accurately simulating +#' how a forecaster, nowcaster, or other algorithm would have behaved in real +#' time, factoring in reporting latency and data revisions; see +#' \href{https://cmu-delphi.github.io/epipredict/articles/backtesting.html}{`vignette("backtesting", +#' package="epipredict")`} for a walkthrough. +#' +#' This is similar to looping over versions and calling [`epix_as_of`], but has +#' some conveniences such as working naturally with [`grouped_epi_archive`]s, +#' optional time windowing, and syntactic sugar to make things shorter to write. #' #' @param .x An [`epi_archive`] or [`grouped_epi_archive`] object. If ungrouped, #' all data in `x` will be treated as part of a single data group. #' @param .f Function, formula, or missing; together with `...` specifies the -#' computation to slide. To "slide" means to apply a computation over a -#' sliding (a.k.a. "rolling") time window for each data group. The window is -#' determined by the `.before` parameter (see details for more). If a -#' function, `.f` must have the form `function(x, g, t, ...)`, where -#' -#' - "x" is an epi_df with the same column names as the archive's `DT`, minus -#' the `version` column -#' - "g" is a one-row tibble containing the values of the grouping variables -#' for the associated group -#' - "t" is the ref_time_value for the current window -#' - "..." are additional arguments +#' computation. The computation will be run on each requested group-version +#' combination, with a time window filter applied if `.before` is supplied. +#' +#' If `.f` is a function must have the form `function(x, g, v)` or +#' `function(x, g, v, )`, where +#' +#' - `x` is an `epi_df` with the same column names as the archive's `DT`, +#' minus the `version` column. (Or, if `.all_versions = TRUE`, an +#' `epi_archive` with the requested partial version history.) +#' +#' - `g` is a one-row tibble containing the values of the grouping variables +#' for the associated group. +#' +#' - `v` (length-1) is the associated `version` (one of the requested +#' `.versions`) +#' +#' - `` are optional; you can add such +#' arguments to your function and set them by passing them through the +#' `...` argument to `epix_slide()`. #' #' If a formula, `.f` can operate directly on columns accessed via `.x$var` or #' `.$var`, as in `~ mean (.x$var)` to compute a mean of a column `var` for #' each group-`ref_time_value` combination. The group key can be accessed via -#' `.y` or `.group_key`, and the reference time value can be accessed via `.z` -#' or `.ref_time_value`. If `.f` is missing, then `...` will specify the -#' computation. +#' `.y` or `.group_key`, and the reference time value can be accessed via +#' `.z`, `.version`, or `.ref_time_value`. If `.f` is missing, then `...` will +#' specify the computation. #' @param ... Additional arguments to pass to the function or formula specified #' via `f`. Alternatively, if `.f` is missing, then the `...` is interpreted #' as a ["data-masking"][rlang::args_data_masking] expression or expressions #' for tidy evaluation; in addition to referring columns directly by name, the #' expressions have access to `.data` and `.env` pronouns as in `dplyr` verbs, #' and can also refer to `.x` (not the same as the input epi_archive), -#' `.group_key`, and `.ref_time_value`. See details for more. -#' @param .before How many time values before the `.ref_time_value` -#' should each snapshot handed to the function `.f` contain? If provided, it -#' should be a single value that is compatible with the time_type of the -#' time_value column (more below), but most commonly an integer. This window -#' endpoint is inclusive. For example, if `.before = 7`, `time_type` -#' in the archive is "day", and the `.ref_time_value` is January 8, then the -#' smallest time_value in the snapshot will be January 1. If missing, then the -#' default is no limit on the time values, so the full snapshot is given. -#' @param .versions Reference time values / versions for sliding -#' computations; each element of this vector serves both as the anchor point -#' for the `time_value` window for the computation and the `max_version` -#' `epix_as_of` which we fetch data in this window. If missing, then this will -#' set to a regularly-spaced sequence of values set to cover the range of -#' `version`s in the `DT` plus the `versions_end`; the spacing of values will -#' be guessed (using the GCD of the skips between values). +#' `.group_key` and `.version`/`.ref_time_value`. See details for more. +#' @param .before Optional; applies a `time_value` filter before running each +#' computation. The default is not to apply a `time_value` filter. If +#' provided, it should be a single integer or difftime that is compatible with +#' the time_type of the time_value column. If an integer, then the minimum +#' possible `time_value` included will be that many time steps (according to +#' the `time_type`) before each requested `.version`. This window endpoint is +#' inclusive. For example, if `.before = 14`, the `time_type` in the archive +#' is "day", and the requested `.version` is January 15, then the smallest +#' possible `time_value` possible in the snapshot will be January 1. Note that +#' this does not mean that there will be 14 or 15 distinct `time_value`s +#' actually appearing in the data; for most reporting streams, reporting as of +#' January 15 won't include `time_value`s all the way through January 14, due +#' to reporting latency. Unlike `epi_slide()`, `epix_slide()` won't fill in +#' any missing `time_values` in this window. +#' @param .versions Requested versions on which to run the computation. Each +#' requested `.version` also serves as the anchor point from which +#' the `time_value` window specified by `.before` is drawn. If `.versions` is +#' missing, it will be set to a regularly-spaced sequence of values set to +#' cover the range of `version`s in the `DT` plus the `versions_end`; the +#' spacing of values will be guessed (using the GCD of the skips between +#' values). #' @param .new_col_name Either `NULL` or a string indicating the name of the new #' column that will contain the derived values. The default, `NULL`, will use #' the name "slide_value" unless your slide computations output data frames, -#' in which case they will be unpacked into the constituent columns and those -#' names used. If the resulting column name(s) overlap with the column names -#' used for labeling the computations, which are `group_vars(x)` and -#' `"version"`, then the values for these columns must be identical to the -#' labels we assign. +#' in which case they will be unpacked into the constituent columns and the +#' data frame's column names will be used instead. If the resulting column +#' name(s) overlap with the column names used for labeling the computations, +#' which are `group_vars(x)` and `"version"`, then the values for these +#' columns must be identical to the labels we assign. #' @param .all_versions (Not the same as `.all_rows` parameter of `epi_slide`.) #' If `.all_versions = TRUE`, then the slide computation will be passed the -#' version history (all `version <= .version` where `.version` is one of the -#' requested `.versions`) for rows having a `time_value` of at least `.version -#' - before`. Otherwise, the slide computation will be passed only the most -#' recent `version` for every unique `time_value`. Default is `FALSE`. +#' version history (all versions `<= .version` where `.version` is one of the +#' requested `.version`s), in `epi_archive` format. Otherwise, the slide +#' computation will be passed only the most recent `version` for every unique +#' `time_value`, in `epi_df` format. Default is `FALSE`. #' @return A tibble whose columns are: the grouping variables (if any), #' `time_value`, containing the reference time values for the slide #' computation, and a column named according to the `.new_col_name` argument, @@ -724,16 +741,17 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' @details A few key distinctions between the current function and `epi_slide()`: #' 1. In `.f` functions for `epix_slide`, one should not assume that the input #' data to contain any rows with `time_value` matching the computation's -#' `.ref_time_value` (accessible via `attributes()$metadata$as_of`); for -#' typical epidemiological surveillance data, observations pertaining to a -#' particular time period (`time_value`) are first reported `as_of` some -#' instant after that time period has ended. +#' `.version`, due to reporting latency; for typical epidemiological +#' surveillance data, observations pertaining to a particular time period +#' (`time_value`) are first reported `as_of` some instant after that time +#' period has ended. No time window completion is performed as in +#' `epi_slide()`. #' 2. The input class and columns are similar but different: `epix_slide` #' (with the default `.all_versions=FALSE`) keeps all columns and the #' `epi_df`-ness of the first argument to each computation; `epi_slide` only #' provides the grouping variables in the second input, and will convert the #' first input into a regular tibble if the grouping variables include the -#' essential `geo_value` column. (With .all_versions=TRUE`, `epix_slide` will +#' essential `geo_value` column. (With `.all_versions=TRUE`, `epix_slide` #' will provide an `epi_archive` rather than an `epi-df` to each #' computation.) #' 3. The output class and columns are similar but different: `epix_slide()` @@ -747,81 +765,60 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' 4. There are no size stability checks or element/row recycling to maintain #' size stability in `epix_slide`, unlike in `epi_slide`. (`epix_slide` is #' roughly analogous to [`dplyr::group_modify`], while `epi_slide` is roughly -#' analogous to `dplyr::mutate` followed by `dplyr::arrange`) This is detailed -#' in the "advanced" vignette. +#' analogous to [`dplyr::mutate`].) #' 5. `.all_rows` is not supported in `epix_slide`; since the slide #' computations are allowed more flexibility in their outputs than in #' `epi_slide`, we can't guess a good representation for missing computations #' for excluded group-`.ref_time_value` pairs. -#' 76. The `.versions` default for `epix_slide` is based on making an +#' 6. The `.versions` default for `epix_slide` is based on making an #' evenly-spaced sequence out of the `version`s in the `DT` plus the -#' `versions_end`, rather than the `time_value`s. +#' `versions_end`, rather than all unique `time_value`s. +#' 7. `epix_slide()` computations can refer to the current element of +#' `.versions` as either `.version` or `.ref_time_value`, while `epi_slide()` +#' computations refer to the current element of `.ref_time_values` with +#' `.ref_time_value`. #' #' Apart from the above distinctions, the interfaces between `epix_slide()` and #' `epi_slide()` are the same. #' -#' Furthermore, the current function can be considerably slower than -#' `epi_slide()`, for two reasons: (1) it must repeatedly fetch -#' properly-versioned snapshots from the data archive (via `epix_as_of()`), -#' and (2) it performs a "manual" sliding of sorts, and does not benefit from -#' the highly efficient `slider` package. For this reason, it should never be -#' used in place of `epi_slide()`, and only used when version-aware sliding is -#' necessary (as it its purpose). -#' #' @examples #' library(dplyr) #' -#' # Reference time points for which we want to compute slide values: -#' versions <- seq(as.Date("2020-06-02"), -#' as.Date("2020-06-15"), -#' by = "1 day" -#' ) +#' # Request only a small set of versions, for example's sake: +#' requested_versions <- +#' seq(as.Date("2020-09-02"), as.Date("2020-09-15"), by = "1 day") #' -#' # A simple (but not very useful) example (see the archive vignette for a more -#' # realistic one): +#' # Investigate reporting lag of `percent_cli` signal (though normally we'd +#' # probably work off of the dedicated `revision_summary()` function instead): #' archive_cases_dv_subset %>% -#' group_by(geo_value) %>% #' epix_slide( -#' .f = ~ mean(.x$case_rate_7d_av), -#' .before = 2, -#' .versions = versions, -#' .new_col_name = "case_rate_7d_av_recent_av" -#' ) %>% -#' ungroup() -#' # We requested time windows that started 2 days before the corresponding time -#' # values. The actual number of `time_value`s in each computation depends on -#' # the reporting latency of the signal and `time_value` range covered by the -#' # archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have -#' # * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically -#' # discarded -#' # * 1 `time_value`, for ref time 2020-06-02 -#' # * 2 `time_value`s, for the rest of the results -#' # * never the 3 `time_value`s we would get from `epi_slide`, since, because -#' # of data latency, we'll never have an observation -#' # `time_value == .ref_time_value` as of `.ref_time_value`. -#' # The example below shows this type of behavior in more detail. -#' -#' # Examining characteristics of the data passed to each computation with -#' # `all_versions=FALSE`. +#' geowide_percent_cli_max_time = max(time_value[!is.na(percent_cli)]), +#' geowide_percent_cli_rpt_lag = .version - geowide_percent_cli_max_time, +#' .versions = requested_versions +#' ) #' archive_cases_dv_subset %>% #' group_by(geo_value) %>% #' epix_slide( -#' function(x, gk, rtv) { -#' tibble( -#' time_range = if (nrow(x) == 0L) { -#' "0 `time_value`s" -#' } else { -#' sprintf("%s -- %s", min(x$time_value), max(x$time_value)) -#' }, -#' n = nrow(x), -#' class1 = class(x)[[1L]] -#' ) -#' }, -#' .before = 5, .all_versions = FALSE, -#' .versions = versions -#' ) %>% -#' ungroup() %>% -#' arrange(geo_value, version) +#' percent_cli_max_time = max(time_value[!is.na(percent_cli)]), +#' percent_cli_rpt_lag = .version - percent_cli_max_time, +#' .versions = requested_versions +#' ) +#' +#' # Backtest a forecaster "pseudoprospectively" (i.e., faithfully with respect +#' # to the data version history): +#' case_death_rate_archive %>% +#' epix_slide( +#' .versions = as.Date(c("2021-10-01", "2021-10-08")), +#' function(x, g, v) { +#' epipredict::arx_forecaster( +#' x, +#' outcome = "death_rate", +#' predictors = c("death_rate_7d_av", "case_rate_7d_av") +#' )$predictions +#' } +#' ) +#' # See `vignette("backtesting", package="epipredict")` for a full walkthrough +#' # on backtesting forecasters, including plots, etc. #' #' # --- Advanced: --- #' @@ -853,7 +850,7 @@ epix_detailed_restricted_mutate <- function(.data, ...) { #' ) #' }, #' .before = 5, .all_versions = TRUE, -#' .versions = versions +#' .versions = requested_versions #' ) %>% #' ungroup() %>% #' # Focus on one geo_value so we can better see the columns above: @@ -873,7 +870,6 @@ epix_slide <- function( } -#' @rdname epix_slide #' @export epix_slide.epi_archive <- function( .x, diff --git a/R/revision_analysis.R b/R/revision_analysis.R index fc057c56..4e12fb77 100644 --- a/R/revision_analysis.R +++ b/R/revision_analysis.R @@ -26,7 +26,7 @@ #' the window afterwards at 150. #' #' @param epi_arch an epi_archive to be analyzed -#' @param ... <[`tidyselect`][dplyr_tidy_select]>, used to choose the column to +#' @param ... <[`tidyselect`][dplyr::dplyr_tidy_select]>, used to choose the column to #' summarize. If empty and there is only one value/measurement column (i.e., #' not in [`key_colnames`]) in the archive, it will automatically select it. #' If supplied, `...` must select exactly one column. diff --git a/R/slide.R b/R/slide.R index 6be54baa..bc993bf4 100644 --- a/R/slide.R +++ b/R/slide.R @@ -1,47 +1,82 @@ -#' Slide a function over variables in an `epi_df` object +#' More general form of [`epi_slide_opt`] for rolling/running computations #' -#' @description Slides a given function over variables in an `epi_df` object. -#' This is useful for computations like rolling averages. The function supports -#' many ways to specify the computation, but by far the most common use case is -#' as follows: +#' Most rolling/running computations can be handled by [`epi_slide_mean`], +#' [`epi_slide_sum`], or the medium-generality [`epi_slide_opt`] functions, +#' which have been specialized to run more quickly. `epi_slide()` is a slower +#' but even more general function for rolling/running computations, and uses a +#' different interface to specify the computations; you typically only need to +#' consider using `epi_slide()` if you have a computation that depends on +#' multiple columns simultaneously, outputs multiple columns simultaneously, or +#' produces non-numeric output. For example, this computation depends on +#' multiple columns: #' #' ``` -#' # Create new column `cases_7dmed` that contains a 7-day trailing median of cases -#' epi_slide(edf, cases_7dmed = median(cases), .window_size = 7) +#' cases_deaths_subset %>% +#' epi_slide( +#' cfr_estimate_v0 = death_rate_7d_av[[22]]/case_rate_7d_av[[1]], +#' .window_size = 22 +#' ) %>% +#' print(n = 30) #' ``` #' -#' For two very common use cases, we provide optimized functions that are much -#' faster than `epi_slide`: `epi_slide_mean()` and `epi_slide_sum()`. We -#' recommend using these functions when possible. +#' (Here, the value 22 was selected using `epi_cor()` and averaging across +#' `geo_value`s. See +#' \href{https://www.medrxiv.org/content/10.1101/2024.12.27.24319518v1}{this +#' manuscript} for some warnings & information using similar types of CFR +#' estimators.) #' #' See `vignette("epi_df")` for more examples. #' #' @template basic-slide-params -#' @param .f Function, formula, or missing; together with `...` specifies the -#' computation to slide. The return of the computation should either be a -#' scalar or a 1-row data frame. Data frame returns will be -#' `tidyr::unpack()`-ed, if named, and will be [`tidyr::pack`]-ed columns, if -#' not named. See examples. +#' @param .f,... The computation to slide. The input will be a time window of +#' the data for a single subpopulation (i.e., a single `geo_value` and single +#' value for any [`other_keys`][as_epi_df] you set up, such as age groups, race, etc.). +#' The input will always have the same size, determined by `.window_size`, and +#' will fill in any missing `time_values`, using `NA` values for missing +#' measurements. The output should be a scalar value or a 1-row data frame; +#' these outputs will be collected into a new column or columns in the +#' `epi_slide()` result. Data frame outputs will be unpacked into multiple +#' columns in the result by default, or [`tidyr::pack`]ed into a single +#' data-frame-type column if you provide a name for such a column (e.g., via +#' `.new_col_name`). +#' +#' You can specify the computation in one of the following ways: +#' +#' - Don't provide `.f`, and instead use one or more +#' [`dplyr::summarize`]-esque ["data-masking"][rlang::args_data_masking] +#' expressions in `...`, e.g., `cfr_estimate_v0 = +#' death_rate_7d_av[[22]]/case_rate_7d_av[[1]]`. This way is sometimes more +#' convenient, but also has the most computational overhead. +#' +#' - Provide a formula in `.f`, e.g., `~ +#' .x$death_rate_7d_av[[22]]/.x$case_rate_7d_av[[1]]`. In this formula, `.x` +#' is an `epi_df` containing data for a single time window as described above, +#' taken from the original `.x` fed into `epi_slide()`. +#' +#' - Provide a function in `.f`, e.g., `function(x, g, t) +#' x$death_rate_7d_av[[22]]/x$case_rate_7d_av[[1]]`. The function should be of +#' the form `function(x, g, t)` or `function(x, g, t, )`, where: #' -#' - If `.f` is missing, then `...` will specify the computation via -#' tidy-evaluation. This is usually the most convenient way to use -#' `epi_slide`. See examples. -#' - If `.f` is a formula, then the formula should use `.x` (not the same as -#' the input `epi_df`) to operate on the columns of the input `epi_df`, e.g. -#' `~mean(.x$var)` to compute a mean of `var`. -#' - If a function, `.f` must have the form `function(x, g, t, ...)`, where: #' - `x` is a data frame with the same column names as the original object, -#' minus any grouping variables, with only the windowed data for one -#' group-`.ref_time_value` combination -#' - `g` is a one-row tibble containing the values of the grouping variables -#' for the associated group +#' minus any grouping variables, with only the windowed data for one +#' group-`.ref_time_value` combination +#' +#' - `g` is a one-row tibble specifying the `geo_value` and value of any +#' `other_keys` for this computation +#' #' - `t` is the `.ref_time_value` for the current window -#' - `...` are additional arguments #' -#' @param ... Additional arguments to pass to the function or formula specified -#' via `.f`. Alternatively, if `.f` is missing, then the `...` is interpreted -#' as a ["data-masking"][rlang::args_data_masking] expression or expressions -#' for tidy evaluation. +#' - If you have a complex `.f` containing ``, you can provide values for those arguments in the `...` +#' argument to `epi_slide()`. +#' +#' The values of `g` and `t` are also available to data-masking expression and +#' formula-based computations as `.group_key` and `.ref_time_value`, +#' respectively. Formula computations also let you use `.y` or `.z`, +#' respectively, as additional names for these same quantities (similar to +#' [`dplyr::group_modify`]). +#' #' @param .new_col_name Name for the new column that will contain the computed #' values. The default is "slide_value" unless your slide computations output #' data frames, in which case they will be unpacked (as in `tidyr::unpack()`) @@ -49,14 +84,34 @@ #' be given names that clash with the existing columns of `.x`. #' #' @details +#' +#' ## Motivation and lower-level alternatives +#' +#' `epi_slide()` is focused on preventing errors and providing a convenient +#' interface. If you need computational speed, many computations can be optimized +#' by one of the following: +#' +#' * Performing core sliding operations with `epi_slide_opt()` with +#' `frollapply`, and using potentially-grouped `mutate()`s to transform or +#' combine the results. +#' +#' * Grouping by `geo_value` and any `other_keys`; [`complete()`]ing with +#' `full_seq()` to fill in time gaps; `arrange()`ing by `time_value`s within +#' each group; using `mutate()` with vectorized operations and shift operators +#' like `dplyr::lead()` and `dplyr::lag()` to perform the core operations, +#' being careful to give the desired results for the least and most recent +#' `time_value`s (often `NA`s for the least recent); ungrouping; and +#' `filter()`ing back down to only rows that existed before the `complete()` +#' stage if necessary. +#' #' ## Advanced uses of `.f` via tidy evaluation #' -#' If specifying `.f` via tidy evaluation, in addition to the standard [`.data`] -#' and [`.env`], we make some additional "pronoun"-like bindings available: +#' If specifying `.f` via tidy evaluation, in addition to the standard [`.data`][rlang::.data] +#' and [`.env`][rlang::.env], we make some additional "pronoun"-like bindings available: #' #' - .x, which is like `.x` in [`dplyr::group_modify`]; an ordinary object #' like an `epi_df` rather than an rlang [pronoun][rlang::as_data_pronoun] -#' like [`.data`]; this allows you to use additional `dplyr`, `tidyr`, and +#' like `.data`; this allows you to use additional `dplyr`, `tidyr`, and #' `epiprocess` operations. If you have multiple expressions in `...`, this #' won't let you refer to the output of the earlier expressions, but `.data` #' will. @@ -72,34 +127,43 @@ #' @examples #' library(dplyr) #' -#' # Get the 7-day trailing standard deviation of cases and the 7-day trailing mean of cases -#' cases_deaths_subset %>% +#' # Generate some simple time-varying CFR estimates: +#' with_cfr_estimates <- cases_deaths_subset %>% #' epi_slide( -#' cases_7sd = sd(cases, na.rm = TRUE), -#' cases_7dav = mean(cases, na.rm = TRUE), -#' .window_size = 7 -#' ) %>% -#' select(geo_value, time_value, cases, cases_7sd, cases_7dav) -#' # Note that epi_slide_mean could be used to more quickly calculate cases_7dav. +#' cfr_estimate_v0 = death_rate_7d_av[[22]] / case_rate_7d_av[[1]], +#' .window_size = 22 +#' ) +#' with_cfr_estimates %>% +#' print(n = 30) +#' # (Here, the value 22 was selected using `epi_cor()` and averaging across +#' # `geo_value`s. See +#' # https://www.medrxiv.org/content/10.1101/2024.12.27.24319518v1 for some +#' # warnings & information using CFR estimators along these lines.) #' -#' # In addition to the [`dplyr::mutate`]-like syntax, you can feed in a function or -#' # formula in a way similar to [`dplyr::group_modify`]: -#' my_summarizer <- function(window_data) { -#' window_data %>% -#' summarize( -#' cases_7sd = sd(cases, na.rm = TRUE), -#' cases_7dav = mean(cases, na.rm = TRUE) -#' ) +#' # In addition to the [`dplyr::mutate`]-like syntax, you can feed in a +#' # function or formula in a way similar to [`dplyr::group_modify`]; these +#' # often run much more quickly: +#' my_computation <- function(window_data) { +#' tibble( +#' cfr_estimate_v0 = window_data$death_rate_7d_av[[nrow(window_data)]] / +#' window_data$case_rate_7d_av[[1]] +#' ) #' } -#' cases_deaths_subset %>% +#' with_cfr_estimates2 <- cases_deaths_subset %>% #' epi_slide( -#' ~ my_summarizer(.x), -#' .window_size = 7 -#' ) %>% -#' select(geo_value, time_value, cases, cases_7sd, cases_7dav) -#' -#' -#' +#' ~ my_computation(.x), +#' .window_size = 22 +#' ) +#' with_cfr_estimates3 <- cases_deaths_subset %>% +#' epi_slide( +#' function(window_data, g, t) { +#' tibble( +#' cfr_estimate_v0 = window_data$death_rate_7d_av[[nrow(window_data)]] / +#' window_data$case_rate_7d_av[[1]] +#' ) +#' }, +#' .window_size = 22 +#' ) #' #' #' #### Advanced: #### @@ -549,12 +613,22 @@ get_before_after_from_window <- function(window_size, align, time_type) { list(before = before, after = after) } -#' Optimized slide functions for common cases +#' Calculate rolling or running means, sums, etc., or custom calculations +#' +#' @description These methods take each subpopulation (i.e., a single +#' `geo_value` and combination of any `other_keys` you set up for age groups, +#' etc.) and perform a `.window_size`-width time window rolling/sliding +#' computation, or alternatively, a running/cumulative computation (with +#' `.window_size = Inf`) on the requested columns. Explicit `NA` measurements +#' are temporarily added to fill in any time gaps, and, for rolling +#' computations, to pad the time series to ensure that the first & last +#' computations use exactly `.window_size` values. #' -#' @description `epi_slide_opt` allows sliding an n-timestep [data.table::froll] -#' or [slider::summary-slide] function over variables in an `epi_df` object. -#' These functions tend to be much faster than `epi_slide()`. See -#' `vignette("epi_df")` for more examples. +#' `epi_slide_opt` allows you to use any [data.table::froll] or +#' [slider::summary-slide] function. If none of those specialized functions fit +#' your usecase, you can use `data.table::frollapply` together with a non-rolling +#' function (e.g., `median`). See [`epi_slide`] if you need to work with +#' multiple columns at once or output a custom type. #' #' @template basic-slide-params #' @param .col_names `r tidyselect_arg_roxygen` diff --git a/man/autoplot-epi.Rd b/man/autoplot-epi.Rd index 6ed9ddaf..c7c8a7c4 100644 --- a/man/autoplot-epi.Rd +++ b/man/autoplot-epi.Rd @@ -34,7 +34,7 @@ \arguments{ \item{object, x}{An \code{epi_df} or \code{epi_archive}} -\item{...}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> One or more unquoted +\item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> One or more unquoted expressions separated by commas. Variable names can be used as if they were positions in the data frame, so expressions like \code{x:y} can be used to select a range of variables.} @@ -60,7 +60,7 @@ locations would share the same color line.} \item{.facet_filter}{Select which facets will be displayed. Especially useful for when there are many \code{geo_value}'s or keys. This is a -<\code{\link[=args_data_masking]{rlang}}> expression along the lines of \code{\link[dplyr:filter]{dplyr::filter()}}. +<\code{\link[rlang:args_data_masking]{rlang}}> expression along the lines of \code{\link[dplyr:filter]{dplyr::filter()}}. However, it must be a single expression combined with the \code{&} operator. This contrasts to the typical use case which allows multiple comma-separated expressions which are implicitly combined with \code{&}. When multiple variables are selected diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index f91834f3..32cba5b5 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -129,6 +129,9 @@ only performs some fast, basic checks on the inputs. \code{validate_epi_archive} can perform more costly validation checks on its output. But most users should use \code{as_epi_archive}, which performs all necessary checks and has some additional features. + +Does not validate \code{geo_type} or \code{time_type} against \code{geo_value} and +\code{time_value} columns. These are assumed to have been set to compatibly. } \details{ An \code{epi_archive} contains a \code{data.table} object \code{DT} (from the diff --git a/man/epi_df.Rd b/man/epi_df.Rd index a6782718..0ec5830c 100644 --- a/man/epi_df.Rd +++ b/man/epi_df.Rd @@ -89,7 +89,8 @@ which can be seen as measured variables at each key. In brief, an \code{epi_df} represents a snapshot of an epidemiological data set at a point in time. } \details{ -An \code{epi_df} is a tibble with (at least) the following columns: +An \code{epi_df} is a kind of tibble with (at least) the following +columns: \itemize{ \item \code{geo_value}: A character vector representing the geographical unit of observation. This could be a country code, a state name, a county code, @@ -98,10 +99,11 @@ etc. } Other columns can be considered as measured variables, which we also refer to -as signal variables. An \code{epi_df} object also has metadata with (at least) -the following fields: +as indicators or signals. An \code{epi_df} object also has metadata with (at +least) the following fields: \itemize{ \item \code{geo_type}: the type for the geo values. +\item \code{time_type}: the type for the time values. \item \code{as_of}: the time value at which the given data were available. } diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index f053909d..92c1fa91 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/slide.R \name{epi_slide} \alias{epi_slide} -\title{Slide a function over variables in an \code{epi_df} object} +\title{More general form of \code{\link{epi_slide_opt}} for rolling/running computations} \usage{ epi_slide( .x, @@ -20,34 +20,46 @@ epi_slide( and any columns in \code{other_keys}. If grouped, we make sure the grouping is by \code{geo_value} and \code{other_keys}.} -\item{.f}{Function, formula, or missing; together with \code{...} specifies the -computation to slide. The return of the computation should either be a -scalar or a 1-row data frame. Data frame returns will be -\code{tidyr::unpack()}-ed, if named, and will be \code{\link[tidyr:pack]{tidyr::pack}}-ed columns, if -not named. See examples. +\item{.f, ...}{The computation to slide. The input will be a time window of +the data for a single subpopulation (i.e., a single \code{geo_value} and single +value for any \code{\link[=as_epi_df]{other_keys}} you set up, such as age groups, race, etc.). +The input will always have the same size, determined by \code{.window_size}, and +will fill in any missing \code{time_values}, using \code{NA} values for missing +measurements. The output should be a scalar value or a 1-row data frame; +these outputs will be collected into a new column or columns in the +\code{epi_slide()} result. Data frame outputs will be unpacked into multiple +columns in the result by default, or \code{\link[tidyr:pack]{tidyr::pack}}ed into a single +data-frame-type column if you provide a name for such a column (e.g., via +\code{.new_col_name}). + +You can specify the computation in one of the following ways: \itemize{ -\item If \code{.f} is missing, then \code{...} will specify the computation via -tidy-evaluation. This is usually the most convenient way to use -\code{epi_slide}. See examples. -\item If \code{.f} is a formula, then the formula should use \code{.x} (not the same as -the input \code{epi_df}) to operate on the columns of the input \code{epi_df}, e.g. -\code{~mean(.x$var)} to compute a mean of \code{var}. -\item If a function, \code{.f} must have the form \verb{function(x, g, t, ...)}, where: +\item Don't provide \code{.f}, and instead use one or more +\code{\link[dplyr:summarise]{dplyr::summarize}}-esque \link[rlang:args_data_masking]{"data-masking"} +expressions in \code{...}, e.g., \code{cfr_estimate_v0 = death_rate_7d_av[[22]]/case_rate_7d_av[[1]]}. This way is sometimes more +convenient, but also has the most computational overhead. +\item Provide a formula in \code{.f}, e.g., \code{~ .x$death_rate_7d_av[[22]]/.x$case_rate_7d_av[[1]]}. In this formula, \code{.x} +is an \code{epi_df} containing data for a single time window as described above, +taken from the original \code{.x} fed into \code{epi_slide()}. +\item Provide a function in \code{.f}, e.g., \code{function(x, g, t) x$death_rate_7d_av[[22]]/x$case_rate_7d_av[[1]]}. The function should be of +the form \verb{function(x, g, t)} or \verb{function(x, g, t, )}, where: \itemize{ \item \code{x} is a data frame with the same column names as the original object, minus any grouping variables, with only the windowed data for one group-\code{.ref_time_value} combination -\item \code{g} is a one-row tibble containing the values of the grouping variables -for the associated group +\item \code{g} is a one-row tibble specifying the \code{geo_value} and value of any +\code{other_keys} for this computation \item \code{t} is the \code{.ref_time_value} for the current window -\item \code{...} are additional arguments +\item If you have a complex \code{.f} containing \verb{}, you can provide values for those arguments in the \code{...} +argument to \code{epi_slide()}. } -}} -\item{...}{Additional arguments to pass to the function or formula specified -via \code{.f}. Alternatively, if \code{.f} is missing, then the \code{...} is interpreted -as a \link[rlang:args_data_masking]{"data-masking"} expression or expressions -for tidy evaluation.} +The values of \code{g} and \code{t} are also available to data-masking expression and +formula-based computations as \code{.group_key} and \code{.ref_time_value}, +respectively. Formula computations also let you use \code{.y} or \code{.z}, +respectively, as additional names for these same quantities (similar to +\code{\link[dplyr:group_map]{dplyr::group_modify}}). +}} \item{.window_size}{The size of the sliding window. The accepted values depend on the type of the \code{time_value} column in \code{.x}: @@ -95,30 +107,60 @@ added. It will be ungrouped if \code{.x} was ungrouped, and have the same groups as \code{.x} if \code{.x} was grouped. } \description{ -Slides a given function over variables in an \code{epi_df} object. -This is useful for computations like rolling averages. The function supports -many ways to specify the computation, but by far the most common use case is -as follows: - -\if{html}{\out{
}}\preformatted{# Create new column `cases_7dmed` that contains a 7-day trailing median of cases -epi_slide(edf, cases_7dmed = median(cases), .window_size = 7) +Most rolling/running computations can be handled by \code{\link{epi_slide_mean}}, +\code{\link{epi_slide_sum}}, or the medium-generality \code{\link{epi_slide_opt}} functions, +which have been specialized to run more quickly. \code{epi_slide()} is a slower +but even more general function for rolling/running computations, and uses a +different interface to specify the computations; you typically only need to +consider using \code{epi_slide()} if you have a computation that depends on +multiple columns simultaneously, outputs multiple columns simultaneously, or +produces non-numeric output. For example, this computation depends on +multiple columns: +} +\details{ +\if{html}{\out{
}}\preformatted{cases_deaths_subset \%>\% + epi_slide( + cfr_estimate_v0 = death_rate_7d_av[[22]]/case_rate_7d_av[[1]], + .window_size = 22 + ) \%>\% + print(n = 30) }\if{html}{\out{
}} -For two very common use cases, we provide optimized functions that are much -faster than \code{epi_slide}: \code{epi_slide_mean()} and \code{epi_slide_sum()}. We -recommend using these functions when possible. +(Here, the value 22 was selected using \code{epi_cor()} and averaging across +\code{geo_value}s. See +\href{https://www.medrxiv.org/content/10.1101/2024.12.27.24319518v1}{this +manuscript} for some warnings & information using similar types of CFR +estimators.) See \code{vignette("epi_df")} for more examples. +\subsection{Motivation and lower-level alternatives}{ + +\code{epi_slide()} is focused on preventing errors and providing a convenient +interface. If you need computational speed, many computations can be optimized +by one of the following: +\itemize{ +\item Performing core sliding operations with \code{epi_slide_opt()} with +\code{frollapply}, and using potentially-grouped \code{mutate()}s to transform or +combine the results. +\item Grouping by \code{geo_value} and any \code{other_keys}; \code{\link[=complete]{complete()}}ing with +\code{full_seq()} to fill in time gaps; \code{arrange()}ing by \code{time_value}s within +each group; using \code{mutate()} with vectorized operations and shift operators +like \code{dplyr::lead()} and \code{dplyr::lag()} to perform the core operations, +being careful to give the desired results for the least and most recent +\code{time_value}s (often \code{NA}s for the least recent); ungrouping; and +\code{filter()}ing back down to only rows that existed before the \code{complete()} +stage if necessary. } -\details{ +} + \subsection{Advanced uses of \code{.f} via tidy evaluation}{ -If specifying \code{.f} via tidy evaluation, in addition to the standard \code{\link{.data}} -and \code{\link{.env}}, we make some additional "pronoun"-like bindings available: +If specifying \code{.f} via tidy evaluation, in addition to the standard \code{\link[rlang:dot-data]{.data}} +and \code{\link[rlang:dot-data]{.env}}, we make some additional "pronoun"-like bindings available: \itemize{ \item .x, which is like \code{.x} in \code{\link[dplyr:group_map]{dplyr::group_modify}}; an ordinary object like an \code{epi_df} rather than an rlang \link[rlang:as_data_mask]{pronoun} -like \code{\link{.data}}; this allows you to use additional \code{dplyr}, \code{tidyr}, and +like \code{.data}; this allows you to use additional \code{dplyr}, \code{tidyr}, and \code{epiprocess} operations. If you have multiple expressions in \code{...}, this won't let you refer to the output of the earlier expressions, but \code{.data} will. @@ -131,34 +173,43 @@ determined the time window for the current computation. \examples{ library(dplyr) -# Get the 7-day trailing standard deviation of cases and the 7-day trailing mean of cases -cases_deaths_subset \%>\% +# Generate some simple time-varying CFR estimates: +with_cfr_estimates <- cases_deaths_subset \%>\% epi_slide( - cases_7sd = sd(cases, na.rm = TRUE), - cases_7dav = mean(cases, na.rm = TRUE), - .window_size = 7 - ) \%>\% - select(geo_value, time_value, cases, cases_7sd, cases_7dav) -# Note that epi_slide_mean could be used to more quickly calculate cases_7dav. - -# In addition to the [`dplyr::mutate`]-like syntax, you can feed in a function or -# formula in a way similar to [`dplyr::group_modify`]: -my_summarizer <- function(window_data) { - window_data \%>\% - summarize( - cases_7sd = sd(cases, na.rm = TRUE), - cases_7dav = mean(cases, na.rm = TRUE) - ) + cfr_estimate_v0 = death_rate_7d_av[[22]] / case_rate_7d_av[[1]], + .window_size = 22 + ) +with_cfr_estimates \%>\% + print(n = 30) +# (Here, the value 22 was selected using `epi_cor()` and averaging across +# `geo_value`s. See +# https://www.medrxiv.org/content/10.1101/2024.12.27.24319518v1 for some +# warnings & information using CFR estimators along these lines.) + +# In addition to the [`dplyr::mutate`]-like syntax, you can feed in a +# function or formula in a way similar to [`dplyr::group_modify`]; these +# often run much more quickly: +my_computation <- function(window_data) { + tibble( + cfr_estimate_v0 = window_data$death_rate_7d_av[[nrow(window_data)]] / + window_data$case_rate_7d_av[[1]] + ) } -cases_deaths_subset \%>\% +with_cfr_estimates2 <- cases_deaths_subset \%>\% epi_slide( - ~ my_summarizer(.x), - .window_size = 7 - ) \%>\% - select(geo_value, time_value, cases, cases_7sd, cases_7dav) - - - + ~ my_computation(.x), + .window_size = 22 + ) +with_cfr_estimates3 <- cases_deaths_subset \%>\% + epi_slide( + function(window_data, g, t) { + tibble( + cfr_estimate_v0 = window_data$death_rate_7d_av[[nrow(window_data)]] / + window_data$case_rate_7d_av[[1]] + ) + }, + .window_size = 22 + ) #### Advanced: #### diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 4b75e9ff..e1cb2092 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -4,7 +4,7 @@ \alias{epi_slide_opt} \alias{epi_slide_mean} \alias{epi_slide_sum} -\title{Optimized slide functions for common cases} +\title{Calculate rolling or running means, sums, etc., or custom calculations} \usage{ epi_slide_opt( .x, @@ -51,7 +51,7 @@ epi_slide_sum( and any columns in \code{other_keys}. If grouped, we make sure the grouping is by \code{geo_value} and \code{other_keys}.} -\item{.col_names}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column +\item{.col_names}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> An unquoted column name (e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), \link[tidyselect:language]{other tidy-select expression}, or a vector of characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if @@ -134,10 +134,20 @@ added. It will be ungrouped if \code{.x} was ungrouped, and have the same groups as \code{.x} if \code{.x} was grouped. } \description{ -\code{epi_slide_opt} allows sliding an n-timestep \link[data.table:froll]{data.table::froll} -or \link[slider:summary-slide]{slider::summary-slide} function over variables in an \code{epi_df} object. -These functions tend to be much faster than \code{epi_slide()}. See -\code{vignette("epi_df")} for more examples. +These methods take each subpopulation (i.e., a single +\code{geo_value} and combination of any \code{other_keys} you set up for age groups, +etc.) and perform a \code{.window_size}-width time window rolling/sliding +computation, or alternatively, a running/cumulative computation (with +\code{.window_size = Inf}) on the requested columns. Explicit \code{NA} measurements +are temporarily added to fill in any time gaps, and, for rolling +computations, to pad the time series to ensure that the first & last +computations use exactly \code{.window_size} values. + +\code{epi_slide_opt} allows you to use any \link[data.table:froll]{data.table::froll} or +\link[slider:summary-slide]{slider::summary-slide} function. If none of those specialized functions fit +your usecase, you can use \code{data.table::frollapply} together with a non-rolling +function (e.g., \code{median}). See \code{\link{epi_slide}} if you need to work with +multiple columns at once or output a custom type. \code{epi_slide_mean} is a wrapper around \code{epi_slide_opt} with \code{.f = data.table::frollmean}. diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd index cadb1983..ad6be18b 100644 --- a/man/epix_slide.Rd +++ b/man/epix_slide.Rd @@ -1,10 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/methods-epi_archive.R, R/grouped_epi_archive.R +% Please edit documentation in R/methods-epi_archive.R \name{epix_slide} \alias{epix_slide} -\alias{epix_slide.epi_archive} -\alias{epix_slide.grouped_epi_archive} -\title{Slide a function over variables in an \code{epi_archive} or \code{grouped_epi_archive}} +\title{Take each requested (group and) version in an archive, run a computation (e.g., forecast)} \usage{ epix_slide( .x, @@ -15,51 +13,39 @@ epix_slide( .new_col_name = NULL, .all_versions = FALSE ) - -\method{epix_slide}{epi_archive}( - .x, - .f, - ..., - .before = Inf, - .versions = NULL, - .new_col_name = NULL, - .all_versions = FALSE -) - -\method{epix_slide}{grouped_epi_archive}( - .x, - .f, - ..., - .before = Inf, - .versions = NULL, - .new_col_name = NULL, - .all_versions = FALSE -) } \arguments{ \item{.x}{An \code{\link{epi_archive}} or \code{\link{grouped_epi_archive}} object. If ungrouped, all data in \code{x} will be treated as part of a single data group.} \item{.f}{Function, formula, or missing; together with \code{...} specifies the -computation to slide. To "slide" means to apply a computation over a -sliding (a.k.a. "rolling") time window for each data group. The window is -determined by the \code{.before} parameter (see details for more). If a -function, \code{.f} must have the form \verb{function(x, g, t, ...)}, where -\itemize{ -\item "x" is an epi_df with the same column names as the archive's \code{DT}, minus -the \code{version} column -\item "g" is a one-row tibble containing the values of the grouping variables -for the associated group -\item "t" is the ref_time_value for the current window -\item "..." are additional arguments -} +computation. The computation will be run on each requested group-version +combination, with a time window filter applied if \code{.before} is supplied. + +If \code{.f} is a function must have the form \verb{function(x, g, v)} or +\verb{function(x, g, v, )}, where + +\if{html}{\out{
}}\preformatted{- `x` is an `epi_df` with the same column names as the archive's `DT`, + minus the `version` column. (Or, if `.all_versions = TRUE`, an + `epi_archive` with the requested partial version history.) + +- `g` is a one-row tibble containing the values of the grouping variables + for the associated group. + +- `v` (length-1) is the associated `version` (one of the requested + `.versions`) + +- `` are optional; you can add such + arguments to your function and set them by passing them through the + `...` argument to `epix_slide()`. +}\if{html}{\out{
}} If a formula, \code{.f} can operate directly on columns accessed via \code{.x$var} or \code{.$var}, as in \code{~ mean (.x$var)} to compute a mean of a column \code{var} for each group-\code{ref_time_value} combination. The group key can be accessed via -\code{.y} or \code{.group_key}, and the reference time value can be accessed via \code{.z} -or \code{.ref_time_value}. If \code{.f} is missing, then \code{...} will specify the -computation.} +\code{.y} or \code{.group_key}, and the reference time value can be accessed via +\code{.z}, \code{.version}, or \code{.ref_time_value}. If \code{.f} is missing, then \code{...} will +specify the computation.} \item{...}{Additional arguments to pass to the function or formula specified via \code{f}. Alternatively, if \code{.f} is missing, then the \code{...} is interpreted @@ -67,41 +53,46 @@ as a \link[rlang:args_data_masking]{"data-masking"} expression or expressions for tidy evaluation; in addition to referring columns directly by name, the expressions have access to \code{.data} and \code{.env} pronouns as in \code{dplyr} verbs, and can also refer to \code{.x} (not the same as the input epi_archive), -\code{.group_key}, and \code{.ref_time_value}. See details for more.} - -\item{.before}{How many time values before the \code{.ref_time_value} -should each snapshot handed to the function \code{.f} contain? If provided, it -should be a single value that is compatible with the time_type of the -time_value column (more below), but most commonly an integer. This window -endpoint is inclusive. For example, if \code{.before = 7}, \code{time_type} -in the archive is "day", and the \code{.ref_time_value} is January 8, then the -smallest time_value in the snapshot will be January 1. If missing, then the -default is no limit on the time values, so the full snapshot is given.} - -\item{.versions}{Reference time values / versions for sliding -computations; each element of this vector serves both as the anchor point -for the \code{time_value} window for the computation and the \code{max_version} -\code{epix_as_of} which we fetch data in this window. If missing, then this will -set to a regularly-spaced sequence of values set to cover the range of -\code{version}s in the \code{DT} plus the \code{versions_end}; the spacing of values will -be guessed (using the GCD of the skips between values).} +\code{.group_key} and \code{.version}/\code{.ref_time_value}. See details for more.} + +\item{.before}{Optional; applies a \code{time_value} filter before running each +computation. The default is not to apply a \code{time_value} filter. If +provided, it should be a single integer or difftime that is compatible with +the time_type of the time_value column. If an integer, then the minimum +possible \code{time_value} included will be that many time steps (according to +the \code{time_type}) before each requested \code{.version}. This window endpoint is +inclusive. For example, if \code{.before = 14}, the \code{time_type} in the archive +is "day", and the requested \code{.version} is January 15, then the smallest +possible \code{time_value} possible in the snapshot will be January 1. Note that +this does not mean that there will be 14 or 15 distinct \code{time_value}s +actually appearing in the data; for most reporting streams, reporting as of +January 15 won't include \code{time_value}s all the way through January 14, due +to reporting latency. Unlike \code{epi_slide()}, \code{epix_slide()} won't fill in +any missing \code{time_values} in this window.} + +\item{.versions}{Requested versions on which to run the computation. Each +requested \code{.version} also serves as the anchor point from which +the \code{time_value} window specified by \code{.before} is drawn. If \code{.versions} is +missing, it will be set to a regularly-spaced sequence of values set to +cover the range of \code{version}s in the \code{DT} plus the \code{versions_end}; the +spacing of values will be guessed (using the GCD of the skips between +values).} \item{.new_col_name}{Either \code{NULL} or a string indicating the name of the new column that will contain the derived values. The default, \code{NULL}, will use the name "slide_value" unless your slide computations output data frames, -in which case they will be unpacked into the constituent columns and those -names used. If the resulting column name(s) overlap with the column names -used for labeling the computations, which are \code{group_vars(x)} and -\code{"version"}, then the values for these columns must be identical to the -labels we assign.} +in which case they will be unpacked into the constituent columns and the +data frame's column names will be used instead. If the resulting column +name(s) overlap with the column names used for labeling the computations, +which are \code{group_vars(x)} and \code{"version"}, then the values for these +columns must be identical to the labels we assign.} \item{.all_versions}{(Not the same as \code{.all_rows} parameter of \code{epi_slide}.) If \code{.all_versions = TRUE}, then the slide computation will be passed the -version history (all \code{version <= .version} where \code{.version} is one of the -requested \code{.versions}) for rows having a \code{time_value} of at least `.version -\itemize{ -\item before\verb{. Otherwise, the slide computation will be passed only the most recent }version\verb{for every unique}time_value\verb{. Default is }FALSE`. -}} +version history (all versions \verb{<= .version} where \code{.version} is one of the +requested \code{.version}s), in \code{epi_archive} format. Otherwise, the slide +computation will be passed only the most recent \code{version} for every unique +\code{time_value}, in \code{epi_df} format. Default is \code{FALSE}.} } \value{ A tibble whose columns are: the grouping variables (if any), @@ -110,28 +101,32 @@ computation, and a column named according to the \code{.new_col_name} argument, containing the slide values. It will be grouped by the grouping variables. } \description{ -Slides a given function over variables in an \code{epi_archive} object. This -behaves similarly to \code{epi_slide()}, with the key exception that it is -version-aware: the sliding computation at any given reference time t is -performed on \strong{data that would have been available as of t}. This function -is intended for use in accurate backtesting of models; see -\code{vignette("backtesting", package="epipredict")} for a walkthrough. +... and collect the results. This is useful for more accurately simulating +how a forecaster, nowcaster, or other algorithm would have behaved in real +time, factoring in reporting latency and data revisions; see +\href{https://cmu-delphi.github.io/epipredict/articles/backtesting.html}{\code{vignette("backtesting", package="epipredict")}} for a walkthrough. } \details{ +This is similar to looping over versions and calling \code{\link{epix_as_of}}, but has +some conveniences such as working naturally with \code{\link{grouped_epi_archive}}s, +optional time windowing, and syntactic sugar to make things shorter to write. + A few key distinctions between the current function and \code{epi_slide()}: \enumerate{ \item In \code{.f} functions for \code{epix_slide}, one should not assume that the input data to contain any rows with \code{time_value} matching the computation's -\code{.ref_time_value} (accessible via \verb{attributes()$metadata$as_of}); for -typical epidemiological surveillance data, observations pertaining to a -particular time period (\code{time_value}) are first reported \code{as_of} some -instant after that time period has ended. +\code{.version}, due to reporting latency; for typical epidemiological +surveillance data, observations pertaining to a particular time period +(\code{time_value}) are first reported \code{as_of} some instant after that time +period has ended. No time window completion is performed as in +\code{epi_slide()}. \item The input class and columns are similar but different: \code{epix_slide} (with the default \code{.all_versions=FALSE}) keeps all columns and the \code{epi_df}-ness of the first argument to each computation; \code{epi_slide} only provides the grouping variables in the second input, and will convert the first input into a regular tibble if the grouping variables include the -essential \code{geo_value} column. (With .all_versions=TRUE\verb{, }epix_slide\verb{will will provide an}epi_archive\verb{rather than an}epi-df` to each +essential \code{geo_value} column. (With \code{.all_versions=TRUE}, \code{epix_slide} +will provide an \code{epi_archive} rather than an \code{epi-df} to each computation.) \item The output class and columns are similar but different: \code{epix_slide()} returns a tibble containing only the grouping variables, \code{time_value}, and @@ -144,82 +139,61 @@ results as they are not supported by tibbles.) \item There are no size stability checks or element/row recycling to maintain size stability in \code{epix_slide}, unlike in \code{epi_slide}. (\code{epix_slide} is roughly analogous to \code{\link[dplyr:group_map]{dplyr::group_modify}}, while \code{epi_slide} is roughly -analogous to \code{dplyr::mutate} followed by \code{dplyr::arrange}) This is detailed -in the "advanced" vignette. +analogous to \code{\link[dplyr:mutate]{dplyr::mutate}}.) \item \code{.all_rows} is not supported in \code{epix_slide}; since the slide computations are allowed more flexibility in their outputs than in \code{epi_slide}, we can't guess a good representation for missing computations for excluded group-\code{.ref_time_value} pairs. \item The \code{.versions} default for \code{epix_slide} is based on making an evenly-spaced sequence out of the \code{version}s in the \code{DT} plus the -\code{versions_end}, rather than the \code{time_value}s. +\code{versions_end}, rather than all unique \code{time_value}s. +\item \code{epix_slide()} computations can refer to the current element of +\code{.versions} as either \code{.version} or \code{.ref_time_value}, while \code{epi_slide()} +computations refer to the current element of \code{.ref_time_values} with +\code{.ref_time_value}. } Apart from the above distinctions, the interfaces between \code{epix_slide()} and \code{epi_slide()} are the same. - -Furthermore, the current function can be considerably slower than -\code{epi_slide()}, for two reasons: (1) it must repeatedly fetch -properly-versioned snapshots from the data archive (via \code{epix_as_of()}), -and (2) it performs a "manual" sliding of sorts, and does not benefit from -the highly efficient \code{slider} package. For this reason, it should never be -used in place of \code{epi_slide()}, and only used when version-aware sliding is -necessary (as it its purpose). } \examples{ library(dplyr) -# Reference time points for which we want to compute slide values: -versions <- seq(as.Date("2020-06-02"), - as.Date("2020-06-15"), - by = "1 day" -) +# Request only a small set of versions, for example's sake: +requested_versions <- + seq(as.Date("2020-09-02"), as.Date("2020-09-15"), by = "1 day") -# A simple (but not very useful) example (see the archive vignette for a more -# realistic one): +# Investigate reporting lag of `percent_cli` signal (though normally we'd +# probably work off of the dedicated `revision_summary()` function instead): archive_cases_dv_subset \%>\% - group_by(geo_value) \%>\% epix_slide( - .f = ~ mean(.x$case_rate_7d_av), - .before = 2, - .versions = versions, - .new_col_name = "case_rate_7d_av_recent_av" - ) \%>\% - ungroup() -# We requested time windows that started 2 days before the corresponding time -# values. The actual number of `time_value`s in each computation depends on -# the reporting latency of the signal and `time_value` range covered by the -# archive (2020-06-01 -- 2021-11-30 in this example). In this case, we have -# * 0 `time_value`s, for ref time 2020-06-01 --> the result is automatically -# discarded -# * 1 `time_value`, for ref time 2020-06-02 -# * 2 `time_value`s, for the rest of the results -# * never the 3 `time_value`s we would get from `epi_slide`, since, because -# of data latency, we'll never have an observation -# `time_value == .ref_time_value` as of `.ref_time_value`. -# The example below shows this type of behavior in more detail. - -# Examining characteristics of the data passed to each computation with -# `all_versions=FALSE`. + geowide_percent_cli_max_time = max(time_value[!is.na(percent_cli)]), + geowide_percent_cli_rpt_lag = .version - geowide_percent_cli_max_time, + .versions = requested_versions + ) archive_cases_dv_subset \%>\% group_by(geo_value) \%>\% epix_slide( - function(x, gk, rtv) { - tibble( - time_range = if (nrow(x) == 0L) { - "0 `time_value`s" - } else { - sprintf("\%s -- \%s", min(x$time_value), max(x$time_value)) - }, - n = nrow(x), - class1 = class(x)[[1L]] - ) - }, - .before = 5, .all_versions = FALSE, - .versions = versions - ) \%>\% - ungroup() \%>\% - arrange(geo_value, version) + percent_cli_max_time = max(time_value[!is.na(percent_cli)]), + percent_cli_rpt_lag = .version - percent_cli_max_time, + .versions = requested_versions + ) + +# Backtest a forecaster "pseudoprospectively" (i.e., faithfully with respect +# to the data version history): +case_death_rate_archive \%>\% + epix_slide( + .versions = as.Date(c("2021-10-01", "2021-10-08")), + function(x, g, v) { + epipredict::arx_forecaster( + x, + outcome = "death_rate", + predictors = c("death_rate_7d_av", "case_rate_7d_av") + )$predictions + } + ) +# See `vignette("backtesting", package="epipredict")` for a full walkthrough +# on backtesting forecasters, including plots, etc. # --- Advanced: --- @@ -251,7 +225,7 @@ archive_cases_dv_subset \%>\% ) }, .before = 5, .all_versions = TRUE, - .versions = versions + .versions = requested_versions ) \%>\% ungroup() \%>\% # Focus on one geo_value so we can better see the columns above: diff --git a/man/revision_analysis.Rd b/man/revision_analysis.Rd index 1c7336b3..59f0c959 100644 --- a/man/revision_analysis.Rd +++ b/man/revision_analysis.Rd @@ -40,7 +40,7 @@ revision_summary( \arguments{ \item{epi_arch}{an epi_archive to be analyzed} -\item{...}{<\code{\link[=dplyr_tidy_select]{tidyselect}}>, used to choose the column to +\item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidyselect}}>, used to choose the column to summarize. If empty and there is only one value/measurement column (i.e., not in \code{\link{key_colnames}}) in the archive, it will automatically select it. If supplied, \code{...} must select exactly one column.} diff --git a/man/sum_groups_epi_df.Rd b/man/sum_groups_epi_df.Rd index 34ec9993..62eecf29 100644 --- a/man/sum_groups_epi_df.Rd +++ b/man/sum_groups_epi_df.Rd @@ -9,7 +9,7 @@ sum_groups_epi_df(.x, sum_cols, group_cols = "time_value") \arguments{ \item{.x}{an \code{epi_df}} -\item{sum_cols}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> An unquoted column +\item{sum_cols}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> An unquoted column name (e.g., \code{cases}), multiple column names (e.g., \code{c(cases, deaths)}), \link[tidyselect:language]{other tidy-select expression}, or a vector of characters (e.g. \code{c("cases", "deaths")}). Variable names can be used as if diff --git a/vignettes/epi_archive.Rmd b/vignettes/epi_archive.Rmd index cae40dfa..7e6eaea5 100644 --- a/vignettes/epi_archive.Rmd +++ b/vignettes/epi_archive.Rmd @@ -235,7 +235,8 @@ observation carried forward (LOCF). For more information, see `epix_merge()`. ## Backtesting forecasting models One of the most common use cases of `epiprocess::epi_archive()` object is for -accurate model backtesting. See `vignette("backtesting", package="epipredict")` +accurate model backtesting. See [`vignette("backtesting", +package="epipredict")`](https://cmu-delphi.github.io/epipredict/articles/backtesting.html) for an in-depth demo, using a pre-built forecaster in that package. ## Attribution