diff --git a/R/epi_df.R b/R/epi_df.R index 65acfb94..9ed677cf 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -91,25 +91,7 @@ NULL #' correct metadata for an `epi_df` object (ie. `geo_type`, `time_type`, and `as_of`). #' Refer to the below info. about the arguments for more details. #' -#' @param x A data.frame, [tibble::tibble], or [tsibble::tsibble] to be converted -#' @param geo_type Type for the geo values. If missing, then the function will -#' attempt to infer it from the geo values present; if this fails, then it -#' will be set to "custom". -#' @param time_type Type for the time values. If missing, then the function will -#' attempt to infer it from the time values present; if this fails, then it -#' will be set to "custom". -#' @param as_of Time value representing the time at which the given data were -#' available. For example, if `as_of` is January 31, 2022, then the `epi_df` -#' object that is created would represent the most up-to-date version of the -#' data available as of January 31, 2022. If the `as_of` argument is missing, -#' then the current day-time will be used. -#' @param additional_metadata List of additional metadata to attach to the -#' `epi_df` object. The metadata will have `geo_type`, `time_type`, and -#' `as_of` fields; named entries from the passed list will be included as -#' well. If your tibble has additional keys, be sure to specify them as a -#' character vector in the `other_keys` component of `additional_metadata`. -#' @param ... Additional arguments passed to methods. -#' @return An `epi_df` object. +#' @template epi_df-params #' #' @export new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, @@ -182,25 +164,7 @@ new_epi_df <- function(x = tibble::tibble(), geo_type, time_type, as_of, #' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for #' examples. #' -#' @param x A data.frame, [tibble::tibble], or [tsibble::tsibble] to be converted -#' @param geo_type Type for the geo values. 
If missing, then the function will -#' attempt to infer it from the geo values present; if this fails, then it -#' will be set to "custom". -#' @param time_type Type for the time values. If missing, then the function will -#' attempt to infer it from the time values present; if this fails, then it -#' will be set to "custom". -#' @param as_of Time value representing the time at which the given data were -#' available. For example, if `as_of` is January 31, 2022, then the `epi_df` -#' object that is created would represent the most up-to-date version of the -#' data available as of January 31, 2022. If the `as_of` argument is missing, -#' then the current day-time will be used. -#' @param additional_metadata List of additional metadata to attach to the -#' `epi_df` object. The metadata will have `geo_type`, `time_type`, and -#' `as_of` fields; named entries from the passed list will be included as -#' well. If your tibble has additional keys, be sure to specify them as a -#' character vector in the `other_keys` component of `additional_metadata`. -#' @param ... Additional arguments passed to methods. -#' @return An `epi_df` object. +#' @template epi_df-params #' #' @export #' @examples diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 22ea2928..632dc3a3 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -3,8 +3,8 @@ #' Converts an `epi_df` object into a tibble, dropping metadata and any #' grouping. #' -#' @param x an `epi_df` -#' @param ... arguments to forward to `NextMethod()` +#' @template x +#' @param ... additional arguments to forward to `NextMethod()` #' #' @importFrom tibble as_tibble #' @export @@ -22,7 +22,7 @@ as_tibble.epi_df <- function(x, ...) { #' others in the `other_keys` field of the metadata, or else explicitly set. #' #' @method as_tsibble epi_df -#' @param x The `epi_df` object. +#' @template x #' @param key Optional. Any additional keys (other than `geo_value`) to add to #' the `tsibble`. #' @param ... 
additional arguments passed on to `tsibble::as_tsibble()` @@ -39,8 +39,8 @@ as_tsibble.epi_df <- function(x, key, ...) { #' #' Print and summary functions for an `epi_df` object. #' -#' @param x The `epi_df` object. -#' @param ... Additional arguments passed to methods. +#' @template x +#' @param ... additional arguments to forward to `NextMethod()` #' #' @method print epi_df #' @export @@ -61,7 +61,7 @@ print.epi_df <- function(x, ...) { #' Prints a variety of summary statistics about the `epi_df` object, such as #' the time range included and geographic coverage. #' -#' @param object The `epi_df` object. +#' @param object an `epi_df` #' @param ... Additional arguments, for compatibility with `summary()`. #' Currently unused. #' @@ -204,6 +204,7 @@ dplyr_row_slice.epi_df <- function(data, i, ...) { } #' @method group_by epi_df +#' @param .data an `epi_df` #' @rdname print.epi_df #' @export group_by.epi_df <- function(.data, ...) { @@ -223,7 +224,7 @@ ungroup.epi_df <- function(x, ...) { #' @method group_modify epi_df #' @rdname print.epi_df -#' @param .data The `epi_df` object. +#' @param .data an `epi_df` #' @param .f function or formula; see [`dplyr::group_modify`] #' @param .keep Boolean; see [`dplyr::group_modify`] #' @export @@ -233,7 +234,7 @@ group_modify.epi_df <- function(.data, .f, ..., .keep = FALSE) { #' @method unnest epi_df #' @rdname print.epi_df -#' @param data The `epi_df` object. +#' @param data an `epi_df` #' @export unnest.epi_df <- function(data, ...) { dplyr::dplyr_reconstruct(NextMethod(), data) diff --git a/R/outliers.R b/R/outliers.R index 68a656a7..ab4f0e8e 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -6,10 +6,7 @@ #' vignette](https://cmu-delphi.github.io/epiprocess/articles/outliers.html) for #' examples. #' -#' @param x Design points corresponding to the signal values `y`. Default is -#' `seq_along(y)` (that is, equally-spaced points from 1 to the length of -#' `y`). -#' @param y Signal values. 
+#' @template x-y #' @param methods A tibble specifying the method(s) to use for outlier #' detection, with one row per method, and the following columns: #' * `method`: Either "rm" or "stl", or a custom function for outlier @@ -25,9 +22,7 @@ #' summarized results are calculated. Note that if the number of `methods` #' (number of rows) is odd, then "median" is equivalent to a majority vote for #' purposes of determining whether a given observation is an outlier. -#' @return An tibble with number of rows equal to `length(y)` and columns giving -#' the outlier detection thresholds and replacement values from each detection -#' method. +#' @template detect-outlr-return #' #' @details Each outlier detection method, one per row of the passed `methods` #' tibble, is a function that must take as its first two arguments `x` and @@ -147,32 +142,14 @@ detect_outlr <- function(x = seq_along(y), y, #' Detects outliers based on a distance from the rolling median specified in #' terms of multiples of the rolling interquartile range (IQR). #' -#' @param x Design points corresponding to the signal values `y`. Default is -#' `seq_along(y)` (that is, equally-spaced points from 1 to the length of -#' `y`). -#' @param y Signal values. +#' @template x-y #' @param n Number of time steps to use in the rolling window. Default is 21. #' This value is centrally aligned. When `n` is an odd number, the rolling #' window extends from `(n-1)/2` time steps before each design point to `(n-1)/2` #' time steps after. When `n` is even, then the rolling range extends from #' `n/2-1` time steps before to `n/2` time steps after. -#' @param log_transform Should a log transform be applied before running outlier -#' detection? Default is `FALSE`. If `TRUE`, and zeros are present, then the -#' log transform will be padded by 1. -#' @param detect_negatives Should negative values automatically count as -#' outliers? Default is `FALSE`. 
-#' @param detection_multiplier Value determining how far the outlier detection -#' thresholds are from the rolling median, which are calculated as (rolling -#' median) +/- (detection multiplier) * (rolling IQR). Default is 2. -#' @param min_radius Minimum distance between rolling median and threshold, on -#' transformed scale. Default is 0. -#' @param replacement_multiplier Value determining how far the replacement -#' values are from the rolling median. The replacement is the original value -#' if it is within the detection thresholds, or otherwise it is rounded to the -#' nearest (rolling median) +/- (replacement multiplier) * (rolling IQR). -#' Default is 0. -#' @return A tibble with number of rows equal to `length(y)`, and columns -#' `lower`, `upper`, and `replacement`. +#' @template outlier-detection-options +#' @template detect-outlr-return #' #' @export #' @examples @@ -235,10 +212,7 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, #' #' Detects outliers based on a seasonal-trend decomposition using LOESS (STL). #' -#' @param x Design points corresponding to the signal values `y`. Default is -#' `seq_along(y)` (that is, equally-spaced points from 1 to the length of -#' `y`). -#' @param y Signal values. +#' @template x-y #' @param n_trend Number of time steps to use in the rolling window for trend. #' Default is 21. #' @param n_seasonal Number of time steps to use in the rolling window for @@ -248,23 +222,8 @@ detect_outlr_rm <- function(x = seq_along(y), y, n = 21, #' @param seasonal_period Integer specifying period of seasonality. For example, #' for daily data, a period 7 means weekly seasonality. The default is `NULL`, #' meaning that no seasonal term will be included in the STL decomposition. -#' @param log_transform Should a log transform be applied before running outlier -#' detection? Default is `FALSE`. If `TRUE`, and zeros are present, then the -#' log transform will be padded by 1. 
-#' @param detect_negatives Should negative values automatically count as -#' outliers? Default is `FALSE`. -#' @param detection_multiplier Value determining how far the outlier detection -#' thresholds are from the rolling median, which are calculated as (rolling -#' median) +/- (detection multiplier) * (rolling IQR). Default is 2. -#' @param min_radius Minimum distance between rolling median and threshold, on -#' transformed scale. Default is 0. -#' @param replacement_multiplier Value determining how far the replacement -#' values are from the rolling median. The replacement is the original value -#' if it is within the detection thresholds, or otherwise it is rounded to the -#' nearest (rolling median) +/- (replacement multiplier) * (rolling IQR). -#' Default is 0. -#' @return A tibble with number of rows equal to `length(y)`, and columns -#' `lower`, `upper`, and `replacement`. +#' @template outlier-detection-options +#' @template detect-outlr-return #' #' @details The STL decomposition is computed using the `feasts` package. Once #' computed, the outlier detection method is analogous to the rolling median diff --git a/R/slide.R b/R/slide.R index 2e07d502..7cdc8f38 100644 --- a/R/slide.R +++ b/R/slide.R @@ -1,12 +1,10 @@ #' Slide a function over variables in an `epi_df` object #' -#' Slides a given function over variables in an `epi_df` object. See the [slide -#' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for -#' examples. +#' Slides a given function over variables in an `epi_df` object. See the +#' [slide vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) +#' for examples. #' -#' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by] -#' or ungrouped. If ungrouped, all data in `x` will be treated as part of a -#' single data group. +#' @template basic-slide-params #' @param f Function, formula, or missing; together with `...` specifies the #' computation to slide. 
To "slide" means to apply a computation within a #' sliding (a.k.a. "rolling") time window for each data group. The window is @@ -27,33 +25,6 @@ #' directly by name, the expression has access to `.data` and `.env` pronouns #' as in `dplyr` verbs, and can also refer to `.x`, `.group_key`, and #' `.ref_time_value`. See details. -#' @param before,after How far `before` and `after` each `ref_time_value` should -#' the sliding window extend? At least one of these two arguments must be -#' provided; the other's default will be 0. Any value provided for either -#' argument must be a single, non-`NA`, non-negative, -#' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of -#' the window are inclusive. Common settings: -#' * For trailing/right-aligned windows from `ref_time_value - time_step -#' (k)` to `ref_time_value`: either pass `before=k` by itself, or pass -#' `before=k, after=0`. -#' * For center-aligned windows from `ref_time_value - time_step(k)` to -#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. -#' * For leading/left-aligned windows from `ref_time_value` to -#' `ref_time_value + time_step(k)`: either pass pass `after=k` by itself, -#' or pass `before=0, after=k`. -#' See "Details:" about the definition of a time step,(non)treatment of -#' missing rows within the window, and avoiding warnings about -#' `before`&`after` settings for a certain uncommon use case. -#' @param ref_time_values Time values for sliding computations, meaning, each -#' element of this vector serves as the reference time point for one sliding -#' window. If missing, then this will be set to all unique time values in the -#' underlying data table, by default. -#' @param time_step Optional function used to define the meaning of one time -#' step, which if specified, overrides the default choice based on the -#' `time_value` column. This function must take a non-negative integer and -#' return an object of class `lubridate::period`. 
For example, we can use -#' `time_step = lubridate::hours` in order to set the time step to be one hour -#' (this would only be meaningful if `time_value` is of class `POSIXct`). #' @param new_col_name String indicating the name of the new column that will #' contain the derivative values. Default is "slide_value"; note that setting #' `new_col_name` equal to an existing column name will overwrite this column. @@ -63,24 +34,9 @@ #' [`tidyr::unnest()`]), and, if the slide computations output data frames, #' the names of the resulting columns are given by prepending `new_col_name` #' to the names of the list elements. -#' @param names_sep String specifying the separator to use in `tidyr::unnest()` -#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix -#' from `new_col_name` entirely. -#' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in -#' the output even with `ref_time_values` provided, with some type of missing -#' value marker for the slide computation output column(s) for `time_value`s -#' outside `ref_time_values`; otherwise, there will be one row for each row in -#' `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The -#' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type -#' of the slide computation output. If using `as_list_col = TRUE`, note that -#' the missing marker is a `NULL` entry in the list column; for certain -#' operations, you might want to replace these `NULL` entries with a different -#' `NA` marker. -#' @return An `epi_df` object given by appending a new column to `x`, named -#' according to the `new_col_name` argument. #' #' @details To "slide" means to apply a function or formula over a rolling -#' window of time steps for each data group, where the window is entered at a +#' window of time steps for each data group, where the window is centered at a #' reference time and left and right endpoints are given by the `before` and #' `after` arguments. 
The unit (the meaning of one time step) is implicitly #' defined by the way the `time_value` column treats addition and subtraction; @@ -386,15 +342,13 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' Optimized slide function for performing common rolling computations on an `epi_df` object #' -#' Slides an n-timestep mean over variables in an `epi_df` object. See the [slide -#' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for -#' examples. +#' Slides an n-timestep [data.table::froll] or [slider::summary-slide] function +#' over variables in an `epi_df` object. See the +#' [slide vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) +#' for examples. #' -#' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by] -#' or ungrouped. If ungrouped, all data in `x` will be treated as part of a -#' single data group. -#' @param col_names A single tidyselection or a tidyselection vector of the -#' names of one or more columns for which to calculate the rolling mean. +#' @template basic-slide-params +#' @template opt-slide-params #' @param f Function; together with `...` specifies the computation to slide. #' `f` must be one of `data.table`'s rolling functions #' (`frollmean`, `frollsum`, `frollapply`. See [data.table::roll]) or one @@ -418,79 +372,8 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' these args via `...` will cause an error. If `f` is a `slider` function, #' it is automatically passed the data `x` to operate on, and number of #' points `before` and `after` to use in the computation. -#' @param before,after How far `before` and `after` each `ref_time_value` should -#' the sliding window extend? At least one of these two arguments must be -#' provided; the other's default will be 0. Any value provided for either -#' argument must be a single, non-`NA`, non-negative, -#' [integer-compatible][vctrs::vec_cast] number of time steps. 
Endpoints of -#' the window are inclusive. Common settings: -#' * For trailing/right-aligned windows from `ref_time_value - time_step -#' (k)` to `ref_time_value`: either pass `before=k` by itself, or pass -#' `before=k, after=0`. -#' * For center-aligned windows from `ref_time_value - time_step(k)` to -#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. -#' * For leading/left-aligned windows from `ref_time_value` to -#' `ref_time_value + time_step(k)`: either pass pass `after=k` by itself, -#' or pass `before=0, after=k`. -#' See "Details:" about the definition of a time step,(non)treatment of -#' missing rows within the window, and avoiding warnings about -#' `before`&`after` settings for a certain uncommon use case. -#' @param ref_time_values Time values for sliding computations, meaning, each -#' element of this vector serves as the reference time point for one sliding -#' window. If missing, then this will be set to all unique time values in the -#' underlying data table, by default. -#' @param time_step Optional function used to define the meaning of one time -#' step, which if specified, overrides the default choice based on the -#' `time_value` column. This function must take a non-negative integer and -#' return an object of class [lubridate::period]. For example, we can use -#' `time_step = lubridate::hours` in order to set the time step to be one hour -#' (this would only be meaningful if `time_value` is of class `POSIXct`). -#' @param new_col_names String indicating the name of the new column that will -#' contain the derivative values. Default is "slide_value"; note that setting -#' `new_col_names` equal to an existing column name will overwrite this column. -#' @param as_list_col Not supported. Included to match `epi_slide` interface. -#' @param names_sep String specifying the separator to use in `tidyr::unnest()` -#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix -#' from `new_col_names` entirely. 
-#' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in -#' the output even with `ref_time_values` provided, with some type of missing -#' value marker for the slide computation output column(s) for `time_value`s -#' outside `ref_time_values`; otherwise, there will be one row for each row in -#' `x` that had a `time_value` in `ref_time_values`. Default is `FALSE`. The -#' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type -#' of the slide computation output. -#' @return An `epi_df` object given by appending one or more new columns to -#' `x`, depending on the `col_names` argument, named according to the -#' `new_col_names` argument. -#' -#' @details To "slide" means to apply a function or formula over a rolling -#' window of time steps for each data group, where the window is entered at a -#' reference time and left and right endpoints are given by the `before` and -#' `after` arguments. The unit (the meaning of one time step) is implicitly -#' defined by the way the `time_value` column treats addition and subtraction; -#' for example, if the time values are coded as `Date` objects, then one time -#' step is one day, since `as.Date("2022-01-01") + 1` equals -#' `as.Date("2022-01-02")`. Alternatively, the time step can be set explicitly -#' using the `time_step` argument (which if specified would override the -#' default choice based on `time_value` column). If there are not enough time -#' steps available to complete the window at any given reference time, then -#' `epi_slide()` still attempts to perform the computation anyway (it does not -#' require a complete window). The issue of what to do with partial -#' computations (those run on incomplete windows) is therefore left up to the -#' user, either through the specified function or formula `f`, or through -#' post-processing. 
For a centrally-aligned slide of `n` `time_value`s in a -#' sliding window, set `before = (n-1)/2` and `after = (n-1)/2` when the -#' number of `time_value`s in a sliding window is odd and `before = n/2-1` and -#' `after = n/2` when `n` is even. #' -#' Sometimes, we want to experiment with various trailing or leading window -#' widths and compare the slide outputs. In the (uncommon) case where -#' zero-width windows are considered, manually pass both the `before` and -#' `after` arguments in order to prevent potential warnings. (E.g., `before=k` -#' with `k=0` and `after` missing may produce a warning. To avoid warnings, -#' use `before=k, after=0` instead; otherwise, it looks too much like a -#' leading window was intended, but the `after` argument was forgotten or -#' misspelled.) +#' @template opt-slide-details #' #' @importFrom dplyr bind_rows mutate %>% arrange tibble select #' @importFrom rlang enquo quo_get_expr as_label @@ -507,7 +390,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6 +#' f = data.table::frollmean, new_col_name = "cases_7dav", names_sep = NULL, before = 6 #' ) %>% #' # Remove a nonessential var. 
to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% @@ -519,7 +402,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' group_by(geo_value) %>% #' epi_slide_opt(cases, #' f = data.table::frollmean, -#' new_col_names = "cases_7dav", names_sep = NULL, before = 6, +#' new_col_name = "cases_7dav", names_sep = NULL, before = 6, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE #' ) %>% @@ -531,7 +414,7 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 +#' f = slider::slide_mean, new_col_name = "cases_7dav", names_sep = NULL, after = 6 #' ) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% @@ -542,14 +425,14 @@ epi_slide <- function(x, f, ..., before, after, ref_time_values, #' group_by(geo_value) %>% #' epi_slide_opt( #' cases, -#' f = data.table::frollsum, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3 +#' f = data.table::frollsum, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3 #' ) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, time_step, - new_col_names = "slide_value", as_list_col = NULL, + new_col_name = "slide_value", as_list_col = NULL, names_sep = "_", all_rows = FALSE) { assert_class(x, "epi_df") @@ -678,28 +561,28 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, # If single column name, do nothing. 
if (is.null(names_sep)) { - if (length(new_col_names) != length(col_names_chr)) { + if (length(new_col_name) != length(col_names_chr)) { cli_abort( c( - "`new_col_names` must be the same length as `col_names` when + "`new_col_name` must be the same length as `col_names` when `names_sep` is NULL to avoid duplicate output column names." ), class = "epiprocess__epi_slide_mean__col_names_length_mismatch", - epiprocess__new_col_names = new_col_names, + epiprocess__new_col_name = new_col_name, epiprocess__col_names = col_names_chr ) } - result_col_names <- new_col_names + result_col_names <- new_col_name } else { - if (length(new_col_names) != 1L && length(new_col_names) != length(col_names_chr)) { + if (length(new_col_name) != 1L && length(new_col_name) != length(col_names_chr)) { cli_abort( - "`new_col_names` must be either length 1 or the same length as `col_names`.", + "`new_col_name` must be either length 1 or the same length as `col_names`.", class = "epiprocess__epi_slide_mean__col_names_length_mismatch_and_not_one", - epiprocess__new_col_names = new_col_names, + epiprocess__new_col_name = new_col_name, epiprocess__col_names = col_names_chr ) } - result_col_names <- paste(new_col_names, col_names_chr, sep = names_sep) + result_col_names <- paste(new_col_name, col_names_chr, sep = names_sep) } slide_one_grp <- function(.data_group, .group_key, ...) { @@ -802,13 +685,14 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' #' Wrapper around `epi_slide_opt` with `f = datatable::frollmean`. #' +#' @template basic-slide-params +#' @template opt-slide-params #' @param ... Additional arguments to pass to `data.table::frollmean`, for #' example, `na.rm` and `algo`. `data.table::frollmean` is automatically #' passed the data `x` to operate on, the window size `n`, and the alignment #' `align`. Providing these args via `...` will cause an error. 
-#' @inheritParams epi_slide_opt -#' @inherit epi_slide_opt return -#' @inherit epi_slide_opt details +#' +#' @template opt-slide-details #' #' @export #' @seealso [`epi_slide`] [`epi_slide_opt`] [`epi_slide_sum`] @@ -816,7 +700,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' # slide a 7-day trailing average formula on cases #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 6) %>% +#' epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 6) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() @@ -826,7 +710,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% #' epi_slide_mean(cases, -#' new_col_names = "cases_7dav", names_sep = NULL, before = 6, +#' new_col_name = "cases_7dav", names_sep = NULL, before = 6, #' # `frollmean` options #' na.rm = TRUE, algo = "exact", hasNA = TRUE #' ) %>% @@ -836,7 +720,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' # slide a 7-day leading average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, after = 6) %>% +#' epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, after = 6) %>% #' # Remove a nonessential var. 
to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() @@ -844,7 +728,7 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' # slide a 7-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>% +#' epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dav) %>% #' ungroup() @@ -852,13 +736,13 @@ epi_slide_opt <- function(x, col_names, f, ..., before, after, ref_time_values, #' # slide a 14-day centre-aligned average #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_mean(cases, new_col_names = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>% +#' epi_slide_mean(cases, new_col_name = "cases_14dav", names_sep = NULL, before = 6, after = 7) %>% #' # Remove a nonessential var. to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_14dav) %>% #' ungroup() epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, time_step, - new_col_names = "slide_value", as_list_col = NULL, + new_col_name = "slide_value", as_list_col = NULL, names_sep = "_", all_rows = FALSE) { epi_slide_opt( x = x, @@ -869,7 +753,7 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, after = after, ref_time_values = ref_time_values, time_step = time_step, - new_col_names = new_col_names, + new_col_name = new_col_name, as_list_col = as_list_col, names_sep = names_sep, all_rows = all_rows @@ -878,19 +762,20 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, #' Optimized slide function for performing rolling sums on an `epi_df` object #' -#' Slides an n-timestep mean over variables in an `epi_df` object. 
See the [slide +#' Slides an n-timestep sum over variables in an `epi_df` object. See the [slide #' vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) for #' examples. #' #' Wrapper around `epi_slide_opt` with `f = datatable::frollsum`. #' +#' @template basic-slide-params +#' @template opt-slide-params #' @param ... Additional arguments to pass to `data.table::frollsum`, for #' example, `na.rm` and `algo`. `data.table::frollsum` is automatically #' passed the data `x` to operate on, the window size `n`, and the alignment #' `align`. Providing these args via `...` will cause an error. -#' @inheritParams epi_slide_opt -#' @inherit epi_slide_opt return -#' @inherit epi_slide_opt details +#' +#' @template opt-slide-details #' #' @export #' @seealso [`epi_slide`] [`epi_slide_opt`] [`epi_slide_mean`] @@ -898,13 +783,13 @@ epi_slide_mean <- function(x, col_names, ..., before, after, ref_time_values, #' # slide a 7-day trailing sum formula on cases #' jhu_csse_daily_subset %>% #' group_by(geo_value) %>% -#' epi_slide_sum(cases, new_col_names = "cases_7dsum", names_sep = NULL, before = 6) %>% +#' epi_slide_sum(cases, new_col_name = "cases_7dsum", names_sep = NULL, before = 6) %>% #' # Remove a nonessential var. 
to ensure new col is printed #' dplyr::select(geo_value, time_value, cases, cases_7dsum) %>% #' ungroup() epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values, time_step, - new_col_names = "slide_value", as_list_col = NULL, + new_col_name = "slide_value", as_list_col = NULL, names_sep = "_", all_rows = FALSE) { epi_slide_opt( x = x, @@ -915,7 +800,7 @@ epi_slide_sum <- function(x, col_names, ..., before, after, ref_time_values, after = after, ref_time_values = ref_time_values, time_step = time_step, - new_col_names = new_col_names, + new_col_name = new_col_name, as_list_col = as_list_col, names_sep = names_sep, all_rows = all_rows @@ -943,7 +828,7 @@ full_date_seq <- function(x, before, after, time_step) { # `tsibble` classes apparently can't be added to in different units, so even # if `time_step` is provided by the user, use a value-1 unitless step. if (inherits(x$time_value, c("yearquarter", "yearweek", "yearmonth")) || - is.numeric(x$time_value)) { + is.numeric(x$time_value)) { # nolint: indentation_linter all_dates <- seq(min(x$time_value), max(x$time_value), by = 1L) if (before != 0) { diff --git a/man-roxygen/basic-slide-params.R b/man-roxygen/basic-slide-params.R new file mode 100644 index 00000000..383c102d --- /dev/null +++ b/man-roxygen/basic-slide-params.R @@ -0,0 +1,45 @@ +#' @param x The `epi_df` object under consideration, [grouped][dplyr::group_by] +#' or ungrouped. If ungrouped, all data in `x` will be treated as part of a +#' single data group. +#' @param before,after How far `before` and `after` each `ref_time_value` should +#' the sliding window extend? At least one of these two arguments must be +#' provided; the other's default will be 0. Any value provided for either +#' argument must be a single, non-`NA`, non-negative, +#' [integer-compatible][vctrs::vec_cast] number of time steps. Endpoints of +#' the window are inclusive. 
Common settings: +#' * For trailing/right-aligned windows from +#' `ref_time_value - time_step(k)` to `ref_time_value`: either pass `before=k` +#' by itself, or pass `before=k, after=0`. +#' * For center-aligned windows from `ref_time_value - time_step(k)` to +#' `ref_time_value + time_step(k)`: pass `before=k, after=k`. +#' * For leading/left-aligned windows from `ref_time_value` to +#' `ref_time_value + time_step(k)`: either pass `after=k` by itself, +#' or pass `before=0, after=k`. +#' See "Details:" about the definition of a time step, (non)treatment of +#' missing rows within the window, and avoiding warnings about +#' `before` & `after` settings for a certain uncommon use case. +#' @param ref_time_values Time values for sliding computations, meaning, each +#' element of this vector serves as the reference time point for one sliding +#' window. If missing, then this will be set to all unique time values in the +#' underlying data table, by default. +#' @param time_step Optional function used to define the meaning of one time +#' step, which if specified, overrides the default choice based on the +#' `time_value` column. This function must take a non-negative integer and +#' return an object of class [lubridate::period]. For example, we can use +#' `time_step = lubridate::hours` in order to set the time step to be one hour +#' (this would only be meaningful if `time_value` is of class `POSIXct`). +#' @param names_sep String specifying the separator to use in `tidyr::unnest()` +#' when `as_list_col = FALSE`. Default is "_". Using `NULL` drops the prefix +#' from `new_col_name` entirely. +#' @param all_rows If `all_rows = TRUE`, then all rows of `x` will be kept in +#' the output even with `ref_time_values` provided, with some type of missing +#' value marker for the slide computation output column(s) for `time_value`s +#' outside `ref_time_values`; otherwise, there will be one row for each row in +#' `x` that had a `time_value` in `ref_time_values`.
Default is `FALSE`. The +#' missing value marker is the result of `vctrs::vec_cast`ing `NA` to the type +#' of the slide computation output. If using `as_list_col = TRUE`, note that +#' the missing marker is a `NULL` entry in the list column; for certain +#' operations, you might want to replace these `NULL` entries with a different +#' `NA` marker. +#' @return An `epi_df` object given by appending one or more new columns to +#' `x`, named according to the `new_col_name` argument. diff --git a/man-roxygen/detect-outlr-return.R b/man-roxygen/detect-outlr-return.R new file mode 100644 index 00000000..50222e0e --- /dev/null +++ b/man-roxygen/detect-outlr-return.R @@ -0,0 +1,3 @@ +#' @return A tibble with number of rows equal to `length(y)` and columns +#' giving the outlier detection thresholds (`lower` and `upper`) and +#' replacement values from each detection method (`replacement`). diff --git a/man-roxygen/epi_df-params.R b/man-roxygen/epi_df-params.R new file mode 100644 index 00000000..54d8c2d2 --- /dev/null +++ b/man-roxygen/epi_df-params.R @@ -0,0 +1,19 @@ +#' @param x A data.frame, [tibble::tibble], or [tsibble::tsibble] to be converted +#' @param geo_type Type for the geo values. If missing, then the function will +#' attempt to infer it from the geo values present; if this fails, then it +#' will be set to "custom". +#' @param time_type Type for the time values. If missing, then the function will +#' attempt to infer it from the time values present; if this fails, then it +#' will be set to "custom". +#' @param as_of Time value representing the time at which the given data were +#' available. For example, if `as_of` is January 31, 2022, then the `epi_df` +#' object that is created would represent the most up-to-date version of the +#' data available as of January 31, 2022. If the `as_of` argument is missing, +#' then the current day-time will be used. +#' @param additional_metadata List of additional metadata to attach to the +#' `epi_df` object.
The metadata will have `geo_type`, `time_type`, and +#' `as_of` fields; named entries from the passed list will be included as +#' well. If your tibble has additional keys, be sure to specify them as a +#' character vector in the `other_keys` component of `additional_metadata`. +#' @param ... Additional arguments passed to methods. +#' @return An `epi_df` object. diff --git a/man-roxygen/opt-slide-details.R b/man-roxygen/opt-slide-details.R new file mode 100644 index 00000000..33fb437c --- /dev/null +++ b/man-roxygen/opt-slide-details.R @@ -0,0 +1,25 @@ +#' @details To "slide" means to apply a function over a rolling window of time +#' steps for each data group, where the window is centered at a reference +#' time and left and right endpoints are given by the `before` and `after` +#' arguments. The unit (the meaning of one time step) is implicitly defined +#' by the way the `time_value` column treats addition and subtraction; for +#' example, if the time values are coded as `Date` objects, then one time +#' step is one day, since `as.Date("2022-01-01") + 1` equals +#' `as.Date("2022-01-02")`. Alternatively, the time step can be set explicitly +#' using the `time_step` argument (which if specified would override the +#' default choice based on `time_value` column). If there are not enough time +#' steps available to complete the window at any given reference time, then +#' `epi_slide_*()` will fail; it requires a complete window to perform the +#' computation. For a centrally-aligned slide of `n` `time_value`s in a +#' sliding window, set `before = (n-1)/2` and `after = (n-1)/2` when the +#' number of `time_value`s in a sliding window is odd and `before = n/2-1` +#' and `after = n/2` when `n` is even. +#' +#' Sometimes, we want to experiment with various trailing or leading window +#' widths and compare the slide outputs.
In the (uncommon) case where +#' zero-width windows are considered, manually pass both the `before` and +#' `after` arguments in order to prevent potential warnings. (E.g., `before=k` +#' with `k=0` and `after` missing may produce a warning. To avoid warnings, +#' use `before=k, after=0` instead; otherwise, it looks too much like a +#' leading window was intended, but the `after` argument was forgotten or +#' misspelled.) diff --git a/man-roxygen/opt-slide-params.R b/man-roxygen/opt-slide-params.R new file mode 100644 index 00000000..a7d5b04a --- /dev/null +++ b/man-roxygen/opt-slide-params.R @@ -0,0 +1,8 @@ +#' @param col_names A single tidyselection or a tidyselection vector of the +#' names of one or more columns for which to calculate the rolling statistic. +#' @param as_list_col Not supported. Included to match `epi_slide` interface. +#' @param new_col_name Character vector indicating the name(s) of the new +#' column(s) that will contain the computed values. Default +#' is "slide_value"; note that setting `new_col_name` equal to any existing +#' column names will overwrite those columns. If `names_sep` is `NULL`, +#' `new_col_name` must be the same length as `col_names`. diff --git a/man-roxygen/outlier-detection-options.R b/man-roxygen/outlier-detection-options.R new file mode 100644 index 00000000..4b4260e5 --- /dev/null +++ b/man-roxygen/outlier-detection-options.R @@ -0,0 +1,15 @@ +#' @param log_transform Should a log transform be applied before running outlier +#' detection? Default is `FALSE`. If `TRUE`, and zeros are present, then the +#' log transform will be padded by 1. +#' @param detect_negatives Should negative values automatically count as +#' outliers? Default is `FALSE`. +#' @param detection_multiplier Value determining how far the outlier detection +#' thresholds are from the rolling median, which are calculated as (rolling +#' median) +/- (detection multiplier) * (rolling IQR). Default is 2.
+#' @param min_radius Minimum distance between rolling median and threshold, on +#' transformed scale. Default is 0. +#' @param replacement_multiplier Value determining how far the replacement +#' values are from the rolling median. The replacement is the original value +#' if it is within the detection thresholds, or otherwise it is rounded to the +#' nearest (rolling median) +/- (replacement multiplier) * (rolling IQR). +#' Default is 0. diff --git a/man-roxygen/x-y.R b/man-roxygen/x-y.R new file mode 100644 index 00000000..a4f9d1d7 --- /dev/null +++ b/man-roxygen/x-y.R @@ -0,0 +1,4 @@ +#' @param x Design points corresponding to the signal values `y`. Default is +#' `seq_along(y)` (that is, equally-spaced points from 1 to the length of +#' `y`). +#' @param y Signal values. diff --git a/man-roxygen/x.R b/man-roxygen/x.R new file mode 100644 index 00000000..a26f9f25 --- /dev/null +++ b/man-roxygen/x.R @@ -0,0 +1 @@ +#' @param x an `epi_df` diff --git a/man/as_tibble.epi_df.Rd b/man/as_tibble.epi_df.Rd index c314f47e..5913a5e7 100644 --- a/man/as_tibble.epi_df.Rd +++ b/man/as_tibble.epi_df.Rd @@ -9,7 +9,7 @@ \arguments{ \item{x}{an \code{epi_df}} -\item{...}{arguments to forward to \code{NextMethod()}} +\item{...}{additional arguments to forward to \code{NextMethod()}} } \description{ Converts an \code{epi_df} object into a tibble, dropping metadata and any diff --git a/man/as_tsibble.epi_df.Rd b/man/as_tsibble.epi_df.Rd index 98ca7f74..73200c3b 100644 --- a/man/as_tsibble.epi_df.Rd +++ b/man/as_tsibble.epi_df.Rd @@ -7,7 +7,7 @@ \method{as_tsibble}{epi_df}(x, key, ...) } \arguments{ -\item{x}{The \code{epi_df} object.} +\item{x}{an \code{epi_df}} \item{key}{Optional. Any additional keys (other than \code{geo_value}) to add to the \code{tsibble}.} diff --git a/man/detect_outlr.Rd b/man/detect_outlr.Rd index 3a793ebf..4263a64b 100644 --- a/man/detect_outlr.Rd +++ b/man/detect_outlr.Rd @@ -38,9 +38,9 @@ summarized results are calculated. 
Note that if the number of \code{methods} purposes of determining whether a given observation is an outlier.} } \value{ -An tibble with number of rows equal to \code{length(y)} and columns giving -the outlier detection thresholds and replacement values from each detection -method. +A tibble with number of rows equal to \code{length(y)} and columns +giving the outlier detection thresholds (\code{lower} and \code{upper}) and +replacement values from each detection method (\code{replacement}). } \description{ Applies one or more outlier detection methods to a given signal variable, and diff --git a/man/detect_outlr_rm.Rd b/man/detect_outlr_rm.Rd index 0d011619..333c4a7b 100644 --- a/man/detect_outlr_rm.Rd +++ b/man/detect_outlr_rm.Rd @@ -49,8 +49,9 @@ nearest (rolling median) +/- (replacement multiplier) * (rolling IQR). Default is 0.} } \value{ -A tibble with number of rows equal to \code{length(y)}, and columns -\code{lower}, \code{upper}, and \code{replacement}. +A tibble with number of rows equal to \code{length(y)} and columns +giving the outlier detection thresholds (\code{lower} and \code{upper}) and +replacement values from each detection method (\code{replacement}). } \description{ Detects outliers based on a distance from the rolling median specified in diff --git a/man/detect_outlr_stl.Rd b/man/detect_outlr_stl.Rd index 34a550d5..2b518451 100644 --- a/man/detect_outlr_stl.Rd +++ b/man/detect_outlr_stl.Rd @@ -59,8 +59,9 @@ nearest (rolling median) +/- (replacement multiplier) * (rolling IQR). Default is 0.} } \value{ -A tibble with number of rows equal to \code{length(y)}, and columns -\code{lower}, \code{upper}, and \code{replacement}. +A tibble with number of rows equal to \code{length(y)} and columns +giving the outlier detection thresholds (\code{lower} and \code{upper}) and +replacement values from each detection method (\code{replacement}). } \description{ Detects outliers based on a seasonal-trend decomposition using LOESS (STL).
diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd index 2fe1dce6..0d0dfb55 100644 --- a/man/epi_slide.Rd +++ b/man/epi_slide.Rd @@ -72,7 +72,7 @@ underlying data table, by default.} \item{time_step}{Optional function used to define the meaning of one time step, which if specified, overrides the default choice based on the \code{time_value} column. This function must take a non-negative integer and -return an object of class \code{lubridate::period}. For example, we can use +return an object of class \link[lubridate:period]{lubridate::period}. For example, we can use \code{time_step = lubridate::hours} in order to set the time step to be one hour (this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} @@ -103,16 +103,17 @@ operations, you might want to replace these \code{NULL} entries with a different \code{NA} marker.} } \value{ -An \code{epi_df} object given by appending a new column to \code{x}, named -according to the \code{new_col_name} argument. +An \code{epi_df} object given by appending one or more new columns to +\code{x}, named according to the \code{new_col_name} argument. } \description{ -Slides a given function over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for -examples. +Slides a given function over variables in an \code{epi_df} object. See the +\href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} +for examples. } \details{ To "slide" means to apply a function or formula over a rolling -window of time steps for each data group, where the window is entered at a +window of time steps for each data group, where the window is centered at a reference time and left and right endpoints are given by the \code{before} and \code{after} arguments. 
The unit (the meaning of one time step) is implicitly defined by the way the \code{time_value} column treats addition and subtraction; diff --git a/man/epi_slide_mean.Rd b/man/epi_slide_mean.Rd index 19b6fcec..ee3e7838 100644 --- a/man/epi_slide_mean.Rd +++ b/man/epi_slide_mean.Rd @@ -12,7 +12,7 @@ epi_slide_mean( after, ref_time_values, time_step, - new_col_names = "slide_value", + new_col_name = "slide_value", as_list_col = NULL, names_sep = "_", all_rows = FALSE @@ -62,15 +62,17 @@ return an object of class \link[lubridate:period]{lubridate::period}. For exampl \code{time_step = lubridate::hours} in order to set the time step to be one hour (this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} -\item{new_col_names}{String indicating the name of the new column that will -contain the derivative values. Default is "slide_value"; note that setting -\code{new_col_names} equal to an existing column name will overwrite this column.} +\item{new_col_name}{Character vector indicating the name(s) of the new +column(s) that will contain the derivative values. Default +is "slide_value"; note that setting \code{new_col_name} equal to any existing +column names will overwrite those columns. If \code{names_sep} is \code{NULL}, +\code{new_col_name} must be the same length as \code{col_names}.} \item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.} \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} when \code{as_list_col = FALSE}. Default is "_". 
Using \code{NULL} drops the prefix -from \code{new_col_names} entirely.} +from \code{new_col_name} entirely.} \item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in the output even with \code{ref_time_values} provided, with some type of missing @@ -78,12 +80,14 @@ value marker for the slide computation output column(s) for \code{time_value}s outside \code{ref_time_values}; otherwise, there will be one row for each row in \code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type -of the slide computation output.} +of the slide computation output. If using \code{as_list_col = TRUE}, note that +the missing marker is a \code{NULL} entry in the list column; for certain +operations, you might want to replace these \code{NULL} entries with a different +\code{NA} marker.} } \value{ An \code{epi_df} object given by appending one or more new columns to -\code{x}, depending on the \code{col_names} argument, named according to the -\code{new_col_names} argument. +\code{x}, named according to the \code{new_col_name} argument. } \description{ Slides an n-timestep mean over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for @@ -91,12 +95,37 @@ examples. } \details{ Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollmean}. + +To "slide" means to apply a function over a rolling window of time +steps for each data group, where the window is centered at a reference +time and left and right endpoints are given by the \code{before} and \code{after} +arguments. 
The unit (the meaning of one time step) is implicitly defined +by the way the \code{time_value} column treats addition and subtraction; for +example, if the time values are coded as \code{Date} objects, then one time +step is one day, since \code{as.Date("2022-01-01") + 1} equals \code{as.Date ("2022-01-02")}. Alternatively, the time step can be set explicitly using +the \code{time_step} argument (which if specified would override the default +choice based on \code{time_value} column). If there are not enough time steps +available to complete the window at any given reference time, then +\verb{epi_slide_*()} will fail; it requires a complete window to perform the +computation. For a centrally-aligned slide of \code{n} \code{time_value}s in a +sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the +number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} +and \code{after = n/2} when \code{n} is even. + +Sometimes, we want to experiment with various trailing or leading window +widths and compare the slide outputs. In the (uncommon) case where +zero-width windows are considered, manually pass both the \code{before} and +\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} +with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, +use \verb{before=k, after=0} instead; otherwise, it looks too much like a +leading window was intended, but the \code{after} argument was forgotten or +misspelled.) } \examples{ # slide a 7-day trailing average formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 6) \%>\% + epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 6) \%>\% # Remove a nonessential var. 
to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() @@ -106,7 +135,7 @@ jhu_csse_daily_subset \%>\% jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_mean(cases, - new_col_names = "cases_7dav", names_sep = NULL, before = 6, + new_col_name = "cases_7dav", names_sep = NULL, before = 6, # `frollmean` options na.rm = TRUE, algo = "exact", hasNA = TRUE ) \%>\% @@ -116,7 +145,7 @@ jhu_csse_daily_subset \%>\% # slide a 7-day leading average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, after = 6) \%>\% + epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, after = 6) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() @@ -124,7 +153,7 @@ jhu_csse_daily_subset \%>\% # slide a 7-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3) \%>\% + epi_slide_mean(cases, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% ungroup() @@ -132,7 +161,7 @@ jhu_csse_daily_subset \%>\% # slide a 14-day centre-aligned average jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_mean(cases, new_col_names = "cases_14dav", names_sep = NULL, before = 6, after = 7) \%>\% + epi_slide_mean(cases, new_col_name = "cases_14dav", names_sep = NULL, before = 6, after = 7) \%>\% # Remove a nonessential var. 
to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_14dav) \%>\% ungroup() diff --git a/man/epi_slide_opt.Rd b/man/epi_slide_opt.Rd index 9c9cf6c4..0772b431 100644 --- a/man/epi_slide_opt.Rd +++ b/man/epi_slide_opt.Rd @@ -13,7 +13,7 @@ epi_slide_opt( after, ref_time_values, time_step, - new_col_names = "slide_value", + new_col_name = "slide_value", as_list_col = NULL, names_sep = "_", all_rows = FALSE @@ -83,15 +83,17 @@ return an object of class \link[lubridate:period]{lubridate::period}. For exampl \code{time_step = lubridate::hours} in order to set the time step to be one hour (this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} -\item{new_col_names}{String indicating the name of the new column that will -contain the derivative values. Default is "slide_value"; note that setting -\code{new_col_names} equal to an existing column name will overwrite this column.} +\item{new_col_name}{Character vector indicating the name(s) of the new +column(s) that will contain the derivative values. Default +is "slide_value"; note that setting \code{new_col_name} equal to any existing +column names will overwrite those columns. If \code{names_sep} is \code{NULL}, +\code{new_col_name} must be the same length as \code{col_names}.} \item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.} \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} when \code{as_list_col = FALSE}. Default is "_". 
Using \code{NULL} drops the prefix -from \code{new_col_names} entirely.} +from \code{new_col_name} entirely.} \item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in the output even with \code{ref_time_values} provided, with some type of missing @@ -99,37 +101,37 @@ value marker for the slide computation output column(s) for \code{time_value}s outside \code{ref_time_values}; otherwise, there will be one row for each row in \code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type -of the slide computation output.} +of the slide computation output. If using \code{as_list_col = TRUE}, note that +the missing marker is a \code{NULL} entry in the list column; for certain +operations, you might want to replace these \code{NULL} entries with a different +\code{NA} marker.} } \value{ An \code{epi_df} object given by appending one or more new columns to -\code{x}, depending on the \code{col_names} argument, named according to the -\code{new_col_names} argument. +\code{x}, named according to the \code{new_col_name} argument. } \description{ -Slides an n-timestep mean over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for -examples. +Slides an n-timestep \link[data.table:froll]{data.table::froll} or \link[slider:summary-slide]{slider::summary-slide} function +over variables in an \code{epi_df} object. See the +\href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} +for examples. } \details{ -To "slide" means to apply a function or formula over a rolling -window of time steps for each data group, where the window is entered at a -reference time and left and right endpoints are given by the \code{before} and -\code{after} arguments. 
The unit (the meaning of one time step) is implicitly -defined by the way the \code{time_value} column treats addition and subtraction; -for example, if the time values are coded as \code{Date} objects, then one time -step is one day, since \code{as.Date("2022-01-01") + 1} equals -\code{as.Date("2022-01-02")}. Alternatively, the time step can be set explicitly -using the \code{time_step} argument (which if specified would override the -default choice based on \code{time_value} column). If there are not enough time -steps available to complete the window at any given reference time, then -\code{epi_slide()} still attempts to perform the computation anyway (it does not -require a complete window). The issue of what to do with partial -computations (those run on incomplete windows) is therefore left up to the -user, either through the specified function or formula \code{f}, or through -post-processing. For a centrally-aligned slide of \code{n} \code{time_value}s in a +To "slide" means to apply a function over a rolling window of time +steps for each data group, where the window is centered at a reference +time and left and right endpoints are given by the \code{before} and \code{after} +arguments. The unit (the meaning of one time step) is implicitly defined +by the way the \code{time_value} column treats addition and subtraction; for +example, if the time values are coded as \code{Date} objects, then one time +step is one day, since \code{as.Date("2022-01-01") + 1} equals \code{as.Date ("2022-01-02")}. Alternatively, the time step can be set explicitly using +the \code{time_step} argument (which if specified would override the default +choice based on \code{time_value} column). If there are not enough time steps +available to complete the window at any given reference time, then +\verb{epi_slide_*()} will fail; it requires a complete window to perform the +computation. 
For a centrally-aligned slide of \code{n} \code{time_value}s in a sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the -number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} and -\code{after = n/2} when \code{n} is even. +number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} +and \code{after = n/2} when \code{n} is even. Sometimes, we want to experiment with various trailing or leading window widths and compare the slide outputs. In the (uncommon) case where @@ -146,7 +148,7 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( cases, - f = data.table::frollmean, new_col_names = "cases_7dav", names_sep = NULL, before = 6 + f = data.table::frollmean, new_col_name = "cases_7dav", names_sep = NULL, before = 6 ) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% @@ -158,7 +160,7 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt(cases, f = data.table::frollmean, - new_col_names = "cases_7dav", names_sep = NULL, before = 6, + new_col_name = "cases_7dav", names_sep = NULL, before = 6, # `frollmean` options na.rm = TRUE, algo = "exact", hasNA = TRUE ) \%>\% @@ -170,7 +172,7 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( cases, - f = slider::slide_mean, new_col_names = "cases_7dav", names_sep = NULL, after = 6 + f = slider::slide_mean, new_col_name = "cases_7dav", names_sep = NULL, after = 6 ) \%>\% # Remove a nonessential var. to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% @@ -181,7 +183,7 @@ jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% epi_slide_opt( cases, - f = data.table::frollsum, new_col_names = "cases_7dav", names_sep = NULL, before = 3, after = 3 + f = data.table::frollsum, new_col_name = "cases_7dav", names_sep = NULL, before = 3, after = 3 ) \%>\% # Remove a nonessential var. 
to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dav) \%>\% diff --git a/man/epi_slide_sum.Rd b/man/epi_slide_sum.Rd index 001bafdf..d5961f27 100644 --- a/man/epi_slide_sum.Rd +++ b/man/epi_slide_sum.Rd @@ -12,7 +12,7 @@ epi_slide_sum( after, ref_time_values, time_step, - new_col_names = "slide_value", + new_col_name = "slide_value", as_list_col = NULL, names_sep = "_", all_rows = FALSE @@ -62,15 +62,17 @@ return an object of class \link[lubridate:period]{lubridate::period}. For exampl \code{time_step = lubridate::hours} in order to set the time step to be one hour (this would only be meaningful if \code{time_value} is of class \code{POSIXct}).} -\item{new_col_names}{String indicating the name of the new column that will -contain the derivative values. Default is "slide_value"; note that setting -\code{new_col_names} equal to an existing column name will overwrite this column.} +\item{new_col_name}{Character vector indicating the name(s) of the new +column(s) that will contain the derivative values. Default +is "slide_value"; note that setting \code{new_col_name} equal to any existing +column names will overwrite those columns. If \code{names_sep} is \code{NULL}, +\code{new_col_name} must be the same length as \code{col_names}.} \item{as_list_col}{Not supported. Included to match \code{epi_slide} interface.} \item{names_sep}{String specifying the separator to use in \code{tidyr::unnest()} when \code{as_list_col = FALSE}. Default is "_". 
Using \code{NULL} drops the prefix -from \code{new_col_names} entirely.} +from \code{new_col_name} entirely.} \item{all_rows}{If \code{all_rows = TRUE}, then all rows of \code{x} will be kept in the output even with \code{ref_time_values} provided, with some type of missing @@ -78,25 +80,52 @@ value marker for the slide computation output column(s) for \code{time_value}s outside \code{ref_time_values}; otherwise, there will be one row for each row in \code{x} that had a \code{time_value} in \code{ref_time_values}. Default is \code{FALSE}. The missing value marker is the result of \code{vctrs::vec_cast}ing \code{NA} to the type -of the slide computation output.} +of the slide computation output. If using \code{as_list_col = TRUE}, note that +the missing marker is a \code{NULL} entry in the list column; for certain +operations, you might want to replace these \code{NULL} entries with a different +\code{NA} marker.} } \value{ An \code{epi_df} object given by appending one or more new columns to -\code{x}, depending on the \code{col_names} argument, named according to the -\code{new_col_names} argument. +\code{x}, named according to the \code{new_col_name} argument. } \description{ -Slides an n-timestep mean over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for +Slides an n-timestep sum over variables in an \code{epi_df} object. See the \href{https://cmu-delphi.github.io/epiprocess/articles/slide.html}{slide vignette} for examples. } \details{ Wrapper around \code{epi_slide_opt} with \code{f = datatable::frollsum}. + +To "slide" means to apply a function over a rolling window of time +steps for each data group, where the window is centered at a reference +time and left and right endpoints are given by the \code{before} and \code{after} +arguments. 
The unit (the meaning of one time step) is implicitly defined +by the way the \code{time_value} column treats addition and subtraction; for +example, if the time values are coded as \code{Date} objects, then one time +step is one day, since \code{as.Date("2022-01-01") + 1} equals \code{as.Date ("2022-01-02")}. Alternatively, the time step can be set explicitly using +the \code{time_step} argument (which if specified would override the default +choice based on \code{time_value} column). If there are not enough time steps +available to complete the window at any given reference time, then +\verb{epi_slide_*()} will fail; it requires a complete window to perform the +computation. For a centrally-aligned slide of \code{n} \code{time_value}s in a +sliding window, set \code{before = (n-1)/2} and \code{after = (n-1)/2} when the +number of \code{time_value}s in a sliding window is odd and \code{before = n/2-1} +and \code{after = n/2} when \code{n} is even. + +Sometimes, we want to experiment with various trailing or leading window +widths and compare the slide outputs. In the (uncommon) case where +zero-width windows are considered, manually pass both the \code{before} and +\code{after} arguments in order to prevent potential warnings. (E.g., \code{before=k} +with \code{k=0} and \code{after} missing may produce a warning. To avoid warnings, +use \verb{before=k, after=0} instead; otherwise, it looks too much like a +leading window was intended, but the \code{after} argument was forgotten or +misspelled.) } \examples{ # slide a 7-day trailing sum formula on cases jhu_csse_daily_subset \%>\% group_by(geo_value) \%>\% - epi_slide_sum(cases, new_col_names = "cases_7dsum", names_sep = NULL, before = 6) \%>\% + epi_slide_sum(cases, new_col_name = "cases_7dsum", names_sep = NULL, before = 6) \%>\% # Remove a nonessential var. 
to ensure new col is printed dplyr::select(geo_value, time_value, cases, cases_7dsum) \%>\% ungroup() diff --git a/man/print.epi_df.Rd b/man/print.epi_df.Rd index f5749d82..5a232de0 100644 --- a/man/print.epi_df.Rd +++ b/man/print.epi_df.Rd @@ -22,20 +22,20 @@ \method{unnest}{epi_df}(data, ...) } \arguments{ -\item{x}{The \code{epi_df} object.} +\item{x}{an \code{epi_df}} \item{...}{Additional arguments, for compatibility with \code{summary()}. Currently unused.} -\item{object}{The \code{epi_df} object.} +\item{object}{an \code{epi_df}} -\item{.data}{The \code{epi_df} object.} +\item{.data}{an \code{epi_df}} \item{.f}{function or formula; see \code{\link[dplyr:group_map]{dplyr::group_modify}}} \item{.keep}{Boolean; see \code{\link[dplyr:group_map]{dplyr::group_modify}}} -\item{data}{The \code{epi_df} object.} +\item{data}{an \code{epi_df}} } \description{ Print and summary functions for an \code{epi_df} object.