diff --git a/.Rbuildignore b/.Rbuildignore index 510725267..dc41e6223 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -20,4 +20,5 @@ ^doc$ ^Meta$ ^.lintr$ -^.venv$ \ No newline at end of file +^.venv$ +^inst/templates$ diff --git a/DESCRIPTION b/DESCRIPTION index 5cd468fb9..d3369cf23 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: epipredict Title: Basic epidemiology forecasting methods -Version: 0.1.0 +Version: 0.1.1 Authors@R: c( person("Daniel J.", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")), person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"), @@ -40,6 +40,7 @@ Imports: magrittr, recipes (>= 1.0.4), rlang (>= 1.1.0), + purrr, stats, tibble, tidyr, diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 67f1b3003..8bc251e77 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -32,7 +32,9 @@ The `main` version is available at `file:////epidatr/epipredict/inde You can also build the docs manually and launch the site with python. From the terminal, this looks like ```bash +R -e 'pkgdown::clean_site()' R -e 'devtools::document()' +R -e 'pkgdown::build_site()' python -m http.server -d docs ``` diff --git a/NAMESPACE b/NAMESPACE index e815203eb..86b77716b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -19,6 +19,7 @@ S3method(autoplot,canned_epipred) S3method(autoplot,epi_workflow) S3method(bake,check_enough_train_data) S3method(bake,epi_recipe) +S3method(bake,step_adjust_latency) S3method(bake,step_epi_ahead) S3method(bake,step_epi_lag) S3method(bake,step_epi_slide) @@ -58,6 +59,7 @@ S3method(predict,epi_workflow) S3method(predict,flatline) S3method(prep,check_enough_train_data) S3method(prep,epi_recipe) +S3method(prep,step_adjust_latency) S3method(prep,step_epi_ahead) S3method(prep,step_epi_lag) S3method(prep,step_epi_slide) @@ -87,6 +89,7 @@ S3method(print,layer_quantile_distn) S3method(print,layer_residual_quantiles) S3method(print,layer_threshold) S3method(print,layer_unnest) +S3method(print,step_adjust_latency) S3method(print,step_epi_ahead) S3method(print,step_epi_lag) S3method(print,step_epi_slide) @@ -195,6 +198,7 @@ export(remove_frosting) export(remove_model) export(slather) export(smooth_quantile_reg) +export(step_adjust_latency) export(step_epi_ahead) export(step_epi_lag) export(step_epi_naomit) @@ -225,6 +229,7 @@ importFrom(checkmate,test_numeric) importFrom(checkmate,test_scalar) importFrom(cli,cli_abort) importFrom(cli,cli_warn) +importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,all_of) importFrom(dplyr,any_of) @@ -235,13 +240,20 @@ importFrom(dplyr,everything) importFrom(dplyr,filter) importFrom(dplyr,full_join) importFrom(dplyr,group_by) +importFrom(dplyr,group_by_at) +importFrom(dplyr,join_by) importFrom(dplyr,left_join) importFrom(dplyr,mutate) +importFrom(dplyr,n) +importFrom(dplyr,pull) importFrom(dplyr,relocate) importFrom(dplyr,rename) +importFrom(dplyr,rowwise) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,summarize) +importFrom(dplyr,tibble) +importFrom(dplyr,tribble) importFrom(dplyr,ungroup) importFrom(epiprocess,epi_slide) importFrom(epiprocess,growth_rate) @@ -255,10 +267,13 @@ importFrom(ggplot2,geom_line) importFrom(ggplot2,geom_linerange) importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_ribbon) +importFrom(glue,glue) +importFrom(hardhat,extract_recipe) importFrom(hardhat,refresh_blueprint) importFrom(hardhat,run_mold) importFrom(magrittr,"%>%") importFrom(recipes,bake) +importFrom(recipes,detect_step) importFrom(recipes,prep) importFrom(recipes,rand_id) 
importFrom(rlang,"!!!") @@ -266,7 +281,6 @@ importFrom(rlang,"!!") importFrom(rlang,"%@%") importFrom(rlang,"%||%") importFrom(rlang,":=") -importFrom(rlang,abort) importFrom(rlang,arg_match) importFrom(rlang,as_function) importFrom(rlang,caller_arg) @@ -276,9 +290,11 @@ importFrom(rlang,enquos) importFrom(rlang,expr) importFrom(rlang,global_env) importFrom(rlang,inject) +importFrom(rlang,is_empty) importFrom(rlang,is_logical) importFrom(rlang,is_null) importFrom(rlang,is_true) +importFrom(rlang,list2) importFrom(rlang,set_names) importFrom(rlang,sym) importFrom(stats,as.formula) @@ -286,6 +302,7 @@ importFrom(stats,family) importFrom(stats,lm) importFrom(stats,median) importFrom(stats,model.frame) +importFrom(stats,na.omit) importFrom(stats,poly) importFrom(stats,predict) importFrom(stats,qnorm) @@ -294,6 +311,12 @@ importFrom(stats,residuals) importFrom(tibble,as_tibble) importFrom(tibble,tibble) importFrom(tidyr,crossing) +importFrom(tidyr,drop_na) +importFrom(tidyr,expand_grid) +importFrom(tidyr,fill) +importFrom(tidyr,unnest) +importFrom(tidyselect,all_of) +importFrom(utils,capture.output) importFrom(vctrs,as_list_of) importFrom(vctrs,field) importFrom(vctrs,new_rcrd) @@ -303,3 +326,4 @@ importFrom(vctrs,vec_data) importFrom(vctrs,vec_ptype_abbr) importFrom(vctrs,vec_ptype_full) importFrom(vctrs,vec_recycle_common) +importFrom(workflows,extract_preprocessor) diff --git a/NEWS.md b/NEWS.md index 8edddae92..ef71394a2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,13 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicate PR's. +# epipredict 0.2 + +## features +- Add `step_adjust_latency`, which give several methods to adjust the forecast if the `forecast_date` is after the last day of data. + +## bugfixes + # epipredict 0.1 - simplify `layer_residual_quantiles()` to avoid timesuck in `utils::methods()` diff --git a/R/arx_classifier.R b/R/arx_classifier.R index ca6a3537b..0aec0e362 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -55,11 +55,18 @@ arx_classifier <- function( wf <- arx_class_epi_workflow(epi_data, outcome, predictors, trainer, args_list) wf <- fit(wf, epi_data) + if (args_list$adjust_latency == "none") { + forecast_date_default <- max(epi_data$time_value) + if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { + cli_warn("The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is occurring {forecast_date}.") + } + } else { + forecast_date_default <- attributes(epi_data)$metadata$as_of + } + forecast_date <- args_list$forecast_date %||% forecast_date_default + target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) preds <- forecast( wf, - fill_locf = TRUE, - n_recent = args_list$nafill_buffer, - forecast_date = args_list$forecast_date %||% max(epi_data$time_value) ) %>% as_tibble() %>% select(-time_value) @@ -125,13 +132,25 @@ arx_class_epi_workflow <- function( if (!(is.null(trainer) || is_classification(trainer))) { cli_abort("`trainer` must be a {.pkg parsnip} model of mode 'classification'.") } + + if (args_list$adjust_latency == "none") { + forecast_date_default <- max(epi_data$time_value) + if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { + cli_warn("The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is occurring {forecast_date}.") + } + } else { + forecast_date_default <- attributes(epi_data)$metadata$as_of + } + forecast_date <- 
args_list$forecast_date %||% forecast_date_default + target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) + lags <- arx_lags_validator(predictors, args_list$lags) # --- preprocessor # ------- predictors r <- epi_recipe(epi_data) %>% step_growth_rate( - dplyr::all_of(predictors), + all_of(predictors), role = "grp", horizon = args_list$horizon, method = args_list$method, @@ -139,13 +158,13 @@ arx_class_epi_workflow <- function( additional_gr_args_list = args_list$additional_gr_args ) for (l in seq_along(lags)) { - p <- predictors[l] - p <- as.character(glue::glue_data(args_list, "gr_{horizon}_{method}_{p}")) - r <- step_epi_lag(r, !!p, lag = lags[[l]]) + pred_names <- predictors[l] + pred_names <- as.character(glue::glue_data(args_list, "gr_{horizon}_{method}_{pred_names}")) + r <- step_epi_lag(r, !!pred_names, lag = lags[[l]]) } # ------- outcome if (args_list$outcome_transform == "lag_difference") { - o <- as.character( + pre_out_name <- as.character( glue::glue_data(args_list, "lag_diff_{horizon}_{outcome}") ) r <- r %>% @@ -156,7 +175,7 @@ arx_class_epi_workflow <- function( ) } if (args_list$outcome_transform == "growth_rate") { - o <- as.character( + pre_out_name <- as.character( glue::glue_data(args_list, "gr_{horizon}_{method}_{outcome}") ) if (!(outcome %in% predictors)) { @@ -171,11 +190,30 @@ arx_class_epi_workflow <- function( ) } } - o2 <- rlang::sym(paste0("ahead_", args_list$ahead, "_", o)) + # regex that will match any amount of adjustment for the ahead + ahead_out_name_regex <- glue::glue("ahead_[0-9]*_{pre_out_name}") + method_adjust_latency <- args_list$adjust_latency + if (method_adjust_latency != "none") { + if (method_adjust_latency != "extend_ahead") { + cli_abort("only extend_ahead is currently supported", + class = "epipredict__arx_classifier__adjust_latency_unsupported_method" + ) + } + r <- r %>% step_adjust_latency(!!pre_out_name, + fixed_forecast_date = forecast_date, + method = method_adjust_latency + ) + } r <- r %>% - step_epi_ahead(!!o, ahead = args_list$ahead, role = "pre-outcome") %>% - recipes::step_mutate( - outcome_class = cut(!!o2, breaks = args_list$breaks), + step_epi_ahead(!!pre_out_name, ahead = args_list$ahead, role = "pre-outcome") + r <- r %>% + step_mutate( + across( + matches(ahead_out_name_regex), + ~ cut(.x, breaks = args_list$breaks), + .names = "outcome_class", + .unpack = TRUE + ), role = "outcome" ) %>% step_epi_naomit() %>% @@ -192,10 +230,6 @@ arx_class_epi_workflow <- function( ) } - - forecast_date <- args_list$forecast_date %||% max(epi_data$time_value) - target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) - # --- postprocessor f <- frosting() %>% layer_predict() # %>% layer_naomit() f <- layer_add_forecast_date(f, forecast_date = forecast_date) %>% @@ -260,13 +294,14 @@ arx_class_args_list <- function( n_training = Inf, forecast_date = NULL, target_date = NULL, + adjust_latency = c("none", "extend_ahead", "extend_lags", "locf"), + warn_latency = TRUE, outcome_transform = c("growth_rate", "lag_difference"), breaks = 0.25, horizon = 7L, method = c("rel_change", "linear_reg"), log_scale = FALSE, additional_gr_args = list(), - nafill_buffer = Inf, check_enough_data_n = NULL, check_enough_data_epi_keys = NULL, ...) 
{ @@ -276,7 +311,8 @@ arx_class_args_list <- function( method <- rlang::arg_match(method) outcome_transform <- rlang::arg_match(outcome_transform) - arg_is_scalar(ahead, n_training, horizon, log_scale) + adjust_latency <- rlang::arg_match(adjust_latency) + arg_is_scalar(ahead, n_training, horizon, log_scale, adjust_latency, warn_latency) arg_is_scalar(forecast_date, target_date, allow_null = TRUE) arg_is_date(forecast_date, target_date, allow_null = TRUE) arg_is_nonneg_int(ahead, lags, horizon) @@ -284,7 +320,6 @@ arx_class_args_list <- function( arg_is_lgl(log_scale) arg_is_pos(n_training) if (is.finite(n_training)) arg_is_pos_int(n_training) - if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE) if (!is.list(additional_gr_args)) { cli_abort(c( "`additional_gr_args` must be a {.cls list}.", @@ -297,10 +332,13 @@ arx_class_args_list <- function( if (!is.null(forecast_date) && !is.null(target_date)) { if (forecast_date + ahead != target_date) { - cli::cli_warn(c( - "`forecast_date` + `ahead` must equal `target_date`.", - i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." - )) + cli_warn( + paste0( + "`forecast_date` {.val {forecast_date}} +", + " `ahead` {.val {ahead}} must equal `target_date` {.val {target_date}}." + ), + class = "epipredict__arx_args__inconsistent_target_ahead_forecast_date" + ) } } @@ -318,13 +356,13 @@ arx_class_args_list <- function( breaks, forecast_date, target_date, + adjust_latency, outcome_transform, max_lags, horizon, method, log_scale, additional_gr_args, - nafill_buffer, check_enough_data_n, check_enough_data_epi_keys ), @@ -337,5 +375,3 @@ print.arx_class <- function(x, ...) { name <- "ARX Classifier" NextMethod(name = name, ...) } - -# this is a trivial change to induce a check diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 37c9aae86..bfd5eaec1 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -51,12 +51,16 @@ arx_forecaster <- function( wf <- arx_fcast_epi_workflow(epi_data, outcome, predictors, trainer, args_list) wf <- fit(wf, epi_data) - preds <- forecast( - wf, - fill_locf = TRUE, - n_recent = args_list$nafill_buffer, - forecast_date = args_list$forecast_date %||% max(epi_data$time_value) - ) %>% + # get the forecast date for the forecast function + if (args_list$adjust_latency == "none") { + forecast_date_default <- max(epi_data$time_value) + } else { + forecast_date_default <- attributes(epi_data)$metadata$as_of + } + forecast_date <- args_list$forecast_date %||% forecast_date_default + + preds <- forecast(wf, forecast_date = forecast_date) %>% as_tibble() %>% select(-time_value) @@ -119,22 +123,58 @@ arx_fcast_epi_workflow <- function( if (!(is.null(trainer) || is_regression(trainer))) { cli_abort("`trainer` must be a {.pkg parsnip} model of mode 'regression'.") } + # a user-specified forecast_date always takes precedence; + # otherwise, without latency adjustment it defaults to the max time_value, + # and with latency adjustment it defaults to the as_of + if (args_list$adjust_latency == "none") { + forecast_date_default <- max(epi_data$time_value) + if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { + cli_warn( + "The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is actually occurring {forecast_date_default}.", + class = "epipredict__arx_forecaster__forecast_date_defaulting" + ) + } + } else { + forecast_date_default <- attributes(epi_data)$metadata$as_of + } + forecast_date <-
args_list$forecast_date %||% forecast_date_default + target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) + if (forecast_date + args_list$ahead != target_date) { + cli_abort("`forecast_date` {.val {forecast_date}} + `ahead` {.val {args_list$ahead}} must equal `target_date` {.val {target_date}}.", + class = "epipredict__arx_forecaster__inconsistent_target_ahead_forecast_date" + ) + } + lags <- arx_lags_validator(predictors, args_list$lags) # --- preprocessor r <- epi_recipe(epi_data) + # adjust latency if the user asks + method_adjust_latency <- args_list$adjust_latency + if (method_adjust_latency != "none") { + if (method_adjust_latency == "extend_ahead") { + r <- r %>% step_adjust_latency(all_outcomes(), + fixed_forecast_date = forecast_date, + method = method_adjust_latency + ) + } else if (method_adjust_latency == "extend_lags") { + r <- r %>% step_adjust_latency(all_predictors(), + fixed_forecast_date = forecast_date, + method = method_adjust_latency + ) + } + } for (l in seq_along(lags)) { p <- predictors[l] r <- step_epi_lag(r, !!p, lag = lags[[l]]) } r <- r %>% - step_epi_ahead(!!outcome, ahead = args_list$ahead) %>% + step_epi_ahead(!!outcome, ahead = args_list$ahead) + r <- r %>% step_epi_naomit() %>% step_training_window(n_recent = args_list$n_training) - if (!is.null(args_list$check_enough_data_n)) { - r <- check_enough_train_data( - r, + r <- r %>% check_enough_train_data( all_predictors(), !!outcome, n = args_list$check_enough_data_n, @@ -144,9 +184,6 @@ arx_fcast_epi_workflow <- function( ) } - forecast_date <- args_list$forecast_date %||% max(epi_data$time_value) - target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) - # --- postprocessor f <- frosting() %>% layer_predict() # %>% layer_naomit() if (inherits(trainer, "quantile_reg")) { @@ -156,12 +193,12 @@ arx_fcast_epi_workflow <- function( rlang::eval_tidy(trainer$args$quantile_levels) )) args_list$quantile_levels <- quantile_levels - trainer$args$quantile_levels <- enquo(quantile_levels) - f <- layer_quantile_distn(f, quantile_levels = quantile_levels) %>% + trainer$args$quantile_levels <- rlang::enquo(quantile_levels) + f <- f %>% + layer_quantile_distn(quantile_levels = quantile_levels) %>% layer_point_from_distn() } else { - f <- layer_residual_quantiles( - f, + f <- f %>% layer_residual_quantiles( quantile_levels = args_list$quantile_levels, symmetrize = args_list$symmetrize, by_key = args_list$quantile_by_key @@ -187,10 +224,27 @@ arx_fcast_epi_workflow <- function( #' @param n_training Integer. An upper limit for the number of rows per #' key that are used for training #' (in the time unit of the `epi_df`). -#' @param forecast_date Date. The date on which the forecast is created. -#' The default `NULL` will attempt to determine this automatically. -#' @param target_date Date. The date for which the forecast is intended. -#' The default `NULL` will attempt to determine this automatically. +#' @param forecast_date Date. The date from which the forecast is occurring. +#' The default `NULL` will determine this automatically from either +#' 1. the maximum time value for which there's data if there is no latency +#' adjustment (the default case), or +#' 2. the `as_of` date of `epi_data` if `adjust_latency` is +#' not `"none"`. +#' @param target_date Date. The date that is being forecast. The default `NULL` +#' will determine this automatically as `forecast_date + ahead`. +#' @param adjust_latency Character.
One of the `method`s of +#' [step_adjust_latency()], or `"none"` (in which case there is no adjustment). +#' If the `forecast_date` is after the last day of data, this determines how +#' to shift the model to account for this difference. The options are: +#' - `"none"` the default, assumes the `forecast_date` is the last day of data +#' - `"extend_ahead"`: increase the `ahead` by the latency so it's relative to +#' the last day of data. For example, if the last day of data was 3 days ago, +#' the ahead becomes `ahead+3`. +#' - `"extend_lags"`: increase the lags so they're relative to the actual +#' forecast date. For example, if the lags are `c(0,7,14)` and the last day of +#' data was 3 days ago, the lags become `c(3,10,17)`. +#' @param warn_latency by default, `step_adjust_latency` warns the user if the +#' latency is large. If this is `FALSE`, that warning is turned off. #' @param quantile_levels Vector or `NULL`. A vector of probabilities to produce #' prediction intervals. These are created by computing the quantiles of #' training residuals. A `NULL` value will result in point forecasts only. @@ -206,15 +260,6 @@ arx_fcast_epi_workflow <- function( #' `character(0)` performs no grouping. This argument only applies when #' residual quantiles are used. It is not applicable with #' `trainer = quantile_reg()`, for example. -#' @param nafill_buffer At predict time, recent values of the training data -#' are used to create a forecast. However, these can be `NA` due to, e.g., -#' data latency issues. By default, any missing values will get filled with -#' less recent data. Setting this value to `NULL` will result in 1 extra -#' recent row (beyond those required for lag creation) to be used. Note that -#' we require at least `min(lags)` rows of recent data per `geo_value` to -#' create a prediction. For this reason, setting `nafill_buffer < min(lags)` -#' will be treated as _additional_ allowed recent data rather than the -#' total amount of recent data to examine. #' @param check_enough_data_n Integer. A lower limit for the number of rows per #' epi_key that are required for training. If `NULL`, this check is ignored. #' @param check_enough_data_epi_keys Character vector. A character vector of @@ -236,11 +281,12 @@ arx_args_list <- function( n_training = Inf, forecast_date = NULL, target_date = NULL, + adjust_latency = c("none", "extend_ahead", "extend_lags", "locf"), + warn_latency = TRUE, quantile_levels = c(0.05, 0.95), symmetrize = TRUE, nonneg = TRUE, quantile_by_key = character(0L), - nafill_buffer = Inf, check_enough_data_n = NULL, check_enough_data_epi_keys = NULL, ...) 
{ @@ -249,7 +295,8 @@ arx_args_list <- function( .lags <- lags if (is.list(lags)) lags <- unlist(lags) - arg_is_scalar(ahead, n_training, symmetrize, nonneg) + adjust_latency <- rlang::arg_match(adjust_latency) + arg_is_scalar(ahead, n_training, symmetrize, nonneg, adjust_latency, warn_latency) arg_is_chr(quantile_by_key, allow_empty = TRUE) arg_is_scalar(forecast_date, target_date, allow_null = TRUE) arg_is_date(forecast_date, target_date, allow_null = TRUE) @@ -258,16 +305,14 @@ arx_args_list <- function( arg_is_probabilities(quantile_levels, allow_null = TRUE) arg_is_pos(n_training) if (is.finite(n_training)) arg_is_pos_int(n_training) - if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE) arg_is_pos(check_enough_data_n, allow_null = TRUE) arg_is_chr(check_enough_data_epi_keys, allow_null = TRUE) if (!is.null(forecast_date) && !is.null(target_date)) { if (forecast_date + ahead != target_date) { - cli_warn(c( - "`forecast_date` + `ahead` must equal `target_date`.", - i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." - )) + cli_abort("`forecast_date` {.val {forecast_date}} + `ahead` {.val {ahead}} must equal `target_date` {.val {target_date}}.", + class = "epipredict__arx_args__inconsistent_target_ahead_forecast_date" + ) } } @@ -280,11 +325,12 @@ arx_args_list <- function( quantile_levels, forecast_date, target_date, + adjust_latency, + warn_latency, symmetrize, nonneg, max_lags, quantile_by_key, - nafill_buffer, check_enough_data_n, check_enough_data_epi_keys ), diff --git a/R/autoplot.R b/R/autoplot.R index 648c74e33..8bded03a3 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -217,7 +217,7 @@ autoplot.canned_epipred <- function( ewf <- object$epi_workflow predictions <- object$predictions %>% - dplyr::rename(time_value = target_date) + rename(time_value = target_date) autoplot(ewf, predictions, .color_by = .color_by, .facet_by = .facet_by, diff --git a/R/canned-epipred.R b/R/canned-epipred.R index 0adc0536a..1e088426e 100644 --- a/R/canned-epipred.R +++ b/R/canned-epipred.R @@ -63,6 +63,7 @@ print.alist <- function(x, ...) { } #' @export +#' @importFrom hardhat extract_recipe print.canned_epipred <- function(x, name, ...) { d <- cli::cli_div(theme = list(rule = list("line-type" = "double"))) cli::cli_rule("A basic forecaster of type {name}") @@ -81,6 +82,7 @@ print.canned_epipred <- function(x, name, ...) { } cli::cli_li("Time type: {.field {x$metadata$training$time_type}},") cli::cli_li("Using data up-to-date as of: {.field {format(x$metadata$training$as_of)}}.") + cli::cli_li("With the last data available on {.field {format(max(x$epi_workflow$original_data$time_value))}}.") cli::cli_end() } fn_meta() @@ -103,10 +105,43 @@ print.canned_epipred <- function(x, name, ...) { "A total of {.val {nrow(x$predictions)}} prediction{?s}", " {?is/are} available for" )) + cli::cli_ul(c( "{.val {n_geos}} unique geographic region{?s},", "At forecast date{?s}: {.val {fds}},", - "For target date{?s}: {.val {tds}}."
+ "For target date{?s}: {.val {tds}}," )) + fit_recipe <- extract_recipe(x$epi_workflow) + if (detect_step(fit_recipe, "adjust_latency")) { + is_adj_latency <- map_lgl(fit_recipe$steps, function(x) inherits(x, "step_adjust_latency")) + latency_step <- fit_recipe$steps[is_adj_latency][[1]] + # all steps after adjust_latency + later_steps <- fit_recipe$steps[-(1:which(is_adj_latency))] + if (latency_step$method == "extend_ahead") { + step_names <- "step_epi_ahead" + type_str <- "Aheads" + } else if (latency_step$method == "extend_lags") { + step_names <- "step_epi_lag" + type_str <- "Lags" + } else { + step_names <- "" + type_str <- "columns locf" + } + later_steps[[1]]$columns + valid_columns <- later_steps %>% + keep(function(x) inherits(x, step_names)) %>% + purrr::map("columns") %>% + reduce(c) + latency_per_base_col <- latency_step$latency_table %>% + filter(col_name %in% valid_columns) %>% + mutate(latency = abs(latency)) + if (latency_step$method != "locf" && nrow(latency_per_base_col) > 1) { + intro_text <- glue::glue("{type_str} adjusted per column: ") + } else if (latency_step$method != "locf") { + intro_text <- glue::glue("{type_str} adjusted for ") + } + latency_info <- paste0(intro_text, paste(apply(latency_per_base_col, 1, paste0, collapse = "="), collapse = ", ")) + cli::cli_ul(latency_info) + } cli::cli_text("") } diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R index b2e7434e2..3352c5159 100644 --- a/R/cdc_baseline_forecaster.R +++ b/R/cdc_baseline_forecaster.R @@ -78,10 +78,7 @@ cdc_baseline_forecaster <- function( # target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) - latest <- get_test_data( - epi_recipe(epi_data), epi_data, TRUE, args_list$nafill_buffer, - forecast_date - ) + latest <- get_test_data(epi_recipe(epi_data), epi_data) f <- frosting() %>% layer_predict() %>% @@ -169,7 +166,6 @@ cdc_baseline_args_list <- function( symmetrize = TRUE, nonneg = TRUE, quantile_by_key = "geo_value", - nafill_buffer = Inf, ...) { rlang::check_dots_empty() arg_is_scalar(n_training, nsims, data_frequency) @@ -183,7 +179,6 @@ cdc_baseline_args_list <- function( arg_is_probabilities(quantile_levels, allow_null = TRUE) arg_is_pos(n_training) if (is.finite(n_training)) arg_is_pos_int(n_training) - if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE) structure( enlist( @@ -195,8 +190,7 @@ cdc_baseline_args_list <- function( nsims, symmetrize, nonneg, - quantile_by_key, - nafill_buffer + quantile_by_key ), class = c("cdc_baseline_fcast", "alist") ) diff --git a/R/compat-purrr.R b/R/compat-purrr.R index e06038e44..6ec7df02f 100644 --- a/R/compat-purrr.R +++ b/R/compat-purrr.R @@ -11,11 +11,11 @@ map_vec <- function(.x, .f, ...) { map_dfr <- function(.x, .f, ..., .id = NULL) { .f <- rlang::as_function(.f, env = rlang::global_env()) res <- map(.x, .f, ...) - dplyr::bind_rows(res, .id = .id) + bind_rows(res, .id = .id) } map2_dfr <- function(.x, .y, .f, ..., .id = NULL) { .f <- rlang::as_function(.f, env = rlang::global_env()) res <- map2(.x, .y, .f, ...) - dplyr::bind_rows(res, .id = .id) + bind_rows(res, .id = .id) } diff --git a/R/compat-recipes.R b/R/compat-recipes.R index 12d11049a..f90367497 100644 --- a/R/compat-recipes.R +++ b/R/compat-recipes.R @@ -18,7 +18,7 @@ inline_check <- function(x) { funs <- fun_calls(x) funs <- funs[!(funs %in% c("~", "+", "-"))] if (length(funs) > 0) { - rlang::abort(paste0( + cli_abort(paste0( "No in-line functions should be used here; ", "use steps to define baking actions." 
)) diff --git a/R/create-layer.R b/R/create-layer.R index 0268a906f..7179cb7e3 100644 --- a/R/create-layer.R +++ b/R/create-layer.R @@ -22,7 +22,7 @@ create_layer <- function(name = NULL, open = rlang::is_interactive()) { if (substr(name, 1, 5) == "layer") { nn <- substring(name, 6) if (substr(nn, 1, 1) == "_") nn <- substring(nn, 2) - cli::cli_abort( + cli_abort( c('`name` should not begin with "layer" or "layer_".', i = 'Did you mean to use `create_layer("{ nn }")`?' ) diff --git a/R/dist_quantiles.R b/R/dist_quantiles.R index dd97ec809..8930bdeaa 100644 --- a/R/dist_quantiles.R +++ b/R/dist_quantiles.R @@ -23,7 +23,7 @@ new_quantiles <- function(values = double(1), quantile_levels = double(1)) { quantile_levels <- quantile_levels[o] } if (is.unsorted(values, na.rm = TRUE)) { - cli::cli_abort("`values[order(quantile_levels)]` produces unsorted quantiles.") + cli_abort("`values[order(quantile_levels)]` produces unsorted quantiles.") } new_rcrd(list(values = values, quantile_levels = quantile_levels), @@ -102,14 +102,14 @@ validate_dist_quantiles <- function(values, quantile_levels) { ) length_diff <- vctrs::list_sizes(values) != vctrs::list_sizes(quantile_levels) if (any(length_diff)) { - cli::cli_abort(c( + cli_abort(c( "`values` and `quantile_levels` must have common length.", i = "Mismatches found at position(s): {.val {which(length_diff)}}." )) } level_duplication <- map_lgl(quantile_levels, vctrs::vec_duplicate_any) if (any(level_duplication)) { - cli::cli_abort(c( + cli_abort(c( "`quantile_levels` must not be duplicated.", i = "Duplicates found at position(s): {.val {which(level_duplication)}}." )) @@ -171,7 +171,7 @@ quantile_extrapolate <- function(x, tau_out, middle) { return(qvals[match(tau_out, tau)]) } if (length(tau) < 2) { - cli::cli_abort( + cli_abort( "Quantile extrapolation is not possible with fewer than 2 quantiles." ) return(qvals_out) @@ -209,7 +209,7 @@ quantile_extrapolate <- function(x, tau_out, middle) { v = c(qvals, qvals_out[indm]) ) %>% dplyr::distinct(q, .keep_all = TRUE) %>% - dplyr::arrange(q) + arrange(q) } if (any(indl)) { qvals_out[indl] <- tail_extrapolate(tau_out[indl], utils::head(qv, 2)) @@ -267,7 +267,7 @@ Ops.dist_quantiles <- function(e1, e2) { } tau <- union(tau1, tau2) if (all(is_dist)) { - cli::cli_abort( + cli_abort( "You can't perform arithmetic between two distributions like this." 
) } else { diff --git a/R/epi_recipe.R b/R/epi_recipe.R index f8216c2af..311b9d073 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -61,15 +61,13 @@ epi_recipe.epi_df <- attr(x, "decay_to_tibble") <- FALSE if (!is.null(formula)) { if (!is.null(vars)) { - rlang::abort( - paste0( - "This `vars` specification will be ignored ", - "when a formula is used" - ) - ) + cli_abort(paste0( + "This `vars` specification will be ignored ", + "when a formula is used" + )) } if (!is.null(roles)) { - rlang::abort( + cli_abort( paste0( "This `roles` specification will be ignored ", "when a formula is used" @@ -82,10 +80,10 @@ epi_recipe.epi_df <- } if (is.null(vars)) vars <- colnames(x) if (any(table(vars) > 1)) { - rlang::abort("`vars` should have unique members") + cli_abort("`vars` should have unique members") } if (any(!(vars %in% colnames(x)))) { - rlang::abort("1 or more elements of `vars` are not in the data") + cli_abort("1 or more elements of `vars` are not in the data") } keys <- key_colnames(x) # we know x is an epi_df @@ -96,14 +94,14 @@ epi_recipe.epi_df <- ## Check and add roles when available if (!is.null(roles)) { if (length(roles) != length(vars)) { - rlang::abort(c( + cli_abort(paste0( "The number of roles should be the same as the number of ", "variables." )) } var_info$role <- roles } else { - var_info <- var_info %>% dplyr::filter(!(variable %in% keys)) + var_info <- var_info %>% filter(!(variable %in% keys)) var_info$role <- "raw" } ## Now we add the keys when necessary @@ -113,12 +111,12 @@ epi_recipe.epi_df <- ) ## Add types - var_info <- dplyr::full_join(recipes:::get_types(x), var_info, by = "variable") + var_info <- full_join(recipes:::get_types(x), var_info, by = "variable") var_info$source <- "original" ## arrange to easy order var_info <- var_info %>% - dplyr::arrange(factor( + arrange(factor( role, levels = union( c("predictor", "outcome", "time_value", "geo_value", "key"), @@ -142,7 +140,6 @@ epi_recipe.epi_df <- #' @rdname epi_recipe -#' @importFrom rlang abort #' @export epi_recipe.formula <- function(formula, data, ...) { # we ensure that there's only 1 row in the template @@ -395,7 +392,7 @@ adjust_epi_recipe.epi_workflow <- function(x, which_step, ..., blueprint = defau #' @export adjust_epi_recipe.epi_recipe <- function(x, which_step, ..., blueprint = default_epi_recipe_blueprint()) { if (!(is.numeric(which_step) || is.character(which_step))) { - cli::cli_abort( + cli_abort( c("`which_step` must be a number or a character.", i = "`which_step` has class {.cls {class(which_step)[1]}}." ) @@ -408,7 +405,7 @@ adjust_epi_recipe.epi_recipe <- function(x, which_step, ..., blueprint = default if (!starts_with_step) which_step <- paste0("step_", which_step) if (!(which_step %in% step_names)) { - cli::cli_abort(c( + cli_abort(c( "`which_step` does not appear in the available `epi_recipe` step names. ", i = "The step names are {.val {step_names}}." )) @@ -417,7 +414,7 @@ adjust_epi_recipe.epi_recipe <- function(x, which_step, ..., blueprint = default if (length(which_step_idx) == 1) { x$steps[[which_step_idx]] <- update(x$steps[[which_step_idx]], ...) } else { - cli::cli_abort(c( + cli_abort(c( "`which_step` is not unique. Matches steps: {.val {which_step_idx}}.", i = "Please use the step number instead for precise alterations." )) @@ -432,7 +429,7 @@ prep.epi_recipe <- function( x, training = NULL, fresh = FALSE, verbose = FALSE, retain = TRUE, log_changes = FALSE, strings_as_factors = TRUE, ...) { if (is.null(training)) { - cli::cli_warn(c( + cli_warn(c( "!" 
= "No training data was supplied to {.fn prep}.", "!" = "Unlike a {.cls recipe}, an {.cls epi_recipe} does not ", "!" = "store the full template data in the object.", @@ -442,7 +439,7 @@ prep.epi_recipe <- function( } training <- recipes:::check_training_set(training, x, fresh) training <- epi_check_training_set(training, x) - training <- dplyr::relocate(training, dplyr::all_of(key_colnames(training))) + training <- relocate(training, all_of(key_colnames(training))) tr_data <- recipes:::train_info(training) keys <- key_colnames(x) @@ -457,7 +454,7 @@ prep.epi_recipe <- function( } skippers <- map_lgl(x$steps, recipes:::is_skipable) if (any(skippers) & !retain) { - cli::cli_warn(c( + cli_warn(paste( "Since some operations have `skip = TRUE`, using ", "`retain = TRUE` will allow those steps results to ", "be accessible." @@ -465,9 +462,9 @@ prep.epi_recipe <- function( } if (fresh) x$term_info <- x$var_info - running_info <- x$term_info %>% dplyr::mutate(number = 0, skip = FALSE) - for (i in seq(along.with = x$steps)) { - needs_tuning <- map_lgl(x$steps[[i]], recipes:::is_tune) + running_info <- x$term_info %>% mutate(number = 0, skip = FALSE) + for (ii in seq(along.with = x$steps)) { + needs_tuning <- map_lgl(x$steps[[ii]], recipes:::is_tune) if (any(needs_tuning)) { arg <- names(needs_tuning)[needs_tuning] arg <- paste0("'", arg, "'", collapse = ", ") @@ -475,22 +472,22 @@ prep.epi_recipe <- function( "You cannot `prep()` a tuneable recipe. Argument(s) with `tune()`: ", arg, ". Do you want to use a tuning function such as `tune_grid()`?" ) - rlang::abort(msg) + cli_abort(msg) } - note <- paste("oper", i, gsub("_", " ", class(x$steps[[i]])[1])) - if (!x$steps[[i]]$trained | fresh) { + note <- paste("oper", ii, gsub("_", " ", class(x$steps[[ii]])[1])) + if (!x$steps[[ii]]$trained || fresh) { if (verbose) { cat(note, "[training]", "\n") } before_nms <- names(training) before_template <- training[1, ] - x$steps[[i]] <- prep(x$steps[[i]], + x$steps[[ii]] <- prep(x$steps[[ii]], training = training, info = x$term_info ) - training <- bake(x$steps[[i]], new_data = training) + training <- bake(x$steps[[ii]], new_data = training) if (!tibble::is_tibble(training)) { - cli::cli_abort("`bake()` methods should always return {.cls tibble}.") + cli_abort("`bake()` methods should always return {.cls tibble}.") } if (!is_epi_df(training)) { # tidymodels killed our class @@ -502,20 +499,20 @@ prep.epi_recipe <- function( other_keys = metadata$other_keys %||% character() ) } - training <- dplyr::relocate(training, all_of(key_colnames(training))) + training <- relocate(training, all_of(key_colnames(training))) x$term_info <- recipes:::merge_term_info(get_types(training), x$term_info) - if (!is.na(x$steps[[i]]$role)) { + if (!is.na(x$steps[[ii]]$role)) { new_vars <- setdiff(x$term_info$variable, running_info$variable) pos_new_var <- x$term_info$variable %in% new_vars pos_new_and_na_role <- pos_new_var & is.na(x$term_info$role) pos_new_and_na_source <- pos_new_var & is.na(x$term_info$source) - x$term_info$role[pos_new_and_na_role] <- x$steps[[i]]$role + x$term_info$role[pos_new_and_na_role] <- x$steps[[ii]]$role x$term_info$source[pos_new_and_na_source] <- "derived" } recipes:::changelog(log_changes, before_nms, names(training), x$steps[[i]]) running_info <- rbind( running_info, - dplyr::mutate(x$term_info, number = i, skip = x$steps[[i]]$skip) + mutate(x$term_info, number = ii, skip = x$steps[[ii]]$skip) ) } else { if (verbose) cat(note, "[pre-trained]\n") @@ -547,9 +544,9 @@ prep.epi_recipe <- function( 
x$orig_lvls <- orig_lvls x$retained <- retain x$last_term_info <- running_info %>% - dplyr::group_by(variable) %>% - dplyr::arrange(dplyr::desc(number)) %>% - dplyr::summarise( + group_by(variable) %>% + arrange(dplyr::desc(number)) %>% + summarise( type = list(dplyr::first(type)), role = list(unique(unlist(role))), source = dplyr::first(source), @@ -571,6 +568,7 @@ bake.epi_recipe <- function(object, new_data, ..., composition = "epi_df") { } composition <- "tibble" } + new_data <- NextMethod("bake") if (!is.null(meta)) { # Baking should have dropped epi_df-ness and metadata. Re-infer some diff --git a/R/epi_shift.R b/R/epi_shift.R index eb534f1ea..367e26285 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -2,43 +2,72 @@ #' #' This is a lower-level function. As such it performs no error checking. #' -#' @param x Data frame. Variables to shift -#' @param shifts List. Each list element is a vector of shifts. -#' Negative values produce leads. The list should have the same -#' length as the number of columns in `x`. -#' @param time_value Vector. Same length as `x` giving time stamps. -#' @param keys Data frame, vector, or `NULL`. Additional grouping vars. -#' @param out_name Chr. The output list will use this as a prefix. +#' @param x Data frame. +#' @param shift_val a single integer. Negative values produce leads. +#' @param newname the name for the newly shifted column +#' @param key_cols vector, or `NULL`. Additional grouping vars. #' #' @keywords internal #' #' @return a list of tibbles -epi_shift <- function(x, shifts, time_value, keys = NULL, out_name = "x") { - if (!is.data.frame(x)) x <- data.frame(x) - if (is.null(keys)) keys <- rep("empty", nrow(x)) - p_in <- ncol(x) - out_list <- tibble(i = 1:p_in, shift = shifts) %>% - tidyr::unchop(shift) %>% # what is chop - mutate(name = paste0(out_name, 1:nrow(.))) %>% - # One list element for each shifted feature - pmap(function(i, shift, name) { - tibble(keys, - time_value = time_value + shift, # Shift back - !!name := x[[i]] - ) - }) - if (is.data.frame(keys)) { - common_names <- c(names(keys), "time_value") - } else { - common_names <- c("keys", "time_value") - } - - reduce(out_list, dplyr::full_join, by = common_names) -} - epi_shift_single <- function(x, col, shift_val, newname, key_cols) { x %>% select(all_of(c(key_cols, col))) %>% mutate(time_value = time_value + shift_val) %>% rename(!!newname := {{ col }}) } + +#' lags move columns forward to bring the past up to today, while aheads drag +#' the future back to today +#' @keywords internal +get_sign <- function(object) { + if (!is.null(object$prefix)) { + if (object$prefix == "lag_") { + return(1) + } else { + return(-1) + } + } else if (object$method == "extend_lags") { + return(1) + } else { + return(-1) + } +} + +#' backend for both `bake.step_epi_ahead` and `bake.step_epi_lag`, performs the +#' checks missing in `epi_shift_single` +#' @keywords internal +#' @importFrom tidyr expand_grid +#' @importFrom dplyr join_by +add_shifted_columns <- function(new_data, object) { + grid <- object$shift_grid + + ## ensure no name clashes + new_data_names <- colnames(new_data) + intersection <- new_data_names %in% grid$newname + if (any(intersection)) { + cli_abort(c( + "Name collision occurred in {.cls {class(object)[1]}}", + "The following variable name{?s} already exist{?s/}: {.val {new_data_names[intersection]}}."
+ )) + } + ok <- object$keys + shifted <- reduce( + pmap(grid, epi_shift_single, x = new_data, key_cols = ok), + full_join, + by = ok + ) + processed <- new_data %>% + full_join(shifted, by = ok) %>% + group_by(across(all_of(kill_time_value(ok)))) %>% + arrange(time_value) + if (inherits(new_data, "epi_df")) { + processed <- processed %>% + ungroup() %>% + as_epi_df( + as_of = attributes(new_data)$metadata$as_of, + other_keys = attributes(new_data)$metadata$other_keys + ) + } + return(processed) +} diff --git a/R/epi_workflow.R b/R/epi_workflow.R index af4555303..ff2393ecc 100644 --- a/R/epi_workflow.R +++ b/R/epi_workflow.R @@ -157,7 +157,7 @@ fit.epi_workflow <- function(object, data, ..., control = workflows::control_wor #' preds predict.epi_workflow <- function(object, new_data, type = NULL, opts = list(), ...) { if (!workflows::is_trained_workflow(object)) { - cli::cli_abort(c( + cli_abort(c( "Can't predict on an untrained epi_workflow.", i = "Do you need to call `fit()`?" )) @@ -188,8 +188,8 @@ augment.epi_workflow <- function(x, new_data, ...) { join_by <- key_colnames(predictions) } else { cli_abort(c( - "Cannot determine how to join new_data with the predictions.", - "Try converting new_data to an epi_df with `as_epi_df(new_data)`." + "Cannot determine how to join `new_data` with the `predictions`.", + "Try converting `new_data` to an {.cls epi_df} with `as_epi_df(new_data)`." )) } complete_overlap <- intersect(names(new_data), join_by) @@ -231,7 +231,6 @@ print.epi_workflow <- function(x, ...) { #' #' @param object An epi workflow. #' @param ... Not used. -#' @param fill_locf Logical. Should we use locf to fill in missing data? #' @param n_recent Integer or NULL. If filling missing data with locf = TRUE, #' how far back are we willing to tolerate missing data? Larger values allow #' more filling. The default NULL will determine this from the the recipe. For @@ -245,7 +244,7 @@ print.epi_workflow <- function(x, ...) { #' @return A forecast tibble. #' #' @export -forecast.epi_workflow <- function(object, ..., fill_locf = FALSE, n_recent = NULL, forecast_date = NULL) { +forecast.epi_workflow <- function(object, ..., n_recent = NULL, forecast_date = NULL) { rlang::check_dots_empty() if (!object$trained) { @@ -268,10 +267,7 @@ forecast.epi_workflow <- function(object, ..., fill_locf = FALSE, n_recent = NUL test_data <- get_test_data( hardhat::extract_preprocessor(object), - object$original_data, - fill_locf = fill_locf, - n_recent = n_recent %||% Inf, - forecast_date = forecast_date %||% frosting_fd %||% max(object$original_data$time_value) + object$original_data ) predict(object, new_data = test_data) diff --git a/R/flatline_forecaster.R b/R/flatline_forecaster.R index 55808b803..59f54bd86 100644 --- a/R/flatline_forecaster.R +++ b/R/flatline_forecaster.R @@ -63,14 +63,9 @@ flatline_forecaster <- function( eng <- linear_reg(engine = "flatline") - wf <- epi_workflow(r, eng, f) - wf <- fit(wf, epi_data) - preds <- suppressWarnings(forecast( - wf, - fill_locf = TRUE, - n_recent = args_list$nafill_buffer, - forecast_date = forecast_date - )) %>% + wf <- epi_workflow(r, eng, f) %>% + fit(epi_data) + preds <- suppressWarnings(forecast(wf)) %>% as_tibble() %>% select(-time_value) @@ -117,7 +112,6 @@ flatline_args_list <- function( symmetrize = TRUE, nonneg = TRUE, quantile_by_key = character(0L), - nafill_buffer = Inf, ...) 
{ rlang::check_dots_empty() arg_is_scalar(ahead, n_training) @@ -129,11 +123,10 @@ flatline_args_list <- function( arg_is_probabilities(quantile_levels, allow_null = TRUE) arg_is_pos(n_training) if (is.finite(n_training)) arg_is_pos_int(n_training) - if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE) if (!is.null(forecast_date) && !is.null(target_date)) { if (forecast_date + ahead != target_date) { - cli::cli_warn(c( + cli_warn(c( "`forecast_date` + `ahead` must equal `target_date`.", i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." )) @@ -149,8 +142,7 @@ flatline_args_list <- function( quantile_levels, symmetrize, nonneg, - quantile_by_key, - nafill_buffer + quantile_by_key ), class = c("flat_fcast", "alist") ) diff --git a/R/flusight_hub_formatter.R b/R/flusight_hub_formatter.R index 3e0eb1aaa..c1aa00b82 100644 --- a/R/flusight_hub_formatter.R +++ b/R/flusight_hub_formatter.R @@ -1,7 +1,7 @@ location_to_abbr <- function(location) { dictionary <- state_census %>% - dplyr::mutate(fips = sprintf("%02d", fips)) %>% + mutate(fips = sprintf("%02d", fips)) %>% dplyr::transmute( location = dplyr::case_match(fips, "00" ~ "US", .default = fips), abbr @@ -12,7 +12,7 @@ location_to_abbr <- function(location) { abbr_to_location <- function(abbr) { dictionary <- state_census %>% - dplyr::mutate(fips = sprintf("%02d", fips)) %>% + mutate(fips = sprintf("%02d", fips)) %>% dplyr::transmute( location = dplyr::case_match(fips, "00" ~ "US", .default = fips), abbr diff --git a/R/frosting.R b/R/frosting.R index 8474edbdf..2672bcdd1 100644 --- a/R/frosting.R +++ b/R/frosting.R @@ -89,7 +89,7 @@ validate_has_postprocessor <- function(x, ..., call = caller_env()) { "The workflow must have a frosting postprocessor.", i = "Provide one with `add_frosting()`." ) - rlang::abort(message, call = call) + cli_abort(message, call = call) } invisible(x) } @@ -288,7 +288,7 @@ new_frosting <- function() { #' p frosting <- function(layers = NULL, requirements = NULL) { if (!is_null(layers) || !is_null(requirements)) { - cli::cli_abort( + cli_abort( "Currently, no arguments to `frosting()` are allowed to be non-null." ) } @@ -356,7 +356,6 @@ apply_frosting.default <- function(workflow, components, ...) { #' @rdname apply_frosting #' @importFrom rlang is_null -#' @importFrom rlang abort #' @param type,opts forwarded (along with `...`) to [`predict.model_fit()`] and #' [`slather()`] for supported layers #' @export @@ -368,21 +367,21 @@ apply_frosting.epi_workflow <- components$predictions <- predict( the_fit, components$forged$predictors, ... ) - components$predictions <- dplyr::bind_cols( + components$predictions <- bind_cols( components$keys, components$predictions ) return(components) } if (!has_postprocessor_frosting(workflow)) { - cli_warn(c( + cli_warn(paste( "Only postprocessors of class {.cls frosting} are allowed.", "Returning unpostprocessed predictions." )) components$predictions <- predict( the_fit, components$forged$predictors, type, opts, ... ) - components$predictions <- dplyr::bind_cols( + components$predictions <- bind_cols( components$keys, components$predictions ) return(components) diff --git a/R/get_test_data.R b/R/get_test_data.R index 694e73b06..8de698301 100644 --- a/R/get_test_data.R +++ b/R/get_test_data.R @@ -11,25 +11,9 @@ #' used if growth rate calculations are requested by the recipe. This is #' calculated internally. #' -#' It also optionally fills missing values -#' using the last-observation-carried-forward (LOCF) method. 
If this -#' is not possible (say because there would be only `NA`'s in some location), -#' it will produce an error suggesting alternative options to handle missing -#' values with more advanced techniques. -#' #' @param recipe A recipe object. #' @param x An epi_df. The typical usage is to #' pass the same data as that used for fitting the recipe. -#' @param fill_locf Logical. Should we use `locf` to fill in missing data? -#' @param n_recent Integer or NULL. If filling missing data with `locf = TRUE`, -#' how far back are we willing to tolerate missing data? Larger values allow -#' more filling. The default `NULL` will determine this from the -#' the `recipe`. For example, suppose `n_recent = 3`, then if the -#' 3 most recent observations in any `geo_value` are all `NA`’s, we won’t be -#' able to fill anything, and an error message will be thrown. (See details.) -#' @param forecast_date By default, this is set to the maximum -#' `time_value` in `x`. But if there is data latency such that recent `NA`'s -#' should be filled, this may be _after_ the last available `time_value`. #' #' @return An object of the same type as `x` with columns `geo_value`, `time_value`, any additional #' keys, as well other variables in the original dataset. @@ -41,21 +25,10 @@ #' step_epi_lag(case_rate, lag = c(0, 7, 14)) #' get_test_data(recipe = rec, x = case_death_rate_subset) #' @importFrom rlang %@% +#' @importFrom stats na.omit #' @export -get_test_data <- function( - recipe, - x, - fill_locf = FALSE, - n_recent = NULL, - forecast_date = max(x$time_value)) { +get_test_data <- function(recipe, x) { if (!is_epi_df(x)) cli_abort("`x` must be an `epi_df`.") - arg_is_lgl(fill_locf) - arg_is_scalar(fill_locf) - arg_is_scalar(n_recent, allow_null = TRUE) - if (!is.null(n_recent) && is.finite(n_recent)) { - arg_is_pos_int(n_recent, allow_null = TRUE) - } - if (!is.null(n_recent)) n_recent <- abs(n_recent) # in case they passed -Inf check <- hardhat::check_column_names(x, colnames(recipe$template)) if (!check$ok) { @@ -64,103 +37,40 @@ get_test_data <- function( i = "The following required columns are missing: {check$missing_names}" )) } - if (class(forecast_date) != class(x$time_value)) { - cli_abort("`forecast_date` must be the same class as `x$time_value`.") - } - if (forecast_date < max(x$time_value)) { - cli_abort("`forecast_date` must be no earlier than `max(x$time_value)`") - } min_lags <- min(map_dbl(recipe$steps, ~ min(.x$lag %||% Inf)), Inf) max_lags <- max(map_dbl(recipe$steps, ~ max(.x$lag %||% 0)), 0) max_horizon <- max(map_dbl(recipe$steps, ~ max(.x$horizon %||% 0)), 0) max_slide <- max(map_dbl(recipe$steps, ~ max(.x$before %||% 0)), 0) min_required <- max_lags + max_horizon + max_slide - if (is.null(n_recent)) n_recent <- min_required + 1 # one extra for filling - if (n_recent <= min_required) n_recent <- min_required + n_recent + keep <- max_lags + max_horizon # CHECK: Error out if insufficient training data # Probably needs a fix based on the time_type of the epi_df avail_recent <- diff(range(x$time_value)) - if (avail_recent < min_required) { + if (avail_recent < keep) { cli_abort(c( "You supplied insufficient recent data for this recipe. ", "!" = "You need at least {min_required} days of data,", "!" = "but `x` contains only {avail_recent}." 
)) } - + max_time_value <- x %>% + na.omit() %>% + pull(time_value) %>% + max() x <- arrange(x, time_value) groups <- epi_keys_only(recipe) # If we skip NA completion, we remove undesirably early time values # Happens globally, over all groups - keep <- max(n_recent, min_required + 1) - x <- filter(x, forecast_date - time_value <= keep) - - # Pad with explicit missing values up to and including the forecast_date - # x is grouped here - x <- pad_to_end(x, groups, forecast_date) %>% - group_by(across(all_of(groups))) + x <- filter(x, max_time_value - time_value <= keep) # If all(lags > 0), then we get rid of recent data if (min_lags > 0 && min_lags < Inf) { - x <- filter(x, forecast_date - time_value >= min_lags) - } - - # Now, fill forward missing data if requested - if (fill_locf) { - cannot_be_used <- x %>% - filter(forecast_date - time_value <= n_recent) %>% - mutate(fillers = forecast_date - time_value > min_required) %>% - summarize( - across( - -any_of(key_colnames(recipe)), - ~ all(is.na(.x[fillers])) & is.na(head(.x[!fillers], 1)) - ), - .groups = "drop" - ) %>% - select(-fillers) %>% - summarise(across(-any_of(key_colnames(recipe)), ~ any(.x))) %>% - unlist() - if (any(cannot_be_used)) { - bad_vars <- names(cannot_be_used)[cannot_be_used] - if (recipes::is_trained(recipe)) { - cli_abort(c( - "The variables {.var {bad_vars}} have too many recent missing", - `!` = "values to be filled automatically. ", - i = "You should either choose `n_recent` larger than its current ", - i = "value {n_recent}, or perform NA imputation manually, perhaps with ", - i = "{.code recipes::step_impute_*()} or with {.code tidyr::fill()}." - )) - } - } - x <- tidyr::fill(x, !time_value) + x <- filter(x, max_time_value - time_value >= min_lags) } - filter(x, forecast_date - time_value <= min_required) %>% - ungroup() -} - -pad_to_end <- function(x, groups, end_date) { - itval <- guess_period(c(x$time_value, end_date), "time_value") - completed_time_values <- x %>% - group_by(across(all_of(groups))) %>% - summarise( - time_value = rlang::list2( - time_value = Seq(max(time_value) + itval, end_date, itval) - ) - ) %>% - unnest("time_value") %>% - mutate(time_value = vctrs::vec_cast(time_value, x$time_value)) - - bind_rows(x, completed_time_values) %>% - arrange(across(all_of(c("time_value", groups)))) -} - -Seq <- function(from, to, by) { - if (from > to) { - return(NULL) - } - seq(from = from, to = to, by = by) + filter(x, max_time_value - time_value <= keep) %>% + epiprocess::ungroup() } diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index 3d5ea010b..c8f857c89 100644 --- a/R/layer_add_forecast_date.R +++ b/R/layer_add_forecast_date.R @@ -2,10 +2,12 @@ #' #' @param frosting a `frosting` postprocessor #' @param forecast_date The forecast date to add as a column to the `epi_df`. -#' For most cases, this should be specified in the form "yyyy-mm-dd". Note that -#' when the forecast date is left unspecified, it is set to the maximum time -#' value from the data used in pre-processing, fitting the model, and -#' postprocessing. +#' For most cases, this should be specified in the form "yyyy-mm-dd". Note +#' that when the forecast date is left unspecified, it is set to one of two +#' values. If there is a `step_adjust_latency` step present, it uses the +#' `forecast_date` as set in that function. Otherwise, it uses the maximum +#' `time_value` across the data used for pre-processing, fitting the model, +#' and postprocessing. 
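To make the defaulting rule above concrete, here is a minimal sketch of how the layer is typically used (an editorial illustration of the documented behavior, not code from this diff; it assumes a fitted workflow like those in the package examples):

```r
# leaving the date unspecified lets the layer resolve it at predict time
f <- frosting() %>%
  layer_predict() %>%
  layer_add_forecast_date() %>% # no forecast_date supplied
  layer_naomit(.pred)
# if the workflow's recipe contains step_adjust_latency(), the layer inherits
# that step's forecast_date (typically the epi_df's as_of); otherwise it falls
# back to the maximum time_value across preprocessing, fitting, and prediction
```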
#' @param id a random id string #' #' @return an updated `frosting` postprocessor #' #' @details By default, this function assumes that a value for `ahead` #' has been specified in a preprocessing step (most likely in -#' `step_epi_ahead`). Then, `ahead` is added to the maximum `time_value` -#' in the test data to get the target date. +#' `step_epi_ahead`). Then, `ahead` is added to the `forecast_date` +#' in the test data to get the target date. `forecast_date` can be set in 3 ways: +#' 1. `step_adjust_latency`, which typically uses the training `epi_df`'s `as_of` +#' 2. `layer_add_forecast_date`, which inherits from 1 if not manually specified +#' 3. if none of those are the case, it is simply the maximum `time_value` over +#' every dataset used (prep, training, and prediction).
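The precedence just listed is easiest to see side by side; a short sketch (an editorial illustration of the documented behavior, not code from this diff):

```r
# derived: target_date = forecast_date + ahead, with forecast_date resolved by
# step_adjust_latency, then layer_add_forecast_date, then max(time_value)
f_default <- frosting() %>%
  layer_predict() %>%
  layer_add_target_date()

# explicit: a user-supplied date always overrides the derived one
f_manual <- frosting() %>%
  layer_predict() %>%
  layer_add_target_date(target_date = "2022-01-08")
```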
#' #' @export #' @examples #' library(dplyr) #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' r <- epi_recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_naomit() #' #' wf <- epi_workflow(r, linear_reg()) %>% fit(jhu) #' #' # Use ahead + forecast date #' f <- frosting() %>% #' layer_predict() %>% #' layer_add_forecast_date() %>% #' layer_add_target_date() %>% #' layer_naomit(.pred) #' wf1 <- wf %>% add_frosting(f) #' #' p <- forecast(wf1) #' p #' -#' # Use ahead + max time value from pre, fit, post -#' # which is the same if include `layer_add_forecast_date()` +#' # Use ahead + forecast_date from adjust_latency +#' # setting the `as_of` to something realistic +#' attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 +#' r <- epi_recipe(jhu) %>% +#' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% +#' step_epi_ahead(death_rate, ahead = 7) %>% +#' step_adjust_latency(method = "extend_ahead") %>% +#' step_epi_naomit() #' f2 <- frosting() %>% #' layer_predict() %>% #' layer_add_target_date() %>% #' layer_naomit(.pred) #' wf2 <- wf %>% add_frosting(f2) #' #' p2 <- forecast(wf2) #' p2 #' -#' # Specify own target date +#' # Use ahead + max time value from pre, fit, post +#' # which is the same if include `layer_add_forecast_date()` #' f3 <- frosting() %>% #' layer_predict() %>% -#' layer_add_target_date(target_date = "2022-01-08") %>% +#' layer_add_target_date() %>% #' layer_naomit(.pred) #' wf3 <- wf %>% add_frosting(f3) #' -#' p3 <- forecast(wf3) -#' p3 +#' p3 <- forecast(wf3) +#' p3 +#' +#' # Specify own target date +#' f4 <- frosting() %>% +#' layer_predict() %>% +#' layer_add_target_date(target_date = "2022-01-08") %>% +#' layer_naomit(.pred) +#' wf4 <- wf %>% add_frosting(f4) +#' +#' p4 <- forecast(wf4) +#' p4 layer_add_target_date <- function(frosting, target_date = NULL, id = rand_id("add_target_date")) { arg_is_chr_scalar(id) @@ -112,13 +133,13 @@ slather.layer_add_target_date <- function(object, components, workflow, ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead") target_date <- forecast_date + ahead } else { - max_time_value <- as.Date(max( - workflows::extract_preprocessor(workflow)$max_time_value, + forecast_date <- get_forecast_date_in_layer( + extract_preprocessor(workflow), workflow$fit$meta$max_time_value, - max(new_data$time_value) - )) + new_data + ) ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead") - target_date <- max_time_value + ahead + target_date <- forecast_date + ahead } object$target_date <- target_date diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index 8d16ba32f..fd61c4045 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -163,7 +163,7 @@ slather.layer_cdc_flatline_quantiles <- } the_fit <- workflows::extract_fit_parsnip(workflow) if (!inherits(the_fit, "_flatline")) { - cli::cli_warn(c( + cli_warn(c( "Predictions for this workflow were not produced by the {.cls flatline}", "{.pkg parsnip} engine. Results may be unexpected. See {.fn epipredict::flatline}." )) } @@ -176,7 +176,7 @@ slather.layer_cdc_flatline_quantiles <- if (length(object$by_key) > 0L) { cols_in_preds <- hardhat::check_column_names(p, object$by_key) if (!cols_in_preds$ok) { - cli::cli_warn(c( + cli_warn(paste( "Predicted values are missing key columns: {.val {cols_in_preds$missing_names}}.", "Ignoring these." )) } if (inherits(the_fit, "_flatline")) { cols_in_resids <- hardhat::check_column_names(r, object$by_key) if (!cols_in_resids$ok) { - cli::cli_warn(c( + cli_warn(paste( "Existing residuals are missing key columns: {.val {cols_in_resids$missing_names}}.", "Ignoring these."
)) @@ -201,7 +201,7 @@ slather.layer_cdc_flatline_quantiles <- ) cols_in_resids <- hardhat::check_column_names(key_cols, object$by_key) if (!cols_in_resids$ok) { - cli::cli_warn(c( + cli_warn(paste( "Requested residuals are missing key columns: {.val {cols_in_resids$missing_names}}.", "Ignoring these." )) diff --git a/R/layer_residual_quantiles.R b/R/layer_residual_quantiles.R index eae151905..1b623adfa 100644 --- a/R/layer_residual_quantiles.R +++ b/R/layer_residual_quantiles.R @@ -102,7 +102,7 @@ slather.layer_residual_quantiles <- common <- intersect(object$by_key, names(key_cols)) excess <- setdiff(object$by_key, names(key_cols)) if (length(excess) > 0L) { - cli::cli_warn(c( + cli_warn(paste( "Requested residual grouping key(s) {.val {excess}} are unavailable ", "in the original data. Grouping by the remainder: {.val {common}}." )) @@ -113,7 +113,7 @@ slather.layer_residual_quantiles <- if (length(common_in_r) == length(common)) { r <- left_join(key_cols, r, by = common_in_r) } else { - cli::cli_warn(c( + cli_warn(paste( "Some grouping keys are not in data.frame returned by the", "`residuals()` method. Groupings may not be correct." )) @@ -124,7 +124,7 @@ slather.layer_residual_quantiles <- } r <- r %>% - summarize( + summarise( dstn = list(quantile( c(.resid, s * .resid), probs = object$quantile_levels, na.rm = TRUE @@ -132,7 +132,7 @@ slather.layer_residual_quantiles <- ) # Check for NA if (any(sapply(r$dstn, is.na))) { - cli::cli_abort(c( + cli_abort(c( "Residual quantiles could not be calculated due to missing residuals.", i = "This may be due to `n_train` < `ahead` in your {.cls epi_recipe}." )) @@ -149,7 +149,7 @@ slather.layer_residual_quantiles <- grab_residuals <- function(the_fit, components) { if (the_fit$spec$mode != "regression") { - cli::cli_abort("For meaningful residuals, the predictor should be a regression model.") + cli_abort("For meaningful residuals, the predictor should be a regression model.") } r <- stats::residuals(the_fit$fit) if (!is.null(r)) { # Got something from the method @@ -157,7 +157,7 @@ grab_residuals <- function(the_fit, components) { if (".resid" %in% names(r)) { # success return(r) } else { # failure - cli::cli_warn(c( + cli_warn(c( "The `residuals()` method for objects of class {.cls {cl}} results in", "a data frame without a column named `.resid`.", i = "Residual quantiles will be calculated directly from the", @@ -168,7 +168,7 @@ grab_residuals <- function(the_fit, components) { } else if (is.vector(drop(r))) { # also success return(tibble(.resid = drop(r))) } else { # failure - cli::cli_warn(c( + cli_warn(paste( "The `residuals()` method for objects of class {.cls {cl}} results in an", "object that is neither a data frame with a column named `.resid`,", "nor something coercible to a vector.", diff --git a/R/layers.R b/R/layers.R index aa515a917..538fcad1b 100644 --- a/R/layers.R +++ b/R/layers.R @@ -148,9 +148,9 @@ validate_layer <- function(x, ..., arg = rlang::caller_arg(x), call = caller_env()) { rlang::check_dots_empty() if (!is_layer(x)) { - cli::cli_abort( + cli_abort( "{arg} must be a frosting layer, not a {.cls {class(x)[[1]]}}.", - .call = call + call = call ) } invisible(x) diff --git a/R/make_quantile_reg.R b/R/make_quantile_reg.R index 2157aa470..9e653184c 100644 --- a/R/make_quantile_reg.R +++ b/R/make_quantile_reg.R @@ -37,7 +37,7 @@ quantile_reg <- function(mode = "regression", engine = "rq", quantile_levels = 0 if (any(quantile_levels > 1)) cli_abort("All `quantile_levels` must be less than 1.") if (any(quantile_levels < 0)) 
cli_abort("All `quantile_levels` must be greater than 0.") if (is.unsorted(quantile_levels)) { - cli::cli_warn("Sorting `quantile_levels` to increasing order.") + cli_warn("Sorting `quantile_levels` to increasing order.") quantile_levels <- sort(quantile_levels) } args <- list(quantile_levels = rlang::enquo(quantile_levels), method = rlang::enquo(method)) diff --git a/R/pivot_quantiles.R b/R/pivot_quantiles.R index f014961e6..b01dc392c 100644 --- a/R/pivot_quantiles.R +++ b/R/pivot_quantiles.R @@ -71,7 +71,7 @@ pivot_quantiles_longer <- function(.data, ..., .ignore_length_check = FALSE) { .data <- .data %>% tidyr::unnest(all_of(col), names_sep = "_") } } else { - cli::cli_abort(c( + cli_abort(paste( "Some selected columns contain different numbers of quantiles.", "The result would be a {.emph very} long {.cls tibble}.", "To do this anyway, rerun with `.ignore_length_check = TRUE`." @@ -115,7 +115,7 @@ pivot_quantiles_wider <- function(.data, ...) { checks <- map_lgl(cols, ~ diff(range(vctrs::list_sizes(.data[[.x]]))) == 0L) if (!all(checks)) { nms <- cols[!checks] - cli::cli_abort(c( + cli_abort(c( "Quantiles must be the same length and have the same set of taus.", i = "Check failed for variables(s) {.var {nms}}." )) @@ -157,7 +157,7 @@ validate_pivot_quantiles <- function(.data, ...) { dqs <- map_lgl(cols, ~ is_dist_quantiles(.data[[.x]])) if (!all(dqs)) { nms <- cols[!dqs] - cli::cli_abort( + cli_abort( "Variables(s) {.var {nms}} are not `dist_quantiles`. Cannot pivot them." ) } diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R new file mode 100644 index 000000000..433fbb328 --- /dev/null +++ b/R/step_adjust_latency.R @@ -0,0 +1,405 @@ +#' Adapt the model to latent data +#' +#' @description +#' In the standard case, the arx models assume that the last observation is also +#' the day from which the forecast is being made. But if the data has latency, +#' then you may wish to adjust the predictors (lags) and/or the outcome (ahead) +#' to compensate. +#' This is most useful in realtime and +#' pseudo-prospective forecasting for data where there is some delay between the +#' event occurring and the event being reported. +#' +#' @details +#' This step allows the user to create models on the most recent +#' data, automatically accounting for latency patterns. Instead of using the last observation +#' date, `step_adjust_latency` uses the `as_of` date of the `epi_df` as the +#' `forecast_date`, and adjusts the model so that there is data available. To +#' demonstrate some of the subtleties, let's consider a toy dataset: +#' ```{r toy_df} +#' toy_df <- tribble( +#' ~geo_value, ~time_value, ~a, ~b, +#' "ma", as.Date("2015-01-11"), 20, 6, +#' "ma", as.Date("2015-01-12"), 23, NA, +#' "ma", as.Date("2015-01-13"), 25, NA, +#' "ca", as.Date("2015-01-11"), 100, 5, +#' "ca", as.Date("2015-01-12"), 103, 10, +#' ) %>% +#' as_epi_df(as_of = as.Date("2015-01-14")) +#' ``` +#' If we're looking to predict the value on the 15th, forecasting from the 14th (the `as_of` date above), +#' there are two issues we will need to address: +#' 1. `"ca"` is latent by 2 days, whereas `"ma"` is latent by 1 +#' 2. if we want to use `b` as an exogenous variable, for `"ma"` it is latent by 3 days instead of just 1. +#' +#' Regardless of `method`, `epi_keys_checked="geo_value"` guarantees tha the +#' difference between `"ma"` and `"ca"` is accounted for by making the +#' latency adjustment at least 2. 
For some comparison, here's what the various
+#' methods will do:
+#'
+#' ## `locf`
+#' Short for "last observation carried forward", `locf` assumes that every day
+#' between the last observation and the forecast day is exactly the same.
+#' This is a very straightforward assumption, but wrecks any features that
+#' depend on changes in value over time, such as the growth rate, or even
+#' adjacent lags. A more robust version of this falls under the heading of
+#' nowcasting, an eventual aim for this package. On the toy dataset, it
+#' doesn't matter which day we're trying to predict, since it just fills
+#' forward to the `forecast_date`:
+#' ```{r toy_df}
+#' toy_recipe <- epi_recipe(toy_df) %>%
+#'   step_adjust_latency(method="locf")
+#'
+#' toy_recipe %>%
+#'   prep(toy_df) %>%
+#'   bake(toy_df) %>%
+#'   arrange(geo_value, time_value)
+#' ```
+#'
+#' ## `extend_lags`
+#' `extend_lags` increases the lags so that they are guaranteed to have
+#' data. This has the advantage of being applicable on
+#' a per-column basis; if cases and deaths are reported at different
+#' latencies, the lags for each are adjusted separately. In the toy example:
+#' ```{r toy_df}
+#' toy_recipe <- epi_recipe(toy_df) %>%
+#'   step_adjust_latency(method="extend_lags") %>%
+#'   step_epi_lag(a,lag=1) %>%
+#'   step_epi_lag(b,lag=1) %>%
+#'   step_epi_ahead(a, ahead=1)
+#'
+#' toy_recipe %>%
+#'   prep(toy_df) %>%
+#'   bake(toy_df) %>%
+#'   arrange(geo_value, time_value)
+#' ```
+#' The maximum latency in column `a` is 2 days, so the lag is increased to 3,
+#' while the max latency in column `b` is 3, so the same lag is increased to
+#' 4; both of these changes are reflected in the column names. Meanwhile the
+#' ahead is unaffected.
+#'
+#' As a side note, lag/ahead can be somewhat ambiguous about direction. Here,
+#' the values are brought forward in time, so that for a given row, column
+#' `lag_3_a` represents the value 3 days before.
+#'
+#' ## `extend_ahead`
+#' `extend_ahead` increases the ahead, turning a 3 day ahead forecast
+#' into a 7 day one; this has the advantage of simplicity and is reflective of
+#' the actual modeling task, but potentially leaves information unused if
+#' different data sources have different latencies; it must use the latency of
+#' the most latent data to ensure there is data available. In the toy example:
+#' ```{r toy_df}
+#' toy_recipe <- epi_recipe(toy_df) %>%
+#'   step_adjust_latency(method="extend_ahead") %>%
+#'   step_epi_lag(a,lag=0) %>%
+#'   step_epi_ahead(a, ahead=1)
+#'
+#' toy_recipe %>%
+#'   prep(toy_df) %>%
+#'   bake(toy_df) %>%
+#'   arrange(geo_value, time_value)
+#' ```
+#' Even though we're doing a 1 day ahead forecast, because our worst latency
+#' is 3 days from column `b`'s `"ma"` data, our outcome column is `ahead_4_a`
+#' (so 4 days ahead). If we want to ignore any latency in column `b`, we need
+#' to explicitly set the columns to consider while adjusting like this:
+#' `step_adjust_latency(a, method="extend_ahead")`.
+#'
+#' # Programmatic details
+#' `step_adjust_latency` uses the metadata, such as `time_type` and `as_of`, of
+#' the `epi_df` used in the initial prep step, rather than baking or
+#' prediction. This means reusing the same forecaster on new data is not
+#' advised (though that is typically not advised in general).
+#'
+#' The latency adjustment only applies to columns created after this step, so
+#' this step should go before both `step_epi_ahead` and `step_epi_lag`.
This will work:
+#' ```{r}
+#' toy_recipe <- epi_recipe(toy_df) %>%
+#'   # non-lag steps
+#'   step_adjust_latency(a, method = "extend_lags") %>%
+#'   step_epi_lag(a, lag=0) # other steps
+#' ```
+#' while this will not:
+#' ```{r}
+#' toy_recipe <- epi_recipe(toy_df) %>%
+#'   step_epi_lag(a, lag=0) %>%
+#'   step_adjust_latency(a, method = "extend_lags")
+#' ```
+#' If you create columns that you then apply lags to (such as
+#' `step_growth_rate()`), these should be created before
+#' `step_adjust_latency`, so any subsequent latency can be addressed.
+#'
+#' @param method a character. Determines the method by which the
+#'   forecast handles latency. The options are:
+#'   - `"extend_ahead"`: Lengthen the ahead so that forecasting from the last
+#'     observation results in a forecast `ahead` units after the `forecast_date`.
+#'     E.g. if there are 3 days of latency between the last observation and the
+#'     `forecast_date` for a 4 day ahead forecast, the ahead used in practice
+#'     is actually 7.
+#'   - `"locf"`: carries forward the last observed value(s) up to the forecast
+#'     date.
+#'   - `"extend_lags"`: per `epi_key` and `predictor`, adjusts the lag so that
+#'     the shortest lag at predict time is at the last observation. E.g. if the
+#'     lags are `c(0,7,14)` for data that is 3 days latent, the actual lags used
+#'     become `c(3,10,17)`.
+#' @param epi_keys_checked a character vector. The keys to group by before
+#'   finding the `max_time_value` (the last day of data), defaulting to
+#'   `geo_value`. Different locations may have different latencies; to produce a
+#'   forecast at every location, we need to guarantee data at every location by
+#'   using the largest latency across every location; this means taking
+#'   `max_time_value` to be the minimum of the `max_time_value`s for each set of
+#'   key values (so the earliest date). If `NULL` or an empty character vector,
+#'   it will take the maximum across all values, irrespective of any keys.
+#'
+#'   Note that this is a separate concern from different latencies across
+#'   different *data columns*, which is only handled by the choice of `method`.
+#' @param fixed_latency either a positive integer, or a labeled positive integer
+#'   vector. Cannot be set at the same time as `fixed_forecast_date`. If
+#'   non-`NULL`, the amount to offset the ahead or lag by. If a single integer,
+#'   this is used for all columns; if a labeled vector, the labels must
+#'   correspond to the base column names (before lags/aheads). If `NULL`, the
+#'   latency is the distance between the `epi_df`'s `max_time_value` and the
+#'   `forecast_date`. See the sketch below.
+#' @param fixed_forecast_date either a date of the same kind used in the
+#'   `epi_df`, or `NULL`. Exclusive with `fixed_latency`. If a date, it gives
+#'   the date from which the forecast is actually occurring. If `NULL`, the
+#'   `forecast_date` is determined either via the `fixed_latency`, or is set to
+#'   the `epi_df`'s `as_of` value if `fixed_latency` is also `NULL`.
+#' @param check_latency_length bool, determines whether to warn if the latency
+#'   is unusually high. Turn off if you know your forecast is going to be far
+#'   into the future.
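+#'
+#'   A minimal sketch of `fixed_latency` as a labeled vector, reusing `toy_df`
+#'   from above (where, with `epi_keys_checked = "geo_value"`, column `a` is
+#'   latent by 2 days and column `b` by 3, so these values match what would be
+#'   inferred from the `as_of`):
+#' ```{r, eval = FALSE}
+#' toy_recipe <- epi_recipe(toy_df) %>%
+#'   step_adjust_latency(
+#'     method = "extend_lags",
+#'     fixed_latency = c(a = 2, b = 3)
+#'   ) %>%
+#'   step_epi_lag(a, lag = 1) %>%
+#'   step_epi_lag(b, lag = 1) %>%
+#'   step_epi_ahead(a, ahead = 1)
+#' ```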
+#' @template step-return
+#' @inheritParams recipes::step_lag
+#'
+#'
+#' @family row operation steps
+#' @rdname step_adjust_latency
+#' @export
+#' @examples
+#' jhu <- case_death_rate_subset %>%
+#'   dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny"))
+#' # setting the `as_of` to something realistic
+#' attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3
+#'
+#' r <- epi_recipe(case_death_rate_subset) %>%
+#'   step_adjust_latency(method = "extend_ahead") %>%
+#'   step_epi_ahead(death_rate, ahead = 7) %>%
+#'   step_epi_lag(death_rate, lag = c(0, 7, 14))
+#' r
+#'
+#' jhu_fit <- epi_workflow() %>%
+#'   add_epi_recipe(r) %>%
+#'   add_model(linear_reg()) %>%
+#'   fit(data = jhu)
+#' jhu_fit
+#'
+#' @importFrom recipes detect_step
+#' @importFrom rlang enquos is_empty
+#' @importFrom dplyr tribble n
+step_adjust_latency <-
+  function(recipe,
+           ...,
+           method = c(
+             "extend_ahead",
+             "locf",
+             "extend_lags"
+           ),
+           epi_keys_checked = NULL,
+           fixed_latency = NULL,
+           fixed_forecast_date = NULL,
+           check_latency_length = TRUE,
+           id = rand_id("adjust_latency")) {
+    step_adjust_latency_checks(
+      id, method, recipe, fixed_latency, fixed_forecast_date
+    )
+    method <- rlang::arg_match(method)
+    if (is.null(epi_keys_checked)) {
+      epi_keys_checked <- kill_time_value(key_colnames(recipe$template))
+    }
+    recipes::add_step(
+      recipe,
+      step_adjust_latency_new(
+        terms = enquos(...),
+        role = NA,
+        trained = FALSE,
+        fixed_forecast_date = fixed_forecast_date,
+        forecast_date = NULL,
+        latency = fixed_latency,
+        latency_table = NULL,
+        latency_sign = NULL,
+        metadata = NULL,
+        method = method,
+        epi_keys_checked = epi_keys_checked,
+        check_latency_length = check_latency_length,
+        columns = NULL,
+        skip = FALSE,
+        id = id
+      )
+    )
+  }
+
+step_adjust_latency_new <-
+  function(terms, role, trained, fixed_forecast_date, forecast_date, latency,
+           latency_table, latency_sign, metadata, method, epi_keys_checked,
+           check_latency_length, columns, skip, id) {
+    step(
+      subclass = "adjust_latency",
+      terms = terms,
+      role = role,
+      trained = trained,
+      fixed_forecast_date = fixed_forecast_date,
+      forecast_date = forecast_date,
+      latency = latency,
+      latency_table = latency_table,
+      latency_sign = latency_sign,
+      metadata = metadata,
+      method = method,
+      epi_keys_checked = epi_keys_checked,
+      check_latency_length = check_latency_length,
+      columns = columns,
+      skip = skip,
+      id = id
+    )
+  }
+
+# lagging introduces max(lags) NA's after the max_time_value.
+#' @export
+#' @importFrom glue glue
+#' @importFrom dplyr rowwise
+prep.step_adjust_latency <- function(x, training, info = NULL, ...)
{
+  latency <- x$latency
+  forecast_date <- x$fixed_forecast_date %||%
+    get_forecast_date(training, info, x$epi_keys_checked, latency)
+
+  latency_table <- get_latency_table(
+    training, NULL, forecast_date, latency,
+    get_sign(x), x$epi_keys_checked, info, x$terms
+  )
+  # get the columns used, even if it's all of them
+  terms_used <- x$terms
+  if (is_empty(terms_used)) {
+    terms_used <- info %>%
+      filter(role == "raw") %>%
+      pull(variable)
+  }
+
+  step_adjust_latency_new(
+    terms = x$terms,
+    role = x$role,
+    trained = TRUE,
+    fixed_forecast_date = x$fixed_forecast_date,
+    forecast_date = forecast_date,
+    latency = x$latency,
+    latency_table = latency_table,
+    latency_sign = get_sign(x),
+    metadata = attributes(training)$metadata,
+    method = x$method,
+    epi_keys_checked = x$epi_keys_checked,
+    check_latency_length = x$check_latency_length,
+    columns = recipes_eval_select(latency_table$col_name, training, info),
+    skip = x$skip,
+    id = x$id
+  )
+}
+
+#' @importFrom dplyr %>% pull group_by_at
+#' @importFrom tidyr fill
+#' @export
+bake.step_adjust_latency <- function(object, new_data, ...) {
+  if (!inherits(new_data, "epi_df") || is.null(attributes(new_data)$metadata$as_of)) {
+    new_data <- as_epi_df(new_data)
+    attributes(new_data)$metadata <- object$metadata
+    attributes(new_data)$metadata$as_of <- object$forecast_date
+  } else {
+    compare_bake_prep_latencies(object, new_data)
+  }
+  if (object$method == "locf") {
+    # locf doesn't need to mess with the metadata at all, it just forward-fills
+    # the requested columns
+    rel_keys <- setdiff(key_colnames(new_data), "time_value")
+    modified_columns <- object$columns %>% unname()
+    if (object$check_latency_length) {
+      check_interminable_latency(
+        new_data, object$latency_table, modified_columns, object$forecast_date
+      )
+    }
+
+    new_data <- new_data %>%
+      pad_to_end(rel_keys, object$forecast_date, modified_columns) %>%
+      # group_by_at(rel_keys) %>%
+      arrange(time_value) %>%
+      as_tibble() %>%
+      tidyr::fill(.direction = "down", any_of(modified_columns)) %>%
+      ungroup()
+  } else if (object$method == "extend_lags" || object$method == "extend_ahead") {
+    attributes(new_data)$metadata$latency_table <- object$latency_table
+    attributes(new_data)$metadata$latency_sign <- object$latency_sign
+  }
+  return(new_data)
+}
+
+
+
+#' @export
+print.step_adjust_latency <-
+  function(x, width = max(20, options()$width - 35), ...)
{
+    if (!is.null(x$forecast_date)) {
+      conj <- "with forecast date"
+      extra_text <- x$forecast_date
+    } else if (!is.null(x$latency_table)) {
+      extra_text <- unique(x$latency_table$latency)
+      conj <- if (length(extra_text) == 1) {
+        "with latency"
+      } else {
+        "with latencies"
+      }
+    } else {
+      conj <- "with latency"
+      extra_text <- "set at train time"
+    }
+    # What follows is a somewhat modified version of print_epi_step; for
+    # adjust_latency, an empty set of arguments means "apply to all relevant
+    # columns", rather than "none of them".
+    theme_div_id <- cli::cli_div(
+      theme = list(.pkg = list(`vec-trunc` = Inf, `vec-last` = ", "))
+    )
+    title <- trimws(x$method)
+    trained_text <- dplyr::if_else(x$trained, "Trained", "")
+    vline_separator <- dplyr::if_else(trained_text == "", "", "|")
+    comma_separator <- dplyr::if_else(
+      trained_text != "", true = ",", false = ""
+    )
+    extra_text <- recipes::format_ch_vec(extra_text)
+    width_title <- nchar(paste0(
+      "* ", title, ":", " ", conj, " ", extra_text, " ", vline_separator,
+      " ", trained_text, " "
+    ))
+    width_diff <- cli::console_width() - width_title
+    if (x$trained) {
+      elements <- x$columns
+    } else {
+      if (is_empty(x$terms)) {
+        elements <- "all future predictors"
+      } else {
+        elements <- lapply(x$terms, function(x) {
+          rlang::expr_deparse(rlang::quo_get_expr(x), width = Inf)
+        })
+        elements <- vctrs::list_unchop(elements, ptype = character())
+      }
+    }
+
+    element_print_lengths <- cumsum(nchar(elements)) +
+      c(0L, cumsum(rep(2L, length(elements) - 1))) +
+      c(rep(5L, length(elements) - 1), 0L)
+    first_line <- which(width_diff >= element_print_lengths)
+    first_line <- unname(first_line)
+    first_line <- ifelse(
+      test = identical(first_line, integer(0)),
+      yes = length(element_print_lengths),
+      no = max(first_line)
+    )
+    more_dots <- ifelse(first_line == length(elements), "", ", ...")
+    cli::cli_bullets(
+      c("\n    {title}: \\\n    {.pkg {cli::cli_vec(elements[seq_len(first_line)])}}\\\n    {more_dots} \\\n    {conj} \\\n    {.pkg {extra_text}} \\\n    {vline_separator} \\\n    {.emph {trained_text}}")
+    )
+
+    cli::cli_end(theme_div_id)
+    invisible(x)
+  }
diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R
index 465d64e7f..a4bcee52e 100644
--- a/R/step_epi_shift.R
+++ b/R/step_epi_shift.R
@@ -79,6 +79,8 @@ step_epi_lag <-
      default = default,
      keys = key_colnames(recipe),
      columns = NULL,
+      shift_grid = NULL,
+      latency_adjusted = FALSE,
      skip = skip,
      id = id
    )
@@ -123,6 +125,8 @@ step_epi_ahead <-
      default = default,
      keys = key_colnames(recipe),
      columns = NULL,
+      shift_grid = NULL,
+      latency_adjusted = FALSE,
      skip = skip,
      id = id
    )
@@ -132,7 +136,7 @@ step_epi_ahead <-
step_epi_lag_new <-
  function(terms, role, trained, lag, prefix, default, keys,
-           columns, skip, id) {
+           columns, shift_grid, latency_adjusted, skip, id) {
    recipes::step(
      subclass = "epi_lag",
      terms = terms,
@@ -143,6 +147,8 @@ step_epi_lag_new <-
      default = default,
      keys = keys,
      columns = columns,
+      shift_grid = shift_grid,
+      latency_adjusted = latency_adjusted,
      skip = skip,
      id = id
    )
@@ -150,7 +156,7 @@ step_epi_lag_new <-
step_epi_ahead_new <-
  function(terms, role, trained, ahead, prefix, default, keys,
-           columns, skip, id) {
+           columns, shift_grid, latency_adjusted, skip, id) {
    recipes::step(
      subclass = "epi_ahead",
      terms = terms,
@@ -161,6 +167,8 @@ step_epi_ahead_new <-
      default = default,
      keys = keys,
      columns = columns,
+      shift_grid = shift_grid,
+      latency_adjusted = latency_adjusted,
      skip = skip,
      id = id
    )
@@ -170,6 +178,22 @@ step_epi_ahead_new <-

#' @export
prep.step_epi_lag <- function(x, training, info = NULL, ...) { + columns <- recipes::recipes_eval_select(x$terms, training, info) + if (!x$latency_adjusted) { + tmp <- create_shift_grid( + x$prefix, + x$lag, + get_sign(x), + columns, + attributes(training)$metadata$latency_table, + attributes(training)$metadata$latency_sign + ) + shift_grid <- tmp[[1]] + latency_adjusted <- tmp[[2]] + } else { + shift_grid <- x$shift_grid + } + step_epi_lag_new( terms = x$terms, role = x$role, @@ -178,7 +202,9 @@ prep.step_epi_lag <- function(x, training, info = NULL, ...) { prefix = x$prefix, default = x$default, keys = x$keys, - columns = recipes::recipes_eval_select(x$terms, training, info), + columns = columns, + shift_grid = shift_grid, + latency_adjusted = latency_adjusted, skip = x$skip, id = x$id ) @@ -186,6 +212,22 @@ prep.step_epi_lag <- function(x, training, info = NULL, ...) { #' @export prep.step_epi_ahead <- function(x, training, info = NULL, ...) { + columns <- recipes::recipes_eval_select(x$terms, training, info) + if (!x$latency_adjusted) { + tmp <- create_shift_grid( + x$prefix, + x$ahead, + get_sign(x), + columns, + attributes(training)$metadata$latency_table, + attributes(training)$metadata$latency_sign + ) + shift_grid <- tmp[[1]] + latency_adjusted <- tmp[[2]] + } else { + shift_grid <- x$shift_grid + } + step_epi_ahead_new( terms = x$terms, role = x$role, @@ -194,7 +236,9 @@ prep.step_epi_ahead <- function(x, training, info = NULL, ...) { prefix = x$prefix, default = x$default, keys = x$keys, - columns = recipes::recipes_eval_select(x$terms, training, info), + columns = columns, + shift_grid = shift_grid, + latency_adjusted = latency_adjusted, skip = x$skip, id = x$id ) @@ -204,81 +248,41 @@ prep.step_epi_ahead <- function(x, training, info = NULL, ...) { #' @export bake.step_epi_lag <- function(object, new_data, ...) { - grid <- tidyr::expand_grid(col = object$columns, lag = object$lag) %>% - mutate( - newname = glue::glue("{object$prefix}{lag}_{col}"), - shift_val = lag, - lag = NULL - ) - - ## ensure no name clashes - new_data_names <- colnames(new_data) - intersection <- new_data_names %in% grid$newname - if (any(intersection)) { - cli_abort(c( - "Name collision occured in {.cls {class(object)[1]}}", - "The following variable name{?s} already exist{?s/}: {.val {new_data_names[intersection]}}." - )) - } - ok <- object$keys - shifted <- reduce( - pmap(grid, epi_shift_single, x = new_data, key_cols = ok), - full_join, - by = ok - ) - - full_join(new_data, shifted, by = ok) %>% - group_by(across(all_of(kill_time_value(ok)))) %>% - arrange(time_value) %>% - ungroup() + add_shifted_columns(new_data, object) } #' @export bake.step_epi_ahead <- function(object, new_data, ...) { - grid <- tidyr::expand_grid(col = object$columns, ahead = object$ahead) %>% - mutate( - newname = glue::glue("{object$prefix}{ahead}_{col}"), - shift_val = -ahead, - ahead = NULL - ) - - ## ensure no name clashes - new_data_names <- colnames(new_data) - intersection <- new_data_names %in% grid$newname - if (any(intersection)) { - cli_abort(c( - "Name collision occured in {.cls {class(object)[1]}}", - "The following variable name{?s} already exist{?s/}: {.val {new_data_names[intersection]}}." 
- )) - } - ok <- object$keys - shifted <- reduce( - pmap(grid, epi_shift_single, x = new_data, key_cols = ok), - full_join, - by = ok - ) - - full_join(new_data, shifted, by = ok) %>% - group_by(across(all_of(kill_time_value(ok)))) %>% - arrange(time_value) %>% - ungroup() + add_shifted_columns(new_data, object) } - #' @export print.step_epi_lag <- function(x, width = max(20, options()$width - 30), ...) { + if (x$latency_adjusted && x$trained) { + lag <- x$shift_grid$shift_val + lag <- c(lag, "(lat adj)") + } else { + lag <- x$lag + } print_epi_step(x$columns, x$terms, x$trained, "Lagging", conjunction = "by", - extra_text = x$lag + extra_text = lag ) invisible(x) } + #' @export print.step_epi_ahead <- function(x, width = max(20, options()$width - 30), ...) { + if (x$latency_adjusted && x$trained) { + ahead <- x$shift_grid$shift_val + ahead <- c(ahead, "(lat adj)") + } else { + ahead <- x$ahead + } print_epi_step(x$columns, x$terms, x$trained, "Leading", conjunction = "by", - extra_text = x$ahead + extra_text = ahead ) invisible(x) } diff --git a/R/step_growth_rate.R b/R/step_growth_rate.R index 06f8da4cf..00bf9bd87 100644 --- a/R/step_growth_rate.R +++ b/R/step_growth_rate.R @@ -58,7 +58,7 @@ step_growth_rate <- arg_is_pos_int(horizon) arg_is_scalar(horizon) if (!is.null(replace_Inf)) { - if (length(replace_Inf) != 1L) cli_abort("replace_Inf must be a scalar.") + if (length(replace_Inf) != 1L) cli_abort("`replace_Inf` must be a scalar.") if (!is.na(replace_Inf)) arg_is_numeric(replace_Inf) } arg_is_chr(role) diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R index 4e4d3aa26..76068109c 100644 --- a/R/step_population_scaling.R +++ b/R/step_population_scaling.R @@ -165,7 +165,7 @@ bake.step_population_scaling <- function(object, new_data, ...) { hardhat::validate_column_names(object$df, joinby$y) if (object$suffix != "_scaled" && object$create_new == FALSE) { - cli::cli_warn(c( + cli_warn(c( "Custom `suffix` {.val {object$suffix}} was ignored in `step_population_scaling`.", i = "Perhaps `create_new` should be {.val {TRUE}}?" )) diff --git a/R/tidy.R b/R/tidy.R index 61b298411..8fc06398a 100644 --- a/R/tidy.R +++ b/R/tidy.R @@ -96,7 +96,7 @@ tidy.frosting <- function(x, number = NA, id = NA, ...) { #' @export tidy.layer <- function(x, ...) { - cli::cli_abort( + cli_abort( "No `tidy()` method exists for a layer with class: {.cls {class(x)}}." ) } diff --git a/R/utils-enframer.R b/R/utils-enframer.R deleted file mode 100644 index 0a8152906..000000000 --- a/R/utils-enframer.R +++ /dev/null @@ -1,23 +0,0 @@ -enframer <- function(df, x, fill = NA) { - stopifnot(is.data.frame(df)) - stopifnot(length(fill) == 1 || length(fill) == nrow(df)) - arg_is_chr(x, allow_null = TRUE) - if (is.null(x)) { - return(df) - } - if (any(names(df) %in% x)) { - stop("In enframer: some new cols match existing column names") - } - for (v in x) df <- dplyr::mutate(df, !!v := fill) - df -} - -enlist <- function(...) 
{
-  # converted to thin wrapper around
-  rlang::dots_list(
-    ...,
-    .homonyms = "error",
-    .named = TRUE,
-    .check_assign = TRUE
-  )
-}
diff --git a/R/utils-latency.R b/R/utils-latency.R
new file mode 100644
index 000000000..311656ac1
--- /dev/null
+++ b/R/utils-latency.R
@@ -0,0 +1,464 @@
+#' create a table of the columns to modify, their shifts, and their prefixes
+#' @keywords internal
+#' @importFrom dplyr tibble
+#' @importFrom tidyr unnest
+construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name) {
+  # for the right step types (either "step_epi_lag" or "step_epi_ahead"), grab
+  # the useful parameters, including the evaluated column names
+  extract_named_rates <- function(recipe_step) {
+    if (inherits(recipe_step, rel_step_type)) {
+      recipe_columns <- recipes_eval_select(recipe_step$terms, recipe$template, recipe$term_info)
+      if (any(recipe_columns %in% terms_used)) {
+        return(list(term = recipe_columns, shift = recipe_step[shift_name], prefix = recipe_step$prefix))
+      }
+    }
+    return(NULL)
+  }
+  rel_list <- recipe$steps %>%
+    map(extract_named_rates) %>%
+    unlist(recursive = FALSE) %>%
+    split(c("term", "shift", "prefix"))
+  relevant_shifts <- tibble(
+    terms = lapply(rel_list$term, unname),
+    shift = lapply(rel_list$shift, unname),
+    prefix = unname(unlist(rel_list$prefix))
+  ) %>%
+    unnest(c(terms, shift)) %>%
+    unnest(shift)
+  return(relevant_shifts)
+}
+
+#' Extract the as_of for the forecast date, and make sure there's nothing very off about it.
+#' @keywords internal
+#' @importFrom dplyr select
+#' @importFrom tidyr drop_na
+#' @importFrom utils capture.output
+get_forecast_date <- function(new_data, info, epi_keys_checked, latency, columns = NULL) {
+  if (is.null(columns)) {
+    columns <- info %>%
+      filter(source == "original") %>%
+      pull(variable)
+    # make sure there are enough columns
+    if (length(columns) < 3) {
+      cli_abort(
+        glue::glue(
+          "The original data must include `time_value`, ",
+          "`geo_value`, and at least one signal column. The current columns are \n",
+          paste(capture.output(info), collapse = "\n\n")
+        ),
+        class = "epipredict__get_forecast_date__too_few_data_columns"
+      )
+    }
+  }
+  # the source data determines the actual time_values; these are the non-NA
+  # time_values. Get the minimum value across the checked epi_keys' maximum
+  # time values.
+  max_time <- new_data %>%
+    select(all_of(columns)) %>%
+    drop_na()
+  # null and "" don't work in `group_by`
+  if (!is.null(epi_keys_checked) && (epi_keys_checked != "")) {
+    max_time <- max_time %>% group_by(get(epi_keys_checked))
+  }
+  max_time <- max_time %>%
+    summarise(time_value = max(time_value)) %>%
+    pull(time_value) %>%
+    min()
+  if (is.null(latency)) {
+    forecast_date <- attributes(new_data)$metadata$as_of
+  } else {
+    forecast_date <- max_time + latency
+  }
+  # make sure the as_of is sane
+  if (!inherits(forecast_date, class(max_time)) && !inherits(forecast_date, "POSIXt")) {
+    cli_abort(
+      paste(
+        "the `forecast_date` value {forecast_date} ",
+        "is not a valid `time_type` with type ",
+        "matching `time_value`'s type of ",
+        "{class(max_time)}."
+      ),
+      class = "epipredict__get_forecast_date__wrong_time_value_type_error"
+    )
+  }
+  if (is.null(forecast_date) || is.na(forecast_date)) {
+    cli_warn(
+      paste(
+        "epi_data's `forecast_date` was {forecast_date}, setting to ",
+        "the latest time value, {max_time}."
+      ),
+      class = "epipredict__get_forecast_date__max_time_warning"
+    )
+    forecast_date <- max_time
+  } else if (forecast_date < max_time) {
+    cli_abort(
+      paste(
+        "`forecast_date` ({forecast_date}) is before the most ",
+        "recent data ({max_time}). Remove it or update the data ",
+        "before predicting."
+      ),
+      class = "epipredict__get_forecast_date__misordered_forecast_date_error"
+    )
+  }
+  # TODO cover the rest of the possible types for as_of and max_time...
+  if (inherits(max_time, "Date")) {
+    forecast_date <- as.Date(forecast_date)
+  }
+  return(forecast_date)
+}
+
+#' the latency is also the amount the shift is off by
+#' @param sign_shift integer. 1 if lag and -1 if ahead. These represent how you
+#'   need to shift the data to bring the 3 day lagged value to today.
+#' @keywords internal
+get_latency <- function(new_data, forecast_date, column, sign_shift, epi_keys_checked) {
+  shift_max_date <- new_data %>%
+    drop_na(all_of(column))
+  # null and "" don't work in `group_by`
+  if (!is.null(epi_keys_checked) && epi_keys_checked != "") {
+    shift_max_date <- shift_max_date %>% group_by(get(epi_keys_checked))
+  }
+  shift_max_date <- shift_max_date %>%
+    summarise(time_value = max(time_value)) %>%
+    pull(time_value) %>%
+    min()
+  return(as.integer(sign_shift * (as.Date(forecast_date) - shift_max_date)))
+}
+
+
+
+#' get the forecast date while in a layer
+#' @param this_recipe the recipe to check for `step_adjust_latency`
+#' @param workflow_max_time_value the `max_time` value coming out of the fit
+#'   workflow (this will be the maximal time value in a potentially different
+#'   dataset)
+#' @param new_data the data we're currently working with, from which we'll take
+#'   a potentially different max_time_value
+#' @keywords internal
+get_forecast_date_in_layer <- function(this_recipe, workflow_max_time_value, new_data) {
+  forecast_date <- as.Date(max(
+    workflow_max_time_value,
+    this_recipe$max_time_value,
+    max(new_data$time_value)
+  ))
+  if (this_recipe %>% recipes::detect_step("adjust_latency")) {
+    # get the as_of in an `adjust_latency` step, regardless of where
+    handpicked_forecast_date <- map(
+      this_recipe$steps,
+      function(x) {
+        if (inherits(x, "step_adjust_latency")) x$forecast_date
+      }
+    ) %>% Filter(Negate(is.null), .)
+    if (length(handpicked_forecast_date) > 0) {
+      forecast_date <- handpicked_forecast_date[[1]]
+    } else {
+      # if we haven't chosen one, use either the max_time_value or the as_of
+      forecast_date <- max(
+        forecast_date,
+        attributes(new_data)$metadata$as_of
+      )
+    }
+  }
+  forecast_date
+}
+
+
+#' pad every group at the right interval
+#' @description
+#' Perform last observation carried forward on a group by group basis. It uses
+#' `guess_period` to find the appropriate interval to fill-forward by. It
+#' maintains the grouping structure it receives. It does *not* fill any
+#' "interior" `NA` values occurring in the data beforehand.
+#' @param x an epi_df to be filled forward.
+#' @param columns_to_complete which columns to apply completion to.
By default every non-key column of an epi_df
+#' @param groups the grouping by which to fill forward
+#' @importFrom tidyselect all_of
+#' @importFrom rlang list2
+#' @importFrom vctrs vec_cast
+#' @importFrom dplyr across arrange bind_rows group_by summarise
+#' @keywords internal
+pad_to_end <- function(x, groups, end_date, columns_to_complete = NULL) {
+  if (is.null(columns_to_complete)) {
+    columns_to_complete <- setdiff(names(x), key_colnames(x))
+  }
+  itval <- epiprocess:::guess_period(c(x$time_value, end_date), "time_value")
+  # get the time values we need to fill in
+  completed_time_values <- x %>%
+    group_by(across(all_of(groups))) %>%
+    summarise(
+      time_value = list2(
+        time_value = seq_forward(from = max(time_value) + itval, to = end_date, by = itval)
+      )
+    ) %>%
+    unnest("time_value") %>%
+    mutate(time_value = vec_cast(time_value, x$time_value))
+  # pull the last value in each group and fill forward
+  grouped_and_arranged <- x %>%
+    arrange(across(all_of(c("time_value", groups)))) %>%
+    group_by(across(all_of(groups)))
+
+  values_to_fill <- grouped_and_arranged %>%
+    slice(min(across(all_of(columns_to_complete), count_single_column)):n())
+  filled_values <- values_to_fill %>%
+    bind_rows(completed_time_values) %>%
+    arrange(across(all_of(c("time_value", groups)))) %>%
+    fill(all_of(columns_to_complete), .direction = "down") %>%
+    slice(-1) # remove the original rows
+
+  grouped_and_arranged %>%
+    slice(1:min(across(all_of(columns_to_complete), count_single_column))) %>%
+    bind_rows(filled_values) %>%
+    arrange(across(all_of(key_colnames(x)))) %>%
+    ungroup()
+}
+
+#' get the location of the last real value
+#' @param col the relevant column
+#' @keywords internal
+count_single_column <- function(col) {
+  max(which(!is.na(col)))
+}
+
+
+#' seq, but returns NULL if `from` is larger than `to`
+#' @keywords internal
+seq_forward <- function(from, to, by) {
+  if (from > to) {
+    return(NULL)
+  }
+  seq(from = from, to = to, by = by)
+}
+
+
+#' warn when the latency is larger than would be reasonable
+#' @param dataset the epi_df
+#' @param latency_table the whole collection of latencies
+#' @param target_columns the names of the columns that we're adjusting and checking for unreasonably high latency
+#' @keywords internal
+check_interminable_latency <- function(dataset, latency_table, target_columns, forecast_date, call = caller_env()) {
+  # check that the shift amount isn't too extreme
+  rel_latency_table <- latency_table %>%
+    filter(col_name %in% target_columns)
+  # no relevant columns, so this error definitely isn't happening
+  if (nrow(rel_latency_table) == 0) {
+    return()
+  }
+  latency_max <- rel_latency_table %>%
+    pull(latency) %>%
+    abs() %>%
+    max()
+  time_type <- attributes(dataset)$metadata$time_type
+  i_latency <- which.max(latency_table$latency)
+  if (
+    (grepl("day", time_type) && (latency_max >= 28)) ||
+      (grepl("week", time_type) && (latency_max >= 4)) ||
+      ((time_type == "yearmonth") && (latency_max >= 2)) ||
+      ((time_type == "yearquarter") && (latency_max >= 1)) ||
+      ((time_type == "year") && (latency_max >= 1))
+  ) {
+    max_time_value <- dataset %>%
+      filter(!is.na(!!(latency_table[[i_latency, "col_name"]]))) %>%
+      pull(time_value) %>%
+      max()
+    cli_warn(
+      message = c(
+        paste(
+          "The maximum latency is {latency_max}, ",
+          "which is questionable for its `time_type` of ",
+          "{time_type}."
+        ),
+        "i" = "latency: {latency_table$latency[[i_latency]]}",
+        "i" = "`max_time` = {max_time_value} -> `forecast_date` = {forecast_date}"
+      ),
+      class = "epipredict__prep.step_latency__very_large_latency",
+      call = call
+    )
+  }
+}
+
+`%nin%` <- function(x, table) {
+  !(x %in% table)
+}
+
+#' create the latency table
+#'
+#' This is a table of column names and the latency adjustment necessary for
+#' that column. An example:
+#'
+#'   col_name   latency
+#' 1 case_rate        5
+#' 2 death_rate       5
+#' @keywords internal
+#' @importFrom dplyr rowwise
+get_latency_table <- function(training, columns, forecast_date, latency,
+                              sign_shift, epi_keys_checked, info, terms) {
+  if (is.null(columns)) {
+    columns <- recipes_eval_select(terms, training, info)
+  }
+  # construct the latency table
+  latency_table <- tibble(col_name = names(training)) %>%
+    filter(col_name %nin% key_colnames(training))
+  if (length(columns) > 0) {
+    latency_table <- latency_table %>% filter(col_name %in% columns)
+  }
+
+  if (is.null(latency)) {
+    latency_table <- latency_table %>%
+      rowwise() %>%
+      mutate(latency = get_latency(
+        training, forecast_date, col_name, sign_shift, epi_keys_checked
+      ))
+  } else if (length(latency) > 1) {
+    # if latency has a length, it must also have named elements.
+    # We assign based on comparing the name in the list
+    # with the column names, and drop any which don't have a latency assigned
+    latency_table <- latency_table %>%
+      filter(col_name %in% names(latency)) %>%
+      rowwise() %>%
+      mutate(latency = unname(latency[names(latency) == col_name]))
+  } else {
+    latency_table <- latency_table %>%
+      rowwise() %>%
+      mutate(latency = get_latency(
+        training, forecast_date, col_name, sign_shift, epi_keys_checked
+      ))
+    if (latency) {
+      latency_table <- latency_table %>% mutate(latency = latency)
+    }
+  }
+  return(latency_table %>% ungroup())
+}
+
+
+#' checks: the recipe type, whether a previous step is the relevant epi_shift,
+#' that at most one of `fixed_latency` and `fixed_forecast_date` is non-`NULL`,
+#' and that `fixed_latency` only references columns that exist at the time of
+#' the step inclusion
+#' @keywords internal
+step_adjust_latency_checks <- function(id, method, recipe, fixed_latency, fixed_forecast_date, call = caller_env()) {
+  arg_is_chr_scalar(id, method)
+  if (detect_step(recipe, "adjust_latency")) {
+    cli_abort("Only one `step_adjust_latency()` can be included in a recipe.",
+      class = "epipredict__step_adjust_latency__multiple_steps"
+    )
+  }
+  if (!is_epi_recipe(recipe)) {
+    cli_abort("This recipe step can only operate on an {.cls epi_recipe}.",
+      class = "epipredict__step_adjust_latency__epi_recipe_only"
+    )
+  }
+  if ((method == "extend_ahead") && (detect_step(recipe, "epi_ahead"))) {
+    cli_warn(
+      "If `method` is {.val extend_ahead}, then the previous `step_epi_ahead` won't be modified.",
+      class = "epipredict__step_adjust_latency__misordered_step_warning"
+    )
+  } else if ((method == "extend_lags") && detect_step(recipe, "epi_lag")) {
+    cli_warn(
+      paste(
+        "If `method` is {.val extend_lags} or {.val locf},",
+        "then the previous `step_epi_lag`s won't work with modified data."
+      ),
+      class = "epipredict__step_adjust_latency__misordered_step_warning"
+    )
+  } else if ((method == "locf") && (length(recipe$steps) > 0)) {
+    cli_warn(
+      paste0(
+        "There are steps before `step_adjust_latency`.",
+        " With the method {.val locf}, it is recommended to include this step before any others."
+      ),
+      class = "epipredict__step_adjust_latency__misordered_step_warning"
+    )
+  }
+  if (!is.null(fixed_latency) &&
!is.null(fixed_forecast_date)) {
+    cli_abort(
+      "Only one of `fixed_latency` and `fixed_forecast_date` can be non-`NULL` at a time!",
+      class = "epipredict__step_adjust_latency__too_many_args_error"
+    )
+  }
+  if (length(fixed_latency) > 1) {
+    template <- recipe$template
+    data_names <- names(template)[!names(template) %in% key_colnames(template)]
+    wrong_names <- names(fixed_latency)[!names(fixed_latency) %in% data_names]
+    if (length(wrong_names) > 0) {
+      cli_abort(
+        "{.arg fixed_latency} contains names not in the template dataset: {wrong_names}",
+        class = "epipredict__step_adjust_latency__undefined_names_error"
+      )
+    }
+  }
+}
+
+compare_bake_prep_latencies <- function(object, new_data, call = caller_env()) {
+  latency <- object$latency
+  current_forecast_date <- object$fixed_forecast_date %||%
+    get_forecast_date(
+      new_data, NULL, object$epi_keys_checked, latency,
+      c(key_colnames(new_data), object$columns)
+    )
+  local_latency_table <- get_latency_table(
+    new_data, object$columns, current_forecast_date, latency,
+    get_sign(object), object$epi_keys_checked, NULL, NULL
+  )
+  comparison_table <- local_latency_table %>%
+    ungroup() %>%
+    dplyr::full_join(
+      object$latency_table %>% ungroup(),
+      by = join_by(col_name),
+      suffix = c(".bake", ".prep")
+    ) %>%
+    mutate(bakeMprep = latency.bake - latency.prep)
+  if (any(comparison_table$bakeMprep > 0)) {
+    cli_abort(
+      paste0(
+        "There is more latency at bake time than there was at prep time.",
+        " You will need to fit a model with more latency to predict on this dataset."
+      ),
+      class = "epipredict__latency__bake_prep_difference_error",
+      latency_table = comparison_table,
+      call = call
+    )
+  }
+  if (any(comparison_table$bakeMprep < 0)) {
+    cli_warn(
+      paste0(
+        "There is less latency at bake time than there was at prep time.",
+        " This will still fit, but will discard the most recent data."
+      ),
+      class = "epipredict__latency__bake_prep_difference_warn",
+      latency_table = comparison_table,
+      call = call
+    )
+  }
+  if (current_forecast_date != object$forecast_date) {
+    cli_warn(
+      paste0(
+        "The forecast date differs from the one set at train time;",
+        " this means any dates added by `layer_add_forecast_date` will be inaccurate."
+      ),
+      class = "epipredict__latency__bake_prep_forecast_date_warn",
+      call = call
+    )
+  }
+}
+
+
+#' @keywords internal
+create_shift_grid <- function(prefix, amount, target_sign, columns, latency_table, latency_sign) {
+  if (!is.null(latency_table) &&
+    latency_sign == target_sign) {
+    # get the actually used latencies
+    rel_latency <- latency_table %>% filter(col_name %in% columns)
+    latency_adjusted <- TRUE
+  } else {
+    # adding zero if there's no latency table
+    rel_latency <- tibble(col_name = columns, latency = 0L)
+    latency_adjusted <- FALSE
+  }
+  shift_grid <- expand_grid(col = columns, amount = target_sign * amount) %>%
+    left_join(rel_latency, by = join_by(col == col_name)) %>%
+    tidyr::replace_na(list(latency = 0)) %>%
+    mutate(shift_val = amount + latency) %>%
+    mutate(
+      newname = glue::glue("{prefix}{abs(shift_val)}_{col}"), # name is always positive
+      amount = NULL,
+      latency = NULL
+    )
+  return(list(shift_grid, latency_adjusted))
+}
diff --git a/R/utils-misc.R b/R/utils-misc.R
index b4d1c28b7..a1e0f025f 100644
--- a/R/utils-misc.R
+++ b/R/utils-misc.R
@@ -34,7 +34,7 @@ check_pname <- function(res, preds, object, newname = NULL) {

grab_forged_keys <- function(forged, workflow, new_data) {
  forged_roles <- names(forged$extras$roles)
-  extras <- dplyr::bind_cols(forged$extras$roles[forged_roles %in% c("geo_value", "time_value", "key")])
+  extras <- bind_cols(forged$extras$roles[forged_roles %in% c("geo_value", "time_value", "key")])
  # 1. these are the keys in the test data after prep/bake
  new_keys <- names(extras)
  # 2. these are the keys in the training data
@@ -42,7 +42,7 @@ grab_forged_keys <- function(forged, workflow, new_data) {
  # 3. these are the keys in the test data as input
  new_df_keys <- key_colnames(new_data, extra_keys = setdiff(new_keys, c("geo_value", "time_value")))
  if (!(setequal(old_keys, new_df_keys) && setequal(new_keys, new_df_keys))) {
-    cli::cli_warn(c(
+    cli_warn(paste(
      "Not all epi keys that were present in the training data are available",
      "in `new_data`. Predictions will have only the available keys."
    ))
@@ -75,3 +75,14 @@ is_classification <- function(trainer) {
is_regression <- function(trainer) {
  get_parsnip_mode(trainer) %in% c("regression", "unknown")
}
+
+
+enlist <- function(...)
{ + # converted to thin wrapper around + rlang::dots_list( + ..., + .homonyms = "error", + .named = TRUE, + .check_assign = TRUE + ) +} diff --git a/man/add_shifted_columns.Rd b/man/add_shifted_columns.Rd new file mode 100644 index 000000000..aad22e805 --- /dev/null +++ b/man/add_shifted_columns.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_shift.R +\name{add_shifted_columns} +\alias{add_shifted_columns} +\title{backend for both \code{bake.step_epi_ahead} and \code{bake.step_epi_lag}, performs the +checks missing in \code{epi_shift_single}} +\usage{ +add_shifted_columns(new_data, object) +} +\description{ +backend for both \code{bake.step_epi_ahead} and \code{bake.step_epi_lag}, performs the +checks missing in \code{epi_shift_single} +} +\keyword{internal} diff --git a/man/arx_args_list.Rd b/man/arx_args_list.Rd index c9ae4e733..f28cdefab 100644 --- a/man/arx_args_list.Rd +++ b/man/arx_args_list.Rd @@ -10,11 +10,12 @@ arx_args_list( n_training = Inf, forecast_date = NULL, target_date = NULL, + adjust_latency = c("none", "extend_ahead", "extend_lags", "locf"), + warn_latency = TRUE, quantile_levels = c(0.05, 0.95), symmetrize = TRUE, nonneg = TRUE, quantile_by_key = character(0L), - nafill_buffer = Inf, check_enough_data_n = NULL, check_enough_data_epi_keys = NULL, ... @@ -32,11 +33,34 @@ date for which forecasts should be produced.} key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. -The default \code{NULL} will attempt to determine this automatically.} +\item{forecast_date}{Date. The date from which the forecast is occurring. +The default \code{NULL} will determine this automatically from either +\enumerate{ +\item the maximum time value for which there's data if there is no latency +adjustment (the default case), or +\item the \code{as_of} date of \code{epi_data} if \code{adjust_latency} is +non-\code{NULL}. +}} -\item{target_date}{Date. The date for which the forecast is intended. -The default \code{NULL} will attempt to determine this automatically.} +\item{target_date}{Date. The date that is being forecast. The default \code{NULL} +will determine this automatically as \code{forecast_date + ahead}.} + +\item{adjust_latency}{Character. One of the \code{method}s of +\code{\link[=step_adjust_latency]{step_adjust_latency()}}, or \code{"none"} (in which case there is no adjustment). +If the \code{forecast_date} is after the last day of data, this determines how +to shift the model to account for this difference. The options are: +\itemize{ +\item \code{"none"} the default, assumes the \code{forecast_date} is the last day of data +\item \code{"extend_ahead"}: increase the \code{ahead} by the latency so it's relative to +the last day of data. For example, if the last day of data was 3 days ago, +the ahead becomes \code{ahead+3}. +\item \code{"extend_lags"}: increase the lags so they're relative to the actual +forecast date. For example, if the lags are \code{c(0,7,14)} and the last day of +data was 3 days ago, the lags become \code{c(3,10,17)}. +}} + +\item{warn_latency}{by default, \code{step_adjust_latency} warns the user if the +latency is large. If this is \code{FALSE}, that warning is turned off.} \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce prediction intervals. These are created by computing the quantiles of @@ -57,16 +81,6 @@ before calculating residual quantiles. 
See the \code{by_key} argument to residual quantiles are used. It is not applicable with \code{trainer = quantile_reg()}, for example.} -\item{nafill_buffer}{At predict time, recent values of the training data -are used to create a forecast. However, these can be \code{NA} due to, e.g., -data latency issues. By default, any missing values will get filled with -less recent data. Setting this value to \code{NULL} will result in 1 extra -recent row (beyond those required for lag creation) to be used. Note that -we require at least \code{min(lags)} rows of recent data per \code{geo_value} to -create a prediction. For this reason, setting \code{nafill_buffer < min(lags)} -will be treated as \emph{additional} allowed recent data rather than the -total amount of recent data to examine.} - \item{check_enough_data_n}{Integer. A lower limit for the number of rows per epi_key that are required for training. If \code{NULL}, this check is ignored.} diff --git a/man/arx_class_args_list.Rd b/man/arx_class_args_list.Rd index 311950d62..a229b67c0 100644 --- a/man/arx_class_args_list.Rd +++ b/man/arx_class_args_list.Rd @@ -10,13 +10,14 @@ arx_class_args_list( n_training = Inf, forecast_date = NULL, target_date = NULL, + adjust_latency = c("none", "extend_ahead", "extend_lags", "locf"), + warn_latency = TRUE, outcome_transform = c("growth_rate", "lag_difference"), breaks = 0.25, horizon = 7L, method = c("rel_change", "linear_reg"), log_scale = FALSE, additional_gr_args = list(), - nafill_buffer = Inf, check_enough_data_n = NULL, check_enough_data_epi_keys = NULL, ... @@ -34,11 +35,34 @@ date for which forecasts should be produced.} key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. -The default \code{NULL} will attempt to determine this automatically.} +\item{forecast_date}{Date. The date from which the forecast is occurring. +The default \code{NULL} will determine this automatically from either +\enumerate{ +\item the maximum time value for which there's data if there is no latency +adjustment (the default case), or +\item the \code{as_of} date of \code{epi_data} if \code{adjust_latency} is +non-\code{NULL}. +}} -\item{target_date}{Date. The date for which the forecast is intended. -The default \code{NULL} will attempt to determine this automatically.} +\item{target_date}{Date. The date that is being forecast. The default \code{NULL} +will determine this automatically as \code{forecast_date + ahead}.} + +\item{adjust_latency}{Character. One of the \code{method}s of +\code{\link[=step_adjust_latency]{step_adjust_latency()}}, or \code{"none"} (in which case there is no adjustment). +If the \code{forecast_date} is after the last day of data, this determines how +to shift the model to account for this difference. The options are: +\itemize{ +\item \code{"none"} the default, assumes the \code{forecast_date} is the last day of data +\item \code{"extend_ahead"}: increase the \code{ahead} by the latency so it's relative to +the last day of data. For example, if the last day of data was 3 days ago, +the ahead becomes \code{ahead+3}. +\item \code{"extend_lags"}: increase the lags so they're relative to the actual +forecast date. For example, if the lags are \code{c(0,7,14)} and the last day of +data was 3 days ago, the lags become \code{c(3,10,17)}. +}} + +\item{warn_latency}{by default, \code{step_adjust_latency} warns the user if the +latency is large. 
If this is \code{FALSE}, that warning is turned off.} \item{outcome_transform}{Scalar character. Whether the outcome should be created using growth rates (as the predictors are) or lagged @@ -76,16 +100,6 @@ log scale.} calculation. See \code{\link[epiprocess:growth_rate]{epiprocess::growth_rate()}} and the related Vignette for more details.} -\item{nafill_buffer}{At predict time, recent values of the training data -are used to create a forecast. However, these can be \code{NA} due to, e.g., -data latency issues. By default, any missing values will get filled with -less recent data. Setting this value to \code{NULL} will result in 1 extra -recent row (beyond those required for lag creation) to be used. Note that -we require at least \code{min(lags)} rows of recent data per \code{geo_value} to -create a prediction. For this reason, setting \code{nafill_buffer < min(lags)} -will be treated as \emph{additional} allowed recent data rather than the -total amount of recent data to examine.} - \item{check_enough_data_n}{Integer. A lower limit for the number of rows per epi_key that are required for training. If \code{NULL}, this check is ignored.} diff --git a/man/cdc_baseline_args_list.Rd b/man/cdc_baseline_args_list.Rd index 2f9300572..4a8c13113 100644 --- a/man/cdc_baseline_args_list.Rd +++ b/man/cdc_baseline_args_list.Rd @@ -14,7 +14,6 @@ cdc_baseline_args_list( symmetrize = TRUE, nonneg = TRUE, quantile_by_key = "geo_value", - nafill_buffer = Inf, ... ) } @@ -34,8 +33,14 @@ set of prediction horizons for \code{\link[=layer_cdc_flatline_quantiles]{layer_ key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. -The default \code{NULL} will attempt to determine this automatically.} +\item{forecast_date}{Date. The date from which the forecast is occurring. +The default \code{NULL} will determine this automatically from either +\enumerate{ +\item the maximum time value for which there's data if there is no latency +adjustment (the default case), or +\item the \code{as_of} date of \code{epi_data} if \code{adjust_latency} is +non-\code{NULL}. +}} \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce prediction intervals. These are created by computing the quantiles of @@ -63,16 +68,6 @@ before calculating residual quantiles. See the \code{by_key} argument to residual quantiles are used. It is not applicable with \code{trainer = quantile_reg()}, for example.} -\item{nafill_buffer}{At predict time, recent values of the training data -are used to create a forecast. However, these can be \code{NA} due to, e.g., -data latency issues. By default, any missing values will get filled with -less recent data. Setting this value to \code{NULL} will result in 1 extra -recent row (beyond those required for lag creation) to be used. Note that -we require at least \code{min(lags)} rows of recent data per \code{geo_value} to -create a prediction. 
For this reason, setting \code{nafill_buffer < min(lags)}
-will be treated as \emph{additional} allowed recent data rather than the
-total amount of recent data to examine.}
-
\item{...}{Space to handle future expansions (unused).}
}
\value{
diff --git a/man/check_interminable_latency.Rd b/man/check_interminable_latency.Rd
new file mode 100644
index 000000000..8ae9ec6df
--- /dev/null
+++ b/man/check_interminable_latency.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-latency.R
+\name{check_interminable_latency}
+\alias{check_interminable_latency}
+\title{warn when the latency is larger than would be reasonable}
+\usage{
+check_interminable_latency(
+  dataset,
+  latency_table,
+  target_columns,
+  forecast_date,
+  call = caller_env()
+)
+}
+\arguments{
+\item{dataset}{the epi_df}
+
+\item{latency_table}{the whole collection of latencies}
+
+\item{target_columns}{the names of the columns that we're adjusting and checking for unreasonably high latency}
+}
+\description{
+warn when the latency is larger than would be reasonable
+}
+\keyword{internal}
diff --git a/man/construct_shift_tibble.Rd b/man/construct_shift_tibble.Rd
new file mode 100644
index 000000000..619583f1d
--- /dev/null
+++ b/man/construct_shift_tibble.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-latency.R
+\name{construct_shift_tibble}
+\alias{construct_shift_tibble}
+\title{create a table of the columns to modify, their shifts, and their prefixes}
+\usage{
+construct_shift_tibble(terms_used, recipe, rel_step_type, shift_name)
+}
+\description{
+create a table of the columns to modify, their shifts, and their prefixes
+}
+\keyword{internal}
diff --git a/man/count_single_column.Rd b/man/count_single_column.Rd
new file mode 100644
index 000000000..7922511e7
--- /dev/null
+++ b/man/count_single_column.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-latency.R
+\name{count_single_column}
+\alias{count_single_column}
+\title{get the location of the last real value}
+\usage{
+count_single_column(col)
+}
+\arguments{
+\item{col}{the relevant column}
+}
+\description{
+get the location of the last real value
+}
+\keyword{internal}
diff --git a/man/epi_shift.Rd b/man/epi_shift.Rd
deleted file mode 100644
index 14316a8db..000000000
--- a/man/epi_shift.Rd
+++ /dev/null
@@ -1,28 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/epi_shift.R
-\name{epi_shift}
-\alias{epi_shift}
-\title{Shift predictors while maintaining grouping and time_value ordering}
-\usage{
-epi_shift(x, shifts, time_value, keys = NULL, out_name = "x")
-}
-\arguments{
-\item{x}{Data frame. Variables to shift}
-
-\item{shifts}{List. Each list element is a vector of shifts.
-Negative values produce leads. The list should have the same
-length as the number of columns in \code{x}.}
-
-\item{time_value}{Vector. Same length as \code{x} giving time stamps.}
-
-\item{keys}{Data frame, vector, or \code{NULL}. Additional grouping vars.}
-
-\item{out_name}{Chr. The output list will use this as a prefix.}
-}
-\value{
-a list of tibbles
-}
-\description{
-This is a lower-level function. As such it performs no error checking.
-} -\keyword{internal} diff --git a/man/epi_shift_single.Rd b/man/epi_shift_single.Rd new file mode 100644 index 000000000..871879004 --- /dev/null +++ b/man/epi_shift_single.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_shift.R +\name{epi_shift_single} +\alias{epi_shift_single} +\title{Shift predictors while maintaining grouping and time_value ordering} +\usage{ +epi_shift_single(x, col, shift_val, newname, key_cols) +} +\arguments{ +\item{x}{Data frame.} + +\item{shift_val}{a single integer. Negative values produce leads.} + +\item{newname}{the name for the newly shifted column} + +\item{key_cols}{vector, or \code{NULL}. Additional grouping vars.} +} +\value{ +a list of tibbles +} +\description{ +This is a lower-level function. As such it performs no error checking. +} +\keyword{internal} diff --git a/man/flatline_args_list.Rd b/man/flatline_args_list.Rd index 059dfa038..401850efe 100644 --- a/man/flatline_args_list.Rd +++ b/man/flatline_args_list.Rd @@ -13,7 +13,6 @@ flatline_args_list( symmetrize = TRUE, nonneg = TRUE, quantile_by_key = character(0L), - nafill_buffer = Inf, ... ) } @@ -29,11 +28,17 @@ So for example, \code{ahead = 7} will create residuals by comparing values key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. -The default \code{NULL} will attempt to determine this automatically.} +\item{forecast_date}{Date. The date from which the forecast is occurring. +The default \code{NULL} will determine this automatically from either +\enumerate{ +\item the maximum time value for which there's data if there is no latency +adjustment (the default case), or +\item the \code{as_of} date of \code{epi_data} if \code{adjust_latency} is +non-\code{NULL}. +}} -\item{target_date}{Date. The date for which the forecast is intended. -The default \code{NULL} will attempt to determine this automatically.} +\item{target_date}{Date. The date that is being forecast. The default \code{NULL} +will determine this automatically as \code{forecast_date + ahead}.} \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce prediction intervals. These are created by computing the quantiles of @@ -54,16 +59,6 @@ before calculating residual quantiles. See the \code{by_key} argument to residual quantiles are used. It is not applicable with \code{trainer = quantile_reg()}, for example.} -\item{nafill_buffer}{At predict time, recent values of the training data -are used to create a forecast. However, these can be \code{NA} due to, e.g., -data latency issues. By default, any missing values will get filled with -less recent data. Setting this value to \code{NULL} will result in 1 extra -recent row (beyond those required for lag creation) to be used. Note that -we require at least \code{min(lags)} rows of recent data per \code{geo_value} to -create a prediction. 
For this reason, setting \code{nafill_buffer < min(lags)}
-will be treated as \emph{additional} allowed recent data rather than the
-total amount of recent data to examine.}
-
 \item{...}{Space to handle future expansions (unused).}
}
\value{
diff --git a/man/forecast.epi_workflow.Rd b/man/forecast.epi_workflow.Rd index b9f6870b8..22f8cf4bb 100644 --- a/man/forecast.epi_workflow.Rd +++ b/man/forecast.epi_workflow.Rd @@ -4,15 +4,13 @@
\alias{forecast.epi_workflow}
\title{Produce a forecast from an epi workflow}
\usage{
-\method{forecast}{epi_workflow}(object, ..., fill_locf = FALSE, n_recent = NULL, forecast_date = NULL)
+\method{forecast}{epi_workflow}(object, ..., n_recent = NULL, forecast_date = NULL)
}
\arguments{
\item{object}{An epi workflow.}

\item{...}{Not used.}

-\item{fill_locf}{Logical. Should we use locf to fill in missing data?}
-
\item{n_recent}{Integer or NULL. If filling missing data with locf = TRUE,
how far back are we willing to tolerate missing data? Larger values allow
more filling. The default NULL will determine this from the recipe. For
diff --git a/man/get_forecast_date.Rd b/man/get_forecast_date.Rd new file mode 100644 index 000000000..6a35fff04 --- /dev/null +++ b/man/get_forecast_date.Rd @@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-latency.R
+\name{get_forecast_date}
+\alias{get_forecast_date}
+\title{Extract the as_of for the forecast date, and make sure there's nothing very off about it.}
+\usage{
+get_forecast_date(new_data, info, epi_keys_checked, latency, columns = NULL)
+}
+\description{
+Extract the as_of for the forecast date, and make sure there's nothing very off about it.
+}
+\keyword{internal} diff --git a/man/get_forecast_date_in_layer.Rd b/man/get_forecast_date_in_layer.Rd new file mode 100644 index 000000000..c866a88e7 --- /dev/null +++ b/man/get_forecast_date_in_layer.Rd @@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-latency.R
+\name{get_forecast_date_in_layer}
+\alias{get_forecast_date_in_layer}
+\title{get the target date while in a layer}
+\usage{
+get_forecast_date_in_layer(this_recipe, workflow_max_time_value, new_data)
+}
+\arguments{
+\item{this_recipe}{the recipe to check for \code{step_adjust_latency}}
+
+\item{workflow_max_time_value}{the \code{max_time} value coming out of the fit
+workflow (this will be the maximal time value in a potentially different
+dataset)}
+
+\item{new_data}{the data we're currently working with, from which we'll take
+a potentially different max_time_value}
+}
+\description{
+get the target date while in a layer
+}
+\keyword{internal} diff --git a/man/get_latency.Rd b/man/get_latency.Rd new file mode 100644 index 000000000..5d6d7190f --- /dev/null +++ b/man/get_latency.Rd @@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-latency.R
+\name{get_latency}
+\alias{get_latency}
+\title{the latency is also the amount the shift is off by}
+\usage{
+get_latency(new_data, forecast_date, column, sign_shift, epi_keys_checked)
+}
+\arguments{
+\item{sign_shift}{integer. 1 if lag and -1 if ahead.
These represent how you
+need to shift the data to bring the 3 day lagged value to today.}
+}
+\description{
+the latency is also the amount the shift is off by
+}
+\keyword{internal} diff --git a/man/get_latency_table.Rd b/man/get_latency_table.Rd new file mode 100644 index 000000000..ae309c944 --- /dev/null +++ b/man/get_latency_table.Rd @@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-latency.R
+\name{get_latency_table}
+\alias{get_latency_table}
+\title{create the latency table
+This is a table of column names and the latency adjustment necessary for that column. An example:}
+\usage{
+get_latency_table(
+  training,
+  columns,
+  forecast_date,
+  latency,
+  sign_shift,
+  epi_keys_checked,
+  info,
+  terms
+)
+}
+\description{
+col_name latency
+\if{html}{\out{<chr>}} \if{html}{\out{<dbl>}}
+1 case_rate 5
+2 death_rate 5
+}
+\keyword{internal} diff --git a/man/get_sign.Rd b/man/get_sign.Rd new file mode 100644 index 000000000..0be3e6306 --- /dev/null +++ b/man/get_sign.Rd @@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/epi_shift.R
+\name{get_sign}
+\alias{get_sign}
+\title{lags move columns forward to bring the past up to today, while aheads drag
+the future back to today}
+\usage{
+get_sign(object)
+}
+\description{
+lags move columns forward to bring the past up to today, while aheads drag
+the future back to today
+}
+\keyword{internal} diff --git a/man/get_test_data.Rd b/man/get_test_data.Rd index b18685d89..81649452a 100644 --- a/man/get_test_data.Rd +++ b/man/get_test_data.Rd @@ -4,32 +4,13 @@
\alias{get_test_data}
\title{Get test data for prediction based on longest lag period}
\usage{
-get_test_data(
-  recipe,
-  x,
-  fill_locf = FALSE,
-  n_recent = NULL,
-  forecast_date = max(x$time_value)
-)
+get_test_data(recipe, x)
}
\arguments{
\item{recipe}{A recipe object.}

\item{x}{An epi_df. The typical usage is to pass the same data as that used for fitting the recipe.}
-
-\item{fill_locf}{Logical. Should we use \code{locf} to fill in missing data?}
-
-\item{n_recent}{Integer or NULL. If filling missing data with \code{locf = TRUE},
-how far back are we willing to tolerate missing data? Larger values allow
-more filling. The default \code{NULL} will determine this from the
-the \code{recipe}. For example, suppose \code{n_recent = 3}, then if the
-3 most recent observations in any \code{geo_value} are all \code{NA}’s, we won’t be
-able to fill anything, and an error message will be thrown. (See details.)}
-
-\item{forecast_date}{By default, this is set to the maximum
-\code{time_value} in \code{x}. But if there is data latency such that recent \code{NA}'s
-should be filled, this may be \emph{after} the last available \code{time_value}.}
}
\value{
An object of the same type as \code{x} with columns \code{geo_value}, \code{time_value}, any additional
@@ -47,12 +28,6 @@
The minimum required (recent) data to produce a forecast is equal to
the maximum lag requested (on any predictor) plus the longest horizon
used if growth rate calculations are requested by the recipe. This is
calculated internally.
-
-It also optionally fills missing values
-using the last-observation-carried-forward (LOCF) method. If this
-is not possible (say because there would be only \code{NA}'s in some location),
-it will produce an error suggesting alternative options to handle missing
-values with more advanced techniques.
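Editor's note: the slimmed-down `get_test_data()` above now only returns the minimal recent window described in its details. As a rough sketch of that windowing rule on toy data (illustrative only, not the package's internal implementation):

```r
suppressPackageStartupMessages(library(dplyr))

x <- tibble(
  geo_value = rep("ma", 30),
  time_value = as.Date("2021-01-01") + 0:29,
  case_rate = rnorm(30)
)

max_lag <- 14L   # largest lag requested on any predictor
gr_horizon <- 7L # longest growth-rate horizon; 0 if the recipe has no growth rates

# keep just enough trailing rows per geo_value so that every lag (and growth
# rate) can still be computed at the most recent time_value
test_window <- x %>%
  group_by(geo_value) %>%
  filter(time_value >= max(time_value) - (max_lag + gr_horizon)) %>%
  ungroup()
```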
}
\examples{
# create recipe
diff --git a/man/layer_add_forecast_date.Rd b/man/layer_add_forecast_date.Rd index e27f2bacd..aa224013f 100644 --- a/man/layer_add_forecast_date.Rd +++ b/man/layer_add_forecast_date.Rd @@ -14,10 +14,12 @@ layer_add_forecast_date(
\item{frosting}{a \code{frosting} postprocessor}

\item{forecast_date}{The forecast date to add as a column to the \code{epi_df}.
-For most cases, this should be specified in the form "yyyy-mm-dd". Note that
-when the forecast date is left unspecified, it is set to the maximum time
-value from the data used in pre-processing, fitting the model, and
-postprocessing.}
+For most cases, this should be specified in the form "yyyy-mm-dd". Note
+that when the forecast date is left unspecified, it is set to one of two
+values. If there is a \code{step_adjust_latency} step present, it uses the
+\code{forecast_date} as set in that function. Otherwise, it uses the maximum
+\code{time_value} across the data used for pre-processing, fitting the model,
+and postprocessing.}

\item{id}{a random id string}
}
diff --git a/man/layer_add_target_date.Rd b/man/layer_add_target_date.Rd index dc0d2f190..e522cd6da 100644 --- a/man/layer_add_target_date.Rd +++ b/man/layer_add_target_date.Rd @@ -13,14 +13,14 @@ layer_add_target_date(
\arguments{
\item{frosting}{a \code{frosting} postprocessor}

-\item{target_date}{The target date to add as a column to the
-\code{epi_df}. If there's a forecast date specified in a layer, then
-it is the forecast date plus \code{ahead} (from \code{step_epi_ahead} in
-the \code{epi_recipe}). Otherwise, it is the maximum \code{time_value}
-(from the data used in pre-processing, fitting the model, and
-postprocessing) plus \code{ahead}, where \code{ahead} has been specified in
-preprocessing. The user may override these by specifying a
-target date of their own (of the form "yyyy-mm-dd").}
+\item{target_date}{The target date to add as a column to the \code{epi_df}. If
+there's a forecast date specified upstream (either in a
+\code{step_adjust_latency} or in a \code{layer_add_forecast_date}), then it is the
+forecast date plus \code{ahead} (from \code{step_epi_ahead} in the \code{epi_recipe}).
+Otherwise, it is the maximum \code{time_value} (from the data used in
+pre-processing, fitting the model, and postprocessing) plus \code{ahead}, where
+\code{ahead} has been specified in preprocessing. The user may override these by
+specifying a target date of their own (of the form "yyyy-mm-dd").}

\item{id}{a random id string}
}
@@ -33,8 +33,14 @@ Postprocessing step to add the target date
\details{
By default, this function assumes that a value for \code{ahead}
has been specified in a preprocessing step (most likely in
-\code{step_epi_ahead}). Then, \code{ahead} is added to the maximum \code{time_value}
-in the test data to get the target date.
+\code{step_epi_ahead}). Then, \code{ahead} is added to the \code{forecast_date}
+in the test data to get the target date. \code{forecast_date} can be set in 3 ways:
+\enumerate{
+\item \code{step_adjust_latency}, which typically uses the training \code{epi_df}'s \code{as_of}
+\item \code{layer_add_forecast_date}, which inherits from 1 if not manually specified
+\item if none of those are the case, it is simply the maximum \code{time_value} over
+every dataset used (prep, training, and prediction).
+}
}
\examples{
library(dplyr)
@@ -58,8 +64,14 @@ wf1 <- wf \%>\% add_frosting(f)
p <- forecast(wf1)
p
-# Use ahead + max time value from pre, fit, post
-# which is the same if include `layer_add_forecast_date()`
+# Use ahead + forecast_date from adjust_latency
+# setting the `as_of` to something realistic
+attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3
+r <- epi_recipe(jhu) \%>\%
+  step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\%
+  step_epi_ahead(death_rate, ahead = 7) \%>\%
+  step_adjust_latency(method = "extend_ahead") \%>\%
+  step_epi_naomit()
 f2 <- frosting() \%>\%
   layer_predict() \%>\%
   layer_add_target_date() \%>\%
@@ -69,13 +81,24 @@ wf2 <- wf \%>\% add_frosting(f2)
p2 <- forecast(wf2)
p2
-# Specify own target date
+# Use ahead + max time value from pre, fit, post
+# which is the same if we include `layer_add_forecast_date()`
 f3 <- frosting() \%>\%
   layer_predict() \%>\%
-  layer_add_target_date(target_date = "2022-01-08") \%>\%
+  layer_add_target_date() \%>\%
   layer_naomit(.pred)
 wf3 <- wf \%>\% add_frosting(f3)
-p3 <- forecast(wf3)
-p3
+p3 <- forecast(wf3)
+p3
+
+# Specify own target date
+f4 <- frosting() \%>\%
+  layer_predict() \%>\%
+  layer_add_target_date(target_date = "2022-01-08") \%>\%
+  layer_naomit(.pred)
+wf4 <- wf \%>\% add_frosting(f4)
+
+p4 <- forecast(wf4)
+p4
}
diff --git a/man/pad_to_end.Rd b/man/pad_to_end.Rd new file mode 100644 index 000000000..b9cde372e --- /dev/null +++ b/man/pad_to_end.Rd @@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-latency.R
+\name{pad_to_end}
+\alias{pad_to_end}
+\title{pad every group at the right interval}
+\usage{
+pad_to_end(x, groups, end_date, columns_to_complete = NULL)
+}
+\arguments{
+\item{x}{an epi_df to be filled forward.}
+
+\item{groups}{the grouping by which to fill forward}
+
+\item{columns_to_complete}{which columns to apply completion to. By default every non-key column of an epi_df}
+}
+\description{
+Perform last observation carried forward on a group by group basis. It uses
+\code{guess_period} to find the appropriate interval to fill-forward by. It
+maintains the grouping structure it receives. It does \emph{not} fill any
+"interior" \code{NA} values occurring in the data beforehand.
+}
+\keyword{internal} diff --git a/man/seq_forward.Rd b/man/seq_forward.Rd new file mode 100644 index 000000000..9b3da6e55 --- /dev/null +++ b/man/seq_forward.Rd @@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-latency.R
+\name{seq_forward}
+\alias{seq_forward}
+\title{seq, but returns null if from is larger}
+\usage{
+seq_forward(from, to, by)
+}
+\description{
+seq, but returns null if from is larger
+}
+\keyword{internal} diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd new file mode 100644 index 000000000..f0ee41390 --- /dev/null +++ b/man/step_adjust_latency.Rd @@ -0,0 +1,297 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/step_adjust_latency.R
+\name{step_adjust_latency}
+\alias{step_adjust_latency}
+\title{Adapt the model to latent data}
+\usage{
+step_adjust_latency(
+  recipe,
+  ...,
+  method = c("extend_ahead", "locf", "extend_lags"),
+  epi_keys_checked = NULL,
+  fixed_latency = NULL,
+  fixed_forecast_date = NULL,
+  check_latency_length = TRUE,
+  id = rand_id("adjust_latency")
+)
+}
+\arguments{
+\item{recipe}{A recipe object.
The step will be added to the
+sequence of operations for this recipe.}
+
+\item{...}{One or more selector functions to choose variables
+for this step. See \code{\link[recipes:selections]{selections()}} for more details.}
+
+\item{method}{a character. Determines the method by which the
+forecast handles latency. The options are:
+\itemize{
+\item \code{"extend_ahead"}: Lengthen the ahead so that forecasting from the last
+observation results in a forecast \code{ahead} after the \code{forecast_date}.
+E.g. if there are 3 days of latency between the last observation and the
+\code{forecast_date} for a 4 day ahead forecast, the ahead used in practice
+is actually 7.
+\item \code{"locf"}: carries forward the last observed value(s) up to the forecast
+date.
+\item \code{"extend_lags"}: per \code{epi_key} and \code{predictor}, adjusts the lag so that
+the shortest lag at predict time is at the last observation. E.g. if the
+lags are \code{c(0,7,14)} for data that is 3 days latent, the actual lags used
+become \code{c(3,10,17)}.
+}}
+
+\item{epi_keys_checked}{a character vector. A list of keys to group by before
+finding the \code{max_time_value} (the last day of data), defaulting to
+\code{geo_value}. Different locations may have different latencies; to produce a
+forecast at every location, we need to guarantee data at every location by
+using the largest latency across every location; this means taking
+\code{max_time_value} to be the minimum of the \code{max_time_value}s for each set of
+key values (so the earliest date). If \code{NULL} or an empty character vector,
+it will take the maximum across all values, irrespective of any keys.
+
+Note that this is a separate concern from different latencies across
+different \emph{data columns}, which is only handled by the choice of \code{method}.}
+
+\item{fixed_latency}{either a positive integer, or a labeled positive integer
+vector. Cannot be set at the same time as \code{fixed_forecast_date}. If
+non-\code{NULL}, the amount to offset the ahead or lag by. If a single integer,
+this is used for all columns; if a labeled vector, the labels must
+correspond to the base column names (before lags/aheads). If \code{NULL}, the
+latency is the distance between the \code{epi_df}'s \code{max_time_value} and the \code{forecast_date}.}
+
+\item{fixed_forecast_date}{either a date of the same kind used in the
+\code{epi_df}, or \code{NULL}. Exclusive with \code{fixed_latency}. If a date, it gives
+the date from which the forecast is actually occurring. If \code{NULL}, the
+\code{forecast_date} is determined either via the \code{fixed_latency}, or is set to
+the \code{epi_df}'s \code{as_of} value if \code{fixed_latency} is also \code{NULL}.}
+
+\item{check_latency_length}{bool, determines whether to warn if the latency
+is unusually high. Turn off if you know your forecast is going to be far
+into the future.}
+
+\item{id}{A character string that is unique to this step to identify it.}
+}
+\value{
+An updated version of \code{recipe} with the new step added to the
+sequence of any existing operations.
+}
+\description{
+In the standard case, the arx models assume that the last observation is also
+the day from which the forecast is being made. But if the data has latency,
+then you may wish to adjust the predictors (lags) and/or the outcome (ahead)
+to compensate.
+This is most useful in real-time and
+pseudo-prospective forecasting for data where there is some delay between the
+event occurring and the event being reported.
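Editor's note: to make the three `method` options above concrete before the worked examples that follow, here is the latency arithmetic in plain R (an illustration of the rules as stated, not the step's actual implementation):

```r
latency <- 3L          # days between the last observation and the forecast_date
ahead <- 4L
lags <- c(0L, 7L, 14L)

# "extend_ahead": lengthen the ahead so that forecasting from the last
# observation still lands `ahead` days past the forecast_date
adjusted_ahead <- ahead + latency # 7

# "extend_lags": shift every lag so the shortest one points at observed data
adjusted_lags <- lags + latency # c(3, 10, 17)

# "locf" changes neither; it instead carries the last observed value(s)
# forward up to the forecast_date
```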
+} +\details{ +This step allows the user to create models on the most recent +data, automatically accounting for latency patterns. Instead of using the last observation +date, \code{step_adjust_latency} uses the \code{as_of} date of the \code{epi_df} as the +\code{forecast_date}, and adjusts the model so that there is data available. To +demonstrate some of the subtleties, let's consider a toy dataset: + +\if{html}{\out{
}}\preformatted{toy_df <- tribble( + ~geo_value, ~time_value, ~a, ~b, + "ma", as.Date("2015-01-11"), 20, 6, + "ma", as.Date("2015-01-12"), 23, NA, + "ma", as.Date("2015-01-13"), 25, NA, + "ca", as.Date("2015-01-11"), 100, 5, + "ca", as.Date("2015-01-12"), 103, 10, +) \%>\% + as_epi_df(as_of = as.Date("2015-01-14")) +}\if{html}{\out{
}}
+
+If we're looking to predict the value on the 15th, forecasting from the 14th (the \code{as_of} date above),
+there are two issues we will need to address:
+\enumerate{
+\item \code{"ca"} is latent by 2 days, whereas \code{"ma"} is latent by 1
+\item if we want to use \code{b} as an exogenous variable, for \code{"ma"} it is latent by 3 days instead of just 1.
+}
+
+Regardless of \code{method}, \code{epi_keys_checked="geo_value"} guarantees that the
+difference between \code{"ma"} and \code{"ca"} is accounted for by making the
+latency adjustment at least 2. For comparison, here's what the various
+methods will do:
+\subsection{\code{locf}}{
+
+Short for "last observation carried forward", \code{locf} assumes that every day
+between the last observation and the forecast day is exactly the same.
+This is a very straightforward assumption, but wrecks any features that
+depend on changes in value over time, such as the growth rate, or even
+adjacent lags. A more robust version of this falls under the heading of
+nowcasting, an eventual aim for this package. On the toy dataset, it
+doesn't matter which day we're trying to predict, since it just fills
+forward to the \code{forecast_date}:
+
+\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\%
+  step_adjust_latency(method="locf")
+
+toy_recipe \%>\%
+  prep(toy_df) \%>\%
+  bake(toy_df) \%>\%
+  arrange(geo_value, time_value)
+#> An `epi_df` object, 8 x 4 with metadata:
+#> * geo_type  = state
+#> * time_type = day
+#> * as_of     = 2015-01-14
+#>
+#> # A tibble: 8 x 4
+#>   geo_value time_value     a     b
+#> * <chr>     <date>     <dbl> <dbl>
+#> 1 ca        2015-01-11   100     5
+#> 2 ca        2015-01-12   103    10
+#> 3 ca        2015-01-13   103    10
+#> 4 ca        2015-01-14   103    10
+#> 5 ma        2015-01-11    20     6
+#> 6 ma        2015-01-12    23     6
+#> 7 ma        2015-01-13    25     6
+#> 8 ma        2015-01-14    25     6
+}\if{html}{\out{
}} +} + +\subsection{\code{extend_lags}}{ + +\code{extend_lags} increases the lags so that they are guaranteed to have +data. This has the advantage of being applicable on +a per-column basis; if cases and deaths are reported at different +latencies, the lags for each are adjusted separately. In the toy example: + +\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\%
+  step_adjust_latency(method="extend_lags") \%>\%
+  step_epi_lag(a,lag=1) \%>\%
+  step_epi_lag(b,lag=1) \%>\%
+  step_epi_ahead(a, ahead=1)
+
+toy_recipe \%>\%
+  prep(toy_df) \%>\%
+  bake(toy_df) \%>\%
+  arrange(geo_value, time_value)
+#> An `epi_df` object, 21 x 7 with metadata:
+#> * geo_type  = state
+#> * time_type = day
+#> * as_of     = 2015-01-14
+#>
+#> # A tibble: 21 x 7
+#>   geo_value time_value     a     b lag_3_a lag_4_b ahead_1_a
+#> * <chr>     <date>     <dbl> <dbl>   <dbl>   <dbl>     <dbl>
+#> 1 ca        2015-01-10    NA    NA      NA      NA       100
+#> 2 ca        2015-01-11   100     5      NA      NA       103
+#> 3 ca        2015-01-12   103    10      NA      NA        NA
+#> 4 ca        2015-01-13    NA    NA      NA      NA        NA
+#> 5 ca        2015-01-14    NA    NA     100      NA        NA
+#> 6 ca        2015-01-15    NA    NA     103       5        NA
+#> 7 ca        2015-01-16    NA    NA      NA      10        NA
+#> 8 ca        2015-01-17    NA    NA      NA      NA        NA
+#> 9 ca        2015-01-18    NA    NA      NA      NA        NA
+#> 10 ca       2015-01-19    NA    NA      NA      NA        NA
+#> # i 11 more rows
+}\if{html}{\out{
}}
+
+The maximum latency in column \code{a} is 2 days, so the lag is increased to 3,
+while the max latency in column \code{b} is 3, so the same lag is increased to
+4; both of these changes are reflected in the column names. Meanwhile the
+ahead is unaffected.
+
+As a side note, lag/ahead can be somewhat ambiguous about direction. Here,
+the values are brought forward in time, so that for a given row, column
+\code{lag_3_a} represents the value 3 days before.
+}
+
+\subsection{\code{extend_ahead}}{
+
+\code{extend_ahead} increases the ahead, turning a 3 day ahead forecast
+into a 7 day one; this has the advantage of simplicity and is reflective of
+the actual modelling task, but potentially leaves information unused if
+different data sources have different latencies; it must use the latency of
+the most latent data to ensure there is data available. In the toy example:
+
+\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\%
+  step_adjust_latency(method="extend_ahead") \%>\%
+  step_epi_lag(a,lag=0) \%>\%
+  step_epi_ahead(a, ahead=1)
+
+toy_recipe \%>\%
+  prep(toy_df) \%>\%
+  bake(toy_df) \%>\%
+  arrange(geo_value, time_value)
+#> An `epi_df` object, 10 x 6 with metadata:
+#> * geo_type  = state
+#> * time_type = day
+#> * as_of     = 2015-01-14
+#>
+#> # A tibble: 10 x 6
+#>   geo_value time_value     a     b lag_0_a ahead_3_a
+#> * <chr>     <date>     <dbl> <dbl>   <dbl>     <dbl>
+#> 1 ca        2015-01-08    NA    NA      NA       100
+#> 2 ca        2015-01-09    NA    NA      NA       103
+#> 3 ca        2015-01-11   100     5     100        NA
+#> 4 ca        2015-01-12   103    10     103        NA
+#> 5 ma        2015-01-08    NA    NA      NA        20
+#> 6 ma        2015-01-09    NA    NA      NA        23
+#> 7 ma        2015-01-10    NA    NA      NA        25
+#> 8 ma        2015-01-11    20     6      20        NA
+#> 9 ma        2015-01-12    23    NA      23        NA
+#> 10 ma       2015-01-13    25    NA      25        NA
}\if{html}{\out{
}}
+
+Even though we're doing a 1 day ahead forecast, the worst latency among the
+adjusted columns is 2 days (column \code{a}'s \code{"ca"} values stop on the 12th), so
+the outcome column becomes \code{ahead_3_a} (so 3 days ahead), as shown in the
+output above. To control which columns are considered when computing the
+latency adjustment, set them explicitly, like this:
+\code{step_adjust_latency(a, method="extend_ahead")}.
+}
+}
+\section{Programmatic details}{
+\code{step_adjust_latency} uses the metadata, such as \code{time_type} and \code{as_of}, of
+the \code{epi_df} used in the initial prep step, rather than the metadata of the
+data used for baking or prediction. This means reusing the same fitted
+forecaster on new data is not advised (though that is rarely advisable in
+general).
+
+The latency adjustment only applies to columns created after this step, so
+this step should go before both \code{step_epi_ahead} and \code{step_epi_lag}. This will work:
+
+\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% + # non-lag steps + step_adjust_latency(a, method = "extend_lags") \%>\% + step_epi_lag(a, lag=0) # other steps +}\if{html}{\out{
}} + +while this will not: + +\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% + step_epi_lag(a, lag=0) \%>\% + step_adjust_latency(a, method = "extend_lags") +#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with +#> modified data. +}\if{html}{\out{
}}
+
+If you create columns that you then apply lags to (such as
+\code{step_growth_rate()}), these should be created before
+\code{step_adjust_latency}, so any subsequent latency can be addressed.
+}
+
+\examples{
+jhu <- case_death_rate_subset \%>\%
+  dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny"))
+# setting the `as_of` to something realistic
+attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3
+
+r <- epi_recipe(case_death_rate_subset) \%>\%
+  step_adjust_latency(method = "extend_ahead") \%>\%
+  step_epi_ahead(death_rate, ahead = 7) \%>\%
+  step_epi_lag(death_rate, lag = c(0, 7, 14))
+r
+
+jhu_fit <- epi_workflow() \%>\%
+  add_epi_recipe(r) \%>\%
+  add_model(linear_reg()) \%>\%
+  fit(data = jhu)
+jhu_fit
+
+}
+\seealso{
+Other row operation steps:
+\code{\link{step_epi_lag}()},
+\code{\link{step_growth_rate}()},
+\code{\link{step_lag_difference}()}
+}
+\concept{row operation steps} diff --git a/man/step_adjust_latency_checks.Rd b/man/step_adjust_latency_checks.Rd new file mode 100644 index 000000000..baed1fb9b --- /dev/null +++ b/man/step_adjust_latency_checks.Rd @@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-latency.R
+\name{step_adjust_latency_checks}
+\alias{step_adjust_latency_checks}
+\title{checks: the recipe type, whether a previous step is the relevant epi_shift,
+that either \code{fixed_latency} or \code{fixed_forecast_date} is non-null, and that
+\code{fixed_latency} only references columns that exist at the time of the step
+inclusion}
+\usage{
+step_adjust_latency_checks(
+  id,
+  method,
+  recipe,
+  fixed_latency,
+  fixed_forecast_date,
+  call = caller_env()
+)
+}
+\description{
+checks: the recipe type, whether a previous step is the relevant epi_shift,
+that either \code{fixed_latency} or \code{fixed_forecast_date} is non-null, and that
+\code{fixed_latency} only references columns that exist at the time of the step
+inclusion
+}
+\keyword{internal} diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index 2bf22c15d..30ac05d16 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -85,10 +85,12 @@ r
}
\seealso{
Other row operation steps:
+\code{\link{step_adjust_latency}()},
\code{\link{step_growth_rate}()},
\code{\link{step_lag_difference}()}

Other row operation steps:
+\code{\link{step_adjust_latency}()},
\code{\link{step_growth_rate}()},
\code{\link{step_lag_difference}()}
}
diff --git a/man/step_growth_rate.Rd b/man/step_growth_rate.Rd index bc6da0bef..752b38dbe 100644 --- a/man/step_growth_rate.Rd +++ b/man/step_growth_rate.Rd @@ -83,6 +83,7 @@ r \%>\%
}
\seealso{
Other row operation steps:
+\code{\link{step_adjust_latency}()},
\code{\link{step_epi_lag}()},
\code{\link{step_lag_difference}()}
}
diff --git a/man/step_lag_difference.Rd b/man/step_lag_difference.Rd index 7969ea3a7..e8ec2101a 100644 --- a/man/step_lag_difference.Rd +++ b/man/step_lag_difference.Rd @@ -58,6 +58,7 @@ r \%>\%
}
\seealso{
Other row operation steps:
+\code{\link{step_adjust_latency}()},
\code{\link{step_epi_lag}()},
\code{\link{step_growth_rate}()}
}
diff --git a/tests/testthat.R b/tests/testthat.R index 296d916e8..27254bec7 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,8 @@
library(testthat)
library(epipredict)
+library(parsnip)
+library(workflows)
+library(dplyr)
+
test_check("epipredict")
diff --git a/tests/testthat/_snaps/enframer.md b/tests/testthat/_snaps/enframer.md deleted file mode 100644 index 4b05dbff3..000000000 --- a/tests/testthat/_snaps/enframer.md +++
/dev/null @@ -1,32 +0,0 @@ -# enframer errors/works as needed - - Code - enframer(1:5, letters[1]) - Condition - Error in `enframer()`: - ! is.data.frame(df) is not TRUE - ---- - - Code - enframer(data.frame(a = 1:5), 1:3) - Condition - Error in `enframer()`: - ! `x` must be of type . - ---- - - Code - enframer(data.frame(a = 1:5), letters[1:3]) - Condition - Error in `enframer()`: - ! In enframer: some new cols match existing column names - ---- - - Code - enframer(data.frame(aa = 1:5), letters[1:2], fill = 1:4) - Condition - Error in `enframer()`: - ! length(fill) == 1 || length(fill) == nrow(df) is not TRUE - diff --git a/tests/testthat/_snaps/pivot_quantiles.md b/tests/testthat/_snaps/pivot_quantiles.md index 184eb62a6..ca775a18f 100644 --- a/tests/testthat/_snaps/pivot_quantiles.md +++ b/tests/testthat/_snaps/pivot_quantiles.md @@ -45,7 +45,5 @@ pivot_quantiles_longer(tib, d1, d3) Condition Error in `pivot_quantiles_longer()`: - ! Some selected columns contain different numbers of quantiles. - The result would be a very long . - To do this anyway, rerun with `.ignore_length_check = TRUE`. + ! Some selected columns contain different numbers of quantiles. The result would be a very long . To do this anyway, rerun with `.ignore_length_check = TRUE`. diff --git a/tests/testthat/_snaps/population_scaling.md b/tests/testthat/_snaps/population_scaling.md deleted file mode 100644 index 9263e8e1e..000000000 --- a/tests/testthat/_snaps/population_scaling.md +++ /dev/null @@ -1,16 +0,0 @@ -# expect error if `by` selector does not match - - Code - wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) %>% add_frosting(f) - Condition - Error in `hardhat::validate_column_names()`: - ! The following required columns are missing: 'a'. - ---- - - Code - forecast(wf) - Condition - Error in `hardhat::validate_column_names()`: - ! The following required columns are missing: 'nothere'. 
- diff --git a/tests/testthat/_snaps/snapshots.md b/tests/testthat/_snaps/snapshots.md index 84abf57d2..9ce6502d1 100644 --- a/tests/testthat/_snaps/snapshots.md +++ b/tests/testthat/_snaps/snapshots.md @@ -1031,6 +1031,129 @@ 18993, 18993, 18993, 18993, 18993), class = "Date")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) +--- + + structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx" + ), .pred = c(0.303244704017743, 0.531332853311082, 0.588827944685979, + 0.988690249216229, 0.794801997001639, 0.306895457225321), .pred_distn = structure(list( + structure(list(values = c(0.136509784083987, 0.469979623951498 + ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c(0.364597933377326, 0.698067773244837), quantile_levels = c(0.05, + 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", + "vctrs_vctr")), structure(list(values = c(0.422093024752224, + 0.755562864619735), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c(0.821955329282474, 1.15542516914998), quantile_levels = c(0.05, + 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", + "vctrs_vctr")), structure(list(values = c(0.628067077067883, + 0.961536916935394), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c(0.140160537291566, 0.473630377159077), quantile_levels = c(0.05, + 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", + "vctrs_vctr"))), class = c("distribution", "vctrs_vctr", + "list")), forecast_date = structure(c(18997, 18997, 18997, 18997, + 18997, 18997), class = "Date"), target_date = structure(c(18998, + 18998, 18998, 18998, 18998, 18998), class = "Date")), row.names = c(NA, + -6L), class = c("tbl_df", "tbl", "data.frame")) + +--- + + structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx" + ), .pred = c(0.303244704017743, 0.531332853311082, 0.588827944685979, + 0.988690249216229, 0.794801997001639, 0.306895457225321), .pred_distn = structure(list( + structure(list(values = c(0.136509784083987, 0.469979623951498 + ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c(0.364597933377326, 0.698067773244837), quantile_levels = c(0.05, + 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", + "vctrs_vctr")), structure(list(values = c(0.422093024752224, + 0.755562864619735), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c(0.821955329282474, 1.15542516914998), quantile_levels = c(0.05, + 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", + "vctrs_vctr")), structure(list(values = c(0.628067077067883, + 0.961536916935394), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c(0.140160537291566, 0.473630377159077), quantile_levels = c(0.05, + 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", + "vctrs_vctr"))), class = c("distribution", "vctrs_vctr", + "list")), forecast_date = structure(c(18997, 18997, 18997, 18997, + 18997, 18997), class = "Date"), target_date = structure(c(18998, + 18998, 18998, 18998, 18998, 18998), class = "Date")), row.names = c(NA, + -6L), class = c("tbl_df", "tbl", 
"data.frame")) + +# arx_forecaster output format snapshots + + Code + out1 + Message + == A basic forecaster of type ARX Forecaster =================================== + + This forecaster was fit on 999-01-01. + + Training data was an with: + * Geography: state, + * Time type: day, + * Using data up-to-date as of: 2022-05-31. + * With the last data available on 2021-12-31 + + -- Predictions ----------------------------------------------------------------- + + A total of 56 predictions are available for + * 56 unique geographic regions, + * At forecast date: 2021-12-31, + * For target date: 2022-01-07, + + +--- + + Code + out2 + Message + == A basic forecaster of type ARX Forecaster =================================== + + This forecaster was fit on 999-01-01. + + Training data was an with: + * Geography: state, + * Time type: day, + * Using data up-to-date as of: 2022-05-31. + * With the last data available on 2021-12-31 + + -- Predictions ----------------------------------------------------------------- + + A total of 56 predictions are available for + * 56 unique geographic regions, + * At forecast date: 2022-01-03, + * For target date: 2022-01-10, + * Lags adjusted per column: case_rate=3, death_rate=3 + + +--- + + Code + out3 + Message + == A basic forecaster of type ARX Forecaster =================================== + + This forecaster was fit on 999-01-01. + + Training data was an with: + * Geography: state, + * Time type: day, + * Using data up-to-date as of: 2022-05-31. + * With the last data available on 2021-12-31 + + -- Predictions ----------------------------------------------------------------- + + A total of 56 predictions are available for + * 56 unique geographic regions, + * At forecast date: 2022-01-03, + * For target date: 2022-01-10, + * Aheads adjusted for death_rate=3 + + # arx_classifier snapshots structure(list(geo_value = c("ak", "al", "ar", "az", "ca", "co", @@ -1058,3 +1181,30 @@ 18999, 18999, 18999, 18999, 18999, 18999, 18999), class = "Date")), row.names = c(NA, -53L), class = c("tbl_df", "tbl", "data.frame")) +--- + + structure(list(geo_value = c("ak", "al", "ar", "az", "ca", "co", + "ct", "dc", "de", "fl", "ga", "gu", "hi", "ia", "id", "il", "in", + "ks", "ky", "la", "ma", "me", "mi", "mn", "mo", "mp", "ms", "mt", + "nc", "nd", "ne", "nh", "nj", "nm", "nv", "ny", "oh", "ok", "or", + "pa", "pr", "ri", "sc", "sd", "tn", "tx", "ut", "va", "vt", "wa", + "wi", "wv", "wy"), .pred_class = structure(c(1L, 1L, 1L, 1L, + 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, + 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, + 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, + 1L), levels = c("(-Inf,0.25]", "(0.25, Inf]"), class = "factor"), + forecast_date = structure(c(18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994), class = "Date"), target_date = structure(c(19001, + 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, + 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, + 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, + 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, + 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, + 19001, 19001, 
19001, 19001, 19001, 19001, 19001), class = "Date")), row.names = c(NA, + -53L), class = c("tbl_df", "tbl", "data.frame")) + diff --git a/tests/testthat/_snaps/step_adjust_latency.md b/tests/testthat/_snaps/step_adjust_latency.md new file mode 100644 index 000000000..e37ae07ea --- /dev/null +++ b/tests/testthat/_snaps/step_adjust_latency.md @@ -0,0 +1,84 @@ +# printing step_adjust_latency results in expected output + + Code + r5 + Message + + -- Epi Recipe ------------------------------------------------------------------ + + -- Inputs + Number of variables by role + raw: 2 + geo_value: 1 + time_value: 1 + + -- Operations + 1. extend_lags: case_rate with latency set at train time + 2. Lagging: death_rate by 0, 6, 11 + 3. Lagging: case_rate by 1, 5 + 4. Leading: death_rate by 7 + +--- + + Code + prep(r5, real_x) + Message + + -- Epi Recipe ------------------------------------------------------------------ + + -- Inputs + Number of variables by role + raw: 2 + geo_value: 1 + time_value: 1 + + -- Training information + Training data contained 200 data points and no incomplete rows. + + -- Operations + 1. extend_lags: case_rate with forecast date 2021-07-24 | Trained + 2. Lagging: death_rate by 0, 6, 11, (lat adj) | Trained + 3. Lagging: case_rate by 6, 10, (lat adj) | Trained + 4. Leading: death_rate by 7 | Trained + +--- + + Code + r6 + Message + + -- Epi Recipe ------------------------------------------------------------------ + + -- Inputs + Number of variables by role + raw: 2 + geo_value: 1 + time_value: 1 + + -- Operations + 1. Lagging: death_rate by 0, 7, 14 + 2. extend_ahead: all future predictors with latency set at train time + 3. Leading: death_rate by 7 + +--- + + Code + prep(r6, case_death_rate_subset) + Message + + -- Epi Recipe ------------------------------------------------------------------ + + -- Inputs + Number of variables by role + raw: 2 + geo_value: 1 + time_value: 1 + + -- Training information + Training data contained 20496 data points and no incomplete rows. + + -- Operations + 1. Lagging: death_rate by 0, 7, 14 | Trained + 2. extend_ahead: case_rate, ... with forecast date 2022-05-31 | Trained + 3. Leading: death_rate by -158, (lat adj) | Trained + diff --git a/tests/testthat/_snaps/step_epi_shift.md b/tests/testthat/_snaps/step_epi_shift.md index 44c828118..eaf495995 100644 --- a/tests/testthat/_snaps/step_epi_shift.md +++ b/tests/testthat/_snaps/step_epi_shift.md @@ -30,7 +30,7 @@ Code slm_fit(r4) Condition - Error in `bake()`: + Error in `add_shifted_columns()`: ! Name collision occured in The following variable name already exists: "lag_7_death_rate". diff --git a/tests/testthat/_snaps/step_growth_rate.md b/tests/testthat/_snaps/step_growth_rate.md index 5a3ac6f44..0912977e3 100644 --- a/tests/testthat/_snaps/step_growth_rate.md +++ b/tests/testthat/_snaps/step_growth_rate.md @@ -117,5 +117,5 @@ step_growth_rate(r, value, replace_Inf = c(1, 2)) Condition Error in `step_growth_rate()`: - ! replace_Inf must be a scalar. + ! `replace_Inf` must be a scalar. 
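Editor's note: the `test-arx_args_list.R` change below upgrades a warning to a classed error when `forecast_date`, `target_date`, and `ahead` disagree. The invariant is simply `forecast_date + ahead == target_date`; here is a minimal sketch of such a check (a hypothetical helper, not the package's actual internals; the error class is copied from the test as-is, including its "forecaste" spelling):

```r
library(cli)

check_forecast_dates <- function(forecast_date, target_date, ahead) {
  # hypothetical stand-in for the validation performed by arx_args_list()
  if (!is.null(forecast_date) && !is.null(target_date) &&
    forecast_date + ahead != target_date) {
    cli_abort(
      "`target_date` {target_date} isn't `forecast_date` {forecast_date} plus `ahead` {ahead}.",
      class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date"
    )
  }
  invisible(TRUE)
}

# errors, since 2022-01-01 + 1 day is 2022-01-02, not 2022-01-04:
# check_forecast_dates(as.Date("2022-01-01"), as.Date("2022-01-04"), 1L)
```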
diff --git a/tests/testthat/test-arx_args_list.R b/tests/testthat/test-arx_args_list.R index 03cbc0025..9bbff0135 100644 --- a/tests/testthat/test-arx_args_list.R +++ b/tests/testthat/test-arx_args_list.R @@ -26,11 +26,14 @@ test_that("arx_args checks inputs", { expect_snapshot(error = TRUE, arx_args_list(n_training_min = "de")) expect_snapshot(error = TRUE, arx_args_list(epi_keys = 1)) - expect_warning(arx_args_list( - forecast_date = as.Date("2022-01-01"), - target_date = as.Date("2022-01-03"), - ahead = 1L - )) + expect_error( + arx_args_list( + forecast_date = as.Date("2022-01-01"), + target_date = as.Date("2022-01-04"), + ahead = 1L + ), + class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date" + ) }) test_that("arx forecaster disambiguates quantiles", { diff --git a/tests/testthat/test-arx_forecaster.R b/tests/testthat/test-arx_forecaster.R new file mode 100644 index 000000000..4145a064d --- /dev/null +++ b/tests/testthat/test-arx_forecaster.R @@ -0,0 +1,26 @@ +train_data <- jhu_csse_daily_subset +test_that("arx_forecaster warns if forecast date beyond the implicit one", { + bad_date <- max(train_data$time_value) + 300 + expect_warning( + arx1 <- arx_forecaster( + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = (arx_args_list(forecast_date = bad_date)) + ), + class = "epipredict__arx_forecaster__forecast_date_defaulting" + ) +}) + +test_that("arx_forecaster errors if forecast date, target date, and ahead are inconsistent", { + max_date <- max(train_data$time_value) + expect_error( + arx1 <- arx_forecaster( + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = (arx_args_list(ahead = 5, target_date = max_date, forecast_date = max_date)) + ), + class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date" + ) +}) diff --git a/tests/testthat/test-dist_quantiles.R b/tests/testthat/test-dist_quantiles.R index 66f229956..9975213c6 100644 --- a/tests/testthat/test-dist_quantiles.R +++ b/tests/testthat/test-dist_quantiles.R @@ -41,6 +41,7 @@ test_that("quantile extrapolator works", { expect_s3_class(vctrs::vec_data(qq[1])[[1]], "dist_quantiles") expect_length(parameters(qq[1])$quantile_levels[[1]], 3L) + dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) qq <- extrapolate_quantiles(dstn, probs = c(.25, 0.5, .75)) expect_s3_class(qq, "distribution") diff --git a/tests/testthat/test-enframer.R b/tests/testthat/test-enframer.R deleted file mode 100644 index 0926c587b..000000000 --- a/tests/testthat/test-enframer.R +++ /dev/null @@ -1,13 +0,0 @@ -test_that("enframer errors/works as needed", { - template1 <- data.frame(aa = 1:5, a = NA, b = NA, c = NA) - template2 <- data.frame(aa = 1:5, a = 2:6, b = 2:6, c = 2:6) - expect_snapshot(error = TRUE, enframer(1:5, letters[1])) - expect_snapshot(error = TRUE, enframer(data.frame(a = 1:5), 1:3)) - expect_snapshot(error = TRUE, enframer(data.frame(a = 1:5), letters[1:3])) - expect_identical(enframer(data.frame(aa = 1:5), letters[1:3]), template1) - expect_snapshot(error = TRUE, enframer(data.frame(aa = 1:5), letters[1:2], fill = 1:4)) - expect_identical( - enframer(data.frame(aa = 1:5), letters[1:3], fill = 2:6), - template2 - ) -}) diff --git a/tests/testthat/test-epi_shift.R b/tests/testthat/test-epi_shift.R index 78c9384f1..e8e843f9c 100644 --- a/tests/testthat/test-epi_shift.R +++ b/tests/testthat/test-epi_shift.R @@ -1,23 +1,3 @@ -x <- data.frame(x1 = 1:10, x2 = -10:-1) -lags <- list(c(0, 4), 1:3) - -test_that("epi shift works 
with NULL keys", { - time_value <- 1:10 - out <- epi_shift(x, lags, time_value) - expect_length(out, 7L) - expect_equal(nrow(out), 14L) - expect_equal(sum(complete.cases(out)), 6L) -}) - -test_that("epi shift works with groups", { - keys <- data.frame(a = rep(letters[1:2], each = 5), b = "z") - time_value <- 1:10 - out <- epi_shift(x, lags, time_value, keys) - expect_length(out, 8L) - expect_equal(nrow(out), 18L) - expect_equal(sum(complete.cases(out)), 2L) -}) - test_that("epi shift single works, renames", { tib <- tibble( x = 1:5, y = 1:5, diff --git a/tests/testthat/test-epi_workflow.R b/tests/testthat/test-epi_workflow.R index 8bb58b0bc..ecd955cc5 100644 --- a/tests/testthat/test-epi_workflow.R +++ b/tests/testthat/test-epi_workflow.R @@ -79,17 +79,11 @@ test_that("forecast method works", { )) ) - args <- list( - fill_locf = TRUE, - n_recent = 360 * 3, - forecast_date = as.Date("2024-01-01") - ) expect_equal( - forecast(wf, !!!args), + forecast(wf), predict(wf, new_data = get_test_data( hardhat::extract_preprocessor(wf), - jhu, - !!!args + jhu )) ) }) diff --git a/tests/testthat/test-get_test_data.R b/tests/testthat/test-get_test_data.R index aa799150b..5f315c499 100644 --- a/tests/testthat/test-get_test_data.R +++ b/tests/testthat/test-get_test_data.R @@ -1,4 +1,4 @@ -library(dplyr) +suppressPackageStartupMessages(library(dplyr)) test_that("return expected number of rows and returned dataset is ungrouped", { r <- epi_recipe(case_death_rate_subset) %>% step_epi_ahead(death_rate, ahead = 7) %>% @@ -44,6 +44,7 @@ test_that("expect error that geo_value or time_value does not exist", { test_that("NA fill behaves as desired", { + testthat::skip() df <- tibble::tibble( geo_value = rep(c("ca", "ny"), each = 10), time_value = rep(1:10, times = 2), @@ -81,6 +82,7 @@ test_that("NA fill behaves as desired", { }) test_that("forecast date behaves", { + testthat::skip() df <- tibble::tibble( geo_value = rep(c("ca", "ny"), each = 10), time_value = rep(1:10, times = 2), diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R index 428922f46..491bf5e20 100644 --- a/tests/testthat/test-layer_add_forecast_date.R +++ b/tests/testthat/test-layer_add_forecast_date.R @@ -1,5 +1,7 @@ jhu <- case_death_rate_subset %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) +attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 + r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% @@ -62,11 +64,6 @@ test_that("Do not specify a forecast_date in `layer_add_forecast_date()`", { layer_naomit(.pred) wf3 <- wf %>% add_frosting(f3) - # this warning has been removed - # expect_warning( - # p3 <- predict(wf3, latest), - # "forecast_date is less than the most recent update date of the data." 
- # ) expect_silent(p3 <- predict(wf3, latest)) expect_equal(ncol(p3), 4L) expect_s3_class(p3, "epi_df") @@ -75,6 +72,34 @@ test_that("Do not specify a forecast_date in `layer_add_forecast_date()`", { expect_named(p3, c("geo_value", "time_value", ".pred", "forecast_date")) }) +test_that("`layer_add_forecast_date()` infers correct date when using `adjust_latency`", { + jhu_reasonable_date <- jhu + attributes(jhu_reasonable_date)$metadata$as_of <- as.Date("2022-01-03") + r_latent <- epi_recipe(jhu_reasonable_date) %>% + step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% + step_adjust_latency(method = "extend_ahead") %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_naomit(all_predictors()) %>% + step_naomit(all_outcomes(), skip = TRUE) + frost_latent <- frosting() %>% + layer_predict() %>% + layer_add_forecast_date() %>% + layer_naomit(.pred) + wf_latent <- epi_workflow(r_latent, parsnip::linear_reg()) %>% + fit(jhu_reasonable_date) %>% + add_frosting(frost_latent) + reasonable_date <- jhu %>% + dplyr::filter(time_value >= max(time_value) - 14) + p_latent <- predict(wf_latent, reasonable_date) + expect_equal( + p_latent$forecast_date, + rep(as.Date("2022-01-03"), times = 3) + ) + expect_equal( + p_latent$forecast_date - p_latent$time_value, + as.difftime(rep(3, times = 3), units = "days") + ) +}) test_that("forecast date works for daily", { f <- frosting() %>% diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R index 53506ad07..8bdb3a76b 100644 --- a/tests/testthat/test-layer_add_target_date.R +++ b/tests/testthat/test-layer_add_target_date.R @@ -6,6 +6,7 @@ r <- epi_recipe(jhu) %>% step_naomit(all_predictors()) %>% step_naomit(all_outcomes(), skip = TRUE) wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) +attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 latest <- jhu %>% dplyr::filter(time_value >= max(time_value) - 14) @@ -39,6 +40,27 @@ test_that("Use ahead + max time value from pre, fit, post", { expect_equal(p2$target_date, rep(as.Date("2022-01-07"), times = 3)) expect_named(p2, c("geo_value", "time_value", ".pred", "forecast_date", "target_date")) }) +test_that("latency adjust doesn't interfere with correct target date", { + r_latent <- epi_recipe(jhu) %>% + step_adjust_latency(method = "extend_ahead") %>% + step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_naomit(all_predictors()) %>% + step_naomit(all_outcomes(), skip = TRUE) + wf_latent <- epi_workflow(r_latent, parsnip::linear_reg()) %>% fit(jhu) + f_latent <- frosting() %>% + layer_predict() %>% + layer_add_target_date() %>% + layer_naomit(.pred) + wf_latent <- wf_latent %>% add_frosting(f_latent) + + expect_silent(p_latent <- predict(wf_latent, latest)) + expect_equal(ncol(p_latent), 4L) + expect_s3_class(p_latent, "epi_df") + expect_equal(nrow(p_latent), 3L) + expect_equal(p_latent$target_date, rep(as.Date("2022-01-10"), times = 3)) + expect_named(p_latent, c("geo_value", "time_value", ".pred", "target_date")) +}) test_that("Use ahead + specified forecast date", { f <- frosting() %>% diff --git a/tests/testthat/test-pad_to_end.R b/tests/testthat/test-pad_to_end.R deleted file mode 100644 index 6949f06ac..000000000 --- a/tests/testthat/test-pad_to_end.R +++ /dev/null @@ -1,37 +0,0 @@ -test_that("test set padding works", { - dat <- tibble::tibble( - gr1 = rep(c("a", "b"), times = c(3, 4)), - time_value = c(1:3, 1:4), - value = 1:7 - ) %>% arrange(time_value, gr1) - expect_identical(pad_to_end(dat, "gr1", 3), dat) 
- expect_equal(nrow(pad_to_end(dat, "gr1", 4)), 8L) - p <- pad_to_end(dat, "gr1", 5) - expect_equal(nrow(p), 10L) - expect_identical(p$gr1, rep(c("a", "b"), times = 5)) - expect_identical(p$time_value, rep(1:5, each = 2)) - expect_identical(p$value, as.integer(c(1, 4, 2, 5, 3, 6, NA, 7, NA, NA))) - - dat <- dat %>% arrange(gr1) - dat$gr2 <- c("c", "c", "d", "c", "c", "d", "d") - dat <- dat %>% arrange(time_value) - # don't treat it as a group - p <- pad_to_end(dat, "gr1", 4) - expect_identical(nrow(p), 8L) - expect_identical(p$gr2, c(rep("c", 4), "d", "d", NA, "d")) - - # treat it as a group (needs different time_value) - dat$time_value <- c(1, 1, 2, 2, 1, 1, 2) # double - p <- pad_to_end(dat, c("gr1", "gr2"), 2) - expect_equal(nrow(p), 8L) - expect_identical(p$gr1, rep(c("a", "a", "b", "b"), times = 2)) - expect_identical(p$gr2, rep(c("c", "d"), times = 4)) - expect_identical(p$time_value, rep(c(1, 2), each = 4)) - expect_identical(p$value, as.integer(c(1, 3, 4, 6, 2, NA, 5, 7))) - - # make sure it maintains the epi_df - dat <- dat %>% - dplyr::rename(geo_value = gr1) %>% - as_epi_df(other_keys = "gr2") - expect_s3_class(pad_to_end(dat, "geo_value", 2), "epi_df") -}) diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index 6337a2ea8..646afdf76 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -193,6 +193,63 @@ test_that("Postprocessing to get cases from case rate", { test_that("test joining by default columns", { + jhu <- case_death_rate_subset %>% + dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% + dplyr::select(geo_value, time_value, case_rate) + + reverse_pop_data <- data.frame( + geo_value = c("ca", "ny"), + values = c(1 / 20000, 1 / 30000) + ) + + r <- epi_recipe(jhu) %>% + step_population_scaling(case_rate, + df = reverse_pop_data, + df_pop_col = "values", + by = NULL, + suffix = "_scaled" + ) %>% + step_epi_lag(case_rate_scaled, lag = c(0, 7, 14)) %>% # cases + step_epi_ahead(case_rate_scaled, ahead = 7, role = "outcome") %>% # cases + recipes::step_naomit(recipes::all_predictors()) %>% + recipes::step_naomit(recipes::all_outcomes(), skip = TRUE) + + prep <- prep(r, jhu) + + b <- bake(prep, jhu) + + f <- frosting() %>% + layer_predict() %>% + layer_threshold(.pred) %>% + layer_naomit(.pred) %>% + layer_population_scaling(.pred, + df = reverse_pop_data, + by = NULL, + df_pop_col = "values" + ) + + wf <- epi_workflow( + r, + parsnip::linear_reg() + ) %>% + fit(jhu) %>% + add_frosting(f) + + latest <- get_test_data( + recipe = r, + x = case_death_rate_subset %>% + dplyr::filter( + time_value > "2021-11-01", + geo_value %in% c("ca", "ny") + ) %>% + dplyr::select(geo_value, time_value, case_rate) + ) + + + p <- predict(wf, latest) + + + jhu <- case_death_rate_subset %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% dplyr::select(geo_value, time_value, case_rate) diff --git a/tests/testthat/test-replace_Inf.R b/tests/testthat/test-replace_Inf.R index f9993ca13..c99a045b2 100644 --- a/tests/testthat/test-replace_Inf.R +++ b/tests/testthat/test-replace_Inf.R @@ -7,7 +7,7 @@ test_that("replace_inf works", { v1 = 1:5, v2 = c(1, 2, Inf, -Inf, NA) ) - library(dplyr) + suppressPackageStartupMessages(library(dplyr)) ok <- c("geo_value", "time_value") df2 <- df %>% mutate(across(!all_of(ok), ~ vec_replace_inf(.x, NA))) expect_identical(df[, 1:3], df2[, 1:3]) diff --git a/tests/testthat/test-snapshots.R b/tests/testthat/test-snapshots.R index 
d624a4c21..da8635ae0 100644 --- a/tests/testthat/test-snapshots.R +++ b/tests/testthat/test-snapshots.R @@ -68,6 +68,81 @@ test_that("arx_forecaster snapshots", { ) ) expect_snapshot_tibble(arx2$predictions) + attributes(train_data)$metadata$as_of <- max(train_data$time_value) + 5 + arx3 <- arx_forecaster( + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = arx_args_list( + ahead = 1L, + adjust_latency = "extend_ahead" + ) + ) + # consistency check + expect_snapshot_tibble(arx3$predictions) + expect_equal( + arx3$predictions$target_date, + rep(attributes(train_data)$metadata$as_of + 1, times = 6) + ) + expect_equal( + arx3$predictions$target_date, + arx2$predictions$target_date + 5 + ) + expect_equal( + arx3$predictions$forecast_date, + arx2$predictions$forecast_date + 5 + ) + # not the same predictions + expect_false(all(arx2$predictions == arx3$predictions)) + + + arx4 <- arx_forecaster( + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = arx_args_list( + ahead = 1L, + adjust_latency = "locf" + ) + ) + # consistency check + expect_snapshot_tibble(arx3$predictions) +}) + +test_that("arx_forecaster output format snapshots", { + jhu <- case_death_rate_subset %>% + dplyr::filter(time_value >= as.Date("2021-12-01")) + attributes(jhu)$metadata$as_of <- as.Date(attributes(jhu)$metadata$as_of) + out1 <- arx_forecaster( + jhu, "death_rate", + c("case_rate", "death_rate") + ) + expect_equal(as.Date(out1$metadata$forecast_created), Sys.Date()) + out1$metadata$forecast_created <- as.Date("0999-01-01") + expect_snapshot(out1) + out2 <- arx_forecaster(jhu, "case_rate", + c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list( + quantile_levels = 1:9 / 10, + adjust_latency = "extend_lags", + forecast_date = as.Date("2022-01-03") + ) + ) + expect_equal(as.Date(out2$metadata$forecast_created), Sys.Date()) + out2$metadata$forecast_created <- as.Date("0999-01-01") + expect_snapshot(out2) + out3 <- arx_forecaster(jhu, "death_rate", + c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list( + adjust_latency = "extend_ahead", + forecast_date = as.Date("2022-01-03") + ) + ) + expect_equal(as.Date(out3$metadata$forecast_created), Sys.Date()) + out3$metadata$forecast_created <- as.Date("0999-01-01") + expect_snapshot(out3) }) test_that("arx_classifier snapshots", { @@ -78,4 +153,33 @@ test_that("arx_classifier snapshots", { c("case_rate", "death_rate") ) expect_snapshot_tibble(arc1$predictions) + max_date <- case_death_rate_subset$time_value %>% max() + arc2 <- arx_classifier( + case_death_rate_subset %>% + dplyr::filter(time_value >= as.Date("2021-11-01")), + "death_rate", + c("case_rate", "death_rate"), + args_list = arx_class_args_list(adjust_latency = "extend_ahead", forecast_date = max_date + 2) + ) + expect_snapshot_tibble(arc2$predictions) + expect_error( + arc3 <- arx_classifier( + case_death_rate_subset %>% + dplyr::filter(time_value >= as.Date("2021-11-01")), + "death_rate", + c("case_rate", "death_rate"), + args_list = arx_class_args_list(adjust_latency = "extend_lags", forecast_date = max_date + 2) + ), + class = "epipredict__arx_classifier__adjust_latency_unsupported_method" + ) + expect_error( + arc4 <- arx_classifier( + case_death_rate_subset %>% + dplyr::filter(time_value >= as.Date("2021-11-01")), + "death_rate", + c("case_rate", "death_rate"), + args_list = arx_class_args_list(adjust_latency = "locf", forecast_date = max_date + 2) + ), + class = 
"epipredict__arx_classifier__adjust_latency_unsupported_method" + ) }) diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R new file mode 100644 index 000000000..2a9ea4419 --- /dev/null +++ b/tests/testthat/test-step_adjust_latency.R @@ -0,0 +1,605 @@ +library(dplyr) +# Test ideas that were dropped: +# - "epi_adjust_latency works correctly when there's gaps in the timeseries" +# - "epi_adjust_latency extend_ahead uses the same adjustment when predicting on new data after being baked" +# - "`step_adjust_latency` only allows one instance of itself" +# - "data with epi_df shorn off works" + +x <- tibble( + geo_value = rep("place", 200), + time_value = as.Date("2021-01-01") + 0:199, + case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, + death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 +) %>% + as_epi_df(as_of = as.POSIXct("2024-09-17")) +max_time <- max(x$time_value) +class(attributes(x)$metadata$as_of) +as_of <- attributes(x)$metadata$as_of +ahead <- 7 +latency <- 5 + +testing_as_of <- max_time + latency +# create x with a plausible as_of date +real_x <- x +attributes(real_x)$metadata$as_of <- testing_as_of + +slm_fit <- function(recipe, data = x) { + epi_workflow() %>% + add_epi_recipe(recipe) %>% + add_model(linear_reg()) %>% + fit(data = data) +} + + +# making a toy dataset with lag between geo_values +x_lagged <- x +x_lagged$time_value <- x$time_value - 1 +x_lagged$geo_value <- "other" +x_lagged <- add_row(x, x_lagged) +x_lagged +attributes(x_lagged)$metadata$as_of <- testing_as_of + +test_that("epi_adjust_latency correctly extends the lags", { + expect_warning(epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_adjust_latency(method = "extend_lags")) + + r1 <- epi_recipe(x) %>% + step_adjust_latency(method = "extend_lags") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + # directly checking the shifts + baked_x <- r1 %>% + prep(real_x) %>% + bake(real_x) + # map each column to its last non-NA value + last_dates <- baked_x %>% + tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% + group_by(name) %>% + summarise(last_date = max(time_value)) %>% + arrange(desc(last_date)) + expect_equal( + last_dates, + tribble( + ~name, ~last_date, + "lag_16_death_rate", max_time + 16, + "lag_11_death_rate", max_time + 11, + "lag_10_case_rate", max_time + 10, + "lag_6_case_rate", max_time + 6, + "lag_5_death_rate", max_time + 5, + "case_rate", max_time, + "death_rate", max_time, + "ahead_7_death_rate", max_time - 7, + ) + ) + + # the as_of on x is today's date, which is >970 days in the future + # also, there's no data >970 days in the past, so it gets an error trying to + # fit on no data + expect_error( + expect_warning( + expect_warning( + fit1 <- slm_fit(r1, data = x), + class = "epipredict__prep.step_latency__very_large_latency" + ), + class = "epipredict__prep.step_latency__very_large_latency" + ), + class = "simpleError" + ) + + # now trying with the as_of a reasonable distance in the future + fit1 <- slm_fit(r1, data = real_x) + expect_equal( + names(fit1$pre$mold$predictors), + c( + "lag_5_death_rate", "lag_11_death_rate", "lag_16_death_rate", + "lag_6_case_rate", "lag_10_case_rate" + ) + ) + latest <- get_test_data(r1, real_x) + pred <- predict(fit1, latest) + point_pred <- pred %>% filter(!is.na(.pred)) + expect_equal(nrow(point_pred), 1) + expect_equal(point_pred$time_value, 
+test_that("epi_adjust_latency correctly extends the lags", {
+ expect_warning(epi_recipe(x) %>%
+ step_epi_lag(death_rate, lag = c(0, 6, 11)) %>%
+ step_adjust_latency(method = "extend_lags"))
+
+ r1 <- epi_recipe(x) %>%
+ step_adjust_latency(method = "extend_lags") %>%
+ step_epi_lag(death_rate, lag = c(0, 6, 11)) %>%
+ step_epi_lag(case_rate, lag = c(1, 5)) %>%
+ step_epi_ahead(death_rate, ahead = ahead)
+ # directly checking the shifts
+ baked_x <- r1 %>%
+ prep(real_x) %>%
+ bake(real_x)
+ # map each column to its last non-NA value
+ last_dates <- baked_x %>%
+ tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>%
+ group_by(name) %>%
+ summarise(last_date = max(time_value)) %>%
+ arrange(desc(last_date))
+ expect_equal(
+ last_dates,
+ tribble(
+ ~name, ~last_date,
+ "lag_16_death_rate", max_time + 16,
+ "lag_11_death_rate", max_time + 11,
+ "lag_10_case_rate", max_time + 10,
+ "lag_6_case_rate", max_time + 6,
+ "lag_5_death_rate", max_time + 5,
+ "case_rate", max_time,
+ "death_rate", max_time,
+ "ahead_7_death_rate", max_time - 7,
+ )
+ )
+
+ # the as_of on x is 2024-09-17, which is >970 days after the last data;
+ # adjusting for that much latency leaves no data to fit on, so this errors
+ expect_error(
+ expect_warning(
+ expect_warning(
+ fit1 <- slm_fit(r1, data = x),
+ class = "epipredict__prep.step_latency__very_large_latency"
+ ),
+ class = "epipredict__prep.step_latency__very_large_latency"
+ ),
+ class = "simpleError"
+ )
+
+ # now trying with the as_of a reasonable distance in the future
+ fit1 <- slm_fit(r1, data = real_x)
+ expect_equal(
+ names(fit1$pre$mold$predictors),
+ c(
+ "lag_5_death_rate", "lag_11_death_rate", "lag_16_death_rate",
+ "lag_6_case_rate", "lag_10_case_rate"
+ )
+ )
+ latest <- get_test_data(r1, real_x)
+ pred <- predict(fit1, latest)
+ point_pred <- pred %>% filter(!is.na(.pred))
+ expect_equal(nrow(point_pred), 1)
+ expect_equal(point_pred$time_value, as.Date(testing_as_of))
+
+ expect_equal(
+ names(fit1$pre$mold$outcomes),
+ glue::glue("ahead_{ahead}_death_rate")
+ )
+ latest <- get_test_data(r1, x)
+ pred1 <- predict(fit1, latest)
+ actual_solutions <- pred1 %>% filter(!is.na(.pred))
+ expect_equal(actual_solutions$time_value, testing_as_of)
+
+ # should have six coefficients: the intercept plus five predictors
+ expect_equal(length(fit1$fit$fit$fit$coefficients), 6)
+
+ # result should be equivalent to just immediately doing the adjusted lags by
+ # hand
+ hand_adjusted <- epi_recipe(x) %>%
+ step_epi_lag(death_rate, lag = c(5, 11, 16)) %>%
+ step_epi_lag(case_rate, lag = c(6, 10)) %>%
+ step_epi_ahead(death_rate, ahead = ahead)
+ fit_hand_adj <- slm_fit(hand_adjusted, data = real_x)
+ expect_equal(
+ fit1$fit$fit$fit$coefficients,
+ fit_hand_adj$fit$fit$fit$coefficients
+ )
+})
+
+test_that("epi_adjust_latency correctly extends the ahead", {
+ r2 <- epi_recipe(x) %>%
+ step_adjust_latency(method = "extend_ahead") %>%
+ step_epi_lag(death_rate, lag = c(0, 6, 11)) %>%
+ step_epi_lag(case_rate, lag = c(1, 5)) %>%
+ step_epi_ahead(death_rate, ahead = ahead)
+ # the as_of on x is 2024-09-17, which is >970 days after the last data;
+ # adjusting for that much latency leaves no data to fit on, so this errors
+ expect_error(expect_warning(fit2 <- slm_fit(r2), class = "epipredict__prep.step_latency__very_large_latency"), class = "simpleError")
+ # real date example
+ fit2 <- slm_fit(r2, data = real_x)
+ expect_equal(
+ names(fit2$pre$mold$predictors),
+ c(
+ "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate",
+ "lag_1_case_rate", "lag_5_case_rate"
+ )
+ )
+ latest <- get_test_data(r2, real_x)
+ pred2 <- predict(fit2, latest)
+ point_pred2 <- pred2 %>% filter(!is.na(.pred))
+ # the prediction is still made from the last day of data
+ expect_equal(point_pred2$time_value, as.Date(max_time))
+ # target column renamed
+ expect_equal(
+ names(fit2$pre$mold$outcomes),
+ glue::glue("ahead_{ahead + latency}_death_rate")
+ )
+ # fit an equivalent forecaster
+ equivalent <- epi_recipe(x) %>%
+ step_epi_lag(death_rate, lag = c(0, 6, 11)) %>%
+ step_epi_lag(case_rate, lag = c(1, 5)) %>%
+ step_epi_ahead(death_rate, ahead = ahead + latency)
+ equiv_fit <- slm_fit(equivalent, data = real_x)
+ # adjusting the ahead via step_adjust_latency should match adjusting it directly
+ expect_equal(
+ fit2$fit$fit$fit$coefficients,
+ equiv_fit$fit$fit$fit$coefficients
+ )
+
+ # should have six coefficients: the intercept plus five predictors
+ expect_equal(length(fit2$fit$fit$fit$coefficients), 6)
+})
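+
+# a rough sketch of the "locf" method exercised below (`locf_sketch` is a
+# hypothetical helper, not the package implementation; it assumes daily data
+# with no internal gaps): each geo's last observed row is carried forward to
+# the forecast date before any lags or aheads are computed
+locf_sketch <- function(df, forecast_date) {
+ df %>%
+ group_by(geo_value) %>%
+ tidyr::complete(time_value = seq(min(time_value), forecast_date, by = 1)) %>%
+ tidyr::fill(case_rate, death_rate, .direction = "down") %>%
+ ungroup()
+}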
+test_that("epi_adjust_latency correctly locfs", {
+ r1 <- epi_recipe(x) %>%
+ step_adjust_latency(method = "locf") %>%
+ step_epi_lag(death_rate, lag = c(0, 6, 11)) %>%
+ step_epi_lag(case_rate, lag = c(1, 5)) %>%
+ step_epi_ahead(death_rate, ahead = ahead)
+
+ # directly checking the shifts
+ baked_x <- r1 %>%
+ prep(real_x) %>%
+ bake(real_x)
+ # map each column to its last non-NA value
+ last_dates <- baked_x %>%
+ tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>%
+ group_by(name) %>%
+ summarise(last_date = max(time_value)) %>%
+ arrange(desc(last_date))
+ expect_equal(
+ last_dates,
+ tribble(
+ ~name, ~last_date,
+ "lag_11_death_rate", max_time + 16,
+ "lag_6_death_rate", max_time + 11,
+ "lag_5_case_rate", max_time + 10,
+ "lag_1_case_rate", max_time + 6,
+ "case_rate", max_time + 5,
+ "death_rate", max_time + 5,
+ "lag_0_death_rate", max_time + 5,
+ "ahead_7_death_rate", max_time - 2,
+ )
+ )
+ # we expect a 5-fold repetition of the last values found in the original
+ # epi_df
+ last_real <- real_x %>%
+ group_by(geo_value) %>%
+ arrange(time_value) %>%
+ slice_tail() %>%
+ ungroup() %>%
+ select(case_rate, death_rate) %>%
+ tidyr::uncount(5)
+ # pulling just the region between the last day and the prediction day
+ filled_values <-
+ baked_x %>%
+ filter(
+ time_value > max(real_x$time_value),
+ time_value <= attributes(real_x)$metadata$as_of
+ ) %>%
+ ungroup() %>%
+ select(case_rate, death_rate)
+ expect_equal(last_real, filled_values)
+})
+
+test_that("epi_adjust_latency extends multiple aheads", {
+ aheads <- 1:3
+ r3 <- epi_recipe(x) %>%
+ step_adjust_latency(method = "extend_ahead") %>%
+ step_epi_lag(death_rate, lag = c(0, 6, 11)) %>%
+ step_epi_lag(case_rate, lag = c(1, 5)) %>%
+ step_epi_ahead(death_rate, ahead = aheads)
+ fitter <- smooth_quantile_reg(
+ quantile_levels = 0.5,
+ outcome_locations = aheads,
+ degree = 1L
+ )
+ epi_wf <- epi_workflow(r3, fitter)
+ # the as_of on x is 2024-09-17, which is >970 days after the last data;
+ # adjusting for that much latency leaves no data to fit on, so this errors
+ expect_error(expect_warning(fit3 <- fit(epi_wf, data = x), class = "epipredict__prep.step_latency__very_large_latency"), class = "simpleError")
+ # real date example
+ fit3 <- fit(epi_wf, data = real_x)
+ expect_equal(
+ names(fit3$pre$mold$outcomes),
+ c(
+ "ahead_6_death_rate", "ahead_7_death_rate", "ahead_8_death_rate"
+ )
+ )
+ expect_equal(
+ names(fit3$pre$mold$predictors),
+ c(
+ "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate",
+ "lag_1_case_rate", "lag_5_case_rate"
+ )
+ )
+ latest <- get_test_data(r3, real_x)
+ pred3 <- predict(fit3, latest)
+ point_pred <- pred3 %>%
+ unnest(.pred) %>%
+ filter(!is.na(distn))
+ # the prediction is still made from the last day of data
+ expect_equal(
+ point_pred$time_value,
+ rep(as.Date(max_time), 3)
+ )
+ # fit an equivalent forecaster with the aheads extended by hand
+ equivalent <- epi_recipe(x) %>%
+ step_epi_lag(death_rate, lag = c(0, 6, 11)) %>%
+ step_epi_lag(case_rate, lag = c(1, 5)) %>%
+ step_epi_ahead(death_rate, ahead = aheads + latency)
+ equiv_fit <- fit(epi_workflow(equivalent, fitter), data = real_x)
+ # adjusting the ahead should do the same thing as directly adjusting the ahead
+ expect_equal(
+ fit3$fit$fit$fit$rqfit,
+ equiv_fit$fit$fit$fit$rqfit
+ )
+
+ # should have six coefficients: the intercept plus five predictors
+ expect_equal(length(fit3$fit$fit$fit$rqfit$coefficients), 6)
+})
+
+test_that("epi_adjust_latency fixed_forecast_date works", {
+ r4 <- epi_recipe(x) %>%
+ step_adjust_latency(method = "extend_lags", fixed_forecast_date = max_time + 14) %>%
+ step_epi_lag(death_rate, lag = c(0, 6, 11)) %>%
+ step_epi_lag(case_rate, lag = c(1, 5)) %>%
+ step_epi_ahead(death_rate, ahead = ahead)
+ baked_x <- r4 %>%
+ prep(real_x) %>%
+ bake(real_x)
+ # map each column to its last non-NA value
+ last_dates <- baked_x %>%
+ tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>%
+ group_by(name) %>%
+ summarise(last_date = max(time_value)) %>%
+ arrange(desc(last_date))
+ expect_equal(
+ last_dates,
+ tribble(
+ ~name, ~last_date,
+ "lag_25_death_rate", max_time + 25,
+ "lag_20_death_rate", max_time + 20,
+ "lag_19_case_rate", max_time + 19,
+ "lag_15_case_rate", max_time + 15,
+ "lag_14_death_rate", max_time + 14,
+ "case_rate", max_time,
+ "death_rate", max_time,
+ "ahead_7_death_rate", max_time - 7,
+ )
+ )
+})
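+
+# the latency arithmetic behind the fixed_forecast_date test above and the
+# fixed_latency test below (daily data): fixed_forecast_date = max_time + 14
+# forces a latency of 14, while fixed_latency = 2 forces a latency of 2, and
+# each extended lag is lag + latency
+stopifnot(all(c(0, 6, 11) + 14 == c(14, 20, 25)), all(c(1, 5) + 14 == c(15, 19)))
+stopifnot(all(c(0, 6, 11) + 2 == c(2, 8, 13)), all(c(1, 5) + 2 == c(3, 7)))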
+test_that("epi_adjust_latency fixed_latency works", {
+ r4.1 <- epi_recipe(x) %>%
+ step_adjust_latency(method = "extend_lags", fixed_latency = 2) %>%
+ step_epi_lag(death_rate, lag = c(0, 6, 11)) %>%
+ step_epi_lag(case_rate, lag = c(1, 5)) %>%
+ step_epi_ahead(death_rate, ahead = ahead)
+ baked_x <- r4.1 %>%
+ prep(real_x) %>%
+ bake(real_x)
+ # map each column to its last non-NA value
+ last_dates <- baked_x %>%
+ tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>%
+ group_by(name) %>%
+ summarise(last_date = max(time_value)) %>%
+ arrange(desc(last_date))
+ expect_equal(
+ last_dates,
+ tribble(
+ ~name, ~last_date,
+ "lag_13_death_rate", max_time + 13,
+ "lag_8_death_rate", max_time + 8,
+ "lag_7_case_rate", max_time + 7,
+ "lag_3_case_rate", max_time + 3,
+ "lag_2_death_rate", max_time + 2,
+ "case_rate", max_time,
+ "death_rate", max_time,
+ "ahead_7_death_rate", max_time - 7,
+ )
+ )
+})
+
+
+# todo test variants on the columns for which this is applied
+# todo need to have both on columns 1, and 2
+
+
+
+# test_that("epi_adjust_latency works for other time types", {})
+
+test_that("epi_adjust_latency warns when there are steps before it", {
+ expect_warning(
+ r5 <- epi_recipe(x) %>%
+ step_epi_lag(death_rate, lag = c(0, 6, 11)) %>%
+ step_adjust_latency(method = "extend_lags"),
+ regexp = "extend_lags"
+ )
+ expect_warning(
+ r5 <- epi_recipe(x) %>%
+ step_epi_ahead(death_rate, ahead = ahead) %>%
+ step_adjust_latency(method = "extend_ahead"),
+ regexp = "extend_ahead"
+ )
+})
+
+# TODO check that epi_adjust_latency errors for nonsense `as_of`'s
+
+
+# TODO make sure that `epi_keys_checked` works correctly for extra epi_keys
+
+test_that("epi_adjust_latency correctly extends the lags when there are different delays per-geo", {
+ r5 <- epi_recipe(x_lagged) %>%
+ step_adjust_latency(method = "extend_lags") %>%
+ step_epi_lag(death_rate, lag = c(0, 6, 11)) %>%
+ step_epi_lag(case_rate, lag = c(1, 5)) %>%
+ step_epi_ahead(death_rate, ahead = ahead)
+ # x_lagged's as_of is a reasonable distance in the future
+ fit5 <- slm_fit(r5, data = x_lagged)
+ expect_equal(
+ names(fit5$pre$mold$predictors),
+ c(
+ "lag_6_death_rate", "lag_12_death_rate", "lag_17_death_rate",
+ "lag_7_case_rate", "lag_11_case_rate"
+ )
+ )
+ latest <- get_test_data(r5, x_lagged)
+ pred <- predict(fit5, latest)
+ point_pred <- pred %>% filter(!is.na(.pred))
+ expect_equal(nrow(point_pred), 1)
+ expect_equal(point_pred$time_value, as.Date(testing_as_of) + 1)
+
+ expect_equal(
+ names(fit5$pre$mold$outcomes),
+ glue::glue("ahead_{ahead}_death_rate")
+ )
+ latest <- get_test_data(r5, x)
+ pred <- predict(fit5, latest)
+ actual_solutions <- pred %>% filter(!is.na(.pred))
+ expect_equal(actual_solutions$time_value, testing_as_of + 1)
+
+ # should have six coefficients: the intercept plus five predictors
+ expect_equal(length(fit5$fit$fit$fit$coefficients), 6)
+
+ # result should be equivalent to just immediately doing the adjusted lags by
+ # hand
+ hand_adjusted <- epi_recipe(x) %>%
+ step_epi_lag(death_rate, lag = c(6, 12, 17)) %>%
+ step_epi_lag(case_rate, lag = c(7, 11)) %>%
+ step_epi_ahead(death_rate, ahead = ahead)
+ fit_hand_adj <- slm_fit(hand_adjusted, data = real_x)
+ expect_equal(
+ fit5$fit$fit$fit$coefficients,
+ fit_hand_adj$fit$fit$fit$coefficients
+ )
+})
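+
+# bookkeeping for the per-geo tests here: geo "other" trails geo "place" by
+# one day, so the latency is taken from the least recent geo,
+# as_of - min(per-geo max(time_value)) = 5 + 1 = 6 days
+stopifnot(as.integer(testing_as_of - (max_time - 1)) == 6)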
"lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate", + "lag_1_case_rate", "lag_5_case_rate" + ) + ) + latest <- get_test_data(r5, x_lagged) + pred <- predict(fit5, latest) + point_pred <- pred %>% filter(!is.na(.pred)) + expect_equal(nrow(point_pred), 1) + expect_equal(point_pred$time_value, as.Date(max_time)) + joint_latency <- latency + 1 + expect_equal( + names(fit5$pre$mold$outcomes), + glue::glue("ahead_{ahead+joint_latency}_death_rate") + ) + actual_solutions <- pred %>% filter(!is.na(.pred)) + expect_equal(actual_solutions$time_value, as.Date(max_time)) + + # should have four predictors, including the intercept + expect_equal(length(fit5$fit$fit$fit$coefficients), 6) + + # result should be equivalent to just immediately doing the adjusted lags by + # hand + hand_adjusted <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead + joint_latency) + + fit_hand_adj <- slm_fit(hand_adjusted, data = real_x) + expect_equal( + fit5$fit$fit$fit$coefficients, + fit_hand_adj$fit$fit$fit$coefficients + ) +}) + +test_that("`step_adjust_latency` only uses the columns specified in the `...`", { + r5 <- epi_recipe(x) %>% + step_adjust_latency(case_rate, method = "extend_lags") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + + fit5 <- slm_fit(r5, data = real_x) + expect_equal(names(fit5$fit$fit$fit$coefficients), c("(Intercept)", "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate", "lag_6_case_rate", "lag_10_case_rate")) + + r51 <- epi_recipe(x) %>% + step_adjust_latency(case_rate, method = "locf") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + + baked_x <- r51 %>% + prep(real_x) %>% + bake(real_x) + # map each column to its last non-NA value + last_dates <- baked_x %>% + tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% + group_by(name) %>% + summarise(last_date = max(time_value)) %>% + arrange(desc(last_date)) %>% + mutate(locf_date = last_date - latency) + # iterate over all columns and make sure the latent time period has the exact same values (so the variance is zero) + for (ii in seq(nrow(last_dates))) { + baked_var <- baked_x %>% + filter(last_dates[[ii, "locf_date"]] <= time_value, time_value <= last_dates[[ii, "last_date"]]) %>% + pull(last_dates[[ii, "name"]]) %>% + var() + if (grepl("case_rate", last_dates[[ii, "name"]])) { + expect_equal(baked_var, 0) + } else { + expect_true(baked_var > 0) + } + } +}) + +test_that("printing step_adjust_latency results in expected output", { + r5 <- epi_recipe(x) %>% + step_adjust_latency(case_rate, method = "extend_lags") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + expect_snapshot(r5) + expect_snapshot(prep(r5, real_x)) + r6 <- epi_recipe(case_death_rate_subset) %>% + step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% + step_adjust_latency(method = "extend_ahead") %>% + step_epi_ahead(death_rate, ahead = 7) + expect_snapshot(r6) + expect_snapshot(prep(r6, case_death_rate_subset)) +}) + +test_that("locf works as intended", { + expect_warning(epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_adjust_latency(method = "locf")) + + r6 <- epi_recipe(x) %>% + step_adjust_latency(method = "locf") %>% + 
+ step_epi_lag(death_rate, lag = c(0, 6, 11)) %>%
+ step_epi_lag(case_rate, lag = c(1, 5)) %>%
+ step_epi_ahead(death_rate, ahead = ahead)
+
+ # directly checking the shifts
+ baked_x <- r6 %>%
+ prep(real_x) %>%
+ bake(real_x)
+ # map each column to its last non-NA value
+ last_dates <- baked_x %>%
+ tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>%
+ group_by(name) %>%
+ summarise(last_date = max(time_value)) %>%
+ arrange(desc(last_date)) %>%
+ mutate(locf_date = last_date - latency)
+ # iterate over all columns and make sure the latent time period has the exact same values
+ for (ii in seq_len(nrow(last_dates))) {
+ baked_x %>%
+ filter(last_dates[[ii, "locf_date"]] <= time_value, time_value <= last_dates[[ii, "last_date"]]) %>%
+ pull(last_dates[[ii, "name"]]) %>%
+ var() %>%
+ expect_equal(0)
+ }
+
+ # the as_of on x is >970 days after the last data; locf fills that whole
+ # gap, so this fits, but warns about the very large latency
+ expect_warning(fit6 <- slm_fit(r6, data = x),
+ class = "epipredict__prep.step_latency__very_large_latency"
+ )
+
+ # now trying with the as_of a reasonable distance in the future
+ fit6 <- slm_fit(r6, data = real_x)
+ expect_equal(
+ names(fit6$pre$mold$predictors),
+ c(
+ "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate",
+ "lag_1_case_rate", "lag_5_case_rate"
+ )
+ )
+ latest <- get_test_data(r6, real_x)
+ pred <- predict(fit6, latest)
+ point_pred <- pred %>% filter(!is.na(.pred))
+ expect_equal(max(point_pred$time_value), as.Date(testing_as_of))
+
+ expect_equal(
+ names(fit6$pre$mold$outcomes),
+ glue::glue("ahead_{ahead}_death_rate")
+ )
+ latest <- get_test_data(r6, x)
+ pred1 <- predict(fit6, latest)
+ actual_solutions <- pred1 %>% filter(!is.na(.pred))
+ expect_equal(max(actual_solutions$time_value), testing_as_of)
+
+ # should have six coefficients: the intercept plus five predictors
+ expect_equal(length(fit6$fit$fit$fit$coefficients), 6)
+
+ # result should be equivalent to just immediately doing the locf by hand
+ hand_adjusted <- epi_recipe(x) %>%
+ step_epi_lag(death_rate, lag = c(0, 6, 11)) %>%
+ step_epi_lag(case_rate, lag = c(1, 5)) %>%
+ step_epi_ahead(death_rate, ahead = ahead)
+ locf_x <- real_x %>% rbind(tibble(
+ geo_value = rep("place", latency),
+ time_value = max_time + 1:latency,
+ case_rate = rep(real_x$case_rate[nrow(x)], latency),
+ death_rate = rep(real_x$death_rate[nrow(x)], latency)
+ ))
+ fit_hand_adj <- slm_fit(hand_adjusted, data = locf_x)
+ expect_equal(
+ fit6$fit$fit$fit$coefficients,
+ fit_hand_adj$fit$fit$fit$coefficients
+ )
+})
diff --git a/tests/testthat/test-step_epi_naomit.R b/tests/testthat/test-step_epi_naomit.R index 0e5e1750f..2f361ec98 100644 --- a/tests/testthat/test-step_epi_naomit.R +++ b/tests/testthat/test-step_epi_naomit.R @@ -1,6 +1,6 @@ -library(dplyr) -library(parsnip) -library(workflows) +suppressPackageStartupMessages(library(dplyr)) +suppressPackageStartupMessages(library(parsnip)) +suppressPackageStartupMessages(library(workflows)) # Random generated dataset x <- tibble( diff --git a/tests/testthat/test-step_lag_difference.R b/tests/testthat/test-step_lag_difference.R index 6ff9884a7..d23a3b4fa 100644 --- a/tests/testthat/test-step_lag_difference.R +++ b/tests/testthat/test-step_lag_difference.R @@ -52,7 +52,7 @@ test_that("step_lag_difference works for a single signal", { }) -test_that("step_lag_difference works for a two signals", { +test_that("step_lag_difference works for a two-signal epi_df", { df <- data.frame( time_value =
1:5, geo_value = rep("a", 5), diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R new file mode 100644 index 000000000..3ba6453a0 --- /dev/null +++ b/tests/testthat/test-utils_latency.R @@ -0,0 +1,167 @@
+time_values <- as.Date("2021-01-01") +
+floor(seq(0, 100, by = .5))[1:200]
+as_of <- max(time_values) + 5
+max_time <- max(time_values)
+old_data <- tibble(
+ geo_value = rep(c("place1", "place2"), 100),
+ time_value = as.Date("2021-01-01") +
+floor(seq(0, 100, by = .5))[1:200],
+ case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1,
+ tmp_death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1
+) %>%
+ # place2 is slightly more recent than place1
+ mutate(time_value = as.Date(ifelse(geo_value == "place2", time_value + 1, time_value))) %>%
+ as_epi_df(as_of = as_of)
+keys <- c("time_value", "geo_value")
+old_data <- old_data %>%
+ full_join(epi_shift_single(
+ old_data, "tmp_death_rate", 1, "death_rate", keys
+ ), by = keys) %>%
+ select(-tmp_death_rate)
+# old_data is constructed so that death_rate has a latency of 4, while
+# case_rate has a latency of 5
+modified_data <-
+ old_data %>%
+ dplyr::full_join(
+ epi_shift_single(old_data, "case_rate", -4, "case_rate_a", keys),
+ by = keys
+ ) %>%
+ dplyr::full_join(
+ epi_shift_single(old_data, "case_rate", 3, "case_rate_b", keys),
+ by = keys
+ ) %>%
+ dplyr::full_join(
+ epi_shift_single(old_data, "death_rate", 7, "death_rate_a", keys),
+ by = keys
+ ) %>%
+ arrange(time_value)
+time_range <- as.Date("2021-01-01") + 0:199
+x_adjust_ahead <- tibble(
+ geo_value = rep("place", 200),
+ time_value = time_range,
+ case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1,
+ death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1
+) %>%
+ as_epi_df(as_of = max(time_range) + 3)
+
+toy_df <- tribble(
+ ~geo_value, ~time_value, ~a, ~b,
+ "ma", as.Date("2015-01-11"), 20, 6,
+ "ma", as.Date("2015-01-12"), 23, NA,
+ "ma", as.Date("2015-01-13"), 25, NA,
+ "ca", as.Date("2015-01-11"), 100, 5,
+ "ca", as.Date("2015-01-12"), 103, 10,
+) %>%
+ as_epi_df(as_of = as.Date("2015-01-14"))
+
+test_that("get_latency works", {
+ expect_equal(get_latency(modified_data, as_of, "case_rate", 1, "geo_value"), 5)
+ expect_equal(get_latency(modified_data, as_of, "case_rate", -1, "geo_value"), -5)
+ expect_equal(get_latency(modified_data, as_of, "death_rate", 1, "geo_value"), 4)
+ expect_equal(get_latency(modified_data, as_of, "case_rate_a", 1, "geo_value"), 5 + 4)
+ expect_equal(get_latency(modified_data, as_of, "case_rate_b", 1, "geo_value"), 5 - 3)
+ expect_equal(get_latency(modified_data, as_of, "death_rate_a", 1, "geo_value"), 4 - 7)
+ expect_equal(get_latency(toy_df, as.Date("2015-01-14"), "a", 1, "geo_value"), 2)
+ expect_equal(get_latency(toy_df, as.Date("2015-01-14"), "a", -1, "geo_value"), -2)
+ expect_equal(get_latency(toy_df, as.Date("2015-01-14"), "b", 1, "geo_value"), 3)
+ expect_equal(get_latency(toy_df, as.Date("2015-01-14"), "b", -1, "geo_value"), -3)
+})
+
+test_that("get_latency infers max_time as the minimum per-group `max time` across the specified keys", {
+ # place2 is already 1 day less latent than place1, so decreasing its
+ # latency further should have no effect
+ place2_delayed_data <- modified_data %>% mutate(time_value = time_value + 3 * (geo_value == "place2"))
+ expect_equal(get_latency(place2_delayed_data, as_of, "case_rate", 1, "geo_value"), 5)
+ # decreasing place1's latency by more than 1 pushes it past place2, so the
+ # overall latency changes by at most 1
+ place1_delayed_data <- modified_data %>% mutate(time_value = time_value + 5 * (geo_value == "place1"))
+ expect_equal(get_latency(place1_delayed_data, as_of, "case_rate", 1, "geo_value"), 4)
+})
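+
+# a rough model of the semantics checked above (`toy_latency` is a
+# hypothetical helper, not the real implementation): the latency of `col` is
+# the signed number of days from its last non-NA observation, minimized
+# across geos, to the forecast date
+toy_latency <- function(df, col, as_of, sign_shift = 1) {
+ df %>%
+ filter(!is.na(.data[[col]])) %>%
+ group_by(geo_value) %>%
+ summarise(last_obs = max(time_value)) %>%
+ summarise(latency = sign_shift * as.integer(as_of - min(last_obs))) %>%
+ pull(latency)
+}
+stopifnot(toy_latency(toy_df, "a", as.Date("2015-01-14")) == 2)
+stopifnot(toy_latency(toy_df, "b", as.Date("2015-01-14"), sign_shift = -1) == -3)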
"case_rate", 1, "geo_value"), 5) + # decreaseing the latency of place1 more than 1 pushes it past place2, so at most changes the latency by 1 + place1_delayed_data <- modified_data %>% mutate(time_value = time_value + 5 * (geo_value == "place1")) + expect_equal(get_latency(place1_delayed_data, as_of, "case_rate", 1, "geo_value"), 4) +}) + + +test_that("get_forecast_date works", { + info <- tribble( + ~variable, ~type, ~role, ~source, + "time_value", "date", "time_value", "original", + "geo_value", "nominal", "geo_value", "original", + "case_rate", "numeric", "raw", "original", + "death_rate", "numeric", "raw", "original", + "not_real", "numeric", "predictor", "derived" + ) + expect_equal(get_forecast_date(modified_data, info, "geo_value", NULL), as_of) + expect_equal(get_forecast_date(modified_data, info, "", NULL), as_of) + expect_equal(get_forecast_date(modified_data, info, NULL, NULL), as_of) +}) + +test_that("pad_to_end works correctly", { + single_ex <- tribble( + ~geo_value, ~time_value, ~a, ~b, + "1", as.Date("1066-10-13"), 2, -.6, + # internal NA + "1", as.Date("1066-10-14"), NA, NA, + "1", as.Date("1066-10-15"), 1, -.5, + "2", as.Date("1066-10-13"), 3, .9, + # note these are intentionally out of order + "3", as.Date("1066-10-14"), 2.5, NA, + "3", as.Date("1066-10-13"), 2, -.6, + ) %>% + as_epi_df(as_of = "1066-10-16") + expect_equal( + single_ex %>% pad_to_end("geo_value", as.Date("1066-10-16")), + rbind( + single_ex[-5, ], + tibble(geo_value = "1", time_value = as.Date("1066-10-16"), a = 1, b = -.5), + tibble( + geo_value = "2", + time_value = seq.Date( + from = as.Date("1066-10-14"), + to = as.Date("1066-10-16"), + by = 1 + ), + a = 3, b = .9 + ), + tibble( + geo_value = "3", + time_value = seq.Date( + from = as.Date("1066-10-14"), + to = as.Date("1066-10-16"), + by = 1 + ), + a = 2.5, b = -0.6 + ) + ) %>% arrange(geo_value, time_value) + ) +}) + + +test_that("pad_to_end handles weeks", { + single_ex <- tribble( + ~geo_value, ~time_value, ~a, ~b, + "1", as.Date("1066-10-14"), 2, -.6, + "1", as.Date("1066-10-21"), 1, -.5, + "2", as.Date("1066-10-14"), 3, .9 + ) %>% + as_epi_df(as_of = "1066-10-28") + expect_equal( + single_ex %>% pad_to_end("geo_value", as.Date("1066-10-28")), + rbind( + single_ex, + tibble(geo_value = "1", time_value = as.Date("1066-10-28"), a = 1, b = -.5), + tibble( + geo_value = "2", + time_value = seq.Date( + from = as.Date("1066-10-21"), + to = as.Date("1066-10-28"), + by = 7 + ), + a = 3, b = .9 + ) + ) %>% arrange(geo_value, time_value) + ) +}) +# todo case where somehow columns of different roles are selected diff --git a/vignettes/articles/sliding.Rmd b/vignettes/articles/sliding.Rmd index 1556c4a72..908caf307 100644 --- a/vignettes/articles/sliding.Rmd +++ b/vignettes/articles/sliding.Rmd @@ -328,7 +328,7 @@ confirmed_incidence_prop <- pub_covidcast( geo_type = "state", time_values = epirange(20200301, 20211231), geo_values = states, - issues = epirange(20000101, 20211231) + issues = epirange(20000101, 20221231) ) %>% select(geo_value, time_value, version = issue, case_rate = value) %>% arrange(geo_value, time_value) %>% @@ -341,7 +341,7 @@ deaths_incidence_prop <- pub_covidcast( geo_type = "state", time_values = epirange(20200301, 20211231), geo_values = states, - issues = epirange(20000101, 20211231) + issues = epirange(20000101, 20221231) ) %>% select(geo_value, time_value, version = issue, death_rate = value) %>% arrange(geo_value, time_value) %>% diff --git a/vignettes/articles/smooth-qr.Rmd b/vignettes/articles/smooth-qr.Rmd index 
3d626b2e1..b93c726f6 100644 --- a/vignettes/articles/smooth-qr.Rmd +++ b/vignettes/articles/smooth-qr.Rmd @@ -195,7 +195,7 @@ smooth_fc <- function(x, aheads = 1:28, degree = 3L, quantiles = 0.5, fd) { the_fit <- ewf %>% fit(x) - latest <- get_test_data(rec, x, fill_locf = TRUE) + latest <- get_test_data(rec, x) preds <- predict(the_fit, new_data = latest) %>% mutate(forecast_date = fd, target_date = fd + ahead) %>% diff --git a/vignettes/articles/symptom-surveys.Rmd b/vignettes/articles/symptom-surveys.Rmd index f480db575..1e51a9963 100644 --- a/vignettes/articles/symptom-surveys.Rmd +++ b/vignettes/articles/symptom-surveys.Rmd @@ -423,7 +423,7 @@ res_err4 <- res_all4 %>% knitr::kable( res_err4 %>% group_by(model, lead) %>% - summarize(err = median(err), n = length(unique(forecast_date))) %>% + summarise(err = median(err), n = length(unique(forecast_date))) %>% arrange(lead) %>% ungroup() %>% rename( "Model" = model, "Median scaled error" = err, @@ -472,7 +472,7 @@ res_dif4 <- res_all4 %>% knitr::kable( res_dif4 %>% group_by(model, lead) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -501,7 +501,7 @@ ggplot_colors <- c("#FC4E07", "#00AFBB", "#E7B800") ggplot(res_dif4 %>% group_by(model, lead, forecast_date) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -649,7 +649,7 @@ knitr::kable( res_err2 %>% select(-ends_with("diff")) %>% group_by(model, lead) %>% - summarize(err = median(err), n = length(unique(forecast_date))) %>% + summarise(err = median(err), n = length(unique(forecast_date))) %>% arrange(lead) %>% ungroup() %>% rename( "Model" = model, "Median scaled error" = err, @@ -684,7 +684,7 @@ to incorporate during periods of time where forecasting is easier. ggplot( res_err2 %>% group_by(model, lead, forecast_date) %>% - summarize(err = median(err)) %>% ungroup(), + summarise(err = median(err)) %>% ungroup(), aes(x = forecast_date, y = err) ) + geom_line(aes(color = model)) + @@ -722,7 +722,7 @@ res_dif2 <- res_all2 %>% knitr::kable( res_dif2 %>% group_by(model, lead) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -739,7 +739,7 @@ quite small. ```{r} ggplot(res_dif2 %>% group_by(model, lead, forecast_date) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -802,7 +802,7 @@ err_by_lead <- res %>% ) %>% mutate(model = factor(model, labels = model_names[1:2])) %>% group_by(model, lead) %>% - summarize(err = median(err)) %>% + summarise(err = median(err)) %>% ungroup() ggplot(err_by_lead, aes(x = lead, y = err)) +