diff --git a/DESCRIPTION b/DESCRIPTION index d6f37ef63..36a84f425 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: epipredict Title: Basic epidemiology forecasting methods -Version: 0.1.13 +Version: 0.1.14 Authors@R: c( person("Daniel J.", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")), person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"), diff --git a/NAMESPACE b/NAMESPACE index 939a1f01a..f39d9bfbc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,7 +15,7 @@ S3method(apply_frosting,epi_workflow) S3method(augment,epi_workflow) S3method(autoplot,canned_epipred) S3method(autoplot,epi_workflow) -S3method(bake,check_enough_train_data) +S3method(bake,check_enough_data) S3method(bake,epi_recipe) S3method(bake,step_adjust_latency) S3method(bake,step_climate) @@ -49,7 +49,7 @@ S3method(key_colnames,recipe) S3method(mean,quantile_pred) S3method(predict,epi_workflow) S3method(predict,flatline) -S3method(prep,check_enough_train_data) +S3method(prep,check_enough_data) S3method(prep,epi_recipe) S3method(prep,step_adjust_latency) S3method(prep,step_climate) @@ -65,7 +65,7 @@ S3method(print,arx_class) S3method(print,arx_fcast) S3method(print,canned_epipred) S3method(print,cdc_baseline_fcast) -S3method(print,check_enough_train_data) +S3method(print,check_enough_data) S3method(print,climate_fcast) S3method(print,epi_recipe) S3method(print,epi_workflow) @@ -109,7 +109,7 @@ S3method(slather,layer_threshold) S3method(slather,layer_unnest) S3method(snap,default) S3method(snap,quantile_pred) -S3method(tidy,check_enough_train_data) +S3method(tidy,check_enough_data) S3method(tidy,frosting) S3method(tidy,layer) S3method(update,layer) @@ -142,7 +142,7 @@ export(autoplot) export(bake) export(cdc_baseline_args_list) export(cdc_baseline_forecaster) -export(check_enough_train_data) +export(check_enough_data) export(clean_f_name) export(climate_args_list) export(climatological_forecaster) diff --git a/NEWS.md b/NEWS.md index e117f3f52..24c8e3b73 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,6 +20,9 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat - Removes dependence on the `distributional` package, replacing the quantiles with `hardhat::quantile_pred()`. Some associated functions are deprecated with `lifecycle` messages. +- Rename `check_enough_train_data()` to `check_enough_data()`, and generalize it + enough to use as a check on either training or testing. +- Add check for enough data to predict in `arx_forecaster()` ## Improvements @@ -33,6 +36,7 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat - Add `climatological_forecaster()` to automatically create climate baselines - Replace `dist_quantiles()` with `hardhat::quantile_pred()` - Allow `quantile()` to threshold to an interval if desired (#434) +- `arx_forecaster()` detects if there's enough data to predict ## Bug fixes diff --git a/R/arx_classifier.R b/R/arx_classifier.R index d1aa292dd..bc8783610 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -222,7 +222,7 @@ arx_class_epi_workflow <- function( step_training_window(n_recent = args_list$n_training) if (!is.null(args_list$check_enough_data_n)) { - r <- check_enough_train_data( + r <- check_enough_data( r, recipes::all_predictors(), recipes::all_outcomes(), diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index fe9128c00..f988490fd 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -171,12 +171,14 @@ arx_fcast_epi_workflow <- function( step_epi_ahead(!!outcome, ahead = args_list$ahead) r <- r %>% step_epi_naomit() %>% - step_training_window(n_recent = args_list$n_training) + step_training_window(n_recent = args_list$n_training) %>% + check_enough_data(all_predictors(), min_observations = 1, skip = FALSE) + if (!is.null(args_list$check_enough_data_n)) { - r <- r %>% check_enough_train_data( + r <- r %>% check_enough_data( all_predictors(), - !!outcome, - n = args_list$check_enough_data_n, + all_outcomes(), + min_observations = args_list$check_enough_data_n, epi_keys = args_list$check_enough_data_epi_keys, drop_na = FALSE ) diff --git a/R/canned-epipred.R b/R/canned-epipred.R index 48e984168..7d53862c2 100644 --- a/R/canned-epipred.R +++ b/R/canned-epipred.R @@ -112,7 +112,7 @@ print.canned_epipred <- function(x, name, ...) { "At forecast date{?s}: {.val {fds}},", "For target date{?s}: {.val {tds}}," )) - if ("actions" %in% names(x$pre) && "recipe" %in% names(x$pre$actions)) { + if ("pre" %in% names(x) && "actions" %in% names(x$pre) && "recipe" %in% names(x$pre$actions)) { fit_recipe <- extract_recipe(x$epi_workflow) if (detect_step(fit_recipe, "adjust_latency")) { is_adj_latency <- map_lgl(fit_recipe$steps, function(x) inherits(x, "step_adjust_latency")) diff --git a/R/check_enough_data.R b/R/check_enough_data.R new file mode 100644 index 000000000..e830d5e54 --- /dev/null +++ b/R/check_enough_data.R @@ -0,0 +1,193 @@ +#' Check the dataset contains enough data points. +#' +#' `check_enough_data` creates a *specification* of a recipe +#' operation that will check if variables contain enough data. +#' +#' @param recipe A recipe object. The check will be added to the +#' sequence of operations for this recipe. +#' @param ... One or more selector functions to choose variables for this check. +#' See [selections()] for more details. You will usually want to use +#' [recipes::all_predictors()] and/or [recipes::all_outcomes()] here. +#' @param min_observations The minimum number of data points required for +#' training. If this is NULL, the total number of predictors will be used. +#' @param epi_keys A character vector of column names on which to group the data +#' and check threshold within each group. Useful if your forecaster trains +#' per group (for example, per geo_value). +#' @param drop_na A logical for whether to count NA values as valid rows. +#' @param role Not used by this check since no new variables are +#' created. +#' @param trained A logical for whether the selectors in `...` +#' have been resolved by [prep()]. +#' @param id A character string that is unique to this check to identify it. +#' @param skip A logical. If `TRUE`, only training data is checked, while if +#' `FALSE`, both training and predicting data is checked. Technically, this +#' answers the question "should the check be skipped when the recipe is baked +#' by [bake()]?" While all operations are baked when [prep()] is run, some +#' operations may not be able to be conducted on new data (e.g. processing the +#' outcome variable(s)). Care should be taken when using `skip = TRUE` as it +#' may affect the computations for subsequent operations. +#' @family checks +#' @export +#' @details This check will break the `prep` and/or bake function if any of the +#' checked columns have not enough non-NA values. If the check passes, nothing +#' is changed in the data. It is best used after every other step. +#' +#' For checking training data, it is best to set `...` to be +#' `all_predictors(), all_outcomes()`, while for checking prediction data, it +#' is best to set `...` to be `all_predictors()` only, with `n = 1`. +#' +#' # tidy() results +#' +#' When you [`tidy()`][tidy.recipe()] this check, a tibble with column +#' `terms` (the selectors or variables selected) is returned. +#' +check_enough_data <- + function(recipe, + ..., + min_observations = NULL, + epi_keys = NULL, + drop_na = TRUE, + role = NA, + trained = FALSE, + skip = TRUE, + id = rand_id("enough_data")) { + recipes::add_check( + recipe, + check_enough_data_new( + min_observations = min_observations, + epi_keys = epi_keys, + drop_na = drop_na, + terms = enquos(...), + role = role, + trained = trained, + columns = NULL, + skip = skip, + id = id + ) + ) + } + +check_enough_data_new <- + function(min_observations, epi_keys, drop_na, terms, + role, trained, columns, skip, id) { + recipes::check( + subclass = "enough_data", + prefix = "check_", + min_observations = min_observations, + epi_keys = epi_keys, + drop_na = drop_na, + terms = terms, + role = role, + trained = trained, + columns = columns, + skip = skip, + id = id + ) + } + +#' @export +prep.check_enough_data <- function(x, training, info = NULL, ...) { + col_names <- recipes::recipes_eval_select(x$terms, training, info) + if (is.null(x$min_observations)) { + x$min_observations <- length(col_names) + } + + check_enough_data_core(training, x, col_names, "train") + + check_enough_data_new( + min_observations = x$min_observations, + epi_keys = x$epi_keys, + drop_na = x$drop_na, + terms = x$terms, + role = x$role, + trained = TRUE, + columns = col_names, + skip = x$skip, + id = x$id + ) +} + +#' @export +bake.check_enough_data <- function(object, new_data, ...) { + col_names <- object$columns + check_enough_data_core(new_data, object, col_names, "predict") + new_data +} + +#' @export +print.check_enough_data <- function(x, width = max(20, options()$width - 30), ...) { + title <- paste0("Check enough data (n = ", x$min_observations, ") for ") + recipes::print_step(x$columns, x$terms, x$trained, title, width) + invisible(x) +} + +#' @export +tidy.check_enough_data <- function(x, ...) { + if (recipes::is_trained(x)) { + res <- tibble(terms = unname(x$columns)) + } else { + res <- tibble(terms = recipes::sel2char(x$terms)) + } + res$id <- x$id + res$min_observations <- x$min_observations + res$epi_keys <- x$epi_keys + res$drop_na <- x$drop_na + res +} + +check_enough_data_core <- function(epi_df, step_obj, col_names, train_or_predict) { + epi_df <- epi_df %>% + group_by(across(all_of(.env$step_obj$epi_keys))) + if (step_obj$drop_na) { + any_missing_data <- epi_df %>% + mutate(any_are_na = rowSums(across(any_of(.env$col_names), ~ is.na(.x))) > 0) %>% + # count the number of rows where they're all not na + summarise(sum(any_are_na == 0) < .env$step_obj$min_observations, .groups = "drop") + any_missing_data <- any_missing_data %>% + summarize(across(all_of(setdiff(names(any_missing_data), step_obj$epi_keys)), any)) %>% + any() + + # figuring out which individual columns (if any) are to blame for this dearth + # of data + cols_not_enough_data <- epi_df %>% + summarise( + across( + all_of(.env$col_names), + ~ sum(!is.na(.x)) < .env$step_obj$min_observations + ), + .groups = "drop" + ) %>% + # Aggregate across keys (if present) + summarise(across(all_of(.env$col_names), any), .groups = "drop") %>% + unlist() %>% + # Select the names of the columns that are TRUE + names(.)[.] + + # Either all columns have enough data, in which case this message won't be + # sent later or none of the single columns have enough data, that means its + # the combination of all of them. + if (length(cols_not_enough_data) == 0) { + cols_not_enough_data <- + glue::glue("no single column, but the combination of {paste0(col_names, collapse = ', ')}") + } + } else { + # if we're not dropping na values, just count + cols_not_enough_data <- epi_df %>% + summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$step_obj$min_observations)) + any_missing_data <- cols_not_enough_data %>% + summarize(across(all_of(.env$col_names), all)) %>% + all() + cols_not_enough_data <- cols_not_enough_data %>% + summarise(across(all_of(.env$col_names), any), .groups = "drop") %>% + unlist() %>% + # Select the names of the columns that are TRUE + names(.)[.] + } + + if (any_missing_data) { + cli_abort( + "The following columns don't have enough data to {train_or_predict}: {cols_not_enough_data}.", + class = "epipredict__not_enough_data" + ) + } +} diff --git a/R/check_enough_train_data.R b/R/check_enough_train_data.R deleted file mode 100644 index 1279a3712..000000000 --- a/R/check_enough_train_data.R +++ /dev/null @@ -1,145 +0,0 @@ -#' Check the dataset contains enough data points. -#' -#' `check_enough_train_data` creates a *specification* of a recipe -#' operation that will check if variables contain enough data. -#' -#' @param recipe A recipe object. The check will be added to the -#' sequence of operations for this recipe. -#' @param ... One or more selector functions to choose variables for this check. -#' See [selections()] for more details. You will usually want to use -#' [recipes::all_predictors()] here. -#' @param n The minimum number of data points required for training. If this is -#' NULL, the total number of predictors will be used. -#' @param epi_keys A character vector of column names on which to group the data -#' and check threshold within each group. Useful if your forecaster trains -#' per group (for example, per geo_value). -#' @param drop_na A logical for whether to count NA values as valid rows. -#' @param role Not used by this check since no new variables are -#' created. -#' @param trained A logical for whether the selectors in `...` -#' have been resolved by [prep()]. -#' @param columns An internal argument that tracks which columns are evaluated -#' for this check. Should not be used by the user. -#' @param id A character string that is unique to this check to identify it. -#' @param skip A logical. Should the check be skipped when the -#' recipe is baked by [bake()]? While all operations are baked -#' when [prep()] is run, some operations may not be able to be -#' conducted on new data (e.g. processing the outcome variable(s)). -#' Care should be taken when using `skip = TRUE` as it may affect -#' the computations for subsequent operations. -#' @family checks -#' @export -#' @details This check will break the `bake` function if any of the checked -#' columns have not enough non-NA values. If the check passes, nothing is -#' changed to the data. -#' -#' # tidy() results -#' -#' When you [`tidy()`][tidy.recipe()] this check, a tibble with column -#' `terms` (the selectors or variables selected) is returned. -#' -check_enough_train_data <- - function(recipe, - ..., - n = NULL, - epi_keys = NULL, - drop_na = TRUE, - role = NA, - trained = FALSE, - columns = NULL, - skip = TRUE, - id = rand_id("enough_train_data")) { - recipes::add_check( - recipe, - check_enough_train_data_new( - n = n, - epi_keys = epi_keys, - drop_na = drop_na, - terms = enquos(...), - role = role, - trained = trained, - columns = columns, - skip = skip, - id = id - ) - ) - } - -check_enough_train_data_new <- - function(n, epi_keys, drop_na, terms, role, trained, columns, skip, id) { - recipes::check( - subclass = "enough_train_data", - prefix = "check_", - n = n, - epi_keys = epi_keys, - drop_na = drop_na, - terms = terms, - role = role, - trained = trained, - columns = columns, - skip = skip, - id = id - ) - } - -#' @export -prep.check_enough_train_data <- function(x, training, info = NULL, ...) { - col_names <- recipes::recipes_eval_select(x$terms, training, info) - if (is.null(x$n)) { - x$n <- length(col_names) - } - - if (x$drop_na) { - training <- tidyr::drop_na(training) - } - cols_not_enough_data <- training %>% - group_by(across(all_of(.env$x$epi_keys))) %>% - summarise(across(all_of(.env$col_names), ~ dplyr::n() < .env$x$n), .groups = "drop") %>% - summarise(across(all_of(.env$col_names), any), .groups = "drop") %>% - unlist() %>% - names(.)[.] - - if (length(cols_not_enough_data) > 0) { - cli_abort( - "The following columns don't have enough data to predict: {cols_not_enough_data}." - ) - } - - check_enough_train_data_new( - n = x$n, - epi_keys = x$epi_keys, - drop_na = x$drop_na, - terms = x$terms, - role = x$role, - trained = TRUE, - columns = col_names, - skip = x$skip, - id = x$id - ) -} - -#' @export -bake.check_enough_train_data <- function(object, new_data, ...) { - new_data -} - -#' @export -print.check_enough_train_data <- function(x, width = max(20, options()$width - 30), ...) { - title <- paste0("Check enough data (n = ", x$n, ") for ") - recipes::print_step(x$columns, x$terms, x$trained, title, width) - invisible(x) -} - -#' @export -tidy.check_enough_train_data <- function(x, ...) { - if (recipes::is_trained(x)) { - res <- tibble(terms = unname(x$columns)) - } else { - res <- tibble(terms = recipes::sel2char(x$terms)) - } - res$id <- x$id - res$n <- x$n - res$epi_keys <- x$epi_keys - res$drop_na <- x$drop_na - res -} diff --git a/_pkgdown.yml b/_pkgdown.yml index fe34c3b82..814bf6aa4 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -77,7 +77,7 @@ reference: - title: Epi recipe verification checks contents: - - check_enough_train_data + - check_enough_data - title: Forecast postprocessing desc: Create a series of postprocessing operations diff --git a/man/check_enough_data.Rd b/man/check_enough_data.Rd new file mode 100644 index 000000000..969caa1d2 --- /dev/null +++ b/man/check_enough_data.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_enough_data.R +\name{check_enough_data} +\alias{check_enough_data} +\title{Check the dataset contains enough data points.} +\usage{ +check_enough_data( + recipe, + ..., + min_observations = NULL, + epi_keys = NULL, + drop_na = TRUE, + role = NA, + trained = FALSE, + skip = TRUE, + id = rand_id("enough_data") +) +} +\arguments{ +\item{recipe}{A recipe object. The check will be added to the +sequence of operations for this recipe.} + +\item{...}{One or more selector functions to choose variables for this check. +See \code{\link[=selections]{selections()}} for more details. You will usually want to use +\code{\link[recipes:has_role]{recipes::all_predictors()}} and/or \code{\link[recipes:has_role]{recipes::all_outcomes()}} here.} + +\item{min_observations}{The minimum number of data points required for +training. If this is NULL, the total number of predictors will be used.} + +\item{epi_keys}{A character vector of column names on which to group the data +and check threshold within each group. Useful if your forecaster trains +per group (for example, per geo_value).} + +\item{drop_na}{A logical for whether to count NA values as valid rows.} + +\item{role}{Not used by this check since no new variables are +created.} + +\item{trained}{A logical for whether the selectors in \code{...} +have been resolved by \code{\link[=prep]{prep()}}.} + +\item{skip}{A logical. If \code{TRUE}, only training data is checked, while if +\code{FALSE}, both training and predicting data is checked. Technically, this +answers the question "should the check be skipped when the recipe is baked +by \code{\link[=bake]{bake()}}?" While all operations are baked when \code{\link[=prep]{prep()}} is run, some +operations may not be able to be conducted on new data (e.g. processing the +outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it +may affect the computations for subsequent operations.} + +\item{id}{A character string that is unique to this check to identify it.} +} +\description{ +\code{check_enough_data} creates a \emph{specification} of a recipe +operation that will check if variables contain enough data. +} +\details{ +This check will break the \code{prep} and/or bake function if any of the +checked columns have not enough non-NA values. If the check passes, nothing +is changed in the data. It is best used after every other step. + +For checking training data, it is best to set \code{...} to be +\verb{all_predictors(), all_outcomes()}, while for checking prediction data, it +is best to set \code{...} to be \code{all_predictors()} only, with \code{n = 1}. +} +\section{tidy() results}{ +When you \code{\link[=tidy.recipe]{tidy()}} this check, a tibble with column +\code{terms} (the selectors or variables selected) is returned. +} + +\concept{checks} diff --git a/man/check_enough_train_data.Rd b/man/check_enough_train_data.Rd deleted file mode 100644 index 57a4a9d78..000000000 --- a/man/check_enough_train_data.Rd +++ /dev/null @@ -1,69 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/check_enough_train_data.R -\name{check_enough_train_data} -\alias{check_enough_train_data} -\title{Check the dataset contains enough data points.} -\usage{ -check_enough_train_data( - recipe, - ..., - n = NULL, - epi_keys = NULL, - drop_na = TRUE, - role = NA, - trained = FALSE, - columns = NULL, - skip = TRUE, - id = rand_id("enough_train_data") -) -} -\arguments{ -\item{recipe}{A recipe object. The check will be added to the -sequence of operations for this recipe.} - -\item{...}{One or more selector functions to choose variables for this check. -See \code{\link[=selections]{selections()}} for more details. You will usually want to use -\code{\link[recipes:has_role]{recipes::all_predictors()}} here.} - -\item{n}{The minimum number of data points required for training. If this is -NULL, the total number of predictors will be used.} - -\item{epi_keys}{A character vector of column names on which to group the data -and check threshold within each group. Useful if your forecaster trains -per group (for example, per geo_value).} - -\item{drop_na}{A logical for whether to count NA values as valid rows.} - -\item{role}{Not used by this check since no new variables are -created.} - -\item{trained}{A logical for whether the selectors in \code{...} -have been resolved by \code{\link[=prep]{prep()}}.} - -\item{columns}{An internal argument that tracks which columns are evaluated -for this check. Should not be used by the user.} - -\item{skip}{A logical. Should the check be skipped when the -recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked -when \code{\link[=prep]{prep()}} is run, some operations may not be able to be -conducted on new data (e.g. processing the outcome variable(s)). -Care should be taken when using \code{skip = TRUE} as it may affect -the computations for subsequent operations.} - -\item{id}{A character string that is unique to this check to identify it.} -} -\description{ -\code{check_enough_train_data} creates a \emph{specification} of a recipe -operation that will check if variables contain enough data. -} -\details{ -This check will break the \code{bake} function if any of the checked -columns have not enough non-NA values. If the check passes, nothing is -changed to the data. -} -\section{tidy() results}{ -When you \code{\link[=tidy.recipe]{tidy()}} this check, a tibble with column -\code{terms} (the selectors or variables selected) is returned. -} - -\concept{checks} diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 75d674472..9e1bafbd5 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -143,7 +143,7 @@ toy_recipe \%>\% #> #> # A tibble: 8 x 4 #> geo_value time_value a b -#> * +#> #> 1 ca 2015-01-11 100 5 #> 2 ca 2015-01-12 103 10 #> 3 ca 2015-01-13 103 10 @@ -179,7 +179,7 @@ toy_recipe \%>\% #> #> # A tibble: 21 x 7 #> geo_value time_value a b lag_3_a lag_4_b ahead_1_a -#> * +#> #> 1 ca 2015-01-10 NA NA NA NA 100 #> 2 ca 2015-01-11 100 5 NA NA 103 #> 3 ca 2015-01-12 103 10 NA NA NA @@ -227,7 +227,7 @@ toy_recipe \%>\% #> #> # A tibble: 10 x 6 #> geo_value time_value a b lag_0_a ahead_3_a -#> * +#> #> 1 ca 2015-01-08 NA NA NA 100 #> 2 ca 2015-01-09 NA NA NA 103 #> 3 ca 2015-01-11 100 5 100 NA @@ -267,8 +267,8 @@ while this will not: \if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") -#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't -#> work with modified data. +#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with +#> modified data. }\if{html}{\out{
}} If you create columns that you then apply lags to (such as diff --git a/tests/testthat/_snaps/check_enough_data.md b/tests/testthat/_snaps/check_enough_data.md new file mode 100644 index 000000000..4a6ff336d --- /dev/null +++ b/tests/testthat/_snaps/check_enough_data.md @@ -0,0 +1,54 @@ +# check_enough_data works on pooled data + + Code + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = 2 * n + 1, + drop_na = FALSE) %>% prep(toy_epi_df) + Condition + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to train: x and y. + +--- + + Code + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = 2 * n - 1, + drop_na = TRUE) %>% prep(toy_epi_df) + Condition + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to train: x. + +# check_enough_data works on unpooled data + + Code + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = n + 1, + epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df) + Condition + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to train: x and y. + +--- + + Code + epi_recipe(toy_epi_df) %>% check_enough_data(x, y, min_observations = 2 * n - 3, + epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df) + Condition + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to train: x and y. + +# check_enough_data only checks train data when skip = FALSE + + Code + forecaster %>% predict(new_data = toy_test_data %>% filter(time_value > + "2020-01-08")) + Condition + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to predict: x. + +# check_enough_data works with all_predictors() downstream of constructed terms + + Code + epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_data( + all_predictors(), y, min_observations = 2 * n - 4) %>% prep(toy_epi_df) + Condition + Error in `check_enough_data_core()`: + ! The following columns don't have enough data to train: no single column, but the combination of lag_1_x, lag_2_x, y. + diff --git a/tests/testthat/_snaps/check_enough_train_data.md b/tests/testthat/_snaps/check_enough_train_data.md deleted file mode 100644 index 8f2389acb..000000000 --- a/tests/testthat/_snaps/check_enough_train_data.md +++ /dev/null @@ -1,46 +0,0 @@ -# check_enough_train_data works on pooled data - - Code - epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n + 1, - drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) - Condition - Error in `prep()`: - ! The following columns don't have enough data to predict: x and y. - ---- - - Code - epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n - 1, - drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) - Condition - Error in `prep()`: - ! The following columns don't have enough data to predict: x and y. - -# check_enough_train_data works on unpooled data - - Code - epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = n + 1, epi_keys = "geo_value", - drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) - Condition - Error in `prep()`: - ! The following columns don't have enough data to predict: x and y. - ---- - - Code - epi_recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n - 3, - epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) - Condition - Error in `prep()`: - ! The following columns don't have enough data to predict: x and y. - -# check_enough_train_data works with all_predictors() downstream of constructed terms - - Code - epi_recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_train_data(all_predictors(), y, n = 2 * n - 5) %>% prep( - toy_epi_df) %>% bake(new_data = NULL) - Condition - Error in `prep()`: - ! The following columns don't have enough data to predict: lag_1_x, lag_2_x, and y. - diff --git a/tests/testthat/test-arx_forecaster.R b/tests/testthat/test-arx_forecaster.R index 0f2b9bd16..d13e6d2ea 100644 --- a/tests/testthat/test-arx_forecaster.R +++ b/tests/testthat/test-arx_forecaster.R @@ -24,3 +24,22 @@ test_that("arx_forecaster errors if forecast date, target date, and ahead are in class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date" ) }) + +test_that("warns if there's not enough data to predict", { + edf <- tibble( + geo_value = "ct", + time_value = seq(as.Date("2020-10-01"), as.Date("2023-05-31"), by = "day"), + ) %>% + mutate(value = seq_len(nrow(.)) + rnorm(nrow(.))) %>% + # Oct to May (flu season, ish) only: + filter(!dplyr::between(as.POSIXlt(time_value)$mon + 1L, 6L, 9L)) %>% + # and actually, pretend we're around mid-October 2022: + filter(time_value <= as.Date("2022-10-12")) %>% + as_epi_df(as_of = as.Date("2022-10-12")) + edf %>% filter(time_value > "2022-08-01") + + expect_error( + edf %>% arx_forecaster("value"), + class = "epipredict__not_enough_data" + ) +}) diff --git a/tests/testthat/test-check_enough_data.R b/tests/testthat/test-check_enough_data.R new file mode 100644 index 000000000..3ca388afb --- /dev/null +++ b/tests/testthat/test-check_enough_data.R @@ -0,0 +1,139 @@ +# Setup toy data +n <- 10 +toy_epi_df <- tibble::tibble( + time_value = rep( + seq( + as.Date("2020-01-01"), + by = 1, + length.out = n + ), + times = 2 + ), + geo_value = rep(c("ca", "hi"), each = n), + x = c(1:n, c(1:(n - 2), NA, NA)), + y = 1:(2 * n) +) %>% epiprocess::as_epi_df() + +test_that("check_enough_data works on pooled data", { + # Check both columns have enough data + expect_no_error( + epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = 2 * n, drop_na = FALSE) %>% + prep(toy_epi_df) %>% + bake(new_data = NULL) + ) + # Check both column don't have enough data + expect_snapshot( + error = TRUE, + epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = 2 * n + 1, drop_na = FALSE) %>% + prep(toy_epi_df) + ) + # Check drop_na works + expect_snapshot( + error = TRUE, + epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = 2 * n - 1, drop_na = TRUE) %>% + prep(toy_epi_df) + ) +}) + +test_that("check_enough_data works on unpooled data", { + # Check both columns have enough data + expect_no_error( + epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = n, epi_keys = "geo_value", drop_na = FALSE) %>% + prep(toy_epi_df) %>% + bake(new_data = NULL) + ) + # Check one column don't have enough data + expect_snapshot( + error = TRUE, + epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>% + prep(toy_epi_df) + ) + # Check drop_na works + expect_snapshot( + error = TRUE, + epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>% + prep(toy_epi_df) + ) +}) + +test_that("check_enough_data outputs the correct recipe values", { + expect_no_error( + p <- epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = 2 * n - 2) %>% + prep(toy_epi_df) %>% + bake(new_data = NULL) + ) + + expect_equal(nrow(p), 2 * n) + expect_equal(ncol(p), 4L) + expect_s3_class(p, "epi_df") + expect_named(p, c("geo_value", "time_value", "x", "y")) # order in epiprocess::new_epi_df + expect_equal( + p$time_value, + rep(seq(as.Date("2020-01-01"), by = 1, length.out = n), times = 2) + ) + expect_equal(p$geo_value, rep(c("ca", "hi"), each = n)) +}) + +test_that("check_enough_data only checks train data when skip = FALSE", { + # Check that the train data has enough data, the test data does not, but + # the check passes anyway (because it should be applied to training data) + toy_test_data <- toy_epi_df %>% + group_by(geo_value) %>% + slice(3:10) %>% + epiprocess::as_epi_df() + expect_no_error( + epi_recipe(toy_epi_df) %>% + check_enough_data(x, y, min_observations = n - 2, epi_keys = "geo_value") %>% + prep(toy_epi_df) %>% + bake(new_data = toy_test_data) + ) + # Making sure `skip = TRUE` is working correctly in `predict` + expect_no_error( + epi_recipe(toy_epi_df) %>% + add_role(y, new_role = "outcome") %>% + check_enough_data(x, min_observations = n - 2, epi_keys = "geo_value") %>% + epi_workflow(linear_reg()) %>% + fit(toy_epi_df) %>% + predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08")) + ) + # making sure it works for skip = FALSE, where there's enough data to train + # but not enough to predict + expect_no_error( + forecaster <- epi_recipe(toy_epi_df) %>% + add_role(y, new_role = "outcome") %>% + check_enough_data(x, min_observations = 1, epi_keys = "geo_value", skip = FALSE) %>% + epi_workflow(linear_reg()) %>% + fit(toy_epi_df) + ) + expect_snapshot( + error = TRUE, + forecaster %>% + predict(new_data = toy_test_data %>% filter(time_value > "2020-01-08")) + ) +}) + +test_that("check_enough_data works with all_predictors() downstream of constructed terms", { + # With a lag of 2, we will get 2 * n - 5 non-NA rows (NA's in x but not in the + # lags don't count) + expect_no_error( + epi_recipe(toy_epi_df) %>% + step_epi_lag(x, lag = c(1, 2)) %>% + check_enough_data(all_predictors(), y, min_observations = 2 * n - 5) %>% + prep(toy_epi_df) %>% + bake(new_data = NULL) + ) + expect_snapshot( + error = TRUE, + epi_recipe(toy_epi_df) %>% + step_epi_lag(x, lag = c(1, 2)) %>% + check_enough_data(all_predictors(), y, min_observations = 2 * n - 4) %>% + prep(toy_epi_df) + ) +}) diff --git a/tests/testthat/test-check_enough_train_data.R b/tests/testthat/test-check_enough_train_data.R deleted file mode 100644 index 9b2ef5f34..000000000 --- a/tests/testthat/test-check_enough_train_data.R +++ /dev/null @@ -1,127 +0,0 @@ -# Setup toy data -n <- 10 -toy_epi_df <- tibble::tibble( - time_value = rep( - seq( - as.Date("2020-01-01"), - by = 1, - length.out = n - ), - times = 2 - ), - geo_value = rep(c("ca", "hi"), each = n), - x = c(1:n, c(1:(n - 2), NA, NA)), - y = 1:(2 * n) -) %>% epiprocess::as_epi_df() - -test_that("check_enough_train_data works on pooled data", { - # Check both columns have enough data - expect_no_error( - epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = 2 * n, drop_na = FALSE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) - # Check both column don't have enough data - expect_snapshot( - error = TRUE, - epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) - # Check drop_na works - expect_snapshot( - error = TRUE, - epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) -}) - -test_that("check_enough_train_data works on unpooled data", { - # Check both columns have enough data - expect_no_error( - epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = n, epi_keys = "geo_value", drop_na = FALSE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) - # Check one column don't have enough data - expect_snapshot( - error = TRUE, - epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) - # Check drop_na works - expect_snapshot( - error = TRUE, - epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) -}) - -test_that("check_enough_train_data outputs the correct recipe values", { - expect_no_error( - p <- epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = 2 * n - 2) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) - - expect_equal(nrow(p), 2 * n) - expect_equal(ncol(p), 4L) - expect_s3_class(p, "epi_df") - expect_named(p, c("geo_value", "time_value", "x", "y")) # order in epiprocess::new_epi_df - expect_equal( - p$time_value, - rep(seq(as.Date("2020-01-01"), by = 1, length.out = n), times = 2) - ) - expect_equal(p$geo_value, rep(c("ca", "hi"), each = n)) -}) - -test_that("check_enough_train_data only checks train data", { - # Check that the train data has enough data, the test data does not, but - # the check passes anyway (because it should be applied to training data) - toy_test_data <- toy_epi_df %>% - group_by(geo_value) %>% - slice(3:10) %>% - epiprocess::as_epi_df() - expect_no_error( - epi_recipe(toy_epi_df) %>% - check_enough_train_data(x, y, n = n - 2, epi_keys = "geo_value") %>% - prep(toy_epi_df) %>% - bake(new_data = toy_test_data) - ) - # Same thing, but skip = FALSE - expect_no_error( - epi_recipe(toy_epi_df) %>% - check_enough_train_data(y, n = n - 2, epi_keys = "geo_value", skip = FALSE) %>% - prep(toy_epi_df) %>% - bake(new_data = toy_test_data) - ) -}) - -test_that("check_enough_train_data works with all_predictors() downstream of constructed terms", { - # With a lag of 2, we will get 2 * n - 6 non-NA rows - expect_no_error( - epi_recipe(toy_epi_df) %>% - step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_train_data(all_predictors(), y, n = 2 * n - 6) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) - expect_snapshot( - error = TRUE, - epi_recipe(toy_epi_df) %>% - step_epi_lag(x, lag = c(1, 2)) %>% - check_enough_train_data(all_predictors(), y, n = 2 * n - 5) %>% - prep(toy_epi_df) %>% - bake(new_data = NULL) - ) -}) diff --git a/tests/testthat/test-layer_residual_quantiles.R b/tests/testthat/test-layer_residual_quantiles.R index 3d5883c72..2421b8a1c 100644 --- a/tests/testthat/test-layer_residual_quantiles.R +++ b/tests/testthat/test-layer_residual_quantiles.R @@ -103,7 +103,7 @@ test_that("Canned forecasters work with / without", { ) expect_silent( - arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate")) + arx_forecaster(jhu, "death_rate", c("case_rate", "death_rate"), args_list = arx_args_list(check_enough_data_n = 1)) ) expect_silent( flatline_forecaster(