|
| 1 | +#' Revert population scaled prediction |
| 2 | +#' |
| 3 | +#' `layer_population_scaling` creates a specification of a frosting layer |
| 4 | +#' that will add a population scaled column in the data. For example, |
| 5 | +#' load a dataset that contains county population, and join to an `epi_df` |
| 6 | +#' that currently predicts number of new cases by county to obtain case rates. |
| 7 | +#' Although worth noting that there is nothing special about "population". |
| 8 | +#' The function can be used to scale by any variable. Population is simply the |
| 9 | +#' most natural and common use case. |
| 10 | +#' |
| 11 | +#' @param frosting a `frosting` postprocessor. The layer will be added to the |
| 12 | +#' sequence of operations for this frosting. |
| 13 | +#' @param ... One or more selector functions to scale variables |
| 14 | +#' for this step. See [selections()] for more details. |
| 15 | +#' @param df a data frame that contains the population data used for scaling. |
| 16 | +#' @param by A character vector of variables to left join by. |
| 17 | +#' |
| 18 | +#' If `NULL`, the default, the function will perform a natural join, using all |
| 19 | +#' variables in common across the `epi_df` and the user-provided dataset. |
| 20 | +#' If columns in `epi_df` and `df` have the same name (and aren't |
| 21 | +#' included in by), `.df` is added to the one from the user-provided data |
| 22 | +#' to disambiguate. |
| 23 | +#' |
| 24 | +#' To join by different variables on the `epi_df` and `df`, use a named vector. |
| 25 | +#' For example, by = c("geo_value" = "states") will match `epi_df$geo_value` |
| 26 | +#' to `df$states`. To join by multiple variables, use a vector with length > 1. |
| 27 | +#' For example, by = c("geo_value" = "states", "county" = "county") will match |
| 28 | +#' `epi_df$geo_value` to `df$states` and `epi_df$county` to `df$county`. |
| 29 | +#' |
| 30 | +#' See [dplyr::left_join()] for more details. |
| 31 | +#' @param df_pop_col the name of the column in the data frame `df` that |
| 32 | +#' contains the population data and used for scaling. |
| 33 | +#' @param create_new TRUE to create a new column and keep the original column |
| 34 | +#' in the `epi_df`. |
| 35 | +#' @param suffix a character. The suffix added to the column name if |
| 36 | +#' `create_new = TRUE`. Default to "_original". |
| 37 | +#' @param .flag a logical to determine if the layer is added. Passed on to |
| 38 | +#' `add_layer()`. Default `TRUE`. |
| 39 | +#' @param id a random id string |
| 40 | +#' |
| 41 | +#' @return an updated `frosting` postprocessor |
| 42 | +#' @export |
| 43 | +#' @examples |
| 44 | +#' library(epiprocess) |
| 45 | +#' jhu <- epiprocess::jhu_csse_daily_subset %>% |
| 46 | +#' dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% |
| 47 | +#' dplyr::select(geo_value, time_value, cases) |
| 48 | +#' |
| 49 | +#' pop_data = data.frame(states = c("ca", "ny"), |
| 50 | +#' value = c(20000, 30000)) |
| 51 | +#' |
| 52 | +#' r <- epi_recipe(jhu) %>% |
| 53 | +#' step_population_scaling(df = pop_data, |
| 54 | +#' df_pop_col = "value", |
| 55 | +#' by = c("geo_value" = "states"), |
| 56 | +#' cases, suffix = "_scaled") %>% |
| 57 | +#' step_epi_lag(cases_scaled, lag = c(7, 14)) %>% |
| 58 | +#' step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") %>% |
| 59 | +#' step_epi_naomit() |
| 60 | +#' |
| 61 | +#' f <- frosting() %>% |
| 62 | +#' layer_predict() %>% |
| 63 | +#' layer_threshold(.pred) %>% |
| 64 | +#' layer_naomit(.pred) %>% |
| 65 | +#' layer_population_scaling(.pred, df = pop_data, |
| 66 | +#' by = c("geo_value" = "states"), |
| 67 | +#' df_pop_col = "value") |
| 68 | +#' |
| 69 | +#' wf <- epi_workflow(r, |
| 70 | +#' parsnip::linear_reg()) %>% |
| 71 | +#' parsnip::fit(jhu) %>% |
| 72 | +#' add_frosting(f) |
| 73 | +#' |
| 74 | +#' latest <- get_test_data(recipe = r, |
| 75 | +#' x = epiprocess::jhu_csse_daily_subset %>% |
| 76 | +#' dplyr::filter(time_value > "2021-11-01", |
| 77 | +#' geo_value %in% c("ca", "ny")) %>% |
| 78 | +#' dplyr::select(geo_value, time_value, cases)) |
| 79 | +#' |
| 80 | +#' |
| 81 | +#' predict(wf, latest) |
| 82 | +layer_population_scaling <- function(frosting, |
| 83 | + ..., |
| 84 | + df, |
| 85 | + by = NULL, |
| 86 | + df_pop_col, |
| 87 | + create_new = TRUE, |
| 88 | + suffix = "_original", |
| 89 | + .flag = TRUE, |
| 90 | + id = rand_id("population_scaling")) { |
| 91 | + |
| 92 | + add_layer( |
| 93 | + frosting, |
| 94 | + layer_population_scaling_new( |
| 95 | + df = df, |
| 96 | + by = by, |
| 97 | + df_pop_col = df_pop_col, |
| 98 | + terms = dplyr::enquos(...), |
| 99 | + create_new = create_new, |
| 100 | + suffix = suffix, |
| 101 | + id = id |
| 102 | + ), |
| 103 | + flag = .flag |
| 104 | + ) |
| 105 | +} |
| 106 | + |
| 107 | +layer_population_scaling_new <- |
| 108 | + function(df, by, df_pop_col, terms, create_new, suffix, id) { |
| 109 | + layer("population_scaling", |
| 110 | + df = df, |
| 111 | + by = by, |
| 112 | + df_pop_col = df_pop_col, |
| 113 | + terms = terms, |
| 114 | + create_new = create_new, |
| 115 | + suffix = suffix, |
| 116 | + id = id) |
| 117 | +} |
| 118 | + |
| 119 | +#' @export |
| 120 | +slather.layer_population_scaling <- |
| 121 | + function(object, components, the_fit, the_recipe, ...) { |
| 122 | + stopifnot("Only one population column allowed for scaling" = |
| 123 | + length(object$df_pop_col) == 1) |
| 124 | + |
| 125 | + try_join <- try(dplyr::left_join(components$predictions, object$df, |
| 126 | + by= object$by), |
| 127 | + silent = TRUE) |
| 128 | + if (any(grepl("Join columns must be present in data", unlist(try_join)))){ |
| 129 | + stop("columns in `by` selectors of `layer_population_scaling` must be present in data and match")} |
| 130 | + |
| 131 | + object$df <- object$df %>% |
| 132 | + dplyr::mutate(dplyr::across(where(is.character), tolower)) |
| 133 | + pop_col = rlang::sym(object$df_pop_col) |
| 134 | + exprs <- rlang::expr(c(!!!object$terms)) |
| 135 | + pos <- tidyselect::eval_select(exprs, components$predictions) |
| 136 | + col_names <- names(pos) |
| 137 | + suffix = ifelse(object$create_new, object$suffix, "") |
| 138 | + |
| 139 | + |
| 140 | + components$predictions <- dplyr::left_join(components$predictions, |
| 141 | + object$df, |
| 142 | + by= object$by, |
| 143 | + suffix = c("", ".df")) %>% |
| 144 | + dplyr::mutate(dplyr::across(dplyr::all_of(col_names), |
| 145 | + ~.x * !!pop_col , |
| 146 | + .names = "{.col}{suffix}")) %>% |
| 147 | + dplyr::select(- !!pop_col) |
| 148 | + components |
| 149 | +} |
0 commit comments