Skip to content

Commit 7181718

Browse files
authored
Merge pull request #100 from cmu-delphi/40-steppost_population_scaling
40 steppost population scaling
2 parents 86a1126 + 5014383 commit 7181718

File tree

6 files changed

+888
-0
lines changed

6 files changed

+888
-0
lines changed

NAMESPACE

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ S3method(apply_frosting,epi_workflow)
77
S3method(augment,epi_workflow)
88
S3method(bake,step_epi_ahead)
99
S3method(bake,step_epi_lag)
10+
S3method(bake,step_population_scaling)
1011
S3method(detect_layer,frosting)
1112
S3method(detect_layer,workflow)
1213
S3method(epi_keys,default)
@@ -31,6 +32,7 @@ S3method(predict,epi_workflow)
3132
S3method(prep,epi_recipe)
3233
S3method(prep,step_epi_ahead)
3334
S3method(prep,step_epi_lag)
35+
S3method(prep,step_population_scaling)
3436
S3method(print,epi_workflow)
3537
S3method(print,frosting)
3638
S3method(print,step_epi_ahead)
@@ -41,6 +43,7 @@ S3method(run_mold,default_epi_recipe_blueprint)
4143
S3method(slather,layer_add_forecast_date)
4244
S3method(slather,layer_add_target_date)
4345
S3method(slather,layer_naomit)
46+
S3method(slather,layer_population_scaling)
4447
S3method(slather,layer_predict)
4548
S3method(slather,layer_predictive_distn)
4649
S3method(slather,layer_residual_quantiles)
@@ -86,6 +89,7 @@ export(layer)
8689
export(layer_add_forecast_date)
8790
export(layer_add_target_date)
8891
export(layer_naomit)
92+
export(layer_population_scaling)
8993
export(layer_predict)
9094
export(layer_predictive_distn)
9195
export(layer_residual_quantiles)
@@ -100,6 +104,7 @@ export(smooth_arx_forecaster)
100104
export(step_epi_ahead)
101105
export(step_epi_lag)
102106
export(step_epi_naomit)
107+
export(step_population_scaling)
103108
export(validate_layer)
104109
import(distributional)
105110
import(recipes)

R/layer_population_scaling.R

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
#' Revert population scaled prediction
2+
#'
3+
#' `layer_population_scaling` creates a specification of a frosting layer
4+
#' that will add a population scaled column in the data. For example,
5+
#' load a dataset that contains county population, and join to an `epi_df`
6+
#' that currently predicts number of new cases by county to obtain case rates.
7+
#' Although worth noting that there is nothing special about "population".
8+
#' The function can be used to scale by any variable. Population is simply the
9+
#' most natural and common use case.
10+
#'
11+
#' @param frosting a `frosting` postprocessor. The layer will be added to the
12+
#' sequence of operations for this frosting.
13+
#' @param ... One or more selector functions to scale variables
14+
#' for this step. See [selections()] for more details.
15+
#' @param df a data frame that contains the population data used for scaling.
16+
#' @param by A character vector of variables to left join by.
17+
#'
18+
#' If `NULL`, the default, the function will perform a natural join, using all
19+
#' variables in common across the `epi_df` and the user-provided dataset.
20+
#' If columns in `epi_df` and `df` have the same name (and aren't
21+
#' included in by), `.df` is added to the one from the user-provided data
22+
#' to disambiguate.
23+
#'
24+
#' To join by different variables on the `epi_df` and `df`, use a named vector.
25+
#' For example, by = c("geo_value" = "states") will match `epi_df$geo_value`
26+
#' to `df$states`. To join by multiple variables, use a vector with length > 1.
27+
#' For example, by = c("geo_value" = "states", "county" = "county") will match
28+
#' `epi_df$geo_value` to `df$states` and `epi_df$county` to `df$county`.
29+
#'
30+
#' See [dplyr::left_join()] for more details.
31+
#' @param df_pop_col the name of the column in the data frame `df` that
32+
#' contains the population data and used for scaling.
33+
#' @param create_new TRUE to create a new column and keep the original column
34+
#' in the `epi_df`.
35+
#' @param suffix a character. The suffix added to the column name if
36+
#' `create_new = TRUE`. Default to "_original".
37+
#' @param .flag a logical to determine if the layer is added. Passed on to
38+
#' `add_layer()`. Default `TRUE`.
39+
#' @param id a random id string
40+
#'
41+
#' @return an updated `frosting` postprocessor
42+
#' @export
43+
#' @examples
44+
#' library(epiprocess)
45+
#' jhu <- epiprocess::jhu_csse_daily_subset %>%
46+
#' dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
47+
#' dplyr::select(geo_value, time_value, cases)
48+
#'
49+
#' pop_data = data.frame(states = c("ca", "ny"),
50+
#' value = c(20000, 30000))
51+
#'
52+
#' r <- epi_recipe(jhu) %>%
53+
#' step_population_scaling(df = pop_data,
54+
#' df_pop_col = "value",
55+
#' by = c("geo_value" = "states"),
56+
#' cases, suffix = "_scaled") %>%
57+
#' step_epi_lag(cases_scaled, lag = c(7, 14)) %>%
58+
#' step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") %>%
59+
#' step_epi_naomit()
60+
#'
61+
#' f <- frosting() %>%
62+
#' layer_predict() %>%
63+
#' layer_threshold(.pred) %>%
64+
#' layer_naomit(.pred) %>%
65+
#' layer_population_scaling(.pred, df = pop_data,
66+
#' by = c("geo_value" = "states"),
67+
#' df_pop_col = "value")
68+
#'
69+
#' wf <- epi_workflow(r,
70+
#' parsnip::linear_reg()) %>%
71+
#' parsnip::fit(jhu) %>%
72+
#' add_frosting(f)
73+
#'
74+
#' latest <- get_test_data(recipe = r,
75+
#' x = epiprocess::jhu_csse_daily_subset %>%
76+
#' dplyr::filter(time_value > "2021-11-01",
77+
#' geo_value %in% c("ca", "ny")) %>%
78+
#' dplyr::select(geo_value, time_value, cases))
79+
#'
80+
#'
81+
#' predict(wf, latest)
82+
layer_population_scaling <- function(frosting,
83+
...,
84+
df,
85+
by = NULL,
86+
df_pop_col,
87+
create_new = TRUE,
88+
suffix = "_original",
89+
.flag = TRUE,
90+
id = rand_id("population_scaling")) {
91+
92+
add_layer(
93+
frosting,
94+
layer_population_scaling_new(
95+
df = df,
96+
by = by,
97+
df_pop_col = df_pop_col,
98+
terms = dplyr::enquos(...),
99+
create_new = create_new,
100+
suffix = suffix,
101+
id = id
102+
),
103+
flag = .flag
104+
)
105+
}
106+
107+
layer_population_scaling_new <-
108+
function(df, by, df_pop_col, terms, create_new, suffix, id) {
109+
layer("population_scaling",
110+
df = df,
111+
by = by,
112+
df_pop_col = df_pop_col,
113+
terms = terms,
114+
create_new = create_new,
115+
suffix = suffix,
116+
id = id)
117+
}
118+
119+
#' @export
120+
slather.layer_population_scaling <-
121+
function(object, components, the_fit, the_recipe, ...) {
122+
stopifnot("Only one population column allowed for scaling" =
123+
length(object$df_pop_col) == 1)
124+
125+
try_join <- try(dplyr::left_join(components$predictions, object$df,
126+
by= object$by),
127+
silent = TRUE)
128+
if (any(grepl("Join columns must be present in data", unlist(try_join)))){
129+
stop("columns in `by` selectors of `layer_population_scaling` must be present in data and match")}
130+
131+
object$df <- object$df %>%
132+
dplyr::mutate(dplyr::across(where(is.character), tolower))
133+
pop_col = rlang::sym(object$df_pop_col)
134+
exprs <- rlang::expr(c(!!!object$terms))
135+
pos <- tidyselect::eval_select(exprs, components$predictions)
136+
col_names <- names(pos)
137+
suffix = ifelse(object$create_new, object$suffix, "")
138+
139+
140+
components$predictions <- dplyr::left_join(components$predictions,
141+
object$df,
142+
by= object$by,
143+
suffix = c("", ".df")) %>%
144+
dplyr::mutate(dplyr::across(dplyr::all_of(col_names),
145+
~.x * !!pop_col ,
146+
.names = "{.col}{suffix}")) %>%
147+
dplyr::select(- !!pop_col)
148+
components
149+
}

0 commit comments

Comments
 (0)