Skip to content

extend to quantile_dist, exclude multi-output #458

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ S3method(update,layer)
S3method(vec_arith,quantile_pred)
S3method(vec_arith.numeric,quantile_pred)
S3method(vec_arith.quantile_pred,numeric)
S3method(vec_arith.quantile_pred,quantile_pred)
S3method(vec_math,quantile_pred)
S3method(vec_proxy_equal,quantile_pred)
S3method(weighted_interval_score,quantile_pred)
Expand Down Expand Up @@ -233,6 +234,7 @@ import(epidatasets)
import(epiprocess)
import(parsnip)
import(recipes)
import(vctrs)
importFrom(checkmate,assert_class)
importFrom(checkmate,assert_numeric)
importFrom(checkmate,test_character)
Expand Down
236 changes: 87 additions & 149 deletions R/layer_yeo_johnson.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
#' Unormalizing transformation
#'
#' Will undo a step_epi_YeoJohnson transformation.
#' Will undo a step_epi_YeoJohnson transformation. For practical reasons, if you
#' are using this step on a column that will eventually become the outcome
#' variable, you should make sure that the original name of that column is a
#' subset of the outcome variable name. `ahead_7_cases` when `cases` is
#' transformed will work well, while `ahead_7` will not.
#'
#' @inheritParams layer_population_scaling
#' @param yj_params Internal. A data frame of parameters to be used for
#' inverting the transformation.
#' @param by A (possibly named) character vector of variables to join by.
#' @param yj_params A data frame of parameters to be used for inverting the
#' transformation. Typically set automatically. If you have done multiple
#' transformations such that the outcome variable name no longer contains the
#' column that this step transforms, then you should manually specify this to
#' be the parameters fit in the corresponding `step_epi_YeoJohnson`. For an
#' example where you wouldn't need to set this, if your output is
#' `ahead_7_cases` and `step_epi_YeoJohnson` transformed cases (possibly with
#' other columns), then you wouldn't need to set this. However if you have
#' renamed your output column to `diff_7`, then you will need to extract the `yj_params` from the step.
#'
#' @return an updated `frosting` postprocessor
#' @export
Expand Down Expand Up @@ -37,65 +47,36 @@
#' # Compare to the original data.
#' jhu %>% filter(time_value == "2021-12-31")
#' forecast(wf)
layer_epi_YeoJohnson <- function(frosting, ..., yj_params = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) {
layer_epi_YeoJohnson <- function(frosting, ..., yj_params = NULL, id = rand_id("epi_YeoJohnson")) {
checkmate::assert_tibble(yj_params, min.rows = 1, null.ok = TRUE)

add_layer(
frosting,
layer_epi_YeoJohnson_new(
yj_params = yj_params,
by = by,
terms = dplyr::enquos(...),
id = id
)
)
}

layer_epi_YeoJohnson_new <- function(yj_params, by, terms, id) {
layer("epi_YeoJohnson", yj_params = yj_params, by = by, terms = terms, id = id)
layer_epi_YeoJohnson_new <- function(yj_params, terms, id) {
layer("epi_YeoJohnson", yj_params = yj_params, terms = terms, id = id)
}

#' @export
#' @importFrom workflows extract_preprocessor
slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) {
rlang::check_dots_empty()

# TODO: We will error if we don't have a workflow. Write a check later.

# Get the yj_params from the layer or from the workflow.
yj_params <- object$yj_params %||% get_yj_params_in_layer(workflow)

# If the by is not specified, try to infer it from the yj_params.
if (is.null(object$by)) {
# Assume `layer_predict` has calculated the prediction keys and other
# layers don't change the prediction key colnames:
prediction_key_colnames <- names(components$keys)
lhs_potential_keys <- prediction_key_colnames
rhs_potential_keys <- colnames(select(yj_params, -starts_with(".yj_param_")))
object$by <- intersect(lhs_potential_keys, rhs_potential_keys)
suggested_min_keys <- setdiff(lhs_potential_keys, "time_value")
if (!all(suggested_min_keys %in% object$by)) {
cli_warn(
c(
"{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions,
but {?wasn't/weren't} found in the population `df`.",
"i" = "Defaulting to join by {object$by}",
">" = "Double-check whether column names on the population `df` match those expected in your predictions",
">" = "Consider using population data with breakdowns by {suggested_min_keys}",
">" = "Manually specify `by =` to silence"
),
class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
)
}
}
# get the yj_params from the layer or from the workflow.
yj_params <-
object$yj_params %||%
get_params_in_layer(workflow, "epi_YeoJohnson", "yj_params")

# Establish the join columns.
object$by <- object$by %||%
intersect(
epi_keys_only(components$predictions),
colnames(select(yj_params, -starts_with(".yj_param_")))
)
joinby <- list(x = names(object$by) %||% object$by, y = object$by)
join_by_columns <- key_colnames(new_data, exclude = "time_value") %>% sort()
joinby <- list(x = join_by_columns, y = join_by_columns)
hardhat::validate_column_names(components$predictions, joinby$x)
hardhat::validate_column_names(yj_params, joinby$y)

Expand All @@ -115,55 +96,15 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data,
# The `object$terms` is where the user specifies the columns they want to
# untransform. We need to match the outcomes with their yj_param columns in our
# parameter table and then apply the inverse transformation.
if (identical(col_names, ".pred")) {
# In this case, we don't get a hint for the outcome column name, so we need
# to infer it from the mold.
if (length(components$mold$outcomes) > 1) {
cli_abort("Only one outcome is allowed when specifying `.pred`.", call = rlang::caller_env())
}
# `outcomes` is a vector of objects like ahead_1_cases, ahead_7_cases, etc.
# We want to extract the cases part.
outcome_cols <- names(components$mold$outcomes) %>%
stringr::str_match("ahead_\\d+_(.*)") %>%
magrittr::extract(, 2)

if (length(col_names) == 0) {
# not specified by the user, so just modify everything starting with `.pred`
components$predictions <- components$predictions %>%
mutate(.pred := yj_inverse(.pred, !!sym(paste0(".yj_param_", outcome_cols))))
} else if (identical(col_names, character(0))) {
# Wish I could suggest `all_outcomes()` here, but currently it's the same as
# not specifying any terms. I don't want to spend time with dealing with
# this case until someone asks for it.
cli::cli_abort(
"Not specifying columns to layer Yeo-Johnson is not implemented.
If you had a single outcome, you can use `.pred` as a column name.
If you had multiple outcomes, you'll need to specify them like
`.pred_ahead_1_<outcome_col>`, `.pred_ahead_7_<outcome_col>`, etc.
",
call = rlang::caller_env()
)
mutate(across(starts_with(".pred"), \(.pred) yj_inverse(.pred, .lambda))) %>%
select(-.lambda)
} else {
# In this case, we assume that the user has specified the columns they want
# transformed here. We then need to determine the yj_param columns for each of
# these columns. That is, we need to convert a vector of column names like
# c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to
# c(".yj_param_ahead_1_case_rate", ".yj_param_ahead_7_case_rate").
original_outcome_cols <- stringr::str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2]
outcomes_wout_ahead <- stringr::str_match(names(components$mold$outcomes), "ahead_\\d+_(.*)")[, 2]
if (any(original_outcome_cols %nin% outcomes_wout_ahead)) {
cli_abort(
"All columns specified in `...` must be outcome columns.
They must be of the form `.pred_ahead_1_<outcome_col>`, `.pred_ahead_7_<outcome_col>`, etc.
",
call = rlang::caller_env()
)
}

for (i in seq_along(col_names)) {
col <- col_names[i]
yj_param_col <- paste0(".yj_param_", original_outcome_cols[i])
components$predictions <- components$predictions %>%
mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(yj_param_col)))
}
components$predictions <- components$predictions %>%
mutate(across(all_of(col_names), \(.pred) yj_inverse(.pred, .lambda))) %>%
select(-.lambda)
}

# Remove the yj_param columns.
Expand All @@ -182,75 +123,72 @@ print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30),
# Inverse Yeo-Johnson transformation
#
# Inverse of `yj_transform` in step_yeo_johnson.R.
yj_inverse <- function(x, lambda, eps = 0.001) {
yj_inverse <- function(x_in, lambda, eps = 0.001) {
if (any(is.na(lambda))) {
return(x)
}
if (length(x) > 1 && length(lambda) == 1) {
lambda <- rep(lambda, length(x))
} else if (length(x) != length(lambda)) {
cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_fn())
}
if (!inherits(x, "tbl_df") || is.data.frame(x)) {
x <- unlist(x, use.names = FALSE)
} else {
if (!is.vector(x)) {
x <- as.vector(x)
}
}

nn_inv_trans <- function(x, lambda) {
out <- double(length(x))
sm_lambdas <- abs(lambda) < eps
if (length(sm_lambdas) > 0) {
out[sm_lambdas] <- exp(x[sm_lambdas]) - 1
}
x <- x[!sm_lambdas]
lambda <- lambda[!sm_lambdas]
if (length(x) > 0) {
out[!sm_lambdas] <- (lambda * x + 1)^(1 / lambda) - 1
}
out
cli::cli_abort("`lambda` cannot be `NA`.", call = rlang::caller_call())
}

ng_inv_trans <- function(x, lambda) {
out <- double(length(x))
near2_lambdas <- abs(lambda - 2) < eps
if (length(near2_lambdas) > 0) {
out[near2_lambdas] <- -(exp(-x[near2_lambdas]) - 1)
}
x <- x[!near2_lambdas]
lambda <- lambda[!near2_lambdas]
if (length(x) > 0) {
out[!near2_lambdas] <- -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1)
}
out
}

dat_neg <- x < 0
not_neg <- which(!dat_neg)
is_neg <- which(dat_neg)

if (length(not_neg) > 0) {
x[not_neg] <- nn_inv_trans(x[not_neg], lambda[not_neg])
}

if (length(is_neg) > 0) {
x[is_neg] <- ng_inv_trans(x[is_neg], lambda[is_neg])
x_lambda <- yj_input_type_management(x_in, lambda)
x <- x_lambda[[1]]
lambda <- x_lambda[[2]]
inv_x <- ifelse(
x < 0,
# negative values we test if lambda is ~2
ifelse(
abs(lambda - 2) < eps,
-(exp(-x) - 1),
-(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1)
),
# non-negative values we test if lambda is ~0
ifelse(
abs(lambda) < eps,
(exp(x) - 1),
(lambda * x + 1)^(1 / lambda) - 1
)
)
if (x_in %>% inherits("quantile_pred")) {
inv_x <- inv_x %>% quantile_pred(x_in %@% "quantile_levels")
}
x
inv_x
}

get_yj_params_in_layer <- function(workflow) {

#' get the parameters used in the initial step
#'
#' @param workflow the workflow to extract the parameters from
#' @param step_name the name of the step to look for, as recognized by `detect_step`
#' @param param_name the parameter to pull out of the step
#' @keywords internal
get_params_in_layer <- function(workflow, step_name = "epi_YeoJohnson", param_name = "yj_params") {
full_step_name <- glue::glue("step_{step_name}")
this_recipe <- hardhat::extract_recipe(workflow)
if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) {
cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env())
if (!(this_recipe %>% recipes::detect_step(step_name))) {
cli_abort("`layer_{step_name}` requires `step_{step_name}` in the recipe.", call = rlang::caller_call())
}
outcomes <-
workflows::extract_recipe(workflow)$term_info %>%
filter(role == "outcome") %>%
pull(variable)
if (length(outcomes) > 1) {
cli_abort(
"`layer_{step_name}` doesn't support multiple output columns.
This workflow produces {outcomes} as output columns.",
call = rlang::caller_call(),
class = "epipredict__layer_yeo_johnson_multi_outcome_error"
)
}
for (step in this_recipe$steps) {
if (inherits(step, "step_epi_YeoJohnson")) {
yj_params <- step$yj_params
# if it's a `step_name` step that also transforms a column that is a subset
# of the output column name
is_outcome_subset <- map_lgl(step$columns, ~ grepl(.x, outcomes))
if (inherits(step, full_step_name) && any(is_outcome_subset)) {
params <- step[[param_name]] %>%
select(
key_colnames(workflow$original_data, exclude = "time_value"),
contains(step$columns[is_outcome_subset])
) %>%
rename(.lambda = contains(step$columns))
break
}
}
yj_params
params
}
20 changes: 19 additions & 1 deletion R/quantile_pred-methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ vec_proxy_equal.quantile_pred <- function(x, ...) {
dplyr::select(-.row)
}


# quantiles by treating quantile_pred like a distribution -----------------


Expand Down Expand Up @@ -287,13 +286,32 @@ vec_math.quantile_pred <- function(.fn, .x, ...) {
quantile_pred(.fn(.x), quantile_levels)
}

#' Internal vctrs methods
#'
#' @import vctrs
#' @keywords internal
#' @name epipredict-vctrs

#' @importFrom vctrs vec_arith vec_arith.numeric
#' @export
#' @method vec_arith quantile_pred
vec_arith.quantile_pred <- function(op, x, y, ...) {
UseMethod("vec_arith.quantile_pred", y)
}


#' @export
#' @method vec_arith.quantile_pred quantile_pred
vec_arith.quantile_pred.quantile_pred <- function(op, x, y, ...) {
all_quantiles <- unique(c(x %@% "quantile_levels", y %@% "quantile_levels"))
op_fn <- getExportedValue("base", op)
# Interpolate/extrapolate to the same quantiles
x <- quantile.quantile_pred(x, all_quantiles)
y <- quantile.quantile_pred(y, all_quantiles)
out <- op_fn(x, y, ...)
quantile_pred(out, all_quantiles)
}

#' @export
#' @method vec_arith.quantile_pred numeric
vec_arith.quantile_pred.numeric <- function(op, x, y, ...) {
Expand Down
Loading