cmu-delphi · dshemetov · Apr 10, 2025 · Apr 7, 2025 · Apr 7, 2025 · Apr 7, 2025
@@ -122,6 +122,7 @@ S3method(update,layer)
 S3method(vec_arith,quantile_pred)
 S3method(vec_arith.numeric,quantile_pred)
 S3method(vec_arith.quantile_pred,numeric)
+S3method(vec_arith.quantile_pred,quantile_pred)
 S3method(vec_math,quantile_pred)
 S3method(vec_proxy_equal,quantile_pred)
 S3method(weighted_interval_score,quantile_pred)
@@ -233,6 +234,7 @@ import(epidatasets)
 import(epiprocess)
 import(parsnip)
 import(recipes)
+import(vctrs)
 importFrom(checkmate,assert_class)
 importFrom(checkmate,assert_numeric)
 importFrom(checkmate,test_character)

@@ -1,11 +1,21 @@
 #' Unormalizing transformation
 #'
-#' Will undo a step_epi_YeoJohnson transformation.
+#' Will undo a step_epi_YeoJohnson transformation. For practical reasons, if you
+#' are using this step on a column that will eventually become the outcome
+#' variable, you should make sure that the original name of that column is a
+#' subset of the outcome variable name. `ahead_7_cases` when `cases` is
+#' transformed will work well, while `ahead_7` will not.
 #'
 #' @inheritParams layer_population_scaling
-#' @param yj_params Internal. A data frame of parameters to be used for
-#'   inverting the transformation.
-#' @param by A (possibly named) character vector of variables to join by.
+#' @param yj_params A data frame of parameters to be used for inverting the
+#'   transformation. Typically set automatically. If you have done multiple
+#'   transformations such that the outcome variable name no longer contains the
+#'   column that this step transforms, then you should manually specify this to
+#'   be the parameters fit in the corresponding `step_epi_YeoJohnson`. For an
+#'   example where you wouldn't need to set this, if your output is
+#'   `ahead_7_cases` and `step_epi_YeoJohnson` transformed cases (possibly with
+#'   other columns), then you wouldn't need to set this. However if you have
+#'   renamed your output column to `diff_7`, then you will need to extract the `yj_params` from the step.
 #'
 #' @return an updated `frosting` postprocessor
 #' @export
@@ -37,65 +47,36 @@
 #' # Compare to the original data.
 #' jhu %>% filter(time_value == "2021-12-31")
 #' forecast(wf)
-layer_epi_YeoJohnson <- function(frosting, ..., yj_params = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) {
+layer_epi_YeoJohnson <- function(frosting, ..., yj_params = NULL, id = rand_id("epi_YeoJohnson")) {
   checkmate::assert_tibble(yj_params, min.rows = 1, null.ok = TRUE)
 
   add_layer(
     frosting,
     layer_epi_YeoJohnson_new(
       yj_params = yj_params,
-      by = by,
       terms = dplyr::enquos(...),
       id = id
     )
   )
 }
 
-layer_epi_YeoJohnson_new <- function(yj_params, by, terms, id) {
-  layer("epi_YeoJohnson", yj_params = yj_params, by = by, terms = terms, id = id)
+layer_epi_YeoJohnson_new <- function(yj_params, terms, id) {
+  layer("epi_YeoJohnson", yj_params = yj_params, terms = terms, id = id)
 }
 
 #' @export
 #' @importFrom workflows extract_preprocessor
 slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) {
   rlang::check_dots_empty()
 
-  # TODO: We will error if we don't have a workflow. Write a check later.
-
-  # Get the yj_params from the layer or from the workflow.
-  yj_params <- object$yj_params %||% get_yj_params_in_layer(workflow)
-
-  # If the by is not specified, try to infer it from the yj_params.
-  if (is.null(object$by)) {
-    # Assume `layer_predict` has calculated the prediction keys and other
-    # layers don't change the prediction key colnames:
-    prediction_key_colnames <- names(components$keys)
-    lhs_potential_keys <- prediction_key_colnames
-    rhs_potential_keys <- colnames(select(yj_params, -starts_with(".yj_param_")))
-    object$by <- intersect(lhs_potential_keys, rhs_potential_keys)
-    suggested_min_keys <- setdiff(lhs_potential_keys, "time_value")
-    if (!all(suggested_min_keys %in% object$by)) {
-      cli_warn(
-        c(
-          "{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions,
-          but {?wasn't/weren't} found in the population `df`.",
-          "i" = "Defaulting to join by {object$by}",
-          ">" = "Double-check whether column names on the population `df` match those expected in your predictions",
-          ">" = "Consider using population data with breakdowns by {suggested_min_keys}",
-          ">" = "Manually specify `by =` to silence"
-        ),
-        class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
-      )
-    }
-  }
+  # get the yj_params from the layer or from the workflow.
+  yj_params <-
+    object$yj_params %||%
+    get_params_in_layer(workflow, "epi_YeoJohnson", "yj_params")
 
   # Establish the join columns.
-  object$by <- object$by %||%
-    intersect(
-      epi_keys_only(components$predictions),
-      colnames(select(yj_params, -starts_with(".yj_param_")))
-    )
-  joinby <- list(x = names(object$by) %||% object$by, y = object$by)
+  join_by_columns <- key_colnames(new_data, exclude = "time_value") %>% sort()
+  joinby <- list(x = join_by_columns, y = join_by_columns)
   hardhat::validate_column_names(components$predictions, joinby$x)
   hardhat::validate_column_names(yj_params, joinby$y)
 
@@ -115,55 +96,15 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data,
   # The `object$terms` is where the user specifies the columns they want to
   # untransform. We need to match the outcomes with their yj_param columns in our
   # parameter table and then apply the inverse transformation.
-  if (identical(col_names, ".pred")) {
-    # In this case, we don't get a hint for the outcome column name, so we need
-    # to infer it from the mold.
-    if (length(components$mold$outcomes) > 1) {
-      cli_abort("Only one outcome is allowed when specifying `.pred`.", call = rlang::caller_env())
-    }
-    # `outcomes` is a vector of objects like ahead_1_cases, ahead_7_cases, etc.
-    # We want to extract the cases part.
-    outcome_cols <- names(components$mold$outcomes) %>%
-      stringr::str_match("ahead_\\d+_(.*)") %>%
-      magrittr::extract(, 2)
-
+  if (length(col_names) == 0) {
+    # not specified by the user, so just modify everything starting with `.pred`
     components$predictions <- components$predictions %>%
-      mutate(.pred := yj_inverse(.pred, !!sym(paste0(".yj_param_", outcome_cols))))
-  } else if (identical(col_names, character(0))) {
-    # Wish I could suggest `all_outcomes()` here, but currently it's the same as
-    # not specifying any terms. I don't want to spend time with dealing with
-    # this case until someone asks for it.
-    cli::cli_abort(
-      "Not specifying columns to layer Yeo-Johnson is not implemented.
-    If you had a single outcome, you can use `.pred` as a column name.
-    If you had multiple outcomes, you'll need to specify them like
-    `.pred_ahead_1_<outcome_col>`, `.pred_ahead_7_<outcome_col>`, etc.
-    ",
-      call = rlang::caller_env()
-    )
+      mutate(across(starts_with(".pred"), \(.pred) yj_inverse(.pred, .lambda))) %>%
+      select(-.lambda)
   } else {
-    # In this case, we assume that the user has specified the columns they want
-    # transformed here. We then need to determine the yj_param columns for each of
-    # these columns. That is, we need to convert a vector of column names like
-    # c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to
-    # c(".yj_param_ahead_1_case_rate", ".yj_param_ahead_7_case_rate").
-    original_outcome_cols <- stringr::str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2]
-    outcomes_wout_ahead <- stringr::str_match(names(components$mold$outcomes), "ahead_\\d+_(.*)")[, 2]
-    if (any(original_outcome_cols %nin% outcomes_wout_ahead)) {
-      cli_abort(
-        "All columns specified in `...` must be outcome columns.
-      They must be of the form `.pred_ahead_1_<outcome_col>`, `.pred_ahead_7_<outcome_col>`, etc.
-      ",
-        call = rlang::caller_env()
-      )
-    }
-
-    for (i in seq_along(col_names)) {
-      col <- col_names[i]
-      yj_param_col <- paste0(".yj_param_", original_outcome_cols[i])
-      components$predictions <- components$predictions %>%
-        mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(yj_param_col)))
-    }
+    components$predictions <- components$predictions %>%
+      mutate(across(all_of(col_names), \(.pred) yj_inverse(.pred, .lambda))) %>%
+      select(-.lambda)
   }
 
   # Remove the yj_param columns.
@@ -182,75 +123,72 @@ print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30),
 # Inverse Yeo-Johnson transformation
 #
 # Inverse of `yj_transform` in step_yeo_johnson.R.
-yj_inverse <- function(x, lambda, eps = 0.001) {
+yj_inverse <- function(x_in, lambda, eps = 0.001) {
   if (any(is.na(lambda))) {
-    return(x)
-  }
-  if (length(x) > 1 && length(lambda) == 1) {
-    lambda <- rep(lambda, length(x))
-  } else if (length(x) != length(lambda)) {
-    cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_fn())
-  }
-  if (!inherits(x, "tbl_df") || is.data.frame(x)) {
-    x <- unlist(x, use.names = FALSE)
-  } else {
-    if (!is.vector(x)) {
-      x <- as.vector(x)
-    }
-  }
-
-  nn_inv_trans <- function(x, lambda) {
-    out <- double(length(x))
-    sm_lambdas <- abs(lambda) < eps
-    if (length(sm_lambdas) > 0) {
-      out[sm_lambdas] <- exp(x[sm_lambdas]) - 1
-    }
-    x <- x[!sm_lambdas]
-    lambda <- lambda[!sm_lambdas]
-    if (length(x) > 0) {
-      out[!sm_lambdas] <- (lambda * x + 1)^(1 / lambda) - 1
-    }
-    out
+    cli::cli_abort("`lambda` cannot be `NA`.", call = rlang::caller_call())
   }
-
-  ng_inv_trans <- function(x, lambda) {
-    out <- double(length(x))
-    near2_lambdas <- abs(lambda - 2) < eps
-    if (length(near2_lambdas) > 0) {
-      out[near2_lambdas] <- -(exp(-x[near2_lambdas]) - 1)
-    }
-    x <- x[!near2_lambdas]
-    lambda <- lambda[!near2_lambdas]
-    if (length(x) > 0) {
-      out[!near2_lambdas] <- -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1)
-    }
-    out
-  }
-
-  dat_neg <- x < 0
-  not_neg <- which(!dat_neg)
-  is_neg <- which(dat_neg)
-
-  if (length(not_neg) > 0) {
-    x[not_neg] <- nn_inv_trans(x[not_neg], lambda[not_neg])
-  }
-
-  if (length(is_neg) > 0) {
-    x[is_neg] <- ng_inv_trans(x[is_neg], lambda[is_neg])
+  x_lambda <- yj_input_type_management(x_in, lambda)
+  x <- x_lambda[[1]]
+  lambda <- x_lambda[[2]]
+  inv_x <- ifelse(
+    x < 0,
+    # negative values we test if lambda is ~2
+    ifelse(
+      abs(lambda - 2) < eps,
+      -(exp(-x) - 1),
+      -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1)
+    ),
+    # non-negative values we test if lambda is ~0
+    ifelse(
+      abs(lambda) < eps,
+      (exp(x) - 1),
+      (lambda * x + 1)^(1 / lambda) - 1
+    )
+  )
+  if (x_in %>% inherits("quantile_pred")) {
+    inv_x <- inv_x %>% quantile_pred(x_in %@% "quantile_levels")
   }
-  x
+  inv_x
 }
 
-get_yj_params_in_layer <- function(workflow) {
+
+#' get the parameters used in the initial step
+#'
+#' @param workflow the workflow to extract the parameters from
+#' @param step_name the name of the step to look for, as recognized by `detect_step`
+#' @param param_name the parameter to pull out of the step
+#' @keywords internal
+get_params_in_layer <- function(workflow, step_name = "epi_YeoJohnson", param_name = "yj_params") {
+  full_step_name <- glue::glue("step_{step_name}")
   this_recipe <- hardhat::extract_recipe(workflow)
-  if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) {
-    cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env())
+  if (!(this_recipe %>% recipes::detect_step(step_name))) {
+    cli_abort("`layer_{step_name}` requires `step_{step_name}` in the recipe.", call = rlang::caller_call())
+  }
+  outcomes <-
+    workflows::extract_recipe(workflow)$term_info %>%
+    filter(role == "outcome") %>%
+    pull(variable)
+  if (length(outcomes) > 1) {
+    cli_abort(
+      "`layer_{step_name}` doesn't support multiple output columns.
+      This workflow produces {outcomes} as output columns.",
+      call = rlang::caller_call(),
+      class = "epipredict__layer_yeo_johnson_multi_outcome_error"
+    )
   }
   for (step in this_recipe$steps) {
-    if (inherits(step, "step_epi_YeoJohnson")) {
-      yj_params <- step$yj_params
+    # if it's a `step_name` step that also transforms a column that is a subset
+    # of the output column name
+    is_outcome_subset <- map_lgl(step$columns, ~ grepl(.x, outcomes))
+    if (inherits(step, full_step_name) && any(is_outcome_subset)) {
+      params <- step[[param_name]] %>%
+        select(
+          key_colnames(workflow$original_data, exclude = "time_value"),
+          contains(step$columns[is_outcome_subset])
+        ) %>%
+        rename(.lambda = contains(step$columns))
       break
     }
   }
-  yj_params
+  params
 }
@@ -111,7 +111,6 @@ vec_proxy_equal.quantile_pred <- function(x, ...) {
     dplyr::select(-.row)
 }
 
-
 # quantiles by treating quantile_pred like a distribution -----------------
 
 
@@ -287,13 +286,32 @@ vec_math.quantile_pred <- function(.fn, .x, ...) {
   quantile_pred(.fn(.x), quantile_levels)
 }
 
+#' Internal vctrs methods
+#'
+#' @import vctrs
+#' @keywords internal
+#' @name epipredict-vctrs
+
 #' @importFrom vctrs vec_arith vec_arith.numeric
 #' @export
 #' @method vec_arith quantile_pred
 vec_arith.quantile_pred <- function(op, x, y, ...) {
   UseMethod("vec_arith.quantile_pred", y)
 }
 
+
+#' @export
+#' @method vec_arith.quantile_pred quantile_pred
+vec_arith.quantile_pred.quantile_pred <- function(op, x, y, ...) {
+  all_quantiles <- unique(c(x %@% "quantile_levels", y %@% "quantile_levels"))
+  op_fn <- getExportedValue("base", op)
+  # Interpolate/extrapolate to the same quantiles
+  x <- quantile.quantile_pred(x, all_quantiles)
+  y <- quantile.quantile_pred(y, all_quantiles)
+  out <- op_fn(x, y, ...)
+  quantile_pred(out, all_quantiles)
+}
+
 #' @export
 #' @method vec_arith.quantile_pred numeric
 vec_arith.quantile_pred.numeric <- function(op, x, y, ...) {