From 8546548612f8c09c61c11abcf694b0c012542695 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 2 Apr 2025 10:38:39 -0700 Subject: [PATCH 01/15] feat: add filter.epi_archive --- NAMESPACE | 1 + NEWS.md | 6 ++ R/methods-epi_archive.R | 116 ++++++++++++++++++++++++++++++++++++++ man/filter.epi_archive.Rd | 61 ++++++++++++++++++++ 4 files changed, 184 insertions(+) create mode 100644 man/filter.epi_archive.Rd diff --git a/NAMESPACE b/NAMESPACE index 2f97e5b3..3ad5e252 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -31,6 +31,7 @@ S3method(epix_slide,epi_archive) S3method(epix_slide,grouped_epi_archive) S3method(epix_truncate_versions_after,epi_archive) S3method(epix_truncate_versions_after,grouped_epi_archive) +S3method(filter,epi_archive) S3method(group_by,epi_archive) S3method(group_by,epi_df) S3method(group_by,grouped_epi_archive) diff --git a/NEWS.md b/NEWS.md index 3ac814aa..50af8535 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,12 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.x.y will indicate PR's. +# epiprocess 0.12 + +## New features + +- Added `dplyr::filter` implementation for `epi_archive`s. + # epiprocess 0.11 ## Breaking changes diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index a8421efc..5b93ba2b 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -984,3 +984,119 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { attr(data, "epiprocess::col_modify_recorder_df::cols") <- cols data } + + + +#' [`dplyr::filter`] for `epi_archive`s +#' +#' @param .data an `epi_archive` +#' @param ... as in [`dplyr::filter`]; using the `version` column is not allowed +#' unless you use `.format_aware = TRUE`; see details. +#' @param .by as in [`dplyr::filter`] +#' @param .format_aware optional, `TRUE` or `FALSE`; default `FALSE`. See +#' details. +#' +#' @details +#' +#' By default, using the `version` column is disabled as it's easy to +#' get unexpected results. See if either [`epix_as_of`] or [`epix_slide`] +#' works as an alternative. If they don't cover your use case, then you can +#' set `.format_aware = TRUE` to enable usage of the `version` column, but be +#' careful to: +#' * Factor in that `.data$DT` may be using a "compact" format based on diffing +#' consecutive versions; see details of [`as_epi_archive`] +#' * Set `clobberable_versions_start` and `versions_end` of the result +#' appropriately after the `filter` call. They will be initialized with the +#' same values as in `.data`. +#' +#' `dplyr::filter` also has an optional argument `.preserve`, which should not +#' have an impact on (ungrouped) `epi_archive`s, and `grouped_epi_archive`s do +#' not currently support `dplyr::filter`. +#' +#' @examples +#' +#' # Filter to one location and a particular time range: +#' archive_cases_dv_subset %>% +#' filter(geo_value == "fl", time_value >= as.Date("2020-10-01")) +#' +#' # Convert to weekly by taking the Saturday data for each week, so that +#' # `case_rate_7d_av` represents a Sun--Sat average: +#' archive_cases_dv_subset %>% +#' filter(as.POSIXlt(time_value)$wday == 6L) +#' +#' # Filtering involving versions requires extra care. See epix_as_of and +#' # epix_slide instead for some common operations. One semi-common operation +#' # that ends up being fairly simple is treating observations as finalized +#' # after some amount of time, and ignoring any revisions that were made after +#' # that point: +#' archive_cases_dv_subset %>% +#' filter(version <= time_value + as.difftime(60, units = "days"), +#' .format_aware = TRUE +#' ) +#' +#' @export +filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) { + in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal") + if (.format_aware) { + out_tbl <- in_tbl %>% + filter(..., .by = .by) + } else { + out_tbl <- in_tbl %>% + filter( + # Add our own fake filter arg to the user's ..., to update the data mask + # to prevent `version` column usage. + { + # We should be evaluating inside the data mask. To disable both + # `version` and `.data$version`, we need to go to the data mask's + # ------ + e <- environment() + while (!identical(e, globalenv()) && !identical(e, emptyenv())) { + if ("version" %in% names(e)) { + # "version" is expected to be an active binding, and directly + # assigning over it has issues; explicitly `rm` first. + rm(list = "version", envir = e) + delayedAssign("version", cli::cli_abort(c( + "Using `version` in `filter` may produce unexpected results.", + ">" = "See if `epix_as_of` or `epix_slide` would work instead.", + ">" = "If not, see `?filter.epi_archive` details for how to proceed." + )), assign.env = e) + break + } + e <- parent.env(e) + } + TRUE + }, + ..., + .by = .by + ) + } + out_geo_type <- + if (.data$geo_type == "custom") { + # We might be going from a multi-resolution to single-resolution archive; + # e.g. national+state -> state; try to re-infer: + guess_geo_type(out_tbl$geo_value) + } else { + # We risk less-understandable inference failures such as inferring "hhs" + # from selecting hrr 10 data; just use the old geo_type: + .data$geo_type + } + # We might be going from daily to weekly; re-infer: + out_time_type <- guess_time_type(out_tbl$time_value) + # Even if they narrow down to just a single value of an other_keys column, + # it's probably still better (& simpler) to treat it as an other_keys column + # since it still exists in the result: + out_other_keys <- .data$other_keys + # `filter` makes no guarantees about not aliasing columns in its result when + # the filter condition is all TRUE, so don't setDT. + out_dtbl <- as.data.table(out_tbl, key = out_other_keys) + result <- new_epi_archive( + out_dtbl, + out_geo_type, out_time_type, out_other_keys, + # Assume version-related metadata unchanged; part of why we want to push + # back on filter expressions like `.data$version <= .env$as_of`: + .data$clobberable_versions_start, .data$versions_end + ) + # Filtering down rows while keeping all (ukey) columns should preserve ukey + # uniqueness. + result +} diff --git a/man/filter.epi_archive.Rd b/man/filter.epi_archive.Rd new file mode 100644 index 00000000..b0012bd2 --- /dev/null +++ b/man/filter.epi_archive.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-epi_archive.R +\name{filter.epi_archive} +\alias{filter.epi_archive} +\title{\code{\link[dplyr:filter]{dplyr::filter}} for \code{epi_archive}s} +\usage{ +\method{filter}{epi_archive}(.data, ..., .by = NULL, .format_aware = FALSE) +} +\arguments{ +\item{.data}{an \code{epi_archive}} + +\item{...}{as in \code{\link[dplyr:filter]{dplyr::filter}}; using the \code{version} column is not allowed +unless you use \code{.format_aware = TRUE}; see details.} + +\item{.by}{as in \code{\link[dplyr:filter]{dplyr::filter}}} + +\item{.format_aware}{optional, \code{TRUE} or \code{FALSE}; default \code{FALSE}. See +details.} +} +\description{ +\code{\link[dplyr:filter]{dplyr::filter}} for \code{epi_archive}s +} +\details{ +By default, using the \code{version} column is disabled as it's easy to +get unexpected results. See if either \code{\link{epix_as_of}} or \code{\link{epix_slide}} +works as an alternative. If they don't cover your use case, then you can +set \code{.format_aware = TRUE} to enable usage of the \code{version} column, but be +careful to: +\itemize{ +\item Factor in that \code{.data$DT} may be using a "compact" format based on diffing +consecutive versions; see details of \code{\link{as_epi_archive}} +\item Set \code{clobberable_versions_start} and \code{versions_end} of the result +appropriately after the \code{filter} call. They will be initialized with the +same values as in \code{.data}. +} + +\code{dplyr::filter} also has an optional argument \code{.preserve}, which should not +have an impact on (ungrouped) \code{epi_archive}s, and \code{grouped_epi_archive}s do +not currently support \code{dplyr::filter}. +} +\examples{ + +# Filter to one location and a particular time range: +archive_cases_dv_subset \%>\% + filter(geo_value == "fl", time_value >= as.Date("2020-10-01")) + +# Convert to weekly by taking the Saturday data for each week, so that +# `case_rate_7d_av` represents a Sun--Sat average: +archive_cases_dv_subset \%>\% + filter(as.POSIXlt(time_value)$wday == 6L) + +# Filtering involving versions requires extra care. See epix_as_of and +# epix_slide instead for some common operations. One semi-common operation +# that ends up being fairly simple is treating observations as finalized +# after some amount of time, and ignoring any revisions that were made after +# that point: +archive_cases_dv_subset \%>\% + filter(version <= time_value + as.difftime(60, units = "days"), + .format_aware = TRUE) + +} From 9a4a6018ba0ddba0a939c47dd34f682e7c63a736 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 2 Apr 2025 17:35:45 -0700 Subject: [PATCH 02/15] In filter.epi_archive, favor avoiding incorrect geo_type reinferences --- R/methods-epi_archive.R | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 5b93ba2b..5c2534e3 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -1070,16 +1070,11 @@ filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) { .by = .by ) } - out_geo_type <- - if (.data$geo_type == "custom") { - # We might be going from a multi-resolution to single-resolution archive; - # e.g. national+state -> state; try to re-infer: - guess_geo_type(out_tbl$geo_value) - } else { - # We risk less-understandable inference failures such as inferring "hhs" - # from selecting hrr 10 data; just use the old geo_type: - .data$geo_type - } + # We could try to re-infer the geo_type, e.g., when filtering from + # national+state to just state. However, we risk inference failures such as + # "hrr" -> "hhs" from filtering to hrr 10, or "custom" -> USA-related when + # working with non-USA data: + out_geo_type <- .data$geo_type # We might be going from daily to weekly; re-infer: out_time_type <- guess_time_type(out_tbl$time_value) # Even if they narrow down to just a single value of an other_keys column, From 1f2bd85386c6fe00a510efedbc32a27e054c3cc4 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 2 Apr 2025 18:08:24 -0700 Subject: [PATCH 03/15] feat(filter.epi_archive): also guard against using measurement columns --- R/methods-epi_archive.R | 53 +++++++++++++++++++++++++-------------- man/filter.epi_archive.Rd | 32 ++++++++++++----------- 2 files changed, 52 insertions(+), 33 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 5c2534e3..5357212f 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -998,13 +998,15 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { #' #' @details #' -#' By default, using the `version` column is disabled as it's easy to -#' get unexpected results. See if either [`epix_as_of`] or [`epix_slide`] -#' works as an alternative. If they don't cover your use case, then you can -#' set `.format_aware = TRUE` to enable usage of the `version` column, but be -#' careful to: -#' * Factor in that `.data$DT` may be using a "compact" format based on diffing -#' consecutive versions; see details of [`as_epi_archive`] +#' By default, using the `version` column or measurement columns is disabled as +#' it's easy to get unexpected results. See if either [`epix_as_of`] or +#' [`epix_slide`] works as an alternative. If they don't cover your use case, +#' then you can set `.format_aware = TRUE` to enable usage of these columns, but +#' be careful to: +#' * Factor in that `.data$DT` may have been converted into a compact format +#' based on diffing consecutive versions, and the last version of each +#' observation in `.data$DT` will always be carried forward to future +#' `version`s`; see details of [`as_epi_archive`]. #' * Set `clobberable_versions_start` and `versions_end` of the result #' appropriately after the `filter` call. They will be initialized with the #' same values as in `.data`. @@ -1024,13 +1026,14 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { #' archive_cases_dv_subset %>% #' filter(as.POSIXlt(time_value)$wday == 6L) #' -#' # Filtering involving versions requires extra care. See epix_as_of and -#' # epix_slide instead for some common operations. One semi-common operation -#' # that ends up being fairly simple is treating observations as finalized -#' # after some amount of time, and ignoring any revisions that were made after -#' # that point: +#' # Filtering involving the `version` column or measurement columns requires +#' # extra care. See epix_as_of and epix_slide instead for some common +#' # operations. One semi-common operation that ends up being fairly simple is +#' # treating observations as finalized after some amount of time, and ignoring +#' # any revisions that were made after that point: #' archive_cases_dv_subset %>% -#' filter(version <= time_value + as.difftime(60, units = "days"), +#' filter( +#' version <= time_value + as.difftime(60, units = "days"), #' .format_aware = TRUE #' ) #' @@ -1041,25 +1044,37 @@ filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) { out_tbl <- in_tbl %>% filter(..., .by = .by) } else { + measurement_colnames <- setdiff(names(.data$DT), key_colnames(.data)) + forbidden_colnames <- c("version", measurement_colnames) out_tbl <- in_tbl %>% filter( # Add our own fake filter arg to the user's ..., to update the data mask # to prevent `version` column usage. { # We should be evaluating inside the data mask. To disable both - # `version` and `.data$version`, we need to go to the data mask's - # ------ + # `version` and `.data$version` etc., we need to go to the ancestor + # environment containing the data mask's column bindings. This is + # likely just the parent env, but search to make sure, in a way akin + # to `<<-`: e <- environment() while (!identical(e, globalenv()) && !identical(e, emptyenv())) { if ("version" %in% names(e)) { - # "version" is expected to be an active binding, and directly - # assigning over it has issues; explicitly `rm` first. - rm(list = "version", envir = e) + # This is where the column bindings are. Replace the forbidden ones. + # They are expected to be active bindings, so directly + # assigning has issues; `rm` first. + rm(list = forbidden_colnames, envir = e) delayedAssign("version", cli::cli_abort(c( - "Using `version` in `filter` may produce unexpected results.", + "Using `version` in `filter.epi_archive` may produce unexpected results.", ">" = "See if `epix_as_of` or `epix_slide` would work instead.", ">" = "If not, see `?filter.epi_archive` details for how to proceed." )), assign.env = e) + for (measurement_colname in measurement_colnames) { + delayedAssign(measurement_colname, cli::cli_abort(c( + "Using `{format_varname(measurement_colname)}` + in `filter.epi_archive` may produce unexpected results.", + ">" = "See `?filter.epi_archive` details for how to proceed." + )), assign.env = e) + } break } e <- parent.env(e) diff --git a/man/filter.epi_archive.Rd b/man/filter.epi_archive.Rd index b0012bd2..a007bdc5 100644 --- a/man/filter.epi_archive.Rd +++ b/man/filter.epi_archive.Rd @@ -21,14 +21,16 @@ details.} \code{\link[dplyr:filter]{dplyr::filter}} for \code{epi_archive}s } \details{ -By default, using the \code{version} column is disabled as it's easy to -get unexpected results. See if either \code{\link{epix_as_of}} or \code{\link{epix_slide}} -works as an alternative. If they don't cover your use case, then you can -set \code{.format_aware = TRUE} to enable usage of the \code{version} column, but be -careful to: +By default, using the \code{version} column or measurement columns is disabled as +it's easy to get unexpected results. See if either \code{\link{epix_as_of}} or +\code{\link{epix_slide}} works as an alternative. If they don't cover your use case, +then you can set \code{.format_aware = TRUE} to enable usage of these columns, but +be careful to: \itemize{ -\item Factor in that \code{.data$DT} may be using a "compact" format based on diffing -consecutive versions; see details of \code{\link{as_epi_archive}} +\item Factor in that \code{.data$DT} may have been converted into a compact format +based on diffing consecutive versions, and the last version of each +observation in \code{.data$DT} will always be carried forward to future +\code{version}s\verb{; see details of [}as_epi_archive`]. \item Set \code{clobberable_versions_start} and \code{versions_end} of the result appropriately after the \code{filter} call. They will be initialized with the same values as in \code{.data}. @@ -49,13 +51,15 @@ archive_cases_dv_subset \%>\% archive_cases_dv_subset \%>\% filter(as.POSIXlt(time_value)$wday == 6L) -# Filtering involving versions requires extra care. See epix_as_of and -# epix_slide instead for some common operations. One semi-common operation -# that ends up being fairly simple is treating observations as finalized -# after some amount of time, and ignoring any revisions that were made after -# that point: +# Filtering involving the `version` column or measurement columns requires +# extra care. See epix_as_of and epix_slide instead for some common +# operations. One semi-common operation that ends up being fairly simple is +# treating observations as finalized after some amount of time, and ignoring +# any revisions that were made after that point: archive_cases_dv_subset \%>\% - filter(version <= time_value + as.difftime(60, units = "days"), - .format_aware = TRUE) + filter( + version <= time_value + as.difftime(60, units = "days"), + .format_aware = TRUE + ) } From 319842952ddb4912d6a7650c505a324753838db4 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Thu, 3 Apr 2025 15:19:35 -0700 Subject: [PATCH 04/15] Test filter.epi_archive, fix&tweak some behaviors --- R/methods-epi_archive.R | 19 ++-- tests/testthat/test-methods-epi_archive.R | 100 ++++++++++++++++++++++ 2 files changed, 113 insertions(+), 6 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 5357212f..8595e316 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -1042,7 +1042,7 @@ filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) { in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal") if (.format_aware) { out_tbl <- in_tbl %>% - filter(..., .by = .by) + filter(..., .by = {{.by}}) } else { measurement_colnames <- setdiff(names(.data$DT), key_colnames(.data)) forbidden_colnames <- c("version", measurement_colnames) @@ -1067,13 +1067,13 @@ filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) { "Using `version` in `filter.epi_archive` may produce unexpected results.", ">" = "See if `epix_as_of` or `epix_slide` would work instead.", ">" = "If not, see `?filter.epi_archive` details for how to proceed." - )), assign.env = e) + ), class = "epiprocess__filter_archive__used_version"), assign.env = e) for (measurement_colname in measurement_colnames) { delayedAssign(measurement_colname, cli::cli_abort(c( "Using `{format_varname(measurement_colname)}` in `filter.epi_archive` may produce unexpected results.", ">" = "See `?filter.epi_archive` details for how to proceed." - )), assign.env = e) + ), class = "epiprocess__filter_archive__used_measurement"), assign.env = e) } break } @@ -1082,7 +1082,7 @@ filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) { TRUE }, ..., - .by = .by + .by = {{.by}} ) } # We could try to re-infer the geo_type, e.g., when filtering from @@ -1090,8 +1090,15 @@ filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) { # "hrr" -> "hhs" from filtering to hrr 10, or "custom" -> USA-related when # working with non-USA data: out_geo_type <- .data$geo_type - # We might be going from daily to weekly; re-infer: - out_time_type <- guess_time_type(out_tbl$time_value) + if (.data$time_type == "day") { + # We might be going from daily to weekly; re-infer: + out_time_type <- guess_time_type(out_tbl$time_value) + } else { + # We might be filtering weekly to a single time_value; avoid re-inferring to + # stay "week". Or in other cases, just skip inferring, as re-inferring is + # expected to match the input time_type: + out_time_type <- .data$time_type + } # Even if they narrow down to just a single value of an other_keys column, # it's probably still better (& simpler) to treat it as an other_keys column # since it still exists in the result: diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 45ba6ea1..9c80ffa1 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -128,3 +128,103 @@ test_that("group_vars works as expected", { "geo_value" ) }) + +test_that("filter.epi_archive works as expected", { + + ea2 <- ea2_data %>% + as_epi_archive() + + # Some basic output value checks: + + expect_equal( + ea2 %>% filter(geo_value == "tn"), + new_epi_archive( + ea2$DT[FALSE], + ea2$geo_type, ea2$time_type, ea2$other_keys, + ea2$clobberable_versions_start, ea2$versions_end + ) + ) + + expect_equal( + ea2 %>% filter(geo_value == "ca", time_value == as.Date("2020-06-02")), + new_epi_archive( + data.table::data.table(geo_value = "ca", time_value = as.Date("2020-06-02"), + version = as.Date("2020-06-02") + 0:2, cases = 0:2), + ea2$geo_type, ea2$time_type, ea2$other_keys, + ea2$clobberable_versions_start, ea2$versions_end + ) + ) + + # Output geo_type and time_type behavior: + + hrr_day_ea <- tibble( + geo_value = c(rep(1, 14), 100), + time_value = as.Date("2020-01-01") - 1 + c(1:14, 14), + version = time_value + 3, + value = 1:15 + ) %>% + as_epi_archive() + + expect_equal(hrr_day_ea$geo_type, "hrr") + expect_equal(hrr_day_ea$time_type, "day") + + hrr_week_ea <- hrr_day_ea %>% + filter(geo_value == 1, as.POSIXlt(time_value)$wday == 6L) + + expect_equal(hrr_week_ea$geo_type, "hrr") + expect_equal(hrr_week_ea$time_type, "week") + + hrr_one_week_ea <- hrr_week_ea %>% + filter(time_value == time_value[[1]]) + + expect_equal(hrr_one_week_ea$time_type, "week") + + intcustom_day_ea <- hrr_day_ea + intcustom_day_ea$geo_type <- "custom" + + intcustom_week_ea <- intcustom_day_ea %>% + filter(geo_value == 1, as.POSIXlt(time_value)$wday == 6L) + + expect_equal(intcustom_week_ea$geo_type, "custom") + expect_equal(intcustom_week_ea$time_type, "week") + + # Error-raising: + expect_error( + ea2 %>% filter(version == as.Date("2020-06-02")), + class = "epiprocess__filter_archive__used_version" + ) + expect_error( + ea2 %>% filter(version <= as.Date("2020-06-02")), + class = "epiprocess__filter_archive__used_version" + ) + expect_snapshot( + ea2 %>% filter(version <= as.Date("2020-06-02")), + error = TRUE, cnd_class = TRUE + ) + expect_error( + ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L), + class = "epiprocess__filter_archive__used_measurement" + ) + expect_snapshot( + ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L), + error = TRUE, cnd_class = TRUE + ) + expect_error( + ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L), + class = "epiprocess__filter_archive__used_measurement" + ) + expect_error( + ea2 %>% filter(cases >= median(cases), .by = geo_value), + class = "epiprocess__filter_archive__used_measurement" + ) + + # Escape hatch: + expect_equal( + ea2 %>% + filter(version <= time_value + as.difftime(1, units = "days"), + .format_aware = TRUE) %>% + .$DT, + ea2$DT[version <= time_value + as.difftime(1, units = "days"), ] + ) + +}) From 24bda523f98b6a3c487d90180f37a93853586504 Mon Sep 17 00:00:00 2001 From: brookslogan Date: Thu, 3 Apr 2025 22:22:53 +0000 Subject: [PATCH 05/15] style: styler (GHA) --- R/methods-epi_archive.R | 4 ++-- tests/testthat/test-methods-epi_archive.R | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 8595e316..02cf2cf3 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -1042,7 +1042,7 @@ filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) { in_tbl <- tibble::as_tibble(as.list(.data$DT), .name_repair = "minimal") if (.format_aware) { out_tbl <- in_tbl %>% - filter(..., .by = {{.by}}) + filter(..., .by = {{ .by }}) } else { measurement_colnames <- setdiff(names(.data$DT), key_colnames(.data)) forbidden_colnames <- c("version", measurement_colnames) @@ -1082,7 +1082,7 @@ filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) { TRUE }, ..., - .by = {{.by}} + .by = {{ .by }} ) } # We could try to re-infer the geo_type, e.g., when filtering from diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 9c80ffa1..6e08c990 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -130,7 +130,6 @@ test_that("group_vars works as expected", { }) test_that("filter.epi_archive works as expected", { - ea2 <- ea2_data %>% as_epi_archive() @@ -148,8 +147,10 @@ test_that("filter.epi_archive works as expected", { expect_equal( ea2 %>% filter(geo_value == "ca", time_value == as.Date("2020-06-02")), new_epi_archive( - data.table::data.table(geo_value = "ca", time_value = as.Date("2020-06-02"), - version = as.Date("2020-06-02") + 0:2, cases = 0:2), + data.table::data.table( + geo_value = "ca", time_value = as.Date("2020-06-02"), + version = as.Date("2020-06-02") + 0:2, cases = 0:2 + ), ea2$geo_type, ea2$time_type, ea2$other_keys, ea2$clobberable_versions_start, ea2$versions_end ) @@ -222,9 +223,9 @@ test_that("filter.epi_archive works as expected", { expect_equal( ea2 %>% filter(version <= time_value + as.difftime(1, units = "days"), - .format_aware = TRUE) %>% + .format_aware = TRUE + ) %>% .$DT, ea2$DT[version <= time_value + as.difftime(1, units = "days"), ] ) - }) From 17a369cb6f6ab52ed1c4f765c0996a74dcd7d2e6 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 15 Apr 2025 10:41:07 -0700 Subject: [PATCH 06/15] test: add environment variable test --- tests/testthat/test-methods-epi_archive.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 6e08c990..2418af4e 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -189,6 +189,10 @@ test_that("filter.epi_archive works as expected", { expect_equal(intcustom_week_ea$geo_type, "custom") expect_equal(intcustom_week_ea$time_type, "week") + # Environment variables should be fine: + version <- as.Date("2020-06-02") + 1 + expect_no_error(ea2 %>% filter(geo_value == "ca", .env$version <= time_value)) + # Error-raising: expect_error( ea2 %>% filter(version == as.Date("2020-06-02")), From 80d787b9919b323787e05e00994dc349e762af91 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 16 Apr 2025 11:16:12 -0700 Subject: [PATCH 07/15] fix(filter.epi_archive): for+lazy, left-behind bindings --- R/methods-epi_archive.R | 18 +++++++--- tests/testthat/_snaps/methods-epi_archive.md | 35 ++++++++++++++++++++ tests/testthat/test-methods-epi_archive.R | 16 +++++++-- 3 files changed, 62 insertions(+), 7 deletions(-) create mode 100644 tests/testthat/_snaps/methods-epi_archive.md diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 02cf2cf3..4b443afa 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -1069,16 +1069,24 @@ filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) { ">" = "If not, see `?filter.epi_archive` details for how to proceed." ), class = "epiprocess__filter_archive__used_version"), assign.env = e) for (measurement_colname in measurement_colnames) { - delayedAssign(measurement_colname, cli::cli_abort(c( - "Using `{format_varname(measurement_colname)}` - in `filter.epi_archive` may produce unexpected results.", - ">" = "See `?filter.epi_archive` details for how to proceed." - ), class = "epiprocess__filter_archive__used_measurement"), assign.env = e) + # Record current `measurement_colname` and set up delayed + # binding for error in a child environment, so that `for` loop + # updating its value and `rm` cleanup don't mess things up: + local({ + local_measurement_colname <- measurement_colname + delayedAssign(measurement_colname, cli::cli_abort(c( + "Using `{format_varname(local_measurement_colname)}` + in `filter.epi_archive` may produce unexpected results.", + ">" = "See `?filter.epi_archive` details for how to proceed." + ), class = "epiprocess__filter_archive__used_measurement"), assign.env = e) + }) } break } e <- parent.env(e) } + # Don't mask similarly-named user objects: + rm(list = c("e", "measurement_colname")) TRUE }, ..., diff --git a/tests/testthat/_snaps/methods-epi_archive.md b/tests/testthat/_snaps/methods-epi_archive.md new file mode 100644 index 00000000..200e4202 --- /dev/null +++ b/tests/testthat/_snaps/methods-epi_archive.md @@ -0,0 +1,35 @@ +# filter.epi_archive works as expected + + Code + ea2 %>% filter(version <= as.Date("2020-06-02")) + Condition + Error in `filter()`: + i In argument: `version <= as.Date("2020-06-02")`. + Caused by error: + ! Using `version` in `filter.epi_archive` may produce unexpected results. + > See if `epix_as_of` or `epix_slide` would work instead. + > If not, see `?filter.epi_archive` details for how to proceed. + +--- + + Code + ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L) + Condition + Error in `filter()`: + i In argument: `cases >= 2L`. + Caused by error: + ! Using `cases` in `filter.epi_archive` may produce unexpected results. + > See `?filter.epi_archive` details for how to proceed. + +--- + + Code + ea2p %>% filter(cases >= median(cases), .by = geo_value) + Condition + Error in `filter()`: + i In argument: `cases >= median(cases)`. + i In group 1: `geo_value = "ca"`. + Caused by error: + ! Using `cases` in `filter.epi_archive` may produce unexpected results. + > See `?filter.epi_archive` details for how to proceed. + diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index 2418af4e..f9299d0f 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -191,7 +191,11 @@ test_that("filter.epi_archive works as expected", { # Environment variables should be fine: version <- as.Date("2020-06-02") + 1 - expect_no_error(ea2 %>% filter(geo_value == "ca", .env$version <= time_value)) + e <- version + expected <- ea2 %>% filter(geo_value == "ca", as.Date("2020-06-02") + 1 <= time_value) + expect_equal(ea2 %>% filter(geo_value == "ca", .env$version <= time_value), expected) + expect_equal(ea2 %>% filter(geo_value == "ca", e <= time_value), expected) + expect_equal(ea2 %>% filter(geo_value == "ca", .env$e <= time_value), expected) # Error-raising: expect_error( @@ -218,10 +222,18 @@ test_that("filter.epi_archive works as expected", { ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L), class = "epiprocess__filter_archive__used_measurement" ) + ea2p <- ea2_data %>% + # to check for `for` + `delayedAssign` mishap in expect_snapshot + mutate(deaths = 0) %>% + as_epi_archive() expect_error( - ea2 %>% filter(cases >= median(cases), .by = geo_value), + ea2p %>% filter(cases >= median(cases), .by = geo_value), class = "epiprocess__filter_archive__used_measurement" ) + expect_snapshot( + ea2p %>% filter(cases >= median(cases), .by = geo_value), + error = TRUE, cnd_class = TRUE + ) # Escape hatch: expect_equal( From 07e54f40c340c57b4f956cf5e491963c408ac5a1 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 16 Apr 2025 12:20:48 -0700 Subject: [PATCH 08/15] fix(filter.epi_archive): avoid other lazy eval traps --- R/methods-epi_archive.R | 46 +++++++++++++++-------- tests/testthat/test-methods-epi_archive.R | 19 +++++++++- 2 files changed, 49 insertions(+), 16 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 4b443afa..9f244aa9 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -1063,30 +1063,46 @@ filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) { # They are expected to be active bindings, so directly # assigning has issues; `rm` first. rm(list = forbidden_colnames, envir = e) - delayedAssign("version", cli::cli_abort(c( - "Using `version` in `filter.epi_archive` may produce unexpected results.", - ">" = "See if `epix_as_of` or `epix_slide` would work instead.", - ">" = "If not, see `?filter.epi_archive` details for how to proceed." - ), class = "epiprocess__filter_archive__used_version"), assign.env = e) + eval_env <- new.env(parent = asNamespace("epiprocess")) # see (2) below + delayedAssign( + "version", + cli_abort(c( + "Using `version` in `filter.epi_archive` may produce unexpected results.", + ">" = "See if `epix_as_of` or `epix_slide` would work instead.", + ">" = "If not, see `?filter.epi_archive` details for how to proceed." + ), class = "epiprocess__filter_archive__used_version"), + eval.env = eval_env, + assign.env = e + ) for (measurement_colname in measurement_colnames) { - # Record current `measurement_colname` and set up delayed - # binding for error in a child environment, so that `for` loop - # updating its value and `rm` cleanup don't mess things up: - local({ - local_measurement_colname <- measurement_colname - delayedAssign(measurement_colname, cli::cli_abort(c( + # Record current `measurement_colname` and set up execution for + # the promise for the error in its own dedicated environment, so + # that (1) `for` loop updating its value and `rm` cleanup don't + # mess things up. We can also (2) prevent changes to data mask + # ancestry (to involve user's quosure env rather than our + # quosure env) or contents (from edge case of user binding + # functions inside the mask) from potentially interfering by + # setting the promise's execution environment to skip over the + # data mask. + eval_env <- new.env(parent = asNamespace("epiprocess")) + eval_env[["local_measurement_colname"]] <- measurement_colname + delayedAssign( + measurement_colname, + cli_abort(c( "Using `{format_varname(local_measurement_colname)}` in `filter.epi_archive` may produce unexpected results.", ">" = "See `?filter.epi_archive` details for how to proceed." - ), class = "epiprocess__filter_archive__used_measurement"), assign.env = e) - }) + ), class = "epiprocess__filter_archive__used_measurement"), + eval.env = eval_env, + assign.env = e + ) } break } e <- parent.env(e) } - # Don't mask similarly-named user objects: - rm(list = c("e", "measurement_colname")) + # Don't mask similarly-named user objects in ancestor envs: + rm(list = c("e", "measurement_colname", "eval_env")) TRUE }, ..., diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index f9299d0f..dafa5521 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -222,8 +222,10 @@ test_that("filter.epi_archive works as expected", { ea2 %>% filter(time_value >= as.Date("2020-06-02"), cases >= 2L), class = "epiprocess__filter_archive__used_measurement" ) + # Check for `for` + `delayedAssign` mishap in `expect_snapshot` (we should say + # something about `cases` (the relevant colname), not `deaths` (the last + # measurement colname)): ea2p <- ea2_data %>% - # to check for `for` + `delayedAssign` mishap in expect_snapshot mutate(deaths = 0) %>% as_epi_archive() expect_error( @@ -234,6 +236,21 @@ test_that("filter.epi_archive works as expected", { ea2p %>% filter(cases >= median(cases), .by = geo_value), error = TRUE, cnd_class = TRUE ) + # Check that we are insulated from other lazy eval traps: + expected <- rlang::catch_cnd(ea2p %>% filter(cases >= median(cases), .by = geo_value)) + expect_class(expected$parent, "epiprocess__filter_archive__used_measurement") + with(list(cli_abort = function(...) stop("now, pretend user didn't have cli attached")), { + expect_equal(rlang::catch_cnd(ea2p %>% filter(cases >= median(cases), .by = geo_value))$parent$message, + expected$parent$message) + }) + expect_equal( + rlang::catch_cnd(ea2p %>% filter({ + c <- function(...) stop("and that they overwrote `c` to try to debug their own code") + cases >= median(cases) + }, .by = geo_value))$parent$message, + expected$parent$message + ) + # Escape hatch: expect_equal( From 751f547c177f00cf0e4bba82e4cc887dd318bebc Mon Sep 17 00:00:00 2001 From: brookslogan Date: Wed, 16 Apr 2025 19:24:52 +0000 Subject: [PATCH 09/15] style: styler (GHA) --- tests/testthat/test-methods-epi_archive.R | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/testthat/test-methods-epi_archive.R b/tests/testthat/test-methods-epi_archive.R index dafa5521..a073594f 100644 --- a/tests/testthat/test-methods-epi_archive.R +++ b/tests/testthat/test-methods-epi_archive.R @@ -240,14 +240,19 @@ test_that("filter.epi_archive works as expected", { expected <- rlang::catch_cnd(ea2p %>% filter(cases >= median(cases), .by = geo_value)) expect_class(expected$parent, "epiprocess__filter_archive__used_measurement") with(list(cli_abort = function(...) stop("now, pretend user didn't have cli attached")), { - expect_equal(rlang::catch_cnd(ea2p %>% filter(cases >= median(cases), .by = geo_value))$parent$message, - expected$parent$message) + expect_equal( + rlang::catch_cnd(ea2p %>% filter(cases >= median(cases), .by = geo_value))$parent$message, + expected$parent$message + ) }) expect_equal( - rlang::catch_cnd(ea2p %>% filter({ - c <- function(...) stop("and that they overwrote `c` to try to debug their own code") - cases >= median(cases) - }, .by = geo_value))$parent$message, + rlang::catch_cnd(ea2p %>% filter( + { + c <- function(...) stop("and that they overwrote `c` to try to debug their own code") + cases >= median(cases) + }, + .by = geo_value + ))$parent$message, expected$parent$message ) From 7fb25fb32c49693eb74c5808bd35c7ed63bf47d2 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 16 Apr 2025 12:28:37 -0700 Subject: [PATCH 10/15] Bump DESCRIPTION version --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index c7a651d8..b008956d 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: epiprocess Title: Tools for basic signal processing in epidemiology -Version: 0.11.5 +Version: 0.11.6 Authors@R: c( person("Jacob", "Bien", role = "ctb"), person("Logan", "Brooks", , "lcbrooks+github@andrew.cmu.edu", role = c("aut", "cre")), From 8f63ad8e11ebed25f71c947e121a26302789735e Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 16 Apr 2025 12:39:01 -0700 Subject: [PATCH 11/15] Address failing checks --- R/methods-epi_archive.R | 2 +- _pkgdown.yml | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 3f2c987a..cff16aed 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -1093,7 +1093,7 @@ filter.epi_archive <- function(.data, ..., .by = NULL, .format_aware = FALSE) { # likely just the parent env, but search to make sure, in a way akin # to `<<-`: e <- environment() - while (!identical(e, globalenv()) && !identical(e, emptyenv())) { + while (!identical(e, globalenv()) && !identical(e, emptyenv())) { # nolint:vector_logic_linter if ("version" %in% names(e)) { # This is where the column bindings are. Replace the forbidden ones. # They are expected to be active bindings, so directly diff --git a/_pkgdown.yml b/_pkgdown.yml index 2953039c..ebd73dc8 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -72,8 +72,9 @@ reference: - epix_as_of - epix_as_of_current - epix_slide - - epix_merge - revision_summary + - epix_merge + - filter.epi_archive - epix_fill_through_version - epix_truncate_versions_after - set_versions_end From 8b569ff96a9cdb6e006b700d0a15f54430c0fd09 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 16 Apr 2025 12:50:04 -0700 Subject: [PATCH 12/15] docs(filter.epi_archive): suggest alternative for meas col filtering --- R/methods-epi_archive.R | 9 ++++++--- man/filter.epi_archive.Rd | 9 ++++++--- man/revision_analysis.Rd | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index cff16aed..33f73141 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -1036,9 +1036,12 @@ dplyr_col_modify.col_modify_recorder_df <- function(data, cols) { #' #' By default, using the `version` column or measurement columns is disabled as #' it's easy to get unexpected results. See if either [`epix_as_of`] or -#' [`epix_slide`] works as an alternative. If they don't cover your use case, -#' then you can set `.format_aware = TRUE` to enable usage of these columns, but -#' be careful to: +#' [`epix_slide`] works for any version selection you have in mind: for version +#' selection, see the `version` or `.versions` args, respectively; for +#' measurement column-based filtering, try `filter`ing after `epix_as_of` or +#' inside the `.f` in `epix_slide()`. If they don't cover your use case, then +#' you can set `.format_aware = TRUE` to enable usage of these columns, but be +#' careful to: #' * Factor in that `.data$DT` may have been converted into a compact format #' based on diffing consecutive versions, and the last version of each #' observation in `.data$DT` will always be carried forward to future diff --git a/man/filter.epi_archive.Rd b/man/filter.epi_archive.Rd index a007bdc5..5f9d72db 100644 --- a/man/filter.epi_archive.Rd +++ b/man/filter.epi_archive.Rd @@ -23,9 +23,12 @@ details.} \details{ By default, using the \code{version} column or measurement columns is disabled as it's easy to get unexpected results. See if either \code{\link{epix_as_of}} or -\code{\link{epix_slide}} works as an alternative. If they don't cover your use case, -then you can set \code{.format_aware = TRUE} to enable usage of these columns, but -be careful to: +\code{\link{epix_slide}} works for any version selection you have in mind: for version +selection, see the \code{version} or \code{.versions} args, respectively; for +measurement column-based filtering, try \code{filter}ing after \code{epix_as_of} or +inside the \code{.f} in \code{epix_slide()}. If they don't cover your use case, then +you can set \code{.format_aware = TRUE} to enable usage of these columns, but be +careful to: \itemize{ \item Factor in that \code{.data$DT} may have been converted into a compact format based on diffing consecutive versions, and the last version of each diff --git a/man/revision_analysis.Rd b/man/revision_analysis.Rd index 23ddf021..1c7336b3 100644 --- a/man/revision_analysis.Rd +++ b/man/revision_analysis.Rd @@ -56,7 +56,7 @@ of the \code{versions_end} are removed. \code{min_waiting_period} should charact the typical time during which most significant revisions occur. The default of 60 days corresponds to a typical near-final value for case counts as reported in the context of insurance. To avoid this filtering, either set -to \code{NULL} or 0. This will be rounded up to the appropriate \code{time_type} if +to \code{NULL} or 0. A \code{difftime} will be rounded up to the appropriate \code{time_type} if necessary (that is 5 days will be rounded to 1 week if the data is weekly).} \item{within_latest}{double between 0 and 1. Determines the threshold From 26721facbfec803d8b97b03cd1d20b838632b894 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 16 Apr 2025 13:33:44 -0700 Subject: [PATCH 13/15] tests: `expect_equal` when comparing Dates due to inconsistent backing typeof --- tests/testthat/test-epi_slide.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-epi_slide.R b/tests/testthat/test-epi_slide.R index 0aa4aca7..b9be125d 100644 --- a/tests/testthat/test-epi_slide.R +++ b/tests/testthat/test-epi_slide.R @@ -610,7 +610,7 @@ test_that("epi_slide_opt helper `full_date_seq` returns expected date values", { before <- 2L after <- 1L - expect_identical( + expect_equal( full_date_seq( epi_data_missing %>% mutate(time_value = days) %>% @@ -627,7 +627,7 @@ test_that("epi_slide_opt helper `full_date_seq` returns expected date values", { pad_late_dates = as.Date(c("2022-01-08")) ) ) - expect_identical( + expect_equal( full_date_seq( epi_data_missing %>% mutate(time_value = weeks) %>% @@ -677,7 +677,7 @@ test_that("epi_slide_opt helper `full_date_seq` returns expected date values", { before <- 5L after <- 0L - expect_identical( + expect_equal( full_date_seq( epi_data_missing %>% mutate(time_value = days) %>% @@ -701,7 +701,7 @@ test_that("epi_slide_opt helper `full_date_seq` returns expected date values", { before <- 0L after <- 3L - expect_identical( + expect_equal( full_date_seq( epi_data_missing %>% mutate(time_value = days) %>% From e4177567771a1135b97db42f98d69aa40082e7af Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 16 Apr 2025 14:06:46 -0700 Subject: [PATCH 14/15] Relax typeof checks due to Date backing typeof inconsistencies --- R/archive.R | 41 ++++++++------------ R/methods-epi_archive.R | 11 +----- man/epi_archive.Rd | 34 ++++++++-------- tests/testthat/test-archive-version-bounds.R | 15 +++++-- tests/testthat/test-time-utils.R | 4 +- 5 files changed, 48 insertions(+), 57 deletions(-) diff --git a/R/archive.R b/R/archive.R index 922371f1..88f8d704 100644 --- a/R/archive.R +++ b/R/archive.R @@ -52,13 +52,6 @@ validate_version_bound <- function(version_bound, x, na_ok = FALSE, class = "epiprocess__version_bound_mismatched_class" ) } - if (!identical(typeof(version_bound), typeof(x[["version"]]))) { - cli_abort( - "{version_bound_arg} must have the same `typeof` as x$version, - which has a `typeof` of {typeof(x$version)}", - class = "epiprocess__version_bound_mismatched_typeof" - ) - } } return(invisible(NULL)) @@ -207,23 +200,23 @@ next_after.Date <- function(x) x + 1L #' undergo tiny nonmeaningful revisions and the archive object with the #' default setting is too large. #' @param clobberable_versions_start Optional; `length`-1; either a value of the -#' same `class` and `typeof` as `x$version`, or an `NA` of any `class` and -#' `typeof`: specifically, either (a) the earliest version that could be -#' subject to "clobbering" (being overwritten with different update data, but -#' using the *same* version tag as the old update data), or (b) `NA`, to -#' indicate that no versions are clobberable. There are a variety of reasons -#' why versions could be clobberable under routine circumstances, such as (a) -#' today's version of one/all of the columns being published after initially -#' being filled with `NA` or LOCF, (b) a buggy version of today's data being -#' published but then fixed and republished later in the day, or (c) data -#' pipeline delays (e.g., publisher uploading, periodic scraping, database -#' syncing, periodic fetching, etc.) that make events (a) or (b) reflected -#' later in the day (or even on a different day) than expected; potential -#' causes vary between different data pipelines. The default value is `NA`, -#' which doesn't consider any versions to be clobberable. Another setting that -#' may be appropriate for some pipelines is `max_version_with_row_in(x)`. -#' @param versions_end Optional; length-1, same `class` and `typeof` as -#' `x$version`: what is the last version we have observed? The default is +#' same `class` as `x$version`, or an `NA` of any `class`: specifically, +#' either (a) the earliest version that could be subject to "clobbering" +#' (being overwritten with different update data, but using the *same* version +#' tag as the old update data), or (b) `NA`, to indicate that no versions are +#' clobberable. There are a variety of reasons why versions could be +#' clobberable under routine circumstances, such as (a) today's version of +#' one/all of the columns being published after initially being filled with +#' `NA` or LOCF, (b) a buggy version of today's data being published but then +#' fixed and republished later in the day, or (c) data pipeline delays (e.g., +#' publisher uploading, periodic scraping, database syncing, periodic +#' fetching, etc.) that make events (a) or (b) reflected later in the day (or +#' even on a different day) than expected; potential causes vary between +#' different data pipelines. The default value is `NA`, which doesn't consider +#' any versions to be clobberable. Another setting that may be appropriate for +#' some pipelines is `max_version_with_row_in(x)`. +#' @param versions_end Optional; length-1, same `class` as `x$version`: what is +#' the last version we have observed? The default is #' `max_version_with_row_in(x)`, but values greater than this could also be #' valid, and would indicate that we observed additional versions of the data #' beyond `max(x$version)`, but they all contained empty updates. (The default diff --git a/R/methods-epi_archive.R b/R/methods-epi_archive.R index 33f73141..faf3c128 100644 --- a/R/methods-epi_archive.R +++ b/R/methods-epi_archive.R @@ -80,19 +80,13 @@ epix_as_of <- function(x, version, min_time_value = -Inf, all_versions = FALSE, "`version` must have the same `class` vector as `epi_archive$DT$version`." ) } - if (!identical(typeof(version), typeof(x$DT$version))) { - cli_abort( - "`version` must have the same `typeof` as `epi_archive$DT$version`." - ) - } assert_scalar(version, na.ok = FALSE) if (version > x$versions_end) { cli_abort("`version` must be at most `epi_archive$versions_end`.") } assert_scalar(min_time_value, na.ok = FALSE) min_time_value_inf <- is.infinite(min_time_value) && min_time_value < 0 - min_time_value_same_type <- typeof(min_time_value) == typeof(x$DT$time_value) & - class(min_time_value) == class(x$DT$time_value) + min_time_value_same_type <- identical(class(min_time_value), class(x$DT$time_value)) if (!min_time_value_inf && !min_time_value_same_type) { cli_abort("`min_time_value` must be either -Inf or a time_value of the same type and class as `epi_archive$time_value`.") @@ -941,9 +935,6 @@ epix_truncate_versions_after.epi_archive <- function(x, max_version) { if (!identical(class(max_version), class(x$DT$version))) { cli_abort("`max_version` must have the same `class` as `epi_archive$DT$version`.") } - if (!identical(typeof(max_version), typeof(x$DT$version))) { - cli_abort("`max_version` must have the same `typeof` as `epi_archive$DT$version`.") - } assert_scalar(max_version, na.ok = FALSE) if (max_version > x$versions_end) { cli_abort("`max_version` must be at most `epi_archive$versions_end`.") diff --git a/man/epi_archive.Rd b/man/epi_archive.Rd index b92cd505..f91834f3 100644 --- a/man/epi_archive.Rd +++ b/man/epi_archive.Rd @@ -63,21 +63,21 @@ undergo tiny nonmeaningful revisions and the archive object with the default setting is too large.} \item{clobberable_versions_start}{Optional; \code{length}-1; either a value of the -same \code{class} and \code{typeof} as \code{x$version}, or an \code{NA} of any \code{class} and -\code{typeof}: specifically, either (a) the earliest version that could be -subject to "clobbering" (being overwritten with different update data, but -using the \emph{same} version tag as the old update data), or (b) \code{NA}, to -indicate that no versions are clobberable. There are a variety of reasons -why versions could be clobberable under routine circumstances, such as (a) -today's version of one/all of the columns being published after initially -being filled with \code{NA} or LOCF, (b) a buggy version of today's data being -published but then fixed and republished later in the day, or (c) data -pipeline delays (e.g., publisher uploading, periodic scraping, database -syncing, periodic fetching, etc.) that make events (a) or (b) reflected -later in the day (or even on a different day) than expected; potential -causes vary between different data pipelines. The default value is \code{NA}, -which doesn't consider any versions to be clobberable. Another setting that -may be appropriate for some pipelines is \code{max_version_with_row_in(x)}.} +same \code{class} as \code{x$version}, or an \code{NA} of any \code{class}: specifically, +either (a) the earliest version that could be subject to "clobbering" +(being overwritten with different update data, but using the \emph{same} version +tag as the old update data), or (b) \code{NA}, to indicate that no versions are +clobberable. There are a variety of reasons why versions could be +clobberable under routine circumstances, such as (a) today's version of +one/all of the columns being published after initially being filled with +\code{NA} or LOCF, (b) a buggy version of today's data being published but then +fixed and republished later in the day, or (c) data pipeline delays (e.g., +publisher uploading, periodic scraping, database syncing, periodic +fetching, etc.) that make events (a) or (b) reflected later in the day (or +even on a different day) than expected; potential causes vary between +different data pipelines. The default value is \code{NA}, which doesn't consider +any versions to be clobberable. Another setting that may be appropriate for +some pipelines is \code{max_version_with_row_in(x)}.} \item{.versions_end}{location based versions_end, used to avoid prefix \code{version = issue} from being assigned to \code{versions_end} instead of being @@ -86,8 +86,8 @@ used to rename columns.} \item{...}{used for specifying column names, as in \code{\link[dplyr:rename]{dplyr::rename}}. For example \code{version = release_date}} -\item{versions_end}{Optional; length-1, same \code{class} and \code{typeof} as -\code{x$version}: what is the last version we have observed? The default is +\item{versions_end}{Optional; length-1, same \code{class} as \code{x$version}: what is +the last version we have observed? The default is \code{max_version_with_row_in(x)}, but values greater than this could also be valid, and would indicate that we observed additional versions of the data beyond \code{max(x$version)}, but they all contained empty updates. (The default diff --git a/tests/testthat/test-archive-version-bounds.R b/tests/testthat/test-archive-version-bounds.R index ee500d30..878cde1c 100644 --- a/tests/testthat/test-archive-version-bounds.R +++ b/tests/testthat/test-archive-version-bounds.R @@ -71,14 +71,21 @@ test_that("`validate_version_bound` validate and class checks together allow and # Bad: expect_error(validate_version_bound(3.5, x_int, TRUE, "vb"), regexp = "must have the same `class`") expect_error(validate_version_bound(.Machine$integer.max, x_dbl, TRUE, "vb"), regexp = "must have the same `class`") - expect_error(validate_version_bound( - `class<-`(list(2), "clazz"), - tibble::tibble(version = `class<-`(5L, "clazz")), TRUE, "vb" - ), regexp = "must have the same `typeof`", class = "epiprocess__version_bound_mismatched_typeof") # Maybe questionable: expect_error(validate_version_bound(3, x_int, TRUE, "vb")) expect_error(validate_version_bound(3L, x_dbl, TRUE, "vb")) + # Maybe questionable, but accept to relax things a bit, as this is happening + # with Dates in some R(?) versions. Might need to turn some things into + # vec_cast_common, but idea is just make Date stuff work for now: + validate_version_bound( + `class<-`(list(2), "clazz"), + tibble::tibble(version = `class<-`(5L, "clazz")), TRUE, "vb" + ) # Good: + validate_version_bound( + `class<-`(2, "Date"), + tibble::tibble(version = `class<-`(5L, "Date")), TRUE, "vb" + ) validate_version_bound(my_int, x_int, TRUE, "vb") validate_version_bound(my_dbl, x_dbl, TRUE, "vb") validate_version_bound(my_list, x_list, TRUE, "vb") diff --git a/tests/testthat/test-time-utils.R b/tests/testthat/test-time-utils.R index 6fe8d78a..7ddd70c0 100644 --- a/tests/testthat/test-time-utils.R +++ b/tests/testthat/test-time-utils.R @@ -17,11 +17,11 @@ test_that("guess_period works", { # On Dates: daily_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "day") weekly_dates <- seq(as.Date("2020-01-01"), as.Date("2020-01-15"), by = "week") - expect_identical( + expect_equal( daily_dates[[1L]] + guess_period(daily_dates) * (seq_along(daily_dates) - 1L), daily_dates ) - expect_identical( + expect_equal( weekly_dates[[1L]] + guess_period(weekly_dates) * (seq_along(weekly_dates) - 1L), weekly_dates ) From 332362bd17505cbe35ad5523f1094fe17b170fd7 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 16 Apr 2025 14:08:56 -0700 Subject: [PATCH 15/15] Address tidyselect warnings on .data$ --- R/methods-epi_df.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R index 7870dede..51f6cf33 100644 --- a/R/methods-epi_df.R +++ b/R/methods-epi_df.R @@ -532,7 +532,7 @@ sum_groups_epi_df <- function(.x, sum_cols, group_cols = "time_value") { if (!"geo_value" %in% group_cols) { out <- out %>% mutate(geo_value = "total") %>% - relocate(.data$geo_value, .before = 1) + relocate("geo_value", .before = 1) } # The `geo_type` will be correctly inherited here by the following logic: