diff --git a/.Rbuildignore b/.Rbuildignore index e13c4051..148324e6 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,8 @@ ^.*\.Rproj$ ^\.Rproj\.user$ ^LICENSE\.md$ +^\.github$ +^docs$ +^_pkgdown.yml +^index\.md$ +^data-raw$ \ No newline at end of file diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 00000000..2d19fc76 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 00000000..6719131f --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +@dajmcdon \ No newline at end of file diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml new file mode 100644 index 00000000..ec397ab8 --- /dev/null +++ b/.github/workflows/R-CMD-check.yaml @@ -0,0 +1,29 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/master/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +name: R-CMD-check + +jobs: + R-CMD-check: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - uses: r-lib/actions/check-r-package@v2 diff --git a/.gitignore b/.gitignore index f4f606b0..ce8cc485 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ .RData .Ruserdata *.Rproj +inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index 5fe02139..ceb5b29c 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,63 +1,56 @@ -Package: epiprocess Type: Package +Package: epiprocess Title: Tools for basic signal processing in epidemiology Version: 1.0.0 -Authors@R: - c( - person(given = "Jacob", - family = "Bien", - role = "ctb"), - person(given = "Logan", - family = "Brooks", - role = "aut"), - person(given = "Rafael", - family = "Catoia", - role = "ctb"), - person(given = "Daniel", - family = "McDonald", - role = "ctb"), - person(given = "Quang", - family = "Nguyen", - role = "ctb"), - person(given = "Evan", - family = "Ray", - role = "aut"), - person(given = "Dmitry", - family = "Shemetov", - role = "ctb"), - person(given = "Ryan", - family = "Tibshirani", - role = c("aut", "cre"), - email = "ryantibs@cmu.edu")) -Description: This package introduces a common data structure for epidemiological - data sets measured over space and time, and offers associated utilities to - perform basic signal processing tasks. +Authors@R: c( + person("Jacob", "Bien", role = "ctb"), + person("Logan", "Brooks", role = "aut"), + person("Rafael", "Catoia", role = "ctb"), + person("Daniel", "McDonald", role = "ctb"), + person("Quang", "Nguyen", role = "ctb"), + person("Evan", "Ray", role = "aut"), + person("Dmitry", "Shemetov", role = "ctb"), + person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = c("aut", "cre")) + ) +Description: This package introduces a common data structure for + epidemiological data sets measured over space and time, and offers + associated utilities to perform basic signal processing tasks. 
License: MIT + file LICENSE
-Encoding: UTF-8
-LazyData: true
-Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.1.2
-Remotes:
-    cmu-delphi/delphi-epidata-r,
-    reconverse/outbreaks
Imports:
-    data.table,
-    delphi.epidata,
-    dplyr,
-    fabletools,
-    feasts,
-    genlasso,
-    lubridate,
-    magrittr,
-    purrr,
-    R6,
-    rlang,
-    slider,
-    tibble,
-    tidyselect,
-    tidyr,
-    tsibble
+    data.table,
+    dplyr,
+    fabletools,
+    feasts,
+    generics,
+    genlasso,
+    lubridate,
+    magrittr,
+    purrr,
+    R6,
+    rlang,
+    slider,
+    tibble,
+    tidyr,
+    tidyselect,
+    tsibble,
+    utils
Suggests:
-    outbreaks,
-    testthat (>= 3.0.0),
+    covidcast,
+    delphi.epidata,
+    ggplot2,
+    knitr,
+    outbreaks,
+    rmarkdown,
+    testthat (>= 3.0.0)
+VignetteBuilder:
+    knitr
+Remotes:
+    dajmcdon/delphi.epidata,
+    reconverse/outbreaks
Config/testthat/edition: 3
+Encoding: UTF-8
+LazyData: true
+Roxygen: list(markdown = TRUE)
+RoxygenNote: 7.2.0
+Depends:
+    R (>= 2.10)
diff --git a/NAMESPACE b/NAMESPACE
index d23145da..11f488e4 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -20,8 +20,10 @@ S3method(tail,epi_df)
S3method(ungroup,epi_df)
S3method(unnest,epi_df)
export("%>%")
+export(arrange)
export(as_epi_archive)
export(as_epi_df)
+export(as_tsibble)
export(detect_outlr)
export(detect_outlr_rm)
export(detect_outlr_stl)
@@ -31,10 +33,19 @@ export(epi_slide)
export(epix_as_of)
export(epix_merge)
export(epix_slide)
+export(filter)
+export(group_by)
+export(group_modify)
export(growth_rate)
export(is_epi_archive)
export(is_epi_df)
-export(quiet)
+export(mutate)
+export(relocate)
+export(rename)
+export(slice)
+export(ungroup)
+export(unnest)
+importFrom(R6,R6Class)
importFrom(data.table,as.data.table)
importFrom(data.table,between)
importFrom(data.table,key)
@@ -48,6 +59,7 @@ importFrom(dplyr,group_modify)
importFrom(dplyr,mutate)
importFrom(dplyr,relocate)
importFrom(dplyr,rename)
+importFrom(dplyr,select)
importFrom(dplyr,slice)
importFrom(dplyr,ungroup)
importFrom(lubridate,days)
@@ -68,3 +80,5 @@ importFrom(tidyr,unnest)
importFrom(tidyselect,eval_select)
importFrom(tidyselect,starts_with)
importFrom(tsibble,as_tsibble)
+importFrom(utils,head)
+importFrom(utils,tail)
diff --git a/R/archive.R b/R/archive.R
index f436d509..f7a98526 100644
--- a/R/archive.R
+++ b/R/archive.R
@@ -16,7 +16,7 @@
#' @details An `epi_archive` is an R6 class which contains a data table `DT`, of
#' class `data.table` from the `data.table` package, with (at least) the
#' following columns:
-#' 
+#'
#' * `geo_value`: the geographic value associated with each row of measurements.
#' * `time_value`: the time value associated with each row of measurements.
#' * `version`: the time value specifying the version for each row of
@@ -31,7 +31,7 @@
#' on `DT` directly). There can only be a single row per unique combination of
#' key variables, and thus the key variables are critical for figuring out how
#' to generate a snapshot of data from the archive, as of a given version.
-#' 
+#'
#' In general, last observation carried forward (LOCF) is used to fill in data
#' between recorded versions. Currently, deletions must be represented as
#' revising a row to a special state (e.g., making the entries `NA` or
@@ -43,7 +43,7 @@
#' reference semantics. A primary consequence of this is that objects are not
#' copied when modified. You can read more about this in Hadley Wickham's
#' [Advanced R](https://adv-r.hadley.nz/r6.html#r6-semantics) book.
-#' +#' #' @section Metadata: #' The following pieces of metadata are included as fields in an `epi_archive` #' object: @@ -75,7 +75,8 @@ #' sliding computation at any given reference time point t is performed on #' **data that would have been available as of t**. More details on `slide()` #' are documented in the wrapper function `epix_slide()`. -#' +#' +#' @importFrom R6 R6Class #' @export epi_archive = R6::R6Class( @@ -88,7 +89,7 @@ epi_archive = additional_metadata = NULL, #' @description Creates a new `epi_archive` object. #' @param x A data frame, data table, or tibble, with columns `geo_value`, -#' `time_value`, `version`, and then any additional number of columns. +#' `time_value`, `version`, and then any additional number of columns. #' @param geo_type Type for the geo values. If missing, then the function will #' attempt to infer it from the geo values present; if this fails, then it #' will be set to "custom". @@ -104,12 +105,12 @@ epi_archive = #' @return An `epi_archive` object. #' @importFrom data.table as.data.table key setkeyv initialize = function(x, geo_type, time_type, other_keys, - additional_metadata) { + additional_metadata) { # Check that we have a data frame if (!is.data.frame(x)) { Abort("`x` must be a data frame.") } - + # Check that we have geo_value, time_value, version columns if (!("geo_value" %in% names(x))) { Abort("`x` must contain a `geo_value` column.") @@ -120,7 +121,7 @@ epi_archive = if (!("version" %in% names(x))) { Abort("`x` must contain a `version` column.") } - + # If geo type is missing, then try to guess it if (missing(geo_type)) { geo_type = guess_geo_type(x$geo_value) @@ -130,7 +131,7 @@ epi_archive = if (missing(time_type)) { time_type = guess_time_type(x$time_value) } - + # Finish off with small checks on keys variables and metadata if (missing(other_keys)) other_keys = NULL if (missing(additional_metadata)) additional_metadata = list() @@ -144,7 +145,7 @@ epi_archive = c("geo_type", "time_type"))) { Warn("`additional_metadata` names overlap with existing metadata fields \"geo_type\", \"time_type\".") } - + # Create the data table; if x was an un-keyed data.table itself, # then the call to as.data.table() will fail to set keys, so we # need to check this, then do it manually if needed @@ -162,8 +163,8 @@ epi_archive = cat("An `epi_archive` object, with metadata:\n") cat(sprintf("* %-9s = %s\n", "geo_type", self$geo_type)) cat(sprintf("* %-9s = %s\n", "time_type", self$time_type)) - if (!is.null(self$additional_metadata)) { - sapply(self$additional_metadata, function(m) { + if (!is.null(self$additional_metadata)) { + sapply(self$additional_metadata, function(m) { cat(sprintf("* %-9s = %s\n", names(m), m)) }) } @@ -177,7 +178,7 @@ epi_archive = cat(sprintf("* %-14s = %s\n", "max version", max(self$DT$version))) cat("----------\n") - cat(sprintf("Data archive (stored in DT field): %i x %i\n", + cat(sprintf("Data archive (stored in DT field): %i x %i\n", nrow(self$DT), ncol(self$DT))) cat("----------\n") cat(sprintf("Public methods: %s", @@ -194,7 +195,7 @@ epi_archive = other_keys = setdiff(key(self$DT), c("geo_value", "time_value", "version")) if (length(other_keys) == 0) other_keys = NULL - + # Check a few things on max_version if (!identical(class(max_version), class(self$DT$version))) { Abort("`max_version` and `DT$version` must have same class.") @@ -208,17 +209,17 @@ epi_archive = if (max_version == self_max) { Warn("Getting data as of the latest version possible. 
For a variety of reasons, it is possible that we only have a preliminary picture of this version (e.g., the upstream source has updated it but we have not seen it due to latency in synchronization). Thus, the snapshot that we produce here might not be reproducible at a later time (e.g., when the archive has caught up in terms of synchronization).")
      }
-      
+
      # Filter by version and return
      return(
-        # Make sure to use data.table ways of filtering and selecting 
+        # Make sure to use data.table ways of filtering and selecting
        self$DT[between(time_value, min_time_value, max_version) & version <= max_version, ] %>%
          unique(by = c("geo_value", "time_value", other_keys),
                 fromLast = TRUE) %>%
-          tibble::as_tibble() %>% 
+          tibble::as_tibble() %>%
          dplyr::select(-.data$version) %>%
          as_epi_df(geo_type = self$geo_type,
                    time_type = self$time_type,
@@ -226,7 +227,7 @@ epi_archive =
                    additional_metadata = c(self$additional_metadata,
                                            other_keys = other_keys))
      )
-    },  
+    },
    #####
    #' @description Merges another `data.table` with the current one, and allows for
    #' a post-filling of `NA` values by last observation carried forward (LOCF).
@@ -235,7 +236,7 @@ epi_archive =
    merge = function(y, ..., locf = TRUE, nan = NA) {
      # Check we have a `data.table` object
      if (!(inherits(y, "data.table") || inherits(y, "epi_archive"))) {
-        Abort("`y` must be of class `data.table` or `epi_archive`.") 
+        Abort("`y` must be of class `data.table` or `epi_archive`.")
      }

      # Use the data.table merge function, carrying through ... args
@@ -250,25 +251,25 @@ epi_archive =

        # Important: use nafill and not setnafill because the latter
        # returns the entire data frame by reference, and the former can
-        # be set to act on particular columns by reference using := 
+        # be set to act on particular columns by reference using :=
        self$DT[,
-                (cols) := nafill(.SD, type = "locf", nan = nan), 
-                .SDcols = cols, 
+                (cols) := nafill(.SD, type = "locf", nan = nan),
+                .SDcols = cols,
                by = by]
      }
-    },    
+    },
    #####
    #' @description Slides a given function over variables in an `epi_archive`
    #' object. See the documentation for the wrapper function `epix_slide()` for
-    #' details. 
+    #' details.
    #' @importFrom data.table key
    #' @importFrom rlang !! !!! enquo enquos is_quosure sym syms
-    slide = function(f, ..., n = 7, group_by, ref_time_values, 
+    slide = function(f, ..., n = 7, group_by, ref_time_values,
                     time_step, new_col_name = "slide_value",
                     as_list_col = FALSE, names_sep = "_",
-                     all_rows = FALSE) { 
+                     all_rows = FALSE) {
      # If missing, then set ref time values to be everything; else make
-      # sure we intersect with observed time values 
+      # sure we intersect with observed time values
      if (missing(ref_time_values)) {
        ref_time_values = unique(self$DT$time_value)
      }
@@ -276,16 +277,16 @@ epi_archive =
        ref_time_values = ref_time_values[ref_time_values %in%
                                          unique(self$DT$time_value)]
      }
-      
+
      # If a custom time step is specified, then redefine units 
+
      # If a custom time step is specified, then redefine units
      before_num = n-1
      if (!missing(time_step)) before_num = time_step(n-1)
-      
+
      # What to group by? If missing, set according to internal keys
      if (missing(group_by)) {
        group_by = setdiff(key(self$DT), c("time_value", "version"))
      }
-      
+
      # Symbolize column name, defuse grouping variables.
We have to do # the middle step here which is a bit complicated (unfortunately) # since the function epix_slide() could have called the current one, @@ -297,20 +298,20 @@ epi_archive = # Key variable names, apart from time value and version key_vars = setdiff(key(self$DT), c("time_value", "version")) - + # Computation for one group, one time value comp_one_grp = function(.data_group, - f, ..., + f, ..., time_value, key_vars, new_col) { - # Carry out the specified computation + # Carry out the specified computation comp_value = f(.data_group, ...) # Count the number of appearances of the reference time value. # Note: ideally, we want to directly count occurrences of the ref # time value but due to latency, this will often not appear in the - # data group. So we count the number of unique key values, outside + # data group. So we count the number of unique key values, outside # of the time value column count = sum(!duplicated(.data_group[, key_vars])) @@ -344,23 +345,23 @@ epi_archive = else { Abort("The slide computation must return an atomic vector or a data frame.") } - + # Note that we've already recycled comp value to make size stable, # so tibble() will just recycle time value appropriately - return(tibble::tibble(time_value = time_value, + return(tibble::tibble(time_value = time_value, !!new_col := comp_value)) } - + # If f is not missing, then just go ahead, slide by group if (!missing(f)) { if (rlang::is_formula(f)) f = rlang::as_function(f) - + x = purrr::map_dfr(ref_time_values, function(t) { self$as_of(t, min_time_value = t - before_num) %>% - tibble::as_tibble() %>% + tibble::as_tibble() %>% dplyr::group_by(!!!group_by) %>% dplyr::group_modify(comp_one_grp, - f = f, ..., + f = f, ..., time_value = t, key_vars = key_vars, new_col = new_col, @@ -378,14 +379,14 @@ epi_archive = if (length(quos) > 1) { Abort("If `f` is missing then only a single computation can be specified via `...`.") } - + quo = quos[[1]] f = function(x, quo, ...) 
rlang::eval_tidy(quo, x) new_col = sym(names(rlang::quos_auto_name(quos))) x = purrr::map_dfr(ref_time_values, function(t) { self$as_of(t, min_time_value = t - before_num) %>% - tibble::as_tibble() %>% + tibble::as_tibble() %>% dplyr::group_by(!!!group_by) %>% dplyr::group_modify(comp_one_grp, f = f, quo = quo, @@ -396,12 +397,12 @@ epi_archive = dplyr::ungroup() }) } - + # Unnest if we need to if (!as_list_col) { x = tidyr::unnest(x, !!new_col, names_sep = names_sep) } - + # Join to get all rows, if we need to, then return if (all_rows) { cols = c(as.character(group_by), "time_value") @@ -412,7 +413,7 @@ epi_archive = } ) ) - + #' Convert to `epi_archive` format #' #' Converts a data frame, data table, or tibble into an `epi_archive` @@ -447,17 +448,37 @@ epi_archive = #' ``` #' #' @export +#' @examples +#' df <- data.frame (geo_value = c(replicate(2, "ca"), replicate(2, "fl")), +#' county = c(1, 3, 2, 5), +#' time_value = c("2020-06-01", +#' "2020-06-02", +#' "2020-06-01", +#' "2020-06-02"), +#' version = c("2020-06-02", +#' "2020-06-03", +#' "2020-06-02", +#' "2020-06-03"), +#' cases = c(1, 2, 3, 4), +#' cases_rate = c(0.01, 0.02, 0.01, 0.05)) +#' +#' x <- df %>% as_epi_archive(geo_type = "state", +#' time_type = "day", +#' other_keys = "county") as_epi_archive = function(x, geo_type, time_type, other_keys, - additional_metadata = list()) { - epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata) + additional_metadata = list()) { + epi_archive$new(x, geo_type, time_type, other_keys, additional_metadata) } #' Test for `epi_archive` format #' #' @param x An object. #' @return `TRUE` if the object inherits from `epi_archive`. -#' +#' #' @export +#' @examples +#' is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not epi_archive) +#' is_epi_archive(archive_cases_dv_subset) # TRUE is_epi_archive = function(x) { inherits(x, "epi_archive") } diff --git a/R/correlation.R b/R/correlation.R index 0643e018..62d024bd 100644 --- a/R/correlation.R +++ b/R/correlation.R @@ -40,6 +40,33 @@ #' @importFrom rlang .data !! !!! enquo syms #' @importFrom tidyselect eval_select #' @export +#' @examples +#' +#' # linear association of case and death rates on any given day +#' epi_cor(x = jhu_csse_daily_subset, +#' var1 = case_rate_7d_av, +#' var2 = death_rate_7d_av, +#' cor_by = "time_value") +#' +#' # correlation of death rates and lagged case rates +#' epi_cor(x = jhu_csse_daily_subset, +#' var1 = case_rate_7d_av, +#' var2 = death_rate_7d_av, +#' cor_by = time_value, +#' dt1 = -2) +#' +#' # correlation grouped by location +#' epi_cor(x = jhu_csse_daily_subset, +#' var1 = case_rate_7d_av, +#' var2 = death_rate_7d_av, +#' cor_by = geo_value) +#' +#' # correlation grouped by location and incorporates lagged cases rates +#' epi_cor(x = jhu_csse_daily_subset, +#' var1 = case_rate_7d_av, +#' var2 = death_rate_7d_av, +#' cor_by = geo_value, +#' dt1 = -2) epi_cor = function(x, var1, var2, dt1 = 0, dt2 = 0, shift_by = geo_value, cor_by = geo_value, use = "na.or.complete", method = c("pearson", "kendall", "spearman")) { diff --git a/R/data.R b/R/data.R new file mode 100644 index 00000000..2edef89b --- /dev/null +++ b/R/data.R @@ -0,0 +1,105 @@ +#' Subset of JHU daily cases and deaths from California, Florida, Texas, New York, Georgia, and Pennsylvania +#' +#' This data source of confirmed COVID-19 cases and deaths +#' is based on reports made available by the Center for +#' Systems Science and Engineering at Johns Hopkins University. 
+#' This example data ranges from Mar 1, 2020 to Dec 31, 2021, and is limited to California, Florida, Texas, New York, Georgia, and Pennsylvania. +#' +#' @format A tibble with 4026 rows and 6 variables: +#' \describe{ +#' \item{geo_value}{the geographic value associated with each row of measurements.} +#' \item{time_value}{the time value associated with each row of measurements.} +#' \item{case_rate_7d_av}{7-day average signal of number of new confirmed COVID-19 cases per 100,000 population, daily} +#' \item{death_rate_7d_av}{7-day average signal of number of new confirmed deaths due to COVID-19 per 100,000 population, daily} +#' \item{cases}{Number of new confirmed COVID-19 cases, daily} +#' \item{cases_7d_av}{7-day average signal of number of new confirmed COVID-19 cases, daily} +#' } +#' @source This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the +#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} +#' by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. +#' Copyright Johns Hopkins University 2020. +#' +#' Modifications: +#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: These signals are taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes. The 7-day average signals are computed by Delphi by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive. +#' * Furthermore, the data has been limited to a very small number of rows, the signal names slightly altered, and formatted into a tibble. +"jhu_csse_daily_subset" + + +#' Subset of daily doctor visits and cases from California, Florida, Texas, and New York in archive format +#' +#' This data source is based on information about outpatient visits, +#' provided to us by health system partners, and also contains confirmed +#' COVID-19 cases based on reports made available by the Center for +#' Systems Science and Engineering at Johns Hopkins University. +#' This example data ranges from June 1, 2020 to Dec 1, 2021, and is also limited to California, Florida, Texas, and New York. +#' +#' @format An `epi_archive` data format. The data table DT has 129,638 rows and 5 columns: +#' \describe{ +#' \item{geo_value}{the geographic value associated with each row of measurements.} +#' \item{time_value}{the time value associated with each row of measurements.} +#' \item{version}{the time value specifying the version for each row of measurements. 
}
+#'   \item{percent_cli}{percentage of doctor's visits with CLI (COVID-like illness) computed from medical insurance claims}
+#'   \item{case_rate_7d_av}{7-day average signal of number of new confirmed COVID-19 cases per 100,000 population, daily}
+#' }
+#' @source
+#' This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the
+#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
+#' by Johns Hopkins University on behalf of its Center for Systems Science and Engineering.
+#' Copyright Johns Hopkins University 2020.
+#'
+#' Modifications:
+#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html}{From the COVIDcast Epidata Doctor Visits API}: The `percent_cli` signal is computed by Delphi from medical insurance claims data on outpatient visits provided by health system partners; the case rate signal is taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository}. The 7-day average signals are computed by Delphi by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive.
+#' * Furthermore, the data has been limited to a very small number of rows, the signal names slightly altered, and formatted into a tibble.
+"archive_cases_dv_subset"
+
+
+#' Subset of JHU daily cases from California and Florida
+#'
+#' This data source of confirmed COVID-19 cases
+#' is based on reports made available by the Center for
+#' Systems Science and Engineering at Johns Hopkins University.
+#' This example data is a snapshot as of Oct 28, 2021 and captures the cases from June 1, 2020 to May 31, 2021
+#' and is limited to California and Florida.
+#'
+#' @format A tibble with 730 rows and 3 variables:
+#' \describe{
+#'   \item{geo_value}{the geographic value associated with each row of measurements.}
+#'   \item{time_value}{the time value associated with each row of measurements.}
+#'   \item{cases}{Number of new confirmed COVID-19 cases, daily}
+#' }
+#' @source This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the
+#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
+#' by the Johns Hopkins University on behalf of its Center for Systems Science and Engineering.
+#' Copyright Johns Hopkins University 2020.
+#'
+#' Modifications:
+#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}:
+#' These signals are taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes.
+#' * Furthermore, the data has been limited to a very small number of rows, the signal names slightly altered, and formatted into a tibble.
+"incidence_num_outlier_example" + +#' Subset of JHU daily cases from counties in Massachusetts and Vermont +#' +#' This data source of confirmed COVID-19 cases and deaths +#' is based on reports made available by the Center for +#' Systems Science and Engineering at Johns Hopkins University. +#' This example data ranges from Mar 1, 2020 to Dec 31, 2021, and is limited to Massachusetts and Vermont. +#' +#' @format A tibble with 16,212 rows and 5 variables: +#' \describe{ +#' \item{geo_value}{the geographic value associated with each row of measurements.} +#' \item{time_value}{the time value associated with each row of measurements.} +#' \item{cases}{Number of new confirmed COVID-19 cases, daily} +#' \item{county_name}{the name of the county} +#' \item{state_name}{the full name of the state} +#' } +#' @source This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the +#' \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license} +#' by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. +#' Copyright Johns Hopkins University 2020. +#' +#' Modifications: +#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: These signals are taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes. The 7-day average signals are computed by Delphi by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive. +#' * Furthermore, the data has been limited to a very small number of rows, the signal names slightly altered, and formatted into a tibble. + +"jhu_csse_county_level_subset" \ No newline at end of file diff --git a/R/epi_df.R b/R/epi_df.R index fb78ac1f..50b9a898 100644 --- a/R/epi_df.R +++ b/R/epi_df.R @@ -91,6 +91,7 @@ NULL #' guide](https://cmu-delphi.github.io/epiprocess/articles/epiprocess.html) for #' examples. #' +#' @param x A data.frame, [tibble::tibble], or [tsibble::tsibble] to be converted #' @param geo_type Type for the geo values. If missing, then the function will #' attempt to infer it from the geo values present; if this fails, then it #' will be set to "custom". diff --git a/R/methods2.R b/R/methods-epi_archive.R similarity index 85% rename from R/methods2.R rename to R/methods-epi_archive.R index 1aaa68a8..eb1544f1 100644 --- a/R/methods2.R +++ b/R/methods-epi_archive.R @@ -5,6 +5,7 @@ #' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for #' examples. #' +#' @param x An `epi_archive` object #' @param max_version Time value specifying the max version to permit in the #' snapshot. That is, the snapshot will comprise the unique rows of the #' current archive data that represent the most up-to-date signal values, as @@ -16,7 +17,7 @@ #' @return An `epi_df` object. 
#' #' @details This is simply a wrapper around the `as_of()` method of the -#' `epi_archive` class, so if `x` is an `epi_archive` object, then: +#' `epi_archive` class, so if `x` is an `epi_archive` object, then: #' ``` #' epix_as_of(x, max_version = v) #' ``` @@ -24,8 +25,15 @@ #' ``` #' x$as_of(max_version = v) #' ``` +#' +#' @export +#' @examples +#' # warning message of data latency shown +#' epix_as_of(x = archive_cases_dv_subset, +#' max_version = max(archive_cases_dv_subset$DT$version)) #' -#' @export +#' # no warning shown +#' epix_as_of(archive_cases_dv_subset, max_version = as.Date("2020-06-10")) epix_as_of = function(x, max_version, min_time_value = -Inf) { if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") return(x$as_of(max_version, min_time_value)) @@ -54,7 +62,7 @@ epix_as_of = function(x, max_version, min_time_value = -Inf) { #' step? Default is `NA`, which means that they are treated as `NA` values; if # `NaN`, then they are treated as distinct. #' @return Nothing; the data table in `x` is overwritten with the merged one. -#' +#' #' @details This is simply a wrapper around the `merge()` method of the #' `epi_archive` class, so if `x` and `y` are an `epi_archive` objects, then: #' ``` @@ -64,8 +72,19 @@ epix_as_of = function(x, max_version, min_time_value = -Inf) { #' ``` #' x$merge(y) #' ``` -#' +#' #' @export +#' @examples +#' # create two example epi_archive datasets +#' x <- archive_cases_dv_subset$DT %>% +#' dplyr::select(geo_value,time_value,version,case_rate_7d_av) %>% +#' as_epi_archive() +#' y <- archive_cases_dv_subset$DT %>% +#' dplyr::select(geo_value,time_value,version,percent_cli) %>% +#' as_epi_archive() +#' +#' # a full join stored in x +#' epix_merge(x, y, all = TRUE) epix_merge = function(x, y, ..., locf = TRUE, nan = NA) { if (!inherits(x, "epi_archive")) Abort("`x` must be of class `epi_archive`.") return(x$merge(y, ..., locf = locf, nan = nan)) @@ -80,7 +99,8 @@ epix_merge = function(x, y, ..., locf = TRUE, nan = NA) { #' [archive #' vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html) for #' examples. -#' +#' +#' @param x An `epi_archive` object. #' @param f Function or formula to slide over variables in `x`. To "slide" means #' to apply a function or formula over a running window of `n` time steps #' (where one time step is typically one day or one week). If a function, `f` @@ -114,7 +134,7 @@ epix_merge = function(x, y, ..., locf = TRUE, nan = NA) { #' contain the derivative values. Default is "slide_value"; note that setting #' `new_col_name` equal to an existing column name will overwrite this column. #' @param as_list_col Should the new column be stored as a list column? Default -#' is `FALSE`, in which case a list object returned by `f` would be unnested +#' is `FALSE`, in which case a list object returned by `f` would be unnested #' (using `tidyr::unnest()`), and the names of the resulting columns are given #' by prepending `new_col_name` to the names of the list elements. #' @param names_sep String specifying the separator to use in `tidyr::unnest()` @@ -130,13 +150,13 @@ epix_merge = function(x, y, ..., locf = TRUE, nan = NA) { #' values. #' #' @details Two key distinctions between inputs to the current function and -#' `epi_slide()`: +#' `epi_slide()`: #' 1. `epix_slide()` uses windows that are **always right-aligned** (in #' `epi_slide()`, custom alignments could be specified using the `align` or #' `before` arguments). #' 2. 
`epix_slide()` uses a `group_by` to specify the grouping upfront (in
#'   `epi_slide()`, this would be accomplished by a preceding function call to
-#'   `dplyr::group_by()`). 
+#'   `dplyr::group_by()`).
#' Apart from this, the interfaces between `epix_slide()` and `epi_slide()` are
#' the same.
#'
@@ -144,7 +164,7 @@ epix_merge = function(x, y, ..., locf = TRUE, nan = NA) {
#' returns the grouping variables, `time_value`, and the new columns from
#' sliding, whereas `epi_slide()` returns all original variables plus the new
#' columns from sliding.
-#' 
+#'
#' Furthermore, the current function can be considerably slower than
#' `epi_slide()`, for two reasons: (1) it must repeatedly fetch
#' properly-versioned snapshots from the data archive (via its `as_of()`
#' method), and (2) it performs a "manual" sliding of sorts
@@ -154,7 +174,7 @@ epix_merge = function(x, y, ..., locf = TRUE, nan = NA) {
#' version-aware sliding is necessary (as is its purpose).
#'
#' Finally, this is simply a wrapper around the `slide()` method of the
-#' `epi_archive` class, so if `x` is an `epi_archive` object, then: 
+#' `epi_archive` class, so if `x` is an `epi_archive` object, then:
#' ```
#' epix_slide(x, new_var = comp(old_var), n = 120)
#' ```
@@ -162,9 +182,26 @@ epix_merge = function(x, y, ..., locf = TRUE, nan = NA) {
#' ```
#' x$slide(x, new_var = comp(old_var), n = 120)
#' ```
-#' 
+#'
#' @importFrom rlang enquo
#' @export
+#' @examples
+#' # These dates are reference time points for the 3-day average sliding window.
+#' # The resulting epi_archive ends up including data averaged from:
+#' # 0 days, which has no results, for 2020-06-01
+#' # 1 day, for 2020-06-02
+#' # 2 days, for the rest of the results
+#' # never 3 days, due to data latency
+#'
+#' time_values <- seq(as.Date("2020-06-01"),
+#'                    as.Date("2020-06-15"),
+#'                    by = "1 day")
+#' epix_slide(x = archive_cases_dv_subset,
+#'            f = ~ mean(.x$case_rate_7d_av),
+#'            n = 3,
+#'            group_by = geo_value,
+#'            ref_time_values = time_values,
+#'            new_col_name = 'case_rate_3d_av')
epix_slide = function(x, f, ..., n = 7, group_by, ref_time_values,
                      time_step, new_col_name = "slide_value",
                      as_list_col = FALSE, names_sep = "_", all_rows = FALSE) {
diff --git a/R/methods.R b/R/methods-epi_df.R
similarity index 58%
rename from R/methods.R
rename to R/methods-epi_df.R
index 45aea4a4..1bf1dfb5 100644
--- a/R/methods.R
+++ b/R/methods-epi_df.R
@@ -4,15 +4,18 @@
#' `time_value`, and the key variables taken to be `geo_value` along with any
#' others in the `other_keys` field of the metadata, or else explicitly set.
#'
-#' @importFrom tsibble as_tsibble
#' @method as_tsibble epi_df
+#' @param x The `epi_df` object.
+#' @param key Optional. Any additional keys (other than `geo_value`) to add to
+#'   the `tsibble`.
+#' @param ... additional arguments passed on to `tsibble::as_tsibble()`
#' @export
as_tsibble.epi_df = function(x, key, ...) {
  if (missing(key)) key = c("geo_value", attributes(x)$metadata$other_keys)
  return(as_tsibble(tibble::as_tibble(x), key, index = "time_value", ...))
}

-#' S3 methods for an `epi_df` object
+#' Base S3 methods for an `epi_df` object
#'
#' Print, summary, and `dplyr` verbs (that preserve class and attributes) for an
#' `epi_df` object.
@@ -45,21 +48,22 @@ print.epi_df = function(x, ...) {
#' @importFrom rlang .data
#' @importFrom stats median
#' @export
-summary.epi_df = function(x, ...) {
+summary.epi_df = function(object, ...)
{ cat("An `epi_df` x, with metadata:\n") - cat(sprintf("* %-9s = %s\n", "geo_type", attributes(x)$metadata$geo_type)) - cat(sprintf("* %-9s = %s\n", "time_type", attributes(x)$metadata$time_type)) - cat(sprintf("* %-9s = %s\n", "as_of", attributes(x)$metadata$as_of)) + cat(sprintf("* %-9s = %s\n", "geo_type", attributes(object)$metadata$geo_type)) + cat(sprintf("* %-9s = %s\n", "time_type", attributes(object)$metadata$time_type)) + cat(sprintf("* %-9s = %s\n", "as_of", attributes(object)$metadata$as_of)) cat("----------\n") - cat(sprintf("* %-27s = %s\n", "min time value", min(x$time_value))) - cat(sprintf("* %-27s = %s\n", "max time value", max(x$time_value))) + cat(sprintf("* %-27s = %s\n", "min time value", min(object$time_value))) + cat(sprintf("* %-27s = %s\n", "max time value", max(object$time_value))) cat(sprintf("* %-27s = %i\n", "average rows per time value", - as.integer(x %>% dplyr::group_by(.data$time_value) %>% + as.integer(object %>% dplyr::group_by(.data$time_value) %>% dplyr::summarize(num = dplyr::n()) %>% dplyr::summarize(mean(.data$num))))) } #' @method head epi_df +#' @importFrom utils head #' @export #' @noRd head.epi_df = function(x, ...) { @@ -67,6 +71,7 @@ head.epi_df = function(x, ...) { } #' @method tail epi_df +#' @importFrom utils tail #' @export #' @noRd tail.epi_df = function(x, ...) { @@ -75,91 +80,83 @@ tail.epi_df = function(x, ...) { #' `dplyr` verbs #' -#' `dplyr` verbs for `epi_df` objexts, preserving class and attributes. +#' `dplyr` verbs for `epi_df` objects, preserving class and attributes. #' #' @method arrange epi_df +#' @param .data The `epi_df` object. #' @rdname print.epi_df -#' @importFrom dplyr arrange #' @export -arrange.epi_df = function(x, ...) { - metadata = attributes(x)$metadata - x = NextMethod() - reclass(x, metadata) +arrange.epi_df = function(.data, ...) { + metadata = attributes(.data)$metadata + .data = NextMethod() + reclass(.data, metadata) } #' @method filter epi_df #' @rdname print.epi_df -#' @importFrom dplyr filter #' @export -filter.epi_df = function(x, ...) { - metadata = attributes(x)$metadata - x = NextMethod() - reclass(x, metadata) +filter.epi_df = function(.data, ...) { + metadata = attributes(.data)$metadata + .data = NextMethod() + reclass(.data, metadata) } #' @method group_by epi_df #' @rdname print.epi_df -#' @importFrom dplyr group_by #' @export -group_by.epi_df = function(x, ...) { - metadata = attributes(x)$metadata - x = NextMethod() - reclass(x, metadata) +group_by.epi_df = function(.data, ...) { + metadata = attributes(.data)$metadata + .data = NextMethod() + reclass(.data, metadata) } #' @method group_modify epi_df #' @rdname print.epi_df -#' @importFrom dplyr group_modify #' @export -group_modify.epi_df = function(x, ...) { - metadata = attributes(x)$metadata - x = NextMethod() - reclass(x, metadata) +group_modify.epi_df = function(.data, ...) { + metadata = attributes(.data)$metadata + .data = NextMethod() + reclass(.data, metadata) } #' @method mutate epi_df #' @rdname print.epi_df -#' @importFrom dplyr mutate #' @export -mutate.epi_df = function(x, ...) { - metadata = attributes(x)$metadata - x = NextMethod() - reclass(x, metadata) +mutate.epi_df = function(.data, ...) { + metadata = attributes(.data)$metadata + .data = NextMethod() + reclass(.data, metadata) } #' @method relocate epi_df #' @rdname print.epi_df -#' @importFrom dplyr relocate #' @export -relocate.epi_df = function(x, ...) 
{ - metadata = attributes(x)$metadata - x = NextMethod() - reclass(x, metadata) +relocate.epi_df = function(.data, ...) { + metadata = attributes(.data)$metadata + .data = NextMethod() + reclass(.data, metadata) } #' @method rename epi_df #' @rdname print.epi_df -#' @importFrom dplyr rename #' @export -rename.epi_df = function(x, ...) { - metadata = attributes(x)$metadata - x = NextMethod() - reclass(x, metadata) +rename.epi_df = function(.data, ...) { + metadata = attributes(.data)$metadata + .data = NextMethod() + reclass(.data, metadata) } #' @method slice epi_df #' @rdname print.epi_df -#' @importFrom dplyr slice #' @export -slice.epi_df = function(x, ...) { - metadata = attributes(x)$metadata - x = NextMethod() - reclass(x, metadata) +slice.epi_df = function(.data, ...) { + metadata = attributes(.data)$metadata + .data = NextMethod() + reclass(.data, metadata) } #' @method ungroup epi_df #' @rdname print.epi_df -#' @importFrom dplyr ungroup #' @export ungroup.epi_df = function(x, ...) { metadata = attributes(x)$metadata @@ -169,12 +166,12 @@ ungroup.epi_df = function(x, ...) { #' @method unnest epi_df #' @rdname print.epi_df -#' @importFrom tidyr unnest +#' @param data The `epi_df` object. #' @export -unnest.epi_df = function(x, ...) { - metadata = attributes(x)$metadata - x = NextMethod() - reclass(x, metadata) +unnest.epi_df = function(data, ...) { + metadata = attributes(data)$metadata + data = NextMethod() + reclass(data, metadata) } # Simple reclass function diff --git a/R/outliers.R b/R/outliers.R index 4acc52ab..6cc2ffb1 100644 --- a/R/outliers.R +++ b/R/outliers.R @@ -1,11 +1,11 @@ -#' Detect outliers +#' Detect outliers #' #' Applies one or more outlier detection methods to a given signal variable, and #' optionally aggregates the outputs to create a consensus result. See the #' [outliers #' vignette](https://cmu-delphi.github.io/epiprocess/articles/outliers.html) for -#' examples. -#' +#' examples. +#' #' @param x Design points corresponding to the signal values `y`. Default is #' `seq_along(y)` (that is, equally-spaced points from 1 to the length of #' `y`). @@ -13,11 +13,11 @@ #' @param methods A tibble specifying the method(s) to use for outlier #' detection, with one row per method, and the following columns: #' * `method`: Either "rm" or "stl", or a custom function for outlier -#' detection; see details for further explanation. +#' detection; see details for further explanation. #' * `args`: Named list of arguments that will be passed to the detection -#' method. +#' method. #' * `abbr`: Abbreviation to use in naming output columns with results from -#' this method. +#' this method. #' @param combiner String, one of "median", "mean", or "none", specifying how to #' combine results from different outlier detection methods for the thresholds #' determining whether a particular observation is classified as an outlier, @@ -27,7 +27,7 @@ #' purposes of determining whether a given observation is an outlier. #' @return An tibble with number of rows equal to `length(y)` and columns giving #' the outlier detection thresholds and replacement values from each detection -#' method. +#' method. #' #' @details Each outlier detection method, one per row of the passed `methods` #' tibble, is a function that must take as its first two arguments `x` and @@ -42,8 +42,35 @@ #' `detect_outlr_rm()`, which detects outliers via a rolling median; or by #' "stl", shorthand for `detect_outlr_stl()`, which detects outliers via an #' STL decomposition. 
-#' +#' #' @export +#' @importFrom dplyr select +#' @examples +#' detection_methods = dplyr::bind_rows( +#' dplyr::tibble(method = "rm", +#' args = list(list(detect_negatives = TRUE, +#' detection_multiplier = 2.5)), +#' abbr = "rm"), +#' dplyr::tibble(method = "stl", +#' args = list(list(detect_negatives = TRUE, +#' detection_multiplier = 2.5, +#' seasonal_period = 7)), +#' abbr = "stl_seasonal"), +#' dplyr::tibble(method = "stl", +#' args = list(list(detect_negatives = TRUE, +#' detection_multiplier = 2.5, +#' seasonal_period = NULL)), +#' abbr = "stl_nonseasonal")) +#' +#' x <- incidence_num_outlier_example %>% +#' dplyr::select(geo_value,time_value,cases) %>% +#' as_epi_df() %>% +#' group_by(geo_value) %>% +#' mutate(outlier_info = detect_outlr( +#' x = time_value, y = cases, +#' methods = detection_methods, +#' combiner = "median")) %>% +#' unnest(outlier_info) detect_outlr = function(x = seq_along(y), y, methods = tibble::tibble(method = "rm", args = list(list()), @@ -51,7 +78,7 @@ detect_outlr = function(x = seq_along(y), y, combiner = c("median", "mean", "none")) { # Validate combiner combiner = match.arg(combiner) - + # Validate that x contains all distinct values if (any(duplicated(x))) { Abort("`x` cannot contain duplicate values. (If being run on a column in an `epi_df`, did you group by relevant key variables?)") @@ -60,7 +87,7 @@ detect_outlr = function(x = seq_along(y), y, # Run all outlier detection methods results = purrr::pmap_dfc(methods, function(method, args, abbr) { if (is.character(method)) method = paste0("detect_outlr_", method) - + # Call the method results = do.call(method, args = c(list("x" = x, "y" = y), args)) @@ -71,15 +98,15 @@ detect_outlr = function(x = seq_along(y), y, } # Update column names with model abbreviation - colnames(results) = paste(abbr, colnames(results), sep = "_") + colnames(results) = paste(abbr, colnames(results), sep = "_") return(results) }) - + # Combine information about detected outliers if (combiner != "none") { if (combiner == "mean") combine_fun = mean else if (combiner == "median") combine_fun = median - + for (target in c("lower", "upper", "replacement")) { results[[paste0("combined_", target)]] = apply( results %>% @@ -100,15 +127,15 @@ detect_outlr = function(x = seq_along(y), y, #' `seq_along(y)` (that is, equally-spaced points from 1 to the length of #' `y`). #' @param y Signal values. -#' @param n Number of time steps to use in the rolling window. Default is 21. +#' @param n Number of time steps to use in the rolling window. Default is 21. #' @param log_transform Should a log transform be applied before running outlier #' detection? Default is `FALSE`. If `TRUE`, and zeros are present, then the #' log transform will be padded by 1. #' @param detect_negatives Should negative values automatically count as #' outliers? Default is `FALSE`. #' @param detection_multiplier Value determining how far the outlier detection -#' thresholds are from the rolling median, which are calculated as (rolling -#' median) +/- (detection multiplier) * (rolling IQR). Default is 2. +#' thresholds are from the rolling median, which are calculated as (rolling +#' median) +/- (detection multiplier) * (rolling IQR). Default is 2. #' @param min_radius Minimum distance between rolling median and threshold, on #' transformed scale. Default is 0. #' @param replacement_multiplier Value determining how far the replacement @@ -120,6 +147,15 @@ detect_outlr = function(x = seq_along(y), y, #' `lower`, `upper`, and `replacement`. 
#' #' @export +#' @examples +#' # Detect outliers based on a rolling median +#' incidence_num_outlier_example %>% +#' dplyr::select(geo_value,time_value,cases) %>% +#' as_epi_df() %>% +#' group_by(geo_value) %>% +#' mutate(outlier_info = detect_outlr_rm( +#' x = time_value, y = cases)) %>% +#' unnest(outlier_info) detect_outlr_rm = function(x = seq_along(y), y, n = 21, log_transform = FALSE, detect_negatives = FALSE, @@ -140,7 +176,7 @@ detect_outlr_rm = function(x = seq_along(y), y, n = 21, # Make an epi_df for easy sliding z = as_epi_df(tibble::tibble(geo_value = 0, time_value = x, y = y)) - + # Calculate lower and upper thresholds and replacement value z = z %>% epi_slide(fitted = median(y), n = n, align = "center") %>% @@ -161,9 +197,9 @@ detect_outlr_rm = function(x = seq_along(y), y, n = 21, return(z) } -#' Detect outliers based on an STL decomposition +#' Detect outliers based on an STL decomposition #' -#' Detects outliers based on a seasonal-trend decomposition using LOESS (STL). +#' Detects outliers based on a seasonal-trend decomposition using LOESS (STL). #' #' @param x Design points corresponding to the signal values `y`. Default is #' `seq_along(y)` (that is, equally-spaced points from 1 to the length of @@ -172,14 +208,29 @@ detect_outlr_rm = function(x = seq_along(y), y, n = 21, #' @param n_trend Number of time steps to use in the rolling window for trend. #' Default is 21. #' @param n_seasonal Number of time steps to use in the rolling window for -#' seasonality. Default is 21. +#' seasonality. Default is 21. #' @param n_threshold Number of time steps to use in rolling window for the IQR #' outlier thresholds. #' @param seasonal_period Integer specifying period of seasonality. For example, #' for daily data, a period 7 means weekly seasonality. The default is `NULL`, #' meaning that no seasonal term will be included in the STL decomposition. +#' @param log_transform Should a log transform be applied before running outlier +#' detection? Default is `FALSE`. If `TRUE`, and zeros are present, then the +#' log transform will be padded by 1. +#' @param detect_negatives Should negative values automatically count as +#' outliers? Default is `FALSE`. +#' @param detection_multiplier Value determining how far the outlier detection +#' thresholds are from the rolling median, which are calculated as (rolling +#' median) +/- (detection multiplier) * (rolling IQR). Default is 2. +#' @param min_radius Minimum distance between rolling median and threshold, on +#' transformed scale. Default is 0. +#' @param replacement_multiplier Value determining how far the replacement +#' values are from the rolling median. The replacement is the original value +#' if it is within the detection thresholds, or otherwise it is rounded to the +#' nearest (rolling median) +/- (replacement multiplier) * (rolling IQR). +#' Default is 0. #' @return A tibble with number of rows equal to `length(y)`, and columns -#' `lower`, `upper`, and `replacement`. +#' `lower`, `upper`, and `replacement`. #' #' @details The STL decomposition is computed using the `feasts` package. Once #' computed, the outlier detection method is analogous to the rolling median @@ -188,12 +239,21 @@ detect_outlr_rm = function(x = seq_along(y), y, n = 21, #' residuals to the rolling median, respectively. #' #' The last set of arguments, `log_transform` through `replacement_multiplier`, -#' are exactly as in `detect_outlr_rm()`; refer to its help file for their -#' description. +#' are exactly as in `detect_outlr_rm()`. 
#' #' @importFrom stats median #' @importFrom tidyselect starts_with #' @export +#' @examples +#' # Detects outliers based on a seasonal-trend decomposition using LOESS +#' incidence_num_outlier_example %>% +#' dplyr::select(geo_value,time_value,cases) %>% +#' as_epi_df() %>% +#' group_by(geo_value) %>% +#' mutate(outlier_info = detect_outlr_stl( +#' x = time_value, y = cases, +#' seasonal_period = 7 )) %>% # weekly seasonality for daily data +#' unnest(outlier_info) detect_outlr_stl = function(x = seq_along(y), y, n_trend = 21, n_seasonal = 21, @@ -211,10 +271,10 @@ detect_outlr_stl = function(x = seq_along(y), y, offset = as.integer(any(y == 0)) y = log(y + offset) } - + # Make a tsibble for fabletools, setup and run STL z_tsibble = tsibble::tsibble(x = x, y = y, index = x) - + stl_formula = y ~ trend(window = n_trend) + season(period = seasonal_period, window = n_seasonal) @@ -225,8 +285,8 @@ detect_outlr_stl = function(x = seq_along(y), y, dplyr::select(trend:remainder) %>% dplyr::rename_with(~ "seasonal", tidyselect::starts_with("season")) %>% dplyr::rename(resid = remainder) - - # Allocate the seasonal term from STL to either fitted or resid + + # Allocate the seasonal term from STL to either fitted or resid if (!is.null(seasonal_period)) { stl_components = stl_components %>% dplyr::mutate( @@ -241,17 +301,17 @@ detect_outlr_stl = function(x = seq_along(y), y, # Detect negatives if requested if (detect_negatives && !log_transform) min_lower = 0 else min_lower = -Inf - + # Make an epi_df for easy sliding z = as_epi_df(tibble::tibble(geo_value = 0, time_value = x, y = y)) - - # Calculate lower and upper thresholds and replacement value + + # Calculate lower and upper thresholds and replacement value z = z %>% dplyr::mutate( fitted = stl_components$fitted, resid = stl_components$resid) %>% roll_iqr(n = n_threshold, - detection_multiplier = detection_multiplier, + detection_multiplier = detection_multiplier, min_radius = min_radius, replacement_multiplier = replacement_multiplier, min_lower = min_lower) @@ -271,8 +331,8 @@ roll_iqr = function(z, n, detection_multiplier, min_radius, replacement_multiplier, min_lower) { if (typeof(z$y) == "integer") as_type = as.integer else as_type = as.numeric - - epi_slide(z, roll_iqr = IQR(resid), n = n, align = "center") %>% + + epi_slide(z, roll_iqr = stats::IQR(resid), n = n, align = "center") %>% dplyr::mutate( lower = pmax(min_lower, fitted - pmax(min_radius, detection_multiplier * roll_iqr)), diff --git a/R/reexports.R b/R/reexports.R new file mode 100644 index 00000000..4cc45e29 --- /dev/null +++ b/R/reexports.R @@ -0,0 +1,57 @@ +# NOTE: when providing a method for a generic in another package +# That generic needs to be rexported + + +# tsibble ----------------------------------------------------------------- + +#' @importFrom tsibble as_tsibble +#' @export +tsibble::as_tsibble + + + + +# dplyr ------------------------------------------------------------------- + +#' @importFrom dplyr arrange +#' @export +dplyr::arrange + +#' @importFrom dplyr filter +#' @export +dplyr::filter + +#' @importFrom dplyr group_by +#' @export +dplyr::group_by + +#' @importFrom dplyr ungroup +#' @export +dplyr::ungroup + +#' @importFrom dplyr group_modify +#' @export +dplyr::group_modify + +#' @importFrom dplyr mutate +#' @export +dplyr::mutate + +#' @importFrom dplyr relocate +#' @export +dplyr::relocate + +#' @importFrom dplyr rename +#' @export +dplyr::rename + +#' @importFrom dplyr slice +#' @export +dplyr::slice + + +# tidyr 
------------------------------------------------------------------- + +#' @importFrom tidyr unnest +#' @export +tidyr::unnest diff --git a/R/slide.R b/R/slide.R index ce563187..5847b130 100644 --- a/R/slide.R +++ b/R/slide.R @@ -88,6 +88,25 @@ #' @importFrom lubridate days weeks #' @importFrom rlang .data .env !! enquo enquos sym #' @export +#' @examples +#' # slide a 7-day trailing average formula on cases +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' epi_slide(cases_7dav = mean(cases), n = 7, +#' align = "right") +#' +#' # slide a left-aligned 7-day average +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' epi_slide(cases_7dav = mean(cases), n = 7, +#' align = "left") +#' +#' # nested new columns +#' jhu_csse_daily_subset %>% +#' group_by(geo_value) %>% +#' epi_slide(a = data.frame(cases_2dav = mean(cases), +#' cases_2dma = mad(cases)), +#' n = 2, as_list_col = TRUE) epi_slide = function(x, f, ..., n = 7, ref_time_values, align = c("right", "center", "left"), before, time_step, new_col_name = "slide_value", as_list_col = FALSE, diff --git a/R/utils.R b/R/utils.R index cf483ede..4a0b3b72 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,5 +1,5 @@ break_str = function(str, nchar = 79, init = "") { - str = paste(strwrap(str, nchar, init = init), collapse = "\n") + str = paste(strwrap(str, nchar, initial = init), collapse = "\n") str[1] = substring(str, nchar(init)+1) return(str) } @@ -34,7 +34,8 @@ guess_geo_type = function(geo_value) { geo_value = tolower(geo_value) # If all geo values are state abbreviations, then use "state" - state_values = c(tolower(state.abb), "as", "dc", "gu", "mp", "pr", "vi") + state_values = c(tolower(datasets::state.abb), + "as", "dc", "gu", "mp", "pr", "vi") if (all(geo_value %in% state_values)) return("state") # Else if all geo values are 2 letters, then use "nation" @@ -99,8 +100,7 @@ guess_time_type = function(time_value) { ########## -#' @export -#' @noRd + quiet = function(x) { sink(tempfile()) on.exit(sink()) diff --git a/README.md b/README.md index e4ed6933..a48ee8d3 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,10 @@ # epiprocess + + [![R-CMD-check](https://github.com/dajmcdon/epiprocess/workflows/R-CMD-check/badge.svg)](https://github.com/dajmcdon/epiprocess/actions) + + + This repo contains an R package `epiprocess` which introduces a common data structure for epidemiological data sets measured over space and time, and offers associated utilities to perform basic signal processing tasks. 
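
As a quick orientation to the pieces this PR adds before the data-raw scripts below, here is a minimal sketch of the workflow these changes enable, mirroring the roxygen examples added above (the snapshot date is illustrative):

```r
library(epiprocess)
library(dplyr)

# Trailing 7-day average of daily cases, computed per location; this mirrors
# the example added to epi_slide() in R/slide.R above.
jhu_csse_daily_subset %>%
  group_by(geo_value) %>%
  epi_slide(cases_7dav = mean(cases), n = 7, align = "right")

# Reconstruct the data as it would have looked on a given date, using the
# version-aware archive dataset added in this PR.
snapshot <- epix_as_of(archive_cases_dv_subset,
                       max_version = as.Date("2021-06-01"))
```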
diff --git a/data-raw/archive_cases_dv_subset.R b/data-raw/archive_cases_dv_subset.R
new file mode 100644
index 00000000..b1c1071c
--- /dev/null
+++ b/data-raw/archive_cases_dv_subset.R
@@ -0,0 +1,34 @@
+library(delphi.epidata)
+library(epiprocess)
+library(data.table)
+library(dplyr)
+
+archive_cases_dv_subset <- covidcast(
+  data_source = "doctor-visits",
+  signals = "smoothed_adj_cli",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20200601, 20211201),
+  geo_values = "ca,fl,ny,tx",
+  issues = epirange(20200601, 20211201)
+) %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, version = issue, percent_cli = value) %>%
+  as_epi_archive()
+
+case_rate_subset <- covidcast(
+  data_source = "jhu-csse",
+  signals = "confirmed_7dav_incidence_prop",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20200601, 20211201),
+  geo_values = "ca,fl,ny,tx",
+  issues = epirange(20200601, 20211201)
+) %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%
+  as_epi_archive()
+
+epix_merge(archive_cases_dv_subset, case_rate_subset, all = TRUE)
+
+usethis::use_data(archive_cases_dv_subset, overwrite = TRUE)
diff --git a/data-raw/incidence_num_outlier_example.R b/data-raw/incidence_num_outlier_example.R
new file mode 100644
index 00000000..992e9d89
--- /dev/null
+++ b/data-raw/incidence_num_outlier_example.R
@@ -0,0 +1,19 @@
+library(delphi.epidata)
+library(epiprocess)
+library(dplyr)
+library(tidyr)
+
+incidence_num_outlier_example <- covidcast(
+  data_source = "jhu-csse",
+  signals = "confirmed_incidence_num",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20200601, 20210531),
+  geo_values = "fl,nj",
+  as_of = 20211028
+) %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, cases = value) %>%
+  as_epi_df()
+
+usethis::use_data(incidence_num_outlier_example, overwrite = TRUE)
diff --git a/data-raw/jhu_csse_county_level_subset.R b/data-raw/jhu_csse_county_level_subset.R
new file mode 100644
index 00000000..a5cacd83
--- /dev/null
+++ b/data-raw/jhu_csse_county_level_subset.R
@@ -0,0 +1,25 @@
+library(delphi.epidata)
+library(covidcast)
+library(epiprocess)
+library(dplyr)
+
+# Use covidcast::county_census to get the county and state names
+y <- covidcast::county_census %>%
+  filter(STNAME %in% c("Massachusetts", "Vermont"), STNAME != CTYNAME) %>%
+  select(geo_value = FIPS, county_name = CTYNAME, state_name = STNAME)
+
+# Fetch only counties from Massachusetts and Vermont, then append the name columns as well
+jhu_csse_county_level_subset <- covidcast(
+  data_source = "jhu-csse",
+  signals = "confirmed_incidence_num",
+  time_type = "day",
+  geo_type = "county",
+  time_values = epirange(20200601, 20211231),
+  geo_values = paste(y$geo_value, collapse = ",")
+) %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, cases = value) %>%
+  full_join(y, by = "geo_value") %>%
+  as_epi_df()
+
+usethis::use_data(jhu_csse_county_level_subset, overwrite = TRUE)
diff --git a/data-raw/jhu_csse_daily_subset.R b/data-raw/jhu_csse_daily_subset.R
new file mode 100644
index 00000000..5aae3402
--- /dev/null
+++ b/data-raw/jhu_csse_daily_subset.R
@@ -0,0 +1,62 @@
+library(delphi.epidata)
+library(epiprocess)
+library(dplyr)
+
+confirmed_7dav_incidence_prop <- covidcast(
+  data_source = "jhu-csse",
+  signals = "confirmed_7dav_incidence_prop",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20200301, 20211231),
+  geo_values = "ca,fl,ny,tx,ga,pa"
+) %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value,
case_rate_7d_av = value) %>%
+  arrange(geo_value, time_value)
+
+deaths_7dav_incidence_prop <- covidcast(
+  data_source = "jhu-csse",
+  signals = "deaths_7dav_incidence_prop",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20200301, 20211231),
+  geo_values = "ca,fl,ny,tx,ga,pa"
+) %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, death_rate_7d_av = value) %>%
+  arrange(geo_value, time_value)
+
+confirmed_incidence_num <- covidcast(
+  data_source = "jhu-csse",
+  signals = "confirmed_incidence_num",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20200301, 20211231),
+  geo_values = "ca,fl,ny,tx,ga,pa"
+) %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, cases = value) %>%
+  arrange(geo_value, time_value)
+
+confirmed_7dav_incidence_num <- covidcast(
+  data_source = "jhu-csse",
+  signals = "confirmed_7dav_incidence_num",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20200301, 20211231),
+  geo_values = "ca,fl,ny,tx,ga,pa"
+) %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, cases_7d_av = value) %>%
+  arrange(geo_value, time_value)
+
+jhu_csse_daily_subset <- confirmed_7dav_incidence_prop %>%
+  full_join(deaths_7dav_incidence_prop,
+            by = c("geo_value", "time_value")) %>%
+  full_join(confirmed_incidence_num,
+            by = c("geo_value", "time_value")) %>%
+  full_join(confirmed_7dav_incidence_num,
+            by = c("geo_value", "time_value")) %>%
+  as_epi_df()
+
+usethis::use_data(jhu_csse_daily_subset, overwrite = TRUE)
diff --git a/data/archive_cases_dv_subset.rda b/data/archive_cases_dv_subset.rda
new file mode 100644
index 00000000..8dd7c126
Binary files /dev/null and b/data/archive_cases_dv_subset.rda differ
diff --git a/data/incidence_num_outlier_example.rda b/data/incidence_num_outlier_example.rda
new file mode 100644
index 00000000..e898b5ea
Binary files /dev/null and b/data/incidence_num_outlier_example.rda differ
diff --git a/data/jhu_csse_county_level_subset.rda b/data/jhu_csse_county_level_subset.rda
new file mode 100644
index 00000000..aca0983d
Binary files /dev/null and b/data/jhu_csse_county_level_subset.rda differ
diff --git a/data/jhu_csse_daily_subset.rda b/data/jhu_csse_daily_subset.rda
new file mode 100644
index 00000000..6d073eea
Binary files /dev/null and b/data/jhu_csse_daily_subset.rda differ
diff --git a/man/archive_cases_dv_subset.Rd b/man/archive_cases_dv_subset.Rd
new file mode 100644
index 00000000..68aa2001
--- /dev/null
+++ b/man/archive_cases_dv_subset.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{archive_cases_dv_subset}
+\alias{archive_cases_dv_subset}
+\title{Subset of daily doctor visits and cases from California, Florida, Texas, and New York in archive format}
+\format{
+An \code{epi_archive} object. Its data table \code{DT} has 129,638 rows and 5 columns:
+\describe{
+\item{geo_value}{the geographic value associated with each row of measurements.}
+\item{time_value}{the time value associated with each row of measurements.}
+\item{version}{the time value specifying the version for each row of measurements.
}
+\item{percent_cli}{percentage of doctor's visits with CLI (COVID-like illness) computed from medical insurance claims}
+\item{case_rate_7d_av}{7-day average signal of number of new confirmed COVID-19 cases per 100,000 population, daily}
+}
+}
+\source{
+This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the
+\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
+by Johns Hopkins University on behalf of its Center for Systems Science and Engineering.
+Copyright Johns Hopkins University 2020.
+
+Modifications:
+\itemize{
+\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html}{From the COVIDcast Epidata Doctor Visits API}: The \code{percent_cli} signal is computed by Delphi from medical insurance claims data. The case rate signal is taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes; its 7-day average is computed by Delphi by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive.
+\item Furthermore, the data has been limited to a very small number of rows, the signal names slightly altered, and formatted into a tibble.
+}
+}
+\usage{
+archive_cases_dv_subset
+}
+\description{
+This data source is based on information about outpatient visits,
+provided to us by health system partners, and also contains confirmed
+COVID-19 cases based on reports made available by the Center for
+Systems Science and Engineering at Johns Hopkins University.
+This example data ranges from June 1, 2020 to Dec 1, 2021, and is also limited to California, Florida, Texas, and New York.
+}
+\keyword{datasets}
diff --git a/man/as_epi_archive.Rd b/man/as_epi_archive.Rd
index 191b708b..a95550be 100644
--- a/man/as_epi_archive.Rd
+++ b/man/as_epi_archive.Rd
@@ -48,3 +48,21 @@ class, so for example:\preformatted{x <- as_epi_archive(df, geo_type = "state",
 would be equivalent to:\preformatted{x <- epi_archive$new(df, geo_type = "state",
                       time_type = "day")
 }
 }
+\examples{
+df <- data.frame(geo_value = c(replicate(2, "ca"), replicate(2, "fl")),
+                 county = c(1, 3, 2, 5),
+                 time_value = as.Date(c("2020-06-01",
+                                        "2020-06-02",
+                                        "2020-06-01",
+                                        "2020-06-02")),
+                 version = as.Date(c("2020-06-02",
+                                     "2020-06-03",
+                                     "2020-06-02",
+                                     "2020-06-03")),
+                 cases = c(1, 2, 3, 4),
+                 case_rate = c(0.01, 0.02, 0.01, 0.05))
+
+x <- df \%>\% as_epi_archive(geo_type = "state",
+                           time_type = "day",
+                           other_keys = "county")
+}
diff --git a/man/as_epi_df.Rd b/man/as_epi_df.Rd
index 0ea6ebc5..e092716e 100644
--- a/man/as_epi_df.Rd
+++ b/man/as_epi_df.Rd
@@ -19,6 +19,8 @@ as_epi_df(x, ...)
 \method{as_epi_df}{tbl_ts}(x, geo_type, time_type, as_of, additional_metadata = list(), ...)
 }
 \arguments{
+\item{x}{A data.frame, \link[tibble:tibble]{tibble::tibble}, or \link[tsibble:tsibble]{tsibble::tsibble} to be converted}
+
 \item{...}{Additional arguments passed to methods.}

 \item{geo_type}{Type for the geo values.
If missing, then the function will
diff --git a/man/as_tsibble.epi_df.Rd b/man/as_tsibble.epi_df.Rd
index 3ccb7cc2..98ca7f74 100644
--- a/man/as_tsibble.epi_df.Rd
+++ b/man/as_tsibble.epi_df.Rd
@@ -1,11 +1,19 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/methods.R
+% Please edit documentation in R/methods-epi_df.R
 \name{as_tsibble.epi_df}
 \alias{as_tsibble.epi_df}
 \title{Convert to tsibble format}
 \usage{
 \method{as_tsibble}{epi_df}(x, key, ...)
 }
+\arguments{
+\item{x}{The \code{epi_df} object.}
+
+\item{key}{Optional. Any additional keys (other than \code{geo_value}) to add to
+the \code{tsibble}.}
+
+\item{...}{Additional arguments passed on to \code{tsibble::as_tsibble()}.}
+}
 \description{
 Converts an \code{epi_df} object into a tsibble, where the index is taken to
 be \code{time_value}, and the key variables taken to be \code{geo_value} along with any
diff --git a/man/detect_outlr.Rd b/man/detect_outlr.Rd
index d832cd6d..4aa0b79c 100644
--- a/man/detect_outlr.Rd
+++ b/man/detect_outlr.Rd
@@ -63,3 +63,30 @@ For convenience, the outlier detection method can be specified (in the
 "stl", shorthand for \code{detect_outlr_stl()}, which detects outliers via an
 STL decomposition.
 }
+\examples{
+detection_methods = dplyr::bind_rows(
+  dplyr::tibble(method = "rm",
+                args = list(list(detect_negatives = TRUE,
+                                 detection_multiplier = 2.5)),
+                abbr = "rm"),
+  dplyr::tibble(method = "stl",
+                args = list(list(detect_negatives = TRUE,
+                                 detection_multiplier = 2.5,
+                                 seasonal_period = 7)),
+                abbr = "stl_seasonal"),
+  dplyr::tibble(method = "stl",
+                args = list(list(detect_negatives = TRUE,
+                                 detection_multiplier = 2.5,
+                                 seasonal_period = NULL)),
+                abbr = "stl_nonseasonal"))
+
+x <- incidence_num_outlier_example \%>\%
+  dplyr::select(geo_value, time_value, cases) \%>\%
+  as_epi_df() \%>\%
+  group_by(geo_value) \%>\%
+  mutate(outlier_info = detect_outlr(
+    x = time_value, y = cases,
+    methods = detection_methods,
+    combiner = "median")) \%>\%
+  unnest(outlier_info)
+}
diff --git a/man/detect_outlr_rm.Rd b/man/detect_outlr_rm.Rd
index 55410eb9..33e3f73b 100644
--- a/man/detect_outlr_rm.Rd
+++ b/man/detect_outlr_rm.Rd
@@ -52,3 +52,13 @@ A tibble with number of rows equal to \code{length(y)}, and columns
 Detects outliers based on a distance from the rolling median specified in
 terms of multiples of the rolling interquartile range (IQR).
 }
+\examples{
+# Detect outliers based on a rolling median
+incidence_num_outlier_example \%>\%
+  dplyr::select(geo_value, time_value, cases) \%>\%
+  as_epi_df() \%>\%
+  group_by(geo_value) \%>\%
+  mutate(outlier_info = detect_outlr_rm(
+    x = time_value, y = cases)) \%>\%
+  unnest(outlier_info)
+}
diff --git a/man/detect_outlr_stl.Rd b/man/detect_outlr_stl.Rd
index 73de7eb9..7e724a4e 100644
--- a/man/detect_outlr_stl.Rd
+++ b/man/detect_outlr_stl.Rd
@@ -37,6 +37,26 @@ outlier thresholds.}
 \item{seasonal_period}{Integer specifying period of seasonality. For example,
 for daily data, a period 7 means weekly seasonality. The default is \code{NULL},
 meaning that no seasonal term will be included in the STL decomposition.}
+
+\item{log_transform}{Should a log transform be applied before running outlier
+detection? Default is \code{FALSE}. If \code{TRUE}, and zeros are present, then the
+log transform will be padded by 1.}
+
+\item{detect_negatives}{Should negative values automatically count as
+outliers?
Default is \code{FALSE}.}
+
+\item{detection_multiplier}{Value determining how far the outlier detection
+thresholds are from the rolling median, which are calculated as (rolling
+median) +/- (detection multiplier) * (rolling IQR). Default is 2.}
+
+\item{min_radius}{Minimum distance between rolling median and threshold, on
+transformed scale. Default is 0.}
+
+\item{replacement_multiplier}{Value determining how far the replacement
+values are from the rolling median. The replacement is the original value
+if it is within the detection thresholds, or otherwise it is rounded to the
+nearest (rolling median) +/- (replacement multiplier) * (rolling IQR).
+Default is 0.}
 }
 \value{
 A tibble with number of rows equal to \code{length(y)}, and columns
@@ -53,6 +73,16 @@ from the STL decomposition taking the place of the rolling median and
 residuals to the rolling median, respectively.

 The last set of arguments, \code{log_transform} through \code{replacement_multiplier},
-are exactly as in \code{detect_outlr_rm()}; refer to its help file for their
-description.
+are exactly as in \code{detect_outlr_rm()}.
+}
+\examples{
+# Detect outliers based on a seasonal-trend decomposition using LOESS (STL)
+incidence_num_outlier_example \%>\%
+  dplyr::select(geo_value, time_value, cases) \%>\%
+  as_epi_df() \%>\%
+  group_by(geo_value) \%>\%
+  mutate(outlier_info = detect_outlr_stl(
+    x = time_value, y = cases,
+    seasonal_period = 7)) \%>\% # weekly seasonality for daily data
+  unnest(outlier_info)
 }
diff --git a/man/epi_cor.Rd b/man/epi_cor.Rd
index 7c3e1ecf..6b2279db 100644
--- a/man/epi_cor.Rd
+++ b/man/epi_cor.Rd
@@ -57,3 +57,31 @@ grouping by geo value, time value, or any other variables. See the
 \href{https://cmu-delphi.github.io/epiprocess/articles/correlation.html}{correlation
 vignette} for examples.
 }
+\examples{
+
+# linear association of case and death rates on any given day
+epi_cor(x = jhu_csse_daily_subset,
+        var1 = case_rate_7d_av,
+        var2 = death_rate_7d_av,
+        cor_by = "time_value")
+
+# correlation of death rates and lagged case rates
+epi_cor(x = jhu_csse_daily_subset,
+        var1 = case_rate_7d_av,
+        var2 = death_rate_7d_av,
+        cor_by = time_value,
+        dt1 = -2)
+
+# correlation grouped by location
+epi_cor(x = jhu_csse_daily_subset,
+        var1 = case_rate_7d_av,
+        var2 = death_rate_7d_av,
+        cor_by = geo_value)
+
+# correlation grouped by location, incorporating lagged case rates
+epi_cor(x = jhu_csse_daily_subset,
+        var1 = case_rate_7d_av,
+        var2 = death_rate_7d_av,
+        cor_by = geo_value,
+        dt1 = -2)
+}
diff --git a/man/epi_slide.Rd b/man/epi_slide.Rd
index 6aa4addf..2e737293 100644
--- a/man/epi_slide.Rd
+++ b/man/epi_slide.Rd
@@ -116,3 +116,23 @@ tidy evaluation (first example, above), then the name for the new column is
 inferred from the given expression and overrides any name passed explicitly
 through the \code{new_col_name} argument.
}
+\examples{
+# slide a 7-day trailing average of cases
+jhu_csse_daily_subset \%>\%
+  group_by(geo_value) \%>\%
+  epi_slide(cases_7dav = mean(cases), n = 7,
+            align = "right")
+
+# slide a left-aligned 7-day average of cases
+jhu_csse_daily_subset \%>\%
+  group_by(geo_value) \%>\%
+  epi_slide(cases_7dav = mean(cases), n = 7,
+            align = "left")
+
+# nested new columns
+jhu_csse_daily_subset \%>\%
+  group_by(geo_value) \%>\%
+  epi_slide(a = data.frame(cases_2dav = mean(cases),
+                           cases_2dma = mad(cases)),
+            n = 2, as_list_col = TRUE)
+}
diff --git a/man/epix_as_of.Rd b/man/epix_as_of.Rd
index 51867318..658e7169 100644
--- a/man/epix_as_of.Rd
+++ b/man/epix_as_of.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/methods2.R
+% Please edit documentation in R/methods-epi_archive.R
 \name{epix_as_of}
 \alias{epix_as_of}
 \title{Generate a snapshot from an \code{epi_archive} object}
@@ -7,6 +7,8 @@
 epix_as_of(x, max_version, min_time_value = -Inf)
 }
 \arguments{
+\item{x}{An \code{epi_archive} object}
+
 \item{max_version}{Time value specifying the max version to permit in the
 snapshot. That is, the snapshot will comprise the unique rows of the
 current archive data that represent the most up-to-date signal values, as
@@ -33,3 +35,11 @@ This is simply a wrapper around the \code{as_of()} method of the
 is equivalent to:\preformatted{x$as_of(max_version = v)
 }
 }
+\examples{
+# a warning about data latency is shown, since the snapshot is as of the
+# latest version in the archive
+epix_as_of(x = archive_cases_dv_subset,
+           max_version = max(archive_cases_dv_subset$DT$version))
+
+# no warning shown
+epix_as_of(archive_cases_dv_subset, max_version = as.Date("2020-06-10"))
+}
diff --git a/man/epix_merge.Rd b/man/epix_merge.Rd
index c48476c8..781ef6fe 100644
--- a/man/epix_merge.Rd
+++ b/man/epix_merge.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/methods2.R
+% Please edit documentation in R/methods-epi_archive.R
 \name{epix_merge}
 \alias{epix_merge}
 \title{Merge two \code{epi_archive} objects}
@@ -41,3 +41,15 @@ This is simply a wrapper around the \code{merge()} method of the
 is equivalent to:\preformatted{x$merge(y)
 }
 }
+\examples{
+# create two example epi_archive datasets
+x <- archive_cases_dv_subset$DT \%>\%
+  dplyr::select(geo_value, time_value, version, case_rate_7d_av) \%>\%
+  as_epi_archive()
+y <- archive_cases_dv_subset$DT \%>\%
+  dplyr::select(geo_value, time_value, version, percent_cli) \%>\%
+  as_epi_archive()
+
+# a full join, with the merged result stored in x
+epix_merge(x, y, all = TRUE)
+}
diff --git a/man/epix_slide.Rd b/man/epix_slide.Rd
index b237880b..f01a0a71 100644
--- a/man/epix_slide.Rd
+++ b/man/epix_slide.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/methods2.R
+% Please edit documentation in R/methods-epi_archive.R
 \name{epix_slide}
 \alias{epix_slide}
 \title{Slide a function over variables in an \code{epi_archive} object}
@@ -19,6 +19,8 @@ epix_slide(
 )
 }
 \arguments{
+\item{x}{An \code{epi_archive} object.}
+
 \item{f}{Function or formula to slide over variables in \code{x}. To "slide"
 means to apply a function or formula over a running window of \code{n} time
 steps (where one time step is typically one day or one week).
If a function, \code{f}
@@ -119,3 +121,21 @@ Finally, this is simply a wrapper around the \code{slide()} method of the
 is equivalent to:\preformatted{x$slide(x, new_var = comp(old_var), n = 120)
 }
 }
+\examples{
+# These dates are the reference time points for the 3-day average sliding window.
+# The resulting epi_archive ends up including data averaged from:
+#   0 days of data (no results) for 2020-06-01
+#   1 day of data for 2020-06-02
+#   2 days of data for the remaining dates
+# and never the full 3 days, due to data latency
+
+time_values <- seq(as.Date("2020-06-01"),
+                   as.Date("2020-06-15"),
+                   by = "1 day")
+epix_slide(x = archive_cases_dv_subset,
+           f = ~ mean(.x$case_rate_7d_av),
+           n = 3,
+           group_by = geo_value,
+           ref_time_values = time_values,
+           new_col_name = "case_rate_3d_av")
+}
diff --git a/man/incidence_num_outlier_example.Rd b/man/incidence_num_outlier_example.Rd
new file mode 100644
index 00000000..23afdf51
--- /dev/null
+++ b/man/incidence_num_outlier_example.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{incidence_num_outlier_example}
+\alias{incidence_num_outlier_example}
+\title{Subset of JHU daily cases from Florida and New Jersey}
+\format{
+A tibble with 730 rows and 3 variables:
+\describe{
+\item{geo_value}{the geographic value associated with each row of measurements.}
+\item{time_value}{the time value associated with each row of measurements.}
+\item{cases}{Number of new confirmed COVID-19 cases, daily}
+}
+}
+\source{
+This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the
+\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
+by the Johns Hopkins University on behalf of its Center for Systems Science and Engineering.
+Copyright Johns Hopkins University 2020.
+
+Modifications:
+\itemize{
+\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}:
+These signals are taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes.
+\item Furthermore, the data has been limited to a very small number of rows, the signal names slightly altered, and formatted into a tibble.
+}
+}
+\usage{
+incidence_num_outlier_example
+}
+\description{
+This data source of confirmed COVID-19 cases
+is based on reports made available by the Center for
+Systems Science and Engineering at Johns Hopkins University.
+This example data is a snapshot as of Oct 28, 2021, and captures cases
+from June 1, 2020 to May 31, 2021. It is limited to Florida and New Jersey.
+}
+\keyword{datasets}
diff --git a/man/is_epi_archive.Rd b/man/is_epi_archive.Rd
index 10a30681..f8dcf2c8 100644
--- a/man/is_epi_archive.Rd
+++ b/man/is_epi_archive.Rd
@@ -15,3 +15,7 @@ is_epi_archive(x)
 \description{
 Test for \code{epi_archive} format
 }
+\examples{
+is_epi_archive(jhu_csse_daily_subset) # FALSE (this is an epi_df, not an epi_archive)
+is_epi_archive(archive_cases_dv_subset) # TRUE
+}
diff --git a/man/jhu_csse_county_level_subset.Rd b/man/jhu_csse_county_level_subset.Rd
new file mode 100644
index 00000000..6ab47a12
--- /dev/null
+++ b/man/jhu_csse_county_level_subset.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{jhu_csse_county_level_subset}
+\alias{jhu_csse_county_level_subset}
+\title{Subset of JHU daily cases from counties in Massachusetts and Vermont}
+\format{
+A tibble with 16,212 rows and 5 variables:
+\describe{
+\item{geo_value}{the geographic value associated with each row of measurements.}
+\item{time_value}{the time value associated with each row of measurements.}
+\item{cases}{Number of new confirmed COVID-19 cases, daily}
+\item{county_name}{the name of the county}
+\item{state_name}{the full name of the state}
+}
+}
+\source{
+This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the
+\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
+by the Johns Hopkins University on behalf of its Center for Systems Science and Engineering.
+Copyright Johns Hopkins University 2020.
+
+Modifications:
+\itemize{
+\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: These signals are taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes.
+\item Furthermore, the data has been limited to a very small number of rows, the signal names slightly altered, and formatted into a tibble.
+}
+}
+\usage{
+jhu_csse_county_level_subset
+}
+\description{
+This data source of confirmed COVID-19 cases
+is based on reports made available by the Center for
+Systems Science and Engineering at Johns Hopkins University.
+This example data ranges from June 1, 2020 to Dec 31, 2021, and is limited to Massachusetts and Vermont.
+}
+\keyword{datasets}
diff --git a/man/jhu_csse_daily_subset.Rd b/man/jhu_csse_daily_subset.Rd
new file mode 100644
index 00000000..20c4d909
--- /dev/null
+++ b/man/jhu_csse_daily_subset.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{jhu_csse_daily_subset}
+\alias{jhu_csse_daily_subset}
+\title{Subset of JHU daily cases and deaths from California, Florida, Texas, New York, Georgia, and Pennsylvania}
+\format{
+A tibble with 4,026 rows and 6 variables:
+\describe{
+\item{geo_value}{the geographic value associated with each row of measurements.}
+\item{time_value}{the time value associated with each row of measurements.}
+\item{case_rate_7d_av}{7-day average signal of number of new confirmed COVID-19 cases per 100,000 population, daily}
+\item{death_rate_7d_av}{7-day average signal of number of new confirmed deaths due to COVID-19 per 100,000 population, daily}
+\item{cases}{Number of new confirmed COVID-19 cases, daily}
+\item{cases_7d_av}{7-day average signal of number of new confirmed COVID-19 cases, daily}
+}
+}
+\source{
+This object contains a modified part of the \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University} as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}. This data set is licensed under the terms of the
+\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
+by the Johns Hopkins University on behalf of its Center for Systems Science and Engineering.
+Copyright Johns Hopkins University 2020.
+
+Modifications:
+\itemize{
+\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}: These signals are taken directly from the JHU CSSE \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository} without changes. The 7-day average signals are computed by Delphi by calculating moving averages of the preceding 7 days, so the signal for June 7 is the average of the underlying data for June 1 through 7, inclusive.
+\item Furthermore, the data has been limited to a very small number of rows, the signal names slightly altered, and formatted into a tibble.
+}
+}
+\usage{
+jhu_csse_daily_subset
+}
+\description{
+This data source of confirmed COVID-19 cases and deaths
+is based on reports made available by the Center for
+Systems Science and Engineering at Johns Hopkins University.
+This example data ranges from Mar 1, 2020 to Dec 31, 2021, and is limited to California, Florida, Texas, New York, Georgia, and Pennsylvania.
+}
+\keyword{datasets}
diff --git a/man/print.epi_df.Rd b/man/print.epi_df.Rd
index 11c7e1cd..9ac3af99 100644
--- a/man/print.epi_df.Rd
+++ b/man/print.epi_df.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/methods.R
+% Please edit documentation in R/methods-epi_df.R
 \name{print.epi_df}
 \alias{print.epi_df}
 \alias{summary.epi_df}
@@ -13,31 +13,31 @@
 \alias{slice.epi_df}
 \alias{ungroup.epi_df}
 \alias{unnest.epi_df}
-\title{S3 methods for an \code{epi_df} object}
+\title{Base S3 methods for an \code{epi_df} object}
 \usage{
 \method{print}{epi_df}(x, ...)

-\method{summary}{epi_df}(x, ...)
+\method{summary}{epi_df}(object, ...)

-\method{arrange}{epi_df}(x, ...)
+\method{arrange}{epi_df}(.data, ...)
-\method{filter}{epi_df}(x, ...) +\method{filter}{epi_df}(.data, ...) -\method{group_by}{epi_df}(x, ...) +\method{group_by}{epi_df}(.data, ...) -\method{group_modify}{epi_df}(x, ...) +\method{group_modify}{epi_df}(.data, ...) -\method{mutate}{epi_df}(x, ...) +\method{mutate}{epi_df}(.data, ...) -\method{relocate}{epi_df}(x, ...) +\method{relocate}{epi_df}(.data, ...) -\method{rename}{epi_df}(x, ...) +\method{rename}{epi_df}(.data, ...) -\method{slice}{epi_df}(x, ...) +\method{slice}{epi_df}(.data, ...) \method{ungroup}{epi_df}(x, ...) -\method{unnest}{epi_df}(x, ...) +\method{unnest}{epi_df}(data, ...) } \arguments{ \item{x}{The \code{epi_df} object.} @@ -46,6 +46,10 @@ Currently unused.} \item{object}{The \code{epi_df} object.} + +\item{.data}{The \code{epi_df} object.} + +\item{data}{The \code{epi_df} object.} } \description{ Print, summary, and \code{dplyr} verbs (that preserve class and attributes) for an @@ -54,5 +58,5 @@ Print, summary, and \code{dplyr} verbs (that preserve class and attributes) for Prints a variety of summary statistics about the \code{epi_df} object, such as the time range included and geographic coverage. -\code{dplyr} verbs for \code{epi_df} objexts, preserving class and attributes. +\code{dplyr} verbs for \code{epi_df} objects, preserving class and attributes. } diff --git a/man/reexports.Rd b/man/reexports.Rd new file mode 100644 index 00000000..b633e86c --- /dev/null +++ b/man/reexports.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/reexports.R +\docType{import} +\name{reexports} +\alias{reexports} +\alias{as_tsibble} +\alias{arrange} +\alias{filter} +\alias{group_by} +\alias{ungroup} +\alias{group_modify} +\alias{mutate} +\alias{relocate} +\alias{rename} +\alias{slice} +\alias{unnest} +\title{Objects exported from other packages} +\keyword{internal} +\description{ +These objects are imported from other packages. Follow the links +below to see their documentation. + +\describe{ + \item{dplyr}{\code{\link[dplyr]{arrange}}, \code{\link[dplyr]{filter}}, \code{\link[dplyr]{group_by}}, \code{\link[dplyr:group_map]{group_modify}}, \code{\link[dplyr]{mutate}}, \code{\link[dplyr]{relocate}}, \code{\link[dplyr]{rename}}, \code{\link[dplyr]{slice}}, \code{\link[dplyr:group_by]{ungroup}}} + + \item{tidyr}{\code{\link[tidyr:nest]{unnest}}} + + \item{tsibble}{\code{\link[tsibble:as-tsibble]{as_tsibble}}} +}} + diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 00000000..097b2416 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/advanced.Rmd b/vignettes/advanced.Rmd index f93e8fba..b9c082cb 100644 --- a/vignettes/advanced.Rmd +++ b/vignettes/advanced.Rmd @@ -191,7 +191,7 @@ vignette](https://cmu-delphi.github.io/epiprocess/articles/slide.html) in order to demonstrate the last point in a more realistic setting. First, we fetch the versioned data and build the archive. 
-```{r, message = FALSE, warning = FALSE}
+```{r, message = FALSE, warning = FALSE, eval = FALSE}
 library(delphi.epidata)
 library(data.table)
 library(ggplot2)
@@ -227,13 +227,24 @@ x <- y1 %>%
 epix_merge(x, y2 %>%
              select(geo_value, time_value,
                     version = issue,
-                    case_rate = value
+                    case_rate_7d_av = value
                     ) %>%
              as_epi_archive(),
            all = TRUE
            )
 ```

+```{r, message = FALSE, warning = FALSE, echo = FALSE}
+library(data.table)
+library(ggplot2)
+theme_set(theme_bw())
+
+x <- archive_cases_dv_subset$DT %>%
+  filter(geo_value %in% c("ca", "fl")) %>%
+  as_epi_archive()
+
+```
+
 Next, we extend the ARX function to handle multiple geo values, since in the
 present case, we will not be grouping by geo value and each slide computation
 will be run on multiple geo values at once. Note that, because `epix_slide()`
@@ -341,7 +352,7 @@ fc_time_values <- seq(as.Date("2020-08-01"),
 k_week_ahead <- function(x, ahead = 7, as_of = TRUE) {
   if (as_of) {
     x %>%
-      epix_slide(fc = prob_arx(percent_cli, case_rate, geo_value, time_value,
+      epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value,
                                args = prob_arx_args(ahead = ahead)),
                  n = 120, ref_time_values = fc_time_values) %>%
       mutate(target_date = time_value + ahead, as_of = TRUE,
@@ -349,7 +360,7 @@ k_week_ahead <- function(x, ahead = 7, as_of = TRUE) {
   }
   else {
     x_latest %>%
-      epi_slide(fc = prob_arx(percent_cli, case_rate, geo_value, time_value,
+      epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, geo_value, time_value,
                               args = prob_arx_args(ahead = ahead)),
                 n = 120, ref_time_values = fc_time_values) %>%
       mutate(target_date = time_value + ahead, as_of = FALSE)
@@ -367,18 +378,16 @@ fc <- bind_rows(k_week_ahead(x, ahead = 7, as_of = TRUE),
                 k_week_ahead(x, ahead = 28, as_of = FALSE))

 # Plot them, on top of latest COVID-19 case rates
-p <- ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) +
+ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) +
   geom_ribbon(aes(ymin = fc_lower, ymax = fc_upper), alpha = 0.4) +
+  geom_line(data = x_latest, aes(x = time_value, y = case_rate_7d_av),
+            inherit.aes = FALSE, color = "gray50") +
   geom_line(aes(y = fc_point)) +
   geom_point(aes(y = fc_point), size = 0.5) +
   geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +
   facet_grid(vars(geo_value), vars(as_of), scales = "free") +
   scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
   labs(x = "Date", y = "Reported COVID-19 case rates") +
-  theme(legend.position = "none")
-
-gginnards::append_layers(
-  p, geom_line(data = x_latest, aes(x = time_value, y = case_rate),
-               inherit.aes = FALSE, color = "gray50"), pos = "bottom")
+  theme(legend.position = "none")
 ```

 We can see that these forecasts, which come from training an ARX model jointly
@@ -387,3 +396,10 @@ compared to the ones from the archive vignette, which come from training a
 separate ARX model on each state. As in the archive vignette, we can see a
 difference between version-aware (right column) and -unaware (left column)
 forecasting, as well.
+
+
+## Attribution
+This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html).
This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science and Engineering. Copyright Johns Hopkins University 2020.
+
+[From the COVIDcast Epidata Doctor Visits API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html):
+  These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
\ No newline at end of file
diff --git a/vignettes/aggregation.Rmd b/vignettes/aggregation.Rmd
index ed72e5a2..f1d32616 100644
--- a/vignettes/aggregation.Rmd
+++ b/vignettes/aggregation.Rmd
@@ -12,7 +12,7 @@ epidemiological data sets. This vignette demonstrates how to carry out these
 kinds of tasks with `epi_df` objects. We'll work with county-level reported
 COVID-19 cases in MA and VT.

-```{r, message = FALSE}
+```{r, message = FALSE, eval = FALSE, warning = FALSE}
 library(delphi.epidata)
 library(covidcast)
 library(epiprocess)
@@ -23,7 +23,7 @@ y <- covidcast::county_census %>%
   filter(STNAME %in% c("Massachusetts", "Vermont"), STNAME != CTYNAME) %>%
   select(geo_value = FIPS, county_name = CTYNAME, state_name = STNAME)

-# Fetch only counties from VT and NH, then append names columns as well
+# Fetch only counties from Massachusetts and Vermont, then append the name columns as well
 x <- covidcast(
   data_source = "jhu-csse",
   signals = "confirmed_incidence_num",
@@ -38,6 +38,18 @@ x <- covidcast(
   as_epi_df()
 ```

+The data contains 16,212 rows and 5 columns.
+
+```{r, echo = FALSE, warning = FALSE, message = FALSE}
+library(delphi.epidata)
+library(covidcast)
+library(epiprocess)
+library(dplyr)
+
+data(jhu_csse_county_level_subset)
+x <- jhu_csse_county_level_subset
+```
+
 ## Converting to `tsibble` format

 For manipulating and wrangling time series data, the
@@ -147,7 +159,6 @@ second call to `tidyr::fill()`.)

 ```{r}
 fill_gaps(xt, cases = 0) %>%
-  arrange(geo_value, time_value) %>%
   head()
 ```

@@ -157,8 +168,7 @@ went back to June 6, 2020. By setting `.full = TRUE`, we can
 at zero-fill over the entire span of the observed (censored) data.

 ```{r}
-xt_filled <- fill_gaps(xt, cases = 0, .full = TRUE) %>%
-  arrange(geo_value, time_value)
+xt_filled <- fill_gaps(xt, cases = 0, .full = TRUE)

 head(xt_filled)
 ```

@@ -228,4 +238,11 @@ head(xt_filled_month)

 ## Geographic aggregation

-TODO
\ No newline at end of file
+TODO
+
+## Attribution
+This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science and Engineering. Copyright Johns Hopkins University 2020.
+
+[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
+  These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
+
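
The `Geographic aggregation` section of this vignette is still a TODO. As a rough sketch of the kind of content it might eventually hold (this is not the vignette's actual text), county counts in the packaged `jhu_csse_county_level_subset` data can be rolled up to the state level with ordinary `dplyr` grouping:

```r
library(epiprocess)
library(dplyr)

# Sum county-level case counts within each state on each day. summarize()
# returns a plain tibble here, so rebuild the epi_df with as_epi_df().
state_totals <- jhu_csse_county_level_subset %>%
  group_by(geo_value = state_name, time_value) %>%
  summarize(cases = sum(cases, na.rm = TRUE), .groups = "drop") %>%
  as_epi_df()

head(state_totals)
```
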
diff --git a/vignettes/archive.Rmd b/vignettes/archive.Rmd
index b9a12f6f..0e399357 100644
--- a/vignettes/archive.Rmd
+++ b/vignettes/archive.Rmd
@@ -25,7 +25,7 @@ signal is subject to very heavy and regular revision; you can read more about
 it on its [API documentation
 page](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html).

-```{r, message = FALSE, warning = FALSE}
+```{r, message = FALSE, warning = FALSE, eval = FALSE}
 library(delphi.epidata)
 library(epiprocess)
 library(data.table)
 library(dplyr)

 dv <- covidcast(
   data_source = "doctor-visits",
   signals = "smoothed_adj_cli",
   time_type = "day",
   geo_type = "state",
-  time_value = epirange(20200601, 20211201),
-  geo_value = "ca,fl",
+  time_values = epirange(20200601, 20211201),
+  geo_values = "ca,fl,ny,tx",
   issues = epirange(20200601, 20211201)
 ) %>%
   fetch_tbl()
-colnames(dv)
+```
+
+```{r, echo = FALSE, message = FALSE, warning = FALSE}
+library(delphi.epidata)
+library(epiprocess)
+library(data.table)
+library(dplyr)
 ```

 ## Getting data into `epi_archive` format
@@ -62,7 +68,7 @@ As we can see from the above, the data frame returned by
 format, with `issue` playing the role of `version`. We can now use
 `as_epi_archive()` to bring it into `epi_archive` format.

-```{r}
+```{r, eval = FALSE}
 x <- dv %>%
   select(geo_value, time_value, version = issue, percent_cli = value) %>%
   as_epi_archive()

 class(x)
 print(x)
 ```

+```{r, echo = FALSE, message = FALSE, warning = FALSE}
+x <- archive_cases_dv_subset$DT %>%
+  select(geo_value, time_value, version, percent_cli) %>%
+  as_epi_archive()
+
+class(x)
+print(x)
+```
+
-An `epi_archive` is special kind of class called an R6 class. Its primary
+An `epi_archive` is a special kind of class called an R6 class. Its primary
 field is a data table `DT`, which is of class `data.table` (from the
 `data.table` package), and has columns `geo_value`, `time_value`, `version`,
 as well as any
@@ -87,7 +102,7 @@ below). There can only be a single row per unique combination of key variables,
 and therefore the key variables are critical for figuring out how to generate a
 snapshot of data from the archive, as of a given version (also described below).

-```{r}
+```{r, error=TRUE}
 key(x$DT)
 ```

@@ -175,19 +190,17 @@ snapshots <- map_dfr(versions, function(v) {
   bind_rows(x_latest %>% mutate(version = self_max)) %>%
   mutate(latest = version == self_max)

-p <- ggplot(snapshots %>% filter(!latest),
+ggplot(snapshots %>% filter(!latest),
             aes(x = time_value, y = percent_cli)) +
   geom_line(aes(color = factor(version))) +
   geom_vline(aes(color = factor(version), xintercept = version), lty = 2) +
   facet_wrap(~ geo_value, scales = "free_y", ncol = 1) +
   scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
   labs(x = "Date", y = "% of doctor's visits with CLI") +
-  theme(legend.position = "none")
-
-gginnards::append_layers(
-  p, geom_line(data = snapshots %>% filter(latest),
+  theme(legend.position = "none") +
+  geom_line(data = snapshots %>% filter(latest),
             aes(x = time_value, y = percent_cli),
-            inherit.aes = FALSE, color = "black"), pos = "top")
+            inherit.aes = FALSE, color = "black")
 ```

 We can see some interesting and highly nontrivial revision behavior: at some
@@ -222,18 +235,18 @@ Because the original data archives are stored in LOCF (last observation
 carried forward) format in the first place, it generally makes sense to
 perform `NA` filling after merging using LOCF. Therefore `locf = TRUE` is the
 default.
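
Before running the real merge below, a toy illustration of that `locf = TRUE` default; the archives `x_toy` and `y_toy` and their numbers are made up for this sketch:

```r
library(epiprocess)

# y_toy's only version (June 3) is later than x_toy's (June 2). In the merged
# table, the June 3 row has no recorded percent_cli, so LOCF filling carries
# the June 2 value (2.3) forward rather than leaving an NA.
x_toy <- tibble::tibble(geo_value = "ca", time_value = as.Date("2020-06-01"),
                        version = as.Date("2020-06-02"), percent_cli = 2.3) %>%
  as_epi_archive()
y_toy <- tibble::tibble(geo_value = "ca", time_value = as.Date("2020-06-01"),
                        version = as.Date("2020-06-03"), case_rate_7d_av = 10.1) %>%
  as_epi_archive()

epix_merge(x_toy, y_toy, all = TRUE)
x_toy$DT  # the merged table lives in x_toy, due to R6 reference semantics
```
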
-```{r, message = FALSE, warning = FALSE}
+```{r, message = FALSE, warning = FALSE, eval = FALSE}
 y <- covidcast(
   data_source = "jhu-csse",
   signals = "confirmed_7dav_incidence_prop",
   time_type = "day",
   geo_type = "state",
-  time_value = epirange(20200601, 20211201),
-  geo_values = "ca,fl",
+  time_values = epirange(20200601, 20211201),
+  geo_values = "ca,fl,ny,tx",
   issues = epirange(20200601, 20211201)
 ) %>%
   fetch_tbl() %>%
-  select(geo_value, time_value, version = issue, case_rate = value) %>%
+  select(geo_value, time_value, version = issue, case_rate_7d_av = value) %>%
   as_epi_archive()

 epix_merge(x, y, all = TRUE)
 print(x)
 head(x$DT)
 ```

+```{r, echo = FALSE, message = FALSE, warning = FALSE}
+x <- archive_cases_dv_subset
+print(x)
+head(x$DT)
+```
+
 Importantly, as we can see, the way `epix_merge()` works is that it
 **overwrites** the data table in the first `epi_archive` object `x` by the
 merged data table.
@@ -323,7 +342,7 @@ fc_time_values <- seq(as.Date("2020-08-01"),
                       as.Date("2021-12-01"),
                       by = "1 month")

-z <- epix_slide(x, fc = prob_arx(x = percent_cli, y = case_rate), n = 120,
+z <- epix_slide(x, fc = prob_arx(x = percent_cli, y = case_rate_7d_av), n = 120,
                 ref_time_values = fc_time_values, group_by = geo_value)

 head(z, 10)
@@ -353,14 +372,14 @@ x_latest <- epix_as_of(x, max_version = max(x$DT$version))
 k_week_ahead <- function(x, ahead = 7, as_of = TRUE) {
   if (as_of) {
     x %>%
-      epix_slide(fc = prob_arx(percent_cli, case_rate, ahead = ahead), n = 120,
+      epix_slide(fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), n = 120,
                  ref_time_values = fc_time_values, group_by = geo_value) %>%
       mutate(target_date = time_value + ahead, as_of = TRUE)
   }
   else {
     x_latest %>%
       group_by(geo_value) %>%
-      epi_slide(fc = prob_arx(percent_cli, case_rate, ahead = ahead), n = 120,
+      epi_slide(fc = prob_arx(percent_cli, case_rate_7d_av, ahead = ahead), n = 120,
                 ref_time_values = fc_time_values) %>%
       mutate(target_date = time_value + ahead, as_of = FALSE)
   }
@@ -377,18 +396,16 @@ fc <- bind_rows(k_week_ahead(x, ahead = 7, as_of = TRUE),
                 k_week_ahead(x, ahead = 28, as_of = FALSE))

 # Plot them, on top of latest COVID-19 case rates
-p <- ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) +
+ggplot(fc, aes(x = target_date, group = time_value, fill = as_of)) +
   geom_ribbon(aes(ymin = fc_lower, ymax = fc_upper), alpha = 0.4) +
+  geom_line(data = x_latest, aes(x = time_value, y = case_rate_7d_av),
+            inherit.aes = FALSE, color = "gray50") +
   geom_line(aes(y = fc_point)) +
   geom_point(aes(y = fc_point), size = 0.5) +
   geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) +
   facet_grid(vars(geo_value), vars(as_of), scales = "free") +
   scale_x_date(minor_breaks = "month", date_labels = "%b %y") +
   labs(x = "Date", y = "Reported COVID-19 case rates") +
-  theme(legend.position = "none")
-
-gginnards::append_layers(
-  p, geom_line(data = x_latest, aes(x = time_value, y = case_rate),
-               inherit.aes = FALSE, color = "gray50"), pos = "bottom")
+  theme(legend.position = "none")
 ```

-Each row displays the forecasts for a different location (CA and FL), and each
+Each row displays the forecasts for a different location (CA, FL, NY, and TX), and each
@@ -410,3 +427,10 @@ off the data structures and functionality in the current package, is the place
 to look for more robust forecasting methodology. The forecasters that appear in
 the vignettes in the current package are only meant to demo the slide
 functionality with some of the most basic forecasting methodology possible.
+
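
As a final, stripped-down recap of the version-aware pattern used in this vignette, here is a sketch against the packaged `archive_cases_dv_subset`; the argument names follow the `epix_slide()` calls above, and `pcli_7dav` is just an illustrative column name:

```r
library(epiprocess)

# A bare-bones version-aware slide: a trailing 7-day mean of percent_cli,
# computed at each reference date using only data available as of that date.
ref_dates <- seq(as.Date("2020-08-01"), as.Date("2020-10-01"), by = "1 week")

archive_cases_dv_subset %>%
  epix_slide(pcli_7dav = mean(percent_cli, na.rm = TRUE),
             n = 7, group_by = geo_value, ref_time_values = ref_dates)
```
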
+## Attribution
+This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science and Engineering. Copyright Johns Hopkins University 2020.
+
+[From the COVIDcast Epidata Doctor Visits API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html):
+  These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
+
diff --git a/vignettes/correlation.Rmd b/vignettes/correlation.Rmd
index d2ed5d75..f1fbada5 100644
--- a/vignettes/correlation.Rmd
+++ b/vignettes/correlation.Rmd
@@ -15,18 +15,21 @@ grouped in a specified way. In this vignette, we'll examine correlations between
 state-level COVID-19 case and death rates, smoothed using 7-day trailing
 averages.

-```{r, message = FALSE}
+```{r, message = FALSE, warning = FALSE}
 library(delphi.epidata)
 library(epiprocess)
 library(dplyr)
+```
+
+The data is fetched with the following query:
+
+```{r, message = FALSE, eval = FALSE}
 x <- covidcast(
   data_source = "jhu-csse",
   signals = "confirmed_7dav_incidence_prop",
   time_type = "day",
   geo_type = "state",
   time_values = epirange(20200301, 20211231),
-  geo_values = "*"
+  geo_values = "ca,fl,ny,tx,ga,pa"
 ) %>%
   fetch_tbl() %>%
   select(geo_value, time_value, case_rate = value)
@@ -37,7 +40,7 @@ y <- covidcast(
   time_type = "day",
   geo_type = "state",
   time_values = epirange(20200301, 20211231),
-  geo_values = "*"
+  geo_values = "ca,fl,ny,tx,ga,pa"
 ) %>%
   fetch_tbl() %>%
   select(geo_value, time_value, death_rate = value)
@@ -47,6 +50,18 @@ x <- x %>%
   as_epi_df()
 ```

+The data has 4,026 rows and 4 columns.
+
+```{r, echo = FALSE}
+data(jhu_csse_daily_subset)
+x <- jhu_csse_daily_subset %>%
+  select(geo_value, time_value,
+         case_rate = case_rate_7d_av,
+         death_rate = death_rate_7d_av) %>%
+  arrange(geo_value, time_value) %>%
+  as_epi_df()
+```
+
 ## Correlations grouped by time

 The `epi_cor()` function operates on an `epi_df` object, and it requires further
@@ -158,3 +173,9 @@ z %>%

-We can see that some pretty clear curvature here in the mean correlation between
+We can see some pretty clear curvature here in the mean correlation between
 case and death rates (where the correlations come from grouping by geo value)
 as a function of lag. The maximum occurs at a lag of somewhere around 18 days.
+
+## Attribution
+This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science and Engineering. Copyright Johns Hopkins University 2020.
+
+[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
+  These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
diff --git a/vignettes/growth_rate.Rmd b/vignettes/growth_rate.Rmd
index b3d324e0..707243c1 100644
--- a/vignettes/growth_rate.Rmd
+++ b/vignettes/growth_rate.Rmd
@@ -14,12 +14,15 @@ estimates the growth rate of a signal. We investigate this functionality in the
 current vignette, applied to state-level daily reported COVID-19 cases from GA
 and PA, smoothed using a 7-day trailing average.

-```{r, message = FALSE}
+```{r, message = FALSE, warning = FALSE}
 library(delphi.epidata)
 library(epiprocess)
 library(dplyr)
 library(tidyr)
+```
+
+The data is fetched with the following query:
+
+```{r, message = FALSE, eval = FALSE}
 x <- covidcast(
   data_source = "jhu-csse",
   signals = "confirmed_7dav_incidence_num",
@@ -34,6 +37,18 @@ x <- covidcast(
   as_epi_df()
 ```

+The data has 1,158 rows and 3 columns.
+
+
+```{r, echo = FALSE}
+data(jhu_csse_daily_subset)
+x <- jhu_csse_daily_subset %>%
+  select(geo_value, time_value, cases = cases_7d_av) %>%
+  filter(geo_value %in% c("pa", "ga") & time_value >= "2020-06-01") %>%
+  arrange(geo_value, time_value) %>%
+  as_epi_df()
+```
+
 ## Growth rate basics

 The growth rate of a function $f$ defined over a continuously-valued parameter
@@ -275,3 +290,9 @@ curve.
+
+## Attribution
+This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science and Engineering. Copyright Johns Hopkins University 2020.
+
+[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
+  These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
diff --git a/vignettes/outliers.Rmd b/vignettes/outliers.Rmd
index 0e1a1be2..9be62556 100644
--- a/vignettes/outliers.Rmd
+++ b/vignettes/outliers.Rmd
@@ -14,13 +14,11 @@ so that you can define your own outlier detection and correction routines and
 apply them to `epi_df` objects. We'll demonstrate this using state-level daily
 reported COVID-19 case counts from FL and NJ.

-```{r, message = FALSE, fig.width = 8, fig.height = 7}
+```{r, message = FALSE, eval = FALSE}
 library(delphi.epidata)
 library(epiprocess)
 library(dplyr)
 library(tidyr)
-library(ggplot2)
-theme_set(theme_bw())

 x <- covidcast(
   data_source = "jhu-csse",
@@ -34,6 +32,23 @@ x <- covidcast(
   fetch_tbl() %>%
   select(geo_value, time_value, cases = value) %>%
   as_epi_df()
+```
+
+The dataset has 730 rows and 3 columns.
+
+```{r, echo = FALSE, warning = FALSE, message = FALSE}
+library(delphi.epidata)
+library(epiprocess)
+library(dplyr)
+library(tidyr)
+
+data(incidence_num_outlier_example)
+x <- incidence_num_outlier_example
+```
+
+```{r, fig.width = 8, fig.height = 7, warning = FALSE, message = FALSE}
+library(ggplot2)
+theme_set(theme_bw())

 ggplot(x, aes(x = time_value, y = cases)) +
   geom_line() +
@@ -190,4 +205,12 @@ ggplot(y, aes(x = time_value)) +
 ```

 More advanced correction functionality will be coming at some point in the
-future.
\ No newline at end of file
+future.
+
+
+## Attribution
+This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science and Engineering. Copyright Johns Hopkins University 2020.
+
+[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html):
+  These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes.
+
diff --git a/vignettes/slide.Rmd b/vignettes/slide.Rmd
index ce90450e..8bb11eab 100644
--- a/vignettes/slide.Rmd
+++ b/vignettes/slide.Rmd
@@ -29,11 +29,14 @@ FL, NY, and TX (note: here we're using new, not cumulative cases) using the
 [`delphi.epidata`](https://github.com/cmu-delphi/delphi-epidata-r) package,
 and then convert this to `epi_df` format.

-```{r, message = FALSE}
+```{r, message = FALSE, warning = FALSE}
 library(delphi.epidata)
 library(epiprocess)
 library(dplyr)
+```
+
+The data is fetched with the following query:
+
+```{r, message = FALSE, eval = FALSE}
 x <- covidcast(
   data_source = "jhu-csse",
   signals = "confirmed_incidence_num",
@@ -48,6 +51,17 @@ x <- covidcast(
   as_epi_df()
 ```

+The data has 2,684 rows and 3 columns.
+
+```{r, echo = FALSE}
+data(jhu_csse_daily_subset)
+x <- jhu_csse_daily_subset %>%
+  filter(geo_value %in% c("ca", "fl", "ny", "tx")) %>%
+  select(geo_value, time_value, cases) %>%
+  arrange(geo_value, time_value) %>%
+  as_epi_df()
+```
+
+
 ## Slide with a formula

 We first demonstrate how to apply a 7-day trailing average to the daily cases in
@@ -268,3 +282,10 @@ version-aware sense (for the computation at each reference time $t$, it uses
 only data that would have been available as of $t$). We will revisit this
 example in the [archive
 vignette](https://cmu-delphi.github.io/epiprocess/articles/archive.html).
+
+## Attribution
+This document contains a dataset that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science and Engineering. Copyright Johns Hopkins University 2020.
+ +[From the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html): + These signals are taken directly from the JHU CSSE [COVID-19 GitHub repository](https://github.com/CSSEGISandData/COVID-19) without changes. +