diff --git a/NEWS.md b/NEWS.md index 50544dcce6..42c4f25bdc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -30,6 +30,8 @@ * `scale_{x/y}_discrete()` can now accept a `sec.axis`. It is recommended to only use `dup_axis()` to set custom breaks or labels, as discrete variables cannot be transformed (@teunbrand, #3171). +* `stat_density()` has the new computed variable: `wdensity`, which is + calculated as the density times the sum of weights (@teunbrand, #4176). # ggplot2 3.5.1 diff --git a/R/stat-density.R b/R/stat-density.R index 4bf28f797b..1b84f2cdb4 100644 --- a/R/stat-density.R +++ b/R/stat-density.R @@ -24,6 +24,8 @@ #' @eval rd_computed_vars( #' density = "density estimate.", #' count = "density * number of points - useful for stacked density plots.", +#' wdensity = "density * sum of weights. In absence of weights, the same as +#' `count`.", #' scaled = "density estimate, scaled to maximum of 1.", #' n = "number of points.", #' ndensity = "alias for `scaled`, to mirror the syntax of [`stat_bin()`]." @@ -113,17 +115,19 @@ StatDensity <- ggproto("StatDensity", Stat, compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1, kernel = "gaussian", n = 512, bounds = c(-Inf, Inf)) { - nx <- length(x) + nx <- w_sum <- length(x) if (is.null(w)) { w <- rep(1 / nx, nx) } else { - w <- w / sum(w) + w_sum <- sum(w) + w <- w / w_sum } # Adjust data points and weights to all fit inside bounds sample_data <- fit_data_to_bounds(bounds, x, w) x <- sample_data$x w <- sample_data$w + w_sum <- sample_data$w_sum * w_sum nx <- length(x) # if less than 2 points return data frame of NAs and a warning @@ -135,6 +139,7 @@ compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1, scaled = NA_real_, ndensity = NA_real_, count = NA_real_, + wdensity = NA_real_, n = NA_integer_, .size = 1 )) @@ -158,6 +163,7 @@ compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1, scaled = dens$y / max(dens$y, na.rm = TRUE), ndensity = dens$y / max(dens$y, na.rm = TRUE), count = dens$y * nx, + wdensity = dens$y * w_sum, n = nx, .size = length(dens$x) ) @@ -166,7 +172,7 @@ compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1, # Check if all data points are inside bounds. If not, warn and remove them. fit_data_to_bounds <- function(bounds, x, w) { is_inside_bounds <- (bounds[1] <= x) & (x <= bounds[2]) - + w_sum <- 1 if (!all(is_inside_bounds)) { cli::cli_warn("Some data points are outside of `bounds`. Removing them.") x <- x[is_inside_bounds] @@ -177,7 +183,7 @@ fit_data_to_bounds <- function(bounds, x, w) { } } - return(list(x = x, w = w)) + return(list(x = x, w = w, w_sum = w_sum)) } # Update density estimation to mitigate boundary effect at known `bounds`: diff --git a/man/geom_density.Rd b/man/geom_density.Rd index cd119edcb3..26f2554e1c 100644 --- a/man/geom_density.Rd +++ b/man/geom_density.Rd @@ -188,6 +188,7 @@ These are calculated by the 'stat' part of layers and can be accessed with \link \itemize{ \item \code{after_stat(density)}\cr density estimate. \item \code{after_stat(count)}\cr density * number of points - useful for stacked density plots. +\item \code{after_stat(wdensity)}\cr density * sum of weights. In absence of weights, the same as \code{count} \item \code{after_stat(scaled)}\cr density estimate, scaled to maximum of 1. \item \code{after_stat(n)}\cr number of points. \item \code{after_stat(ndensity)}\cr alias for \code{scaled}, to mirror the syntax of \code{\link[=stat_bin]{stat_bin()}}. diff --git a/tests/testthat/test-stat-density.R b/tests/testthat/test-stat-density.R index 677dfe5100..0274d83d40 100644 --- a/tests/testthat/test-stat-density.R +++ b/tests/testthat/test-stat-density.R @@ -17,13 +17,16 @@ test_that("stat_density actually computes density", { test_that("stat_density can make weighted density estimation", { df <- mtcars - df$weight <- mtcars$cyl / sum(mtcars$cyl) + df$weight <- mtcars$cyl - dens <- stats::density(df$mpg, weights = df$weight, bw = bw.nrd0(df$mpg)) + dens <- stats::density( + df$mpg, weights = df$weight / sum(df$weight), + bw = bw.nrd0(df$mpg) + ) expected_density_fun <- stats::approxfun(data.frame(x = dens$x, y = dens$y)) - plot <- ggplot(df, aes(mpg, weight = weight)) + stat_density() - actual_density_fun <- stats::approxfun(layer_data(plot)[, c("x", "y")]) + plot <- layer_data(ggplot(df, aes(mpg, weight = weight)) + stat_density()) + actual_density_fun <- stats::approxfun(plot[, c("x", "y")]) test_sample <- unique(df$mpg) expect_equal( @@ -31,6 +34,11 @@ test_that("stat_density can make weighted density estimation", { actual_density_fun(test_sample), tolerance = 1e-3 ) + + expect_equal( + plot$wdensity, + plot$density * sum(mtcars$cyl) + ) }) test_that("stat_density uses `bounds`", { @@ -115,7 +123,7 @@ test_that("compute_density returns useful df and throws warning when <2 values", expect_warning(dens <- compute_density(1, NULL, from = 0, to = 0)) expect_equal(nrow(dens), 1) - expect_equal(names(dens), c("x", "density", "scaled", "ndensity", "count", "n")) + expect_equal(names(dens), c("x", "density", "scaled", "ndensity", "count", "wdensity", "n")) expect_type(dens$x, "double") })