Skip to content

Commit 1df93c4

Browse files
authored
Weighted density (#5254)
* Add weighted density computed variable * Add NEWS bullet * Fix typo
1 parent e942833 commit 1df93c4

File tree

4 files changed

+26
-9
lines changed

4 files changed

+26
-9
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
* `scale_{x/y}_discrete()` can now accept a `sec.axis`. It is recommended to
3030
only use `dup_axis()` to set custom breaks or labels, as discrete variables
3131
cannot be transformed (@teunbrand, #3171).
32+
* `stat_density()` has the new computed variable: `wdensity`, which is
33+
calculated as the density times the sum of weights (@teunbrand, #4176).
3234

3335
# ggplot2 3.5.1
3436

R/stat-density.R

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
#' @eval rd_computed_vars(
2525
#' density = "density estimate.",
2626
#' count = "density * number of points - useful for stacked density plots.",
27+
#' wdensity = "density * sum of weights. In absence of weights, the same as
28+
#' `count`.",
2729
#' scaled = "density estimate, scaled to maximum of 1.",
2830
#' n = "number of points.",
2931
#' ndensity = "alias for `scaled`, to mirror the syntax of [`stat_bin()`]."
@@ -113,17 +115,19 @@ StatDensity <- ggproto("StatDensity", Stat,
113115
compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1,
114116
kernel = "gaussian", n = 512,
115117
bounds = c(-Inf, Inf)) {
116-
nx <- length(x)
118+
nx <- w_sum <- length(x)
117119
if (is.null(w)) {
118120
w <- rep(1 / nx, nx)
119121
} else {
120-
w <- w / sum(w)
122+
w_sum <- sum(w)
123+
w <- w / w_sum
121124
}
122125

123126
# Adjust data points and weights to all fit inside bounds
124127
sample_data <- fit_data_to_bounds(bounds, x, w)
125128
x <- sample_data$x
126129
w <- sample_data$w
130+
w_sum <- sample_data$w_sum * w_sum
127131
nx <- length(x)
128132

129133
# if less than 2 points return data frame of NAs and a warning
@@ -135,6 +139,7 @@ compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1,
135139
scaled = NA_real_,
136140
ndensity = NA_real_,
137141
count = NA_real_,
142+
wdensity = NA_real_,
138143
n = NA_integer_,
139144
.size = 1
140145
))
@@ -158,6 +163,7 @@ compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1,
158163
scaled = dens$y / max(dens$y, na.rm = TRUE),
159164
ndensity = dens$y / max(dens$y, na.rm = TRUE),
160165
count = dens$y * nx,
166+
wdensity = dens$y * w_sum,
161167
n = nx,
162168
.size = length(dens$x)
163169
)
@@ -166,7 +172,7 @@ compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1,
166172
# Check if all data points are inside bounds. If not, warn and remove them.
167173
fit_data_to_bounds <- function(bounds, x, w) {
168174
is_inside_bounds <- (bounds[1] <= x) & (x <= bounds[2])
169-
175+
w_sum <- 1
170176
if (!all(is_inside_bounds)) {
171177
cli::cli_warn("Some data points are outside of `bounds`. Removing them.")
172178
x <- x[is_inside_bounds]
@@ -177,7 +183,7 @@ fit_data_to_bounds <- function(bounds, x, w) {
177183
}
178184
}
179185

180-
return(list(x = x, w = w))
186+
return(list(x = x, w = w, w_sum = w_sum))
181187
}
182188

183189
# Update density estimation to mitigate boundary effect at known `bounds`:

man/geom_density.Rd

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-stat-density.R

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,28 @@ test_that("stat_density actually computes density", {
1717

1818
test_that("stat_density can make weighted density estimation", {
1919
df <- mtcars
20-
df$weight <- mtcars$cyl / sum(mtcars$cyl)
20+
df$weight <- mtcars$cyl
2121

22-
dens <- stats::density(df$mpg, weights = df$weight, bw = bw.nrd0(df$mpg))
22+
dens <- stats::density(
23+
df$mpg, weights = df$weight / sum(df$weight),
24+
bw = bw.nrd0(df$mpg)
25+
)
2326
expected_density_fun <- stats::approxfun(data.frame(x = dens$x, y = dens$y))
2427

25-
plot <- ggplot(df, aes(mpg, weight = weight)) + stat_density()
26-
actual_density_fun <- stats::approxfun(layer_data(plot)[, c("x", "y")])
28+
plot <- layer_data(ggplot(df, aes(mpg, weight = weight)) + stat_density())
29+
actual_density_fun <- stats::approxfun(plot[, c("x", "y")])
2730

2831
test_sample <- unique(df$mpg)
2932
expect_equal(
3033
expected_density_fun(test_sample),
3134
actual_density_fun(test_sample),
3235
tolerance = 1e-3
3336
)
37+
38+
expect_equal(
39+
plot$wdensity,
40+
plot$density * sum(mtcars$cyl)
41+
)
3442
})
3543

3644
test_that("stat_density uses `bounds`", {
@@ -115,7 +123,7 @@ test_that("compute_density returns useful df and throws warning when <2 values",
115123
expect_warning(dens <- compute_density(1, NULL, from = 0, to = 0))
116124

117125
expect_equal(nrow(dens), 1)
118-
expect_equal(names(dens), c("x", "density", "scaled", "ndensity", "count", "n"))
126+
expect_equal(names(dens), c("x", "density", "scaled", "ndensity", "count", "wdensity", "n"))
119127
expect_type(dens$x, "double")
120128
})
121129

0 commit comments

Comments
 (0)