diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 51fd3b1076ade..976f3524e3c71 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -103,7 +103,7 @@ Groupby/Resample/Rolling - Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`) - Bug in ``DataFrame.resample(...)`` when there is a time change (DST) and resampling frequecy is 12h or higher (:issue:`15549`) - Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`) -- +- Bug in ``rolling.var`` where calculation is inaccurate with a zero-valued array (:issue:`18430`) - - diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 4d5ebdc0c581a..95df5a07a390b 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -661,9 +661,11 @@ cdef inline void add_var(double val, double *nobs, double *mean_x, if val == val: nobs[0] = nobs[0] + 1 - delta = (val - mean_x[0]) + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + delta = val - mean_x[0] mean_x[0] = mean_x[0] + delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] + delta * (val - mean_x[0]) + ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0] cdef inline void remove_var(double val, double *nobs, double *mean_x, @@ -675,9 +677,11 @@ cdef inline void remove_var(double val, double *nobs, double *mean_x, if val == val: nobs[0] = nobs[0] - 1 if nobs[0]: - delta = (val - mean_x[0]) + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + delta = val - mean_x[0] mean_x[0] = mean_x[0] - delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] - delta * (val - mean_x[0]) + ssqdm_x[0] = ssqdm_x[0] - ((nobs[0] + 1) * delta ** 2) / nobs[0] else: mean_x[0] = 0 ssqdm_x[0] = 0 @@ -689,7 +693,7 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, Numerically stable implementation using Welford's method. """ cdef: - double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta + double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta, mean_x_old int64_t s, e bint is_variable Py_ssize_t i, j, N @@ -749,6 +753,9 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, add_var(input[i], &nobs, &mean_x, &ssqdm_x) output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + # After the first window, observations can both be added and # removed for i from win <= i < N: @@ -760,10 +767,12 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, # Adding one observation and removing another one delta = val - prev - prev -= mean_x + mean_x_old = mean_x + mean_x += delta / nobs - val -= mean_x - ssqdm_x += (val + prev) * delta + ssqdm_x += ((nobs - 1) * val + + (nobs + 1) * prev + - 2 * nobs * mean_x_old) * delta / nobs else: add_var(val, &nobs, &mean_x, &ssqdm_x) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 2427bcea4053d..8135e263f412f 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -2482,6 +2482,14 @@ def test_rolling_corr_pairwise(self): self._check_pairwise_moment('rolling', 'corr', window=10, min_periods=5) + @pytest.mark.parametrize('window', range(7)) + def test_rolling_corr_with_zero_variance(self, window): + # GH 18430 + s = pd.Series(np.zeros(20)) + other = pd.Series(np.arange(20)) + + assert s.rolling(window=window).corr(other=other).isna().all() + def _check_pairwise_moment(self, dispatch, name, **kwargs): def get_result(obj, obj2=None): return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2)