Skip to content

Commit c346d3b

Browse files
authored
Bug fixes for Dataset.reduce() and n-dimensional cumsum/cumprod (pydata#2156)
* Bug fixes for Dataset.reduce() and n-dimensional cumsum/cumprod Fixes GH1470, "Dataset.mean drops coordinates" Fixes a bug where non-scalar data-variables that did not include the aggregated dimension were not properly reduced: Previously:: >>> ds = Dataset({'x': ('a', [2, 2]), 'y': 2, 'z': ('b', [2])}) >>> ds.var('a') <xarray.Dataset> Dimensions: (b: 1) Dimensions without coordinates: b Data variables: x float64 0.0 y float64 0.0 z (b) int64 2 Now:: >>> ds.var('a') <xarray.Dataset> Dimensions: (b: 1) Dimensions without coordinates: b Data variables: x int64 0 y int64 0 z (b) int64 0 Finally, adds support for n-dimensional cumsum() and cumprod(), reducing over all dimensions of an array. (This was necessary as part of the above fix.) * Lint fixup * remove confusing comments
1 parent ecb10e3 commit c346d3b

File tree

7 files changed

+140
-38
lines changed

7 files changed

+140
-38
lines changed

doc/whats-new.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,24 @@ Documentation
3737
Enhancements
3838
~~~~~~~~~~~~
3939

40+
- :py:meth:`~DataArray.cumsum` and :py:meth:`~DataArray.cumprod` now support
41+
aggregation over multiple dimensions at the same time. This is the default
42+
behavior when dimensions are not specified (previously this raised an error).
43+
By `Stephan Hoyer <https://github.com/shoyer>`_
44+
4045
Bug fixes
4146
~~~~~~~~~
47+
4248
- Fixed a bug where `to_netcdf(..., unlimited_dims='bar'` yielded NetCDF files
4349
with spurious 0-length dimensions (i.e. `b`, `a`, and `r`) (:issue:`2134`).
4450
By `Joe Hamman <https://github.com/jhamman>`_.
4551

52+
- Aggregations with :py:meth:`Dataset.reduce` (including ``mean``, ``sum``,
53+
etc) no longer drop unrelated coordinates (:issue:`1470`). Also fixed a
54+
bug where non-scalar data-variables that did not include the aggregation
55+
dimension were improperly skipped.
56+
By `Stephan Hoyer <https://github.com/shoyer>`_
57+
4658
.. _whats-new.0.10.4:
4759

4860
v0.10.4 (May 16, 2018)

xarray/core/dataset.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2594,26 +2594,26 @@ def reduce(self, func, dim=None, keep_attrs=False, numeric_only=False,
25942594
variables = OrderedDict()
25952595
for name, var in iteritems(self._variables):
25962596
reduce_dims = [dim for dim in var.dims if dim in dims]
2597-
if reduce_dims or not var.dims:
2598-
if name not in self.coords:
2599-
if (not numeric_only or
2600-
np.issubdtype(var.dtype, np.number) or
2601-
(var.dtype == np.bool_)):
2602-
if len(reduce_dims) == 1:
2603-
# unpack dimensions for the benefit of functions
2604-
# like np.argmin which can't handle tuple arguments
2605-
reduce_dims, = reduce_dims
2606-
elif len(reduce_dims) == var.ndim:
2607-
# prefer to aggregate over axis=None rather than
2608-
# axis=(0, 1) if they will be equivalent, because
2609-
# the former is often more efficient
2610-
reduce_dims = None
2611-
variables[name] = var.reduce(func, dim=reduce_dims,
2612-
keep_attrs=keep_attrs,
2613-
allow_lazy=allow_lazy,
2614-
**kwargs)
2597+
if name in self.coords:
2598+
if not reduce_dims:
2599+
variables[name] = var
26152600
else:
2616-
variables[name] = var
2601+
if (not numeric_only or
2602+
np.issubdtype(var.dtype, np.number) or
2603+
(var.dtype == np.bool_)):
2604+
if len(reduce_dims) == 1:
2605+
# unpack dimensions for the benefit of functions
2606+
# like np.argmin which can't handle tuple arguments
2607+
reduce_dims, = reduce_dims
2608+
elif len(reduce_dims) == var.ndim:
2609+
# prefer to aggregate over axis=None rather than
2610+
# axis=(0, 1) if they will be equivalent, because
2611+
# the former is often more efficient
2612+
reduce_dims = None
2613+
variables[name] = var.reduce(func, dim=reduce_dims,
2614+
keep_attrs=keep_attrs,
2615+
allow_lazy=allow_lazy,
2616+
**kwargs)
26172617

26182618
coord_names = set(k for k in self.coords if k in variables)
26192619
attrs = self.attrs if keep_attrs else None

xarray/core/duck_array_ops.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -281,8 +281,7 @@ def _nanvar_object(value, axis=None, **kwargs):
281281

282282

283283
def _create_nan_agg_method(name, numeric_only=False, np_compat=False,
284-
no_bottleneck=False, coerce_strings=False,
285-
keep_dims=False):
284+
no_bottleneck=False, coerce_strings=False):
286285
def f(values, axis=None, skipna=None, **kwargs):
287286
if kwargs.pop('out', None) is not None:
288287
raise TypeError('`out` is not valid for {}'.format(name))
@@ -343,7 +342,6 @@ def f(values, axis=None, skipna=None, **kwargs):
343342
'or newer to use skipna=True or skipna=None' % name)
344343
raise NotImplementedError(msg)
345344
f.numeric_only = numeric_only
346-
f.keep_dims = keep_dims
347345
f.__name__ = name
348346
return f
349347

@@ -358,10 +356,34 @@ def f(values, axis=None, skipna=None, **kwargs):
358356
var = _create_nan_agg_method('var', numeric_only=True)
359357
median = _create_nan_agg_method('median', numeric_only=True)
360358
prod = _create_nan_agg_method('prod', numeric_only=True, no_bottleneck=True)
361-
cumprod = _create_nan_agg_method('cumprod', numeric_only=True, np_compat=True,
362-
no_bottleneck=True, keep_dims=True)
363-
cumsum = _create_nan_agg_method('cumsum', numeric_only=True, np_compat=True,
364-
no_bottleneck=True, keep_dims=True)
359+
cumprod_1d = _create_nan_agg_method(
360+
'cumprod', numeric_only=True, np_compat=True, no_bottleneck=True)
361+
cumsum_1d = _create_nan_agg_method(
362+
'cumsum', numeric_only=True, np_compat=True, no_bottleneck=True)
363+
364+
365+
def _nd_cum_func(cum_func, array, axis, **kwargs):
366+
array = asarray(array)
367+
if axis is None:
368+
axis = tuple(range(array.ndim))
369+
if isinstance(axis, int):
370+
axis = (axis,)
371+
372+
out = array
373+
for ax in axis:
374+
out = cum_func(out, axis=ax, **kwargs)
375+
return out
376+
377+
378+
def cumprod(array, axis=None, **kwargs):
379+
"""N-dimensional version of cumprod."""
380+
return _nd_cum_func(cumprod_1d, array, axis, **kwargs)
381+
382+
383+
def cumsum(array, axis=None, **kwargs):
384+
"""N-dimensional version of cumsum."""
385+
return _nd_cum_func(cumsum_1d, array, axis, **kwargs)
386+
365387

366388
_fail_on_dask_array_input_skipna = partial(
367389
fail_on_dask_array_input,

xarray/core/variable.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1256,11 +1256,6 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=False,
12561256
if dim is not None and axis is not None:
12571257
raise ValueError("cannot supply both 'axis' and 'dim' arguments")
12581258

1259-
if getattr(func, 'keep_dims', False):
1260-
if dim is None and axis is None:
1261-
raise ValueError("must supply either single 'dim' or 'axis' "
1262-
"argument to %s" % (func.__name__))
1263-
12641259
if dim is not None:
12651260
axis = self.get_axis_num(dim)
12661261
data = func(self.data if allow_lazy else self.values,

xarray/tests/test_dataarray.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1767,6 +1767,11 @@ def test_cumops(self):
17671767
orig = DataArray([[-1, 0, 1], [-3, 0, 3]], coords,
17681768
dims=['x', 'y'])
17691769

1770+
actual = orig.cumsum()
1771+
expected = DataArray([[-1, -1, 0], [-4, -4, 0]], coords,
1772+
dims=['x', 'y'])
1773+
assert_identical(expected, actual)
1774+
17701775
actual = orig.cumsum('x')
17711776
expected = DataArray([[-1, 0, 1], [-4, 0, 4]], coords,
17721777
dims=['x', 'y'])

xarray/tests/test_dataset.py

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3331,7 +3331,18 @@ def test_reduce(self):
33313331

33323332
assert_equal(data.mean(dim=[]), data)
33333333

3334-
# uint support
3334+
def test_reduce_coords(self):
3335+
# regression test for GH1470
3336+
data = xr.Dataset({'a': ('x', [1, 2, 3])}, coords={'b': 4})
3337+
expected = xr.Dataset({'a': 2}, coords={'b': 4})
3338+
actual = data.mean('x')
3339+
assert_identical(actual, expected)
3340+
3341+
# should be consistent
3342+
actual = data['a'].mean('x').to_dataset()
3343+
assert_identical(actual, expected)
3344+
3345+
def test_mean_uint_dtype(self):
33353346
data = xr.Dataset({'a': (('x', 'y'),
33363347
np.arange(6).reshape(3, 2).astype('uint')),
33373348
'b': (('x', ), np.array([0.1, 0.2, np.nan]))})
@@ -3345,15 +3356,20 @@ def test_reduce_bad_dim(self):
33453356
with raises_regex(ValueError, 'Dataset does not contain'):
33463357
data.mean(dim='bad_dim')
33473358

3359+
def test_reduce_cumsum(self):
3360+
data = xr.Dataset({'a': 1,
3361+
'b': ('x', [1, 2]),
3362+
'c': (('x', 'y'), [[np.nan, 3], [0, 4]])})
3363+
assert_identical(data.fillna(0), data.cumsum('y'))
3364+
3365+
expected = xr.Dataset({'a': 1,
3366+
'b': ('x', [1, 3]),
3367+
'c': (('x', 'y'), [[0, 3], [0, 7]])})
3368+
assert_identical(expected, data.cumsum())
3369+
33483370
def test_reduce_cumsum_test_dims(self):
33493371
data = create_test_data()
33503372
for cumfunc in ['cumsum', 'cumprod']:
3351-
with raises_regex(ValueError,
3352-
"must supply either single 'dim' or 'axis'"):
3353-
getattr(data, cumfunc)()
3354-
with raises_regex(ValueError,
3355-
"must supply either single 'dim' or 'axis'"):
3356-
getattr(data, cumfunc)(dim=['dim1', 'dim2'])
33573373
with raises_regex(ValueError, 'Dataset does not contain'):
33583374
getattr(data, cumfunc)(dim='bad_dim')
33593375

@@ -3460,6 +3476,10 @@ def test_reduce_scalars(self):
34603476
actual = ds.var()
34613477
assert_identical(expected, actual)
34623478

3479+
expected = Dataset({'x': 0, 'y': 0, 'z': ('b', [0])})
3480+
actual = ds.var('a')
3481+
assert_identical(expected, actual)
3482+
34633483
def test_reduce_only_one_axis(self):
34643484

34653485
def mean_only_one_axis(x, axis):

xarray/tests/test_duck_array_ops.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import warnings
99

1010
from xarray import DataArray, concat
11+
from xarray.core import duck_array_ops
1112
from xarray.core.duck_array_ops import (
1213
array_notnull_equiv, concatenate, count, first, last, mean, rolling_window,
1314
stack, where)
@@ -103,6 +104,53 @@ def test_all_nan_arrays(self):
103104
assert np.isnan(mean([np.nan, np.nan]))
104105

105106

107+
def test_cumsum_1d():
108+
inputs = np.array([0, 1, 2, 3])
109+
expected = np.array([0, 1, 3, 6])
110+
actual = duck_array_ops.cumsum(inputs)
111+
assert_array_equal(expected, actual)
112+
113+
actual = duck_array_ops.cumsum(inputs, axis=0)
114+
assert_array_equal(expected, actual)
115+
116+
actual = duck_array_ops.cumsum(inputs, axis=-1)
117+
assert_array_equal(expected, actual)
118+
119+
actual = duck_array_ops.cumsum(inputs, axis=(0,))
120+
assert_array_equal(expected, actual)
121+
122+
actual = duck_array_ops.cumsum(inputs, axis=())
123+
assert_array_equal(inputs, actual)
124+
125+
126+
def test_cumsum_2d():
127+
inputs = np.array([[1, 2], [3, 4]])
128+
129+
expected = np.array([[1, 3], [4, 10]])
130+
actual = duck_array_ops.cumsum(inputs)
131+
assert_array_equal(expected, actual)
132+
133+
actual = duck_array_ops.cumsum(inputs, axis=(0, 1))
134+
assert_array_equal(expected, actual)
135+
136+
actual = duck_array_ops.cumsum(inputs, axis=())
137+
assert_array_equal(inputs, actual)
138+
139+
140+
def test_cumprod_2d():
141+
inputs = np.array([[1, 2], [3, 4]])
142+
143+
expected = np.array([[1, 2], [3, 2 * 3 * 4]])
144+
actual = duck_array_ops.cumprod(inputs)
145+
assert_array_equal(expected, actual)
146+
147+
actual = duck_array_ops.cumprod(inputs, axis=(0, 1))
148+
assert_array_equal(expected, actual)
149+
150+
actual = duck_array_ops.cumprod(inputs, axis=())
151+
assert_array_equal(inputs, actual)
152+
153+
106154
class TestArrayNotNullEquiv():
107155
@pytest.mark.parametrize("arr1, arr2", [
108156
(np.array([1, 2, 3]), np.array([1, 2, 3])),

0 commit comments

Comments
 (0)