diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst
index c27db1e46a9..b8fbfbc288f 100644
--- a/doc/api-hidden.rst
+++ b/doc/api-hidden.rst
@@ -46,6 +46,7 @@
    Dataset.T
    Dataset.cumsum
    Dataset.cumprod
+   Dataset.rank
 
    DataArray.ndim
    DataArray.shape
@@ -91,6 +92,7 @@
    DataArray.T
    DataArray.cumsum
    DataArray.cumprod
+   DataArray.rank
 
    ufuncs.angle
    ufuncs.arccos
diff --git a/doc/api.rst b/doc/api.rst
index 7bcb844783a..9db59a3cc1f 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -160,6 +160,7 @@ Computation
 :py:attr:`~Dataset.real`
 :py:attr:`~Dataset.cumsum`
 :py:attr:`~Dataset.cumprod`
+:py:attr:`~Dataset.rank`
 
 **Grouped operations**:
 :py:attr:`~core.groupby.DatasetGroupBy.assign`
@@ -312,6 +313,7 @@ Computation
 :py:attr:`~DataArray.T`
 :py:attr:`~DataArray.cumsum`
 :py:attr:`~DataArray.cumprod`
+:py:attr:`~DataArray.rank`
 
 **Grouped operations**:
 :py:attr:`~core.groupby.DataArrayGroupBy.assign_coords`
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index a1851633fd1..a5b5970dc96 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -34,6 +34,12 @@ Enhancements
   in :py:func:`xarray.open_rasterio`.
   By `Matti Eskelinen `
 
+**New functions/methods**
+
+- New :py:meth:`~xarray.DataArray.rank` on arrays and datasets. Requires
+  bottleneck (:issue:`1731`).
+  By `0x0L `_.
+
 Bug fixes
 ~~~~~~~~~
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 4ab5136d071..12b010f9801 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -19,6 +19,7 @@
 from .accessors import DatetimeAccessor
 from .alignment import align, reindex_like_indexers
 from .common import AbstractArray, BaseDataObject
+from .computation import apply_ufunc
 from .coordinates import (DataArrayCoordinates, LevelCoordinatesSource,
                           Indexes, assert_coordinate_consistent,
                           remap_label_indexers)
@@ -1971,6 +1972,45 @@ def quantile(self, q, dim=None, interpolation='linear', keep_attrs=False):
                                               interpolation=interpolation)
         return self._from_temp_dataset(ds)
 
+    def rank(self, dim, pct=False, keep_attrs=False):
+        """Ranks the data.
+
+        Equal values are assigned a rank that is the average of the ranks that
+        would have been otherwise assigned to all of the values within that set.
+        Ranks begin at 1, not 0. If pct is True, computes percentage ranks.
+
+        NaNs in the input array are returned as NaNs.
+
+        The `bottleneck` library is required.
+
+        Parameters
+        ----------
+        dim : str
+            Dimension over which to compute rank.
+        pct : bool, optional
+            If True, compute percentage ranks, otherwise compute integer ranks.
+        keep_attrs : bool, optional
+            If True, the original object's attributes (`attrs`) will be
+            copied to the new one. If False (default), the new object
+            will be returned without attributes.
+
+        Returns
+        -------
+        ranked : DataArray
+            DataArray with the same coordinates and dtype 'float64'.
+
+        Examples
+        --------
+
+        >>> arr = xr.DataArray([5, 6, 7], dims='x')
+        >>> arr.rank('x')
+        <xarray.DataArray (x: 3)>
+        array([ 1.,  2.,  3.])
+        Dimensions without coordinates: x
+        """
+        ds = self._to_temp_dataset().rank(dim, pct=pct, keep_attrs=keep_attrs)
+        return self._from_temp_dataset(ds)
+
 
 # priority most be higher than Variable to properly work with binary ufuncs
 ops.inject_all_ops_and_reduce_methods(DataArray, priority=60)
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 681390f8504..000c53a829d 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -3222,6 +3222,48 @@ def quantile(self, q, dim=None, interpolation='linear',
         new.coords['quantile'] = q
         return new
 
+    def rank(self, dim, pct=False, keep_attrs=False):
+        """Ranks the data.
+
+        Equal values are assigned a rank that is the average of the ranks that
+        would have been otherwise assigned to all of the values within that set.
+        Ranks begin at 1, not 0. If pct is True, computes percentage ranks.
+
+        NaNs in the input array are returned as NaNs.
+
+        The `bottleneck` library is required.
+
+        Parameters
+        ----------
+        dim : str
+            Dimension over which to compute rank.
+        pct : bool, optional
+            If True, compute percentage ranks, otherwise compute integer ranks.
+        keep_attrs : bool, optional
+            If True, the dataset's attributes (`attrs`) will be copied from
+            the original object to the new one. If False (default), the new
+            object will be returned without attributes.
+
+        Returns
+        -------
+        ranked : Dataset
+            Variables that do not depend on `dim` are dropped.
+        """
+        if dim not in self.dims:
+            raise ValueError('Dataset does not contain the dimension: %s' % dim)
+
+        variables = OrderedDict()
+        for name, var in iteritems(self.variables):
+            if name in self.data_vars:
+                if dim in var.dims:
+                    variables[name] = var.rank(dim, pct=pct)
+            else:
+                variables[name] = var
+
+        coord_names = set(self.coords)
+        attrs = self.attrs if keep_attrs else None
+        return self._replace_vars_and_dims(variables, coord_names, attrs=attrs)
+
     @property
     def real(self):
         return self._unary_op(lambda x: x.real, keep_attrs=True)(self)
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 5c1a2bbe1a7..19ce10145cc 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -1348,7 +1348,6 @@ def quantile(self, q, dim=None, interpolation='linear'):
         numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile,
         DataArray.quantile
         """
-
         if isinstance(self.data, dask_array_type):
             raise TypeError("quantile does not work for arrays stored as dask "
                             "arrays. Load the data via .compute() or .load() "
                             "prior to calling this method.")
@@ -1379,6 +1378,47 @@
                               interpolation=interpolation)
         return Variable(new_dims, qs)
 
+    def rank(self, dim, pct=False):
+        """Ranks the data.
+
+        Equal values are assigned a rank that is the average of the ranks that
+        would have been otherwise assigned to all of the values within that set.
+        Ranks begin at 1, not 0. If pct is True, computes percentage ranks.
+
+        NaNs in the input array are returned as NaNs.
+
+        The `bottleneck` library is required.
+
+        Parameters
+        ----------
+        dim : str
+            Dimension over which to compute rank.
+        pct : bool, optional
+            If True, compute percentage ranks, otherwise compute integer ranks.
+
+        Returns
+        -------
+        ranked : Variable
+
+        See Also
+        --------
+        Dataset.rank, DataArray.rank
+        """
+        import bottleneck as bn
+
+        if isinstance(self.data, dask_array_type):
+            raise TypeError("rank does not work for arrays stored as dask "
+                            "arrays. Load the data via .compute() or .load() "
+                            "prior to calling this method.")
+
+        axis = self.get_axis_num(dim)
+        func = bn.nanrankdata if self.dtype.kind == 'f' else bn.rankdata
+        ranked = func(self.data, axis=axis)
+        if pct:
+            count = np.sum(~np.isnan(self.data), axis=axis, keepdims=True)
+            ranked /= count
+        return Variable(self.dims, ranked)
+
     @property
     def real(self):
         return type(self)(self.dims, self.data.real, self._attrs)
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index 2a14742c948..58b5d2ee140 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -19,7 +19,7 @@
 from xarray.tests import (
     TestCase, ReturnItem, source_ndarray, unittest, requires_dask,
     assert_identical, assert_equal, assert_allclose, assert_array_equal,
-    raises_regex, requires_scipy)
+    raises_regex, requires_scipy, requires_bottleneck)
 
 
 class TestDataArray(TestCase):
@@ -3104,6 +3104,25 @@ def test_sortby(self):
         actual = da.sortby(['x', 'y'])
         self.assertDataArrayEqual(actual, expected)
 
+    @requires_bottleneck
+    def test_rank(self):
+        # floats
+        ar = DataArray([[3, 4, np.nan, 1]])
+        expect_0 = DataArray([[1, 1, np.nan, 1]])
+        expect_1 = DataArray([[2, 3, np.nan, 1]])
+        self.assertDataArrayEqual(ar.rank('dim_0'), expect_0)
+        self.assertDataArrayEqual(ar.rank('dim_1'), expect_1)
+        # int
+        x = DataArray([3, 2, 1])
+        self.assertDataArrayEqual(x.rank('dim_0'), x)
+        # str
+        y = DataArray(['c', 'b', 'a'])
+        self.assertDataArrayEqual(y.rank('dim_0'), x)
+
+        x = DataArray([3.0, 1.0, np.nan, 2.0, 4.0], dims=('z',))
+        y = DataArray([0.75, 0.25, np.nan, 0.5, 1.0], dims=('z',))
+        self.assertDataArrayEqual(y.rank('z', pct=True), y)
+
 
 @pytest.fixture(params=[1])
 def da(request):
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index d0b6ed55f45..4f592d53ee5 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -31,7 +31,8 @@
     requires_dask, source_ndarray)
 
 from xarray.tests import (assert_equal, assert_allclose,
-                          assert_array_equal, requires_scipy)
+                          assert_array_equal, requires_bottleneck,
+                          requires_scipy)
 
 
 def create_test_data(seed=None):
@@ -3410,6 +3411,23 @@ def test_quantile(self):
         assert 'dim3' in ds_quantile.dims
         assert all(d not in ds_quantile.dims for d in dim)
 
+    @requires_bottleneck
+    def test_rank(self):
+        ds = create_test_data(seed=1234)
+        # only ds.var3 depends on dim3
+        z = ds.rank('dim3')
+        self.assertItemsEqual(['var3'], list(z.data_vars))
+        # same as dataarray version
+        x = z.var3
+        y = ds.var3.rank('dim3')
+        self.assertDataArrayEqual(x, y)
+        # coordinates stick
+        self.assertItemsEqual(list(z.coords), list(ds.coords))
+        self.assertItemsEqual(list(x.coords), list(y.coords))
+        # invalid dim
+        with raises_regex(ValueError, 'does not contain'):
+            x.rank('invalid_dim')
+
     def test_count(self):
         ds = Dataset({'x': ('a', [np.nan, 1]), 'y': 0, 'z': np.nan})
         expected = Dataset({'x': 1, 'y': 1, 'z': 0})
diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py
index 2e5446bcafc..9c34baf3a67 100644
--- a/xarray/tests/test_variable.py
+++ b/xarray/tests/test_variable.py
@@ -26,6 +26,8 @@
 from . import TestCase, source_ndarray, requires_dask, raises_regex
 
+from xarray.tests import requires_bottleneck
+
 
 class VariableSubclassTestCases(object):
     def test_properties(self):
@@ -1353,6 +1355,38 @@ def test_quantile_dask_raises(self):
         with raises_regex(TypeError, 'arrays stored as dask'):
             v.quantile(0.5, dim='x')
 
+    @requires_dask
+    @requires_bottleneck
+    def test_rank_dask_raises(self):
+        v = Variable(['x'], [3.0, 1.0, np.nan, 2.0, 4.0]).chunk(2)
+        with raises_regex(TypeError, 'arrays stored as dask'):
+            v.rank('x')
+
+    @requires_bottleneck
+    def test_rank(self):
+        import bottleneck as bn
+        # floats
+        v = Variable(['x', 'y'], [[3, 4, np.nan, 1]])
+        expect_0 = bn.nanrankdata(v.data, axis=0)
+        expect_1 = bn.nanrankdata(v.data, axis=1)
+        np.testing.assert_allclose(v.rank('x').values, expect_0)
+        np.testing.assert_allclose(v.rank('y').values, expect_1)
+        # int
+        v = Variable(['x'], [3, 2, 1])
+        expect = bn.rankdata(v.data, axis=0)
+        np.testing.assert_allclose(v.rank('x').values, expect)
+        # str
+        v = Variable(['x'], ['c', 'b', 'a'])
+        expect = bn.rankdata(v.data, axis=0)
+        np.testing.assert_allclose(v.rank('x').values, expect)
+        # pct
+        v = Variable(['x'], [3.0, 1.0, np.nan, 2.0, 4.0])
+        v_expect = Variable(['x'], [0.75, 0.25, np.nan, 0.5, 1.0])
+        self.assertVariableEqual(v.rank('x', pct=True), v_expect)
+        # invalid dim
+        with raises_regex(ValueError, 'not found'):
+            v.rank('y')
+
     def test_big_endian_reduce(self):
         # regression test for GH489
         data = np.ones(5, dtype='>f4')
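
A minimal usage sketch of the ``rank`` API added above, assuming a build of
xarray with this patch applied and bottleneck installed; the array values are
illustrative and not taken from the test suite::

    import numpy as np
    import xarray as xr

    arr = xr.DataArray([5.0, np.nan, 6.0, 7.0], dims='x')

    # Ranks start at 1; NaNs stay NaN because float input is routed to
    # bottleneck.nanrankdata inside Variable.rank.
    arr.rank('x')            # values: [1., nan, 2., 3.]

    # pct=True divides each rank by the number of non-NaN values along the
    # ranked dimension (3 here), giving percentage ranks in (0, 1].
    arr.rank('x', pct=True)  # values: [1/3, nan, 2/3, 1.]

    # Dataset.rank keeps coordinates but drops data variables that do not
    # depend on the ranked dimension ('b' below has no dims).
    ds = xr.Dataset({'a': ('x', [3.0, 1.0, 2.0]), 'b': 42})
    ds.rank('x')             # only 'a' remains, ranked along 'x'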