-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Rank Methods #1733
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Rank Methods #1733
Changes from all commits
9a2d73a
061df2c
aa26204
80099f4
e3d3276
420c308
34ab7c4
80f6711
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3222,6 +3222,48 @@ def quantile(self, q, dim=None, interpolation='linear', | |
new.coords['quantile'] = q | ||
return new | ||
|
||
def rank(self, dim, pct=False, keep_attrs=False):
    """Rank every data variable along one dimension.

    Ties receive the average of the ranks they would otherwise span, and
    ranks start at 1 rather than 0. With ``pct=True`` percentage ranks
    are computed instead.

    NaNs in the input are returned as NaNs.

    The `bottleneck` library is required.

    Parameters
    ----------
    dim : str
        Dimension over which to compute rank.
    pct : bool, optional
        If True, compute percentage ranks; forwarded unchanged to each
        variable's own ``rank`` method.
    keep_attrs : bool, optional
        If True, the dataset's attributes (`attrs`) are copied onto the
        result. If False (default), the result carries no attributes.

    Returns
    -------
    ranked : Dataset
        Data variables that do not depend on `dim` are dropped;
        non-data variables are passed through unchanged.

    Raises
    ------
    ValueError
        If `dim` is not one of this dataset's dimensions.
    """
    # NOTE(review): a reviewer asked for test coverage of this condition.
    if not (dim in self.dims):
        raise ValueError(
            'Dataset does not contain the dimension: %s' % dim)

    ranked_vars = OrderedDict()
    for key, variable in iteritems(self.variables):
        if name_is_data_var(self, key):
            # Data variables are ranked only when they span `dim`;
            # the others are intentionally left out of the result.
            if dim in variable.dims:
                ranked_vars[key] = variable.rank(dim, pct=pct)
            continue
        # Everything that is not a data variable is kept as-is.
        ranked_vars[key] = variable

    kept_coord_names = set(self.coords)
    new_attrs = None
    if keep_attrs:
        new_attrs = self.attrs
    return self._replace_vars_and_dims(
        ranked_vars, kept_coord_names, attrs=new_attrs)


def name_is_data_var(dataset, key):
    """Return True when `key` names one of `dataset`'s data variables."""
    return key in dataset.data_vars
|
||
@property
def real(self):
    """Result of applying ``x.real`` through ``_unary_op`` with
    ``keep_attrs=True``."""
    take_real = self._unary_op(lambda obj: obj.real, keep_attrs=True)
    return take_real(self)
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1348,7 +1348,6 @@ def quantile(self, q, dim=None, interpolation='linear'): | |
numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile, | ||
DataArray.quantile | ||
""" | ||
|
||
if isinstance(self.data, dask_array_type): | ||
raise TypeError("quantile does not work for arrays stored as dask " | ||
"arrays. Load the data via .compute() or .load() " | ||
|
@@ -1379,6 +1378,47 @@ def quantile(self, q, dim=None, interpolation='linear'): | |
interpolation=interpolation) | ||
return Variable(new_dims, qs) | ||
|
||
def rank(self, dim, pct=False):
    """Ranks the data.

    Equal values are assigned a rank that is the average of the ranks that
    would have been otherwise assigned to all of the values within that set.
    Ranks begin at 1, not 0. If pct is True, computes percentage ranks.

    NaNs in the input array are returned as NaNs.

    The `bottleneck` library is required.

    Parameters
    ----------
    dim : str
        Dimension over which to compute rank.
    pct : bool, optional
        If True, divide each rank by the number of valid (non-NaN)
        values along `dim`; otherwise return the 1-based ranks (which
        may be fractional for ties).

    Returns
    -------
    ranked : Variable

    Raises
    ------
    TypeError
        If the data is stored as a dask array.

    See Also
    --------
    Dataset.rank, DataArray.rank
    """
    import bottleneck as bn

    # NOTE(review): a reviewer asked for a test asserting that this error
    # is raised.
    if isinstance(self.data, dask_array_type):
        raise TypeError("rank does not work for arrays stored as dask "
                        "arrays. Load the data via .compute() or .load() "
                        "prior to calling this method.")

    axis = self.get_axis_num(dim)
    data = self.data
    # Compare by value: identity comparison against a string literal only
    # works by accident of interning.
    is_float = self.dtype.kind == 'f'
    func = bn.nanrankdata if is_float else bn.rankdata
    ranked = func(data, axis=axis)
    if pct:
        if is_float:
            # Only float data can hold NaNs, which must not be counted.
            count = np.sum(~np.isnan(data), axis=axis, keepdims=True)
        else:
            # np.isnan is undefined for e.g. string arrays; without NaNs
            # every element along the axis is valid.
            count = data.shape[axis]
        ranked /= count
    return Variable(self.dims, ranked)
|
||
@property
def real(self):
    """New object of the same type built from this object's dims, the
    ``.real`` of its data, and its ``_attrs``."""
    cls = type(self)
    dims = self.dims
    real_part = self.data.real
    return cls(dims, real_part, self._attrs)
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,7 +19,7 @@ | |
from xarray.tests import ( | ||
TestCase, ReturnItem, source_ndarray, unittest, requires_dask, | ||
assert_identical, assert_equal, assert_allclose, assert_array_equal, | ||
raises_regex, requires_scipy) | ||
raises_regex, requires_scipy, requires_bottleneck) | ||
|
||
|
||
class TestDataArray(TestCase): | ||
|
@@ -3104,6 +3104,25 @@ def test_sortby(self): | |
actual = da.sortby(['x', 'y']) | ||
self.assertDataArrayEqual(actual, expected) | ||
|
||
@requires_bottleneck
def test_rank(self):
    """DataArray.rank for float (with NaN), int, str and pct inputs."""
    # floats
    ar = DataArray([[3, 4, np.nan, 1]])
    expect_0 = DataArray([[1, 1, np.nan, 1]])
    expect_1 = DataArray([[2, 3, np.nan, 1]])
    self.assertDataArrayEqual(ar.rank('dim_0'), expect_0)
    self.assertDataArrayEqual(ar.rank('dim_1'), expect_1)
    # int
    x = DataArray([3, 2, 1])
    self.assertDataArrayEqual(x.rank('dim_0'), x)
    # str
    y = DataArray(['c', 'b', 'a'])
    self.assertDataArrayEqual(y.rank('dim_0'), x)

    # NOTE(review): a reviewer asked for additional test coverage at this
    # point (the comment is truncated in the source; presumably for the
    # `pct` option exercised below -- confirm).
    x = DataArray([3.0, 1.0, np.nan, 2.0, 4.0], dims=('z',))
    y = DataArray([0.75, 0.25, np.nan, 0.5, 1.0], dims=('z',))
    # Rank the input `x`, not the expected array `y`: ranking `y` only
    # passes because its percentage ranks happen to equal itself.
    self.assertDataArrayEqual(x.rank('z', pct=True), y)
|
||
|
||
@pytest.fixture(params=[1]) | ||
def da(request): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,8 @@ | |
|
||
from . import TestCase, source_ndarray, requires_dask, raises_regex | ||
|
||
from xarray.tests import requires_bottleneck | ||
|
||
|
||
class VariableSubclassTestCases(object): | ||
def test_properties(self): | ||
|
@@ -1353,6 +1355,38 @@ def test_quantile_dask_raises(self): | |
with raises_regex(TypeError, 'arrays stored as dask'): | ||
v.quantile(0.5, dim='x') | ||
|
||
@requires_dask
@requires_bottleneck
def test_rank_dask_raises(self):
    """rank on a chunked Variable must raise the dask TypeError."""
    values = [3.0, 1.0, np.nan, 2.0, 4.0]
    chunked = Variable(['x'], values).chunk(2)
    with raises_regex(TypeError, 'arrays stored as dask'):
        chunked.rank('x')
|
||
@requires_bottleneck
def test_rank(self):
    """Variable.rank against bottleneck for float, int and str data, plus
    the pct option and an unknown dimension."""
    import bottleneck as bn

    # Float data containing a NaN, ranked along each of its two axes.
    floats = Variable(['x', 'y'], [[3, 4, np.nan, 1]])
    wanted = [bn.nanrankdata(floats.data, axis=ax) for ax in (0, 1)]
    for dim_name, want in zip(('x', 'y'), wanted):
        np.testing.assert_allclose(floats.rank(dim_name).values, want)

    # NOTE(review): an inline review thread was attached here in the
    # source page; the reviewer's remark is truncated, and the author
    # replied that this part was copied from the test above.
    # Integer data first, then string data; neither can hold NaN.
    for raw in ([3, 2, 1], ['c', 'b', 'a']):
        plain = Variable(['x'], raw)
        want = bn.rankdata(plain.data, axis=0)
        np.testing.assert_allclose(plain.rank('x').values, want)

    # Percentage ranks: the NaN stays NaN.
    v = Variable(['x'], [3.0, 1.0, np.nan, 2.0, 4.0])
    v_expect = Variable(['x'], [0.75, 0.25, np.nan, 0.5, 1.0])
    self.assertVariableEqual(v.rank('x', pct=True), v_expect)

    # Ranking along a dimension the variable lacks must fail.
    with raises_regex(ValueError, 'not found'):
        v.rank('y')
|
||
def test_big_endian_reduce(self): | ||
# regression test for GH489 | ||
data = np.ones(5, dtype='>f4') | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a note here that this requires bottleneck.