Commit a47ff7b

Squashed commit of the following:

commit bc39877
Merge: 507b1f6 278d2e6
Author: dcherian <[email protected]>
Date:   Tue Oct 29 09:36:30 2019 -0600

    Merge remote-tracking branch 'upstream/master' into dask-tokenize

    * upstream/master:
      upgrade black verison to 19.10b0 (pydata#3456)
      Remove outdated code related to compatibility with netcdftime (pydata#3450)
      Remove deprecated behavior from dataset.drop docstring (pydata#3451)
      jupyterlab dark theme (pydata#3443)
      Drop groups associated with nans in group variable (pydata#3406)
      Allow ellipsis (...) in transpose (pydata#3421)
      Another groupby.reduce bugfix. (pydata#3403)
      add icomoon license (pydata#3448)
      change ALL_DIMS to equal ellipsis (pydata#3418)
      Escaping dtypes (pydata#3444)
      Html repr (pydata#3425)

commit 507b1f6
Author: dcherian <[email protected]>
Date:   Tue Oct 29 09:34:47 2019 -0600

    Fix window test

commit 4ab6a66
Author: dcherian <[email protected]>
Date:   Thu Oct 24 14:30:57 2019 -0600

    Implement __dask_tokenize__
1 parent 705e656 · commit a47ff7b

File tree: 6 files changed, +84 -0 lines


doc/whats-new.rst

Lines changed: 2 additions & 0 deletions
@@ -94,6 +94,8 @@ Internal Changes

 - Use Python 3.6 idioms throughout the codebase. (:pull:`3419`)
   By `Maximilian Roos <https://github.com/max-sixty>`_
+- Implement :py:func:`__dask_tokenize__` for xarray objects.
+  By `Deepak Cherian <https://github.com/dcherian>`_ and `Guido Imperiale <https://github.com/crusaderky>`_.

 .. _whats-new.0.14.0:
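For context on the feature the entry above describes: dask.base.tokenize(obj) produces a deterministic hash that dask uses to name tasks and deduplicate work, and it defers to a __dask_tokenize__ method when the object defines one. Below is a minimal sketch of that protocol, assuming a dask version that supports the hook; the Point class is illustrative and not part of this commit.

    import dask

    class Point:
        def __init__(self, x, y):
            self.x = x
            self.y = y

        def __dask_tokenize__(self):
            # Return something cheap and deterministic that identifies the
            # object; dask hashes this instead of inspecting the instance.
            return (Point, self.x, self.y)

    assert dask.base.tokenize(Point(1, 2)) == dask.base.tokenize(Point(1, 2))
    assert dask.base.tokenize(Point(1, 2)) != dask.base.tokenize(Point(1, 3))

This is exactly the shape of the implementations that follow: each xarray object returns a tuple of its type and the components that define its identity.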

xarray/core/dataarray.py

Lines changed: 3 additions & 0 deletions
@@ -754,6 +754,9 @@ def reset_coords(
         dataset[self.name] = self.variable
         return dataset

+    def __dask_tokenize__(self):
+        return (DataArray, self._variable, self._coords, self._name)
+
     def __dask_graph__(self):
         return self._to_temp_dataset().__dask_graph__()
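Because the DataArray token is built from the variable, coordinates, and name, metadata-only changes alter the token without computing any data. A hedged usage sketch (assuming dask is installed and this commit is applied), mirroring the tests added below:

    import numpy as np
    import xarray as xr
    import dask

    a = xr.DataArray(np.arange(4), dims="x", name="a").chunk(2)
    # A lazily equivalent deep copy tokenizes identically ...
    assert dask.base.tokenize(a) == dask.base.tokenize(a.copy(deep=True))
    # ... while renaming changes _name and therefore the token.
    assert dask.base.tokenize(a) != dask.base.tokenize(a.rename("b"))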

xarray/core/dataset.py

Lines changed: 3 additions & 0 deletions
@@ -649,6 +649,9 @@ def load(self, **kwargs) -> "Dataset":

         return self

+    def __dask_tokenize__(self):
+        return (Dataset, self._variables, self._coord_names, self._attrs)
+
     def __dask_graph__(self):
         graphs = {k: v.__dask_graph__() for k, v in self.variables.items()}
         graphs = {k: v for k, v in graphs.items() if v is not None}
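The Dataset token combines _variables (a mapping whose Variable values tokenize through their own hook), the coordinate names, and _attrs, so attribute changes are visible to dask. A small sketch under the same assumptions as above:

    import numpy as np
    import xarray as xr
    import dask

    ds = xr.Dataset({"a": ("x", np.arange(4))}).chunk(2)
    ds2 = ds.copy(deep=True)
    ds2.attrs["note"] = "changed"
    # _attrs participates in the token, so the two datasets hash differently.
    assert dask.base.tokenize(ds) != dask.base.tokenize(ds2)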

xarray/core/variable.py

Lines changed: 6 additions & 0 deletions
@@ -390,6 +390,9 @@ def compute(self, **kwargs):
         new = self.copy(deep=False)
         return new.load(**kwargs)

+    def __dask_tokenize__(self):
+        return Variable, self._dims, self.data, self._attrs
+
     def __dask_graph__(self):
         if isinstance(self._data, dask_array_type):
             return self._data.__dask_graph__()
@@ -1967,6 +1970,9 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
         if not isinstance(self._data, PandasIndexAdapter):
             self._data = PandasIndexAdapter(self._data)

+    def __dask_tokenize__(self):
+        return (IndexVariable, self._dims, self._data.array, self._attrs)
+
     def load(self):
         # data is already loaded into memory for IndexVariable
         return self
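Two details worth noting: Variable tokenizes self.data, so a dask-backed variable hashes cheaply via its dask graph while a numpy-backed one is hashed by content; IndexVariable instead tokenizes self._data.array, the pandas index wrapped by PandasIndexAdapter, presumably to avoid materializing a fresh numpy array just to hash it. A sketch of the content-based behavior, under the same assumptions as above:

    import numpy as np
    import xarray as xr
    import dask

    v = xr.Variable("x", np.arange(4))
    w = xr.Variable("x", np.arange(4))
    # Equal dims, data, and attrs yield equal tokens (numpy data is hashed by content).
    assert dask.base.tokenize(v) == dask.base.tokenize(w)
    # attrs are part of the token, so adding one changes it.
    v2 = xr.Variable("x", np.arange(4), attrs={"units": "m"})
    assert dask.base.tokenize(v) != dask.base.tokenize(v2)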

xarray/tests/test_dask.py

Lines changed: 58 additions & 0 deletions
@@ -1,5 +1,6 @@
 import operator
 import pickle
+import sys
 from contextlib import suppress
 from distutils.version import LooseVersion
 from textwrap import dedent
@@ -23,11 +24,14 @@
     raises_regex,
 )
 from ..core.duck_array_ops import lazy_array_equiv
+from .test_backends import create_tmp_file

 dask = pytest.importorskip("dask")
 da = pytest.importorskip("dask.array")
 dd = pytest.importorskip("dask.dataframe")

+ON_WINDOWS = sys.platform == "win32"
+

 class CountingScheduler:
     """ Simple dask scheduler counting the number of computes.
@@ -1221,3 +1225,57 @@ def test_lazy_array_equiv():
         "no_conflicts",
     ]:
         xr.merge([lons1, lons2], compat=compat)
+
+
+@pytest.mark.parametrize("obj", [make_da(), make_ds()])
+@pytest.mark.parametrize(
+    "transform",
+    [
+        lambda x: x.reset_coords(),
+        lambda x: x.reset_coords(drop=True),
+        lambda x: x.isel(x=1),
+        lambda x: x.attrs.update(new_attrs=1),
+        lambda x: x.assign_coords(cxy=1),
+        lambda x: x.rename({"x": "xnew"}),
+        lambda x: x.rename({"cxy": "cxynew"}),
+    ],
+)
+def test_normalize_token_not_identical(obj, transform):
+    with raise_if_dask_computes():
+        assert not dask.base.tokenize(obj) == dask.base.tokenize(transform(obj))
+    assert not dask.base.tokenize(obj.compute()) == dask.base.tokenize(
+        transform(obj.compute())
+    )
+
+
+@pytest.mark.parametrize("transform", [lambda x: x, lambda x: x.compute()])
+def test_normalize_differently_when_data_changes(transform):
+    obj = transform(make_ds())
+    new = obj.copy(deep=True)
+    new["a"] *= 2
+    with raise_if_dask_computes():
+        assert not dask.base.tokenize(obj) == dask.base.tokenize(new)
+
+    obj = transform(make_da())
+    new = obj.copy(deep=True)
+    new *= 2
+    with raise_if_dask_computes():
+        assert not dask.base.tokenize(obj) == dask.base.tokenize(new)
+
+
+@pytest.mark.parametrize(
+    "transform", [lambda x: x, lambda x: x.copy(), lambda x: x.copy(deep=True)]
+)
+@pytest.mark.parametrize(
+    "obj", [make_da(), make_ds(), make_da().indexes["x"], make_ds().variables["a"]]
+)
+def test_normalize_token_identical(obj, transform):
+    with raise_if_dask_computes():
+        assert dask.base.tokenize(obj) == dask.base.tokenize(transform(obj))
+
+
+def test_normalize_token_netcdf_backend(map_ds):
+    with create_tmp_file(allow_cleanup_failure=ON_WINDOWS) as tmp_file:
+        map_ds.to_netcdf(tmp_file)
+        read = xr.open_dataset(tmp_file)
+        assert not dask.base.tokenize(map_ds) == dask.base.tokenize(read)
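The raise_if_dask_computes guard used throughout these tests comes from xarray's test helpers and builds on the CountingScheduler pattern shown in context above: it fails the test if anything computes while tokens are being taken. A rough sketch of the idea, with an illustrative forbidding scheduler that is not part of this commit:

    import dask
    import xarray as xr

    def forbidding_scheduler(dsk, keys, **kwargs):
        # Installed as the active scheduler; any compute lands here and raises.
        raise RuntimeError("tokenize should not compute anything")

    with dask.config.set(scheduler=forbidding_scheduler):
        # Tokenizing a chunked object never executes its graph, so no error.
        dask.base.tokenize(xr.DataArray([1.0, 2.0], dims="x").chunk(1))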

xarray/tests/test_sparse.py

Lines changed: 12 additions & 0 deletions
@@ -22,6 +22,7 @@
 )

 sparse = pytest.importorskip("sparse")
+dask = pytest.importorskip("dask")


 def assert_sparse_equal(a, b):
@@ -849,3 +850,14 @@ def test_chunk():
     dsc = ds.chunk(2)
     assert dsc.chunks == {"dim_0": (2, 2)}
     assert_identical(dsc, ds)
+
+
+def test_normalize_token():
+    s = sparse.COO.from_numpy(np.array([0, 0, 1, 2]))
+    a = DataArray(s)
+    dask.base.tokenize(a)
+    assert isinstance(a.data, sparse.COO)
+
+    ac = a.chunk(2)
+    dask.base.tokenize(ac)
+    assert isinstance(ac.data._meta, sparse.COO)
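These assertions guard laziness rather than token values: tokenize() must not densify the sparse payload, so after each call the test checks that the DataArray still wraps a sparse.COO, and, for the chunked case, that the dask array's _meta is still COO rather than a dense numpy array.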
