diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 9d300741955..00701c718af 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -25,6 +25,10 @@ Breaking changes
   merges will now succeed in cases that previously raised
   ``xarray.MergeError``. Set ``compat='broadcast_equals'`` to restore the
   previous default.
+- Pickling an xarray object based on the dask backend, or reading its
+  :py:attr:`values` property, won't automatically convert the array from dask
+  to numpy in the original object anymore.
+  By `Guido Imperiale <https://github.com/crusaderky>`_.
 
 Deprecations
 ~~~~~~~~~~~~
@@ -45,33 +49,33 @@ Deprecations
 
 Enhancements
 ~~~~~~~~~~~~
 - Add checking of ``attr`` names and values when saving to netCDF, raising useful
-error messages if they are invalid. (:issue:`911`).
-By `Robin Wilson <https://github.com/robintw>`_.
-
+  error messages if they are invalid. (:issue:`911`).
+  By `Robin Wilson <https://github.com/robintw>`_.
 - Added ability to save ``DataArray`` objects directly to netCDF files using
   :py:meth:`~xarray.DataArray.to_netcdf`, and to load directly from netCDF files
   using :py:func:`~xarray.open_dataarray` (:issue:`915`). These remove the need
   to convert a ``DataArray`` to a ``Dataset`` before saving as a netCDF file,
   and deals with names to ensure a perfect 'roundtrip' capability.
   By `Robin Wilson <https://github.com/robintw>`_.
-
 - Added the ``compat`` option ``'no_conflicts'`` to ``merge``, allowing the
   combination of xarray objects with disjoint (:issue:`742`) or
   overlapping (:issue:`835`) coordinates as long as all present data agrees.
   By `Johnnie Gray <https://github.com/jcmgray>`_. See
   :ref:`combining.no_conflicts` for more details.
-
 - It is now possible to set ``concat_dim=None`` explicitly in
   :py:func:`~xarray.open_mfdataset` to disable inferring a dimension along
   which to concatenate.
   By `Stephan Hoyer <https://github.com/shoyer>`_.
+- Added methods :py:meth:`DataArray.compute`, :py:meth:`Dataset.compute`, and
+  :py:meth:`Variable.compute` as a non-mutating alternative to
+  :py:meth:`~DataArray.load`.
+  By `Guido Imperiale <https://github.com/crusaderky>`_.
 
 Bug fixes
 ~~~~~~~~~
@@ -95,6 +99,9 @@ Bug fixes
 - ``.where()`` and ``.fillna()`` now preserve attributes (:issue:`1009`).
   By `Fabien Maussion <https://github.com/fmaussion>`_.
+- Applying :py:func:`broadcast` to an xarray object based on the dask backend
+  won't accidentally convert the array from dask to numpy anymore (:issue:`978`).
+  By `Guido Imperiale <https://github.com/crusaderky>`_.
 
 .. _whats-new.0.8.2:
 
 v0.8.2 (18 August 2016)
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 44d5865ad37..79fdd7c1100 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -561,6 +561,19 @@ def load(self):
         self._coords = new._coords
         return self
 
+    def compute(self):
+        """Manually trigger loading of this array's data from disk or a
+        remote source into memory and return a new array. The original is
+        left unaltered.
+
+        Normally, it should not be necessary to call this method in user code,
+        because all xarray functions should either work on deferred data or
+        load data automatically. However, this method can be necessary when
+        working with many file objects on disk.
+        """
+        new = self.copy(deep=False)
+        return new.load()
+
     def copy(self, deep=True):
         """Returns a copy of this array.
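A minimal usage sketch of the new ``compute()`` contrasted with ``load()`` (an
editor's illustration, not part of the patch; it assumes dask is installed so
that ``.chunk()`` produces a dask-backed array):

    import xarray as xr

    a = xr.DataArray([1, 2, 3]).chunk()  # dask-backed array

    b = a.compute()  # new object whose data has been loaded into numpy
    # 'a' is left unaltered and still holds a dask array

    a.load()         # loads in place, mutating 'a' and returning it

``compute()`` is simply a shallow copy followed by ``load()``, so the original
object keeps its dask array and is never coerced to numpy behind your back.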
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 45650d21501..d060e845ceb 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -255,8 +255,11 @@ def load_store(cls, store, decoder=None):
         return obj
 
     def __getstate__(self):
-        """Always load data in-memory before pickling"""
-        self.load()
+        """Load data in-memory before pickling (except for Dask data)"""
+        for v in self.variables.values():
+            if not isinstance(v.data, dask_array_type):
+                v.load()
+
         # self.__dict__ is the default pickle object, we don't need to
         # implement our own __setstate__ method to make pickle work
         state = self.__dict__.copy()
@@ -306,7 +309,7 @@ def load(self):
         """
         # access .data to coerce everything to numpy or dask arrays
         all_data = dict((k, v.data) for k, v in self.variables.items())
-        lazy_data = dict((k, v) for k, v in all_data.items()
+        lazy_data = OrderedDict((k, v) for k, v in all_data.items()
                          if isinstance(v, dask_array_type))
         if lazy_data:
             import dask.array as da
@@ -319,6 +322,19 @@ def load(self):
 
         return self
 
+    def compute(self):
+        """Manually trigger loading of this dataset's data from disk or a
+        remote source into memory and return a new dataset. The original is
+        left unaltered.
+
+        Normally, it should not be necessary to call this method in user code,
+        because all xarray functions should either work on deferred data or
+        load data automatically. However, this method can be necessary when
+        working with many file objects on disk.
+        """
+        new = self.copy(deep=False)
+        return new.load()
+
     @classmethod
     def _construct_direct(cls, variables, coord_names, dims=None, attrs=None,
                           file_obj=None):
@@ -404,11 +420,8 @@ def copy(self, deep=False):
         Otherwise, a shallow copy is made, so each variable in the new dataset
         is also a variable in the original dataset.
         """
-        if deep:
-            variables = OrderedDict((k, v.copy(deep=True))
-                                    for k, v in iteritems(self._variables))
-        else:
-            variables = self._variables.copy()
+        variables = OrderedDict((k, v.copy(deep=deep))
+                                for k, v in iteritems(self._variables))
         # skip __init__ to avoid costly validation
         return self._construct_direct(variables, self._coord_names.copy(),
                                       self._dims.copy(), self._attrs_copy())
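With the revised ``Dataset.__getstate__`` above, pickling computes only the
non-dask variables; dask-backed variables are pickled as their task graphs.
A short sketch of the resulting behavior (it mirrors ``test_dataset_pickle``
further below; ``_in_memory`` is a private property used by the tests):

    import pickle
    import xarray as xr

    ds1 = xr.Dataset({'a': [1, 2]}).chunk()  # 'a' becomes dask-backed
    ds2 = pickle.loads(pickle.dumps(ds1))    # round-trip through pickle

    # neither the original nor the unpickled copy was coerced to numpy
    assert not ds1['a']._in_memory
    assert not ds2['a']._in_memory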
""" - self._data_cached() + new_data = self._data_cached() + if isinstance(self._data, dask_array_type): + self._data = new_data return self + def compute(self): + """Manually trigger loading of this variable's data from disk or a + remote source into memory and return a new variable. The original is + left unaltered. + + Normally, it should not be necessary to call this method in user code, + because all xarray functions should either work on deferred data or + load data automatically. + """ + new = self.copy(deep=False) + return new.load() + def __getstate__(self): - """Always cache data as an in-memory array before pickling""" + """Always cache data as an in-memory array before pickling + (with the exception of dask backend)""" self._data_cached() # self.__dict__ is the default pickle object, we don't need to # implement our own __setstate__ method to make pickle work @@ -1090,10 +1119,11 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): raise ValueError('%s objects must be 1-dimensional' % type(self).__name__) - def _data_cached(self): - if not isinstance(self._data, PandasIndexAdapter): - self._data = PandasIndexAdapter(self._data) - return self._data + def _data_cast(self): + if isinstance(self._data, PandasIndexAdapter): + return self._data + else: + return PandasIndexAdapter(self._data) def __getitem__(self, key): key = self._item_key_to_tuple(key) diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py index eeb5561579b..0ff5cffec26 100644 --- a/xarray/test/test_backends.py +++ b/xarray/test/test_backends.py @@ -149,6 +149,24 @@ def assert_loads(vars=None): actual = ds.load() self.assertDatasetAllClose(expected, actual) + def test_dataset_compute(self): + expected = create_test_data() + + with self.roundtrip(expected) as actual: + # Test Dataset.compute() + for v in actual.variables.values(): + self.assertFalse(v._in_memory) + + computed = actual.compute() + + for v in actual.data_vars.values(): + self.assertFalse(v._in_memory) + for v in computed.variables.values(): + self.assertTrue(v._in_memory) + + self.assertDatasetAllClose(expected, actual) + self.assertDatasetAllClose(expected, computed) + def test_roundtrip_None_variable(self): expected = Dataset({None: (('x', 'y'), [[0, 1], [2, 3]])}) with self.roundtrip(expected) as actual: @@ -230,18 +248,6 @@ def test_roundtrip_coordinates(self): with self.roundtrip(expected) as actual: self.assertDatasetIdentical(expected, actual) - expected = original.copy() - expected.attrs['coordinates'] = 'something random' - with self.assertRaisesRegexp(ValueError, 'cannot serialize'): - with self.roundtrip(expected): - pass - - expected = original.copy(deep=True) - expected['foo'].attrs['coordinates'] = 'something random' - with self.assertRaisesRegexp(ValueError, 'cannot serialize'): - with self.roundtrip(expected): - pass - def test_roundtrip_boolean_dtype(self): original = create_boolean_data() self.assertEqual(original['x'].dtype, 'bool') @@ -872,7 +878,26 @@ def test_read_byte_attrs_as_unicode(self): @requires_dask @requires_scipy @requires_netCDF4 -class DaskTest(TestCase): +class DaskTest(TestCase, DatasetIOTestCases): + @contextlib.contextmanager + def create_store(self): + yield Dataset() + + @contextlib.contextmanager + def roundtrip(self, data, save_kwargs={}, open_kwargs={}): + yield data.chunk() + + def test_roundtrip_datetime_data(self): + # Override method in DatasetIOTestCases - remove not applicable save_kwds + times = pd.to_datetime(['2000-01-01', '2000-01-02', 'NaT']) + expected = 
diff --git a/xarray/test/test_backends.py b/xarray/test/test_backends.py
index eeb5561579b..0ff5cffec26 100644
--- a/xarray/test/test_backends.py
+++ b/xarray/test/test_backends.py
@@ -149,6 +149,24 @@ def assert_loads(vars=None):
             actual = ds.load()
         self.assertDatasetAllClose(expected, actual)
 
+    def test_dataset_compute(self):
+        expected = create_test_data()
+
+        with self.roundtrip(expected) as actual:
+            # Test Dataset.compute()
+            for v in actual.variables.values():
+                self.assertFalse(v._in_memory)
+
+            computed = actual.compute()
+
+            for v in actual.data_vars.values():
+                self.assertFalse(v._in_memory)
+            for v in computed.variables.values():
+                self.assertTrue(v._in_memory)
+
+            self.assertDatasetAllClose(expected, actual)
+            self.assertDatasetAllClose(expected, computed)
+
     def test_roundtrip_None_variable(self):
         expected = Dataset({None: (('x', 'y'), [[0, 1], [2, 3]])})
         with self.roundtrip(expected) as actual:
@@ -230,18 +248,6 @@ def test_roundtrip_coordinates(self):
         with self.roundtrip(expected) as actual:
             self.assertDatasetIdentical(expected, actual)
 
-        expected = original.copy()
-        expected.attrs['coordinates'] = 'something random'
-        with self.assertRaisesRegexp(ValueError, 'cannot serialize'):
-            with self.roundtrip(expected):
-                pass
-
-        expected = original.copy(deep=True)
-        expected['foo'].attrs['coordinates'] = 'something random'
-        with self.assertRaisesRegexp(ValueError, 'cannot serialize'):
-            with self.roundtrip(expected):
-                pass
-
     def test_roundtrip_boolean_dtype(self):
         original = create_boolean_data()
         self.assertEqual(original['x'].dtype, 'bool')
@@ -872,7 +878,26 @@ def test_read_byte_attrs_as_unicode(self):
 @requires_dask
 @requires_scipy
 @requires_netCDF4
-class DaskTest(TestCase):
+class DaskTest(TestCase, DatasetIOTestCases):
+    @contextlib.contextmanager
+    def create_store(self):
+        yield Dataset()
+
+    @contextlib.contextmanager
+    def roundtrip(self, data, save_kwargs={}, open_kwargs={}):
+        yield data.chunk()
+
+    def test_roundtrip_datetime_data(self):
+        # Override method in DatasetIOTestCases - remove non-applicable save_kwds
+        times = pd.to_datetime(['2000-01-01', '2000-01-02', 'NaT'])
+        expected = Dataset({'t': ('t', times), 't0': times[0]})
+        with self.roundtrip(expected) as actual:
+            self.assertDatasetIdentical(expected, actual)
+
+    def test_write_store(self):
+        # Override method in DatasetIOTestCases - not applicable to dask
+        pass
+
     def test_open_mfdataset(self):
         original = Dataset({'foo': ('x', np.random.randn(10))})
         with create_tmp_file() as tmp1:
@@ -992,7 +1017,16 @@ def test_deterministic_names(self):
                 self.assertIn(tmp, dask_name)
             self.assertEqual(original_names, repeat_names)
 
-
+    def test_dataarray_compute(self):
+        # Test DataArray.compute() on the dask backend.
+        # The test for Dataset.compute() is already in DatasetIOTestCases;
+        # however, dask is the only tested backend which supports DataArrays.
+        actual = DataArray([1, 2]).chunk()
+        computed = actual.compute()
+        self.assertFalse(actual._in_memory)
+        self.assertTrue(computed._in_memory)
+        self.assertDataArrayAllClose(actual, computed)
+
 @requires_scipy_or_netCDF4
 @requires_pydap
 class PydapTest(TestCase):
diff --git a/xarray/test/test_dask.py b/xarray/test/test_dask.py
index 13a8817ce68..652966770e2 100644
--- a/xarray/test/test_dask.py
+++ b/xarray/test/test_dask.py
@@ -1,3 +1,4 @@
+import pickle
 import numpy as np
 import pandas as pd
 
@@ -321,3 +322,36 @@ def test_dot(self):
         eager = self.eager_array.dot(self.eager_array[0])
         lazy = self.lazy_array.dot(self.lazy_array[0])
         self.assertLazyAndAllClose(eager, lazy)
+
+    def test_dataarray_pickle(self):
+        # Test that pickling/unpickling does not convert the dask
+        # backend to numpy
+        a1 = DataArray([1, 2]).chunk()
+        self.assertFalse(a1._in_memory)
+        a2 = pickle.loads(pickle.dumps(a1))
+        self.assertDataArrayIdentical(a1, a2)
+        self.assertFalse(a1._in_memory)
+        self.assertFalse(a2._in_memory)
+
+    def test_dataset_pickle(self):
+        ds1 = Dataset({'a': [1, 2]}).chunk()
+        self.assertFalse(ds1['a']._in_memory)
+        ds2 = pickle.loads(pickle.dumps(ds1))
+        self.assertDatasetIdentical(ds1, ds2)
+        self.assertFalse(ds1['a']._in_memory)
+        self.assertFalse(ds2['a']._in_memory)
+
+    def test_values(self):
+        # Test that invoking the values property does not convert the dask
+        # backend to numpy
+        a = DataArray([1, 2]).chunk()
+        self.assertFalse(a._in_memory)
+        self.assertEqual(a.values.tolist(), [1, 2])
+        self.assertFalse(a._in_memory)
+
+    def test_from_dask_variable(self):
+        # Test array creation from a Variable with a dask backend.
+        # This is used e.g. in broadcast()
+        a = DataArray(self.lazy_array.variable)
+        self.assertLazyAndIdentical(self.lazy_array, a)
+
diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py
index b9798dbd611..bac07d68d0f 100644
--- a/xarray/test/test_dataset.py
+++ b/xarray/test/test_dataset.py
@@ -1287,10 +1287,13 @@ def test_copy(self):
 
         for copied in [data.copy(deep=False), copy(data)]:
             self.assertDatasetIdentical(data, copied)
-            for k in data:
+            # Note: IndexVariable objects with string dtype are always
+            # copied because of xarray.core.utils.safe_cast_to_index.
+            # Limiting the test to data variables.
+            for k in data.data_vars:
                 v0 = data.variables[k]
                 v1 = copied.variables[k]
-                self.assertIs(v0, v1)
+                self.assertIs(source_ndarray(v0.data), source_ndarray(v1.data))
             copied['foo'] = ('z', np.arange(5))
             self.assertNotIn('foo', data)
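Finally, a sketch of the ``broadcast()`` fix exercised by
``test_from_dask_variable`` above (:issue:`978`): ``as_compatible_data`` now
unwraps a ``Variable`` into its underlying (possibly dask) array, so building
a ``DataArray`` from a dask-backed ``Variable``, as ``broadcast()`` does
internally, no longer forces a load to numpy. The laziness assertions here are
the editor's reading of the whats-new entry, not code from the patch:

    import xarray as xr

    a = xr.DataArray([1, 2]).chunk()
    b = xr.DataArray([[3], [4]]).chunk()

    a2, b2 = xr.broadcast(a, b)  # previously coerced the data to numpy
    assert not a2._in_memory     # the broadcast results stay dask-backed
    assert not b2._in_memory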