diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 99986e0beb8..c69719cf32d 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -27,6 +27,11 @@ Breaking changes
   (:issue:`727`).
   By `Joe Hamman <https://github.com/jhamman>`_.
 
+- ``repr`` and the Jupyter Notebook won't automatically compute dask variables.
+  Datasets loaded with ``open_dataset`` won't automatically read coords from
+  disk when calling ``repr`` (:issue:`1522`).
+  By `Guido Imperiale <https://github.com/crusaderky>`_.
+
 Backward Incompatible Changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -153,7 +158,7 @@ Bug fixes
   ``rtol`` arguments when called on ``DataArray`` objects.
   By `Stephan Hoyer <https://github.com/shoyer>`_.
 
-- Xarray ``quantile`` methods now properly raise a ``TypeError`` when applied to
+- xarray ``quantile`` methods now properly raise a ``TypeError`` when applied to
   objects with data stored as ``dask`` arrays (:issue:`1529`).
   By `Joe Hamman <https://github.com/jhamman>`_.
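The changelog entry above can be illustrated with a minimal sketch (the shape,
chunks and variable names here are illustrative, not part of the patch):
printing a dask-backed object now summarises the graph instead of executing it.

    import dask.array as da
    import xarray as xr

    a = xr.DataArray(da.ones((1000, 1000), chunks=(100, 100)),
                     dims=['x', 'y'])
    print(repr(a))       # shows "dask.array<shape=..., chunksize=...>"
    print(a._in_memory)  # False: repr did not trigger a compute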
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index dd8805d2e96..64b0e87710a 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -447,8 +447,8 @@ def _level_coords(self):
         """
         level_coords = OrderedDict()
         for cname, var in self._coords.items():
-            if var.ndim == 1:
-                level_names = var.to_index_variable().level_names
+            if var.ndim == 1 and isinstance(var, IndexVariable):
+                level_names = var.level_names
                 if level_names is not None:
                     dim, = var.dims
                     level_coords.update({lname: dim for lname in level_names})
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 37021f865d9..f2c176c9d3e 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -654,8 +654,8 @@ def _level_coords(self):
         level_coords = OrderedDict()
         for cname in self._coord_names:
             var = self.variables[cname]
-            if var.ndim == 1:
-                level_names = var.to_index_variable().level_names
+            if var.ndim == 1 and isinstance(var, IndexVariable):
+                level_names = var.level_names
                 if level_names is not None:
                     dim, = var.dims
                     level_coords.update({lname: dim for lname in level_names})
@@ -1669,12 +1669,12 @@ def expand_dims(self, dim, axis=None):
         for d in dim:
             if d in self.dims:
                 raise ValueError(
-                        'Dimension {dim} already exists.'.format(dim=d))
+                    'Dimension {dim} already exists.'.format(dim=d))
             if (d in self._variables and
                     not utils.is_scalar(self._variables[d])):
                 raise ValueError(
-                        '{dim} already exists as coordinate or'
-                        ' variable name.'.format(dim=d))
+                    '{dim} already exists as coordinate or'
+                    ' variable name.'.format(dim=d))
 
         if len(dim) != len(set(dim)):
             raise ValueError('dims should not contain duplicate values.')
@@ -1691,7 +1691,7 @@ def expand_dims(self, dim, axis=None):
                 raise IndexError(
                     'Axis {a} is out of bounds of the expanded'
                     ' dimension size {dim}.'.format(
-                            a=a, v=k, dim=result_ndim))
+                        a=a, v=k, dim=result_ndim))
 
         axis_pos = [a if a >= 0 else result_ndim + a
                     for a in axis]
@@ -3008,8 +3008,8 @@ def filter_by_attrs(self, **kwargs):
         for var_name, variable in self.data_vars.items():
             for attr_name, pattern in kwargs.items():
                 attr_value = variable.attrs.get(attr_name)
-                if ((callable(pattern) and pattern(attr_value))
-                        or attr_value == pattern):
+                if ((callable(pattern) and pattern(attr_value)) or
+                        attr_value == pattern):
                     selection.append(var_name)
         return self[selection]
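With the ``_level_coords`` changes above, MultiIndex level names are read
directly off ``IndexVariable`` coordinates, so a 1-dimensional non-index
coordinate (for instance one backed by dask) is never converted with
``to_index_variable()``, which could trigger a compute. A minimal sketch of
the level lookup that keeps working (names are illustrative):

    import pandas as pd
    import xarray as xr

    midx = pd.MultiIndex.from_product([['a', 'b'], [0, 1]],
                                      names=['letter', 'number'])
    ds = xr.Dataset(coords={'x': midx})
    print(ds['letter'])  # level coordinates still resolve to dimension 'x'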
+ """ + chunksize = tuple(c[0] for c in array.chunks) + if show_dtype: + return 'dask.array' % ( + array.shape, array.dtype, chunksize) + else: + return 'dask.array' % (array.shape, chunksize) + + def array_repr(arr): # used for DataArray, Variable and IndexVariable if hasattr(arr, 'name') and arr.name is not None: @@ -381,7 +387,7 @@ def array_repr(arr): % (type(arr).__name__, name_str, dim_summary(arr))] if isinstance(getattr(arr, 'variable', arr)._data, dask_array_type): - summary.append(repr(arr.data)) + summary.append(short_dask_repr(arr)) elif arr._in_memory or arr.size < 1e5: summary.append(short_array_repr(arr.values)) else: diff --git a/xarray/core/merge.py b/xarray/core/merge.py index f32c7bd82a4..be11be2209b 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -113,7 +113,7 @@ def merge_variables( list_of_variables_dicts, # type: List[Mapping[Any, Variable]] priority_vars=None, # type: Optional[Mapping[Any, Variable]] compat='minimal', # type: str - ): +): # type: (...) -> OrderedDict[Any, Variable] """Merge dicts of variables, while resolving conflicts appropriately. @@ -180,7 +180,7 @@ def expand_variable_dicts(list_of_variable_dicts): Parameters ---------- list_of_variable_dicts : list of dict or Dataset objects - The each value for the mappings must be of the following types: + Each value for the mappings must be of the following types: - an xarray.Variable - a tuple `(dims, data[, attrs[, encoding]])` that can be converted in an xarray.Variable diff --git a/xarray/core/variable.py b/xarray/core/variable.py index dc8f3b39d2d..ae4cb08d6a4 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -284,7 +284,7 @@ def nbytes(self): @property def _in_memory(self): - return (isinstance(self._data, (np.ndarray, PandasIndexAdapter)) or + return (isinstance(self._data, (np.ndarray, np.number, PandasIndexAdapter)) or (isinstance(self._data, indexing.MemoryCachedArray) and isinstance(self._data.array, np.ndarray))) @@ -1210,6 +1210,7 @@ def func(self, other): return self return func + ops.inject_all_ops_and_reduce_methods(Variable) @@ -1374,6 +1375,7 @@ def name(self): def name(self, value): raise AttributeError('cannot modify name of IndexVariable in-place') + # for backwards compatibility Coordinate = utils.alias(IndexVariable, 'Coordinate') diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 7fe684528fc..7e2bf868b31 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1,7 +1,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function + import pickle +from textwrap import dedent import numpy as np import pandas as pd import pytest @@ -131,6 +133,25 @@ def test_binary_op(self): self.assertLazyAndIdentical(u + u, v + v) self.assertLazyAndIdentical(u[0] + u, v[0] + v) + def test_repr(self): + expected = dedent("""\ + + dask.array""") + self.assertEqual(expected, repr(self.lazy_var)) + + def test_pickle(self): + # Test that pickling/unpickling does not convert the dask + # backend to numpy + a1 = Variable(['x'], build_dask_array('x')) + a1.compute() + self.assertFalse(a1._in_memory) + self.assertEquals(kernel_call_count, 1) + a2 = pickle.loads(pickle.dumps(a1)) + self.assertEquals(kernel_call_count, 1) + self.assertVariableIdentical(a1, a2) + self.assertFalse(a1._in_memory) + self.assertFalse(a2._in_memory) + def test_reduce(self): u = self.eager_var v = self.lazy_var @@ -341,47 +362,103 @@ def test_dot(self): lazy = self.lazy_array.dot(self.lazy_array[0]) 
diff --git a/xarray/core/merge.py b/xarray/core/merge.py
index f32c7bd82a4..be11be2209b 100644
--- a/xarray/core/merge.py
+++ b/xarray/core/merge.py
@@ -113,7 +113,7 @@ def merge_variables(
         list_of_variables_dicts,  # type: List[Mapping[Any, Variable]]
         priority_vars=None,  # type: Optional[Mapping[Any, Variable]]
         compat='minimal',  # type: str
-    ):
+):
     # type: (...) -> OrderedDict[Any, Variable]
     """Merge dicts of variables, while resolving conflicts appropriately.
@@ -180,7 +180,7 @@ def expand_variable_dicts(list_of_variable_dicts):
     Parameters
     ----------
     list_of_variable_dicts : list of dict or Dataset objects
-        The each value for the mappings must be of the following types:
+        Each value in the mappings must be one of the following types:
         - an xarray.Variable
         - a tuple `(dims, data[, attrs[, encoding]])` that can be converted
           in an xarray.Variable
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index dc8f3b39d2d..ae4cb08d6a4 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -284,7 +284,7 @@ def nbytes(self):
 
     @property
     def _in_memory(self):
-        return (isinstance(self._data, (np.ndarray, PandasIndexAdapter)) or
+        return (isinstance(self._data, (np.ndarray, np.number, PandasIndexAdapter)) or
                 (isinstance(self._data, indexing.MemoryCachedArray) and
                  isinstance(self._data.array, np.ndarray)))
 
@@ -1210,6 +1210,7 @@ def func(self, other):
             return self
         return func
 
+
 ops.inject_all_ops_and_reduce_methods(Variable)
@@ -1374,6 +1375,7 @@ def name(self):
     def name(self, value):
         raise AttributeError('cannot modify name of IndexVariable in-place')
 
+
 # for backwards compatibility
 Coordinate = utils.alias(IndexVariable, 'Coordinate')
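The widened ``_in_memory`` check above (which now also accepts ``np.number``
scalars) is what the new summarizers key off when deciding whether to print
values; a small sketch of the property's behaviour (array values are
illustrative):

    import dask.array as da
    import numpy as np
    import xarray as xr

    v_np = xr.Variable(('x',), np.arange(3))
    v_dk = xr.Variable(('x',), da.arange(3, chunks=1))
    print(v_np._in_memory)  # True: backed by a numpy ndarray
    print(v_dk._in_memory)  # False: dask-backed and not yet computed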
diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
index 7fe684528fc..7e2bf868b31 100644
--- a/xarray/tests/test_dask.py
+++ b/xarray/tests/test_dask.py
@@ -1,7 +1,9 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
 import pickle
+from textwrap import dedent
 import numpy as np
 import pandas as pd
 import pytest
@@ -131,6 +133,25 @@ def test_binary_op(self):
         self.assertLazyAndIdentical(u + u, v + v)
         self.assertLazyAndIdentical(u[0] + u, v[0] + v)
 
+    def test_repr(self):
+        expected = dedent("""\
+        <xarray.Variable (x: 4, y: 6)>
+        dask.array<shape=(4, 6), dtype=float64, chunksize=(2, 2)>""")
+        self.assertEqual(expected, repr(self.lazy_var))
+
+    def test_pickle(self):
+        # Test that pickling/unpickling does not convert the dask
+        # backend to numpy
+        a1 = Variable(['x'], build_dask_array('x'))
+        a1.compute()
+        self.assertFalse(a1._in_memory)
+        self.assertEquals(kernel_call_count, 1)
+        a2 = pickle.loads(pickle.dumps(a1))
+        self.assertEquals(kernel_call_count, 1)
+        self.assertVariableIdentical(a1, a2)
+        self.assertFalse(a1._in_memory)
+        self.assertFalse(a2._in_memory)
+
     def test_reduce(self):
         u = self.eager_var
         v = self.lazy_var
@@ -341,47 +362,103 @@ def test_dot(self):
         lazy = self.lazy_array.dot(self.lazy_array[0])
         self.assertLazyAndAllClose(eager, lazy)
 
-    def test_variable_pickle(self):
-        # Test that pickling/unpickling does not convert the dask
-        # backend to numpy
-        a1 = Variable(['x'], build_dask_array())
-        a1.compute()
-        self.assertFalse(a1._in_memory)
-        self.assertEquals(kernel_call_count, 1)
-        a2 = pickle.loads(pickle.dumps(a1))
-        self.assertEquals(kernel_call_count, 1)
-        self.assertVariableIdentical(a1, a2)
-        self.assertFalse(a1._in_memory)
-        self.assertFalse(a2._in_memory)
+    def test_dataarray_repr(self):
+        # Test that __repr__ does not convert the dask backend to numpy
+        # for either the data variable or the non-index coords
+        data = build_dask_array('data')
+        nonindex_coord = build_dask_array('coord')
+        a = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
+        expected = dedent("""\
+        <xarray.DataArray 'data' (x: 1)>
+        dask.array<shape=(1,), dtype=int64, chunksize=(1,)>
+        Coordinates:
+            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
+        Dimensions without coordinates: x""")
+        self.assertEqual(expected, repr(a))
+        self.assertEquals(kernel_call_count, 0)
+
+    def test_dataset_repr(self):
+        # Test that __repr__ does not convert the dask backend to numpy
+        # for either the data variables or the non-index coords
+        data = build_dask_array('data')
+        nonindex_coord = build_dask_array('coord')
+        ds = Dataset(data_vars={'a': ('x', data)},
+                     coords={'y': ('x', nonindex_coord)})
+        expected = dedent("""\
+        <xarray.Dataset>
+        Dimensions:  (x: 1)
+        Coordinates:
+            y        (x) int64 dask.array<shape=(1,), chunksize=(1,)>
+        Dimensions without coordinates: x
+        Data variables:
+            a        (x) int64 dask.array<shape=(1,), chunksize=(1,)>""")
+        self.assertEqual(expected, repr(ds))
+        self.assertEquals(kernel_call_count, 0)
 
     def test_dataarray_pickle(self):
-        # Test that pickling/unpickling does not convert the dask
-        # backend to numpy
-        a1 = DataArray(build_dask_array())
+        # Test that pickling/unpickling does not convert the dask backend
+        # to numpy for either the data variable or the non-index coords
+        data = build_dask_array('data')
+        nonindex_coord = build_dask_array('coord')
+        a1 = DataArray(data, dims=['x'], coords={'y': ('x', nonindex_coord)})
         a1.compute()
         self.assertFalse(a1._in_memory)
-        self.assertEquals(kernel_call_count, 1)
+        self.assertFalse(a1.coords['y']._in_memory)
+        self.assertEquals(kernel_call_count, 2)
         a2 = pickle.loads(pickle.dumps(a1))
-        self.assertEquals(kernel_call_count, 1)
+        self.assertEquals(kernel_call_count, 2)
         self.assertDataArrayIdentical(a1, a2)
         self.assertFalse(a1._in_memory)
         self.assertFalse(a2._in_memory)
+        self.assertFalse(a1.coords['y']._in_memory)
+        self.assertFalse(a2.coords['y']._in_memory)
 
     def test_dataset_pickle(self):
-        ds1 = Dataset({'a': DataArray(build_dask_array())})
+        # Test that pickling/unpickling does not convert the dask backend
+        # to numpy for either the data variables or the non-index coords
+        data = build_dask_array('data')
+        nonindex_coord = build_dask_array('coord')
+        ds1 = Dataset(data_vars={'a': ('x', data)},
+                      coords={'y': ('x', nonindex_coord)})
         ds1.compute()
         self.assertFalse(ds1['a']._in_memory)
-        self.assertEquals(kernel_call_count, 1)
+        self.assertFalse(ds1['y']._in_memory)
+        self.assertEquals(kernel_call_count, 2)
         ds2 = pickle.loads(pickle.dumps(ds1))
-        self.assertEquals(kernel_call_count, 1)
+        self.assertEquals(kernel_call_count, 2)
         self.assertDatasetIdentical(ds1, ds2)
         self.assertFalse(ds1['a']._in_memory)
         self.assertFalse(ds2['a']._in_memory)
+        self.assertFalse(ds1['y']._in_memory)
+        self.assertFalse(ds2['y']._in_memory)
+
+    def test_dataarray_getattr(self):
+        # ipython/jupyter does a long list of getattr() calls when trying
+        # to represent an object.
+        # Make sure we're not accidentally computing dask variables.
+        data = build_dask_array('data')
+        nonindex_coord = build_dask_array('coord')
+        a = DataArray(data, dims=['x'],
+                      coords={'y': ('x', nonindex_coord)})
+        with suppress(AttributeError):
+            getattr(a, 'NOTEXIST')
+        self.assertEquals(kernel_call_count, 0)
+
+    def test_dataset_getattr(self):
+        # Test that getattr() does not accidentally compute the dask backend
+        # of either the data variables or the non-index coords
+        data = build_dask_array('data')
+        nonindex_coord = build_dask_array('coord')
+        ds = Dataset(data_vars={'a': ('x', data)},
+                     coords={'y': ('x', nonindex_coord)})
+        with suppress(AttributeError):
+            getattr(ds, 'NOTEXIST')
+        self.assertEquals(kernel_call_count, 0)
 
     def test_values(self):
         # Test that invoking the values property does not convert the dask
         # backend to numpy
-        a = DataArray([1,2]).chunk()
+        a = DataArray([1, 2]).chunk()
         self.assertFalse(a._in_memory)
         self.assertEquals(a.values.tolist(), [1, 2])
         self.assertFalse(a._in_memory)
@@ -436,18 +513,20 @@ def test_dask_kwargs_dataset(method):
 
 kernel_call_count = 0
+
+
 def kernel():
-    """Dask kernel to test pickling/unpickling.
+    """Dask kernel to test pickling/unpickling and __repr__.
     Must be global to make it pickleable.
     """
     global kernel_call_count
     kernel_call_count += 1
-    return np.ones(1)
+    return np.ones(1, dtype=np.int64)
 
 
-def build_dask_array():
+def build_dask_array(name):
     global kernel_call_count
     kernel_call_count = 0
     return dask.array.Array(
-        dask={('foo', 0): (kernel, )}, name='foo',
-        chunks=((1,),), dtype=int)
+        dask={(name, 0): (kernel, )}, name=name,
+        chunks=((1,),), dtype=np.int64)