
Avoid computing dask variables on __repr__ and __getattr__ #1532


Merged · 16 commits · Sep 21, 2017
7 changes: 6 additions & 1 deletion doc/whats-new.rst
@@ -27,6 +27,11 @@ Breaking changes
(:issue:`727`).
By `Joe Hamman <https://github.com/jhamman>`_.

+ - ``repr`` and the Jupyter Notebook won't automatically compute dask variables.
+ Datasets loaded with ``open_dataset`` won't automatically read coords from
+ disk when calling ``repr`` (:issue:`1522`).
+ By `Guido Imperiale <https://github.com/crusaderky>`_.

Backward Incompatible Changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -153,7 +158,7 @@ Bug fixes
``rtol`` arguments when called on ``DataArray`` objects.
By `Stephan Hoyer <https://github.com/shoyer>`_.

- - Xarray ``quantile`` methods now properly raise a ``TypeError`` when applied to
+ - xarray ``quantile`` methods now properly raise a ``TypeError`` when applied to
objects with data stored as ``dask`` arrays (:issue:`1529`).
By `Joe Hamman <https://github.com/jhamman>`_.

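To illustrate the new behavior described in this changelog entry, here is a minimal sketch (the variable name and sizes are made up; output abridged): ``repr`` of a dask-backed Dataset now prints a compact chunk summary instead of computing values.

    import dask.array as da
    import xarray as xr

    # a dask-backed variable that would be expensive to compute
    ds = xr.Dataset({'a': (('x',), da.zeros(1000000, chunks=100000))})
    print(ds)  # no computation is triggered; roughly:
    # Data variables:
    #     a        (x) float64 dask.array<shape=(1000000,), chunksize=(100000,)>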
4 changes: 2 additions & 2 deletions xarray/core/dataarray.py
@@ -447,8 +447,8 @@ def _level_coords(self):
"""
level_coords = OrderedDict()
for cname, var in self._coords.items():
- if var.ndim == 1:
- level_names = var.to_index_variable().level_names
+ if var.ndim == 1 and isinstance(var, IndexVariable):
+ level_names = var.level_names
if level_names is not None:
dim, = var.dims
level_coords.update({lname: dim for lname in level_names})
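For context on why the isinstance check above matters: to_index_variable() materializes the underlying data, so calling it on every 1-D coordinate (as the old code did) would silently compute dask-backed coordinates. A minimal sketch, assuming a dask-backed coordinate:

    import dask.array as da
    import xarray as xr

    coord = xr.Variable(('x',), da.arange(5, chunks=2))
    isinstance(coord, xr.IndexVariable)  # False -> new code skips it cheaply
    coord.to_index_variable()            # old code path: computes the dask array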
16 changes: 8 additions & 8 deletions xarray/core/dataset.py
@@ -654,8 +654,8 @@ def _level_coords(self):
level_coords = OrderedDict()
for cname in self._coord_names:
var = self.variables[cname]
- if var.ndim == 1:
- level_names = var.to_index_variable().level_names
+ if var.ndim == 1 and isinstance(var, IndexVariable):
+ level_names = var.level_names
if level_names is not None:
dim, = var.dims
level_coords.update({lname: dim for lname in level_names})
@@ -1669,12 +1669,12 @@ def expand_dims(self, dim, axis=None):
for d in dim:
if d in self.dims:
raise ValueError(
- 'Dimension {dim} already exists.'.format(dim=d))
+ 'Dimension {dim} already exists.'.format(dim=d))
if (d in self._variables and
not utils.is_scalar(self._variables[d])):
raise ValueError(
- '{dim} already exists as coordinate or'
- ' variable name.'.format(dim=d))
+ '{dim} already exists as coordinate or'
+ ' variable name.'.format(dim=d))

if len(dim) != len(set(dim)):
raise ValueError('dims should not contain duplicate values.')
@@ -1691,7 +1691,7 @@ def expand_dims(self, dim, axis=None):
raise IndexError(
'Axis {a} is out of bounds of the expanded'
' dimension size {dim}.'.format(
- a=a, v=k, dim=result_ndim))
+ a=a, v=k, dim=result_ndim))

axis_pos = [a if a >= 0 else result_ndim + a
for a in axis]
@@ -3008,8 +3008,8 @@ def filter_by_attrs(self, **kwargs):
for var_name, variable in self.data_vars.items():
for attr_name, pattern in kwargs.items():
attr_value = variable.attrs.get(attr_name)
- if ((callable(pattern) and pattern(attr_value))
- or attr_value == pattern):
+ if ((callable(pattern) and pattern(attr_value)) or
+ attr_value == pattern):
selection.append(var_name)
return self[selection]

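The re-wrapped condition above preserves the two matching modes of filter_by_attrs: a keyword value may be a callable applied to the attribute value, or a literal compared against it. A short usage sketch (dataset contents are illustrative):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({
        'temp': xr.DataArray(np.zeros(3), dims='x',
                             attrs={'standard_name': 'air_temperature'}),
        'mask': xr.DataArray(np.zeros(3), dims='x'),
    })
    ds.filter_by_attrs(standard_name='air_temperature')        # literal match
    ds.filter_by_attrs(standard_name=lambda v: v is not None)  # callable match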
50 changes: 28 additions & 22 deletions xarray/core/formatting.py
@@ -196,8 +196,8 @@ def format_array_flat(items_ndarray, max_width):
return pprint_str


- def _summarize_var_or_coord(name, var, col_width, show_values=True,
- marker=' ', max_width=None):
+ def summarize_variable(name, var, col_width, show_values=True,
+ marker=' ', max_width=None):
if max_width is None:
max_width = OPTIONS['display_width']
first_col = pretty_print(u' %s %s ' % (marker, name), col_width)
@@ -208,6 +208,8 @@ def _summarize_var_or_coord(name, var, col_width, show_values=True,
front_str = u'%s%s%s ' % (first_col, dims_str, var.dtype)
if show_values:
values_str = format_array_flat(var, max_width - len(front_str))
+ elif isinstance(var.data, dask_array_type):
+ values_str = short_dask_repr(var, show_dtype=False)
else:
values_str = u'...'

@@ -222,38 +224,29 @@ def _summarize_coord_multiindex(coord, col_width, marker):
def _summarize_coord_levels(coord, col_width, marker=u'-'):
relevant_coord = coord[:30]
return u'\n'.join(
- [_summarize_var_or_coord(lname,
- relevant_coord.get_level_variable(lname),
- col_width, marker=marker)
+ [summarize_variable(lname,
+ relevant_coord.get_level_variable(lname),
+ col_width, marker=marker)
for lname in coord.level_names])


- def _not_remote(var):
- """Helper function to identify if array is positively identifiable as
- coming from a remote source.
- """
- source = var.encoding.get('source')
- if source and source.startswith('http') and not var._in_memory:
- return False
- return True


- def summarize_var(name, var, col_width):
- show_values = _not_remote(var)
- return _summarize_var_or_coord(name, var, col_width, show_values)
+ def summarize_datavar(name, var, col_width):
+ show_values = var._in_memory
Review thread on the new show_values = var._in_memory heuristic:

Member:

Our current heuristic uses the _not_remote() helper function, so it doesn't display arrays loaded over a network (via OPeNDAP), which can often be quite slow. But it does display a summary of values from netCDF files on disk, which I do think is generally helpful and for which I haven't noticed any performance issues.

Based on the current definition of _in_memory, we wouldn't display any of these arrays:

    @property
    def _in_memory(self):
        return (isinstance(self._data, (np.ndarray, PandasIndexAdapter)) or
                (isinstance(self._data, indexing.MemoryCachedArray) and
                 isinstance(self._data.array, np.ndarray)))

So instead of using _in_memory, I would suggest something like _not_remote(var) and not isinstance(var._data, dask_array_type) as the condition for showing values.

Contributor Author (@crusaderky, Sep 2, 2017):

@shoyer loading a NetCDF variable from disk every time you call __repr__ is a terrible idea if that variable has been compressed without chunking. If the variable is a single block of 100 MB of zlib-compressed data, you have to read it and decompress it every time.
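A minimal sketch of the scenario described in that comment (file name and array size are illustrative; writing with zlib compression assumes the netCDF4 backend is available):

    import numpy as np
    import xarray as xr

    # one large variable, zlib-compressed as a single unchunked block on disk
    ds = xr.Dataset({'big': (('x',), np.random.rand(10000000))})  # ~80 MB
    ds.to_netcdf('big.nc', encoding={'big': {'zlib': True}})

    reloaded = xr.open_dataset('big.nc')
    # before this PR, every repr that previews values re-reads and
    # re-decompresses the entire block:
    repr(reloaded['big'])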

Contributor Author (@crusaderky):

@shoyer also, your netCDF array might be sitting on a network file system on the opposite side of a narrowband VPN.

Member:

That's certainly possible, but in my experience very few people write 100 MB chunks -- those are very large.

Let's summarize our options:

  1. Always show a preview of data from netCDF files with Dataset.__repr__.
  2. Never show a preview for data if it isn't already in memory.
  3. Show a preview depending on a global option (with default choice TBD).

Reasons to show data from disk in __repr__:

  • It's what we've always done.
  • "Most" of the time it's fast and convenient.
  • It provides a good experience for new users, who don't need to hunt for a separate preview() or load() command to see what's in a Dataset. You can simply print it at a console.

Reasons not to show data from disk in __repr__:

  • IO can be slow/expensive, especially if compression or networks are involved.
  • Heuristics to detect expensive IO are unreliable and somewhat distasteful.

Maybe we should solicit a few more opinions here before we change the default behavior?

Another possibility is to try loading the data in a separate thread and timing out if it takes too long (say, more than 0.5 seconds), but that might open up its own set of performance issues (it's not easy to kill a thread, short of terminating a process).
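A rough sketch of that thread-with-timeout idea (not implemented in this PR; the helper name and the 0.5 s default are illustrative). Note the caveat above: the worker thread cannot be killed, so a slow read still runs to completion in the background.

    import concurrent.futures

    def preview_with_timeout(var, timeout=0.5):
        """Try to load var's values for repr; give up after `timeout` seconds."""
        pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        future = pool.submit(lambda: var.values)  # triggers the (possibly slow) read
        try:
            return future.result(timeout=timeout)
        except concurrent.futures.TimeoutError:
            return None  # caller would fall back to printing '...'
        finally:
            pool.shutdown(wait=False)  # don't block; the read finishes in background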

Member:

I think my vote would be to only print a preview of data that is in memory. For my uses, I typically have fill values in the first 10-20 data points, so the previous __repr__ didn't give me any information.

@pydata/xarray - anyone else have thoughts on this?

Member:

@shoyer - do we have results from your Google poll on this issue yet?

Member:

Sounds like I was wrong -- the consensus is pretty clear that we should go ahead with this.

[Four screenshots of poll results, dated 2017-09-20, omitted.]

Member:

I'm not sure this sample size is going to give us statistically significant results, but I'm glad to see @delgadom and I are in agreement.

@crusaderky - are you up for implementing this?

Member:

I think the current implementation (in this PR) is actually already correct.

Contributor Author (@crusaderky):

Yep - data is eagerly loaded from disk only for index coords on __init__ now.
+ return summarize_variable(name, var.variable, col_width, show_values)


def summarize_coord(name, var, col_width):
is_index = name in var.dims
- show_values = is_index or _not_remote(var)
+ show_values = var._in_memory
marker = u'*' if is_index else u' '
if is_index:
coord = var.variable.to_index_variable()
if coord.level_names is not None:
return u'\n'.join(
[_summarize_coord_multiindex(coord, col_width, marker),
_summarize_coord_levels(coord, col_width)])
- return _summarize_var_or_coord(name, var, col_width, show_values, marker)
+ return summarize_variable(
+ name, var.variable, col_width, show_values, marker)


def summarize_attr(key, value, col_width=None):
@@ -307,7 +300,7 @@ def _mapping_repr(mapping, title, summarizer, col_width=None):


data_vars_repr = functools.partial(_mapping_repr, title=u'Data variables',
- summarizer=summarize_var)
+ summarizer=summarize_datavar)


attrs_repr = functools.partial(_mapping_repr, title=u'Attributes',
@@ -370,6 +363,19 @@ def short_array_repr(array):
return repr(array)


+ def short_dask_repr(array, show_dtype=True):
+ """Similar to dask.array.DataArray.__repr__, but without
+ redundant information that's already printed by the repr
+ function of the xarray wrapper.
+ """
+ chunksize = tuple(c[0] for c in array.chunks)
+ if show_dtype:
+ return 'dask.array<shape=%s, dtype=%s, chunksize=%s>' % (
+ array.shape, array.dtype, chunksize)
+ else:
+ return 'dask.array<shape=%s, chunksize=%s>' % (array.shape, chunksize)


def array_repr(arr):
# used for DataArray, Variable and IndexVariable
if hasattr(arr, 'name') and arr.name is not None:
@@ -381,7 +387,7 @@ def array_repr(arr):
% (type(arr).__name__, name_str, dim_summary(arr))]

if isinstance(getattr(arr, 'variable', arr)._data, dask_array_type):
- summary.append(repr(arr.data))
+ summary.append(short_dask_repr(arr))
elif arr._in_memory or arr.size < 1e5:
summary.append(short_array_repr(arr.values))
else:
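For reference, a standalone sketch of the output format short_dask_repr produces, using the same chunks[i][0] convention to report a per-dimension chunk size:

    import dask.array as da

    x = da.zeros((100, 100), chunks=(50, 50))
    chunksize = tuple(c[0] for c in x.chunks)
    print('dask.array<shape=%s, dtype=%s, chunksize=%s>'
          % (x.shape, x.dtype, chunksize))
    # dask.array<shape=(100, 100), dtype=float64, chunksize=(50, 50)>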
4 changes: 2 additions & 2 deletions xarray/core/merge.py
@@ -113,7 +113,7 @@ def merge_variables(
list_of_variables_dicts, # type: List[Mapping[Any, Variable]]
priority_vars=None, # type: Optional[Mapping[Any, Variable]]
compat='minimal', # type: str
- ):
+ ):
# type: (...) -> OrderedDict[Any, Variable]
"""Merge dicts of variables, while resolving conflicts appropriately.

@@ -180,7 +180,7 @@ def expand_variable_dicts(list_of_variable_dicts):
Parameters
----------
list_of_variable_dicts : list of dict or Dataset objects
- The each value for the mappings must be of the following types:
+ Each value for the mappings must be of the following types:
- an xarray.Variable
- a tuple `(dims, data[, attrs[, encoding]])` that can be converted in
an xarray.Variable
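The tuple form mentioned in the corrected docstring is the same shorthand accepted throughout the public API; a small sketch (names and values are illustrative):

    import numpy as np
    import xarray as xr

    # (dims, data[, attrs[, encoding]]) is converted into an xarray.Variable
    ds = xr.Dataset({'b': (('x',), np.zeros(3), {'units': 'm'})})
    print(ds['b'].attrs)  # {'units': 'm'}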
4 changes: 3 additions & 1 deletion xarray/core/variable.py
@@ -284,7 +284,7 @@ def nbytes(self):

@property
def _in_memory(self):
- return (isinstance(self._data, (np.ndarray, PandasIndexAdapter)) or
+ return (isinstance(self._data, (np.ndarray, np.number, PandasIndexAdapter)) or
(isinstance(self._data, indexing.MemoryCachedArray) and
isinstance(self._data.array, np.ndarray)))

@@ -1210,6 +1210,7 @@ def func(self, other):
return self
return func


ops.inject_all_ops_and_reduce_methods(Variable)


@@ -1374,6 +1375,7 @@ def name(self):
def name(self, value):
raise AttributeError('cannot modify name of IndexVariable in-place')


# for backwards compatibility
Coordinate = utils.alias(IndexVariable, 'Coordinate')
