Skip to content

WIP: explicit indexes #2195

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions xarray/core/coordinates.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections import Mapping
from contextlib import contextmanager

import numpy as np
import pandas as pd

from . import formatting, indexing
Expand Down
31 changes: 18 additions & 13 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@
from .utils import (
_check_inplace, decode_numpy_dict_values, either_dict_or_kwargs,
ensure_us_time_resolution)
from .merge import expand_variable_dicts, merge_variables
from .variable import (
IndexVariable, Variable, as_compatible_data, as_variable,
assert_unique_multiindex_level_names)
assert_unique_multiindex_level_names, maybe_expand_multiindex)


def _infer_coords_and_dims(shape, coords, dims):
Expand Down Expand Up @@ -58,19 +59,24 @@ def _infer_coords_and_dims(shape, coords, dims):
if not isinstance(d, basestring):
raise TypeError('dimension %s is not a string' % d)

new_coords = OrderedDict()

if utils.is_dict_like(coords):
for k, v in coords.items():
new_coords[k] = as_variable(v, name=k)
elif coords is not None:
if coords is None:
coords = OrderedDict()
elif not utils.is_dict_like(coords):
# Convert list-like coords into a dict
coords_dict = OrderedDict()
for dim, coord in zip(dims, coords):
var = as_variable(coord, name=dim)
var.dims = (dim,)
new_coords[dim] = var
coords_dict[dim] = var
coords = coords_dict

# Combine coordinates, including MultiIndex levels
expanded = expand_variable_dicts([coords])
coords = merge_variables(expanded, compat='equals')

# Check consistent
sizes = dict(zip(dims, shape))
for k, v in new_coords.items():
for k, v in coords.items():
if any(d not in dims for d in v.dims):
raise ValueError('coordinate %s has dimensions %s, but these '
'are not a subset of the DataArray '
Expand All @@ -88,9 +94,9 @@ def _infer_coords_and_dims(shape, coords, dims):
'matching the dimension size'
% (k, v.shape, (sizes[k],)))

assert_unique_multiindex_level_names(new_coords)
# assert_unique_multiindex_level_names(coords)

return new_coords, dims
return coords, dims


class _LocIndexer(object):
Expand Down Expand Up @@ -462,8 +468,7 @@ def _getitem_coord(self, key):
var = self._coords[key]
except KeyError:
dim_sizes = dict(zip(self.dims, self.shape))
_, key, var = _get_virtual_variable(
self._coords, key, self._level_coords, dim_sizes)
_, key, var = _get_virtual_variable(self._coords, key, dim_sizes)

return self._replace_maybe_drop_dims(var, name=key)

Expand Down
31 changes: 5 additions & 26 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,10 @@
'quarter']


def _get_virtual_variable(variables, key, level_vars=None, dim_sizes=None):
"""Get a virtual variable (e.g., 'time.year' or a MultiIndex level)
def _get_virtual_variable(variables, key, dim_sizes=None):
"""Get a virtual variable (e.g., 'time.year')
from a dict of xarray.Variable objects (if possible)
"""
if level_vars is None:
level_vars = {}
if dim_sizes is None:
dim_sizes = {}

Expand All @@ -69,11 +67,7 @@ def _get_virtual_variable(variables, key, level_vars=None, dim_sizes=None):
else:
raise KeyError(key)

if ref_name in level_vars:
dim_var = variables[level_vars[ref_name]]
ref_var = dim_var.to_index_variable().get_level_variable(ref_name)
else:
ref_var = variables[ref_name]
ref_var = variables[ref_name]

if var_name is None:
virtual_var = ref_var
Expand Down Expand Up @@ -843,21 +837,6 @@ def _subset_with_all_valid_coords(self, variables, coord_names, attrs):

return self._construct_direct(variables, coord_names, dims, attrs)

@property
def _level_coords(self):
    """Map every MultiIndex level name to the dimension it indexes.

    Only coordinates that are one-dimensional ``IndexVariable`` objects
    reporting non-None ``level_names`` contribute entries; each of their
    level names is mapped to that coordinate's single dimension.
    """
    mapping = OrderedDict()
    for coord_name in self._coord_names:
        variable = self.variables[coord_name]
        # Skip anything that cannot carry a MultiIndex.
        if variable.ndim != 1 or not isinstance(variable, IndexVariable):
            continue
        names = variable.level_names
        if names is None:
            continue
        (dim,) = variable.dims
        mapping.update(dict.fromkeys(names, dim))
    return mapping

def _copy_listed(self, names):
"""Create a new Dataset with the listed variables from this dataset and
the all relevant coordinates. Skips all validation.
Expand All @@ -870,7 +849,7 @@ def _copy_listed(self, names):
variables[name] = self._variables[name]
except KeyError:
ref_name, var_name, var = _get_virtual_variable(
self._variables, name, self._level_coords, self.dims)
self._variables, name, self.dims)
variables[var_name] = var
if ref_name in self._coord_names or ref_name in self.dims:
coord_names.add(var_name)
Expand All @@ -887,7 +866,7 @@ def _construct_dataarray(self, name):
variable = self._variables[name]
except KeyError:
_, name, variable = _get_virtual_variable(
self._variables, name, self._level_coords, self.dims)
self._variables, name, self.dims)

coords = OrderedDict()
needed_dims = set(variable.dims)
Expand Down
8 changes: 4 additions & 4 deletions xarray/core/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,9 @@ def summarize_variable(name, var, col_width, show_values=True,

def _summarize_coord_multiindex(coord, col_width, marker):
first_col = pretty_print(u' %s %s ' % (marker, coord.name), col_width)
return u'%s(%s) MultiIndex' % (first_col, unicode_type(coord.dims[0]))
level_names_str = ', '.join(map(str, coord.level_names))
return (u'%s(%s) MultiIndex[%s]' %
(first_col, unicode_type(coord.dims[0]), level_names_str))


def _summarize_coord_levels(coord, col_width, marker=u'-'):
Expand All @@ -277,9 +279,7 @@ def summarize_coord(name, var, col_width):
if is_index:
coord = var.variable.to_index_variable()
if coord.level_names is not None:
return u'\n'.join(
[_summarize_coord_multiindex(coord, col_width, marker),
_summarize_coord_levels(coord, col_width)])
return _summarize_coord_multiindex(coord, col_width, marker)
return summarize_variable(
name, var.variable, col_width, show_values, marker)

Expand Down
73 changes: 73 additions & 0 deletions xarray/core/indexes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from __future__ import absolute_import, division, print_function

import numpy as np
import pandas as pd


def normalize_indexes(coords, sizes, indexes=None):
    """Normalize indexes for Dataset/DataArray.

    Copies any explicitly supplied indexes and then fills in a default index
    for every dimension in ``sizes`` that does not have one yet: the
    coordinate of the same name converted with ``to_index()`` when such a
    coordinate exists, otherwise an integer range index named after the
    dimension.

    NOTE: explicitly supplied indexes are passed through unchanged; they are
    not (yet) validated to be pd.Index instances.

    Eventually: consider combining indexes along the same dimension into a
    MultiIndex.

    Parameters
    ----------
    coords : Mapping[Any, xarray.Variable]
        Coordinate variables from which to draw default indexes.
    sizes : Mapping[Any, int]
        Integer sizes for each Dataset/DataArray dimension.
    indexes : Optional[Mapping[Any, pandas.Index]]
        Explicitly supplied indexes, if any. The mapping is copied, never
        modified in place.

    Returns
    -------
    Dict[Any, pandas.Index] mapping indexing keys (levels/dimension names)
    to indexes used for indexing along that dimension.
    """
    indexes = {} if indexes is None else dict(indexes)

    # default indexes
    for dim in sizes:
        if dim in indexes:
            # an explicit index always wins over a default one
            continue
        if dim in coords:
            indexes[dim] = coords[dim].to_index()
        else:
            # need to ensure dtype=int64 in case range is empty on Python 2
            indexes[dim] = pd.Index(
                range(sizes[dim]), name=dim, dtype=np.int64)

    return indexes


def result_indexes(input_indexes, output_coords):
    """Combine indexes from inputs into indexes for an operation result.

    Drops indexes corresponding to dropped coordinates. When several input
    mappings provide an index under the same key, the one from the later
    mapping is kept.

    IMPORTANT: Assumes outputs are already aligned!

    Parameters
    ----------
    input_indexes : Sequence[Mapping[Any, pandas.Index]]
        Sequence of mappings of indexes to combine.
    output_coords : Sequence[Mapping[Any, xarray.Variable]]
        Sequence of mappings of output coordinates; one mapping of indexes
        is produced for each of them.

    Returns
    -------
    List[Dict[Any, pandas.Index]] mapping variable names to indexes,
    for each requested mapping of output coordinates.
    """
    output_indexes = []
    for wanted in output_coords:
        kept = {}
        for available in input_indexes:
            # only indexes whose key survives as an output coordinate
            kept.update(
                (k, v) for k, v in available.items() if k in wanted)
        output_indexes.append(kept)
    return output_indexes
21 changes: 14 additions & 7 deletions xarray/core/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from .alignment import deep_align
from .pycompat import OrderedDict, basestring
from .utils import Frozen
from .variable import as_variable, assert_unique_multiindex_level_names
from .variable import (
as_variable, assert_unique_multiindex_level_names, maybe_expand_multiindex)

PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)

Expand Down Expand Up @@ -197,11 +198,10 @@ def expand_variable_dicts(list_of_variable_dicts):

for variables in list_of_variable_dicts:
if isinstance(variables, Dataset):
sanitized_vars = variables.variables
var_dicts.append(variables.variables)
else:
# append coords to var_dicts before appending sanitized_vars,
# because we want coords to appear first
sanitized_vars = OrderedDict()
var_dicts.append(sanitized_vars)

for name, var in variables.items():
if isinstance(var, DataArray):
Expand All @@ -211,10 +211,13 @@ def expand_variable_dicts(list_of_variable_dicts):
coords.pop(name, None)
var_dicts.append(coords)

multiindex_vars = maybe_expand_multiindex(var, name)
if multiindex_vars is not None:
var_dicts.append(multiindex_vars)

var = as_variable(var, name=name)
sanitized_vars[name] = var

var_dicts.append(sanitized_vars)

return var_dicts

Expand Down Expand Up @@ -253,6 +256,10 @@ def determine_coords(list_of_variable_dicts):
coords.discard(name)
coord_names.update(coords)

multiindex_vars = maybe_expand_multiindex(var, name)
if multiindex_vars is not None:
coord_names.update(multiindex_vars)

return coord_names, noncoord_names


Expand Down Expand Up @@ -296,7 +303,7 @@ def merge_coords_for_inplace_math(objs, priority_vars=None):
"""
expanded = expand_variable_dicts(objs)
variables = merge_variables(expanded, priority_vars)
assert_unique_multiindex_level_names(variables)
# assert_unique_multiindex_level_names(variables)
return variables


Expand Down Expand Up @@ -443,7 +450,7 @@ def merge_core(objs,

priority_vars = _get_priority_vars(aligned, priority_arg, compat=compat)
variables = merge_variables(expanded, priority_vars, compat=compat)
assert_unique_multiindex_level_names(variables)
# assert_unique_multiindex_level_names(variables)

dims = calculate_dimensions(variables)

Expand Down
2 changes: 1 addition & 1 deletion xarray/core/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
FILE_CACHE_MAXSIZE: 128,
CMAP_SEQUENTIAL: 'viridis',
CMAP_DIVERGENT: 'RdBu_r',
KEEP_ATTRS: 'default'
KEEP_ATTRS: 'default',
}

_JOIN_OPTIONS = frozenset(['inner', 'outer', 'left', 'right', 'exact'])
Expand Down
57 changes: 57 additions & 0 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,63 @@ def as_variable(obj, name=None):
return obj


def maybe_expand_multiindex(obj, name):
    """Extract the levels of a MultiIndex as separate variables, if any.

    Parameters
    ----------
    obj : object
        Object that would be converted into a variable, like the obj
        argument to as_variable(). Only a pandas.MultiIndex, or a tuple whose
        second element is a pandas.MultiIndex (i.e. ``(dims, index, ...)``),
        is expanded; tuple elements beyond the first two are ignored here.
    name : any
        Name of this object, when used as a key in a dictionary. This is used
        as the dimension name when obj is a bare MultiIndex.

    Returns
    -------
    OrderedDict of Variable objects keyed by level name (in level order) if
    the input data is a MultiIndex, otherwise None.

    Raises
    ------
    ValueError
        If any MultiIndex level name is None, or if level names are not
        unique.

    Examples
    --------
    >>> maybe_expand_multiindex([1, 2, 3], name='x') is None
    True

    >>> maybe_expand_multiindex(('y', [1, 2, 3]), name='x') is None
    True

    >>> idx = pd.MultiIndex.from_tuples([('a', 1), ('b', 2)], names=['y', 'z'])
    >>> list(maybe_expand_multiindex(idx, name='x'))
    ['y', 'z']
    """
    if isinstance(obj, pd.MultiIndex):
        dims, index = (name,), obj
    elif (isinstance(obj, tuple) and len(obj) > 1 and
            isinstance(obj[1], pd.MultiIndex)):
        dims, index = obj[:2]
    else:
        # nothing to expand
        return None

    if any(level_name is None for level_name in index.names):
        raise ValueError(
            'cannot convert a MultiIndex with unknown level names {} into '
            'xarray variables: {}'.format(index.names, index))
    if len(set(index.names)) != len(index.names):
        raise ValueError(
            'cannot convert a MultiIndex with non-unique level names {} '
            'into xarray variables: {}'.format(index.names, index))

    multiindex_vars = OrderedDict()
    for level_name in index.names:
        multiindex_vars[level_name] = Variable(
            dims, index.get_level_values(level_name))
    return multiindex_vars


def _maybe_wrap_data(data):
"""
Put pandas.Index and numpy.ndarray arguments in adapter objects to ensure
Expand Down
6 changes: 3 additions & 3 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ def test_repr_multiindex(self):
<xarray.DataArray (x: 4)>
array([0, 1, 2, 3])
Coordinates:
* x (x) MultiIndex
- level_1 (x) object 'a' 'a' 'b' 'b'
- level_2 (x) int64 1 2 1 2""")
* x (x) MultiIndex[level_1, level_2]
level_1 (x) object 'a' 'a' 'b' 'b'
level_2 (x) int64 1 2 1 2""")
assert expected == repr(self.mda)

def test_properties(self):
Expand Down
Loading