Skip to content

Internal refactor of XArray, with a new CoordXArray subtype #54

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Mar 11, 2014
2 changes: 1 addition & 1 deletion src/xray/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .xarray import XArray, broadcast_xarrays
from .xarray import as_xarray, XArray, CoordXArray, broadcast_xarrays
from .dataset import Dataset, open_dataset
from .dataset_array import DatasetArray, align
from .utils import (orthogonal_indexer, decode_cf_datetime, encode_cf_datetime,
Expand Down
19 changes: 0 additions & 19 deletions src/xray/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,6 @@ def func(self, dimension=cls._reduce_dimension_default,


class AbstractArray(ImplementsReduce):
@property
def dtype(self):
return self._data.dtype

@property
def shape(self):
return self._data.shape

@property
def size(self):
return self._data.size

@property
def ndim(self):
return self._data.ndim

def __len__(self):
return len(self._data)

def __nonzero__(self):
return bool(self.data)

Expand Down
13 changes: 5 additions & 8 deletions src/xray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,14 +261,11 @@ def encode_cf_variable(array):
attributes['units'] = units
attributes['calendar'] = calendar
elif data.dtype == np.dtype('O'):
# Unfortunately, pandas.Index arrays often have dtype=object even if
# they were created from an array with a sensible datatype (e.g.,
# pandas.Float64Index always has dtype=object for some reason). Because
# we allow for doing math with coordinates, these object arrays can
# propagate onward to other variables, which is why we don't only apply
# this check to XArrays with data that is a pandas.Index.
# Accordingly, we convert object arrays to the type of their first
# variable.
# Occasionally, one will end up with variables with dtype=object
# (likely because they were created from pandas objects which don't
# maintain dtype carefully). This code makes a best-effort attempt to
# encode them into a dtype that netCDF can handle by inspecting the
# dtype of the first element.
dtype = np.array(data.reshape(-1)[0]).dtype
# N.B. the "astype" call below will fail if data cannot be cast to the
# type of its first element (which is probably the only sensible thing
Expand Down
23 changes: 12 additions & 11 deletions src/xray/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ class _VariablesDict(OrderedDict):
"""
def _datetimeindices(self):
return [k for k, v in self.iteritems()
if isinstance(v._data, pd.DatetimeIndex)]
if np.issubdtype(v.dtype, np.datetime64)
and isinstance(v.index, pd.DatetimeIndex)]

@property
def virtual(self):
Expand All @@ -76,10 +77,10 @@ def _get_virtual_variable(self, key):
if ref_var in self._datetimeindices():
if suffix == 'season':
# seasons = np.array(['DJF', 'MAM', 'JJA', 'SON'])
month = self[ref_var].data.month
month = self[ref_var].index.month
data = (month // 3) % 4 + 1
else:
data = getattr(self[ref_var].data, suffix)
data = getattr(self[ref_var].index, suffix)
return xarray.XArray(self[ref_var].dimensions, data)
raise KeyError('virtual variable %r not found' % key)

Expand Down Expand Up @@ -130,14 +131,15 @@ def __init__(self, variables=None, attributes=None, decode_cf=False):

def _as_variable(self, name, var, decode_cf=False):
if isinstance(var, DatasetArray):
var = var.array
if not isinstance(var, xarray.XArray):
var = xarray.as_xarray(var)
elif not isinstance(var, xarray.XArray):
try:
var = xarray.XArray(*var)
except TypeError:
raise TypeError('Dataset variables must be of type '
'DatasetArray or XArray, or a sequence of the '
'form (dimensions, data[, attributes])')
'form (dimensions, data[, attributes, '
'encoding])')
# this will unmask and rescale the data as well as convert
# time variables to datetime indices.
if decode_cf:
Expand All @@ -147,9 +149,7 @@ def _as_variable(self, name, var, decode_cf=False):
if var.ndim != 1:
raise ValueError('a coordinate variable must be defined with '
'1-dimensional data')
# create a new XArray object on which to modify the data
var = xarray.XArray(var.dimensions, pd.Index(var.data),
var.attributes, encoding=var.encoding)
var = var.to_coord()
return var

def set_variables(self, variables, decode_cf=False):
Expand Down Expand Up @@ -487,7 +487,7 @@ def labeled_by(self, **indexers):
Dataset.indexed_by
Array.indexed_by
"""
return self.indexed_by(**remap_loc_indexers(self.variables, indexers))
return self.indexed_by(**remap_loc_indexers(self, indexers))

def renamed(self, name_dict):
"""Returns a new object with renamed variables and dimensions.
Expand Down Expand Up @@ -625,7 +625,8 @@ def unselect(self, *names):
New dataset based on this dataset. Only the named variables are
removed.
"""
if any(k not in self.variables for k in names):
if any(k not in self.variables and k not in self.virtual_variables
for k in names):
raise ValueError('One or more of the specified variable '
'names does not exist on this dataset')
drop = set(names)
Expand Down
87 changes: 53 additions & 34 deletions src/xray/dataset_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,28 +66,50 @@ def __init__(self, dataset, focus):
self.focus = focus

@property
def array(self):
def variable(self):
return self.dataset.variables[self.focus]
@array.setter
def array(self, value):
@variable.setter
def variable(self, value):
self.dataset[self.focus] = value

# _data is necessary for AbstractArray
@property
def _data(self):
return self.array._data
def dtype(self):
return self.variable.dtype

@property
def shape(self):
return self.variable.shape

@property
def size(self):
return self.variable.size

@property
def ndim(self):
return self.variable.ndim

def __len__(self):
return len(self.variable)

@property
def data(self):
"""The array's data as a numpy.ndarray"""
return self.array.data
"""The variables's data as a numpy.ndarray"""
return self.variable.data
@data.setter
def data(self, value):
self.array.data = value
self.variable.data = value

@property
def index(self):
"""The variable's data as a pandas.Index"""
return self.variable.index

def is_coord(self):
return isinstance(self.variable, xarray.CoordXArray)

@property
def dimensions(self):
return self.array.dimensions
return self.variable.dimensions

def _key_to_indexers(self, key):
return OrderedDict(
Expand All @@ -107,7 +129,7 @@ def __setitem__(self, key, value):
self.dataset[key] = value
else:
# orthogonal array indexing
self.array[key] = value
self.variable[key] = value

def __delitem__(self, key):
del self.dataset[key]
Expand All @@ -127,11 +149,11 @@ def __iter__(self):

@property
def attributes(self):
return self.array.attributes
return self.variable.attributes

@property
def encoding(self):
return self.array.encoding
return self.variable.encoding

@property
def variables(self):
Expand Down Expand Up @@ -175,10 +197,11 @@ def indexed_by(self, **indexers):
Dataset.indexed_by
"""
ds = self.dataset.indexed_by(**indexers)
if self.focus not in ds:
if self.focus not in ds and self.focus in self.dataset:
# always keep focus variable in the dataset, even if it was
# unselected because indexing made it a scalar
ds[self.focus] = self.array.indexed_by(**indexers)
# don't add back in virtual variables (not found in the dataset)
ds[self.focus] = self.variable.indexed_by(**indexers)
return type(self)(ds, self.focus)

def labeled_by(self, **indexers):
Expand Down Expand Up @@ -236,13 +259,8 @@ def refocus(self, new_var, name=None):
If `new_var` is a dataset array, its contents will be merged in.
"""
if not hasattr(new_var, 'dimensions'):
new_var = type(self.array)(self.array.dimensions, new_var)
if self.focus not in self.dimensions:
# only unselect the focus from the dataset if it isn't a coordinate
# variable
ds = self.unselected()
else:
ds = self.dataset
new_var = type(self.variable)(self.variable.dimensions, new_var)
ds = self.dataset.copy() if self.is_coord() else self.unselected()
if name is None:
name = self.focus + '_'
ds[name] = new_var
Expand Down Expand Up @@ -301,7 +319,7 @@ def transpose(self, *dimensions):
numpy.transpose
Array.transpose
"""
return self.refocus(self.array.transpose(*dimensions), self.focus)
return self.refocus(self.variable.transpose(*dimensions), self.focus)

def squeeze(self, dimension=None):
"""Return a new DatasetArray object with squeezed data.
Expand Down Expand Up @@ -361,7 +379,7 @@ def reduce(self, func, dimension=None, axis=None, **kwargs):
DatasetArray with this object's array replaced with an array with
summarized data and the indicated dimension(s) removed.
"""
var = self.array.reduce(func, dimension, axis, **kwargs)
var = self.variable.reduce(func, dimension, axis, **kwargs)
drop = set(self.dimensions) - set(var.dimensions)
# For now, take an aggressive strategy of removing all variables
# associated with any dropped dimensions
Expand Down Expand Up @@ -495,13 +513,13 @@ def to_series(self):
return pd.Series(self.data.reshape(-1), index=index, name=self.focus)

def __array_wrap__(self, obj, context=None):
return self.refocus(self.array.__array_wrap__(obj, context))
return self.refocus(self.variable.__array_wrap__(obj, context))

@staticmethod
def _unary_op(f):
@functools.wraps(f)
def func(self, *args, **kwargs):
return self.refocus(f(self.array, *args, **kwargs),
return self.refocus(f(self.variable, *args, **kwargs),
self.focus + '_' + f.__name__)
return func

Expand All @@ -520,15 +538,15 @@ def func(self, other):
# TODO: automatically group by other variable dimensions to allow
# for broadcasting dimensions like 'dayofyear' against 'time'
self._check_coordinates_compat(other)
ds = self.unselected()
ds = self.dataset.copy() if self.is_coord() else self.unselected()
if hasattr(other, 'unselected'):
ds.merge(other.unselected(), inplace=True)
other_array = getattr(other, 'array', other)
other_array = getattr(other, 'variable', other)
other_focus = getattr(other, 'focus', 'other')
focus = self.focus + '_' + f.__name__ + '_' + other_focus
ds[focus] = (f(self.array, other_array)
ds[focus] = (f(self.variable, other_array)
if not reflexive
else f(other_array, self.array))
else f(other_array, self.variable))
return type(self)(ds, focus)
return func

Expand All @@ -537,8 +555,8 @@ def _inplace_binary_op(f):
@functools.wraps(f)
def func(self, other):
self._check_coordinates_compat(other)
other_array = getattr(other, 'array', other)
self.array = f(self.array, other_array)
other_array = getattr(other, 'variable', other)
self.variable = f(self.variable, other_array)
if hasattr(other, 'unselected'):
self.dataset.merge(other.unselected(), inplace=True)
return self
Expand All @@ -555,8 +573,9 @@ def align(array1, array2):
# TODO: automatically align when doing math with arrays, or better yet
# calculate the union of the indices and fill in the mis-aligned data with
# NaN.
overlapping_coords = {k: (array1.coordinates[k].data
& array2.coordinates[k].data)
# TODO: generalize this function to any number of arguments
overlapping_coords = {k: (array1.coordinates[k].index
& array2.coordinates[k].index)
for k in array1.coordinates
if k in array2.coordinates}
return tuple(ar.labeled_by(**overlapping_coords)
Expand Down
Loading