From 27c50f2efb744138d7942ce0c6354bc4fce6b384 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 17 Dec 2014 23:08:27 -0800 Subject: [PATCH 1/3] add test for out of order reindex --- xray/test/test_dataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py index 77d5ad0ba6e..a1a208c716e 100644 --- a/xray/test/test_dataset.py +++ b/xray/test/test_dataset.py @@ -518,6 +518,11 @@ def test_reindex(self): with self.assertRaisesRegexp(ValueError, 'dictionary'): data.reindex('foo') + # out of order + expected = data.sel(dim1=data['dim1'][:10:-1]) + actual = data.reindex(dim1=data['dim1'][:10:-1]) + self.assertDatasetIdentical(actual, expected) + def test_align(self): left = create_test_data() right = left.copy(deep=True) From 3130c4aa940638220510056473fd12929766279e Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 17 Dec 2014 23:43:13 -0800 Subject: [PATCH 2/3] Fastpath for variable construction --- xray/core/variable.py | 117 ++++++++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 51 deletions(-) diff --git a/xray/core/variable.py b/xray/core/variable.py index 5bfc717ebf3..64a8e12a1b5 100644 --- a/xray/core/variable.py +++ b/xray/core/variable.py @@ -50,26 +50,43 @@ def as_variable(obj, key=None, strict=True): return obj -def _as_compatible_data(data): +def _maybe_wrap_data(data): + """ + Put pandas.Index and numpy.ndarray arguments in adapter objects to ensure + they can be indexed properly. + + NumpyArrayAdapter, PandasIndexAdapter and LazilyIndexedArray should + all pass through unmodified. + """ + if isinstance(data, pd.Index): + # check pd.Index first since it may be an ndarray subclass + return PandasIndexAdapter(data) + if isinstance(data, np.ndarray): + return NumpyArrayAdapter(data) + return data + + +def _as_compatible_data(data, fastpath=False): """Prepare and wrap data to put in a Variable. - Prepare the data: - If data does not have the necessary attributes, convert it to ndarray. - If data has dtype=datetime64, ensure that it has ns precision. If it's a pandas.Timestamp, convert it to datetime64. - If data is already a pandas or xray object (other than an Index), just use the values. - Wrap it up: - - Finally, put pandas.Index and numpy.ndarray arguments in adapter objects - to ensure they can be indexed properly. - - NumpyArrayAdapter, PandasIndexAdapter and LazilyIndexedArray should - all pass through unmodified. + Finally, wrap it up with an adapter if necessary. """ - if isinstance(data, pd.MultiIndex): - raise NotImplementedError( - 'no support yet for using a pandas.MultiIndex in an ' - 'xray.Coordinate') + if fastpath and getattr(data, 'ndim', 0) > 0: + # can't use fastpath (yet) for scalars + return _maybe_wrap_data(data) + + if isinstance(data, pd.Index): + if isinstance(data, pd.MultiIndex): + raise NotImplementedError( + 'no support yet for using a pandas.MultiIndex in an ' + 'xray.Coordinate') + return _maybe_wrap_data(data) if isinstance(data, pd.Timestamp): # TODO: convert, handle datetime objects, too @@ -85,32 +102,26 @@ def _as_compatible_data(data): # data must be ndarray-like data = np.asarray(data) - # ensure data is properly wrapped up - if isinstance(data, pd.Index): - # check pd.Index first since it may be an ndarray subclass - data = PandasIndexAdapter(data) - else: - # we don't want nested self-described arrays - data = getattr(data, 'values', data) - - if isinstance(data, np.ma.MaskedArray): - mask = np.ma.getmaskarray(data) - if mask.any(): - dtype, fill_value = common._maybe_promote(data.dtype) - data = np.asarray(data, dtype=dtype) - data[mask] = fill_value - else: - data = np.asarray(data) + # we don't want nested self-described arrays + data = getattr(data, 'values', data) + + if isinstance(data, np.ma.MaskedArray): + mask = np.ma.getmaskarray(data) + if mask.any(): + dtype, fill_value = common._maybe_promote(data.dtype) + data = np.asarray(data, dtype=dtype) + data[mask] = fill_value + else: + data = np.asarray(data) - if isinstance(data, np.ndarray): - if data.dtype.kind == 'M': - # TODO: automatically cast arrays of datetime objects as well - data = np.asarray(data, 'datetime64[ns]') - if data.dtype.kind == 'm': - data = np.asarray(data, 'timedelta64[ns]') - data = NumpyArrayAdapter(data) + if isinstance(data, np.ndarray): + if data.dtype.kind == 'M': + # TODO: automatically cast arrays of datetime objects as well + data = np.asarray(data, 'datetime64[ns]') + if data.dtype.kind == 'm': + data = np.asarray(data, 'timedelta64[ns]') - return data + return _maybe_wrap_data(data) class NumpyArrayAdapter(utils.NDArrayMixin): @@ -237,7 +248,7 @@ class Variable(common.AbstractArray): form of a Dataset or DataArray should almost always be preferred, because they can use more complete metadata in context of coordinate labels. """ - def __init__(self, dims, data, attrs=None, encoding=None): + def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): """ Parameters ---------- @@ -257,7 +268,7 @@ def __init__(self, dims, data, attrs=None, encoding=None): Well behaviored code to serialize a Variable should ignore unrecognized encoding items. """ - self._data = _as_compatible_data(data) + self._data = _as_compatible_data(data, fastpath=fastpath) self._dims = self._parse_dimensions(dims) self._attrs = None self._encoding = None @@ -329,8 +340,8 @@ def values(self, values): def to_coord(self): """Return this variable as an xray.Coordinate""" - return Coordinate(self.dims, self._data, self.attrs, - encoding=self.encoding) + return Coordinate(self.dims, self._data, self._attrs, + encoding=self._encoding, fastpath=True) @property def as_index(self): @@ -391,15 +402,15 @@ def __getitem__(self, key): """ key = self._item_key_to_tuple(key) key = indexing.expanded_indexer(key, self.ndim) - dims = [dim for k, dim in zip(key, self.dims) - if not isinstance(k, (int, np.integer))] + dims = tuple(dim for k, dim in zip(key, self.dims) + if not isinstance(k, (int, np.integer))) values = self._data[key] # orthogonal indexing should ensure the dimensionality is consistent if hasattr(values, 'ndim'): assert values.ndim == len(dims), (values.ndim, len(dims)) else: assert len(dims) == 0, len(dims) - return type(self)(dims, values, self.attrs) + return type(self)(dims, values, self._attrs, fastpath=True) def __setitem__(self, key, value): """__setitem__ is overloaded to access the underlying numpy values with @@ -454,7 +465,8 @@ def copy(self, deep=True): # note: # dims is already an immutable tuple # attributes and encoding will be copied when the new Array is created - return type(self)(self.dims, data, self.attrs, self.encoding) + return type(self)(self.dims, data, self._attrs, self._encoding, + fastpath=True) def __copy__(self): return self.copy(deep=False) @@ -524,7 +536,7 @@ def transpose(self, *dims): dims = self.dims[::-1] axes = self.get_axis_num(dims) data = self.values.transpose(axes) - return type(self)(dims, data, self.attrs, self.encoding) + return type(self)(dims, data, self._attrs, self._encoding, fastpath=True) def squeeze(self, dim=None): """Return a new Variable object with squeezed data. @@ -585,7 +597,8 @@ def set_dims(self, dims): self_dims = set(self.dims) exp_dims = tuple(d for d in dims if d not in self_dims) + self.dims exp_data = utils.as_shape(self, [dims[d] for d in exp_dims]) - expanded_var = Variable(exp_dims, exp_data, self.attrs, self.encoding) + expanded_var = Variable(exp_dims, exp_data, self._attrs, + self._encoding, fastpath=True) return expanded_var.transpose(*dims) def reduce(self, func, dim=None, axis=None, keep_attrs=False, @@ -634,7 +647,7 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=False, dims = [dim for n, dim in enumerate(self.dims) if n not in removed_axes] - attrs = self.attrs if keep_attrs else None + attrs = self._attrs if keep_attrs else None return Variable(dims, data, attrs=attrs) @@ -827,8 +840,8 @@ class Coordinate(Variable): """ _cache_data_class = PandasIndexAdapter - def __init__(self, name, data, attrs=None, encoding=None): - super(Coordinate, self).__init__(name, data, attrs, encoding) + def __init__(self, name, data, attrs=None, encoding=None, fastpath=False): + super(Coordinate, self).__init__(name, data, attrs, encoding, fastpath) if self.ndim != 1: raise ValueError('%s objects must be 1-dimensional' % type(self).__name__) @@ -837,9 +850,10 @@ def __getitem__(self, key): key = self._item_key_to_tuple(key) values = self._data[key] if not hasattr(values, 'ndim') or values.ndim == 0: - return Variable((), values, self.attrs, self.encoding) + return Variable((), values, self._attrs, self._encoding) else: - return type(self)(self.dims, values, self.attrs, self.encoding) + return type(self)(self.dims, values, self._attrs, self._encoding, + fastpath=True) def __setitem__(self, key, value): raise TypeError('%s values cannot be modified' % type(self).__name__) @@ -853,7 +867,8 @@ def copy(self, deep=True): # there is no need to copy the index values here even if deep=True # since pandas.Index objects are immutable data = PandasIndexAdapter(self) if deep else self._data - return type(self)(self.dims, data, self.attrs, self.encoding) + return type(self)(self.dims, data, self._attrs, self._encoding, + fastpath=True) def _data_equals(self, other): return self.to_index().equals(other.to_index()) From 3925fda821688b8d824b277b0d82798e83dc40e6 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 18 Dec 2014 19:34:34 -0800 Subject: [PATCH 3/3] Coerce datetime/timedelta arrays to datetime64/timedelta64 --- xray/core/alignment.py | 2 +- xray/core/common.py | 11 ++++ xray/core/variable.py | 1 + xray/test/test_variable.py | 108 ++++++++++++++++++++++--------------- 4 files changed, 79 insertions(+), 43 deletions(-) diff --git a/xray/core/alignment.py b/xray/core/alignment.py index f5050d280a9..37c3aa1ef1f 100644 --- a/xray/core/alignment.py +++ b/xray/core/alignment.py @@ -156,7 +156,7 @@ def var_indexers(var, indexers): data = np.empty(shape, dtype=dtype) data[:] = fill_value # create a new Variable so we can use orthogonal indexing - new_var = Variable(var.dims, data, var.attrs) + new_var = Variable(var.dims, data, var.attrs, fastpath=True) new_var[assign_to] = var[assign_from].values elif any_not_full_slices(assign_from): # type coercion is not necessary as there are no missing diff --git a/xray/core/common.py b/xray/core/common.py index 54cb0107c58..78037e75032 100644 --- a/xray/core/common.py +++ b/xray/core/common.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd from .pycompat import basestring, iteritems from . import formatting @@ -127,3 +128,13 @@ def _maybe_promote(dtype): dtype = object fill_value = np.nan return dtype, fill_value + + +def _possibly_convert_objects(values): + try: + converter = pd.core.common._possibly_convert_objects + except AttributeError: + # our fault for using a private pandas API that has gone missing + # this should do the same coercion (though it will be slower) + converter = lambda x: np.asarray(pd.Series(x)) + return converter(values.ravel()).reshape(values.shape) diff --git a/xray/core/variable.py b/xray/core/variable.py index 64a8e12a1b5..c3cf495c046 100644 --- a/xray/core/variable.py +++ b/xray/core/variable.py @@ -115,6 +115,7 @@ def _as_compatible_data(data, fastpath=False): data = np.asarray(data) if isinstance(data, np.ndarray): + data = common._possibly_convert_objects(data) if data.dtype.kind == 'M': # TODO: automatically cast arrays of datetime objects as well data = np.asarray(data, 'datetime64[ns]') diff --git a/xray/test/test_variable.py b/xray/test/test_variable.py index b17ba0a7437..e7c8e24c1a5 100644 --- a/xray/test/test_variable.py +++ b/xray/test/test_variable.py @@ -88,7 +88,7 @@ def test_index_0d_string(self): def test_index_0d_datetime(self): d = datetime(2000, 1, 1) x = self.cls(['x'], [d]) - self.assertIndexedLikeNDArray(x, d) + self.assertIndexedLikeNDArray(x, np.datetime64(d)) x = self.cls(['x'], [np.datetime64(d)]) self.assertIndexedLikeNDArray(x, np.datetime64(d), 'datetime64[ns]') @@ -148,6 +148,42 @@ def test_0d_time_data(self): expected = np.datetime64('2000-01-01T00Z', 'ns') self.assertEqual(x[0].values, expected) + def test_datetime64_conversion(self): + times = pd.date_range('2000-01-01', periods=3) + for values, preserve_source in [ + (times, False), + (times.values, True), + (times.values.astype('datetime64[s]'), False), + (times.to_pydatetime(), False), + ]: + v = self.cls(['t'], values) + self.assertEqual(v.dtype, np.dtype('datetime64[ns]')) + self.assertArrayEqual(v.values, times.values) + self.assertEqual(v.values.dtype, np.dtype('datetime64[ns]')) + same_source = source_ndarray(v.values) is source_ndarray(values) + if preserve_source and self.cls is Variable: + self.assertTrue(same_source) + else: + self.assertFalse(same_source) + + def test_timedelta64_conversion(self): + times = pd.timedelta_range(start=0, periods=3) + for values, preserve_source in [ + (times, False), + (times.values, True), + (times.values.astype('timedelta64[s]'), False), + (times.to_pytimedelta(), False), + ]: + v = self.cls(['t'], values) + self.assertEqual(v.dtype, np.dtype('timedelta64[ns]')) + self.assertArrayEqual(v.values, times.values) + self.assertEqual(v.values.dtype, np.dtype('timedelta64[ns]')) + same_source = source_ndarray(v.values) is source_ndarray(values) + if preserve_source and self.cls is Variable: + self.assertTrue(same_source) + else: + self.assertFalse(same_source) + def test_pandas_data(self): v = self.cls(['x'], pd.Series([0, 1, 2], index=[3, 2, 1])) self.assertVariableIdentical(v, v[[0, 1, 2]]) @@ -333,29 +369,29 @@ def test_numpy_same_methods(self): v = Coordinate('x', np.arange(5)) self.assertEqual(2, v.searchsorted(2)) - def test_datetime64_conversion(self): - # verify that datetime64 is always converted to ns precision with - # sources preserved - values = np.datetime64('2000-01-01T00') - v = Variable([], values) - self.assertEqual(v.dtype, np.dtype('datetime64[ns]')) - self.assertEqual(v.values, values) - self.assertEqual(v.values.dtype, np.dtype('datetime64[ns]')) - - values = pd.date_range('2000-01-01', periods=3).values.astype( - 'datetime64[s]') - v = Variable(['t'], values) - self.assertEqual(v.dtype, np.dtype('datetime64[ns]')) - self.assertArrayEqual(v.values, values) - self.assertEqual(v.values.dtype, np.dtype('datetime64[ns]')) - self.assertIsNot(source_ndarray(v.values), values) - - values = pd.date_range('2000-01-01', periods=3).values.copy() - v = Variable(['t'], values) - self.assertEqual(v.dtype, np.dtype('datetime64[ns]')) - self.assertArrayEqual(v.values, values) - self.assertEqual(v.values.dtype, np.dtype('datetime64[ns]')) - self.assertIs(source_ndarray(v.values), values) + def test_datetime64_conversion_scalar(self): + expected = np.datetime64('2000-01-01T00:00:00Z', 'ns') + for values in [ + np.datetime64('2000-01-01T00Z'), + pd.Timestamp('2000-01-01T00'), + datetime(2000, 1, 1), + ]: + v = Variable([], values) + self.assertEqual(v.dtype, np.dtype('datetime64[ns]')) + self.assertEqual(v.values, expected) + self.assertEqual(v.values.dtype, np.dtype('datetime64[ns]')) + + def test_timedelta64_conversion_scalar(self): + expected = np.timedelta64(24 * 60 * 60 * 10 ** 9, 'ns') + for values in [ + np.timedelta64(1, 'D'), + pd.Timedelta('1 day'), + timedelta(days=1), + ]: + v = Variable([], values) + self.assertEqual(v.dtype, np.dtype('timedelta64[ns]')) + self.assertEqual(v.values, expected) + self.assertEqual(v.values.dtype, np.dtype('timedelta64[ns]')) def test_0d_str(self): v = Variable([], u'foo') @@ -676,18 +712,6 @@ def test_data(self): with self.assertRaisesRegexp(TypeError, 'cannot be modified'): x[:] = 0 - def test_avoid_index_dtype_inference(self): - # verify our work-around for (pandas<0.14): - # https://github.com/pydata/pandas/issues/6370 - data = pd.date_range('2000-01-01', periods=3).to_pydatetime() - t = Coordinate('t', data) - self.assertArrayEqual(t.values[:2], data[:2]) - self.assertArrayEqual(t[:2].values, data[:2]) - self.assertArrayEqual(t.values[:2], data[:2]) - self.assertArrayEqual(t[:2].values, data[:2]) - self.assertEqual(t.dtype, object) - self.assertEqual(t[:2].dtype, object) - def test_name(self): coord = Coordinate('x', [10.0]) self.assertEqual(coord.name, 'x') @@ -729,27 +753,27 @@ def test_masked_array(self): self.assertEqual(np.dtype(float), actual.dtype) def test_datetime(self): - expected = np.datetime64('2000-01-01T00') + expected = np.datetime64('2000-01-01T00Z') actual = _as_compatible_data(expected) self.assertEqual(expected, actual) self.assertEqual(NumpyArrayAdapter, type(actual)) self.assertEqual(np.dtype('datetime64[ns]'), actual.dtype) - expected = np.array([np.datetime64('2000-01-01T00')]) + expected = np.array([np.datetime64('2000-01-01T00Z')]) actual = _as_compatible_data(expected) self.assertEqual(np.asarray(expected), actual) self.assertEqual(NumpyArrayAdapter, type(actual)) self.assertEqual(np.dtype('datetime64[ns]'), actual.dtype) - expected = np.array([np.datetime64('2000-01-01T00', 'ns')]) + expected = np.array([np.datetime64('2000-01-01T00Z', 'ns')]) actual = _as_compatible_data(expected) self.assertEqual(np.asarray(expected), actual) self.assertEqual(NumpyArrayAdapter, type(actual)) self.assertEqual(np.dtype('datetime64[ns]'), actual.dtype) self.assertIs(expected, source_ndarray(np.asarray(actual))) - expected = pd.Timestamp('2000-01-01T00').to_datetime() - actual = _as_compatible_data(expected) + expected = np.datetime64('2000-01-01T00Z', 'ns') + actual = _as_compatible_data(datetime(2000, 1, 1)) self.assertEqual(np.asarray(expected), actual) self.assertEqual(NumpyArrayAdapter, type(actual)) - self.assertEqual(np.dtype('O'), actual.dtype) + self.assertEqual(np.dtype('datetime64[ns]'), actual.dtype)