diff --git a/test/test_utils.py b/test/test_utils.py index 78545630b49..fa72290e09d 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -21,6 +21,17 @@ def test(self): self.assertEqual(expected.dtype, actual.dtype) +class TestArrayEquiv(TestCase): + def test_0d(self): + # verify our work around for pd.isnull not working for 0-dimensional + # object arrays + self.assertTrue(utils.array_equiv(0, np.array(0, dtype=object))) + self.assertTrue( + utils.array_equiv(np.nan, np.array(np.nan, dtype=object))) + self.assertFalse( + utils.array_equiv(0, np.array(1, dtype=object))) + + class TestDictionaries(TestCase): def setUp(self): self.x = {'a': 'A', 'b': 'B'} diff --git a/test/test_variable.py b/test/test_variable.py index 5aa43fd7cab..632db73a0a0 100644 --- a/test/test_variable.py +++ b/test/test_variable.py @@ -6,9 +6,10 @@ import numpy as np import pandas as pd -from xray import Variable, Dataset, DataArray +from xray import Variable, Dataset, DataArray, indexing from xray.variable import (Coordinate, as_variable, NumpyArrayAdapter, - PandasIndexAdapter) + PandasIndexAdapter, _as_compatible_data) +from xray.pycompat import PY3 from . import TestCase, source_ndarray @@ -36,32 +37,89 @@ def test_attrs(self): v.attrs['foo'] = 'baz' self.assertEqual(v.attrs['foo'], 'baz') - def test_0d_data(self): - d = datetime(2000, 1, 1) - for value, dtype in [(0, int), - (np.float32(0.5), np.float32), - ('foo', np.str_), - (d, None), - (np.datetime64(d), np.datetime64)]: + def assertIndexedLikeNDArray(self, variable, expected_value0, + expected_dtype=None): + """Given a 1-dimensional variable, verify that the variable is indexed + like a numpy.ndarray. 
+ """ + self.assertEqual(variable[0].shape, ()) + self.assertEqual(variable[0].ndim, 0) + self.assertEqual(variable[0].size, 1) + # test identity + self.assertTrue(variable.equals(variable.copy())) + self.assertTrue(variable.identical(variable.copy())) + # check value is equal for both ndarray and Variable + self.assertEqual(variable.values[0], expected_value0) + self.assertEqual(variable[0].values, expected_value0) + # check type or dtype is consistent for both ndarray and Variable + if expected_dtype is None: + # check output type instead of array dtype + self.assertEqual(type(variable.values[0]), type(expected_value0)) + self.assertEqual(type(variable[0].values), type(expected_value0)) + else: + self.assertEqual(variable.values[0].dtype, expected_dtype) + self.assertEqual(variable[0].values.dtype, expected_dtype) + + def test_index_0d_int(self): + for value, dtype in [(0, np.int_), + (np.int32(0), np.int32)]: + x = self.cls(['x'], [value]) + self.assertIndexedLikeNDArray(x, value, dtype) + + def test_index_0d_float(self): + for value, dtype in [(0.5, np.float_), + (np.float32(0.5), np.float32)]: + x = self.cls(['x'], [value]) + self.assertIndexedLikeNDArray(x, value, dtype) + + def test_index_0d_string(self): + for value, dtype in [('foo', np.dtype('U3' if PY3 else 'S3')), + (u'foo', np.dtype('U3'))]: x = self.cls(['x'], [value]) - # check array properties - self.assertEqual(x[0].shape, ()) - self.assertEqual(x[0].ndim, 0) - self.assertEqual(x[0].size, 1) - # test identity - self.assertTrue(x.equals(x.copy())) - self.assertTrue(x.identical(x.copy())) - # check value is equal for both ndarray and Variable - self.assertEqual(x.values[0], value) - self.assertEqual(x[0].values, value) - # check type or dtype is consistent for both ndarray and Variable - if dtype is None: - # check output type instead of array dtype - self.assertEqual(type(x.values[0]), type(value)) - self.assertEqual(type(x[0].values), type(value)) - else: - assert np.issubdtype(x.values[0].dtype, 
dtype), (x.values[0].dtype, dtype) - assert np.issubdtype(x[0].values.dtype, dtype), (x[0].values.dtype, dtype) + self.assertIndexedLikeNDArray(x, value, dtype) + + def test_index_0d_datetime(self): + d = datetime(2000, 1, 1) + x = self.cls(['x'], [d]) + self.assertIndexedLikeNDArray(x, d) + + x = self.cls(['x'], [np.datetime64(d)]) + self.assertIndexedLikeNDArray(x, np.datetime64(d), 'datetime64[ns]') + + x = self.cls(['x'], pd.DatetimeIndex([d])) + self.assertIndexedLikeNDArray(x, np.datetime64(d), 'datetime64[ns]') + + def test_index_0d_object(self): + + class HashableItemWrapper(object): + def __init__(self, item): + self.item = item + + def __eq__(self, other): + return self.item == other.item + + def __hash__(self): + return hash(self.item) + + def __repr__(self): + return '%s(item=%r)' % (type(self).__name__, self.item) + + item = HashableItemWrapper((1, 2, 3)) + x = self.cls('x', [item]) + self.assertIndexedLikeNDArray(x, item) + + def test_index_and_concat_datetime(self): + # regression test for #125 + date_range = pd.date_range('2011-09-01', periods=10) + for dates in [date_range, date_range.values, + date_range.to_pydatetime()]: + expected = self.cls('t', dates) + for times in [[expected[i] for i in range(10)], + [expected[i:(i + 1)] for i in range(10)], + [expected[[i]] for i in range(10)]]: + actual = Variable.concat(times, 't') + self.assertEqual(expected.dtype, actual.dtype) + self.assertArrayEqual(expected, actual) def test_0d_time_data(self): # regression test for #105 @@ -229,6 +287,39 @@ def test_item(self): self.assertEqual(v.item(), 0) self.assertIs(type(v.item()), float) + def test_datetime64_conversion(self): + # verify that datetime64 is always converted to ns precision with + # sources preserved + values = np.datetime64('2000-01-01T00') + v = Variable([], values) + self.assertEqual(v.dtype, np.dtype('datetime64[ns]')) + self.assertEqual(v.values, values) + self.assertEqual(v.values.dtype, np.dtype('datetime64[ns]')) + + values = 
pd.date_range('2000-01-01', periods=3).values.astype( + 'datetime64[s]') + v = Variable(['t'], values) + self.assertEqual(v.dtype, np.dtype('datetime64[ns]')) + self.assertArrayEqual(v.values, values) + self.assertEqual(v.values.dtype, np.dtype('datetime64[ns]')) + self.assertIsNot(source_ndarray(v.values), values) + + values = pd.date_range('2000-01-01', periods=3).values.copy() + v = Variable(['t'], values) + self.assertEqual(v.dtype, np.dtype('datetime64[ns]')) + self.assertArrayEqual(v.values, values) + self.assertEqual(v.values.dtype, np.dtype('datetime64[ns]')) + self.assertIs(source_ndarray(v.values), values) + + def test_0d_str(self): + v = Variable([], u'foo') + self.assertEqual(v.dtype, np.dtype('U3')) + self.assertEqual(v.values, 'foo') + + v = Variable([], np.string_('foo')) + self.assertEqual(v.dtype, np.dtype('S3')) + self.assertEqual(v.values, bytes('foo', 'ascii') if PY3 else 'foo') + def test_equals_and_identical(self): d = np.random.rand(10, 3) d[0, 0] = np.nan @@ -463,3 +554,60 @@ def test_data(self): self.assertIsInstance(x._data, PandasIndexAdapter) with self.assertRaisesRegexp(TypeError, 'cannot be modified'): x[:] = 0 + + def test_avoid_index_dtype_inference(self): + # verify our work-around for (pandas<0.14): + # https://github.com/pydata/pandas/issues/6370 + data = pd.date_range('2000-01-01', periods=3).to_pydatetime() + t = Coordinate('t', data) + self.assertArrayEqual(t.values[:2], data[:2]) + self.assertArrayEqual(t[:2].values, data[:2]) + self.assertArrayEqual(t.values[:2], data[:2]) + self.assertArrayEqual(t[:2].values, data[:2]) + self.assertEqual(t.dtype, object) + self.assertEqual(t[:2].dtype, object) + + +class TestAsCompatibleData(TestCase): + def test_unchanged_types(self): + types = (NumpyArrayAdapter, PandasIndexAdapter, + indexing.LazilyIndexedArray) + for t in types: + for data in [np.arange(3), + pd.date_range('2000-01-01', periods=3), + pd.date_range('2000-01-01', periods=3).values]: + x = t(data) + self.assertIs(x, 
_as_compatible_data(x)) + + def test_converted_types(self): + for input_array in [[[0, 1, 2]], pd.DataFrame([[0, 1, 2]])]: + actual = _as_compatible_data(input_array) + self.assertArrayEqual(np.asarray(input_array), actual) + self.assertEqual(NumpyArrayAdapter, type(actual)) + self.assertEqual(np.dtype(int), actual.dtype) + + def test_datetime(self): + expected = np.datetime64('2000-01-01T00') + actual = _as_compatible_data(expected) + self.assertEqual(expected, actual) + self.assertEqual(np.datetime64, type(actual)) + self.assertEqual(np.dtype('datetime64[ns]'), actual.dtype) + + expected = np.array([np.datetime64('2000-01-01T00')]) + actual = _as_compatible_data(expected) + self.assertEqual(np.asarray(expected), actual) + self.assertEqual(NumpyArrayAdapter, type(actual)) + self.assertEqual(np.dtype('datetime64[ns]'), actual.dtype) + + expected = np.array([np.datetime64('2000-01-01T00', 'ns')]) + actual = _as_compatible_data(expected) + self.assertEqual(np.asarray(expected), actual) + self.assertEqual(NumpyArrayAdapter, type(actual)) + self.assertEqual(np.dtype('datetime64[ns]'), actual.dtype) + self.assertIs(expected, source_ndarray(np.asarray(actual))) + + expected = pd.Timestamp('2000-01-01T00').to_datetime() + actual = _as_compatible_data(expected) + self.assertEqual(np.asarray(expected), actual) + self.assertEqual(NumpyArrayAdapter, type(actual)) + self.assertEqual(np.dtype('O'), actual.dtype) diff --git a/xray/backends/netCDF4_.py b/xray/backends/netCDF4_.py index 0268a8c41ef..54beebc7b4c 100644 --- a/xray/backends/netCDF4_.py +++ b/xray/backends/netCDF4_.py @@ -7,7 +7,7 @@ from .netcdf3 import encode_nc3_variable import xray from xray.conventions import encode_cf_variable -from xray.utils import FrozenOrderedDict, NDArrayMixin, as_array_or_item +from xray.utils import FrozenOrderedDict, NDArrayMixin from xray import indexing from xray.pycompat import iteritems, basestring @@ -31,7 +31,7 @@ def __getitem__(self, key): # work around for netCDF4-python's 
broken handling of 0-d # arrays (slicing them always returns a 1-dimensional array): # https://github.com/Unidata/netcdf4-python/pull/220 - data = as_array_or_item(np.asscalar(self.array[key])) + data = np.asscalar(self.array[key]) else: data = self.array[key] return data diff --git a/xray/indexing.py b/xray/indexing.py index 0456b32f191..fee7ceae052 100644 --- a/xray/indexing.py +++ b/xray/indexing.py @@ -212,12 +212,8 @@ def shape(self): shape.append(k.size) return tuple(shape) - @property - def values(self): - return self.array[self.key] - def __array__(self, dtype=None): - return np.asarray(self.values, dtype=None) + return np.asarray(self.array[self.key], dtype=dtype) def __getitem__(self, key): return type(self)(self.array, self._updated_key(key)) diff --git a/xray/utils.py b/xray/utils.py index 35337481f0a..7a1ce1f0e65 100644 --- a/xray/utils.py +++ b/xray/utils.py @@ -4,7 +4,6 @@ import operator import warnings from collections import OrderedDict, Mapping, MutableMapping -from datetime import datetime import numpy as np import pandas as pd @@ -36,35 +35,6 @@ def __new__(cls, *args, **kwargs): return Wrapper -def as_safe_array(values, dtype=None): - """Like np.asarray, but convert all datetime64 arrays to ns precision - """ - values = np.asarray(values, dtype=dtype) - if values.dtype.kind == 'M': - # np.datetime64 - values = values.astype('datetime64[ns]') - return values - - -def as_array_or_item(values, dtype=None): - """Return the given values as a numpy array of the indicated dtype, or as - an individual value if it's a 0-dimensional object array or datetime. 
- """ - if isinstance(values, datetime): - # shortcut because if you try to make a datetime or Timestamp object - # into an array with the proper dtype, it is liable to be silently - # converted into an integer instead :( - return values - values = as_safe_array(values, dtype=dtype) - if values.ndim == 0 and values.dtype.kind == 'O': - # unpack 0d object arrays to be consistent with numpy - values = values.item() - if isinstance(values, pd.Timestamp): - # turn Timestamps back into datetime64 objects - values = np.datetime64(values, 'ns') - return values - - def squeeze(xray_obj, dimensions, dimension=None): """Squeeze the dimensions of an xray object.""" if dimension is None: @@ -93,11 +63,22 @@ def array_equiv(arr1, arr2): arr1, arr2 = np.asarray(arr1), np.asarray(arr2) if arr1.shape != arr2.shape: return False - # we could make this faster by not-checking for null values if the dtype - # does not support them, but the logic would get more convoluted. - # using pd.isnull lets us defer the NaN handling to pandas (and unlike - # np.isnan it works on every dtype). - return ((arr1 == arr2) | (pd.isnull(arr1) & pd.isnull(arr2))).all() + if arr1.ndim == 0: + # work around for pd.isnull not working for 0-dimensional object + # arrays: https://github.com/pydata/pandas/pull/7176 (should be fixed + # in pandas 0.14) + # use .item() instead of keeping around 0-dimensional arrays to avoid + # the numpy quirk where object arrays are checked as equal by identity + # (hence NaN in an object array is equal to itself): + arr1 = arr1.item() + arr2 = arr2.item() + return arr1 == arr2 or (arr1 != arr1 and arr2 != arr2) + else: + # we could make this faster by not-checking for null values if the + # dtype does not support them, but the logic would get more convoluted. + # using pd.isnull lets us defer the NaN handling to pandas (and unlike + # np.isnan it works on every dtype). 
+ return ((arr1 == arr2) | (pd.isnull(arr1) & pd.isnull(arr2))).all() def safe_cast_to_index(array): diff --git a/xray/variable.py b/xray/variable.py index 87673e685b9..97b43e0ebb5 100644 --- a/xray/variable.py +++ b/xray/variable.py @@ -52,16 +52,30 @@ def as_variable(obj, strict=True): def _as_compatible_data(data): - """If data does not have the necessary attributes to be the private _data - attribute, convert it to a np.ndarray and raise an warning + """Prepare and wrap data to put in a Variable. + + Prepare the data: + - If data does not have the necessary attributes, convert it to ndarray. + - If data has dtype=datetime64, ensure that it has ns precision. + - If data is already a pandas or xray object (other than an Index), just + use the values. + + Wrap it up: + - Finally, put pandas.Index and numpy.ndarray arguments in adapter objects + to ensure they can be indexed properly. + - NumpyArrayAdapter, PandasIndexAdapter and LazilyIndexedArray should + all pass through unmodified. """ - # don't check for __len__ or __iter__ so as not to warn if data is a numpy + # don't check for __len__ or __iter__ so as not to cast if data is a numpy # numeric type like np.float32 required = ['dtype', 'shape', 'size', 'ndim'] - if (any(not hasattr(data, attr) for attr in required) - or isinstance(data, np.string_)): - data = utils.as_safe_array(data) - elif not isinstance(data, (pd.Index, indexing.LazilyIndexedArray)): + if any(not hasattr(data, attr) for attr in required): + # data must be ndarray-like + data = np.asarray(data) + elif isinstance(data, np.datetime64): + # note: np.datetime64 is ndarray-like + data = np.datetime64(data, 'ns') + elif not isinstance(data, pd.Index): try: # we don't want nested self-described arrays # use try/except instead of hasattr to only calculate values once @@ -73,7 +87,10 @@ def _as_compatible_data(data): # check pd.Index first since it's (currently) an ndarray subclass data = PandasIndexAdapter(data) elif isinstance(data, np.ndarray): - 
data = NumpyArrayAdapter(utils.as_safe_array(data)) + if data.dtype.kind == 'M': + data = np.asarray(data, 'datetime64[ns]') + data = NumpyArrayAdapter(data) + return data @@ -130,23 +147,45 @@ def __getitem__(self, key): # unpack key so it can index a pandas.Index object (pandas.Index # objects don't like tuples) key, = key + if isinstance(key, (int, np.integer)): - return utils.as_array_or_item(self.array[key], dtype=self.dtype) + value = np.asarray(self.array[key], dtype=self.dtype) else: - if isinstance(key, slice) and key == slice(None): - # pandas<0.14 does dtype inference when slicing; we would like - # to avoid this if possible + arr = self.array[key] + if arr.dtype != self.array.dtype: + # pandas<0.14 does dtype inference when slicing: # https://github.com/pydata/pandas/issues/6370 - arr = self.array - else: - arr = self.array[key] - return PandasIndexAdapter(arr, dtype=self.dtype) + # To avoid this, slice values instead if necessary and accept + # that we will need to rebuild the index: + arr = self.array.values[key] + value = PandasIndexAdapter(arr, dtype=self.dtype) + + return value def __repr__(self): return ('%s(array=%r, dtype=%r)' % (type(self).__name__, self.array, self.dtype)) +def _as_array_or_item(data): + """Return the given values as a numpy array, or as an individual item if + it's a 0-dimensional object array or datetime64. + + Importantly, this function does not copy data if it is already an ndarray - + otherwise, it will not be possible to update Variable values in place. 
+ """ + data = np.asarray(data) + if data.ndim == 0: + if data.dtype.kind == 'O': + # unpack 0d object arrays to be consistent with numpy + data = data.item() + elif data.dtype.kind == 'M': + # convert to a np.datetime64 object, because 0-dimensional ndarrays + # with dtype=datetime64 are broken :( + data = np.datetime64(data, 'ns') + return data + + class Variable(AbstractArray): """A netcdf-like variable consisting of dimensions, data and attributes which describe a single Array. A single Variable object is not fully @@ -219,7 +258,7 @@ def __getstate__(self): @property def values(self): """The variable's data as a numpy.ndarray""" - return utils.as_array_or_item(self._data_cached()) + return _as_array_or_item(self._data_cached()) @values.setter def values(self, values):