pydata
diff --git a/‎doc/time-series.rst
Lines changed: 95 additions & 1 deletion b/‎doc/time-series.rst
Lines changed: 95 additions & 1 deletion
diff --git a/‎doc/whats-new.rst
Lines changed: 10 additions & 0 deletions b/‎doc/whats-new.rst
Lines changed: 10 additions & 0 deletions
diff --git a/‎xarray/coding/cftimeindex.py
Lines changed: 252 additions & 0 deletions b/‎xarray/coding/cftimeindex.py
Lines changed: 252 additions & 0 deletions
@@ -70,7 +70,11 @@ You can manual decode arrays in this form by passing a dataset to
 One unfortunate limitation of using ``datetime64[ns]`` is that it limits the
 native representation of dates to those that fall between the years 1678 and
 2262. When a netCDF file contains dates outside of these bounds, dates will be
-returned as arrays of ``netcdftime.datetime`` objects.
+returned as arrays of ``cftime.datetime`` objects and a ``CFTimeIndex``
+can be used for indexing.  The ``CFTimeIndex`` enables only a subset of
+the indexing functionality of a ``pandas.DatetimeIndex`` and is only enabled
+when using the standalone version of ``cftime`` (not the version packaged with
+earlier versions of ``netCDF4``).  See :ref:`CFTimeIndex` for more information.
 
 Datetime indexing
 -----------------
@@ -207,3 +211,93 @@ Dataset and DataArray objects with an arbitrary number of dimensions.
 
 For more examples of using grouped operations on a time dimension, see
 :ref:`toy weather data`.
+
+
+.. _CFTimeIndex:
+     
+Non-standard calendars and dates outside the Timestamp-valid range
+------------------------------------------------------------------
+
+Through the standalone ``cftime`` library and a custom subclass of
+``pandas.Index``, xarray supports a subset of the indexing functionality enabled
+through the standard ``pandas.DatetimeIndex`` for dates from non-standard
+calendars or dates using a standard calendar, but outside the
+`Timestamp-valid range`_ (approximately between years 1678 and 2262).  This
+behavior has not yet been turned on by default; to take advantage of this
+functionality, you must have the ``enable_cftimeindex`` option set to
+``True`` within your context (see :py:func:`~xarray.set_options` for more
+information).  It is expected that this will become the default behavior in
+xarray version 0.11.
+
+For instance, you can create a DataArray indexed by a time
+coordinate with a no-leap calendar within a context manager setting the
+``enable_cftimeindex`` option, and the time index will be cast to a
+``CFTimeIndex``:
+
+.. ipython:: python
+
+   from itertools import product
+   from cftime import DatetimeNoLeap
+   
+   dates = [DatetimeNoLeap(year, month, 1) for year, month in
+            product(range(1, 3), range(1, 13))]
+   with xr.set_options(enable_cftimeindex=True):
+       da = xr.DataArray(np.arange(24), coords=[dates], dims=['time'],
+                         name='foo')
+                         
+.. note::
+
+   With the ``enable_cftimeindex`` option activated, a ``CFTimeIndex``
+   will be used for time indexing if any of the following are true:
+
+   - The dates are from a non-standard calendar
+   - Any dates are outside the Timestamp-valid range
+
+   Otherwise a ``pandas.DatetimeIndex`` will be used.  In addition, if any
+   variable (not just an index variable) is encoded using a non-standard
+   calendar, its times will be decoded into ``cftime.datetime`` objects,
+   regardless of whether or not they can be represented using
+   ``np.datetime64[ns]`` objects.
+                         
+For data indexed by a ``CFTimeIndex`` xarray currently supports:
+
+- `Partial datetime string indexing`_ using strictly `ISO 8601-format`_ partial
+  datetime strings:
+  
+.. ipython:: python
+
+   da.sel(time='0001')
+   da.sel(time=slice('0001-05', '0002-02'))
+
+- Access of basic datetime components via the ``dt`` accessor (in this case
+  just "year", "month", "day", "hour", "minute", "second", "microsecond", and
+  "season"): 
+
+.. ipython:: python
+
+   da.time.dt.year
+   da.time.dt.month
+   da.time.dt.season
+
+- Group-by operations based on datetime accessor attributes (e.g. by month of
+  the year):
+
+.. ipython:: python
+
+   da.groupby('time.month').sum()
+   
+- And serialization:
+
+.. ipython:: python
+
+   da.to_netcdf('example.nc')
+   xr.open_dataset('example.nc')
+
+.. note::
+   
+   Currently resampling along the time dimension for data indexed by a
+   ``CFTimeIndex`` is not supported.
+  
+.. _Timestamp-valid range: https://pandas.pydata.org/pandas-docs/stable/timeseries.html#timestamp-limitations
+.. _ISO 8601-format: https://en.wikipedia.org/wiki/ISO_8601
+.. _partial datetime string indexing: https://pandas.pydata.org/pandas-docs/stable/timeseries.html#partial-string-indexing
@@ -34,6 +34,16 @@ v0.10.4 (unreleased)
 Enhancements
 ~~~~~~~~~~~~
 
+- Add an option for using a ``CFTimeIndex`` for indexing times with
+  non-standard calendars and/or outside the Timestamp-valid range; this index
+  enables a subset of the functionality of a standard
+  ``pandas.DatetimeIndex`` (:issue:`789`, :issue:`1084`, :issue:`1252`).
+  By `Spencer Clark <https://github.com/spencerkclark>`_ with help from
+  `Stephan Hoyer <https://github.com/shoyer>`_.
+- Allow for serialization of ``cftime.datetime`` objects (:issue:`789`,
+  :issue:`1084`, :issue:`2008`, :issue:`1252`) using the standalone ``cftime``
+  library. By `Spencer Clark
+  <https://github.com/spencerkclark>`_.
 - Support writing lists of strings as netCDF attributes (:issue:`2044`).
   By `Dan Nowacki <https://github.com/dnowacki-usgs>`_.
 - :py:meth:`~xarray.Dataset.to_netcdf(engine='h5netcdf')` now accepts h5py
 
@@ -0,0 +1,252 @@
+from __future__ import absolute_import
+import re
+from datetime import timedelta
+
+import numpy as np
+import pandas as pd
+
+from xarray.core import pycompat
+from xarray.core.utils import is_scalar
+
+
+def named(name, pattern):
+    return '(?P<' + name + '>' + pattern + ')'
+
+
+def optional(x):
+    return '(?:' + x + ')?'
+
+
+def trailing_optional(xs):
+    if not xs:
+        return ''
+    return xs[0] + optional(trailing_optional(xs[1:]))
+
+
+def build_pattern(date_sep='\-', datetime_sep='T', time_sep='\:'):
+    pieces = [(None, 'year', '\d{4}'),
+              (date_sep, 'month', '\d{2}'),
+              (date_sep, 'day', '\d{2}'),
+              (datetime_sep, 'hour', '\d{2}'),
+              (time_sep, 'minute', '\d{2}'),
+              (time_sep, 'second', '\d{2}')]
+    pattern_list = []
+    for sep, name, sub_pattern in pieces:
+        pattern_list.append((sep if sep else '') + named(name, sub_pattern))
+        # TODO: allow timezone offsets?
+    return '^' + trailing_optional(pattern_list) + '$'
+
+
+# Basic format: no separator between the date fields or between the time
+# fields; the date/time separator keeps build_pattern's default ('T').
+_BASIC_PATTERN = build_pattern(date_sep='', time_sep='')
+# Extended format: all of build_pattern's default separators.
+_EXTENDED_PATTERN = build_pattern()
+# Candidate patterns in the order parse_iso8601 tries them.
+_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN]
+
+
+def parse_iso8601(datetime_string):
+    for pattern in _PATTERNS:
+        match = re.match(pattern, datetime_string)
+        if match:
+            return match.groupdict()
+    raise ValueError('no ISO-8601 match for string: %s' % datetime_string)
+
+
+def _parse_iso8601_with_reso(date_type, timestr):
+    default = date_type(1, 1, 1)
+    result = parse_iso8601(timestr)
+    replace = {}
+
+    for attr in ['year', 'month', 'day', 'hour', 'minute', 'second']:
+        value = result.get(attr, None)
+        if value is not None:
+            # Note ISO8601 conventions allow for fractional seconds.
+            # TODO: Consider adding support for sub-second resolution?
+            replace[attr] = int(value)
+            resolution = attr
+
+    return default.replace(**replace), resolution
+
+
+def _parsed_string_to_bounds(date_type, resolution, parsed):
+    """Generalization of
+    pandas.tseries.index.DatetimeIndex._parsed_string_to_bounds
+    for use with non-standard calendars and cftime.datetime
+    objects.
+    """
+    if resolution == 'year':
+        return (date_type(parsed.year, 1, 1),
+                date_type(parsed.year + 1, 1, 1) - timedelta(microseconds=1))
+    elif resolution == 'month':
+        if parsed.month == 12:
+            end = date_type(parsed.year + 1, 1, 1) - timedelta(microseconds=1)
+        else:
+            end = (date_type(parsed.year, parsed.month + 1, 1) -
+                   timedelta(microseconds=1))
+        return date_type(parsed.year, parsed.month, 1), end
+    elif resolution == 'day':
+        start = date_type(parsed.year, parsed.month, parsed.day)
+        return start, start + timedelta(days=1, microseconds=-1)
+    elif resolution == 'hour':
+        start = date_type(parsed.year, parsed.month, parsed.day, parsed.hour)
+        return start, start + timedelta(hours=1, microseconds=-1)
+    elif resolution == 'minute':
+        start = date_type(parsed.year, parsed.month, parsed.day, parsed.hour,
+                          parsed.minute)
+        return start, start + timedelta(minutes=1, microseconds=-1)
+    elif resolution == 'second':
+        start = date_type(parsed.year, parsed.month, parsed.day, parsed.hour,
+                          parsed.minute, parsed.second)
+        return start, start + timedelta(seconds=1, microseconds=-1)
+    else:
+        raise KeyError
+
+
+def get_date_field(datetimes, field):
+    """Adapted from pandas.tslib.get_date_field"""
+    return np.array([getattr(date, field) for date in datetimes])
+
+
+def _field_accessor(name, docstring=None):
+    """Adapted from pandas.tseries.index._field_accessor"""
+    def f(self):
+        return get_date_field(self._data, name)
+
+    f.__name__ = name
+    f.__doc__ = docstring
+    return property(f)
+
+
+def get_date_type(self):
+    return type(self._data[0])
+
+
+def assert_all_valid_date_type(data):
+    import cftime
+
+    sample = data[0]
+    date_type = type(sample)
+    if not isinstance(sample, cftime.datetime):
+        raise TypeError(
+            'CFTimeIndex requires cftime.datetime '
+            'objects. Got object of {}.'.format(date_type))
+    if not all(isinstance(value, date_type) for value in data):
+        raise TypeError(
+            'CFTimeIndex requires using datetime '
+            'objects of all the same type.  Got\n{}.'.format(data))
+
+
+class CFTimeIndex(pd.Index):
+    year = _field_accessor('year', 'The year of the datetime')
+    month = _field_accessor('month', 'The month of the datetime')
+    day = _field_accessor('day', 'The days of the datetime')
+    hour = _field_accessor('hour', 'The hours of the datetime')
+    minute = _field_accessor('minute', 'The minutes of the datetime')
+    second = _field_accessor('second', 'The seconds of the datetime')
+    microsecond = _field_accessor('microsecond',
+                                  'The microseconds of the datetime')
+    date_type = property(get_date_type)
+
+    def __new__(cls, data):
+        result = object.__new__(cls)
+        assert_all_valid_date_type(data)
+        result._data = np.array(data)
+        return result
+
+    def _partial_date_slice(self, resolution, parsed):
+        """Adapted from
+        pandas.tseries.index.DatetimeIndex._partial_date_slice
+
+        Note that when using a CFTimeIndex, if a partial-date selection
+        returns a single element, it will never be converted to a scalar
+        coordinate; this is in slight contrast to the behavior when using
+        a DatetimeIndex, which sometimes will return a DataArray with a scalar
+        coordinate depending on the resolution of the datetimes used in
+        defining the index.  For example:
+
+        >>> from cftime import DatetimeNoLeap
+        >>> import pandas as pd
+        >>> import xarray as xr
+        >>> da = xr.DataArray([1, 2],
+                              coords=[[DatetimeNoLeap(2001, 1, 1),
+                                       DatetimeNoLeap(2001, 2, 1)]],
+                              dims=['time'])
+        >>> da.sel(time='2001-01-01')
+        <xarray.DataArray (time: 1)>
+        array([1])
+        Coordinates:
+          * time     (time) object 2001-01-01 00:00:00
+        >>> da = xr.DataArray([1, 2],
+                              coords=[[pd.Timestamp(2001, 1, 1),
+                                       pd.Timestamp(2001, 2, 1)]],
+                              dims=['time'])
+        >>> da.sel(time='2001-01-01')
+        <xarray.DataArray ()>
+        array(1)
+        Coordinates:
+            time     datetime64[ns] 2001-01-01
+        >>> da = xr.DataArray([1, 2],
+                              coords=[[pd.Timestamp(2001, 1, 1, 1),
+                                       pd.Timestamp(2001, 2, 1)]],
+                              dims=['time'])
+        >>> da.sel(time='2001-01-01')
+        <xarray.DataArray (time: 1)>
+        array([1])
+        Coordinates:
+          * time     (time) datetime64[ns] 2001-01-01T01:00:00
+        """
+        start, end = _parsed_string_to_bounds(self.date_type, resolution,
+                                              parsed)
+        lhs_mask = (self._data >= start)
+        rhs_mask = (self._data <= end)
+        return (lhs_mask & rhs_mask).nonzero()[0]
+
+    def _get_string_slice(self, key):
+        """Adapted from pandas.tseries.index.DatetimeIndex._get_string_slice"""
+        parsed, resolution = _parse_iso8601_with_reso(self.date_type, key)
+        loc = self._partial_date_slice(resolution, parsed)
+        return loc
+
+    def get_loc(self, key, method=None, tolerance=None):
+        """Adapted from pandas.tseries.index.DatetimeIndex.get_loc"""
+        if isinstance(key, pycompat.basestring):
+            return self._get_string_slice(key)
+        else:
+            return pd.Index.get_loc(self, key, method=method,
+                                    tolerance=tolerance)
+
+    def _maybe_cast_slice_bound(self, label, side, kind):
+        """Adapted from
+        pandas.tseries.index.DatetimeIndex._maybe_cast_slice_bound"""
+        if isinstance(label, pycompat.basestring):
+            parsed, resolution = _parse_iso8601_with_reso(self.date_type,
+                                                          label)
+            start, end = _parsed_string_to_bounds(self.date_type, resolution,
+                                                  parsed)
+            if self.is_monotonic_decreasing and len(self):
+                return end if side == 'left' else start
+            return start if side == 'left' else end
+        else:
+            return label
+
+    # TODO: Add ability to use integer range outside of iloc?
+    # e.g. series[1:5].
+    def get_value(self, series, key):
+        """Adapted from pandas.tseries.index.DatetimeIndex.get_value"""
+        if not isinstance(key, slice):
+            return series.iloc[self.get_loc(key)]
+        else:
+            return series.iloc[self.slice_indexer(
+                key.start, key.stop, key.step)]
+
+    def __contains__(self, key):
+        """Adapted from
+        pandas.tseries.base.DatetimeIndexOpsMixin.__contains__"""
+        try:
+            result = self.get_loc(key)
+            return (is_scalar(result) or type(result) == slice or
+                    (isinstance(result, np.ndarray) and result.size))
+        except (KeyError, TypeError, ValueError):
+            return False
+
+    def contains(self, key):
+        """Needed for .loc based partial-string indexing"""
+        return self.__contains__(key)