From b9663b24774cce16bec747bcd75a3dd509a5d00d Mon Sep 17 00:00:00 2001 From: jwenfai Date: Fri, 1 Mar 2019 20:41:44 -0500 Subject: [PATCH 01/17] Quarter offset implemented (base is now latest pydata-master). (#2721) * Quarter offset implemented (base is now latest pydata-master). * Fixed issues raised in review (https://github.com/pydata/xarray/pull/2721#pullrequestreview-199346642) * Updated whats-new.rst with info on quarter offset support. * Updated whats-new.rst with info on quarter offset support. * Update doc/whats-new.rst Co-Authored-By: jwenfai * Added support for quarter frequencies when resampling CFTimeIndex. Less redundancy in CFTimeIndex resampling tests. * Removed normalization code (unnecessary for cftime_range) in cftime_offsets.py. Removed redundant lines in whats-new.rst. * Removed invalid option from _get_day_of_month docstring. Added tests back in that raises ValueError when resampling (base=24 when resampling to daily freq, e.g., '8D'). * Minor edits to docstrings/comments * lint --- doc/whats-new.rst | 2 + xarray/coding/cftime_offsets.py | 369 ++++++++++++++++++---- xarray/core/resample_cftime.py | 8 +- xarray/tests/test_cftime_offsets.py | 221 +++++++++++-- xarray/tests/test_cftimeindex_resample.py | 62 +++- 5 files changed, 557 insertions(+), 105 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9ac671d5858..59683f690eb 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -68,6 +68,8 @@ Enhancements - :py:meth:`pandas.Series.dropna` is now supported for a :py:class:`pandas.Series` indexed by a :py:class:`~xarray.CFTimeIndex` (:issue:`2688`). By `Spencer Clark `_. +- :py:meth:`~xarray.cftime_range` now supports QuarterBegin and QuarterEnd offsets (:issue:`2663`). + By `Jwen Fai Low `_ - :py:meth:`~xarray.open_dataset` now accepts a ``use_cftime`` argument, which can be used to require that ``cftime.datetime`` objects are always used, or never used when decoding dates encoded with a standard calendar. 
This can be diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index 4b5770ac90a..a74c735224b 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -75,6 +75,7 @@ def get_date_type(calendar): class BaseCFTimeOffset(object): _freq = None # type: ClassVar[str] + _day_option = None def __init__(self, n=1): if not isinstance(n, int): @@ -151,6 +152,41 @@ def __str__(self): def __repr__(self): return str(self) + def _get_offset_day(self, other): + # subclass must implement `_day_option`; calling from the base class + # will raise NotImplementedError. + return _get_day_of_month(other, self._day_option) + + +def _get_day_of_month(other, day_option): + """Find the day in `other`'s month that satisfies a BaseCFTimeOffset's + onOffset policy, as described by the `day_option` argument. + + Parameters + ---------- + other : cftime.datetime + day_option : 'start', 'end' + 'start': returns 1 + 'end': returns last day of the month + + Returns + ------- + day_of_month : int + + """ + + if day_option == 'start': + return 1 + elif day_option == 'end': + days_in_month = _days_in_month(other) + return days_in_month + elif day_option is None: + # Note: unlike `_shift_month`, _get_day_of_month does not + # allow day_option = None + raise NotImplementedError + else: + raise ValueError(day_option) + def _days_in_month(date): """The number of days in the month of the given date""" @@ -186,7 +222,7 @@ def _adjust_n_years(other, n, month, reference_day): return n -def _shift_months(date, months, day_option='start'): +def _shift_month(date, months, day_option='start'): """Shift the date to a month start or end a given number of months away. 
""" delta_year = (date.month + months) // 12 @@ -211,12 +247,69 @@ def _shift_months(date, months, day_option='start'): return date.replace(year=year, month=month, day=day, dayofwk=-1) +def roll_qtrday(other, n, month, day_option, modby=3): + """Possibly increment or decrement the number of periods to shift + based on rollforward/rollbackward conventions. + + Parameters + ---------- + other : cftime.datetime + n : number of periods to increment, before adjusting for rolling + month : int reference month giving the first month of the year + day_option : 'start', 'end' + The convention to use in finding the day in a given month against + which to compare for rollforward/rollbackward decisions. + modby : int 3 for quarters, 12 for years + + Returns + ------- + n : int number of periods to increment + + See Also + -------- + _get_day_of_month : Find the day in a month provided an offset. + """ + + months_since = other.month % modby - month % modby + + if n > 0: + if months_since < 0 or ( + months_since == 0 and + other.day < _get_day_of_month(other, day_option)): + # pretend to roll back if on same month but + # before compare_day + n -= 1 + else: + if months_since > 0 or ( + months_since == 0 and + other.day > _get_day_of_month(other, day_option)): + # make sure to roll forward, so negate + n += 1 + return n + + +def _validate_month(month, default_month): + if month is None: + result_month = default_month + else: + result_month = month + if not isinstance(result_month, int): + raise TypeError("'self.month' must be an integer value between 1 " + "and 12. Instead, it was set to a value of " + "{!r}".format(result_month)) + elif not (1 <= result_month <= 12): + raise ValueError("'self.month' must be an integer value between 1 " + "and 12. 
Instead, it was set to a value of " + "{!r}".format(result_month)) + return result_month + + class MonthBegin(BaseCFTimeOffset): _freq = 'MS' def __apply__(self, other): n = _adjust_n_months(other.day, self.n, 1) - return _shift_months(other, n, 'start') + return _shift_month(other, n, 'start') def onOffset(self, date): """Check if the given date is in the set of possible dates created @@ -229,7 +322,7 @@ class MonthEnd(BaseCFTimeOffset): def __apply__(self, other): n = _adjust_n_months(other.day, self.n, _days_in_month(other)) - return _shift_months(other, n, 'end') + return _shift_month(other, n, 'end') def onOffset(self, date): """Check if the given date is in the set of possible dates created @@ -253,6 +346,105 @@ def onOffset(self, date): } +class QuarterOffset(BaseCFTimeOffset): + """Quarter representation copied off of pandas/tseries/offsets.py + """ + _freq = None # type: ClassVar[str] + _default_month = None # type: ClassVar[int] + + def __init__(self, n=1, month=None): + BaseCFTimeOffset.__init__(self, n) + self.month = _validate_month(month, self._default_month) + + def __apply__(self, other): + # months_since: find the calendar quarter containing other.month, + # e.g. if other.month == 8, the calendar quarter is [Jul, Aug, Sep]. + # Then find the month in that quarter containing an onOffset date for + # self. `months_since` is the number of months to shift other.month + # to get to this on-offset month. 
+ months_since = other.month % 3 - self.month % 3 + qtrs = roll_qtrday(other, self.n, self.month, + day_option=self._day_option, modby=3) + months = qtrs * 3 - months_since + return _shift_month(other, months, self._day_option) + + def onOffset(self, date): + """Check if the given date is in the set of possible dates created + using a length-one version of this offset class.""" + mod_month = (date.month - self.month) % 3 + return mod_month == 0 and date.day == self._get_offset_day(date) + + def __sub__(self, other): + import cftime + + if isinstance(other, cftime.datetime): + raise TypeError('Cannot subtract cftime.datetime from offset.') + elif type(other) == type(self) and other.month == self.month: + return type(self)(self.n - other.n, month=self.month) + else: + return NotImplemented + + def __mul__(self, other): + return type(self)(n=other * self.n, month=self.month) + + def rule_code(self): + return '{}-{}'.format(self._freq, _MONTH_ABBREVIATIONS[self.month]) + + def __str__(self): + return '<{}: n={}, month={}>'.format( + type(self).__name__, self.n, self.month) + + +class QuarterBegin(QuarterOffset): + # When converting a string to an offset, pandas converts + # 'QS' to a QuarterBegin offset starting in the month of + # January. When creating a QuarterBegin offset directly + # from the constructor, however, the default month is March. + # We follow that behavior here. + _default_month = 3 + _freq = 'QS' + _day_option = 'start' + + def rollforward(self, date): + """Roll date forward to nearest start of quarter""" + if self.onOffset(date): + return date + else: + return date + QuarterBegin(month=self.month) + + def rollback(self, date): + """Roll date backward to nearest start of quarter""" + if self.onOffset(date): + return date + else: + return date - QuarterBegin(month=self.month) + + +class QuarterEnd(QuarterOffset): + # When converting a string to an offset, pandas converts + # 'Q' to a QuarterEnd offset starting in the month of + # December. 
When creating a QuarterEnd offset directly + # from the constructor, however, the default month is March. + # We follow that behavior here. + _default_month = 3 + _freq = 'Q' + _day_option = 'end' + + def rollforward(self, date): + """Roll date forward to nearest end of quarter""" + if self.onOffset(date): + return date + else: + return date + QuarterEnd(month=self.month) + + def rollback(self, date): + """Roll date backward to nearest end of quarter""" + if self.onOffset(date): + return date + else: + return date - QuarterEnd(month=self.month) + + class YearOffset(BaseCFTimeOffset): _freq = None # type: ClassVar[str] _day_option = None # type: ClassVar[str] @@ -260,29 +452,13 @@ class YearOffset(BaseCFTimeOffset): def __init__(self, n=1, month=None): BaseCFTimeOffset.__init__(self, n) - if month is None: - self.month = self._default_month - else: - self.month = month - if not isinstance(self.month, int): - raise TypeError("'self.month' must be an integer value between 1 " - "and 12. Instead, it was set to a value of " - "{!r}".format(self.month)) - elif not (1 <= self.month <= 12): - raise ValueError("'self.month' must be an integer value between 1 " - "and 12. 
Instead, it was set to a value of " - "{!r}".format(self.month)) + self.month = _validate_month(month, self._default_month) def __apply__(self, other): - if self._day_option == 'start': - reference_day = 1 - elif self._day_option == 'end': - reference_day = _days_in_month(other) - else: - raise ValueError(self._day_option) + reference_day = _get_day_of_month(other, self._day_option) years = _adjust_n_years(other, self.n, self.month, reference_day) months = years * 12 + (self.month - other.month) - return _shift_months(other, months, self._day_option) + return _shift_month(other, months, self._day_option) def __sub__(self, other): import cftime @@ -400,6 +576,8 @@ def __apply__(self, other): 'AS': YearBegin, 'Y': YearEnd, 'YS': YearBegin, + 'Q': partial(QuarterEnd, month=12), + 'QS': partial(QuarterBegin, month=1), 'M': MonthEnd, 'MS': MonthBegin, 'D': Day, @@ -430,7 +608,31 @@ def __apply__(self, other): 'A-SEP': partial(YearEnd, month=9), 'A-OCT': partial(YearEnd, month=10), 'A-NOV': partial(YearEnd, month=11), - 'A-DEC': partial(YearEnd, month=12) + 'A-DEC': partial(YearEnd, month=12), + 'QS-JAN': partial(QuarterBegin, month=1), + 'QS-FEB': partial(QuarterBegin, month=2), + 'QS-MAR': partial(QuarterBegin, month=3), + 'QS-APR': partial(QuarterBegin, month=4), + 'QS-MAY': partial(QuarterBegin, month=5), + 'QS-JUN': partial(QuarterBegin, month=6), + 'QS-JUL': partial(QuarterBegin, month=7), + 'QS-AUG': partial(QuarterBegin, month=8), + 'QS-SEP': partial(QuarterBegin, month=9), + 'QS-OCT': partial(QuarterBegin, month=10), + 'QS-NOV': partial(QuarterBegin, month=11), + 'QS-DEC': partial(QuarterBegin, month=12), + 'Q-JAN': partial(QuarterEnd, month=1), + 'Q-FEB': partial(QuarterEnd, month=2), + 'Q-MAR': partial(QuarterEnd, month=3), + 'Q-APR': partial(QuarterEnd, month=4), + 'Q-MAY': partial(QuarterEnd, month=5), + 'Q-JUN': partial(QuarterEnd, month=6), + 'Q-JUL': partial(QuarterEnd, month=7), + 'Q-AUG': partial(QuarterEnd, month=8), + 'Q-SEP': partial(QuarterEnd, 
month=9), + 'Q-OCT': partial(QuarterEnd, month=10), + 'Q-NOV': partial(QuarterEnd, month=11), + 'Q-DEC': partial(QuarterEnd, month=12) } @@ -624,55 +826,84 @@ def cftime_range(start=None, end=None, periods=None, freq='D', Valid simple frequency strings for use with ``cftime``-calendars include any multiples of the following. - +--------+-----------------------+ - | Alias | Description | - +========+=======================+ - | A, Y | Year-end frequency | - +--------+-----------------------+ - | AS, YS | Year-start frequency | - +--------+-----------------------+ - | M | Month-end frequency | - +--------+-----------------------+ - | MS | Month-start frequency | - +--------+-----------------------+ - | D | Day frequency | - +--------+-----------------------+ - | H | Hour frequency | - +--------+-----------------------+ - | T, min | Minute frequency | - +--------+-----------------------+ - | S | Second frequency | - +--------+-----------------------+ + +--------+--------------------------+ + | Alias | Description | + +========+==========================+ + | A, Y | Year-end frequency | + +--------+--------------------------+ + | AS, YS | Year-start frequency | + +--------+--------------------------+ + | Q | Quarter-end frequency | + +--------+--------------------------+ + | QS | Quarter-start frequency | + +--------+--------------------------+ + | M | Month-end frequency | + +--------+--------------------------+ + | MS | Month-start frequency | + +--------+--------------------------+ + | D | Day frequency | + +--------+--------------------------+ + | H | Hour frequency | + +--------+--------------------------+ + | T, min | Minute frequency | + +--------+--------------------------+ + | S | Second frequency | + +--------+--------------------------+ Any multiples of the following anchored offsets are also supported. 
- +----------+-------------------------------------------------------------------+ - | Alias | Description | - +==========+===================================================================+ - | A(S)-JAN | Annual frequency, anchored at the end (or beginning) of January | - +----------+-------------------------------------------------------------------+ - | A(S)-FEB | Annual frequency, anchored at the end (or beginning) of February | - +----------+-------------------------------------------------------------------+ - | A(S)-MAR | Annual frequency, anchored at the end (or beginning) of March | - +----------+-------------------------------------------------------------------+ - | A(S)-APR | Annual frequency, anchored at the end (or beginning) of April | - +----------+-------------------------------------------------------------------+ - | A(S)-MAY | Annual frequency, anchored at the end (or beginning) of May | - +----------+-------------------------------------------------------------------+ - | A(S)-JUN | Annual frequency, anchored at the end (or beginning) of June | - +----------+-------------------------------------------------------------------+ - | A(S)-JUL | Annual frequency, anchored at the end (or beginning) of July | - +----------+-------------------------------------------------------------------+ - | A(S)-AUG | Annual frequency, anchored at the end (or beginning) of August | - +----------+-------------------------------------------------------------------+ - | A(S)-SEP | Annual frequency, anchored at the end (or beginning) of September | - +----------+-------------------------------------------------------------------+ - | A(S)-OCT | Annual frequency, anchored at the end (or beginning) of October | - +----------+-------------------------------------------------------------------+ - | A(S)-NOV | Annual frequency, anchored at the end (or beginning) of November | - +----------+-------------------------------------------------------------------+ - | A(S)-DEC | 
Annual frequency, anchored at the end (or beginning) of December | - +----------+-------------------------------------------------------------------+ + +----------+--------------------------------------------------------------------+ + | Alias | Description | + +==========+====================================================================+ + | A(S)-JAN | Annual frequency, anchored at the end (or beginning) of January | + +----------+--------------------------------------------------------------------+ + | A(S)-FEB | Annual frequency, anchored at the end (or beginning) of February | + +----------+--------------------------------------------------------------------+ + | A(S)-MAR | Annual frequency, anchored at the end (or beginning) of March | + +----------+--------------------------------------------------------------------+ + | A(S)-APR | Annual frequency, anchored at the end (or beginning) of April | + +----------+--------------------------------------------------------------------+ + | A(S)-MAY | Annual frequency, anchored at the end (or beginning) of May | + +----------+--------------------------------------------------------------------+ + | A(S)-JUN | Annual frequency, anchored at the end (or beginning) of June | + +----------+--------------------------------------------------------------------+ + | A(S)-JUL | Annual frequency, anchored at the end (or beginning) of July | + +----------+--------------------------------------------------------------------+ + | A(S)-AUG | Annual frequency, anchored at the end (or beginning) of August | + +----------+--------------------------------------------------------------------+ + | A(S)-SEP | Annual frequency, anchored at the end (or beginning) of September | + +----------+--------------------------------------------------------------------+ + | A(S)-OCT | Annual frequency, anchored at the end (or beginning) of October | + +----------+--------------------------------------------------------------------+ + | A(S)-NOV | 
Annual frequency, anchored at the end (or beginning) of November | + +----------+--------------------------------------------------------------------+ + | A(S)-DEC | Annual frequency, anchored at the end (or beginning) of December | + +----------+--------------------------------------------------------------------+ + | Q(S)-JAN | Quarter frequency, anchored at the end (or beginning) of January | + +----------+--------------------------------------------------------------------+ + | Q(S)-FEB | Quarter frequency, anchored at the end (or beginning) of February | + +----------+--------------------------------------------------------------------+ + | Q(S)-MAR | Quarter frequency, anchored at the end (or beginning) of March | + +----------+--------------------------------------------------------------------+ + | Q(S)-APR | Quarter frequency, anchored at the end (or beginning) of April | + +----------+--------------------------------------------------------------------+ + | Q(S)-MAY | Quarter frequency, anchored at the end (or beginning) of May | + +----------+--------------------------------------------------------------------+ + | Q(S)-JUN | Quarter frequency, anchored at the end (or beginning) of June | + +----------+--------------------------------------------------------------------+ + | Q(S)-JUL | Quarter frequency, anchored at the end (or beginning) of July | + +----------+--------------------------------------------------------------------+ + | Q(S)-AUG | Quarter frequency, anchored at the end (or beginning) of August | + +----------+--------------------------------------------------------------------+ + | Q(S)-SEP | Quarter frequency, anchored at the end (or beginning) of September | + +----------+--------------------------------------------------------------------+ + | Q(S)-OCT | Quarter frequency, anchored at the end (or beginning) of October | + +----------+--------------------------------------------------------------------+ + | Q(S)-NOV | Quarter frequency, 
anchored at the end (or beginning) of November | + +----------+--------------------------------------------------------------------+ + | Q(S)-DEC | Quarter frequency, anchored at the end (or beginning) of December | + +----------+--------------------------------------------------------------------+ + Finally, the following calendar aliases are supported. diff --git a/xarray/core/resample_cftime.py b/xarray/core/resample_cftime.py index 6b6d214768e..161945f118d 100644 --- a/xarray/core/resample_cftime.py +++ b/xarray/core/resample_cftime.py @@ -38,7 +38,7 @@ from ..coding.cftimeindex import CFTimeIndex from ..coding.cftime_offsets import (cftime_range, normalize_date, - Day, MonthEnd, YearEnd, + Day, MonthEnd, QuarterEnd, YearEnd, CFTIME_TICKS, to_offset) import datetime import numpy as np @@ -50,14 +50,14 @@ class CFTimeGrouper(object): single method, the only one required for resampling in xarray. It cannot be used in a call to groupby like a pandas.Grouper object can.""" - def __init__(self, freq, closed, label, base, loffset): + def __init__(self, freq, closed=None, label=None, base=0, loffset=None): self.freq = to_offset(freq) self.closed = closed self.label = label self.base = base self.loffset = loffset - if isinstance(self.freq, (MonthEnd, YearEnd)): + if isinstance(self.freq, (MonthEnd, QuarterEnd, YearEnd)): if self.closed is None: self.closed = 'right' if self.label is None: @@ -199,7 +199,7 @@ def _adjust_bin_edges(datetime_bins, offset, closed, index, labels): This is also required for daily frequencies longer than one day and year-end frequencies. 
""" - is_super_daily = (isinstance(offset, (MonthEnd, YearEnd)) or + is_super_daily = (isinstance(offset, (MonthEnd, QuarterEnd, YearEnd)) or (isinstance(offset, Day) and offset.n > 1)) if is_super_daily: if closed == 'right': diff --git a/xarray/tests/test_cftime_offsets.py b/xarray/tests/test_cftime_offsets.py index 29caa88cc53..1cf257c96eb 100644 --- a/xarray/tests/test_cftime_offsets.py +++ b/xarray/tests/test_cftime_offsets.py @@ -6,9 +6,9 @@ from xarray import CFTimeIndex from xarray.coding.cftime_offsets import ( - _MONTH_ABBREVIATIONS, BaseCFTimeOffset, Day, Hour, Minute, MonthBegin, - MonthEnd, Second, YearBegin, YearEnd, _days_in_month, cftime_range, - get_date_type, to_cftime_datetime, to_offset) + _MONTH_ABBREVIATIONS, BaseCFTimeOffset, Day, Hour, Minute, Second, + MonthBegin, MonthEnd, YearBegin, YearEnd, QuarterBegin, QuarterEnd, + _days_in_month, cftime_range, get_date_type, to_cftime_datetime, to_offset) cftime = pytest.importorskip('cftime') @@ -32,9 +32,13 @@ def calendar(request): [(BaseCFTimeOffset(), 1), (YearBegin(), 1), (YearEnd(), 1), + (QuarterBegin(), 1), + (QuarterEnd(), 1), (BaseCFTimeOffset(n=2), 2), (YearBegin(n=2), 2), - (YearEnd(n=2), 2)], + (YearEnd(n=2), 2), + (QuarterBegin(n=2), 2), + (QuarterEnd(n=2), 2)], ids=_id_func ) def test_cftime_offset_constructor_valid_n(offset, expected_n): @@ -45,7 +49,9 @@ def test_cftime_offset_constructor_valid_n(offset, expected_n): ('offset', 'invalid_n'), [(BaseCFTimeOffset, 1.5), (YearBegin, 1.5), - (YearEnd, 1.5)], + (YearEnd, 1.5), + (QuarterBegin, 1.5), + (QuarterEnd, 1.5)], ids=_id_func ) def test_cftime_offset_constructor_invalid_n(offset, invalid_n): @@ -58,7 +64,11 @@ def test_cftime_offset_constructor_invalid_n(offset, invalid_n): [(YearBegin(), 1), (YearEnd(), 12), (YearBegin(month=5), 5), - (YearEnd(month=5), 5)], + (YearEnd(month=5), 5), + (QuarterBegin(), 3), + (QuarterEnd(), 3), + (QuarterBegin(month=5), 5), + (QuarterEnd(month=5), 5)], ids=_id_func ) def 
test_year_offset_constructor_valid_month(offset, expected_month): @@ -72,7 +82,13 @@ def test_year_offset_constructor_valid_month(offset, expected_month): (YearBegin, 13, ValueError,), (YearEnd, 13, ValueError), (YearBegin, 1.5, TypeError), - (YearEnd, 1.5, TypeError)], + (YearEnd, 1.5, TypeError), + (QuarterBegin, 0, ValueError), + (QuarterEnd, 0, ValueError), + (QuarterBegin, 1.5, TypeError), + (QuarterEnd, 1.5, TypeError), + (QuarterBegin, 13, ValueError), + (QuarterEnd, 13, ValueError)], ids=_id_func ) def test_year_offset_constructor_invalid_month( @@ -85,7 +101,8 @@ def test_year_offset_constructor_invalid_month( ('offset', 'expected'), [(BaseCFTimeOffset(), None), (MonthBegin(), 'MS'), - (YearBegin(), 'AS-JAN')], + (YearBegin(), 'AS-JAN'), + (QuarterBegin(), 'QS-MAR')], ids=_id_func ) def test_rule_code(offset, expected): @@ -95,7 +112,8 @@ def test_rule_code(offset, expected): @pytest.mark.parametrize( ('offset', 'expected'), [(BaseCFTimeOffset(), ''), - (YearBegin(), '')], + (YearBegin(), ''), + (QuarterBegin(), '')], ids=_id_func ) def test_str_and_repr(offset, expected): @@ -105,7 +123,7 @@ def test_str_and_repr(offset, expected): @pytest.mark.parametrize( 'offset', - [BaseCFTimeOffset(), MonthBegin(), YearBegin()], + [BaseCFTimeOffset(), MonthBegin(), QuarterBegin(), YearBegin()], ids=_id_func ) def test_to_offset_offset_input(offset): @@ -164,7 +182,47 @@ def test_to_offset_annual(month_label, month_int, multiple, offset_str): assert result == expected -@pytest.mark.parametrize('freq', ['Z', '7min2', 'AM', 'M-', 'AS-', '1H1min']) +_QUARTER_OFFSET_TYPES = { + 'Q': QuarterEnd, + 'QS': QuarterBegin +} + + +@pytest.mark.parametrize(('month_int', 'month_label'), + list(_MONTH_ABBREVIATIONS.items()) + [(0, '')]) +@pytest.mark.parametrize('multiple', [None, 2]) +@pytest.mark.parametrize('offset_str', ['QS', 'Q']) +def test_to_offset_quarter(month_label, month_int, multiple, offset_str): + freq = offset_str + offset_type = _QUARTER_OFFSET_TYPES[offset_str] + 
if month_label: + freq = '-'.join([freq, month_label]) + if multiple: + freq = '{}'.format(multiple) + freq + result = to_offset(freq) + + if multiple and month_int: + expected = offset_type(n=multiple, month=month_int) + elif multiple: + if month_int: + expected = offset_type(n=multiple) + else: + if offset_type == QuarterBegin: + expected = offset_type(n=multiple, month=1) + elif offset_type == QuarterEnd: + expected = offset_type(n=multiple, month=12) + elif month_int: + expected = offset_type(month=month_int) + else: + if offset_type == QuarterBegin: + expected = offset_type(month=1) + elif offset_type == QuarterEnd: + expected = offset_type(month=12) + assert result == expected + + +@pytest.mark.parametrize('freq', ['Z', '7min2', 'AM', 'M-', 'AS-', 'QS-', + '1H1min']) def test_invalid_to_offset_str(freq): with pytest.raises(ValueError): to_offset(freq) @@ -197,13 +255,16 @@ def test_to_cftime_datetime_error_type_error(): _EQ_TESTS_A = [ BaseCFTimeOffset(), YearBegin(), YearEnd(), YearBegin(month=2), - YearEnd(month=2), MonthBegin(), MonthEnd(), Day(), Hour(), Minute(), + YearEnd(month=2), QuarterBegin(), QuarterEnd(), QuarterBegin(month=2), + QuarterEnd(month=2), MonthBegin(), MonthEnd(), Day(), Hour(), Minute(), Second() ] _EQ_TESTS_B = [ BaseCFTimeOffset(n=2), YearBegin(n=2), YearEnd(n=2), - YearBegin(n=2, month=2), YearEnd(n=2, month=2), MonthBegin(n=2), - MonthEnd(n=2), Day(n=2), Hour(n=2), Minute(n=2), Second(n=2) + YearBegin(n=2, month=2), YearEnd(n=2, month=2), QuarterBegin(n=2), + QuarterEnd(n=2), QuarterBegin(n=2, month=2), QuarterEnd(n=2, month=2), + MonthBegin(n=2), MonthEnd(n=2), Day(n=2), Hour(n=2), Minute(n=2), + Second(n=2) ] @@ -216,8 +277,10 @@ def test_neq(a, b): _EQ_TESTS_B_COPY = [ BaseCFTimeOffset(n=2), YearBegin(n=2), YearEnd(n=2), - YearBegin(n=2, month=2), YearEnd(n=2, month=2), MonthBegin(n=2), - MonthEnd(n=2), Day(n=2), Hour(n=2), Minute(n=2), Second(n=2) + YearBegin(n=2, month=2), YearEnd(n=2, month=2), QuarterBegin(n=2), + 
QuarterEnd(n=2), QuarterBegin(n=2, month=2), QuarterEnd(n=2, month=2), + MonthBegin(n=2), MonthEnd(n=2), Day(n=2), Hour(n=2), Minute(n=2), + Second(n=2) ] @@ -232,6 +295,8 @@ def test_eq(a, b): (BaseCFTimeOffset(), BaseCFTimeOffset(n=3)), (YearEnd(), YearEnd(n=3)), (YearBegin(), YearBegin(n=3)), + (QuarterEnd(), QuarterEnd(n=3)), + (QuarterBegin(), QuarterBegin(n=3)), (MonthEnd(), MonthEnd(n=3)), (MonthBegin(), MonthBegin(n=3)), (Day(), Day(n=3)), @@ -256,6 +321,8 @@ def test_rmul(offset, expected): [(BaseCFTimeOffset(), BaseCFTimeOffset(n=-1)), (YearEnd(), YearEnd(n=-1)), (YearBegin(), YearBegin(n=-1)), + (QuarterEnd(), QuarterEnd(n=-1)), + (QuarterBegin(), QuarterBegin(n=-1)), (MonthEnd(), MonthEnd(n=-1)), (MonthBegin(), MonthBegin(n=-1)), (Day(), Day(n=-1)), @@ -536,6 +603,89 @@ def test_add_year_end_onOffset( assert result == expected +@pytest.mark.parametrize( + ('initial_date_args', 'offset', 'expected_date_args'), + [((1, 1, 1), QuarterBegin(), (1, 3, 1)), + ((1, 1, 1), QuarterBegin(n=2), (1, 6, 1)), + ((1, 1, 1), QuarterBegin(month=2), (1, 2, 1)), + ((1, 1, 7), QuarterBegin(n=2), (1, 6, 1)), + ((2, 2, 1), QuarterBegin(n=-1), (1, 12, 1)), + ((1, 3, 2), QuarterBegin(n=-1), (1, 3, 1)), + ((1, 1, 1, 5, 5, 5, 5), QuarterBegin(), (1, 3, 1, 5, 5, 5, 5)), + ((2, 1, 1, 5, 5, 5, 5), QuarterBegin(n=-1), (1, 12, 1, 5, 5, 5, 5))], + ids=_id_func +) +def test_add_quarter_begin(calendar, initial_date_args, offset, + expected_date_args): + date_type = get_date_type(calendar) + initial = date_type(*initial_date_args) + result = initial + offset + expected = date_type(*expected_date_args) + assert result == expected + + +@pytest.mark.parametrize( + ('initial_date_args', 'offset', 'expected_year_month', + 'expected_sub_day'), + [((1, 1, 1), QuarterEnd(), (1, 3), ()), + ((1, 1, 1), QuarterEnd(n=2), (1, 6), ()), + ((1, 1, 1), QuarterEnd(month=1), (1, 1), ()), + ((2, 3, 1), QuarterEnd(n=-1), (1, 12), ()), + ((1, 3, 1), QuarterEnd(n=-1, month=2), (1, 2), ()), + ((1, 1, 1, 5, 5, 
5, 5), QuarterEnd(), (1, 3), (5, 5, 5, 5)), + ((1, 1, 1, 5, 5, 5, 5), QuarterEnd(n=2), (1, 6), (5, 5, 5, 5))], + ids=_id_func +) +def test_add_quarter_end( + calendar, initial_date_args, offset, expected_year_month, + expected_sub_day +): + date_type = get_date_type(calendar) + initial = date_type(*initial_date_args) + result = initial + offset + reference_args = expected_year_month + (1,) + reference = date_type(*reference_args) + + # Here the days at the end of each month varies based on the calendar used + expected_date_args = (expected_year_month + + (_days_in_month(reference),) + expected_sub_day) + expected = date_type(*expected_date_args) + assert result == expected + + +@pytest.mark.parametrize( + ('initial_year_month', 'initial_sub_day', 'offset', 'expected_year_month', + 'expected_sub_day'), + [((1, 12), (), QuarterEnd(), (2, 3), ()), + ((1, 12), (), QuarterEnd(n=2), (2, 6), ()), + ((1, 12), (), QuarterEnd(n=-1), (1, 9), ()), + ((1, 12), (), QuarterEnd(n=-2), (1, 6), ()), + ((1, 1), (), QuarterEnd(month=2), (1, 2), ()), + ((1, 12), (5, 5, 5, 5), QuarterEnd(), (2, 3), (5, 5, 5, 5)), + ((1, 12), (5, 5, 5, 5), QuarterEnd(n=-1), (1, 9), (5, 5, 5, 5))], + ids=_id_func +) +def test_add_quarter_end_onOffset( + calendar, initial_year_month, initial_sub_day, offset, expected_year_month, + expected_sub_day +): + date_type = get_date_type(calendar) + reference_args = initial_year_month + (1,) + reference = date_type(*reference_args) + initial_date_args = (initial_year_month + (_days_in_month(reference),) + + initial_sub_day) + initial = date_type(*initial_date_args) + result = initial + offset + reference_args = expected_year_month + (1,) + reference = date_type(*reference_args) + + # Here the days at the end of each month varies based on the calendar used + expected_date_args = (expected_year_month + + (_days_in_month(reference),) + expected_sub_day) + expected = date_type(*expected_date_args) + assert result == expected + + # Note for all sub-monthly offsets, 
pandas always returns True for onOffset @pytest.mark.parametrize( ('date_args', 'offset', 'expected'), @@ -543,6 +693,10 @@ def test_add_year_end_onOffset( ((1, 1, 1, 1), MonthBegin(), True), ((1, 1, 5), MonthBegin(), False), ((1, 1, 5), MonthEnd(), False), + ((1, 3, 1), QuarterBegin(), True), + ((1, 3, 1, 1), QuarterBegin(), True), + ((1, 3, 5), QuarterBegin(), False), + ((1, 12, 1), QuarterEnd(), False), ((1, 1, 1), YearBegin(), True), ((1, 1, 1, 1), YearBegin(), True), ((1, 1, 5), YearBegin(), False), @@ -565,16 +719,19 @@ def test_onOffset(calendar, date_args, offset, expected): ('year_month_args', 'sub_day_args', 'offset'), [((1, 1), (), MonthEnd()), ((1, 1), (1,), MonthEnd()), + ((1, 12), (), QuarterEnd()), + ((1, 1), (), QuarterEnd(month=1)), ((1, 12), (), YearEnd()), ((1, 1), (), YearEnd(month=1))], ids=_id_func ) -def test_onOffset_month_or_year_end( +def test_onOffset_month_or_quarter_or_year_end( calendar, year_month_args, sub_day_args, offset): date_type = get_date_type(calendar) reference_args = year_month_args + (1,) reference = date_type(*reference_args) - date_args = year_month_args + (_days_in_month(reference),) + sub_day_args + date_args = (year_month_args + (_days_in_month(reference),) + + sub_day_args) date = date_type(*date_args) result = offset.onOffset(date) assert result @@ -590,6 +747,14 @@ def test_onOffset_month_or_year_end( (YearEnd(n=2), (1, 3, 1), (1, 12)), (YearEnd(n=2, month=2), (1, 3, 1), (2, 2)), (YearEnd(n=2, month=4), (1, 4, 30), (1, 4)), + (QuarterBegin(), (1, 3, 2), (1, 6)), + (QuarterBegin(), (1, 4, 1), (1, 6)), + (QuarterBegin(n=2), (1, 4, 1), (1, 6)), + (QuarterBegin(n=2, month=2), (1, 4, 1), (1, 5)), + (QuarterEnd(), (1, 3, 1), (1, 3)), + (QuarterEnd(n=2), (1, 3, 1), (1, 3)), + (QuarterEnd(n=2, month=2), (1, 3, 1), (1, 5)), + (QuarterEnd(n=2, month=4), (1, 4, 30), (1, 4)), (MonthBegin(), (1, 3, 2), (1, 4)), (MonthBegin(), (1, 3, 1), (1, 3)), (MonthBegin(n=2), (1, 3, 2), (1, 4)), @@ -606,9 +771,9 @@ def 
test_rollforward(calendar, offset, initial_date_args, partial_expected_date_args): date_type = get_date_type(calendar) initial = date_type(*initial_date_args) - if isinstance(offset, (MonthBegin, YearBegin)): + if isinstance(offset, (MonthBegin, QuarterBegin, YearBegin)): expected_date_args = partial_expected_date_args + (1,) - elif isinstance(offset, (MonthEnd, YearEnd)): + elif isinstance(offset, (MonthEnd, QuarterEnd, YearEnd)): reference_args = partial_expected_date_args + (1,) reference = date_type(*reference_args) expected_date_args = (partial_expected_date_args + @@ -631,6 +796,14 @@ def test_rollforward(calendar, offset, initial_date_args, (YearEnd(n=2), (2, 3, 1), (1, 12)), (YearEnd(n=2, month=2), (2, 3, 1), (2, 2)), (YearEnd(month=4), (1, 4, 30), (1, 4)), + (QuarterBegin(), (1, 3, 2), (1, 3)), + (QuarterBegin(), (1, 4, 1), (1, 3)), + (QuarterBegin(n=2), (1, 4, 1), (1, 3)), + (QuarterBegin(n=2, month=2), (1, 4, 1), (1, 2)), + (QuarterEnd(), (2, 3, 1), (1, 12)), + (QuarterEnd(n=2), (2, 3, 1), (1, 12)), + (QuarterEnd(n=2, month=2), (2, 3, 1), (2, 2)), + (QuarterEnd(n=2, month=4), (1, 4, 30), (1, 4)), (MonthBegin(), (1, 3, 2), (1, 3)), (MonthBegin(n=2), (1, 3, 2), (1, 3)), (MonthBegin(), (1, 3, 1), (1, 3)), @@ -647,9 +820,9 @@ def test_rollback(calendar, offset, initial_date_args, partial_expected_date_args): date_type = get_date_type(calendar) initial = date_type(*initial_date_args) - if isinstance(offset, (MonthBegin, YearBegin)): + if isinstance(offset, (MonthBegin, QuarterBegin, YearBegin)): expected_date_args = partial_expected_date_args + (1,) - elif isinstance(offset, (MonthEnd, YearEnd)): + elif isinstance(offset, (MonthEnd, QuarterEnd, YearEnd)): reference_args = partial_expected_date_args + (1,) reference = date_type(*reference_args) expected_date_args = (partial_expected_date_args + @@ -687,7 +860,9 @@ def test_rollback(calendar, offset, initial_date_args, ('0010', None, 4, YearBegin(n=-2), None, False, [(10, 1, 1), (8, 1, 1), (6, 1, 1), (4, 1, 
1)]), ('0001-01-01', '0001-01-04', 4, None, None, False, - [(1, 1, 1), (1, 1, 2), (1, 1, 3), (1, 1, 4)]) + [(1, 1, 1), (1, 1, 2), (1, 1, 3), (1, 1, 4)]), + ('0001-06-01', None, 4, '3QS-JUN', None, False, + [(1, 6, 1), (2, 3, 1), (2, 12, 1), (3, 9, 1)]) ] diff --git a/xarray/tests/test_cftimeindex_resample.py b/xarray/tests/test_cftimeindex_resample.py index 0b56f1d1fc6..636f9ef7b0e 100644 --- a/xarray/tests/test_cftimeindex_resample.py +++ b/xarray/tests/test_cftimeindex_resample.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd import xarray as xr +from xarray.core.resample_cftime import CFTimeGrouper pytest.importorskip('cftime') pytest.importorskip('pandas', minversion='0.24') @@ -13,10 +14,10 @@ params=[ dict(start='2004-01-01T12:07:01', periods=91, freq='3D'), dict(start='1892-01-03T12:07:01', periods=15, freq='41987T'), - dict(start='2004-01-01T12:07:01', periods=31, freq='2MS'), + dict(start='2004-01-01T12:07:01', periods=7, freq='3Q-AUG'), dict(start='1892-01-03T12:07:01', periods=10, freq='3AS-JUN') ], - ids=['3D', '41987T', '2MS', '3AS_JUN'] + ids=['3D', '41987T', '3Q_AUG', '3AS_JUN'] ) def time_range_kwargs(request): return request.param @@ -40,15 +41,18 @@ def da(index): @pytest.mark.parametrize('freq', [ '700T', '8001T', '12H', '8001H', - '3D', '8D', '8001D', - '2MS', '2M', '3MS', '3M', '4MS', '4M', - '3AS', '3A', '4AS', '4A']) -@pytest.mark.parametrize('closed', [None, 'left', 'right']) -@pytest.mark.parametrize('label', [None, 'left', 'right']) -@pytest.mark.parametrize('base', [17, 24]) + '8D', '8001D', + '2MS', '3MS', + '2QS-AUG', '3QS-SEP', + '3AS-MAR', '4A-MAY']) +@pytest.mark.parametrize('closed', [None, 'right']) +@pytest.mark.parametrize('label', [None, 'right']) +@pytest.mark.parametrize('base', [24, 31]) def test_resampler(freq, closed, label, base, datetime_index, cftime_index): # Fairly extensive testing for standard/proleptic Gregorian calendar + # For any frequencies which are not greater-than-day and anchored + # at the end, the 
default values for closed and label are 'left'. loffset = '12H' try: da_datetime = da(datetime_index).resample( @@ -67,11 +71,51 @@ def test_resampler(freq, closed, label, base, xr.testing.assert_identical(da_cftime, da_datetime) +@pytest.mark.parametrize('freq', [ + '2M', '3M', + '2Q-JUN', '3Q-JUL', + '3A-FEB', '4A-APR']) +@pytest.mark.parametrize('closed', ['left', None]) +@pytest.mark.parametrize('label', ['left', None]) +@pytest.mark.parametrize('base', [17, 24]) +def test_resampler_end_super_day(freq, closed, label, base, + datetime_index, cftime_index): + # Fairly extensive testing for standard/proleptic Gregorian calendar. + # For greater-than-day frequencies anchored at the end, the default values + # for closed and label are 'right'. + loffset = '12H' + try: + da_datetime = da(datetime_index).resample( + time=freq, closed=closed, label=label, base=base, + loffset=loffset).mean() + except ValueError: + with pytest.raises(ValueError): + da(cftime_index).resample( + time=freq, closed=closed, label=label, base=base, + loffset=loffset).mean() + else: + da_cftime = da(cftime_index).resample(time=freq, closed=closed, + label=label, base=base, + loffset=loffset).mean() + da_cftime['time'] = da_cftime.indexes['time'].to_datetimeindex() + xr.testing.assert_identical(da_cftime, da_datetime) + + +@pytest.mark.parametrize( + ('freq', 'expected'), + [('S', 'left'), ('T', 'left'), ('H', 'left'), ('D', 'left'), + ('M', 'right'), ('MS', 'left'), ('Q', 'right'), ('QS', 'left'), + ('A', 'right'), ('AS', 'left')]) +def test_closed_label_defaults(freq, expected): + assert CFTimeGrouper(freq=freq).closed == expected + assert CFTimeGrouper(freq=freq).label == expected + + @pytest.mark.parametrize('calendar', ['gregorian', 'noleap', 'all_leap', '360_day', 'julian']) def test_calendars(calendar): # Limited testing for non-standard calendars - freq, closed, label, base = '81T', None, None, 17 + freq, closed, label, base = '8001T', None, None, 17 loffset = 
datetime.timedelta(hours=12) xr_index = xr.cftime_range(start='2004-01-01T12:07:01', periods=7, freq='3D', calendar=calendar) From ab268de1f672e61bf8c80205c26d760009b88861 Mon Sep 17 00:00:00 2001 From: Kevin Squire Date: Sun, 3 Mar 2019 11:39:39 -0800 Subject: [PATCH 02/17] Add `Dataset.drop_dims` (#2767) * ENH: Add Dataset.drop_dims() * Drops full dimensions and any corresponding variables in a Dataset * Fixes GH1949 * DOC: Add Dataset.drop_dims() documentation --- doc/api.rst | 1 + doc/data-structures.rst | 7 +++++++ doc/indexing.rst | 11 +++++++++-- doc/whats-new.rst | 7 +++++++ xarray/core/dataset.py | 31 +++++++++++++++++++++++++++++++ xarray/tests/test_dataset.py | 20 ++++++++++++++++++++ 6 files changed, 75 insertions(+), 2 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 552582a553f..00b33959eed 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -87,6 +87,7 @@ Dataset contents Dataset.swap_dims Dataset.expand_dims Dataset.drop + Dataset.drop_dims Dataset.set_coords Dataset.reset_coords diff --git a/doc/data-structures.rst b/doc/data-structures.rst index 618ccccff3e..a8887471ec7 100644 --- a/doc/data-structures.rst +++ b/doc/data-structures.rst @@ -408,6 +408,13 @@ operations keep around coordinates: list(ds[['x']]) list(ds.drop('temperature')) +To remove a dimension, you can use :py:meth:`~xarray.Dataset.drop_dims` method. +Any variables using that dimension are dropped: + +.. ipython:: python + + ds.drop_dims('time') + As an alternate to dictionary-like modifications, you can use :py:meth:`~xarray.Dataset.assign` and :py:meth:`~xarray.Dataset.assign_coords`. These methods return a new dataset with additional (or replaced) or values: diff --git a/doc/indexing.rst b/doc/indexing.rst index 77ec7428991..9af9db227bc 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -229,8 +229,8 @@ arrays). 
However, you can do normal indexing with dimension names: Using indexing to *assign* values to a subset of dataset (e.g., ``ds[dict(space=0)] = 1``) is not yet supported. -Dropping labels ---------------- +Dropping labels and dimensions +------------------------------ The :py:meth:`~xarray.Dataset.drop` method returns a new object with the listed index labels along a dimension dropped: @@ -241,6 +241,13 @@ index labels along a dimension dropped: ``drop`` is both a ``Dataset`` and ``DataArray`` method. +Use :py:meth:`~xarray.Dataset.drop_dims` to drop a full dimension from a Dataset. +Any variables with these dimensions are also dropped: + +.. ipython:: python + + ds.drop_dims('time') + .. _masking with where: diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 59683f690eb..b1c74aca740 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -79,6 +79,13 @@ Enhancements be outside the :py:class:`pandas.Timestamp`-valid range (:issue:`2754`). By `Spencer Clark `_. +- Allow ``expand_dims`` method to support inserting/broadcasting dimensions + with size > 1. (:issue:`2710`) + By `Martin Pletcher `_. + +- Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`). + By `Kevin Squire `_. + Bug fixes ~~~~~~~~~ diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 7bb085848ef..f3e6cac1c5b 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2797,6 +2797,37 @@ def _drop_vars(self, names): coord_names = set(k for k in self._coord_names if k in variables) return self._replace_vars_and_dims(variables, coord_names) + def drop_dims(self, drop_dims): + """Drop dimensions and associated variables from this dataset. + + Parameters + ---------- + drop_dims : str or list + Dimension or dimensions to drop. 
+ + Returns + ------- + obj : Dataset + The dataset without the given dimensions (or any variables + containing those dimensions) + """ + if utils.is_scalar(drop_dims): + drop_dims = [drop_dims] + + missing_dimensions = [d for d in drop_dims if d not in self.dims] + if missing_dimensions: + raise ValueError('Dataset does not contain the dimensions: %s' + % missing_dimensions) + + drop_vars = set(k for k, v in self._variables.items() + for d in v.dims if d in drop_dims) + + variables = OrderedDict((k, v) for k, v in self._variables.items() + if k not in drop_vars) + coord_names = set(k for k in self._coord_names if k in variables) + + return self._replace_with_new_dims(variables, coord_names) + def transpose(self, *dims): """Return a new Dataset object with all array dimensions transposed. diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index c26968b1db0..7063e217ac2 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1863,6 +1863,26 @@ def test_drop_index_labels(self): ValueError, 'does not have coordinate labels'): data.drop(1, 'y') + def test_drop_dims(self): + data = xr.Dataset({'A': (['x', 'y'], np.random.randn(2, 3)), + 'B': ('x', np.random.randn(2)), + 'x': ['a', 'b'], 'z': np.pi}) + + actual = data.drop_dims('x') + expected = data.drop(['A', 'B', 'x']) + assert_identical(expected, actual) + + actual = data.drop_dims('y') + expected = data.drop('A') + assert_identical(expected, actual) + + actual = data.drop_dims(['x', 'y']) + expected = data.drop(['A', 'B', 'x']) + assert_identical(expected, actual) + + with pytest.raises((ValueError, KeyError)): + data.drop_dims('z') # not a dimension + def test_copy(self): data = create_test_data() From b393754b2cab90737fdf2e6d467db3157ac6354b Mon Sep 17 00:00:00 2001 From: Tom Nicholas <35968931+TomNicholas@users.noreply.github.com> Date: Mon, 4 Mar 2019 05:39:20 +0000 Subject: [PATCH 03/17] Improve name concat (#2792) * Added tests of desired name inferring behaviour 
* Infers names * updated what's new --- doc/whats-new.rst | 5 +++++ xarray/core/combine.py | 6 +++++- xarray/tests/test_combine.py | 9 +++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b1c74aca740..3ebd4001ae5 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -114,6 +114,11 @@ Bug fixes - Fixed error when trying to reduce a DataArray using a function which does not require an axis argument. (:issue:`2768`) By `Tom Nicholas `_. +- Concatenating a sequence of :py:class:`~xarray.DataArray` with varying names + sets the name of the output array to ``None``, instead of the name of the + first input array. If the names are the same it sets the name to that, + instead of the name of the first DataArray in the list as it did before. + (:issue:`2775`). By `Tom Nicholas `_. - Per `CF conventions `_, diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 11961dff520..1abd14cd20b 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -9,6 +9,7 @@ from .merge import merge from .variable import IndexVariable, Variable, as_variable from .variable import concat as concat_vars +from .computation import result_name def concat(objs, dim=None, data_vars='all', coords='different', @@ -336,7 +337,10 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat, ds = _dataset_concat(datasets, dim, data_vars, coords, compat, positions) - return arrays[0]._from_temp_dataset(ds, name) + result = arrays[0]._from_temp_dataset(ds, name) + + result.name = result_name(arrays) + return result def _auto_concat(datasets, dim=None, data_vars='all', coords='different'): diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index c37abc98f07..0d03b6e0cdf 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -285,6 +285,15 @@ def test_concat_encoding(self): assert concat([foo, foo], dim="x").encoding == foo.encoding assert concat([ds, ds], dim="x").encoding
== ds.encoding + @pytest.mark.parametrize("colors, expected_name", + [(['blue', 'green', 'red'], None), + (['red', 'red', 'red'], 'red')]) + def test_concat_determine_name(self, colors, expected_name): + das = [DataArray(np.random.random((2, 2)), dims=['x', 'y'], name=k) + for k in colors] + result = concat(das, dim="band") + assert result.name is expected_name + @requires_dask def test_concat_lazy(self): import dask.array as da From 872b49c3fed2c213849ee80df176c9b573db2a6d Mon Sep 17 00:00:00 2001 From: TimoRoth Date: Wed, 6 Mar 2019 00:00:28 +0100 Subject: [PATCH 04/17] Don't use deprecated np.asscalar() (#2800) It got deprecated in numpy 1.16 and throws a ton of warnings due to that. All the function does is returning .item() anyway, which is why it got deprecated. --- xarray/convert.py | 2 +- xarray/core/utils.py | 2 +- xarray/tests/test_dataarray.py | 2 +- xarray/tests/test_dataset.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/convert.py b/xarray/convert.py index efcdd079a9f..b8c0c2a7eca 100644 --- a/xarray/convert.py +++ b/xarray/convert.py @@ -247,7 +247,7 @@ def from_iris(cube): if coord_dims: coords[_name(coord)] = (coord_dims, coord.points, coord_attrs) else: - coords[_name(coord)] = ((), np.asscalar(coord.points), coord_attrs) + coords[_name(coord)] = ((), coord.points.item(), coord_attrs) array_attrs = _iris_obj_to_attrs(cube) cell_methods = _iris_cell_methods_to_str(cube.cell_methods) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 053a45f01cb..fd1330a4e1f 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -550,7 +550,7 @@ def decode_numpy_dict_values(attrs): if isinstance(v, np.ndarray): attrs[k] = v.tolist() elif isinstance(v, np.generic): - attrs[k] = np.asscalar(v) + attrs[k] = v.item() return attrs diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 09c0f003888..ab05f19dbbe 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ 
-2963,7 +2963,7 @@ def test_to_dict_with_numpy_attrs(self): 'maintainer': 'bar'} da = DataArray(x, {'t': t, 'lat': lat}, dims=['t', 'lat'], attrs=attrs) - expected_attrs = {'created': np.asscalar(attrs['created']), + expected_attrs = {'created': attrs['created'].item(), 'coords': attrs['coords'].tolist(), 'maintainer': 'bar'} actual = da.to_dict() diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 7063e217ac2..d130363a7c0 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3152,7 +3152,7 @@ def test_to_dict_with_numpy_attrs(self): ds = Dataset(OrderedDict([('a', ('t', x, attrs)), ('b', ('t', y, attrs)), ('t', ('t', t))])) - expected_attrs = {'created': np.asscalar(attrs['created']), + expected_attrs = {'created': attrs['created'].item(), 'coords': attrs['coords'].tolist(), 'maintainer': 'bar'} actual = ds.to_dict() From 849eb186f93b1e1da1ecca67329ed7bf190e7d4c Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Wed, 6 Mar 2019 14:47:47 -0500 Subject: [PATCH 05/17] Add support for cftime.datetime coordinates with coarsen (#2778) --- doc/whats-new.rst | 4 ++++ xarray/core/common.py | 13 +++++++++---- xarray/core/duck_array_ops.py | 26 +++++++++++++++++++++++--- xarray/tests/test_dataset.py | 11 ++++++++++- xarray/tests/test_duck_array_ops.py | 25 +++++++++++++++++++++++++ 5 files changed, 71 insertions(+), 8 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 3ebd4001ae5..9ef2960ad76 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -45,6 +45,10 @@ Enhancements See :ref:`comput.coarsen` for details. (:issue:`2525`) By `Keisuke Fujii `_. +- Taking the mean of arrays of :py:class:`cftime.datetime` objects, and + by extension, use of :py:meth:`~xarray.DataArray.coarsen` with + :py:class:`cftime.datetime` coordinates is now possible. By `Spencer Clark + `_. 
- Upsampling an array via interpolation with resample is now dask-compatible, as long as the array is not chunked along the resampling dimension. By `Spencer Clark `_. diff --git a/xarray/core/common.py b/xarray/core/common.py index 2f32ca941be..6ec07156160 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -997,15 +997,15 @@ def is_np_datetime_like(dtype): np.issubdtype(dtype, np.timedelta64)) -def contains_cftime_datetimes(var): - """Check if a variable contains cftime datetime objects""" +def _contains_cftime_datetimes(array): + """Check if an array contains cftime.datetime objects""" try: from cftime import datetime as cftime_datetime except ImportError: return False else: - if var.dtype == np.dtype('O') and var.data.size > 0: - sample = var.data.ravel()[0] + if array.dtype == np.dtype('O') and array.size > 0: + sample = array.ravel()[0] if isinstance(sample, dask_array_type): sample = sample.compute() if isinstance(sample, np.ndarray): @@ -1015,6 +1015,11 @@ def contains_cftime_datetimes(var): return False +def contains_cftime_datetimes(var): + """Check if an xarray.Variable contains cftime.datetime objects""" + return _contains_cftime_datetimes(var.data) + + def _contains_datetime_like_objects(var): """Check if a variable contains datetime like objects (either np.datetime64, np.timedelta64, or cftime.datetime)""" diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 4d6d716a164..b67a220ed4c 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -294,7 +294,7 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float): Parameters ---------- - da : array + da : np.array Input data offset: Scalar with the same type of array or None If None, subtract minimum values to reduce round off error @@ -306,6 +306,7 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float): ------- array """ + # TODO: make this function dask-compatible? 
if offset is None: offset = array.min() array = array - offset @@ -326,15 +327,34 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float): return array.astype(dtype) +def _to_pytimedelta(array, unit='us'): + index = pd.TimedeltaIndex(array.ravel(), unit=unit) + return index.to_pytimedelta().reshape(array.shape) + + def mean(array, axis=None, skipna=None, **kwargs): - """ inhouse mean that can handle datatime dtype """ + """inhouse mean that can handle np.datetime64 or cftime.datetime + dtypes""" + from .common import _contains_cftime_datetimes + array = asarray(array) if array.dtype.kind in 'Mm': offset = min(array) - # xarray always uses datetime[ns] for datetime + # xarray always uses np.datetime64[ns] for np.datetime64 data dtype = 'timedelta64[ns]' return _mean(datetime_to_numeric(array, offset), axis=axis, skipna=skipna, **kwargs).astype(dtype) + offset + elif _contains_cftime_datetimes(array): + if isinstance(array, dask_array_type): + raise NotImplementedError( + 'Computing the mean of an array containing ' + 'cftime.datetime objects is not yet implemented on ' + 'dask arrays.') + offset = min(array) + timedeltas = datetime_to_numeric(array, offset, datetime_unit='us') + mean_timedeltas = _mean(timedeltas, axis=axis, skipna=skipna, + **kwargs) + return _to_pytimedelta(mean_timedeltas, unit='us') + offset else: return _mean(array, axis=axis, skipna=skipna, **kwargs) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index d130363a7c0..8e8c6c4b419 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -23,7 +23,7 @@ InaccessibleArray, UnexpectedDataAccess, assert_allclose, assert_array_equal, assert_equal, assert_identical, has_cftime, has_dask, raises_regex, requires_bottleneck, requires_dask, requires_scipy, - source_ndarray) + source_ndarray, requires_cftime) try: import dask.array as da @@ -4530,6 +4530,15 @@ def test_coarsen_coords(ds, dask): actual = da.coarsen(time=2).mean() 
+@requires_cftime +def test_coarsen_coords_cftime(): + times = xr.cftime_range('2000', periods=6) + da = xr.DataArray(range(6), [('time', times)]) + actual = da.coarsen(time=3).mean() + expected_times = xr.cftime_range('2000-01-02', freq='3D', periods=2) + np.testing.assert_array_equal(actual.time, expected_times) + + def test_rolling_properties(ds): # catching invalid args with pytest.raises(ValueError) as exception: diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index ab3cafed449..5d425f648bd 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -270,6 +270,31 @@ def test_datetime_reduce(dask): assert da['time'][0].mean() == da['time'][:1].mean() +@requires_cftime +def test_cftime_datetime_mean(): + times = cftime_range('2000', periods=4) + da = DataArray(times, dims=['time']) + + assert da.isel(time=0).mean() == da.isel(time=0) + + expected = DataArray(times.date_type(2000, 1, 2, 12)) + result = da.mean() + assert_equal(result, expected) + + da_2d = DataArray(times.values.reshape(2, 2)) + result = da_2d.mean() + assert_equal(result, expected) + + +@requires_cftime +@requires_dask +def test_cftime_datetime_mean_dask_error(): + times = cftime_range('2000', periods=4) + da = DataArray(times, dims=['time']).chunk() + with pytest.raises(NotImplementedError): + da.mean() + + @pytest.mark.parametrize('dim_num', [1, 2]) @pytest.mark.parametrize('dtype', [float, int, np.float32, np.bool_]) @pytest.mark.parametrize('dask', [False, True]) From 54883bafbfbfbfb96d8215c0270d4e162adb2092 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 12 Mar 2019 09:01:17 -0600 Subject: [PATCH 06/17] some docs updates (#2746) * Friendlier io title. * Fix lists. * Fix *args, **kwargs "inline emphasis..." * misc * Reference xarray_extras for csv writing. Closes #2289 * Add metpy accessor. Closes #461 * fix transpose docstring. Closes #2576 * Revert "Fix lists." 
This reverts commit 39983a5835612d7158ae91a9cce7196a03742983. * Revert "Fix *args, **kwargs" This reverts commit 1b9da35ef43e44ce7855f2ab8406a781c9a68933. * Add MetPy to related projects. * Add Weather and Climate specific page. * Add hvplot. * Note open_dataset, mfdataset open files as read-only (closes #2345). * Update metpy 1 Co-Authored-By: dcherian * Update doc/weather-climate.rst Co-Authored-By: dcherian --- doc/index.rst | 2 + doc/io.rst | 13 ++-- doc/plotting.rst | 4 + doc/related-projects.rst | 1 + doc/time-series.rst | 137 --------------------------------- doc/weather-climate.rst | 160 +++++++++++++++++++++++++++++++++++++++ doc/whats-new.rst | 2 +- xarray/backends/api.py | 14 ++++ xarray/core/dataarray.py | 13 ++-- xarray/core/dataset.py | 5 +- xarray/core/variable.py | 4 +- 11 files changed, 202 insertions(+), 153 deletions(-) create mode 100644 doc/weather-climate.rst diff --git a/doc/index.rst b/doc/index.rst index dbe911011cd..1d3bb110ddb 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -52,6 +52,7 @@ Documentation * :doc:`reshaping` * :doc:`combining` * :doc:`time-series` +* :doc:`weather-climate` * :doc:`pandas` * :doc:`io` * :doc:`dask` @@ -70,6 +71,7 @@ Documentation reshaping combining time-series + weather-climate pandas io dask diff --git a/doc/io.rst b/doc/io.rst index 0dc5181f9b8..51c747189da 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -1,11 +1,11 @@ .. _io: -Serialization and IO -==================== +Reading and writing files +========================= xarray supports direct serialization and IO to several file formats, from simple :ref:`io.pickle` files to the more flexible :ref:`io.netcdf` -format. +format (recommended). .. ipython:: python :suppress: @@ -739,11 +739,14 @@ options are listed on the PseudoNetCDF page. .. 
_PseudoNetCDF: http://github.com/barronh/PseudoNetCDF -Formats supported by Pandas ---------------------------- +CSV and other formats supported by Pandas +----------------------------------------- For more options (tabular formats and CSV files in particular), consider exporting your objects to pandas and using its broad range of `IO tools`_. +For CSV files, one might also consider `xarray_extras`_. + +.. _xarray_extras: https://xarray-extras.readthedocs.io/en/latest/api/csv.html .. _IO tools: http://pandas.pydata.org/pandas-docs/stable/io.html diff --git a/doc/plotting.rst b/doc/plotting.rst index a705c683594..c8f568e516f 100644 --- a/doc/plotting.rst +++ b/doc/plotting.rst @@ -39,6 +39,10 @@ For more extensive plotting applications consider the following projects: data structures for building even complex visualizations easily." Includes native support for xarray objects. +- `hvplot `_: ``hvplot`` makes it very easy to produce + dynamic plots (backed by ``Holoviews`` or ``Geoviews``) by adding a ``hvplot`` + accessor to DataArrays. + - `Cartopy `_: Provides cartographic tools. diff --git a/doc/related-projects.rst b/doc/related-projects.rst index c89e324ff7c..e899022e5d4 100644 --- a/doc/related-projects.rst +++ b/doc/related-projects.rst @@ -13,6 +13,7 @@ Geosciences - `aospy `_: Automated analysis and management of gridded climate data. - `infinite-diff `_: xarray-based finite-differencing, focused on gridded climate/meterology data - `marc_analysis `_: Analysis package for CESM/MARC experiments and output. +- `MetPy `_: A collection of tools in Python for reading, visualizing, and performing calculations with weather data. - `MPAS-Analysis `_: Analysis for simulations produced with Model for Prediction Across Scales (MPAS) components and the Accelerated Climate Model for Energy (ACME). 
- `OGGM `_: Open Global Glacier Model - `Oocgcm `_: Analysis of large gridded geophysical datasets diff --git a/doc/time-series.rst b/doc/time-series.rst index 3249dad2ec6..53efcd45ba2 100644 --- a/doc/time-series.rst +++ b/doc/time-series.rst @@ -212,140 +212,3 @@ Data that has indices outside of the given ``tolerance`` are set to ``NaN``. For more examples of using grouped operations on a time dimension, see :ref:`toy weather data`. - - -.. _CFTimeIndex: - -Non-standard calendars and dates outside the Timestamp-valid range ------------------------------------------------------------------- - -Through the standalone ``cftime`` library and a custom subclass of -:py:class:`pandas.Index`, xarray supports a subset of the indexing -functionality enabled through the standard :py:class:`pandas.DatetimeIndex` for -dates from non-standard calendars commonly used in climate science or dates -using a standard calendar, but outside the `Timestamp-valid range`_ -(approximately between years 1678 and 2262). - -.. note:: - - As of xarray version 0.11, by default, :py:class:`cftime.datetime` objects - will be used to represent times (either in indexes, as a - :py:class:`~xarray.CFTimeIndex`, or in data arrays with dtype object) if - any of the following are true: - - - The dates are from a non-standard calendar - - Any dates are outside the Timestamp-valid range. - - Otherwise pandas-compatible dates from a standard calendar will be - represented with the ``np.datetime64[ns]`` data type, enabling the use of a - :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[ns]`` - and their full set of associated features. - -For example, you can create a DataArray indexed by a time -coordinate with dates from a no-leap calendar and a -:py:class:`~xarray.CFTimeIndex` will automatically be used: - -.. 
ipython:: python - - from itertools import product - from cftime import DatetimeNoLeap - dates = [DatetimeNoLeap(year, month, 1) for year, month in - product(range(1, 3), range(1, 13))] - da = xr.DataArray(np.arange(24), coords=[dates], dims=['time'], name='foo') - -xarray also includes a :py:func:`~xarray.cftime_range` function, which enables -creating a :py:class:`~xarray.CFTimeIndex` with regularly-spaced dates. For -instance, we can create the same dates and DataArray we created above using: - -.. ipython:: python - - dates = xr.cftime_range(start='0001', periods=24, freq='MS', calendar='noleap') - da = xr.DataArray(np.arange(24), coords=[dates], dims=['time'], name='foo') - -For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports: - -- `Partial datetime string indexing`_ using strictly `ISO 8601-format`_ partial - datetime strings: - -.. ipython:: python - - da.sel(time='0001') - da.sel(time=slice('0001-05', '0002-02')) - -- Access of basic datetime components via the ``dt`` accessor (in this case - just "year", "month", "day", "hour", "minute", "second", "microsecond", - "season", "dayofyear", and "dayofweek"): - -.. ipython:: python - - da.time.dt.year - da.time.dt.month - da.time.dt.season - da.time.dt.dayofyear - da.time.dt.dayofweek - -- Group-by operations based on datetime accessor attributes (e.g. by month of - the year): - -.. ipython:: python - - da.groupby('time.month').sum() - -- Interpolation using :py:class:`cftime.datetime` objects: - -.. ipython:: python - - da.interp(time=[DatetimeNoLeap(1, 1, 15), DatetimeNoLeap(1, 2, 15)]) - -- Interpolation using datetime strings: - -.. ipython:: python - - da.interp(time=['0001-01-15', '0001-02-15']) - -- Differentiation: - -.. ipython:: python - - da.differentiate('time') - -- Serialization: - -.. 
ipython:: python - - da.to_netcdf('example-no-leap.nc') - xr.open_dataset('example-no-leap.nc') - -- And resampling along the time dimension for data indexed by a :py:class:`~xarray.CFTimeIndex`: - -.. ipython:: python - - da.resample(time='81T', closed='right', label='right', base=3).mean() - -.. note:: - - - For some use-cases it may still be useful to convert from - a :py:class:`~xarray.CFTimeIndex` to a :py:class:`pandas.DatetimeIndex`, - despite the difference in calendar types. The recommended way of doing this - is to use the built-in :py:meth:`~xarray.CFTimeIndex.to_datetimeindex` - method: - - .. ipython:: python - :okwarning: - - modern_times = xr.cftime_range('2000', periods=24, freq='MS', calendar='noleap') - da = xr.DataArray(range(24), [('time', modern_times)]) - da - datetimeindex = da.indexes['time'].to_datetimeindex() - da['time'] = datetimeindex - - However in this case one should use caution to only perform operations which - do not depend on differences between dates (e.g. differentiation, - interpolation, or upsampling with resample), as these could introduce subtle - and silent errors due to the difference in calendar types between the dates - encoded in your data and the dates stored in memory. - -.. _Timestamp-valid range: https://pandas.pydata.org/pandas-docs/stable/timeseries.html#timestamp-limitations -.. _ISO 8601-format: https://en.wikipedia.org/wiki/ISO_8601 -.. _partial datetime string indexing: https://pandas.pydata.org/pandas-docs/stable/timeseries.html#partial-string-indexing diff --git a/doc/weather-climate.rst b/doc/weather-climate.rst new file mode 100644 index 00000000000..1950ba62ffb --- /dev/null +++ b/doc/weather-climate.rst @@ -0,0 +1,160 @@ +.. _weather-climate: + +Weather and climate data +======================== + +.. ipython:: python + :suppress: + + import xarray as xr + +``xarray`` can leverage metadata that follows the `Climate and Forecast (CF) conventions`_ if present. 
Examples include automatic labelling of plots with descriptive names and units if proper metadata is present (see :ref:`plotting`) and support for non-standard calendars used in climate science through the ``cftime`` module (see :ref:`CFTimeIndex`). There are also a number of geosciences-focused projects that build on xarray (see :ref:`related-projects`). + +.. _Climate and Forecast (CF) conventions: http://cfconventions.org + +.. _metpy_accessor: + +CF-compliant coordinate variables +--------------------------------- + +`MetPy`_ adds a ``metpy`` accessor that allows accessing coordinates with appropriate CF metadata using generic names ``x``, ``y``, ``vertical`` and ``time``. There is also a `cartopy_crs` attribute that provides projection information, parsed from the appropriate CF metadata, as a `Cartopy`_ projection object. See `their documentation`_ for more information. + +.. _`MetPy`: https://unidata.github.io/MetPy/dev/index.html +.. _`their documentation`: https://unidata.github.io/MetPy/dev/tutorials/xarray_tutorial.html#coordinates +.. _`Cartopy`: https://scitools.org.uk/cartopy/docs/latest/crs/projections.html + +.. _CFTimeIndex: + +Non-standard calendars and dates outside the Timestamp-valid range +------------------------------------------------------------------ + +Through the standalone ``cftime`` library and a custom subclass of +:py:class:`pandas.Index`, xarray supports a subset of the indexing +functionality enabled through the standard :py:class:`pandas.DatetimeIndex` for +dates from non-standard calendars commonly used in climate science or dates +using a standard calendar, but outside the `Timestamp-valid range`_ +(approximately between years 1678 and 2262). + +.. 
note:: + + As of xarray version 0.11, by default, :py:class:`cftime.datetime` objects + will be used to represent times (either in indexes, as a + :py:class:`~xarray.CFTimeIndex`, or in data arrays with dtype object) if + any of the following are true: + + - The dates are from a non-standard calendar + - Any dates are outside the Timestamp-valid range. + + Otherwise pandas-compatible dates from a standard calendar will be + represented with the ``np.datetime64[ns]`` data type, enabling the use of a + :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[ns]`` + and their full set of associated features. + +For example, you can create a DataArray indexed by a time +coordinate with dates from a no-leap calendar and a +:py:class:`~xarray.CFTimeIndex` will automatically be used: + +.. ipython:: python + + from itertools import product + from cftime import DatetimeNoLeap + dates = [DatetimeNoLeap(year, month, 1) for year, month in + product(range(1, 3), range(1, 13))] + da = xr.DataArray(np.arange(24), coords=[dates], dims=['time'], name='foo') + +xarray also includes a :py:func:`~xarray.cftime_range` function, which enables +creating a :py:class:`~xarray.CFTimeIndex` with regularly-spaced dates. For +instance, we can create the same dates and DataArray we created above using: + +.. ipython:: python + + dates = xr.cftime_range(start='0001', periods=24, freq='MS', calendar='noleap') + da = xr.DataArray(np.arange(24), coords=[dates], dims=['time'], name='foo') + +For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports: + +- `Partial datetime string indexing`_ using strictly `ISO 8601-format`_ partial + datetime strings: + +.. ipython:: python + + da.sel(time='0001') + da.sel(time=slice('0001-05', '0002-02')) + +- Access of basic datetime components via the ``dt`` accessor (in this case + just "year", "month", "day", "hour", "minute", "second", "microsecond", + "season", "dayofyear", and "dayofweek"): + +.. 
ipython:: python + + da.time.dt.year + da.time.dt.month + da.time.dt.season + da.time.dt.dayofyear + da.time.dt.dayofweek + +- Group-by operations based on datetime accessor attributes (e.g. by month of + the year): + +.. ipython:: python + + da.groupby('time.month').sum() + +- Interpolation using :py:class:`cftime.datetime` objects: + +.. ipython:: python + + da.interp(time=[DatetimeNoLeap(1, 1, 15), DatetimeNoLeap(1, 2, 15)]) + +- Interpolation using datetime strings: + +.. ipython:: python + + da.interp(time=['0001-01-15', '0001-02-15']) + +- Differentiation: + +.. ipython:: python + + da.differentiate('time') + +- Serialization: + +.. ipython:: python + + da.to_netcdf('example-no-leap.nc') + xr.open_dataset('example-no-leap.nc') + +- And resampling along the time dimension for data indexed by a :py:class:`~xarray.CFTimeIndex`: + +.. ipython:: python + + da.resample(time='81T', closed='right', label='right', base=3).mean() + +.. note:: + + + For some use-cases it may still be useful to convert from + a :py:class:`~xarray.CFTimeIndex` to a :py:class:`pandas.DatetimeIndex`, + despite the difference in calendar types. The recommended way of doing this + is to use the built-in :py:meth:`~xarray.CFTimeIndex.to_datetimeindex` + method: + + .. ipython:: python + :okwarning: + + modern_times = xr.cftime_range('2000', periods=24, freq='MS', calendar='noleap') + da = xr.DataArray(range(24), [('time', modern_times)]) + da + datetimeindex = da.indexes['time'].to_datetimeindex() + da['time'] = datetimeindex + + However in this case one should use caution to only perform operations which + do not depend on differences between dates (e.g. differentiation, + interpolation, or upsampling with resample), as these could introduce subtle + and silent errors due to the difference in calendar types between the dates + encoded in your data and the dates stored in memory. + +.. 
_Timestamp-valid range: https://pandas.pydata.org/pandas-docs/stable/timeseries.html#timestamp-limitations +.. _ISO 8601-format: https://en.wikipedia.org/wiki/ISO_8601 +.. _partial datetime string indexing: https://pandas.pydata.org/pandas-docs/stable/timeseries.html#partial-string-indexing diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9ef2960ad76..b1363897094 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -100,7 +100,7 @@ Bug fixes from higher frequencies to lower frequencies. Datapoints outside the bounds of the original time coordinate are now filled with NaN (:issue:`2197`). By `Spencer Clark `_. -- Line plots with the `x` argument set to a non-dimensional coord now plot the correct data for 1D DataArrays. +- Line plots with the ``x`` argument set to a non-dimensional coord now plot the correct data for 1D DataArrays. (:issue:`27251). By `Tom Nicholas `_. - Subtracting a scalar ``cftime.datetime`` object from a :py:class:`CFTimeIndex` now results in a :py:class:`pandas.TimedeltaIndex` diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 61efcfdedf2..36baa9071c0 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -247,6 +247,13 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True, dataset : Dataset The newly created dataset. + Notes + ----- + ``open_dataset`` opens the file with read-only access. When you modify + values of a Dataset, even one linked to files on disk, only the in-memory + copy you are manipulating in xarray is modified: the original file on disk + is never touched. + See Also -------- open_mfdataset @@ -597,6 +604,13 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, ------- xarray.Dataset + Notes + ----- + ``open_mfdataset`` opens files with read-only access. When you modify values + of a Dataset, even one linked to files on disk, only the in-memory copy you + are manipulating in xarray is modified: the original file on disk is never + touched. 
+ See Also -------- auto_combine diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 96b42f19555..e7e12ae3da4 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1385,8 +1385,9 @@ def transpose(self, *dims): Notes ----- - Although this operation returns a view of this array's data, it is - not lazy -- the data will be fully loaded. + This operation returns a view of this array's data. It is + lazy for dask-backed DataArrays but not for numpy-backed DataArrays + -- the data will be fully loaded. See Also -------- @@ -2437,10 +2438,10 @@ def integrate(self, dim, datetime_unit=None): ---------- dim: str, or a sequence of str Coordinate(s) used for the integration. - datetime_unit - Can be specify the unit if datetime coordinate is used. One of - {'Y', 'M', 'W', 'D', 'h', 'm', 's', 'ms', 'us', 'ns', 'ps', 'fs', - 'as'} + datetime_unit: str, optional + Can be used to specify the unit if datetime coordinate is used. + One of {'Y', 'M', 'W', 'D', 'h', 'm', 's', 'ms', 'us', 'ns', + 'ps', 'fs', 'as'} Returns ------- diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index f3e6cac1c5b..12c5d139fdc 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2848,8 +2848,9 @@ def transpose(self, *dims): Notes ----- - Although this operation returns a view of each array's data, it - is not lazy -- the data will be fully loaded into memory. + This operation returns a view of each array's data. It is + lazy for dask-backed DataArrays but not for numpy-backed DataArrays + -- the data will be fully loaded into memory. See Also -------- diff --git a/xarray/core/variable.py b/xarray/core/variable.py index b675317d83d..433f4a05e1f 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1133,8 +1133,8 @@ def transpose(self, *dims): Notes ----- - Although this operation returns a view of this variable's data, it is - not lazy -- the data will be fully loaded. 
+ This operation returns a view of this variable's data. It is + lazy for dask-backed Variables but not for numpy-backed Variables. See Also -------- From 526a3959d5316a58f29ca9166896fd910cd275bb Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 14 Mar 2019 08:59:12 -0700 Subject: [PATCH 07/17] Drop failing tests writing multi-dimensional arrays as attributes (#2810) These aren't valid for netCDF files. Fixes GH2803 --- xarray/tests/test_backends.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index f610dba1352..c6ddb8fae58 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3444,11 +3444,6 @@ def new_dataset_and_coord_attrs(): with create_tmp_file() as tmp_file: ds.to_netcdf(tmp_file) - ds, attrs = new_dataset_and_attrs() - attrs['test'] = np.arange(12).reshape(3, 4) - with create_tmp_file() as tmp_file: - ds.to_netcdf(tmp_file) - ds, attrs = new_dataset_and_attrs() attrs['test'] = 'This is a string' with create_tmp_file() as tmp_file: @@ -3459,11 +3454,6 @@ def new_dataset_and_coord_attrs(): with create_tmp_file() as tmp_file: ds.to_netcdf(tmp_file) - ds, attrs = new_dataset_and_attrs() - attrs['test'] = np.arange(12).reshape(3, 4) - with create_tmp_file() as tmp_file: - ds.to_netcdf(tmp_file) - @requires_scipy_or_netCDF4 class TestDataArrayToNetCDF(object): From 9f00c6f58057f75ec56816e5f4d74e145645ad70 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 14 Mar 2019 21:22:10 -0700 Subject: [PATCH 08/17] Push back finalizing deprecations for 0.12 (#2809) 0.12 will already have a big change in dropping Python 2.7 support. I'd rather wait a bit longer to finalize these deprecations to minimize the impact on users. 
--- xarray/backends/api.py | 4 ++-- xarray/core/groupby.py | 10 ++++++---- xarray/core/utils.py | 3 ++- xarray/tests/test_dataarray.py | 2 +- xarray/tutorial.py | 9 +++++---- 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 36baa9071c0..1f330bbd3a0 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -262,8 +262,8 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True, warnings.warn( 'The autoclose argument is no longer used by ' 'xarray.open_dataset() and is now ignored; it will be removed in ' - 'xarray v0.12. If necessary, you can control the maximum number ' - 'of simultaneous open files with ' + 'a future version of xarray. If necessary, you can control the ' + 'maximum number of simultaneous open files with ' 'xarray.set_options(file_cache_maxsize=...).', FutureWarning, stacklevel=2) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 1fa1c159fbc..e8e2f1b08d4 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -616,8 +616,9 @@ def reduce(self, func, dim=None, axis=None, if self._obj.ndim > 1: warnings.warn( "Default reduction dimension will be changed to the " - "grouped dimension after xarray 0.12. To silence this " - "warning, pass dim=xarray.ALL_DIMS explicitly.", + "grouped dimension in a future version of xarray. To " + "silence this warning, pass dim=xarray.ALL_DIMS " + "explicitly.", FutureWarning, stacklevel=2) if keep_attrs is None: @@ -731,8 +732,9 @@ def reduce(self, func, dim=None, keep_attrs=None, **kwargs): # the deprecation process. Do not forget to remove _reduce_method warnings.warn( "Default reduction dimension will be changed to the " - "grouped dimension after xarray 0.12. To silence this " - "warning, pass dim=xarray.ALL_DIMS explicitly.", + "grouped dimension in a future version of xarray. 
To " + "silence this warning, pass dim=xarray.ALL_DIMS " + "explicitly.", FutureWarning, stacklevel=2) elif dim is None: dim = self._group_dim diff --git a/xarray/core/utils.py b/xarray/core/utils.py index fd1330a4e1f..349c8f98dc5 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -20,7 +20,8 @@ def _check_inplace(inplace, default=False): inplace = default else: warnings.warn('The inplace argument has been deprecated and will be ' - 'removed in xarray 0.12.0.', FutureWarning, stacklevel=3) + 'removed in a future version of xarray.', + FutureWarning, stacklevel=3) return inplace diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index ab05f19dbbe..4975071dad8 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -2037,7 +2037,7 @@ def test_groupby_warning(self): with pytest.warns(FutureWarning): grouped.sum() - @pytest.mark.skipif(LooseVersion(xr.__version__) < LooseVersion('0.12'), + @pytest.mark.skipif(LooseVersion(xr.__version__) < LooseVersion('0.13'), reason="not to forget the behavior change") def test_groupby_sum_default(self): array = self.make_groupby_example_array() diff --git a/xarray/tutorial.py b/xarray/tutorial.py index 3f92bd9a400..f54cf7b3889 100644 --- a/xarray/tutorial.py +++ b/xarray/tutorial.py @@ -91,16 +91,17 @@ def open_dataset(name, cache=True, cache_dir=_default_cache_dir, def load_dataset(*args, **kwargs): """ - `load_dataset` will be removed in version 0.12. The current behavior of - this function can be achived by using `tutorial.open_dataset(...).load()`. + `load_dataset` will be removed a future version of xarray. The current + behavior of this function can be achived by using + `tutorial.open_dataset(...).load()`. See Also -------- open_dataset """ warnings.warn( - "load_dataset` will be removed in xarray version 0.12. The current " - "behavior of this function can be achived by using " + "load_dataset` will be removed in a future version of xarray. 
The " + "current behavior of this function can be achived by using " "`tutorial.open_dataset(...).load()`.", DeprecationWarning, stacklevel=2) return open_dataset(*args, **kwargs).load() From 7d209f663b0397da2580d66b376e817f6d16db80 Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Fri, 15 Mar 2019 17:35:57 -0700 Subject: [PATCH 09/17] enable loading remote hdf5 files (#2782) * attempt at loading remote hdf5 * added a couple tests * rewind bytes after reading header * addressed comments for tests and error message * fixed pep8 formatting * created _get_engine_from_magic_number function, new tests * added description in whats-new * fixed test failure on windows * same error on windows and nix --- doc/whats-new.rst | 5 ++- xarray/backends/api.py | 77 +++++++++++++++++++++++------------ xarray/tests/__init__.py | 6 +++ xarray/tests/test_backends.py | 50 ++++++++++++++++++++++- 4 files changed, 110 insertions(+), 28 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b1363897094..17cf9a9c8fb 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -33,7 +33,9 @@ Breaking changes Enhancements ~~~~~~~~~~~~ - +- Added ability to open netcdf4/hdf5 file-like objects with ``open_dataset``. + Requires (h5netcdf>0.7 and h5py>2.9.0). (:issue:`2781`) + By `Scott Henderson `_ - Internal plotting now supports ``cftime.datetime`` objects as time series. (:issue:`2164`) By `Julius Busecke `_ and @@ -86,6 +88,7 @@ Enhancements - Allow ``expand_dims`` method to support inserting/broadcasting dimensions with size > 1. (:issue:`2710`) By `Martin Pletcher `_. + `Spencer Clark `_. - Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`). By `Kevin Squire `_. 
diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 1f330bbd3a0..a982c6cd35e 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -75,6 +75,34 @@ def _get_default_engine_netcdf(): return engine +def _get_engine_from_magic_number(filename_or_obj): + # check byte header to determine file type + if isinstance(filename_or_obj, bytes): + magic_number = filename_or_obj[:8] + else: + if filename_or_obj.tell() != 0: + raise ValueError("file-like object read/write pointer not at zero " + "please close and reopen, or use a context " + "manager") + magic_number = filename_or_obj.read(8) + filename_or_obj.seek(0) + + if magic_number.startswith(b'CDF'): + engine = 'scipy' + elif magic_number.startswith(b'\211HDF\r\n\032\n'): + engine = 'h5netcdf' + if isinstance(filename_or_obj, bytes): + raise ValueError("can't open netCDF4/HDF5 as bytes " + "try passing a path or file-like object") + else: + if isinstance(filename_or_obj, bytes) and len(filename_or_obj) > 80: + filename_or_obj = filename_or_obj[:80] + b'...' + raise ValueError('{} is not a valid netCDF file ' + 'did you mean to pass a string for a path instead?' + .format(filename_or_obj)) + return engine + + def _get_default_engine(path, allow_remote=False): if allow_remote and is_remote_uri(path): engine = _get_default_engine_remote_uri() @@ -170,8 +198,8 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True, Strings and Path objects are interpreted as a path to a netCDF file or an OpenDAP URL and opened with python-netCDF4, unless the filename ends with .gz, in which case the file is gunzipped and opened with - scipy.io.netcdf (only netCDF3 supported). File-like objects are opened - with scipy.io.netcdf (only netCDF3 supported). + scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like + objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). group : str, optional Path to the netCDF4 group in the given file to open (only works for netCDF4 files). 
@@ -258,6 +286,13 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True, -------- open_mfdataset """ + engines = [None, 'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio', + 'cfgrib', 'pseudonetcdf'] + if engine not in engines: + raise ValueError('unrecognized engine for open_dataset: {}\n' + 'must be one of: {}' + .format(engine, engines)) + if autoclose is not None: warnings.warn( 'The autoclose argument is no longer used by ' @@ -316,18 +351,9 @@ def maybe_decode_store(store, lock=False): if isinstance(filename_or_obj, backends.AbstractDataStore): store = filename_or_obj - ds = maybe_decode_store(store) - elif isinstance(filename_or_obj, str): - if (isinstance(filename_or_obj, bytes) and - filename_or_obj.startswith(b'\x89HDF')): - raise ValueError('cannot read netCDF4/HDF5 file images') - elif (isinstance(filename_or_obj, bytes) and - filename_or_obj.startswith(b'CDF')): - # netCDF3 file images are handled by scipy - pass - elif isinstance(filename_or_obj, str): - filename_or_obj = _normalize_path(filename_or_obj) + elif isinstance(filename_or_obj, str): + filename_or_obj = _normalize_path(filename_or_obj) if engine is None: engine = _get_default_engine(filename_or_obj, @@ -352,18 +378,19 @@ def maybe_decode_store(store, lock=False): elif engine == 'cfgrib': store = backends.CfGribDataStore( filename_or_obj, lock=lock, **backend_kwargs) - else: - raise ValueError('unrecognized engine for open_dataset: %r' - % engine) - with close_on_error(store): - ds = maybe_decode_store(store) else: - if engine is not None and engine != 'scipy': - raise ValueError('can only read file-like objects with ' - "default engine or engine='scipy'") - # assume filename_or_obj is a file-like object - store = backends.ScipyDataStore(filename_or_obj) + if engine not in [None, 'scipy', 'h5netcdf']: + raise ValueError("can only read bytes or file-like objects " + "with engine='scipy' or 'h5netcdf'") + engine = _get_engine_from_magic_number(filename_or_obj) + if engine == 'scipy': + 
store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs) + elif engine == 'h5netcdf': + store = backends.H5NetCDFStore(filename_or_obj, group=group, + lock=lock, **backend_kwargs) + + with close_on_error(store): ds = maybe_decode_store(store) # Ensure source filename always stored in dataset object (GH issue #2550) @@ -390,8 +417,8 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True, Strings and Paths are interpreted as a path to a netCDF file or an OpenDAP URL and opened with python-netCDF4, unless the filename ends with .gz, in which case the file is gunzipped and opened with - scipy.io.netcdf (only netCDF3 supported). File-like objects are opened - with scipy.io.netcdf (only netCDF3 supported). + scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like + objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). group : str, optional Path to the netCDF4 group in the given file to open (only works for netCDF4 files). diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 281fc662197..4ebcc29a61e 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -77,6 +77,12 @@ def LooseVersion(vstring): has_cfgrib, requires_cfgrib = _importorskip('cfgrib') # some special cases +has_h5netcdf07, requires_h5netcdf07 = _importorskip('h5netcdf', + minversion='0.7') +has_h5py29, requires_h5py29 = _importorskip('h5py', minversion='2.9.0') +has_h5fileobj = has_h5netcdf07 and has_h5py29 +requires_h5fileobj = pytest.mark.skipif( + not has_h5fileobj, reason='requires h5py>2.9.0 & h5netcdf>0.7') has_scipy_or_netCDF4 = has_scipy or has_netCDF4 requires_scipy_or_netCDF4 = pytest.mark.skipif( not has_scipy_or_netCDF4, reason='requires scipy or netCDF4') diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c6ddb8fae58..a20ba2df229 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -35,7 +35,7 @@ requires_cftime, requires_dask, requires_h5netcdf, 
requires_netCDF4, requires_pathlib, requires_pseudonetcdf, requires_pydap, requires_pynio, requires_rasterio, requires_scipy, requires_scipy_or_netCDF4, - requires_zarr) + requires_zarr, requires_h5fileobj) from .test_coding_times import (_STANDARD_CALENDARS, _NON_STANDARD_CALENDARS, _ALL_CALENDARS) from .test_dataset import create_test_data @@ -1770,7 +1770,7 @@ def test_engine(self): open_dataset(tmp_file, engine='foobar') netcdf_bytes = data.to_netcdf() - with raises_regex(ValueError, 'can only read'): + with raises_regex(ValueError, 'unrecognized engine'): open_dataset(BytesIO(netcdf_bytes), engine='foobar') def test_cross_engine_read_write_netcdf3(self): @@ -1955,6 +1955,52 @@ def test_dump_encodings_h5py(self): assert actual.x.encoding['compression_opts'] is None +@requires_h5fileobj +class TestH5NetCDFFileObject(TestH5NetCDFData): + engine = 'h5netcdf' + + def test_open_badbytes(self): + with raises_regex(ValueError, "HDF5 as bytes"): + with open_dataset(b'\211HDF\r\n\032\n', engine='h5netcdf'): + pass + with raises_regex(ValueError, "not a valid netCDF"): + with open_dataset(b'garbage'): + pass + with raises_regex(ValueError, "can only read bytes"): + with open_dataset(b'garbage', engine='netcdf4'): + pass + with raises_regex(ValueError, "not a valid netCDF"): + with open_dataset(BytesIO(b'garbage'), engine='h5netcdf'): + pass + + def test_open_twice(self): + expected = create_test_data() + expected.attrs['foo'] = 'bar' + with raises_regex(ValueError, 'read/write pointer not at zero'): + with create_tmp_file() as tmp_file: + expected.to_netcdf(tmp_file, engine='h5netcdf') + with open(tmp_file, 'rb') as f: + with open_dataset(f, engine='h5netcdf'): + with open_dataset(f, engine='h5netcdf'): + pass + + def test_open_fileobj(self): + # open in-memory datasets instead of local file paths + expected = create_test_data().drop('dim3') + expected.attrs['foo'] = 'bar' + with create_tmp_file() as tmp_file: + expected.to_netcdf(tmp_file, engine='h5netcdf') + + with 
open(tmp_file, 'rb') as f: + with open_dataset(f, engine='h5netcdf') as actual: + assert_identical(expected, actual) + + f.seek(0) + with BytesIO(f.read()) as bio: + with open_dataset(bio, engine='h5netcdf') as actual: + assert_identical(expected, actual) + + @requires_h5netcdf @requires_dask @pytest.mark.filterwarnings('ignore:deallocating CachingFileManager') From ccb198faa9bce662064a038c9179ea8176eaaac1 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 15 Mar 2019 21:02:04 -0700 Subject: [PATCH 10/17] Release 0.12.0 --- .gitignore | 1 + doc/whats-new.rst | 14 ++++++-------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 2a016bb9228..fdf1b12d706 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,7 @@ pip-log.txt .tox nosetests.xml .cache +.mypy_cache .ropeproject/ .tags* .testmon* diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 17cf9a9c8fb..aee026585fa 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -15,8 +15,8 @@ What's New .. _whats-new.0.12.0: -v0.12.0 (unreleased) --------------------- +v0.12.0 (15 March 2019) +----------------------- Breaking changes ~~~~~~~~~~~~~~~~ @@ -24,12 +24,10 @@ Breaking changes - Remove support for Python 2. This is the first version of xarray that is Python 3 only. (:issue:`1876`). By `Joe Hamman `_. -- The `compat` argument to `Dataset` and the `encoding` argument to - `DataArray` are deprecated and will be removed in a future release. +- The ``compat`` argument to ``Dataset`` and the ``encoding`` argument to + ``DataArray`` are deprecated and will be removed in a future release. (:issue:`1188`) By `Maximilian Roos `_. -- `cyordereddict` is no longer used as an optional dependency (:issue:`2744`). - By `Joe Hamman `_. Enhancements ~~~~~~~~~~~~ @@ -104,14 +102,14 @@ Bug fixes of the original time coordinate are now filled with NaN (:issue:`2197`). By `Spencer Clark `_. 
- Line plots with the ``x`` argument set to a non-dimensional coord now plot the correct data for 1D DataArrays. - (:issue:`27251). By `Tom Nicholas `_. + (:issue:`27251`). By `Tom Nicholas `_. - Subtracting a scalar ``cftime.datetime`` object from a :py:class:`CFTimeIndex` now results in a :py:class:`pandas.TimedeltaIndex` instead of raising a ``TypeError`` (:issue:`2671`). By `Spencer Clark `_. - backend_kwargs are no longer ignored when using open_dataset with pynio engine (:issue:'2380') - By 'Jonathan Joyce '_. + By `Jonathan Joyce `_. - Fix ``open_rasterio`` creating a WKT CRS instead of PROJ.4 with ``rasterio`` 1.0.14+ (:issue:`2715`). By `David Hoese `_. From 6ec99108b631dcc681f417f1aa09cac23bb83f4f Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 15 Mar 2019 21:16:05 -0700 Subject: [PATCH 11/17] Add whats-new for 0.12.1 --- doc/whats-new.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index aee026585fa..afbb5a5a3a5 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -13,6 +13,19 @@ What's New import xarray as xr np.random.seed(123456) +.. _whats-new.0.12.1: + +v0.12.1 (unreleased) +-------------------- + +Enhancements +~~~~~~~~~~~~ + + +Bug fixes +~~~~~~~~~ + + .. _whats-new.0.12.0: v0.12.0 (15 March 2019) From 4ce03c24695626248aac640ec8d9c057af7d1b82 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 15 Mar 2019 21:28:13 -0700 Subject: [PATCH 12/17] Rework whats-new for 0.12 --- doc/whats-new.rst | 75 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index afbb5a5a3a5..68fdf1df8f6 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -31,37 +31,70 @@ Bug fixes v0.12.0 (15 March 2019) ----------------------- -Breaking changes -~~~~~~~~~~~~~~~~ +Highlights include: + +- Removed support for Python 2. This is the first version of xarray that is + Python 3 only! 
+- New :py:meth:`~xarray.DataArray.coarsen` and + :py:meth:`~xarray.DataArray.integrate` methods. See :ref:`comput.coarsen` + and :ref:`compute.using_coordinates` for details. +- Many improvements to cftime support. See below for details. + +Deprecations +~~~~~~~~~~~~ -- Remove support for Python 2. This is the first version of xarray that is - Python 3 only. (:issue:`1876`). - By `Joe Hamman `_. - The ``compat`` argument to ``Dataset`` and the ``encoding`` argument to ``DataArray`` are deprecated and will be removed in a future release. (:issue:`1188`) By `Maximilian Roos `_. -Enhancements -~~~~~~~~~~~~ -- Added ability to open netcdf4/hdf5 file-like objects with ``open_dataset``. - Requires (h5netcdf>0.7 and h5py>2.9.0). (:issue:`2781`) - By `Scott Henderson `_ +cftime related enhancements +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Resampling of standard and non-standard calendars indexed by + :py:class:`~xarray.CFTimeIndex` is now possible. (:issue:`2191`). + By `Jwen Fai Low `_ and + `Spencer Clark `_. + +- Taking the mean of arrays of :py:class:`cftime.datetime` objects, and + by extension, use of :py:meth:`~xarray.DataArray.coarsen` with + :py:class:`cftime.datetime` coordinates is now possible. By `Spencer Clark + `_. + - Internal plotting now supports ``cftime.datetime`` objects as time series. (:issue:`2164`) By `Julius Busecke `_ and `Spencer Clark `_. + +- :py:meth:`~xarray.cftime_range` now supports QuarterBegin and QuarterEnd offsets (:issue:`2663`). + By `Jwen Fai Low `_ + +- :py:meth:`~xarray.open_dataset` now accepts a ``use_cftime`` argument, which + can be used to require that ``cftime.datetime`` objects are always used, or + never used when decoding dates encoded with a standard calendar. 
This can be + used to ensure consistent date types are returned when using + :py:meth:`~xarray.open_mfdataset` (:issue:`1263`) and/or to silence + serialization warnings raised if dates from a standard calendar are found to + be outside the :py:class:`pandas.Timestamp`-valid range (:issue:`2754`). By + `Spencer Clark `_. + +- :py:meth:`pandas.Series.dropna` is now supported for a + :py:class:`pandas.Series` indexed by a :py:class:`~xarray.CFTimeIndex` + (:issue:`2688`). By `Spencer Clark `_. + +Other enhancements +~~~~~~~~~~~~~~~~~~ + +- Added ability to open netcdf4/hdf5 file-like objects with ``open_dataset``. + Requires (h5netcdf>0.7 and h5py>2.9.0). (:issue:`2781`) + By `Scott Henderson `_ - Add ``data=False`` option to ``to_dict()`` methods. (:issue:`2656`) By `Ryan Abernathey `_ -- :py:meth:`~xarray.DataArray.coarsen` and - :py:meth:`~xarray.Dataset.coarsen` are newly added. +- :py:meth:`DataArray.coarsen` and + :py:meth:`Dataset.coarsen` are newly added. See :ref:`comput.coarsen` for details. (:issue:`2525`) By `Keisuke Fujii `_. -- Taking the mean of arrays of :py:class:`cftime.datetime` objects, and - by extension, use of :py:meth:`~xarray.DataArray.coarsen` with - :py:class:`cftime.datetime` coordinates is now possible. By `Spencer Clark - `_. - Upsampling an array via interpolation with resample is now dask-compatible, as long as the array is not chunked along the resampling dimension. By `Spencer Clark `_. @@ -70,16 +103,12 @@ Enhancements report showing what exactly differs between the two objects (dimensions / coordinates / variables / attributes) (:issue:`1507`). By `Benoit Bovy `_. -- Resampling of standard and non-standard calendars indexed by - :py:class:`~xarray.CFTimeIndex` is now possible. (:issue:`2191`). - By `Jwen Fai Low `_ and - `Spencer Clark `_. - Add ``tolerance`` option to ``resample()`` methods ``bfill``, ``pad``, ``nearest``. (:issue:`2695`) By `Hauke Schulz `_. 
-- :py:meth:`~xarray.DataArray.integrate` and - :py:meth:`~xarray.Dataset.integrate` are newly added. - See :ref:`_compute.using_coordinates` for the detail. +- :py:meth:`DataArray.integrate` and + :py:meth:`Dataset.integrate` are newly added. + See :ref:`compute.using_coordinates` for the detail. (:issue:`1332`) By `Keisuke Fujii `_. - :py:meth:`pandas.Series.dropna` is now supported for a From 97fdb8334118083f38aef7ae253993fc6424768a Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Tue, 19 Mar 2019 21:43:26 -0700 Subject: [PATCH 13/17] DOC: Update donation links --- README.rst | 2 +- doc/index.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index f69f7d95c31..6dbf774549d 100644 --- a/README.rst +++ b/README.rst @@ -97,7 +97,7 @@ to supporting the open source scientific computing community. If you like Xarray and want to support our mission, please consider making a donation_ to support our efforts. -.. _donation: https://www.flipcause.com/secure/cause_pdetails/NDE2NTU= +.. _donation: https://numfocus.salsalabs.org/donate-to-xarray/ History ------- diff --git a/doc/index.rst b/doc/index.rst index 1d3bb110ddb..002bd102e12 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -140,7 +140,7 @@ to supporting the open source scientific computing community. If you like Xarray and want to support our mission, please consider making a donation_ to support our efforts. -.. _donation: https://www.flipcause.com/secure/cause_pdetails/NDE2NTU= +.. 
_donation: https://numfocus.salsalabs.org/donate-to-xarray/ History From a74ecd6a95790a31140484fc7c7e6c2bee24eddb Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 20 Mar 2019 12:07:58 -0700 Subject: [PATCH 14/17] DOC: remove outdated warning (#2818) --- doc/data-structures.rst | 7 ------- 1 file changed, 7 deletions(-) diff --git a/doc/data-structures.rst b/doc/data-structures.rst index a8887471ec7..5be1f7b4262 100644 --- a/doc/data-structures.rst +++ b/doc/data-structures.rst @@ -353,13 +353,6 @@ setting) variables and attributes: This is particularly useful in an exploratory context, because you can tab-complete these variable names with tools like IPython. -.. warning:: - - We are changing the behavior of iterating over a Dataset the next major - release of xarray, to only include data variables instead of both data - variables and coordinates. In the meantime, prefer iterating over - ``ds.data_vars`` or ``ds.coords``. - Dictionary like methods ~~~~~~~~~~~~~~~~~~~~~~~ From 21fa6e0baaa5cf73237a0f72f1ed91c24320b55c Mon Sep 17 00:00:00 2001 From: pletcm Date: Thu, 7 Feb 2019 07:21:48 -0800 Subject: [PATCH 15/17] Allow expand_dims() method to support inserting/broadcasting dimensions with size>1 (#2757) * Make using dim_kwargs for python 3.5 illegal -- a ValueError is thrown * dataset.expand_dims() method take dict like object where values represent length of dimensions or coordinates of dimesnsions * dataarray.expand_dims() method take dict like object where values represent length of dimensions or coordinates of dimesnsions * Add alternative option to passing a dict to the dim argument, which is now an optional kwarg, passing in each new dimension as its own kwarg * Add expand_dims enhancement from issue 2710 to whats-new.rst * Fix test_dataarray.TestDataArray.test_expand_dims_with_greater_dim_size tests to pass in python 3.5 using ordered dicts instead of regular dicts. 
This was needed because python 3.5 and earlier did not maintain insertion order for dicts * Restrict core logic to use 'dim' as a dict--it will be converted into a dict on entry if it is a str or a sequence of str * Don't cast dim values (coords) as a list since IndexVariable/Variable will internally convert it into a numpy.ndarray. So just use IndexVariable((k,), v) * TypeErrors should be raised for invalid input types, rather than ValueErrors. * Force 'dim' to be OrderedDict for python 3.5 --- doc/whats-new.rst | 20 +--------- xarray/core/dataarray.py | 39 ++++++++++++++++-- xarray/core/dataset.py | 73 +++++++++++++++++++++++++++------- xarray/tests/test_dataarray.py | 53 +++++++++++++++++++++++- xarray/tests/test_dataset.py | 68 +++++++++++++++++++++++++++++++ 5 files changed, 216 insertions(+), 37 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 68fdf1df8f6..8734aab6330 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -111,27 +111,11 @@ Other enhancements See :ref:`compute.using_coordinates` for the detail. (:issue:`1332`) By `Keisuke Fujii `_. -- :py:meth:`pandas.Series.dropna` is now supported for a - :py:class:`pandas.Series` indexed by a :py:class:`~xarray.CFTimeIndex` - (:issue:`2688`). By `Spencer Clark `_. -- :py:meth:`~xarray.cftime_range` now supports QuarterBegin and QuarterEnd offsets (:issue:`2663`). - By `Jwen Fai Low `_ -- :py:meth:`~xarray.open_dataset` now accepts a ``use_cftime`` argument, which - can be used to require that ``cftime.datetime`` objects are always used, or - never used when decoding dates encoded with a standard calendar. This can be - used to ensure consistent date types are returned when using - :py:meth:`~xarray.open_mfdataset` (:issue:`1263`) and/or to silence - serialization warnings raised if dates from a standard calendar are found to - be outside the :py:class:`pandas.Timestamp`-valid range (:issue:`2754`). By - `Spencer Clark `_. 
- +- Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`). + By `Kevin Squire `_. - Allow ``expand_dims`` method to support inserting/broadcasting dimensions with size > 1. (:issue:`2710`) By `Martin Pletcher `_. - `Spencer Clark `_. - -- Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`). - By `Kevin Squire `_. Bug fixes ~~~~~~~~~ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e7e12ae3da4..75f3298104f 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1,4 +1,5 @@ import functools +import sys import warnings from collections import OrderedDict @@ -1138,7 +1139,7 @@ def swap_dims(self, dims_dict): ds = self._to_temp_dataset().swap_dims(dims_dict) return self._from_temp_dataset(ds) - def expand_dims(self, dim, axis=None): + def expand_dims(self, dim=None, axis=None, **dim_kwargs): """Return a new object with an additional axis (or axes) inserted at the corresponding position in the array shape. @@ -1147,21 +1148,53 @@ def expand_dims(self, dim, axis=None): Parameters ---------- - dim : str or sequence of str. + dim : str, sequence of str, dict, or None Dimensions to include on the new variable. - dimensions are inserted with length 1. + If provided as str or sequence of str, then dimensions are inserted + with length 1. If provided as a dict, then the keys are the new + dimensions and the values are either integers (giving the length of + the new dimensions) or sequence/ndarray (giving the coordinates of + the new dimensions). **WARNING** for python 3.5, if ``dim`` is + dict-like, then it must be an ``OrderedDict``. This is to ensure + that the order in which the dims are given is maintained. axis : integer, list (or tuple) of integers, or None Axis position(s) where new axis is to be inserted (position(s) on the result array). If a list (or tuple) of integers is passed, multiple axes are inserted. In this case, dim arguments should be same length list. 
If axis=None is passed, all the axes will be inserted to the start of the result array. + **dim_kwargs : int or sequence/ndarray + The keywords are arbitrary dimensions being inserted and the values + are either the lengths of the new dims (if int is given), or their + coordinates. Note, this is an alternative to passing a dict to the + dim kwarg and will only be used if dim is None. **WARNING** for + python 3.5 ``dim_kwargs`` is not available. Returns ------- expanded : same type as caller This object, but with an additional dimension(s). """ + if isinstance(dim, int): + raise TypeError('dim should be str or sequence of strs or dict') + elif isinstance(dim, str): + dim = OrderedDict(((dim, 1),)) + elif isinstance(dim, (list, tuple)): + if len(dim) != len(set(dim)): + raise ValueError('dims should not contain duplicate values.') + dim = OrderedDict(((d, 1) for d in dim)) + + # TODO: get rid of the below code block when python 3.5 is no longer + # supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + not_ordereddict = dim is not None and not isinstance(dim, OrderedDict) + if not python36_plus and not_ordereddict: + raise TypeError("dim must be an OrderedDict for python <3.6") + elif not python36_plus and dim_kwargs: + raise ValueError("dim_kwargs isn't available for python <3.6") + dim_kwargs = OrderedDict(dim_kwargs) + + dim = either_dict_or_kwargs(dim, dim_kwargs, 'expand_dims') ds = self._to_temp_dataset().expand_dims(dim, axis) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 12c5d139fdc..5c8b048c175 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2329,7 +2329,7 @@ def swap_dims(self, dims_dict, inplace=None): return self._replace_with_new_dims(variables, coord_names, indexes=indexes, inplace=inplace) - def expand_dims(self, dim, axis=None): + def expand_dims(self, dim=None, axis=None, **dim_kwargs): """Return a new object with an additional axis (or axes) 
inserted at the corresponding position in the array shape. @@ -2338,15 +2338,27 @@ def expand_dims(self, dim, axis=None): Parameters ---------- - dim : str or sequence of str. + dim : str, sequence of str, dict, or None Dimensions to include on the new variable. - dimensions are inserted with length 1. + If provided as str or sequence of str, then dimensions are inserted + with length 1. If provided as a dict, then the keys are the new + dimensions and the values are either integers (giving the length of + the new dimensions) or sequence/ndarray (giving the coordinates of + the new dimensions). **WARNING** for python 3.5, if ``dim`` is + dict-like, then it must be an ``OrderedDict``. This is to ensure + that the order in which the dims are given is maintained. axis : integer, list (or tuple) of integers, or None Axis position(s) where new axis is to be inserted (position(s) on the result array). If a list (or tuple) of integers is passed, multiple axes are inserted. In this case, dim arguments should be - the same length list. If axis=None is passed, all the axes will - be inserted to the start of the result array. + same length list. If axis=None is passed, all the axes will be + inserted to the start of the result array. + **dim_kwargs : int or sequence/ndarray + The keywords are arbitrary dimensions being inserted and the values + are either the lengths of the new dims (if int is given), or their + coordinates. Note, this is an alternative to passing a dict to the + dim kwarg and will only be used if dim is None. **WARNING** for + python 3.5 ``dim_kwargs`` is not available. Returns ------- @@ -2354,10 +2366,25 @@ def expand_dims(self, dim, axis=None): This object, but with an additional dimension(s). 
""" if isinstance(dim, int): - raise ValueError('dim should be str or sequence of strs or dict') + raise TypeError('dim should be str or sequence of strs or dict') + elif isinstance(dim, str): + dim = OrderedDict(((dim, 1),)) + elif isinstance(dim, (list, tuple)): + if len(dim) != len(set(dim)): + raise ValueError('dims should not contain duplicate values.') + dim = OrderedDict(((d, 1) for d in dim)) + + # TODO: get rid of the below code block when python 3.5 is no longer + # supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + not_ordereddict = dim is not None and not isinstance(dim, OrderedDict) + if not python36_plus and not_ordereddict: + raise TypeError("dim must be an OrderedDict for python <3.6") + elif not python36_plus and dim_kwargs: + raise ValueError("dim_kwargs isn't available for python <3.6") + + dim = either_dict_or_kwargs(dim, dim_kwargs, 'expand_dims') - if isinstance(dim, str): - dim = [dim] if axis is not None and not isinstance(axis, (list, tuple)): axis = [axis] @@ -2376,10 +2403,24 @@ def expand_dims(self, dim, axis=None): '{dim} already exists as coordinate or' ' variable name.'.format(dim=d)) - if len(dim) != len(set(dim)): - raise ValueError('dims should not contain duplicate values.') - variables = OrderedDict() + # If dim is a dict, then ensure that the values are either integers + # or iterables. + for k, v in dim.items(): + if hasattr(v, "__iter__"): + # If the value for the new dimension is an iterable, then + # save the coordinates to the variables dict, and set the + # value within the dim dict to the length of the iterable + # for later use. 
+ variables[k] = xr.IndexVariable((k,), v) + self._coord_names.add(k) + dim[k] = len(list(v)) + elif isinstance(v, int): + pass # Do nothing if the dimensions value is just an int + else: + raise TypeError('The value of new dimension {k} must be ' + 'an iterable or an int'.format(k=k)) + for k, v in self._variables.items(): if k not in dim: if k in self._coord_names: # Do not change coordinates @@ -2400,11 +2441,13 @@ def expand_dims(self, dim, axis=None): ' values.') # We need to sort them to make sure `axis` equals to the # axis positions of the result array. - zip_axis_dim = sorted(zip(axis_pos, dim)) + zip_axis_dim = sorted(zip(axis_pos, dim.items())) + + all_dims = list(zip(v.dims, v.shape)) + for d, c in zip_axis_dim: + all_dims.insert(d, c) + all_dims = OrderedDict(all_dims) - all_dims = list(v.dims) - for a, d in zip_axis_dim: - all_dims.insert(a, d) variables[k] = v.set_dims(all_dims) else: # If dims includes a label of a non-dimension coordinate, diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 4975071dad8..b1ecf160533 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3,6 +3,7 @@ from collections import OrderedDict from copy import deepcopy from textwrap import dedent +import sys import numpy as np import pandas as pd @@ -1303,7 +1304,7 @@ def test_expand_dims_error(self): coords={'x': np.linspace(0.0, 1.0, 3)}, attrs={'key': 'entry'}) - with raises_regex(ValueError, 'dim should be str or'): + with raises_regex(TypeError, 'dim should be str or'): array.expand_dims(0) with raises_regex(ValueError, 'lengths of dim and axis'): # dims and axis argument should be the same length @@ -1328,6 +1329,16 @@ def test_expand_dims_error(self): array.expand_dims(dim=['y', 'z'], axis=[2, -4]) array.expand_dims(dim=['y', 'z'], axis=[2, 3]) + array = DataArray(np.random.randn(3, 4), dims=['x', 'dim_0'], + coords={'x': np.linspace(0.0, 1.0, 3)}, + attrs={'key': 'entry'}) + with pytest.raises(TypeError): + 
array.expand_dims(OrderedDict((("new_dim", 3.2),))) + + # Attempt to use both dim and kwargs + with pytest.raises(ValueError): + array.expand_dims(OrderedDict((("d", 4),)), e=4) + def test_expand_dims(self): array = DataArray(np.random.randn(3, 4), dims=['x', 'dim_0'], coords={'x': np.linspace(0.0, 1.0, 3)}, @@ -1392,6 +1403,46 @@ def test_expand_dims_with_scalar_coordinate(self): roundtripped = actual.squeeze(['z'], drop=False) assert_identical(array, roundtripped) + def test_expand_dims_with_greater_dim_size(self): + array = DataArray(np.random.randn(3, 4), dims=['x', 'dim_0'], + coords={'x': np.linspace(0.0, 1.0, 3), 'z': 1.0}, + attrs={'key': 'entry'}) + # For python 3.5 and earlier this has to be an ordered dict, to + # maintain insertion order. + actual = array.expand_dims( + OrderedDict((('y', 2), ('z', 1), ('dim_1', ['a', 'b', 'c'])))) + + expected_coords = OrderedDict(( + ('y', [0, 1]), ('z', [1.0]), ('dim_1', ['a', 'b', 'c']), + ('x', np.linspace(0, 1, 3)), ('dim_0', range(4)))) + expected = DataArray(array.values * np.ones([2, 1, 3, 3, 4]), + coords=expected_coords, + dims=list(expected_coords.keys()), + attrs={'key': 'entry'} + ).drop(['y', 'dim_0']) + assert_identical(expected, actual) + + # Test with kwargs instead of passing dict to dim arg. + + # TODO: only the code under the if-statement is needed when python 3.5 + # is no longer supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + if python36_plus: + other_way = array.expand_dims(dim_1=['a', 'b', 'c']) + + other_way_expected = DataArray( + array.values * np.ones([3, 3, 4]), + coords={'dim_1': ['a', 'b', 'c'], + 'x': np.linspace(0, 1, 3), + 'dim_0': range(4), 'z': 1.0}, + dims=['dim_1', 'x', 'dim_0'], + attrs={'key': 'entry'}).drop('dim_0') + assert_identical(other_way_expected, other_way) + else: + # In python 3.5, using dim_kwargs should raise a ValueError. 
+ with raises_regex(ValueError, "dim_kwargs isn't"): + array.expand_dims(e=["l", "m", "n"]) + def test_set_index(self): indexes = [self.mindex.get_level_values(n) for n in self.mindex.names] coords = {idx.name: ('x', idx) for idx in indexes} diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 8e8c6c4b419..75b736239e6 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2033,6 +2033,27 @@ def test_expand_dims_error(self): with raises_regex(ValueError, 'already exists'): original.expand_dims(dim=['z']) + original = Dataset({'x': ('a', np.random.randn(3)), + 'y': (['b', 'a'], np.random.randn(4, 3)), + 'z': ('a', np.random.randn(3))}, + coords={'a': np.linspace(0, 1, 3), + 'b': np.linspace(0, 1, 4), + 'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + with raises_regex(TypeError, 'value of new dimension'): + original.expand_dims(OrderedDict((("d", 3.2),))) + + # TODO: only the code under the if-statement is needed when python 3.5 + # is no longer supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + if python36_plus: + with raises_regex(ValueError, 'both keyword and positional'): + original.expand_dims(OrderedDict((("d", 4),)), e=4) + else: + # In python 3.5, using dim_kwargs should raise a ValueError. + with raises_regex(ValueError, "dim_kwargs isn't"): + original.expand_dims(OrderedDict((("d", 4),)), e=4) + def test_expand_dims(self): original = Dataset({'x': ('a', np.random.randn(3)), 'y': (['b', 'a'], np.random.randn(4, 3))}, @@ -2066,6 +2087,53 @@ def test_expand_dims(self): roundtripped = actual.squeeze('z') assert_identical(original, roundtripped) + # Test expanding one dimension to have size > 1 that doesn't have + # coordinates, and also expanding another dimension to have size > 1 + # that DOES have coordinates. 
+ actual = original.expand_dims( + OrderedDict((("d", 4), ("e", ["l", "m", "n"])))) + + expected = Dataset( + {'x': xr.DataArray(original['x'].values * np.ones([4, 3, 3]), + coords=dict(d=range(4), + e=['l', 'm', 'n'], + a=np.linspace(0, 1, 3)), + dims=['d', 'e', 'a']).drop('d'), + 'y': xr.DataArray(original['y'].values * np.ones([4, 3, 4, 3]), + coords=dict(d=range(4), + e=['l', 'm', 'n'], + b=np.linspace(0, 1, 4), + a=np.linspace(0, 1, 3)), + dims=['d', 'e', 'b', 'a']).drop('d')}, + coords={'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + assert_identical(actual, expected) + + # Test with kwargs instead of passing dict to dim arg. + + # TODO: only the code under the if-statement is needed when python 3.5 + # is no longer supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + if python36_plus: + other_way = original.expand_dims(e=["l", "m", "n"]) + other_way_expected = Dataset( + {'x': xr.DataArray(original['x'].values * np.ones([3, 3]), + coords=dict(e=['l', 'm', 'n'], + a=np.linspace(0, 1, 3)), + dims=['e', 'a']), + 'y': xr.DataArray(original['y'].values * np.ones([3, 4, 3]), + coords=dict(e=['l', 'm', 'n'], + b=np.linspace(0, 1, 4), + a=np.linspace(0, 1, 3)), + dims=['e', 'b', 'a'])}, + coords={'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + assert_identical(other_way_expected, other_way) + else: + # In python 3.5, using dim_kwargs should raise a ValueError. 
+ with raises_regex(ValueError, "dim_kwargs isn't"): + original.expand_dims(e=["l", "m", "n"]) + def test_set_index(self): expected = create_test_multiindex() mindex = expected['x'].to_index() From 4e47dd14d94109d974d12866294d01484cf4f2a8 Mon Sep 17 00:00:00 2001 From: pletcm Date: Thu, 7 Feb 2019 07:21:48 -0800 Subject: [PATCH 16/17] Allow expand_dims() method to support inserting/broadcasting dimensions with size>1 (#2757) * use .size attribute to determine the size of a dimension, rather than converting to a list, which can be slow for large iterables * Make using dim_kwargs for python 3.5 illegal -- a ValueError is thrown * dataset.expand_dims() method take dict like object where values represent length of dimensions or coordinates of dimesnsions * dataarray.expand_dims() method take dict like object where values represent length of dimensions or coordinates of dimesnsions * Add alternative option to passing a dict to the dim argument, which is now an optional kwarg, passing in each new dimension as its own kwarg * Add expand_dims enhancement from issue 2710 to whats-new.rst * Fix test_dataarray.TestDataArray.test_expand_dims_with_greater_dim_size tests to pass in python 3.5 using ordered dicts instead of regular dicts. This was needed because python 3.5 and earlier did not maintain insertion order for dicts * Restrict core logic to use 'dim' as a dict--it will be converted into a dict on entry if it is a str or a sequence of str * Don't cast dim values (coords) as a list since IndexVariable/Variable will internally convert it into a numpy.ndarray. So just use IndexVariable((k,), v) * TypeErrors should be raised for invalid input types, rather than ValueErrors. 
* Force 'dim' to be OrderedDict for python 3.5 --- doc/whats-new.rst | 20 +--------- xarray/core/dataarray.py | 39 ++++++++++++++++-- xarray/core/dataset.py | 73 +++++++++++++++++++++++++++------- xarray/tests/test_dataarray.py | 53 +++++++++++++++++++++++- xarray/tests/test_dataset.py | 68 +++++++++++++++++++++++++++++++ 5 files changed, 216 insertions(+), 37 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 68fdf1df8f6..8734aab6330 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -111,27 +111,11 @@ Other enhancements See :ref:`compute.using_coordinates` for the detail. (:issue:`1332`) By `Keisuke Fujii `_. -- :py:meth:`pandas.Series.dropna` is now supported for a - :py:class:`pandas.Series` indexed by a :py:class:`~xarray.CFTimeIndex` - (:issue:`2688`). By `Spencer Clark `_. -- :py:meth:`~xarray.cftime_range` now supports QuarterBegin and QuarterEnd offsets (:issue:`2663`). - By `Jwen Fai Low `_ -- :py:meth:`~xarray.open_dataset` now accepts a ``use_cftime`` argument, which - can be used to require that ``cftime.datetime`` objects are always used, or - never used when decoding dates encoded with a standard calendar. This can be - used to ensure consistent date types are returned when using - :py:meth:`~xarray.open_mfdataset` (:issue:`1263`) and/or to silence - serialization warnings raised if dates from a standard calendar are found to - be outside the :py:class:`pandas.Timestamp`-valid range (:issue:`2754`). By - `Spencer Clark `_. - +- Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`). + By `Kevin Squire `_. - Allow ``expand_dims`` method to support inserting/broadcasting dimensions with size > 1. (:issue:`2710`) By `Martin Pletcher `_. - `Spencer Clark `_. - -- Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`). - By `Kevin Squire `_. 
Bug fixes ~~~~~~~~~ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e7e12ae3da4..75f3298104f 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1,4 +1,5 @@ import functools +import sys import warnings from collections import OrderedDict @@ -1138,7 +1139,7 @@ def swap_dims(self, dims_dict): ds = self._to_temp_dataset().swap_dims(dims_dict) return self._from_temp_dataset(ds) - def expand_dims(self, dim, axis=None): + def expand_dims(self, dim=None, axis=None, **dim_kwargs): """Return a new object with an additional axis (or axes) inserted at the corresponding position in the array shape. @@ -1147,21 +1148,53 @@ def expand_dims(self, dim, axis=None): Parameters ---------- - dim : str or sequence of str. + dim : str, sequence of str, dict, or None Dimensions to include on the new variable. - dimensions are inserted with length 1. + If provided as str or sequence of str, then dimensions are inserted + with length 1. If provided as a dict, then the keys are the new + dimensions and the values are either integers (giving the length of + the new dimensions) or sequence/ndarray (giving the coordinates of + the new dimensions). **WARNING** for python 3.5, if ``dim`` is + dict-like, then it must be an ``OrderedDict``. This is to ensure + that the order in which the dims are given is maintained. axis : integer, list (or tuple) of integers, or None Axis position(s) where new axis is to be inserted (position(s) on the result array). If a list (or tuple) of integers is passed, multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. + **dim_kwargs : int or sequence/ndarray + The keywords are arbitrary dimensions being inserted and the values + are either the lengths of the new dims (if int is given), or their + coordinates. 
Note, this is an alternative to passing a dict to the + dim kwarg and will only be used if dim is None. **WARNING** for + python 3.5 ``dim_kwargs`` is not available. Returns ------- expanded : same type as caller This object, but with an additional dimension(s). """ + if isinstance(dim, int): + raise TypeError('dim should be str or sequence of strs or dict') + elif isinstance(dim, str): + dim = OrderedDict(((dim, 1),)) + elif isinstance(dim, (list, tuple)): + if len(dim) != len(set(dim)): + raise ValueError('dims should not contain duplicate values.') + dim = OrderedDict(((d, 1) for d in dim)) + + # TODO: get rid of the below code block when python 3.5 is no longer + # supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + not_ordereddict = dim is not None and not isinstance(dim, OrderedDict) + if not python36_plus and not_ordereddict: + raise TypeError("dim must be an OrderedDict for python <3.6") + elif not python36_plus and dim_kwargs: + raise ValueError("dim_kwargs isn't available for python <3.6") + dim_kwargs = OrderedDict(dim_kwargs) + + dim = either_dict_or_kwargs(dim, dim_kwargs, 'expand_dims') ds = self._to_temp_dataset().expand_dims(dim, axis) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 12c5d139fdc..9dbcd8a8f70 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2329,7 +2329,7 @@ def swap_dims(self, dims_dict, inplace=None): return self._replace_with_new_dims(variables, coord_names, indexes=indexes, inplace=inplace) - def expand_dims(self, dim, axis=None): + def expand_dims(self, dim=None, axis=None, **dim_kwargs): """Return a new object with an additional axis (or axes) inserted at the corresponding position in the array shape. @@ -2338,15 +2338,27 @@ def expand_dims(self, dim, axis=None): Parameters ---------- - dim : str or sequence of str. + dim : str, sequence of str, dict, or None Dimensions to include on the new variable. 
- dimensions are inserted with length 1. + If provided as str or sequence of str, then dimensions are inserted + with length 1. If provided as a dict, then the keys are the new + dimensions and the values are either integers (giving the length of + the new dimensions) or sequence/ndarray (giving the coordinates of + the new dimensions). **WARNING** for python 3.5, if ``dim`` is + dict-like, then it must be an ``OrderedDict``. This is to ensure + that the order in which the dims are given is maintained. axis : integer, list (or tuple) of integers, or None Axis position(s) where new axis is to be inserted (position(s) on the result array). If a list (or tuple) of integers is passed, multiple axes are inserted. In this case, dim arguments should be - the same length list. If axis=None is passed, all the axes will - be inserted to the start of the result array. + same length list. If axis=None is passed, all the axes will be + inserted to the start of the result array. + **dim_kwargs : int or sequence/ndarray + The keywords are arbitrary dimensions being inserted and the values + are either the lengths of the new dims (if int is given), or their + coordinates. Note, this is an alternative to passing a dict to the + dim kwarg and will only be used if dim is None. **WARNING** for + python 3.5 ``dim_kwargs`` is not available. Returns ------- @@ -2354,10 +2366,25 @@ def expand_dims(self, dim, axis=None): This object, but with an additional dimension(s). """ if isinstance(dim, int): - raise ValueError('dim should be str or sequence of strs or dict') + raise TypeError('dim should be str or sequence of strs or dict') + elif isinstance(dim, str): + dim = OrderedDict(((dim, 1),)) + elif isinstance(dim, (list, tuple)): + if len(dim) != len(set(dim)): + raise ValueError('dims should not contain duplicate values.') + dim = OrderedDict(((d, 1) for d in dim)) + + # TODO: get rid of the below code block when python 3.5 is no longer + # supported. 
+ python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + not_ordereddict = dim is not None and not isinstance(dim, OrderedDict) + if not python36_plus and not_ordereddict: + raise TypeError("dim must be an OrderedDict for python <3.6") + elif not python36_plus and dim_kwargs: + raise ValueError("dim_kwargs isn't available for python <3.6") + + dim = either_dict_or_kwargs(dim, dim_kwargs, 'expand_dims') - if isinstance(dim, str): - dim = [dim] if axis is not None and not isinstance(axis, (list, tuple)): axis = [axis] @@ -2376,10 +2403,24 @@ def expand_dims(self, dim, axis=None): '{dim} already exists as coordinate or' ' variable name.'.format(dim=d)) - if len(dim) != len(set(dim)): - raise ValueError('dims should not contain duplicate values.') - variables = OrderedDict() + # If dim is a dict, then ensure that the values are either integers + # or iterables. + for k, v in dim.items(): + if hasattr(v, "__iter__"): + # If the value for the new dimension is an iterable, then + # save the coordinates to the variables dict, and set the + # value within the dim dict to the length of the iterable + # for later use. + variables[k] = xr.IndexVariable((k,), v) + self._coord_names.add(k) + dim[k] = variables[k].size + elif isinstance(v, int): + pass # Do nothing if the dimensions value is just an int + else: + raise TypeError('The value of new dimension {k} must be ' + 'an iterable or an int'.format(k=k)) + for k, v in self._variables.items(): if k not in dim: if k in self._coord_names: # Do not change coordinates @@ -2400,11 +2441,13 @@ def expand_dims(self, dim, axis=None): ' values.') # We need to sort them to make sure `axis` equals to the # axis positions of the result array. 
- zip_axis_dim = sorted(zip(axis_pos, dim)) + zip_axis_dim = sorted(zip(axis_pos, dim.items())) + + all_dims = list(zip(v.dims, v.shape)) + for d, c in zip_axis_dim: + all_dims.insert(d, c) + all_dims = OrderedDict(all_dims) - all_dims = list(v.dims) - for a, d in zip_axis_dim: - all_dims.insert(a, d) variables[k] = v.set_dims(all_dims) else: # If dims includes a label of a non-dimension coordinate, diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 4975071dad8..b1ecf160533 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3,6 +3,7 @@ from collections import OrderedDict from copy import deepcopy from textwrap import dedent +import sys import numpy as np import pandas as pd @@ -1303,7 +1304,7 @@ def test_expand_dims_error(self): coords={'x': np.linspace(0.0, 1.0, 3)}, attrs={'key': 'entry'}) - with raises_regex(ValueError, 'dim should be str or'): + with raises_regex(TypeError, 'dim should be str or'): array.expand_dims(0) with raises_regex(ValueError, 'lengths of dim and axis'): # dims and axis argument should be the same length @@ -1328,6 +1329,16 @@ def test_expand_dims_error(self): array.expand_dims(dim=['y', 'z'], axis=[2, -4]) array.expand_dims(dim=['y', 'z'], axis=[2, 3]) + array = DataArray(np.random.randn(3, 4), dims=['x', 'dim_0'], + coords={'x': np.linspace(0.0, 1.0, 3)}, + attrs={'key': 'entry'}) + with pytest.raises(TypeError): + array.expand_dims(OrderedDict((("new_dim", 3.2),))) + + # Attempt to use both dim and kwargs + with pytest.raises(ValueError): + array.expand_dims(OrderedDict((("d", 4),)), e=4) + def test_expand_dims(self): array = DataArray(np.random.randn(3, 4), dims=['x', 'dim_0'], coords={'x': np.linspace(0.0, 1.0, 3)}, @@ -1392,6 +1403,46 @@ def test_expand_dims_with_scalar_coordinate(self): roundtripped = actual.squeeze(['z'], drop=False) assert_identical(array, roundtripped) + def test_expand_dims_with_greater_dim_size(self): + array = DataArray(np.random.randn(3, 4), 
dims=['x', 'dim_0'], + coords={'x': np.linspace(0.0, 1.0, 3), 'z': 1.0}, + attrs={'key': 'entry'}) + # For python 3.5 and earlier this has to be an ordered dict, to + # maintain insertion order. + actual = array.expand_dims( + OrderedDict((('y', 2), ('z', 1), ('dim_1', ['a', 'b', 'c'])))) + + expected_coords = OrderedDict(( + ('y', [0, 1]), ('z', [1.0]), ('dim_1', ['a', 'b', 'c']), + ('x', np.linspace(0, 1, 3)), ('dim_0', range(4)))) + expected = DataArray(array.values * np.ones([2, 1, 3, 3, 4]), + coords=expected_coords, + dims=list(expected_coords.keys()), + attrs={'key': 'entry'} + ).drop(['y', 'dim_0']) + assert_identical(expected, actual) + + # Test with kwargs instead of passing dict to dim arg. + + # TODO: only the code under the if-statement is needed when python 3.5 + # is no longer supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + if python36_plus: + other_way = array.expand_dims(dim_1=['a', 'b', 'c']) + + other_way_expected = DataArray( + array.values * np.ones([3, 3, 4]), + coords={'dim_1': ['a', 'b', 'c'], + 'x': np.linspace(0, 1, 3), + 'dim_0': range(4), 'z': 1.0}, + dims=['dim_1', 'x', 'dim_0'], + attrs={'key': 'entry'}).drop('dim_0') + assert_identical(other_way_expected, other_way) + else: + # In python 3.5, using dim_kwargs should raise a ValueError. 
+ with raises_regex(ValueError, "dim_kwargs isn't"): + array.expand_dims(e=["l", "m", "n"]) + def test_set_index(self): indexes = [self.mindex.get_level_values(n) for n in self.mindex.names] coords = {idx.name: ('x', idx) for idx in indexes} diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 8e8c6c4b419..75b736239e6 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2033,6 +2033,27 @@ def test_expand_dims_error(self): with raises_regex(ValueError, 'already exists'): original.expand_dims(dim=['z']) + original = Dataset({'x': ('a', np.random.randn(3)), + 'y': (['b', 'a'], np.random.randn(4, 3)), + 'z': ('a', np.random.randn(3))}, + coords={'a': np.linspace(0, 1, 3), + 'b': np.linspace(0, 1, 4), + 'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + with raises_regex(TypeError, 'value of new dimension'): + original.expand_dims(OrderedDict((("d", 3.2),))) + + # TODO: only the code under the if-statement is needed when python 3.5 + # is no longer supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + if python36_plus: + with raises_regex(ValueError, 'both keyword and positional'): + original.expand_dims(OrderedDict((("d", 4),)), e=4) + else: + # In python 3.5, using dim_kwargs should raise a ValueError. + with raises_regex(ValueError, "dim_kwargs isn't"): + original.expand_dims(OrderedDict((("d", 4),)), e=4) + def test_expand_dims(self): original = Dataset({'x': ('a', np.random.randn(3)), 'y': (['b', 'a'], np.random.randn(4, 3))}, @@ -2066,6 +2087,53 @@ def test_expand_dims(self): roundtripped = actual.squeeze('z') assert_identical(original, roundtripped) + # Test expanding one dimension to have size > 1 that doesn't have + # coordinates, and also expanding another dimension to have size > 1 + # that DOES have coordinates. 
+ actual = original.expand_dims( + OrderedDict((("d", 4), ("e", ["l", "m", "n"])))) + + expected = Dataset( + {'x': xr.DataArray(original['x'].values * np.ones([4, 3, 3]), + coords=dict(d=range(4), + e=['l', 'm', 'n'], + a=np.linspace(0, 1, 3)), + dims=['d', 'e', 'a']).drop('d'), + 'y': xr.DataArray(original['y'].values * np.ones([4, 3, 4, 3]), + coords=dict(d=range(4), + e=['l', 'm', 'n'], + b=np.linspace(0, 1, 4), + a=np.linspace(0, 1, 3)), + dims=['d', 'e', 'b', 'a']).drop('d')}, + coords={'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + assert_identical(actual, expected) + + # Test with kwargs instead of passing dict to dim arg. + + # TODO: only the code under the if-statement is needed when python 3.5 + # is no longer supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + if python36_plus: + other_way = original.expand_dims(e=["l", "m", "n"]) + other_way_expected = Dataset( + {'x': xr.DataArray(original['x'].values * np.ones([3, 3]), + coords=dict(e=['l', 'm', 'n'], + a=np.linspace(0, 1, 3)), + dims=['e', 'a']), + 'y': xr.DataArray(original['y'].values * np.ones([3, 4, 3]), + coords=dict(e=['l', 'm', 'n'], + b=np.linspace(0, 1, 4), + a=np.linspace(0, 1, 3)), + dims=['e', 'b', 'a'])}, + coords={'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + assert_identical(other_way_expected, other_way) + else: + # In python 3.5, using dim_kwargs should raise a ValueError. 
+ with raises_regex(ValueError, "dim_kwargs isn't"): + original.expand_dims(e=["l", "m", "n"]) + def test_set_index(self): expected = create_test_multiindex() mindex = expected['x'].to_index() From c319c472af723d5a63e160539c16793d083dff02 Mon Sep 17 00:00:00 2001 From: pletcm Date: Thu, 7 Feb 2019 07:21:48 -0800 Subject: [PATCH 17/17] Allow expand_dims() method to support inserting/broadcasting dimensions with size>1 (#2757) * Move enhancement description up to 0.12.1 * use .size attribute to determine the size of a dimension, rather than converting to a list, which can be slow for large iterables * Make using dim_kwargs for python 3.5 illegal -- a ValueError is thrown * dataset.expand_dims() method takes a dict-like object where values represent length of dimensions or coordinates of dimensions * dataarray.expand_dims() method takes a dict-like object where values represent length of dimensions or coordinates of dimensions * Add alternative option to passing a dict to the dim argument, which is now an optional kwarg, passing in each new dimension as its own kwarg * Add expand_dims enhancement from issue 2710 to whats-new.rst * Fix test_dataarray.TestDataArray.test_expand_dims_with_greater_dim_size tests to pass in python 3.5 using ordered dicts instead of regular dicts. This was needed because python 3.5 and earlier did not maintain insertion order for dicts * Restrict core logic to use 'dim' as a dict--it will be converted into a dict on entry if it is a str or a sequence of str * Don't cast dim values (coords) as a list since IndexVariable/Variable will internally convert it into a numpy.ndarray. So just use IndexVariable((k,), v) * TypeErrors should be raised for invalid input types, rather than ValueErrors. 
* Force 'dim' to be OrderedDict for python 3.5 --- doc/whats-new.rst | 23 ++--------- xarray/core/dataarray.py | 39 ++++++++++++++++-- xarray/core/dataset.py | 73 +++++++++++++++++++++++++++------- xarray/tests/test_dataarray.py | 53 +++++++++++++++++++++++- xarray/tests/test_dataset.py | 68 +++++++++++++++++++++++++++++++ 5 files changed, 218 insertions(+), 38 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 68fdf1df8f6..ab934ee9061 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,6 +21,10 @@ v0.12.1 (unreleased) Enhancements ~~~~~~~~~~~~ +- Allow ``expand_dims`` method to support inserting/broadcasting dimensions + with size > 1. (:issue:`2710`) + By `Martin Pletcher `_. + Bug fixes ~~~~~~~~~ @@ -111,25 +115,6 @@ Other enhancements See :ref:`compute.using_coordinates` for the detail. (:issue:`1332`) By `Keisuke Fujii `_. -- :py:meth:`pandas.Series.dropna` is now supported for a - :py:class:`pandas.Series` indexed by a :py:class:`~xarray.CFTimeIndex` - (:issue:`2688`). By `Spencer Clark `_. -- :py:meth:`~xarray.cftime_range` now supports QuarterBegin and QuarterEnd offsets (:issue:`2663`). - By `Jwen Fai Low `_ -- :py:meth:`~xarray.open_dataset` now accepts a ``use_cftime`` argument, which - can be used to require that ``cftime.datetime`` objects are always used, or - never used when decoding dates encoded with a standard calendar. This can be - used to ensure consistent date types are returned when using - :py:meth:`~xarray.open_mfdataset` (:issue:`1263`) and/or to silence - serialization warnings raised if dates from a standard calendar are found to - be outside the :py:class:`pandas.Timestamp`-valid range (:issue:`2754`). By - `Spencer Clark `_. - -- Allow ``expand_dims`` method to support inserting/broadcasting dimensions - with size > 1. (:issue:`2710`) - By `Martin Pletcher `_. - `Spencer Clark `_. - - Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`). By `Kevin Squire `_. 
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index e7e12ae3da4..75f3298104f 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1,4 +1,5 @@ import functools +import sys import warnings from collections import OrderedDict @@ -1138,7 +1139,7 @@ def swap_dims(self, dims_dict): ds = self._to_temp_dataset().swap_dims(dims_dict) return self._from_temp_dataset(ds) - def expand_dims(self, dim, axis=None): + def expand_dims(self, dim=None, axis=None, **dim_kwargs): """Return a new object with an additional axis (or axes) inserted at the corresponding position in the array shape. @@ -1147,21 +1148,53 @@ def expand_dims(self, dim, axis=None): Parameters ---------- - dim : str or sequence of str. + dim : str, sequence of str, dict, or None Dimensions to include on the new variable. - dimensions are inserted with length 1. + If provided as str or sequence of str, then dimensions are inserted + with length 1. If provided as a dict, then the keys are the new + dimensions and the values are either integers (giving the length of + the new dimensions) or sequence/ndarray (giving the coordinates of + the new dimensions). **WARNING** for python 3.5, if ``dim`` is + dict-like, then it must be an ``OrderedDict``. This is to ensure + that the order in which the dims are given is maintained. axis : integer, list (or tuple) of integers, or None Axis position(s) where new axis is to be inserted (position(s) on the result array). If a list (or tuple) of integers is passed, multiple axes are inserted. In this case, dim arguments should be same length list. If axis=None is passed, all the axes will be inserted to the start of the result array. + **dim_kwargs : int or sequence/ndarray + The keywords are arbitrary dimensions being inserted and the values + are either the lengths of the new dims (if int is given), or their + coordinates. Note, this is an alternative to passing a dict to the + dim kwarg and will only be used if dim is None. 
**WARNING** for + python 3.5 ``dim_kwargs`` is not available. Returns ------- expanded : same type as caller This object, but with an additional dimension(s). """ + if isinstance(dim, int): + raise TypeError('dim should be str or sequence of strs or dict') + elif isinstance(dim, str): + dim = OrderedDict(((dim, 1),)) + elif isinstance(dim, (list, tuple)): + if len(dim) != len(set(dim)): + raise ValueError('dims should not contain duplicate values.') + dim = OrderedDict(((d, 1) for d in dim)) + + # TODO: get rid of the below code block when python 3.5 is no longer + # supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + not_ordereddict = dim is not None and not isinstance(dim, OrderedDict) + if not python36_plus and not_ordereddict: + raise TypeError("dim must be an OrderedDict for python <3.6") + elif not python36_plus and dim_kwargs: + raise ValueError("dim_kwargs isn't available for python <3.6") + dim_kwargs = OrderedDict(dim_kwargs) + + dim = either_dict_or_kwargs(dim, dim_kwargs, 'expand_dims') ds = self._to_temp_dataset().expand_dims(dim, axis) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 12c5d139fdc..9dbcd8a8f70 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2329,7 +2329,7 @@ def swap_dims(self, dims_dict, inplace=None): return self._replace_with_new_dims(variables, coord_names, indexes=indexes, inplace=inplace) - def expand_dims(self, dim, axis=None): + def expand_dims(self, dim=None, axis=None, **dim_kwargs): """Return a new object with an additional axis (or axes) inserted at the corresponding position in the array shape. @@ -2338,15 +2338,27 @@ def expand_dims(self, dim, axis=None): Parameters ---------- - dim : str or sequence of str. + dim : str, sequence of str, dict, or None Dimensions to include on the new variable. - dimensions are inserted with length 1. + If provided as str or sequence of str, then dimensions are inserted + with length 1. 
If provided as a dict, then the keys are the new + dimensions and the values are either integers (giving the length of + the new dimensions) or sequence/ndarray (giving the coordinates of + the new dimensions). **WARNING** for python 3.5, if ``dim`` is + dict-like, then it must be an ``OrderedDict``. This is to ensure + that the order in which the dims are given is maintained. axis : integer, list (or tuple) of integers, or None Axis position(s) where new axis is to be inserted (position(s) on the result array). If a list (or tuple) of integers is passed, multiple axes are inserted. In this case, dim arguments should be - the same length list. If axis=None is passed, all the axes will - be inserted to the start of the result array. + same length list. If axis=None is passed, all the axes will be + inserted to the start of the result array. + **dim_kwargs : int or sequence/ndarray + The keywords are arbitrary dimensions being inserted and the values + are either the lengths of the new dims (if int is given), or their + coordinates. Note, this is an alternative to passing a dict to the + dim kwarg and will only be used if dim is None. **WARNING** for + python 3.5 ``dim_kwargs`` is not available. Returns ------- @@ -2354,10 +2366,25 @@ def expand_dims(self, dim, axis=None): This object, but with an additional dimension(s). """ if isinstance(dim, int): - raise ValueError('dim should be str or sequence of strs or dict') + raise TypeError('dim should be str or sequence of strs or dict') + elif isinstance(dim, str): + dim = OrderedDict(((dim, 1),)) + elif isinstance(dim, (list, tuple)): + if len(dim) != len(set(dim)): + raise ValueError('dims should not contain duplicate values.') + dim = OrderedDict(((d, 1) for d in dim)) + + # TODO: get rid of the below code block when python 3.5 is no longer + # supported. 
+ python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + not_ordereddict = dim is not None and not isinstance(dim, OrderedDict) + if not python36_plus and not_ordereddict: + raise TypeError("dim must be an OrderedDict for python <3.6") + elif not python36_plus and dim_kwargs: + raise ValueError("dim_kwargs isn't available for python <3.6") + + dim = either_dict_or_kwargs(dim, dim_kwargs, 'expand_dims') - if isinstance(dim, str): - dim = [dim] if axis is not None and not isinstance(axis, (list, tuple)): axis = [axis] @@ -2376,10 +2403,24 @@ def expand_dims(self, dim, axis=None): '{dim} already exists as coordinate or' ' variable name.'.format(dim=d)) - if len(dim) != len(set(dim)): - raise ValueError('dims should not contain duplicate values.') - variables = OrderedDict() + # If dim is a dict, then ensure that the values are either integers + # or iterables. + for k, v in dim.items(): + if hasattr(v, "__iter__"): + # If the value for the new dimension is an iterable, then + # save the coordinates to the variables dict, and set the + # value within the dim dict to the length of the iterable + # for later use. + variables[k] = xr.IndexVariable((k,), v) + self._coord_names.add(k) + dim[k] = variables[k].size + elif isinstance(v, int): + pass # Do nothing if the dimensions value is just an int + else: + raise TypeError('The value of new dimension {k} must be ' + 'an iterable or an int'.format(k=k)) + for k, v in self._variables.items(): if k not in dim: if k in self._coord_names: # Do not change coordinates @@ -2400,11 +2441,13 @@ def expand_dims(self, dim, axis=None): ' values.') # We need to sort them to make sure `axis` equals to the # axis positions of the result array. 
- zip_axis_dim = sorted(zip(axis_pos, dim)) + zip_axis_dim = sorted(zip(axis_pos, dim.items())) + + all_dims = list(zip(v.dims, v.shape)) + for d, c in zip_axis_dim: + all_dims.insert(d, c) + all_dims = OrderedDict(all_dims) - all_dims = list(v.dims) - for a, d in zip_axis_dim: - all_dims.insert(a, d) variables[k] = v.set_dims(all_dims) else: # If dims includes a label of a non-dimension coordinate, diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 4975071dad8..b1ecf160533 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3,6 +3,7 @@ from collections import OrderedDict from copy import deepcopy from textwrap import dedent +import sys import numpy as np import pandas as pd @@ -1303,7 +1304,7 @@ def test_expand_dims_error(self): coords={'x': np.linspace(0.0, 1.0, 3)}, attrs={'key': 'entry'}) - with raises_regex(ValueError, 'dim should be str or'): + with raises_regex(TypeError, 'dim should be str or'): array.expand_dims(0) with raises_regex(ValueError, 'lengths of dim and axis'): # dims and axis argument should be the same length @@ -1328,6 +1329,16 @@ def test_expand_dims_error(self): array.expand_dims(dim=['y', 'z'], axis=[2, -4]) array.expand_dims(dim=['y', 'z'], axis=[2, 3]) + array = DataArray(np.random.randn(3, 4), dims=['x', 'dim_0'], + coords={'x': np.linspace(0.0, 1.0, 3)}, + attrs={'key': 'entry'}) + with pytest.raises(TypeError): + array.expand_dims(OrderedDict((("new_dim", 3.2),))) + + # Attempt to use both dim and kwargs + with pytest.raises(ValueError): + array.expand_dims(OrderedDict((("d", 4),)), e=4) + def test_expand_dims(self): array = DataArray(np.random.randn(3, 4), dims=['x', 'dim_0'], coords={'x': np.linspace(0.0, 1.0, 3)}, @@ -1392,6 +1403,46 @@ def test_expand_dims_with_scalar_coordinate(self): roundtripped = actual.squeeze(['z'], drop=False) assert_identical(array, roundtripped) + def test_expand_dims_with_greater_dim_size(self): + array = DataArray(np.random.randn(3, 4), 
dims=['x', 'dim_0'], + coords={'x': np.linspace(0.0, 1.0, 3), 'z': 1.0}, + attrs={'key': 'entry'}) + # For python 3.5 and earlier this has to be an ordered dict, to + # maintain insertion order. + actual = array.expand_dims( + OrderedDict((('y', 2), ('z', 1), ('dim_1', ['a', 'b', 'c'])))) + + expected_coords = OrderedDict(( + ('y', [0, 1]), ('z', [1.0]), ('dim_1', ['a', 'b', 'c']), + ('x', np.linspace(0, 1, 3)), ('dim_0', range(4)))) + expected = DataArray(array.values * np.ones([2, 1, 3, 3, 4]), + coords=expected_coords, + dims=list(expected_coords.keys()), + attrs={'key': 'entry'} + ).drop(['y', 'dim_0']) + assert_identical(expected, actual) + + # Test with kwargs instead of passing dict to dim arg. + + # TODO: only the code under the if-statement is needed when python 3.5 + # is no longer supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + if python36_plus: + other_way = array.expand_dims(dim_1=['a', 'b', 'c']) + + other_way_expected = DataArray( + array.values * np.ones([3, 3, 4]), + coords={'dim_1': ['a', 'b', 'c'], + 'x': np.linspace(0, 1, 3), + 'dim_0': range(4), 'z': 1.0}, + dims=['dim_1', 'x', 'dim_0'], + attrs={'key': 'entry'}).drop('dim_0') + assert_identical(other_way_expected, other_way) + else: + # In python 3.5, using dim_kwargs should raise a ValueError. 
+ with raises_regex(ValueError, "dim_kwargs isn't"): + array.expand_dims(e=["l", "m", "n"]) + def test_set_index(self): indexes = [self.mindex.get_level_values(n) for n in self.mindex.names] coords = {idx.name: ('x', idx) for idx in indexes} diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 8e8c6c4b419..75b736239e6 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2033,6 +2033,27 @@ def test_expand_dims_error(self): with raises_regex(ValueError, 'already exists'): original.expand_dims(dim=['z']) + original = Dataset({'x': ('a', np.random.randn(3)), + 'y': (['b', 'a'], np.random.randn(4, 3)), + 'z': ('a', np.random.randn(3))}, + coords={'a': np.linspace(0, 1, 3), + 'b': np.linspace(0, 1, 4), + 'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + with raises_regex(TypeError, 'value of new dimension'): + original.expand_dims(OrderedDict((("d", 3.2),))) + + # TODO: only the code under the if-statement is needed when python 3.5 + # is no longer supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + if python36_plus: + with raises_regex(ValueError, 'both keyword and positional'): + original.expand_dims(OrderedDict((("d", 4),)), e=4) + else: + # In python 3.5, using dim_kwargs should raise a ValueError. + with raises_regex(ValueError, "dim_kwargs isn't"): + original.expand_dims(OrderedDict((("d", 4),)), e=4) + def test_expand_dims(self): original = Dataset({'x': ('a', np.random.randn(3)), 'y': (['b', 'a'], np.random.randn(4, 3))}, @@ -2066,6 +2087,53 @@ def test_expand_dims(self): roundtripped = actual.squeeze('z') assert_identical(original, roundtripped) + # Test expanding one dimension to have size > 1 that doesn't have + # coordinates, and also expanding another dimension to have size > 1 + # that DOES have coordinates. 
+ actual = original.expand_dims( + OrderedDict((("d", 4), ("e", ["l", "m", "n"])))) + + expected = Dataset( + {'x': xr.DataArray(original['x'].values * np.ones([4, 3, 3]), + coords=dict(d=range(4), + e=['l', 'm', 'n'], + a=np.linspace(0, 1, 3)), + dims=['d', 'e', 'a']).drop('d'), + 'y': xr.DataArray(original['y'].values * np.ones([4, 3, 4, 3]), + coords=dict(d=range(4), + e=['l', 'm', 'n'], + b=np.linspace(0, 1, 4), + a=np.linspace(0, 1, 3)), + dims=['d', 'e', 'b', 'a']).drop('d')}, + coords={'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + assert_identical(actual, expected) + + # Test with kwargs instead of passing dict to dim arg. + + # TODO: only the code under the if-statement is needed when python 3.5 + # is no longer supported. + python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 + if python36_plus: + other_way = original.expand_dims(e=["l", "m", "n"]) + other_way_expected = Dataset( + {'x': xr.DataArray(original['x'].values * np.ones([3, 3]), + coords=dict(e=['l', 'm', 'n'], + a=np.linspace(0, 1, 3)), + dims=['e', 'a']), + 'y': xr.DataArray(original['y'].values * np.ones([3, 4, 3]), + coords=dict(e=['l', 'm', 'n'], + b=np.linspace(0, 1, 4), + a=np.linspace(0, 1, 3)), + dims=['e', 'b', 'a'])}, + coords={'c': np.linspace(0, 1, 5)}, + attrs={'key': 'entry'}) + assert_identical(other_way_expected, other_way) + else: + # In python 3.5, using dim_kwargs should raise a ValueError. + with raises_regex(ValueError, "dim_kwargs isn't"): + original.expand_dims(e=["l", "m", "n"]) + def test_set_index(self): expected = create_test_multiindex() mindex = expected['x'].to_index()