diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 0275630f4c2..999aaa47843 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -76,6 +76,9 @@ Enhancements Bug fixes ~~~~~~~~~ +- Don't set encoding attributes on bounds variables when writing to netCDF. + (:issue:`2921`) + By `Deepak Cherian `_. - NetCDF4 output: variables with unlimited dimensions must be chunked (not contiguous) on output. (:issue:`1849`) By `James McCreight `_. diff --git a/xarray/conventions.py b/xarray/conventions.py index 3f8f76b08a2..d0d90242426 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -7,6 +7,7 @@ from .coding import strings, times, variables from .coding.variables import SerializationWarning from .core import duck_array_ops, indexing +from .core.common import contains_cftime_datetimes from .core.pycompat import dask_array_type from .core.variable import IndexVariable, Variable, as_variable @@ -355,6 +356,51 @@ def _update_bounds_attributes(variables): bounds_attrs.setdefault('calendar', attrs['calendar']) +def _update_bounds_encoding(variables): + """Adds time encoding to time bounds variables. + + Variables handling time bounds ("Cell boundaries" in the CF + conventions) do not necessarily carry the necessary attributes to be + decoded. This copies the encoding from the time variable to the + associated bounds variable so that we write CF-compliant files. + + See Also: + + http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/ + cf-conventions.html#cell-boundaries + + https://github.com/pydata/xarray/issues/2565 + """ + + # For all time variables with bounds + for v in variables.values(): + attrs = v.attrs + encoding = v.encoding + has_date_units = 'units' in encoding and 'since' in encoding['units'] + is_datetime_type = (np.issubdtype(v.dtype, np.datetime64) or + contains_cftime_datetimes(v)) + + if (is_datetime_type and not has_date_units and + 'bounds' in attrs and attrs['bounds'] in variables): + warnings.warn("Variable '{0}' has datetime type and a " + "bounds variable but {0}.encoding does not have " + "units specified. The units encodings for '{0}' " + "and '{1}' will be determined independently " + "and may not be equal, counter to CF-conventions. " + "If this is a concern, specify a units encoding for " + "'{0}' before writing to a file." + .format(v.name, attrs['bounds']), + UserWarning) + + if has_date_units and 'bounds' in attrs: + if attrs['bounds'] in variables: + bounds_encoding = variables[attrs['bounds']].encoding + bounds_encoding.setdefault('units', encoding['units']) + if 'calendar' in encoding: + bounds_encoding.setdefault('calendar', + encoding['calendar']) + + def decode_cf_variables(variables, attributes, concat_characters=True, mask_and_scale=True, decode_times=True, decode_coords=True, drop_variables=None, @@ -492,8 +538,6 @@ def cf_decoder(variables, attributes, """ Decode a set of CF encoded variables and attributes. - See Also, decode_cf_variable - Parameters ---------- variables : dict @@ -515,6 +559,10 @@ def cf_decoder(variables, attributes, A dictionary mapping from variable name to xarray.Variable objects. decoded_attributes : dict A dictionary mapping from attribute name to values. + + See also + -------- + decode_cf_variable """ variables, attributes, _ = decode_cf_variables( variables, attributes, concat_characters, mask_and_scale, decode_times) @@ -595,14 +643,12 @@ def encode_dataset_coordinates(dataset): def cf_encoder(variables, attributes): """ - A function which takes a dicts of variables and attributes - and encodes them to conform to CF conventions as much - as possible. This includes masking, scaling, character - array handling, and CF-time encoding. - - Decode a set of CF encoded variables and attributes. + Encode a set of CF encoded variables and attributes. + Takes a dicts of variables and attributes and encodes them + to conform to CF conventions as much as possible. + This includes masking, scaling, character array handling, + and CF-time encoding. - See Also, decode_cf_variable Parameters ---------- @@ -618,8 +664,27 @@ def cf_encoder(variables, attributes): encoded_attributes : dict A dictionary mapping from attribute name to value - See also: encode_cf_variable + See also + -------- + decode_cf_variable, encode_cf_variable """ + + # add encoding for time bounds variables if present. + _update_bounds_encoding(variables) + new_vars = OrderedDict((k, encode_cf_variable(v, name=k)) for k, v in variables.items()) + + # Remove attrs from bounds variables (issue #2921) + for var in new_vars.values(): + bounds = var.attrs['bounds'] if 'bounds' in var.attrs else None + if bounds and bounds in new_vars: + # see http://cfconventions.org/cf-conventions/cf-conventions.html#cell-boundaries # noqa + for attr in ['units', 'standard_name', 'axis', 'positive', + 'calendar', 'long_name', 'leap_month', 'leap_year', + 'month_lengths']: + if attr in new_vars[bounds].attrs and attr in var.attrs: + if new_vars[bounds].attrs[attr] == var.attrs[attr]: + new_vars[bounds].attrs.pop(attr) + return new_vars, attributes diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index e71181f52c9..89190eee590 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2486,6 +2486,27 @@ def test_attrs_mfdataset(self): 'no attribute'): actual.test2 + @pytest.mark.xfail(reason='mfdataset loses encoding currently.') + def test_encoding_mfdataset(self): + original = Dataset({'foo': ('t', np.random.randn(10)), + 't': ('t', pd.date_range(start='2010-01-01', + periods=10, + freq='1D'))}) + original.t.encoding['units'] = 'days since 2010-01-01' + + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + ds1 = original.isel(t=slice(5)) + ds2 = original.isel(t=slice(5, 10)) + ds1.t.encoding['units'] = 'days since 2010-01-01' + ds2.t.encoding['units'] = 'days since 2000-01-01' + ds1.to_netcdf(tmp1) + ds2.to_netcdf(tmp2) + with open_mfdataset([tmp1, tmp2]) as actual: + assert actual.t.encoding['units'] == original.t.encoding['units'] # noqa + assert actual.t.encoding['units'] == ds1.t.encoding['units'] # noqa + assert actual.t.encoding['units'] != ds2.t.encoding['units'] # noqa + def test_preprocess_mfdataset(self): original = Dataset({'foo': ('x', np.random.randn(10))}) with create_tmp_file() as tmp: diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 421637dd7e3..7b2ba5f22c8 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -5,11 +5,11 @@ import pandas as pd import pytest -from xarray import DataArray, Variable, coding, decode_cf +from xarray import DataArray, Dataset, Variable, coding, decode_cf from xarray.coding.times import ( _import_cftime, cftime_to_nptime, decode_cf_datetime, encode_cf_datetime) from xarray.coding.variables import SerializationWarning -from xarray.conventions import _update_bounds_attributes +from xarray.conventions import _update_bounds_attributes, cf_encoder from xarray.core.common import contains_cftime_datetimes from xarray.testing import assert_equal @@ -671,6 +671,51 @@ def test_decode_cf_time_bounds(): _update_bounds_attributes(ds.variables) +@requires_cftime_or_netCDF4 +def test_encode_time_bounds(): + + time = pd.date_range('2000-01-16', periods=1) + time_bounds = pd.date_range('2000-01-01', periods=2, freq='MS') + ds = Dataset(dict(time=time, time_bounds=time_bounds)) + ds.time.attrs = {'bounds': 'time_bounds'} + ds.time.encoding = {'calendar': 'noleap', + 'units': 'days since 2000-01-01'} + + expected = dict() + # expected['time'] = Variable(data=np.array([15]), dims=['time']) + expected['time_bounds'] = Variable(data=np.array([0, 31]), + dims=['time_bounds']) + + encoded, _ = cf_encoder(ds.variables, ds.attrs) + assert_equal(encoded['time_bounds'], expected['time_bounds']) + assert 'calendar' not in encoded['time_bounds'].attrs + assert 'units' not in encoded['time_bounds'].attrs + + # if time_bounds attrs are same as time attrs, it doesn't matter + ds.time_bounds.encoding = {'calendar': 'noleap', + 'units': 'days since 2000-01-01'} + encoded, _ = cf_encoder({k: ds[k] for k in ds.variables}, + ds.attrs) + assert_equal(encoded['time_bounds'], expected['time_bounds']) + assert 'calendar' not in encoded['time_bounds'].attrs + assert 'units' not in encoded['time_bounds'].attrs + + # for CF-noncompliant case of time_bounds attrs being different from + # time attrs; preserve them for faithful roundtrip + ds.time_bounds.encoding = {'calendar': 'noleap', + 'units': 'days since 1849-01-01'} + encoded, _ = cf_encoder({k: ds[k] for k in ds.variables}, + ds.attrs) + with pytest.raises(AssertionError): + assert_equal(encoded['time_bounds'], expected['time_bounds']) + assert 'calendar' not in encoded['time_bounds'].attrs + assert encoded['time_bounds'].attrs['units'] == ds.time_bounds.encoding['units'] # noqa + + ds.time.encoding = {} + with pytest.warns(UserWarning): + cf_encoder(ds.variables, ds.attrs) + + @pytest.fixture(params=_ALL_CALENDARS) def calendar(request): return request.param