
Commit fbb4f06

Fixes for compat='no_conflicts' and open_mfdataset (#1007)
* Fixes for compat='no_conflicts' and open_mfdataset
  - `xarray.merge` now uses `compat='no_conflicts'` by default.
  - `compat='no_conflicts'` preserves attributes.
  - `Variable.no_conflicts` broadcasts arguments, making it a strictly more relaxed check than `broadcast_equals`.
  - Allow `concat_dim=None` in `open_mfdataset` and `auto_combine` to disable trying to infer a dimension for concatenating.
  - Add `compat` as an argument to `open_mfdataset` to allow more control over merging.
* Fix what's new
* Updates to tests/doc for auto_combine
1 parent c1eddaf commit fbb4f06
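
A minimal sketch of the headline change (assumes xarray at this commit plus numpy; the dataset contents are invented for illustration):

    import numpy as np
    import xarray as xr

    # Two datasets whose variable 'a' overlaps but agrees wherever both
    # values are non-null.
    ds1 = xr.Dataset({'a': ('x', [1.0, 2.0, np.nan])})
    ds2 = xr.Dataset({'a': ('x', [1.0, np.nan, 3.0])})

    # With the new default compat='no_conflicts', the merge succeeds and
    # returns the union of the non-null values: [1.0, 2.0, 3.0].
    merged = xr.merge([ds1, ds2])
    print(merged['a'].values)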

File tree

11 files changed: +221, -87 lines


doc/combining.rst
Lines changed: 2 additions & 0 deletions

@@ -193,6 +193,8 @@ numpy):
 Note that ``NaN`` does not compare equal to ``NaN`` in element-wise comparison;
 you may need to deal with missing values explicitly.
 
+.. _combining.no_conflicts:
+
 Merging with 'no_conflicts'
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

doc/whats-new.rst
Lines changed: 13 additions & 2 deletions

@@ -21,6 +21,11 @@ v0.9.0 (unreleased)
 Breaking changes
 ~~~~~~~~~~~~~~~~
 
+- The default behavior of ``merge`` is now ``compat='no_conflicts'``, so some
+  merges will now succeed in cases that previously raised
+  ``xarray.MergeError``. Set ``compat='broadcast_equals'`` to restore the
+  previous default.
+
 Deprecations
 ~~~~~~~~~~~~
 
@@ -59,8 +64,14 @@ By `Robin Wilson <https://github.com/robintw>`_.
 
 - Added the ``compat`` option ``'no_conflicts'`` to ``merge``, allowing the
   combination of xarray objects with disjoint (:issue:`742`) or
-  overlapping (:issue:`835`) coordinates as long as any present data agrees.
-  By `Johnnie Gray <https://github.com/jcmgray>`_.
+  overlapping (:issue:`835`) coordinates as long as all present data agrees.
+  By `Johnnie Gray <https://github.com/jcmgray>`_. See
+  :ref:`combining.no_conflicts` for more details.
+
+- It is now possible to set ``concat_dim=None`` explicitly in
+  :py:func:`~xarray.open_mfdataset` to disable inferring a dimension along
+  which to concatenate.
+  By `Stephan Hoyer <https://github.com/shoyer>`_.
 
 Bug fixes
 ~~~~~~~~~
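
The breaking-change entry above can be exercised directly; a small sketch (dataset contents invented for illustration):

    import numpy as np
    import xarray as xr

    ds1 = xr.Dataset({'a': ('x', [1.0, np.nan])})
    ds2 = xr.Dataset({'a': ('x', [np.nan, 2.0])})

    xr.merge([ds1, ds2])  # succeeds under the new default, compat='no_conflicts'

    try:
        # Opt back into the pre-0.9 default explicitly.
        xr.merge([ds1, ds2], compat='broadcast_equals')
    except xr.MergeError:
        print('conflicting values, as before this change')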

xarray/backends/api.py
Lines changed: 27 additions & 4 deletions

@@ -379,8 +379,12 @@ def close(self):
         f.close()
 
 
-def open_mfdataset(paths, chunks=None, concat_dim=None, preprocess=None,
-                   engine=None, lock=None, **kwargs):
+_CONCAT_DIM_DEFAULT = '__infer_concat_dim__'
+
+
+def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
+                   compat='no_conflicts', preprocess=None, engine=None,
+                   lock=None, **kwargs):
     """Open multiple files as a single dataset.
 
     Experimental. Requires dask to be installed.
@@ -397,12 +401,28 @@ def open_mfdataset(paths, chunks=None, concat_dim=None, preprocess=None,
         By default, chunks will be chosen to load entire input files into
         memory at once. This has a major impact on performance: please see the
         full documentation for more details.
-    concat_dim : str or DataArray or Index, optional
+    concat_dim : None, str, DataArray or Index, optional
         Dimension to concatenate files along. This argument is passed on to
         :py:func:`xarray.auto_combine` along with the dataset objects. You only
         need to provide this argument if the dimension along which you want to
         concatenate is not a dimension in the original datasets, e.g., if you
         want to stack a collection of 2D arrays along a third dimension.
+        By default, xarray attempts to infer this argument by examining
+        component files. Set ``concat_dim=None`` explicitly to disable
+        concatenation.
+    compat : {'identical', 'equals', 'broadcast_equals',
+              'no_conflicts'}, optional
+        String indicating how to compare variables of the same name for
+        potential conflicts when merging:
+
+        - 'broadcast_equals': all values must be equal when variables are
+          broadcast against each other to ensure common dimensions.
+        - 'equals': all values and dimensions must be the same.
+        - 'identical': all values, dimensions and attributes must be the
+          same.
+        - 'no_conflicts': only values which are not null in both datasets
+          must be equal. The returned dataset then contains the combination
+          of all non-null values.
     preprocess : callable, optional
         If provided, call this function on each dataset prior to concatenation.
     engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio'}, optional
@@ -440,7 +460,10 @@ def open_mfdataset(paths, chunks=None, concat_dim=None, preprocess=None,
     if preprocess is not None:
         datasets = [preprocess(ds) for ds in datasets]
 
-    combined = auto_combine(datasets, concat_dim=concat_dim)
+    if concat_dim is _CONCAT_DIM_DEFAULT:
+        combined = auto_combine(datasets, compat=compat)
+    else:
+        combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat)
     combined._file_obj = _MultiFileCloser(file_objs)
     return combined
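
A usage sketch of the new ``open_mfdataset`` arguments (assumes dask and a netCDF backend are installed; the file names are hypothetical and only written out here to keep the example self-contained):

    import numpy as np
    import xarray as xr

    xr.Dataset({'a': ('x', [1.0, np.nan])}).to_netcdf('part1.nc')
    xr.Dataset({'a': ('x', [np.nan, 2.0])}).to_netcdf('part2.nc')

    # concat_dim=None disables inferring a dimension to concatenate along;
    # the component datasets are only merged, using the default
    # compat='no_conflicts'.
    with xr.open_mfdataset(['part1.nc', 'part2.nc'], concat_dim=None) as ds:
        print(ds['a'].values)  # [1. 2.]

    # The stricter check can still be requested explicitly:
    # xr.open_mfdataset(['part1.nc', 'part2.nc'], concat_dim=None,
    #                   compat='broadcast_equals')  # would raise MergeError here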

xarray/core/combine.py
Lines changed: 37 additions & 20 deletions

@@ -329,28 +329,25 @@ def _auto_concat(datasets, dim=None):
     return concat(datasets, dim=dim)
 
 
-def auto_combine(datasets, concat_dim=None):
+_CONCAT_DIM_DEFAULT = '__infer_concat_dim__'
+
+
+def auto_combine(datasets,
+                 concat_dim=_CONCAT_DIM_DEFAULT,
+                 compat='no_conflicts'):
     """Attempt to auto-magically combine the given datasets into one.
 
     This method attempts to combine a list of datasets into a single entity by
     inspecting metadata and using a combination of concat and merge.
 
-    It does not concatenate along more than one dimension or align or sort data
-    under any circumstances. It will fail in complex cases, for which you
-    should use ``concat`` and ``merge`` explicitly.
-
-    When ``auto_combine`` may succeed:
+    It does not concatenate along more than one dimension or sort data under any
+    circumstances. It does align coordinates, but different variables on
+    datasets can cause it to fail under some scenarios. In complex cases, you
+    may need to clean up your data and use ``concat``/``merge`` explicitly.
 
-    * You have N years of data and M data variables. Each combination of a
-      distinct time period and test of data variables is saved its own dataset.
-
-    Examples of when ``auto_combine`` fails:
-
-    * In the above scenario, one file is missing, containing the data for one
-      year's data for one variable.
-    * In the most recent year, there is an additional data variable.
-    * Your data includes "time" and "station" dimensions, and each year's data
-      has a different set of stations.
+    ``auto_combine`` works well if you have N years of data and M data
+    variables, and each combination of a distinct time period and set of data
+    variables is saved its own dataset.
 
     Parameters
     ----------
@@ -362,6 +359,22 @@ def auto_combine(datasets, concat_dim=None):
        dimension along which you want to concatenate is not a dimension in
        the original datasets, e.g., if you want to stack a collection of
        2D arrays along a third dimension.
+        By default, xarray attempts to infer this argument by examining
+        component files. Set ``concat_dim=None`` explicitly to disable
+        concatenation.
+    compat : {'identical', 'equals', 'broadcast_equals',
+              'no_conflicts'}, optional
+        String indicating how to compare variables of the same name for
+        potential conflicts:
+
+        - 'broadcast_equals': all values must be equal when variables are
+          broadcast against each other to ensure common dimensions.
+        - 'equals': all values and dimensions must be the same.
+        - 'identical': all values, dimensions and attributes must be the
+          same.
+        - 'no_conflicts': only values which are not null in both datasets
+          must be equal. The returned dataset then contains the combination
+          of all non-null values.
 
     Returns
     -------
@@ -373,8 +386,12 @@ def auto_combine(datasets, concat_dim=None):
     Dataset.merge
     """
     from toolz import itertoolz
-    grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)),
-                                datasets).values()
-    concatenated = [_auto_concat(ds, dim=concat_dim) for ds in grouped]
-    merged = merge(concatenated)
+    if concat_dim is not None:
+        dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
+        grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)),
+                                    datasets).values()
+        concatenated = [_auto_concat(ds, dim=dim) for ds in grouped]
+    else:
+        concatenated = datasets
+    merged = merge(concatenated, compat=compat)
     return merged
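
A sketch of how the reworked ``auto_combine`` behaves, using small in-memory datasets invented for illustration:

    import xarray as xr

    # Two time slices holding the same data variable, saved as separate
    # datasets: auto_combine groups them by their data variables, infers 't'
    # as the concatenation dimension, then merges with compat='no_conflicts'.
    t0 = xr.Dataset({'temp': ('t', [10.0])}, {'t': [0]})
    t1 = xr.Dataset({'temp': ('t', [11.0])}, {'t': [1]})
    combined = xr.auto_combine([t0, t1])
    print(combined['temp'].values)  # [10. 11.]

    # With concat_dim=None no concatenation is attempted at all; disjoint
    # datasets are simply merged.
    merged_only = xr.auto_combine([xr.Dataset({'a': 1}), xr.Dataset({'b': 2})],
                                  concat_dim=None)
    print(merged_only)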

xarray/core/dataset.py
Lines changed: 4 additions & 3 deletions

@@ -1439,8 +1439,8 @@ def update(self, other, inplace=True):
         return self._replace_vars_and_dims(variables, coord_names, dims,
                                            inplace=inplace)
 
-    def merge(self, other, inplace=False, overwrite_vars=set(),
-              compat='broadcast_equals', join='outer'):
+    def merge(self, other, inplace=False, overwrite_vars=frozenset(),
+              compat='no_conflicts', join='outer'):
         """Merge the arrays of two datasets into a single dataset.
 
         This method generally not allow for overriding data, with the exception
@@ -1490,7 +1490,8 @@ def merge(self, other, inplace=False, overwrite_vars=set(),
             If any variables conflict (see ``compat``).
         """
         variables, coord_names, dims = dataset_merge_method(
-            self, other, overwrite_vars, compat=compat, join=join)
+            self, other, overwrite_vars=overwrite_vars, compat=compat,
+            join=join)
 
         return self._replace_vars_and_dims(variables, coord_names, dims,
                                            inplace=inplace)
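
``Dataset.merge`` picks up the same new default; a brief sketch (values invented for illustration):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({'a': ('x', [1.0, np.nan])})
    other = xr.Dataset({'a': ('x', [np.nan, 2.0]), 'b': ('x', [3.0, 4.0])})

    # compat now defaults to 'no_conflicts', so overlapping but agreeing
    # values are combined instead of raising MergeError.
    merged = ds.merge(other)
    print(merged['a'].values)  # [1. 2.]

    # The previous, stricter default is still available:
    # ds.merge(other, compat='broadcast_equals')  # would raise MergeError here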

xarray/core/merge.py
Lines changed: 21 additions & 6 deletions

@@ -82,7 +82,9 @@ def unique_variable(name, variables, compat='broadcast_equals'):
                          'first value: %r\nsecond value: %r'
                          % (name, out, var))
     if combine_method:
+        # TODO: add preservation of attrs into fillna
         out = getattr(out, combine_method)(var)
+        out.attrs = var.attrs
 
     return out
 
@@ -397,8 +399,12 @@ def merge_data_and_coords(data, coords, compat='broadcast_equals',
     return merge_core(objs, compat, join, explicit_coords=explicit_coords)
 
 
-def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None,
-               explicit_coords=None, indexes=None):
+def merge_core(objs,
+               compat='broadcast_equals',
+               join='outer',
+               priority_arg=None,
+               explicit_coords=None,
+               indexes=None):
     """Core logic for merging labeled objects.
 
     This is not public API.
@@ -466,7 +472,7 @@ def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None,
     return variables, coord_names, dict(dims)
 
 
-def merge(objects, compat='broadcast_equals', join='outer'):
+def merge(objects, compat='no_conflicts', join='outer'):
     """Merge any number of xarray objects into a single Dataset as variables.
 
     Parameters
@@ -476,7 +482,17 @@ def merge(objects, compat='broadcast_equals', join='outer'):
         DataArray objects, they must have a name.
     compat : {'identical', 'equals', 'broadcast_equals',
              'no_conflicts'}, optional
-        Compatibility checks to use when merging variables.
+        String indicating how to compare variables of the same name for
+        potential conflicts:
+
+        - 'broadcast_equals': all values must be equal when variables are
+          broadcast against each other to ensure common dimensions.
+        - 'equals': all values and dimensions must be the same.
+        - 'identical': all values, dimensions and attributes must be the
+          same.
+        - 'no_conflicts': only values which are not null in both datasets
+          must be equal. The returned dataset then contains the combination
+          of all non-null values.
     join : {'outer', 'inner', 'left', 'right'}, optional
         How to combine objects with different indexes.
 
@@ -521,8 +537,7 @@ def merge(objects, compat='broadcast_equals', join='outer'):
     return merged
 
 
-def dataset_merge_method(dataset, other, overwrite_vars=frozenset(),
-                         compat='broadcast_equals', join='outer'):
+def dataset_merge_method(dataset, other, overwrite_vars, compat, join):
    """Guts of the Dataset.merge method."""
 
     # we are locked into supporting overwrite_vars for the Dataset.merge
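
One observable effect of the ``unique_variable`` change is that attributes survive a 'no_conflicts' combine; a small sketch (the attribute name is invented):

    import numpy as np
    import xarray as xr

    a = xr.Dataset({'v': ('x', [1.0, np.nan])})
    b = xr.Dataset({'v': ('x', [np.nan, 2.0])})
    a['v'].attrs['units'] = 'm'
    b['v'].attrs['units'] = 'm'

    # The overlapping variables are combined via fillna; the attrs of the
    # combined variable are now copied back rather than silently dropped.
    out = xr.merge([a, b])
    print(out['v'].attrs)  # {'units': 'm'}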

xarray/core/variable.py
Lines changed: 7 additions & 18 deletions

@@ -974,11 +974,7 @@ def concat(cls, variables, dim='concat_dim', positions=None,
 
         return cls(dims, data, attrs)
 
-    def _data_equals(self, other):
-        return (self._data is other._data or
-                ops.array_equiv(self.data, other.data))
-
-    def equals(self, other):
+    def equals(self, other, equiv=ops.array_equiv):
         """True if two Variables have the same dimensions and values;
         otherwise False.
 
@@ -990,11 +986,13 @@ def equals(self, other):
         """
         other = getattr(other, 'variable', other)
         try:
-            return (self.dims == other.dims and self._data_equals(other))
+            return (self.dims == other.dims and
+                    (self._data is other._data or
+                     equiv(self.data, other.data)))
         except (TypeError, AttributeError):
             return False
 
-    def broadcast_equals(self, other):
+    def broadcast_equals(self, other, equiv=ops.array_equiv):
         """True if two Variables have the values after being broadcast against
         each other; otherwise False.
 
@@ -1005,7 +1003,7 @@ def broadcast_equals(self, other):
             self, other = broadcast_variables(self, other)
         except (ValueError, AttributeError):
             return False
-        return self.equals(other)
+        return self.equals(other, equiv=equiv)
 
     def identical(self, other):
         """Like equals, but also checks attributes.
@@ -1016,23 +1014,14 @@ def identical(self, other):
         except (TypeError, AttributeError):
             return False
 
-    def _data_no_conflicts(self, other):
-        return (self._data is other._data or
-                ops.array_notnull_equiv(self.data, other.data))
-
     def no_conflicts(self, other):
         """True if the intersection of two Variable's non-null data is
         equal; otherwise false.
 
         Variables can thus still be equal if there are locations where either,
         or both, contain NaN values.
         """
-        other = getattr(other, 'variable', other)
-        try:
-            return (self.dims == other.dims and
-                    self._data_no_conflicts(other))
-        except (TypeError, AttributeError):
-            return False
+        return self.broadcast_equals(other, equiv=ops.array_notnull_equiv)
 
     @property
     def real(self):
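
The refactored equality helpers make ``Variable.no_conflicts`` a broadcasting, null-aware check; a sketch with invented values:

    import numpy as np
    import xarray as xr

    v1 = xr.Variable(('x',), [1.0, np.nan, 3.0])
    v2 = xr.Variable(('x', 'y'), [[1.0, 1.0], [2.0, 2.0], [np.nan, np.nan]])

    # no_conflicts is now broadcast_equals with a null-aware equivalence
    # function, so the arguments are broadcast against each other and NaNs in
    # either operand never count as conflicts.
    print(v1.no_conflicts(v2))      # True: values agree wherever both are non-null
    print(v1.broadcast_equals(v2))  # False: non-NaN vs NaN positions differ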

xarray/test/test_backends.py
Lines changed: 9 additions & 0 deletions

@@ -942,6 +942,15 @@ def test_open_and_do_math(self):
         actual = 1.0 * ds
         self.assertDatasetAllClose(original, actual)
 
+    def test_open_mfdataset_concat_dim_none(self):
+        with create_tmp_file() as tmp1:
+            with create_tmp_file() as tmp2:
+                data = Dataset({'x': 0})
+                data.to_netcdf(tmp1)
+                Dataset({'x': np.nan}).to_netcdf(tmp2)
+                with open_mfdataset([tmp1, tmp2], concat_dim=None) as actual:
+                    self.assertDatasetIdentical(data, actual)
+
     def test_open_dataset(self):
         original = Dataset({'foo': ('x', np.random.randn(10))})
         with create_tmp_file() as tmp:
