From 9221aac92e7cc2c6527b10afa18aba996f55f1fd Mon Sep 17 00:00:00 2001
From: dcherian
Date: Tue, 4 Jan 2022 19:48:41 -0700
Subject: [PATCH 01/15] Vectorize groupby math

---
 xarray/core/groupby.py | 82 ++++++++++++++++++++++++++++--------------
 1 file changed, 56 insertions(+), 26 deletions(-)

diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
index 68db14c29be..6f108a3aa38 100644
--- a/xarray/core/groupby.py
+++ b/xarray/core/groupby.py
@@ -262,6 +262,9 @@ class GroupBy:
         "_stacked_dim",
         "_unique_coord",
         "_dims",
+        "_original_obj",
+        "_original_group",
+        "_bins",
     )
 
     def __init__(
@@ -323,6 +326,11 @@ def __init__(
         if getattr(group, "name", None) is None:
             group.name = "group"
 
+        # save object before any stacking
+        self._original_obj = obj
+        self._original_group = group
+        self._bins = bins
+
         group, obj, stacked_dim, inserted_dims = _ensure_1d(group, obj)
         (group_dim,) = group.dims
 
@@ -476,34 +484,56 @@ def _infer_concat_args(self, applied_example):
 
     def _binary_op(self, other, f, reflexive=False):
         g = f if not reflexive else lambda x, y: f(y, x)
-        applied = self._yield_binary_applied(g, other)
-        return self._combine(applied)
-
-    def _yield_binary_applied(self, func, other):
-        dummy = None
-        for group_value, obj in self:
-            try:
-                other_sel = other.sel(**{self._group.name: group_value})
-            except AttributeError:
-                raise TypeError(
-                    "GroupBy objects only support binary ops "
-                    "when the other argument is a Dataset or "
-                    "DataArray"
+        obj = self._original_obj
+        group = self._original_group
+        name = group.name
+        dim = self._group_dim
+        # import IPython; IPython.core.debugger.set_trace()
+        try:
+            if self._bins is not None:
+                other = other.sel({f"{name}_bins": self._group})
+                if isinstance(group, _DummyGroup):
+                    # When binning by unindexed coordinate we need to reindex obj.
+                    # _full_index is IntervalIndex, so idx will be -1 where
+                    # a value does not belong to any bin. Using IntervalIndex
+                    # accounts for any non-default cut_kwargs passed to the constructor
+                    idx = pd.cut(obj[dim], bins=self._full_index).codes
+                    obj = obj.isel({dim: np.arange(group.size)[idx != -1]})
+            else:
+                if isinstance(group, _DummyGroup):
+                    group = obj[dim]
+                other = other.sel({name: group})
+        except AttributeError:
+            raise TypeError(
+                "GroupBy objects only support binary ops "
+                "when the other argument is a Dataset or "
+                "DataArray"
+            )
+        except (KeyError, ValueError):
+            if name not in other.dims:
+                raise ValueError(
+                    "incompatible dimensions for a grouped "
+                    f"binary operation: the group variable {name!r} "
+                    "is not a dimension on the other argument"
                 )
-            except (KeyError, ValueError):
-                if self._group.name not in other.dims:
-                    raise ValueError(
-                        "incompatible dimensions for a grouped "
-                        f"binary operation: the group variable {self._group.name!r} "
-                        "is not a dimension on the other argument"
-                    )
-                if dummy is None:
-                    dummy = _dummy_copy(other)
-                other_sel = dummy
-
-            result = func(obj, other_sel)
-            yield result
+            # some labels are absent i.e. other is not aligned
+            # so we align by reindexing and then rename dimensions.
+            # TODO: probably need to copy some coordinates over
+            other = (
+                other.reindex({name: group.data})
+                .rename({name: dim})
+                .assign_coords({dim: obj[dim]})
+            )
+
+        result = g(obj, other)
+
+        # backcompat: concat during the "combine" step places
+        # `dim` as the first dimension
+        if dim in result.dims:
+            # guards against self._group_dim being "stacked"
+            result = result.transpose(dim, ...)
+ return result def _maybe_restore_empty_groups(self, combined): """Our index contained empty groups (e.g., from a resampling). If we From 6a158e1cb6c6a2e2674d066bbba7d09b6db6ec1f Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 4 Jan 2022 19:50:56 -0700 Subject: [PATCH 02/15] WIP --- xarray/core/groupby.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 6f108a3aa38..3eeca84ec3c 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -492,7 +492,11 @@ def _binary_op(self, other, f, reflexive=False): # import IPython; IPython.core.debugger.set_trace() try: if self._bins is not None: - other = other.sel({f"{name}_bins": self._group}) + if self._stacked_dim is not None: + group = self._group.unstack() + # idx = pd.factorize(group.data.ravel())[0] + # group_idx = group.copy(data=idx.reshape(group.data.shape)) + other = other.sel({f"{name}_bins": group}) if isinstance(group, _DummyGroup): # When binning by unindexed coordinate we need to reindex obj. # _full_index is IntervalIndex, so idx will be -1 where @@ -528,11 +532,12 @@ def _binary_op(self, other, f, reflexive=False): result = g(obj, other) - # backcompat: concat during the "combine" step places - # `dim` as the first dimension - if dim in result.dims: - # guards against self._group_dim being "stacked" - result = result.transpose(dim, ...) + # backcompat: + for var in set(obj.coords) - set(obj.xindexes): + print(var) + if dim not in obj[var]: + print(f"excluding {dim}, broadcasting {var}") + result[var] = obj[var].reset_coords(drop=True).broadcast_like(result) return result def _maybe_restore_empty_groups(self, combined): From 24980f2a0daecc65f1cc46413711a02c4ca85a34 Mon Sep 17 00:00:00 2001 From: dcherian Date: Thu, 17 Mar 2022 18:07:33 +0530 Subject: [PATCH 03/15] rework tests --- xarray/tests/test_groupby.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 4b6da82bdc7..5cb8fcfadd1 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -1133,26 +1133,28 @@ def change_metadata(x): expected = change_metadata(expected) assert_equal(expected, actual) - def test_groupby_math(self): + @pytest.mark.parametrize("squeeze", [True, False]) + def test_groupby_math_squeeze(self, squeeze): array = self.da - for squeeze in [True, False]: - grouped = array.groupby("x", squeeze=squeeze) + grouped = array.groupby("x", squeeze=squeeze) - expected = array + array.coords["x"] - actual = grouped + array.coords["x"] - assert_identical(expected, actual) + expected = array + array.coords["x"] + actual = grouped + array.coords["x"] + assert_identical(expected, actual) - actual = array.coords["x"] + grouped - assert_identical(expected, actual) + actual = array.coords["x"] + grouped + assert_identical(expected, actual) - ds = array.coords["x"].to_dataset(name="X") - expected = array + ds - actual = grouped + ds - assert_identical(expected, actual) + ds = array.coords["x"].to_dataset(name="X") + expected = array + ds + actual = grouped + ds + assert_identical(expected, actual) - actual = ds + grouped - assert_identical(expected, actual) + actual = ds + grouped + assert_identical(expected, actual) + def test_groupby_math(self): + array = self.da grouped = array.groupby("abc") expected_agg = (grouped.mean(...) 
- np.arange(3)).rename(None) actual = grouped - DataArray(range(3), [("abc", ["a", "b", "c"])]) From 050ea70d65e84d6f719110f4459e431c2fcb0a94 Mon Sep 17 00:00:00 2001 From: dcherian Date: Thu, 17 Mar 2022 22:36:12 +0530 Subject: [PATCH 04/15] working! --- xarray/core/groupby.py | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 3eeca84ec3c..36a65f52b51 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -489,25 +489,28 @@ def _binary_op(self, other, f, reflexive=False): group = self._original_group name = group.name dim = self._group_dim - # import IPython; IPython.core.debugger.set_trace() + try: if self._bins is not None: - if self._stacked_dim is not None: - group = self._group.unstack() - # idx = pd.factorize(group.data.ravel())[0] - # group_idx = group.copy(data=idx.reshape(group.data.shape)) - other = other.sel({f"{name}_bins": group}) - if isinstance(group, _DummyGroup): + group = self._maybe_unstack(self._group) + other = other.sel({f"{name}_bins": self._group}) + # TODO: vectorized indexing bug in .sel; name_bins is still an IndexVariable! + other[f"{name}_bins"] = other[f"{name}_bins"].variable.to_variable() + if name == dim and dim not in obj.xindexes: # When binning by unindexed coordinate we need to reindex obj. # _full_index is IntervalIndex, so idx will be -1 where # a value does not belong to any bin. Using IntervalIndex # accounts for any non-default cut_kwargs passed to the constructor idx = pd.cut(obj[dim], bins=self._full_index).codes - obj = obj.isel({dim: np.arange(group.size)[idx != -1]}) + obj = obj.isel({dim: np.arange(obj[dim].size)[idx != -1]}) + else: + obj = self._obj + else: if isinstance(group, _DummyGroup): group = obj[dim] other = other.sel({name: group}) + except AttributeError: raise TypeError( "GroupBy objects only support binary ops " @@ -523,7 +526,12 @@ def _binary_op(self, other, f, reflexive=False): ) # some labels are absent i.e. other is not aligned # so we align by reindexing and then rename dimensions. - # TODO: probably need to copy some coordinates over + # Broadcast out scalars. + for var in other.coords: + if other[var].ndim == 0: + other[var] = ( + other[var].drop(var).expand_dims({name: other.sizes[name]}) + ) other = ( other.reindex({name: group.data}) .rename({name: dim}) @@ -532,12 +540,13 @@ def _binary_op(self, other, f, reflexive=False): result = g(obj, other) - # backcompat: - for var in set(obj.coords) - set(obj.xindexes): - print(var) - if dim not in obj[var]: - print(f"excluding {dim}, broadcasting {var}") - result[var] = obj[var].reset_coords(drop=True).broadcast_like(result) + if group.ndim > 1: + # backcompat: + for var in set(obj.coords) - set(obj.xindexes): + if set(obj[var].dims) < set(group.dims): + result[var] = obj[var].reset_coords(drop=True).broadcast_like(group) + + result = self._maybe_unstack(result).transpose(*group.dims, ...) 
return result def _maybe_restore_empty_groups(self, combined): From 291656f6e5eee7ef88990dec1f40918c1c0b5756 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 18 Mar 2022 08:04:17 +0530 Subject: [PATCH 05/15] Cleanup --- xarray/core/groupby.py | 80 ++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 9476d73d0b8..2fd97cd9648 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -264,8 +264,6 @@ class GroupBy: "_stacked_dim", "_unique_coord", "_dims", - "_original_obj", - "_original_group", "_bins", ) @@ -328,11 +326,6 @@ def __init__( if getattr(group, "name", None) is None: group.name = "group" - # save object before any stacking - self._original_obj = obj - self._original_group = group - self._bins = bins - group, obj, stacked_dim, inserted_dims = _ensure_1d(group, obj) (group_dim,) = group.dims @@ -409,6 +402,7 @@ def __init__( self._inserted_dims = inserted_dims self._full_index = full_index self._restore_coord_dims = restore_coord_dims + self._bins = bins # cached attributes self._groups = None @@ -486,50 +480,41 @@ def _infer_concat_args(self, applied_example): return coord, dim, positions def _binary_op(self, other, f, reflexive=False): + from .dataarray import DataArray + from .dataset import Dataset + g = f if not reflexive else lambda x, y: f(y, x) - obj = self._original_obj - group = self._original_group - name = group.name + obj = self._obj + name = self._group.name dim = self._group_dim - try: - if self._bins is not None: - group = self._maybe_unstack(self._group) - other = other.sel({f"{name}_bins": self._group}) - # TODO: vectorized indexing bug in .sel; name_bins is still an IndexVariable! - other[f"{name}_bins"] = other[f"{name}_bins"].variable.to_variable() - if name == dim and dim not in obj.xindexes: - # When binning by unindexed coordinate we need to reindex obj. - # _full_index is IntervalIndex, so idx will be -1 where - # a value does not belong to any bin. Using IntervalIndex - # accounts for any non-default cut_kwargs passed to the constructor - idx = pd.cut(obj[dim], bins=self._full_index).codes - obj = obj.isel({dim: np.arange(obj[dim].size)[idx != -1]}) - else: - obj = self._obj - - else: - if isinstance(group, _DummyGroup): - group = obj[dim] - other = other.sel({name: group}) - - except AttributeError: + if not isinstance(other, (Dataset, DataArray)): raise TypeError( "GroupBy objects only support binary ops " "when the other argument is a Dataset or " "DataArray" ) - except (KeyError, ValueError): - if name not in other.dims: - raise ValueError( - "incompatible dimensions for a grouped " - f"binary operation: the group variable {name!r} " - "is not a dimension on the other argument" - ) + + if name not in other.dims: + raise ValueError( + "incompatible dimensions for a grouped " + f"binary operation: the group variable {name!r} " + "is not a dimension on the other argument" + ) + + group = self._group + if isinstance(group, _DummyGroup): + group = obj[dim] + + try: + other = other.sel({name: group}) + except KeyError: # some labels are absent i.e. other is not aligned # so we align by reindexing and then rename dimensions. - # Broadcast out scalars. + + # Broadcast out scalars for backwards compatibility + # TODO: get rid of this. 
for var in other.coords: if other[var].ndim == 0: other[var] = ( @@ -541,15 +526,28 @@ def _binary_op(self, other, f, reflexive=False): .assign_coords({dim: obj[dim]}) ) + if self._bins is not None: + # TODO: vectorized indexing bug in .sel; name_bins is still an IndexVariable! + other[name] = other[name].variable.to_base_variable() + if name == dim and dim not in obj.xindexes: + # When binning by unindexed coordinate we need to reindex obj. + # _full_index is IntervalIndex, so idx will be -1 where + # a value does not belong to any bin. Using IntervalIndex + # accounts for any non-default cut_kwargs passed to the constructor + idx = pd.cut(obj[dim], bins=self._full_index).codes + obj = obj.isel({dim: np.arange(obj[dim].size)[idx != -1]}) + result = g(obj, other) + result = self._maybe_unstack(result) + group = self._maybe_unstack(group) if group.ndim > 1: # backcompat: for var in set(obj.coords) - set(obj.xindexes): if set(obj[var].dims) < set(group.dims): result[var] = obj[var].reset_coords(drop=True).broadcast_like(group) - result = self._maybe_unstack(result).transpose(*group.dims, ...) + result = result.transpose(*group.dims, ...) return result def _maybe_restore_empty_groups(self, combined): From 97974b6e2d737901526011c6ccd47bdc2c9e9980 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 18 Mar 2022 13:29:57 +0530 Subject: [PATCH 06/15] Add benchmarks --- asv_bench/benchmarks/groupby.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 46d6293cc98..a399ef29ef8 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -16,6 +16,8 @@ def setup(self, *args, **kwargs): } ) self.ds2d = self.ds1d.expand_dims(z=10) + self.ds1d_mean = self.ds1d.groupby("b").mean() + self.ds2d_mean = self.ds2d.groupby("b").mean() @parameterized(["ndim"], [(1, 2)]) def time_init(self, ndim): @@ -31,6 +33,12 @@ def time_agg_large_num_groups(self, method, ndim): ds = getattr(self, f"ds{ndim}d") getattr(ds.groupby("b"), method)() + def time_groupby_binary_op_1d(self): + self.ds1d - self.ds1d_mean + + def time_groupby_binary_op_2d(self): + self.ds2d - self.ds2d_mean + class GroupByDask(GroupBy): def setup(self, *args, **kwargs): @@ -40,6 +48,8 @@ def setup(self, *args, **kwargs): self.ds2d = self.ds2d.sel(dim_0=slice(None, None, 2)).chunk( {"dim_0": 50, "z": 5} ) + self.ds1d_mean = self.ds1d.groupby("b").mean() + self.ds2d_mean = self.ds2d.groupby("b").mean() class GroupByPandasDataFrame(GroupBy): @@ -51,6 +61,13 @@ def setup(self, *args, **kwargs): super().setup(**kwargs) self.ds1d = self.ds1d.to_dataframe() + self.ds1d_mean = self.ds1d.groupby("b").mean() + + def time_groupby_binary_op_1d(self): + self.ds1d - self.ds1d_bmean + + def time_groupby_binary_op_2d(self): + raise NotImplementedError class GroupByDaskDataFrame(GroupBy): @@ -63,6 +80,13 @@ def setup(self, *args, **kwargs): requires_dask() super().setup(**kwargs) self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dataframe() + self.ds1d_mean = self.ds1d.groupby("b").mean() + + def time_groupby_binary_op_1d(self): + self.ds1d - self.ds1d_bmean + + def time_groupby_binary_op_2d(self): + raise NotImplementedError class Resample: @@ -74,6 +98,8 @@ def setup(self, *args, **kwargs): coords={"time": pd.date_range("2001-01-01", freq="H", periods=365 * 24)}, ) self.ds2d = self.ds1d.expand_dims(z=10) + self.ds1d_mean = self.ds1d.resample(time="48H").mean() + self.ds2d_mean = self.ds2d.resample(time="48H").mean() @parameterized(["ndim"], 
[(1, 2)]) def time_init(self, ndim): @@ -89,6 +115,12 @@ def time_agg_large_num_groups(self, method, ndim): ds = getattr(self, f"ds{ndim}d") getattr(ds.resample(time="48H"), method)() + def time_groupby_binary_op_1d(self): + self.ds1d - self.ds1d_bmean + + def time_groupby_binary_op_2d(self): + self.ds2d - self.ds2d_mean + class ResampleDask(Resample): def setup(self, *args, **kwargs): From d5127a74d20a8d9395f7f44b1c863d0f3a18df5a Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 18 Mar 2022 13:48:08 +0530 Subject: [PATCH 07/15] Add whats-new note --- doc/whats-new.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 05bdfcf78bb..7b96dac922d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -52,6 +52,12 @@ Bug fixes Documentation ~~~~~~~~~~~~~ +Performance +~~~~~~~~~~~ + +- GroupBy binary operations are now vectorized. + Previously this involved looping over all groups. (:issue:`5804`,:pull:`6160`) + By `Deepak Cherian `_. Internal Changes ~~~~~~~~~~~~~~~~ From 1adbb664b8aefc8f6eba570dc66bba52ffb59f7f Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 18 Mar 2022 15:29:06 +0530 Subject: [PATCH 08/15] fix benchmark --- asv_bench/benchmarks/groupby.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index a399ef29ef8..309a1a903f0 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -64,7 +64,7 @@ def setup(self, *args, **kwargs): self.ds1d_mean = self.ds1d.groupby("b").mean() def time_groupby_binary_op_1d(self): - self.ds1d - self.ds1d_bmean + self.ds1d - self.ds1d_mean def time_groupby_binary_op_2d(self): raise NotImplementedError @@ -83,7 +83,7 @@ def setup(self, *args, **kwargs): self.ds1d_mean = self.ds1d.groupby("b").mean() def time_groupby_binary_op_1d(self): - self.ds1d - self.ds1d_bmean + self.ds1d - self.ds1d_mean def time_groupby_binary_op_2d(self): raise NotImplementedError @@ -116,7 +116,7 @@ def time_agg_large_num_groups(self, method, ndim): getattr(ds.resample(time="48H"), method)() def time_groupby_binary_op_1d(self): - self.ds1d - self.ds1d_bmean + self.ds1d - self.ds1d_mean def time_groupby_binary_op_2d(self): self.ds2d - self.ds2d_mean From a0e686532f4d8e2d61c89af65a9e17f3b4147caf Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 18 Mar 2022 16:15:52 +0530 Subject: [PATCH 09/15] Fix output dimension order --- xarray/core/groupby.py | 7 +++++-- xarray/tests/test_groupby.py | 11 +++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 2fd97cd9648..aa28f469b77 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -518,7 +518,7 @@ def _binary_op(self, other, f, reflexive=False): for var in other.coords: if other[var].ndim == 0: other[var] = ( - other[var].drop(var).expand_dims({name: other.sizes[name]}) + other[var].drop_vars(var).expand_dims({name: other.sizes[name]}) ) other = ( other.reindex({name: group.data}) @@ -547,7 +547,10 @@ def _binary_op(self, other, f, reflexive=False): if set(obj[var].dims) < set(group.dims): result[var] = obj[var].reset_coords(drop=True).broadcast_like(group) - result = result.transpose(*group.dims, ...) + if isinstance(result, Dataset) and isinstance(obj, Dataset): + for var in set(result): + if dim not in obj[var].dims: + result[var] = result[var].transpose(dim, ...) 
return result def _maybe_restore_empty_groups(self, combined): diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 8e1855789ee..4e92f009987 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -857,6 +857,17 @@ def test_groupby_dataset_math_virtual() -> None: assert_identical(actual, expected) +def test_groupby_math_dim_order() -> None: + da = DataArray( + np.ones((10, 10, 12)), + dims=("x", "y", "time"), + coords={"time": pd.date_range("2001-01-01", periods=12, freq="6H")}, + ) + grouped = da.groupby("time.day") + result = grouped - grouped.mean() + assert result.dims == da.dims + + def test_groupby_dataset_nan() -> None: # nan should be excluded from groupby ds = Dataset({"foo": ("x", [1, 2, 3, 4])}, {"bar": ("x", [1, 1, 2, np.nan])}) From fd3031c66c7f24a82d378d784f9c501d8bfee42c Mon Sep 17 00:00:00 2001 From: dcherian Date: Sat, 19 Mar 2022 10:21:44 +0530 Subject: [PATCH 10/15] cleanup --- asv_bench/benchmarks/groupby.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 309a1a903f0..60c960abc55 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -63,9 +63,6 @@ def setup(self, *args, **kwargs): self.ds1d = self.ds1d.to_dataframe() self.ds1d_mean = self.ds1d.groupby("b").mean() - def time_groupby_binary_op_1d(self): - self.ds1d - self.ds1d_mean - def time_groupby_binary_op_2d(self): raise NotImplementedError @@ -82,9 +79,6 @@ def setup(self, *args, **kwargs): self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dataframe() self.ds1d_mean = self.ds1d.groupby("b").mean() - def time_groupby_binary_op_1d(self): - self.ds1d - self.ds1d_mean - def time_groupby_binary_op_2d(self): raise NotImplementedError From bae15d5c250cb0f1e903c404756e12a10f59a56d Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 21 Mar 2022 10:19:38 +0530 Subject: [PATCH 11/15] comment --- xarray/core/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index aa28f469b77..866099c1e54 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -514,7 +514,7 @@ def _binary_op(self, other, f, reflexive=False): # so we align by reindexing and then rename dimensions. # Broadcast out scalars for backwards compatibility - # TODO: get rid of this. + # TODO: get rid of this when fixing GH2145 for var in other.coords: if other[var].ndim == 0: other[var] = ( From cc69d2d41e8766948e3be4c4b5c0b105b744f4ef Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 21 Mar 2022 10:22:02 +0530 Subject: [PATCH 12/15] cleanup --- xarray/core/groupby.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index aecc3d5e45d..056dcd98605 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -526,16 +526,13 @@ def _binary_op(self, other, f, reflexive=False): .assign_coords({dim: obj[dim]}) ) - if self._bins is not None: - # TODO: vectorized indexing bug in .sel; name_bins is still an IndexVariable! - other[name] = other[name].variable.to_base_variable() - if name == dim and dim not in obj.xindexes: - # When binning by unindexed coordinate we need to reindex obj. - # _full_index is IntervalIndex, so idx will be -1 where - # a value does not belong to any bin. 
Using IntervalIndex - # accounts for any non-default cut_kwargs passed to the constructor - idx = pd.cut(obj[dim], bins=self._full_index).codes - obj = obj.isel({dim: np.arange(obj[dim].size)[idx != -1]}) + if self._bins is not None and name == dim and dim not in obj.xindexes: + # When binning by unindexed coordinate we need to reindex obj. + # _full_index is IntervalIndex, so idx will be -1 where + # a value does not belong to any bin. Using IntervalIndex + # accounts for any non-default cut_kwargs passed to the constructor + idx = pd.cut(obj[dim], bins=self._full_index).codes + obj = obj.isel({dim: np.arange(obj[dim].size)[idx != -1]}) result = g(obj, other) From cc5e8a4eb7363d2316350828b719ad232f4fd3b5 Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 21 Mar 2022 10:44:18 +0530 Subject: [PATCH 13/15] cleanup --- xarray/core/groupby.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 056dcd98605..ab6e02cde31 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -486,8 +486,11 @@ def _binary_op(self, other, f, reflexive=False): g = f if not reflexive else lambda x, y: f(y, x) obj = self._obj - name = self._group.name + group = self._group dim = self._group_dim + if isinstance(group, _DummyGroup): + group = obj[dim] + name = group.name if not isinstance(other, (Dataset, DataArray)): raise TypeError( @@ -503,10 +506,6 @@ def _binary_op(self, other, f, reflexive=False): "is not a dimension on the other argument" ) - group = self._group - if isinstance(group, _DummyGroup): - group = obj[dim] - try: other = other.sel({name: group}) except KeyError: @@ -531,8 +530,8 @@ def _binary_op(self, other, f, reflexive=False): # _full_index is IntervalIndex, so idx will be -1 where # a value does not belong to any bin. Using IntervalIndex # accounts for any non-default cut_kwargs passed to the constructor - idx = pd.cut(obj[dim], bins=self._full_index).codes - obj = obj.isel({dim: np.arange(obj[dim].size)[idx != -1]}) + idx = pd.cut(group, bins=self._full_index).codes + obj = obj.isel({dim: np.arange(group.size)[idx != -1]}) result = g(obj, other) @@ -540,6 +539,7 @@ def _binary_op(self, other, f, reflexive=False): group = self._maybe_unstack(group) if group.ndim > 1: # backcompat: + # TODO: get rid of this when fixing GH2145 for var in set(obj.coords) - set(obj.xindexes): if set(obj[var].dims) < set(group.dims): result[var] = obj[var].reset_coords(drop=True).broadcast_like(group) From 169e51a28b1734a58c8fa59d588e38450763e358 Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 21 Mar 2022 10:48:37 +0530 Subject: [PATCH 14/15] More cleanup --- xarray/core/groupby.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index ab6e02cde31..14d6a7773b0 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -507,7 +507,7 @@ def _binary_op(self, other, f, reflexive=False): ) try: - other = other.sel({name: group}) + expanded = other.sel({name: group}) except KeyError: # some labels are absent i.e. other is not aligned # so we align by reindexing and then rename dimensions. 
@@ -519,7 +519,7 @@ def _binary_op(self, other, f, reflexive=False):
                 other[var] = (
                     other[var].drop_vars(var).expand_dims({name: other.sizes[name]})
                 )
-            other = (
+            expanded = (
                 other.reindex({name: group.data})
                 .rename({name: dim})
                 .assign_coords({dim: obj[dim]})
             )
@@ -533,7 +533,7 @@ def _binary_op(self, other, f, reflexive=False):
             idx = pd.cut(group, bins=self._full_index).codes
             obj = obj.isel({dim: np.arange(group.size)[idx != -1]})
 
-        result = g(obj, other)
+        result = g(obj, expanded)
 
         result = self._maybe_unstack(result)
         group = self._maybe_unstack(group)
         if group.ndim > 1:

From bb53249f4ad23e1a1e336207f1b0fcb724a105a1 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Sun, 27 Mar 2022 22:07:23 +0530
Subject: [PATCH 15/15] Update xarray/core/groupby.py

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>
---
 xarray/core/groupby.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
index 14d6a7773b0..952794dcef9 100644
--- a/xarray/core/groupby.py
+++ b/xarray/core/groupby.py
@@ -529,7 +529,7 @@ def _binary_op(self, other, f, reflexive=False):
             # When binning by unindexed coordinate we need to reindex obj.
             # _full_index is IntervalIndex, so idx will be -1 where
             # a value does not belong to any bin. Using IntervalIndex
-            # accounts for any non-default cut_kwargs passed to the constructor
+            # accounts for any non-default cut_kwargs passed to the constructor
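
Illustration (not part of the patch series): taken together, these patches replace the per-group Python loop in `GroupBy._binary_op` with a single vectorized selection/reindex of `other` along the group labels. The sketch below shows the user-facing behaviour the series targets; the dataset and variable names are invented (modelled on the new asv benchmarks and on `test_groupby_math_dim_order` added in patch 09), and it assumes an xarray build with these changes applied.

```python
import numpy as np
import pandas as pd
import xarray as xr

# Toy data modelled on the new benchmark: a 1D variable with a small set of
# repeated group labels "b" along "dim_0".
ds = xr.Dataset(
    {"a": ("dim_0", np.arange(100.0))},
    coords={"b": ("dim_0", np.tile([1, 2, 3, 4], 25))},
)

# Group-wise anomaly. With this series, the subtraction is evaluated in one
# vectorized step (selecting/reindexing `other` along the group labels)
# instead of looping over every group and concatenating the pieces.
anom = ds.groupby("b") - ds.groupby("b").mean()

# groupby_bins arithmetic goes through the same path; `other` is aligned
# against the "<name>_bins" coordinate created by the binning, and values
# that fall outside the bins are dropped (the `idx != -1` handling above).
binned = ds.groupby_bins("a", bins=[0, 25, 50, 75, 100])
binned_anom = binned - binned.mean()

# Input dimension order is preserved (cf. test_groupby_math_dim_order):
# the grouped dimension is no longer moved to the front by the binary op.
da = xr.DataArray(
    np.ones((10, 10, 12)),
    dims=("x", "y", "time"),
    coords={"time": pd.date_range("2001-01-01", periods=12, freq="6H")},
)
grouped = da.groupby("time.day")
assert (grouped - grouped.mean()).dims == da.dims
```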