Skip to content
Merged
14 changes: 14 additions & 0 deletions asv_bench/benchmarks/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,20 @@ def time_rolling_offset(self, method):
getattr(self.groupby_roll_offset, method)()


class GroupbyLargeGroups:
    """Benchmark ``groupby(...).rolling(...)`` on a frame with few, large groups."""

    # https://github.com/pandas-dev/pandas/issues/38038
    # specific example where the rolling operation on a larger dataframe
    # is relatively cheap (few but large groups), but creation of
    # MultiIndex of result can be expensive

    def setup(self):
        """Build a 100000-row frame whose column "A" alternates between 1 and 2."""
        N = 100000
        # Integer floor division keeps the repeat count an int without a
        # float round-trip.
        self.df = pd.DataFrame({"A": [1, 2] * (N // 2), "B": np.random.randn(N)})

    def time_rolling_multiindex_creation(self):
        """Time a grouped rolling mean over column "A" with a window of 3."""
        self.df.groupby("A").rolling(3).mean()


class GroupbyEWM:

params = ["cython", "numba"]
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.5.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Fixed regressions
- Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`)
- Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`).
- Fixed performance regression for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`)
- Fixed performance regression in ``df.groupby(..).rolling(..)`` (:issue:`38038`)
- Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`)

.. ---------------------------------------------------------------------------
Expand Down
38 changes: 22 additions & 16 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@

from pandas.core.aggregation import aggregate
from pandas.core.base import DataError, SelectionMixin
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.groupby.base import GotItemMixin, ShallowMixin
from pandas.core.indexes.api import Index, MultiIndex
Expand Down Expand Up @@ -791,22 +790,29 @@ def _apply(
# Our result will have still kept the column in the result
result = result.drop(columns=column_keys, errors="ignore")

result_index_data = []
for key, values in self._groupby.grouper.indices.items():
for value in values:
data = [
*com.maybe_make_list(key),
*com.maybe_make_list(
grouped_object_index[value]
if grouped_object_index is not None
else []
),
]
result_index_data.append(tuple(data))

result_index = MultiIndex.from_tuples(
result_index_data, names=result_index_names
codes = self._groupby.grouper.codes
levels = self._groupby.grouper.levels

group_indices = self._groupby.grouper.indices.values()
if group_indices:
indexer = np.concatenate(list(group_indices))
else:
indexer = np.array([], dtype=np.intp)
codes = [c.take(indexer) for c in codes]

# if the index of the original dataframe needs to be preserved, append
# this index (but reordered) to the codes/levels from the groupby
if grouped_object_index is not None:
idx = grouped_object_index.take(indexer)
if not isinstance(idx, MultiIndex):
idx = MultiIndex.from_arrays([idx])
codes.extend(list(idx.codes))
levels.extend(list(idx.levels))

result_index = MultiIndex(
levels, codes, names=result_index_names, verify_integrity=False
)

result.index = result_index
return result

Expand Down
79 changes: 76 additions & 3 deletions pandas/tests/window/test_groupby.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
import numpy as np
import pytest

from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, to_datetime
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
Timestamp,
date_range,
to_datetime,
)
import pandas._testing as tm
from pandas.api.indexers import BaseIndexer
from pandas.core.groupby.groupby import get_groupby
Expand Down Expand Up @@ -418,12 +426,23 @@ def test_groupby_rolling_empty_frame(self):
# GH 36197
expected = DataFrame({"s1": []})
result = expected.groupby("s1").rolling(window=1).sum()
expected.index = MultiIndex.from_tuples([], names=["s1", None])
# GH-38057 from_tuples gives empty object dtype, we now get float/int levels
# expected.index = MultiIndex.from_tuples([], names=["s1", None])
expected.index = MultiIndex.from_product(
[Index([], dtype="float64"), Index([], dtype="int64")], names=["s1", None]
)
tm.assert_frame_equal(result, expected)

expected = DataFrame({"s1": [], "s2": []})
result = expected.groupby(["s1", "s2"]).rolling(window=1).sum()
expected.index = MultiIndex.from_tuples([], names=["s1", "s2", None])
expected.index = MultiIndex.from_product(
[
Index([], dtype="float64"),
Index([], dtype="float64"),
Index([], dtype="int64"),
],
names=["s1", "s2", None],
)
tm.assert_frame_equal(result, expected)

def test_groupby_rolling_string_index(self):
Expand Down Expand Up @@ -567,6 +586,60 @@ def test_groupby_rolling_index_level_and_column_label(self):
)
tm.assert_frame_equal(result, expected)

def test_groupby_rolling_resulting_multiindex(self):
# a few different cases checking the created MultiIndex of the result
# https://github.com/pandas-dev/pandas/pull/38057

# grouping by 1 columns -> 2-level MI as result
df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4})
result = df.groupby("b").rolling(3).mean()
expected_index = MultiIndex.from_tuples(
[(1, 0), (1, 2), (1, 4), (1, 6), (2, 1), (2, 3), (2, 5), (2, 7)],
names=["b", None],
)
tm.assert_index_equal(result.index, expected_index)

# grouping by 2 columns -> 3-level MI as result
df = DataFrame({"a": np.arange(12.0), "b": [1, 2] * 6, "c": [1, 2, 3, 4] * 3})
result = df.groupby(["b", "c"]).rolling(2).sum()
expected_index = MultiIndex.from_tuples(
[
(1, 1, 0),
(1, 1, 4),
(1, 1, 8),
(1, 3, 2),
(1, 3, 6),
(1, 3, 10),
(2, 2, 1),
(2, 2, 5),
(2, 2, 9),
(2, 4, 3),
(2, 4, 7),
(2, 4, 11),
],
names=["b", "c", None],
)
tm.assert_index_equal(result.index, expected_index)

# grouping with 1 level on dataframe with 2-level MI -> 3-level MI as result
df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4, "c": [1, 2, 3, 4] * 2})
df = df.set_index("c", append=True)
result = df.groupby("b").rolling(3).mean()
expected_index = MultiIndex.from_tuples(
[
(1, 0, 1),
(1, 2, 3),
(1, 4, 1),
(1, 6, 3),
(2, 1, 2),
(2, 3, 4),
(2, 5, 2),
(2, 7, 4),
],
names=["b", None, "c"],
)
tm.assert_index_equal(result.index, expected_index)


class TestExpanding:
def setup_method(self):
Expand Down
11 changes: 9 additions & 2 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1085,8 +1085,15 @@ def test_groupby_rolling_nan_included():
result = df.groupby("group", dropna=False).rolling(1, min_periods=1).mean()
expected = DataFrame(
{"B": [0.0, 2.0, 3.0, 1.0, 4.0]},
index=MultiIndex.from_tuples(
[("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)],
# GH-38057 from_tuples puts the NaNs in the codes, result expects them
# to be in the levels, at the moment
# index=MultiIndex.from_tuples(
# [("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)],
# names=["group", None],
# ),
index=MultiIndex(
[["g1", "g2", np.nan], [0, 1, 2, 3, 4]],
[[0, 0, 1, 2, 2], [0, 2, 3, 1, 4]],
names=["group", None],
),
)
Expand Down