From b42b0648fe936aa3ec81b887a81f67456829614b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 2 Apr 2024 10:10:32 -0700 Subject: [PATCH 1/3] PERF: groupby returns a RangeIndex from groups when possible --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/groupby/ops.py | 6 +++++- pandas/tests/groupby/aggregate/test_aggregate.py | 14 ++++++++++++-- pandas/tests/groupby/test_groupby.py | 9 +++++++++ pandas/tests/groupby/transform/test_transform.py | 10 +++++++++- 5 files changed, 36 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4debd41de213f..3f51bb7c5942f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -297,6 +297,7 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - :attr:`Categorical.categories` returns a :class:`RangeIndex` columns instead of an :class:`Index` if the constructed ``values`` was a ``range``. (:issue:`57787`) - :class:`DataFrame` returns a :class:`RangeIndex` columns when possible when ``data`` is a ``dict`` (:issue:`57943`) +- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` returns a :class:`RangeIndex` index when possible. 
(:issue:`?`) - :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`) - :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 8585ae3828247..63c7aa6e23dfa 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -58,6 +58,7 @@ Index, MultiIndex, ensure_index, + maybe_sequence_to_range, ) from pandas.core.series import Series from pandas.core.sorting import ( @@ -754,7 +755,10 @@ def ids(self) -> npt.NDArray[np.intp]: @cache_readonly def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: - levels = [Index._with_infer(ping.uniques) for ping in self.groupings] + levels = [ + Index._with_infer(maybe_sequence_to_range(ping.uniques)) + for ping in self.groupings + ] obs = [ ping._observed or not ping._passed_categorical for ping in self.groupings ] diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 2b9df1b7079da..822dfbc620538 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -19,6 +19,7 @@ DataFrame, Index, MultiIndex, + RangeIndex, Series, concat, to_datetime, @@ -517,7 +518,7 @@ def test_callable_result_dtype_frame( df["c"] = df["c"].astype(input_dtype) op = getattr(df.groupby(keys)[["c"]], method) result = op(lambda x: x.astype(result_dtype).iloc[0]) - expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index + expected_index = RangeIndex(0, 1) if method == "transform" else agg_index expected = DataFrame({"c": [df["c"].iloc[0]]}, index=expected_index).astype( 
result_dtype ) @@ -541,7 +542,7 @@ def test_callable_result_dtype_series(keys, agg_index, input, dtype, method): df = DataFrame({"a": [1], "b": [2], "c": [input]}) op = getattr(df.groupby(keys)["c"], method) result = op(lambda x: x.astype(dtype).iloc[0]) - expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index + expected_index = RangeIndex(0, 1) if method == "transform" else agg_index expected = Series([df["c"].iloc[0]], index=expected_index, name="c").astype(dtype) tm.assert_series_equal(result, expected) @@ -1663,3 +1664,12 @@ def func(x): msg = "length must not be 0" with pytest.raises(ValueError, match=msg): df.groupby("A", observed=False).agg(func) + + +def test_agg_groups_returns_rangeindex(): + df = DataFrame({"group": [1, 1, 2], "value": [1, 2, 3]}) + result = df.groupby("group").agg(max) + expected = DataFrame( + [2, 3], index=RangeIndex(1, 3, name="group"), columns=["value"] + ) + tm.assert_frame_equal(result, expected, check_index_type=True) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index be8f5d73fe7e8..c3491567c04f5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2955,3 +2955,12 @@ def test_groupby_dropna_with_nunique_unique(): ) tm.assert_frame_equal(result, expected) + + +def test_groupby_groups_returns_rangeindex(): + df = DataFrame({"group": [1, 1, 2], "value": [1, 2, 3]}) + result = df.groupby("group").max() + expected = DataFrame( + [2, 3], index=RangeIndex(1, 3, name="group"), columns=["value"] + ) + tm.assert_frame_equal(result, expected, check_index_type=True) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 245fb9c7babd7..2a08e1111232f 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -13,6 +13,7 @@ DataFrame, Index, MultiIndex, + RangeIndex, Series, Timestamp, concat, @@ -290,7 +291,7 @@ 
def test_transform_casting(): ), "DATETIME": pd.to_datetime([f"2014-10-08 {time}" for time in times]), }, - index=pd.RangeIndex(11, name="idx"), + index=RangeIndex(11, name="idx"), ) result = df.groupby("ID3")["DATETIME"].transform(lambda x: x.diff()) @@ -1535,3 +1536,10 @@ def test_transform_sum_one_column_with_matching_labels_and_missing_labels(): result = df.groupby(series, as_index=False).transform("sum") expected = DataFrame({"X": [-93203.0, -93203.0, np.nan]}) tm.assert_frame_equal(result, expected) + + +def test_transform_groups_returns_rangeindex(): + df = DataFrame({"group": [1, 1, 2], "value": [1, 2, 3]}) + result = df.groupby("group").transform(lambda x: x + 1) + expected = DataFrame([2, 3, 4], index=RangeIndex(0, 3), columns=["value"]) + tm.assert_frame_equal(result, expected, check_index_type=True) From 584170e7e090fbb60ef1be919a35a1bd41ec73e6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 2 Apr 2024 10:11:42 -0700 Subject: [PATCH 2/3] Whatsnew number --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3f51bb7c5942f..65db6ee499616 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -296,8 +296,8 @@ Removal of prior version deprecations/changes Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - :attr:`Categorical.categories` returns a :class:`RangeIndex` columns instead of an :class:`Index` if the constructed ``values`` was a ``range``. (:issue:`57787`) +- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` return a :class:`RangeIndex` index when possible. (:issue:`58117`) - :class:`DataFrame` returns a :class:`RangeIndex` columns when possible when ``data`` is a ``dict`` (:issue:`57943`) -- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` returns a :class:`RangeIndex` index when possible. 
(:issue:`?`) - :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`) - :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`) From 5a9ab65a603f973fb5b436ee0e9163b0b9193cbf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 2 Apr 2024 14:31:36 -0700 Subject: [PATCH 3/3] Fix some issues --- pandas/core/indexes/base.py | 5 ++- pandas/core/series.py | 5 ++- pandas/tests/groupby/test_groupby.py | 4 +-- pandas/tests/groupby/test_reductions.py | 7 ++-- .../tests/resample/test_resampler_grouper.py | 8 ++--- pandas/tests/reshape/test_pivot.py | 34 ++++++++----------- 6 files changed, 29 insertions(+), 34 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a4b58445289ad..73e564f95cf65 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7144,7 +7144,10 @@ def maybe_sequence_to_range(sequence) -> Any | range: return sequence if len(sequence) == 0: return range(0) - np_sequence = np.asarray(sequence, dtype=np.int64) + try: + np_sequence = np.asarray(sequence, dtype=np.int64) + except OverflowError: + return sequence diff = np_sequence[1] - np_sequence[0] if diff == 0: return sequence diff --git a/pandas/core/series.py b/pandas/core/series.py index ee496a355f6ca..967aea15fff60 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -132,6 +132,7 @@ PeriodIndex, default_index, ensure_index, + maybe_sequence_to_range, ) import pandas.core.indexes.base as ibase from pandas.core.indexes.multi import maybe_droplevels @@ -538,8 +539,6 @@ def _init_dict( _data : BlockManager for the new Series index : index for the new 
Series """ - keys: Index | tuple - # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')] # raises KeyError), so we iterate the entire dict, and align if data: @@ -547,7 +546,7 @@ def _init_dict( # using generators in effects the performance. # Below is the new way of extracting the keys and values - keys = tuple(data.keys()) + keys = maybe_sequence_to_range(tuple(data.keys())) values = list(data.values()) # Generating list of values- faster way elif index is not None: # fastpath for Series(data=None). Just use broadcasting a scalar diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c3491567c04f5..4e5da6494a5a8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -46,9 +46,7 @@ def test_groupby_nonobject_dtype(multiindex_dataframe_random_data): result = grouped.sum() expected = multiindex_dataframe_random_data.groupby(key.astype("O")).sum() - assert result.index.dtype == np.int8 - assert expected.index.dtype == np.int64 - tm.assert_frame_equal(result, expected, check_index_type=False) + tm.assert_frame_equal(result, expected, check_index_type=True) def test_groupby_nonobject_dtype_mixed(): diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index edc94b2beeec1..9923d4ec4dbf7 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -258,9 +258,12 @@ def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype): ) gb = df.groupby("a") result = getattr(gb, how)() - expected = DataFrame( - {"b": [1, 0]}, index=pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype) + exp_idx = ( + pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype) + if "float" in any_real_numpy_dtype + else pd.RangeIndex(range(1, 3), name="a") ) + expected = DataFrame({"b": [1, 0]}, index=exp_idx) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py 
b/pandas/tests/resample/test_resampler_grouper.py index b312d708ade1e..67860fde89a97 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat import is_platform_windows - import pandas as pd from pandas import ( DataFrame, @@ -587,14 +585,12 @@ def test_resample_no_columns(): ) expected = DataFrame( index=pd.MultiIndex( - levels=[np.array([0, 1], dtype=np.intp), index], + levels=[range(2), index], codes=[[0, 0, 0, 1], [0, 1, 2, 3]], names=[None, "date"], ) ) - - # GH#52710 - Index comes out as 32-bit on 64-bit Windows - tm.assert_frame_equal(result, expected, check_index_type=not is_platform_windows()) + tm.assert_frame_equal(result, expected) def test_groupby_resample_size_all_index_same(): diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 2ccb622c7a250..3d1a16c6d82a8 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -20,6 +20,7 @@ Grouper, Index, MultiIndex, + RangeIndex, Series, concat, date_range, @@ -424,12 +425,10 @@ def test_pivot_no_values(self): res = df.pivot_table(index=df.index.month, columns=df.index.day) exp_columns = MultiIndex.from_tuples([("A", 1), ("A", 2)]) - exp_columns = exp_columns.set_levels( - exp_columns.levels[1].astype(np.int32), level=1 - ) + exp_columns = exp_columns.set_levels(exp_columns.levels[1], level=1) exp = DataFrame( [[2.5, 4.0], [2.0, np.nan]], - index=Index([1, 2], dtype=np.int32), + index=range(1, 3), columns=exp_columns, ) tm.assert_frame_equal(res, exp) @@ -446,9 +445,7 @@ def test_pivot_no_values(self): [["A"], pd.DatetimeIndex(["2011-01-31"], dtype="M8[ns]")], names=[None, "dt"], ) - exp = DataFrame( - [3.25, 2.0], index=Index([1, 2], dtype=np.int32), columns=exp_columns - ) + exp = DataFrame([3.25, 2.0], index=range(1, 3), columns=exp_columns) tm.assert_frame_equal(res, exp) res = df.pivot_table( @@ -1671,7 
+1668,7 @@ def test_pivot_dtaccessor(self): expected = DataFrame( {7: [0.0, 3.0], 8: [1.0, 4.0], 9: [2.0, 5.0]}, index=exp_idx, - columns=Index([7, 8, 9], dtype=np.int32, name="dt1"), + columns=RangeIndex(range(7, 10), name="dt1"), ) tm.assert_frame_equal(result, expected) @@ -1681,8 +1678,8 @@ def test_pivot_dtaccessor(self): expected = DataFrame( {7: [0.0, 3.0], 8: [1.0, 4.0], 9: [2.0, 5.0]}, - index=Index([1, 2], dtype=np.int32, name="dt2"), - columns=Index([7, 8, 9], dtype=np.int32, name="dt1"), + index=RangeIndex(range(1, 3), name="dt2"), + columns=RangeIndex(range(7, 10), name="dt1"), ) tm.assert_frame_equal(result, expected) @@ -1693,11 +1690,12 @@ def test_pivot_dtaccessor(self): values="value1", ) - exp_col = MultiIndex.from_arrays( - [ - np.array([7, 7, 8, 8, 9, 9], dtype=np.int32), - np.array([1, 2] * 3, dtype=np.int32), - ], + exp_col = MultiIndex( + levels=( + RangeIndex(start=7, stop=10, step=1, name="dt1"), + RangeIndex(start=1, stop=3, step=1, name="dt2"), + ), + codes=([0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]), names=["dt1", "dt2"], ) expected = DataFrame( @@ -1737,8 +1735,7 @@ def test_daily(self): for y in ts.index.year.unique().values: mask = ts.index.year == y expected[y] = Series(ts.values[mask], index=doy[mask]) - expected = DataFrame(expected, dtype=float).T - expected.index = expected.index.astype(np.int32) + expected = DataFrame(expected, dtype=float, index=range(1, 367)).T tm.assert_frame_equal(result, expected) def test_monthly(self): @@ -1753,8 +1750,7 @@ def test_monthly(self): for y in ts.index.year.unique().values: mask = ts.index.year == y expected[y] = Series(ts.values[mask], index=month[mask]) - expected = DataFrame(expected, dtype=float).T - expected.index = expected.index.astype(np.int32) + expected = DataFrame(expected, dtype=float, index=range(1, 13)).T tm.assert_frame_equal(result, expected) def test_pivot_table_with_iterator_values(self, data):