From 164bfc6d8984ca2c16c4a9dc2859c1a040623e1a Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 31 Jul 2024 21:49:02 +0000 Subject: [PATCH] fix: Fix caching from generating row numbers in partial ordering mode --- bigframes/series.py | 2 +- bigframes/session/__init__.py | 4 +++- tests/system/small/test_unordered.py | 15 ++++++++++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 1a5661529c..2ff4f8f455 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -641,7 +641,7 @@ def head(self, n: int = 5) -> Series: def tail(self, n: int = 5) -> Series: return typing.cast(Series, self.iloc[-n:]) - def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame: + def peek(self, n: int = 5, *, force: bool = True) -> pandas.Series: """ Preview n arbitrary elements from the series without guarantees about row selection or ordering. diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 98cba867f2..5e06469974 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1992,8 +1992,10 @@ def _cache_with_session_awareness(self, array_value: core.ArrayValue) -> None: ) if len(cluster_cols) > 0: self._cache_with_cluster_cols(core.ArrayValue(target), cluster_cols) - else: + elif self._strictly_ordered: self._cache_with_offsets(core.ArrayValue(target)) + else: + self._cache_with_cluster_cols(core.ArrayValue(target), []) def _simplify_with_caching(self, array_value: core.ArrayValue): """Attempts to handle the complexity by caching duplicated subtrees and breaking the query into pieces.""" diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 7d7097ceb3..df967e532e 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -17,7 +17,11 @@ import bigframes.exceptions import bigframes.pandas as bpd -from tests.system.utils import assert_pandas_df_equal, skip_legacy_pandas +from tests.system.utils import ( + assert_pandas_df_equal, + assert_series_equal, + skip_legacy_pandas, +) def test_unordered_mode_sql_no_hash(unordered_session): @@ -49,6 +53,15 @@ def test_unordered_mode_cache_aggregate(unordered_session): assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) +def test_unordered_mode_series_peek(unordered_session): + pd_series = pd.Series([1, 2, 3, 4, 5, 6], dtype=pd.Int64Dtype()) + bf_series = bpd.Series(pd_series, session=unordered_session) + pd_result = pd_series.groupby(pd_series % 4).sum() + bf_peek = bf_series.groupby(bf_series % 4).sum().peek(2) + + assert_series_equal(bf_peek, pd_result.reindex(bf_peek.index)) + + def test_unordered_mode_single_aggregate(unordered_session): pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype()) bf_df = bpd.DataFrame(pd_df, session=unordered_session)