From 4fcedb2c0c865028789eb18be862461dd9689b42 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 6 Jun 2023 18:03:02 -0700 Subject: [PATCH 1/6] DEBUG: npdev build --- pandas/core/frame.py | 4 ++-- pandas/core/series.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3e6c89139d06d..85ca70d51aca3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6567,7 +6567,7 @@ def sort_values( axis: Axis = 0, ascending: bool | list[bool] | tuple[bool, ...] = True, inplace: bool = False, - kind: SortKind = "quicksort", + kind: SortKind = "stable", na_position: str = "last", ignore_index: bool = False, key: ValueKeyFunc = None, @@ -6592,7 +6592,7 @@ def sort_values( the by. inplace : bool, default False If True, perform operation in-place. - kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'stable' Choice of sorting algorithm. See also :func:`numpy.sort` for more information. `mergesort` and `stable` are the only stable algorithms. For DataFrames, this option is only applied when sorting on a single diff --git a/pandas/core/series.py b/pandas/core/series.py index 9c7110cc21082..d0161a1498b63 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3440,7 +3440,7 @@ def sort_values( axis: Axis = 0, ascending: bool | int | Sequence[bool] | Sequence[int] = True, inplace: bool = False, - kind: SortKind = "quicksort", + kind: SortKind = "stable", na_position: NaPosition = "last", ignore_index: bool = False, key: ValueKeyFunc = None, @@ -3459,7 +3459,7 @@ def sort_values( If True, sort values in ascending order, otherwise descending. inplace : bool, default False If True, perform operation in-place. - kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'stable' Choice of sorting algorithm. See also :func:`numpy.sort` for more information. 'mergesort' and 'stable' are the only stable algorithms. na_position : {'first' or 'last'}, default 'last' From 5c7b2e58f729a0d0ece2d6a58d26fe4d7c9aaa6d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 7 Jun 2023 11:53:18 -0700 Subject: [PATCH 2/6] Address tests where sorting changed --- pandas/core/frame.py | 4 +- pandas/core/series.py | 4 +- pandas/tests/frame/methods/test_nlargest.py | 2 +- .../tests/frame/methods/test_sort_values.py | 8 +- pandas/tests/groupby/test_value_counts.py | 208 +++++++++++++----- 5 files changed, 164 insertions(+), 62 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 85ca70d51aca3..3e6c89139d06d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6567,7 +6567,7 @@ def sort_values( axis: Axis = 0, ascending: bool | list[bool] | tuple[bool, ...] = True, inplace: bool = False, - kind: SortKind = "stable", + kind: SortKind = "quicksort", na_position: str = "last", ignore_index: bool = False, key: ValueKeyFunc = None, @@ -6592,7 +6592,7 @@ def sort_values( the by. inplace : bool, default False If True, perform operation in-place. - kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'stable' + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' Choice of sorting algorithm. See also :func:`numpy.sort` for more information. `mergesort` and `stable` are the only stable algorithms. For DataFrames, this option is only applied when sorting on a single diff --git a/pandas/core/series.py b/pandas/core/series.py index d0161a1498b63..9c7110cc21082 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3440,7 +3440,7 @@ def sort_values( axis: Axis = 0, ascending: bool | int | Sequence[bool] | Sequence[int] = True, inplace: bool = False, - kind: SortKind = "stable", + kind: SortKind = "quicksort", na_position: NaPosition = "last", ignore_index: bool = False, key: ValueKeyFunc = None, @@ -3459,7 +3459,7 @@ def sort_values( If True, sort values in ascending order, otherwise descending. inplace : bool, default False If True, perform operation in-place. - kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'stable' + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' Choice of sorting algorithm. See also :func:`numpy.sort` for more information. 'mergesort' and 'stable' are the only stable algorithms. na_position : {'first' or 'last'}, default 'last' diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index b5c33a41dd780..c98bd9fc9ea9d 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -164,7 +164,7 @@ def test_nlargest_n_duplicate_index(self, df_duplicates, n, order): tm.assert_frame_equal(result, expected) result = df.nlargest(n, order) - expected = df.sort_values(order, ascending=False).head(n) + expected = df.sort_values(order, ascending=False, kind="stable").head(n) tm.assert_frame_equal(result, expected) def test_nlargest_duplicate_keep_all_ties(self): diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index e2877acbdd040..08967eafcecb3 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -878,13 +878,17 @@ def test_sort_column_level_and_index_label( # transposing. For some cases this will result in a frame with # multiple column levels expected = ( - df_none.sort_values(by=sort_names, ascending=ascending, axis=0) + df_none.sort_values( + by=sort_names, ascending=ascending, axis=0, kind="stable" + ) .set_index(levels) .T ) # Compute result by transposing and sorting on axis=1. - result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1) + result = df_idx.T.sort_values( + by=sort_names, ascending=ascending, axis=1, kind="stable" + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 5477ad75a56f7..e3943a52f3f91 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -21,6 +21,7 @@ to_datetime, ) import pandas._testing as tm +from pandas.util.version import Version def tests_value_counts_index_names_category_column(): @@ -285,7 +286,7 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("frame", [True, False]) def test_against_frame_and_seriesgroupby( - education_df, groupby, normalize, name, sort, ascending, as_index, frame + education_df, groupby, normalize, name, sort, ascending, as_index, frame, request ): # test all parameters: # - Use column, array or function as by= parameter @@ -295,6 +296,13 @@ def test_against_frame_and_seriesgroupby( # - 3-way compare against: # - apply with :meth:`~DataFrame.value_counts` # - `~SeriesGroupBy.value_counts` + if sort and name == "proportion" and Version(np.__version__) >= Version("1.25"): + # TODO: Change the expected comparison + request.node.add_marker( + pytest.mark.xfail( + reason="default sorting is unstable; numpy sorting changed in 1.25" + ) + ) by = { "column": "country", "array": education_df["country"].values, @@ -441,22 +449,36 @@ def nulls_df(): ) +# TODO: Actually fix the expected result for the xfails @pytest.mark.parametrize( "group_dropna, count_dropna, expected_rows, expected_values", [ - ( + pytest.param( False, False, [0, 1, 3, 5, 7, 6, 8, 2, 4], [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0], + marks=pytest.mark.xfail( + Version(np.__version__) >= Version("1.25"), + reason="default sorting is unstable; numpy sorting changed in 1.25", + ), + ), + pytest.param( + False, + True, + [0, 1, 3, 5, 2, 4], + [0.5, 0.5, 1.0, 1.0, 1.0, 1.0], + marks=pytest.mark.xfail( + Version(np.__version__) >= Version("1.25"), + reason="default sorting is unstable; numpy sorting changed in 1.25", + ), ), - (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]), (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), (True, True, [0, 1, 5], [0.5, 0.5, 1.0]), ], ) def test_dropna_combinations( - nulls_df, group_dropna, count_dropna, expected_rows, expected_values + nulls_df, group_dropna, count_dropna, expected_rows, expected_values, request ): gp = nulls_df.groupby(["A", "B"], dropna=group_dropna) result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna) @@ -558,8 +580,9 @@ def test_categorical_single_grouper_with_only_observed_categories( ) result = gp.value_counts(normalize=normalize) - expected_index = MultiIndex.from_tuples( - [ + if Version(np.__version__) < Version("1.25"): + # default sorting is unstable; numpy sorting changed + expected_tuples = [ ("FR", "male", "low"), ("FR", "female", "high"), ("FR", "male", "medium"), @@ -572,7 +595,25 @@ def test_categorical_single_grouper_with_only_observed_categories( ("US", "female", "medium"), ("US", "male", "high"), ("US", "male", "medium"), - ], + ] + else: + expected_tuples = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "male", "high"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ] + + expected_index = MultiIndex.from_tuples( + expected_tuples, names=["country", "gender", "education"], ) @@ -651,20 +692,37 @@ def test_categorical_single_grouper_observed_true( ): # GH#46357 - expected_index = [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("FR", "female", "low"), - ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), - ("US", "male", "low"), - ("US", "female", "low"), - ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), - ] + if Version(np.__version__) < Version("1.25"): + # default sorting is unstable; numpy sorting changed + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ] + else: + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("FR", "male", "medium"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ] assert_categorical_single_grouper( education_df=education_df, @@ -721,26 +779,49 @@ def test_categorical_single_grouper_observed_false( ): # GH#46357 - expected_index = [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("FR", "female", "low"), - ("FR", "male", "high"), - ("FR", "female", "medium"), - ("US", "female", "high"), - ("US", "male", "low"), - ("US", "male", "medium"), - ("US", "male", "high"), - ("US", "female", "medium"), - ("US", "female", "low"), - ("ASIA", "male", "low"), - ("ASIA", "male", "high"), - ("ASIA", "female", "medium"), - ("ASIA", "female", "low"), - ("ASIA", "female", "high"), - ("ASIA", "male", "medium"), - ] + if Version(np.__version__) < Version("1.25"): + # default sorting is unstable; numpy sorting changed + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "male", "high"), + ("FR", "female", "medium"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "male", "medium"), + ("US", "male", "high"), + ("US", "female", "medium"), + ("US", "female", "low"), + ("ASIA", "male", "low"), + ("ASIA", "male", "high"), + ("ASIA", "female", "medium"), + ("ASIA", "female", "low"), + ("ASIA", "female", "high"), + ("ASIA", "male", "medium"), + ] + else: + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ("ASIA", "female", "medium"), + ("ASIA", "female", "low"), + ("ASIA", "female", "high"), + ("ASIA", "male", "medium"), + ("ASIA", "male", "low"), + ("ASIA", "male", "high"), + ] assert_categorical_single_grouper( education_df=education_df, @@ -869,20 +950,37 @@ def test_categorical_non_groupers( gp = education_df.groupby("country", as_index=as_index, observed=observed) result = gp.value_counts(normalize=normalize) - expected_index = [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("FR", "female", "low"), - ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), - ("US", "male", "low"), - ("US", "female", "low"), - ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), - ] + if Version(np.__version__) < Version("1.25"): + # default sorting is unstable; numpy sorting changed + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ] + else: + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "male", "high"), + ("FR", "female", "medium"), + ("FR", "female", "low"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ] expected_series = Series( data=expected_data, index=MultiIndex.from_tuples( From 72149150647b13a6f67667c777c031b995fb9ce6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 7 Jun 2023 15:05:49 -0700 Subject: [PATCH 3/6] Adjust more tests --- pandas/tests/frame/methods/test_nlargest.py | 14 +++++- pandas/tests/groupby/test_value_counts.py | 53 +++++++++++++-------- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index c98bd9fc9ea9d..0717a99de74d4 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -9,6 +9,7 @@ import pandas as pd import pandas._testing as tm +from pandas.util.version import Version @pytest.fixture @@ -155,7 +156,7 @@ def test_nlargest_n_identical_values(self): [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]], ) @pytest.mark.parametrize("n", range(1, 6)) - def test_nlargest_n_duplicate_index(self, df_duplicates, n, order): + def test_nlargest_n_duplicate_index(self, df_duplicates, n, order, request): # GH#13412 df = df_duplicates @@ -165,6 +166,17 @@ def test_nlargest_n_duplicate_index(self, df_duplicates, n, order): result = df.nlargest(n, order) expected = df.sort_values(order, ascending=False, kind="stable").head(n) + if ( + n == 5 + and order in (["a"], ["a", "b"]) + and Version(np.__version__) >= Version("1.25") + ): + # TODO: Change the expected comparison + request.node.add_marker( + pytest.mark.xfail( + reason="default sorting is unstable; numpy sorting changed in 1.25" + ) + ) tm.assert_frame_equal(result, expected) def test_nlargest_duplicate_keep_all_ties(self): diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index e3943a52f3f91..ee83dbcefd305 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -252,16 +252,27 @@ def test_basic(education_df): result = education_df.groupby("country")[["gender", "education"]].value_counts( normalize=True ) + if Version(np.__version__) >= Version("1.25"): + # default sorting is unstable; numpy sorting changed in 1.25 + expected_tuples = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("US", "male", "low"), + ("US", "female", "high"), + ] + else: + expected_tuples = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("US", "female", "high"), + ("US", "male", "low"), + ] expected = Series( data=[0.5, 0.25, 0.25, 0.5, 0.5], index=MultiIndex.from_tuples( - [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("US", "female", "high"), - ("US", "male", "low"), - ], + expected_tuples, names=["country", "gender", "education"], ), name="proportion", @@ -296,13 +307,13 @@ def test_against_frame_and_seriesgroupby( # - 3-way compare against: # - apply with :meth:`~DataFrame.value_counts` # - `~SeriesGroupBy.value_counts` - if sort and name == "proportion" and Version(np.__version__) >= Version("1.25"): - # TODO: Change the expected comparison - request.node.add_marker( - pytest.mark.xfail( - reason="default sorting is unstable; numpy sorting changed in 1.25" - ) - ) + # if sort and name == "proportion" and Version(np.__version__) >= Version("1.25"): + # # TODO: Change the expected comparison + # request.node.add_marker( + # pytest.mark.xfail( + # reason="default sorting is unstable; numpy sorting changed in 1.25" + # ) + # ) by = { "column": "country", "array": education_df["country"].values, @@ -712,10 +723,10 @@ def test_categorical_single_grouper_observed_true( expected_index = [ ("FR", "male", "low"), ("FR", "female", "high"), - ("FR", "female", "low"), - ("FR", "female", "medium"), ("FR", "male", "high"), ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), ("US", "female", "high"), ("US", "male", "low"), ("US", "female", "low"), @@ -815,12 +826,12 @@ def test_categorical_single_grouper_observed_false( ("US", "female", "medium"), ("US", "male", "high"), ("US", "male", "medium"), - ("ASIA", "female", "medium"), - ("ASIA", "female", "low"), ("ASIA", "female", "high"), - ("ASIA", "male", "medium"), - ("ASIA", "male", "low"), + ("ASIA", "female", "low"), + ("ASIA", "female", "medium"), ("ASIA", "male", "high"), + ("ASIA", "male", "low"), + ("ASIA", "male", "medium"), ] assert_categorical_single_grouper( @@ -972,8 +983,8 @@ def test_categorical_non_groupers( ("FR", "female", "high"), ("FR", "male", "medium"), ("FR", "male", "high"), - ("FR", "female", "medium"), ("FR", "female", "low"), + ("FR", "female", "medium"), ("US", "female", "high"), ("US", "male", "low"), ("US", "female", "low"), From eb0c1bee460fe2c9c27701bb2ce116d52e813a89 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 7 Jun 2023 16:34:07 -0700 Subject: [PATCH 4/6] Undo everything, even nanargsort --- pandas/core/sorting.py | 2 +- pandas/tests/frame/methods/test_nlargest.py | 16 +- .../tests/frame/methods/test_sort_values.py | 8 +- pandas/tests/groupby/test_value_counts.py | 233 +++++------------- 4 files changed, 67 insertions(+), 192 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 0c0b312c11c48..b63f3f28b8f6c 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -424,7 +424,7 @@ def lexsort_indexer( def nargsort( items: ArrayLike | Index | Series, - kind: SortKind = "stable", + kind: SortKind = "quicksort", ascending: bool = True, na_position: str = "last", key: Callable | None = None, diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 0717a99de74d4..b5c33a41dd780 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -9,7 +9,6 @@ import pandas as pd import pandas._testing as tm -from pandas.util.version import Version @pytest.fixture @@ -156,7 +155,7 @@ def test_nlargest_n_identical_values(self): [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]], ) @pytest.mark.parametrize("n", range(1, 6)) - def test_nlargest_n_duplicate_index(self, df_duplicates, n, order, request): + def test_nlargest_n_duplicate_index(self, df_duplicates, n, order): # GH#13412 df = df_duplicates @@ -165,18 +164,7 @@ def test_nlargest_n_duplicate_index(self, df_duplicates, n, order, request): tm.assert_frame_equal(result, expected) result = df.nlargest(n, order) - expected = df.sort_values(order, ascending=False, kind="stable").head(n) - if ( - n == 5 - and order in (["a"], ["a", "b"]) - and Version(np.__version__) >= Version("1.25") - ): - # TODO: Change the expected comparison - request.node.add_marker( - pytest.mark.xfail( - reason="default sorting is unstable; numpy sorting changed in 1.25" - ) - ) + expected = df.sort_values(order, ascending=False).head(n) tm.assert_frame_equal(result, expected) def test_nlargest_duplicate_keep_all_ties(self): diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 08967eafcecb3..e2877acbdd040 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -878,17 +878,13 @@ def test_sort_column_level_and_index_label( # transposing. For some cases this will result in a frame with # multiple column levels expected = ( - df_none.sort_values( - by=sort_names, ascending=ascending, axis=0, kind="stable" - ) + df_none.sort_values(by=sort_names, ascending=ascending, axis=0) .set_index(levels) .T ) # Compute result by transposing and sorting on axis=1. - result = df_idx.T.sort_values( - by=sort_names, ascending=ascending, axis=1, kind="stable" - ) + result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index ee83dbcefd305..5477ad75a56f7 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -21,7 +21,6 @@ to_datetime, ) import pandas._testing as tm -from pandas.util.version import Version def tests_value_counts_index_names_category_column(): @@ -252,27 +251,16 @@ def test_basic(education_df): result = education_df.groupby("country")[["gender", "education"]].value_counts( normalize=True ) - if Version(np.__version__) >= Version("1.25"): - # default sorting is unstable; numpy sorting changed in 1.25 - expected_tuples = [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("US", "male", "low"), - ("US", "female", "high"), - ] - else: - expected_tuples = [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("US", "female", "high"), - ("US", "male", "low"), - ] expected = Series( data=[0.5, 0.25, 0.25, 0.5, 0.5], index=MultiIndex.from_tuples( - expected_tuples, + [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("US", "female", "high"), + ("US", "male", "low"), + ], names=["country", "gender", "education"], ), name="proportion", @@ -297,7 +285,7 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("frame", [True, False]) def test_against_frame_and_seriesgroupby( - education_df, groupby, normalize, name, sort, ascending, as_index, frame, request + education_df, groupby, normalize, name, sort, ascending, as_index, frame ): # test all parameters: # - Use column, array or function as by= parameter @@ -307,13 +295,6 @@ def test_against_frame_and_seriesgroupby( # - 3-way compare against: # - apply with :meth:`~DataFrame.value_counts` # - `~SeriesGroupBy.value_counts` - # if sort and name == "proportion" and Version(np.__version__) >= Version("1.25"): - # # TODO: Change the expected comparison - # request.node.add_marker( - # pytest.mark.xfail( - # reason="default sorting is unstable; numpy sorting changed in 1.25" - # ) - # ) by = { "column": "country", "array": education_df["country"].values, @@ -460,36 +441,22 @@ def nulls_df(): ) -# TODO: Actually fix the expected result for the xfails @pytest.mark.parametrize( "group_dropna, count_dropna, expected_rows, expected_values", [ - pytest.param( + ( False, False, [0, 1, 3, 5, 7, 6, 8, 2, 4], [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0], - marks=pytest.mark.xfail( - Version(np.__version__) >= Version("1.25"), - reason="default sorting is unstable; numpy sorting changed in 1.25", - ), - ), - pytest.param( - False, - True, - [0, 1, 3, 5, 2, 4], - [0.5, 0.5, 1.0, 1.0, 1.0, 1.0], - marks=pytest.mark.xfail( - Version(np.__version__) >= Version("1.25"), - reason="default sorting is unstable; numpy sorting changed in 1.25", - ), ), + (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]), (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), (True, True, [0, 1, 5], [0.5, 0.5, 1.0]), ], ) def test_dropna_combinations( - nulls_df, group_dropna, count_dropna, expected_rows, expected_values, request + nulls_df, group_dropna, count_dropna, expected_rows, expected_values ): gp = nulls_df.groupby(["A", "B"], dropna=group_dropna) result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna) @@ -591,9 +558,8 @@ def test_categorical_single_grouper_with_only_observed_categories( ) result = gp.value_counts(normalize=normalize) - if Version(np.__version__) < Version("1.25"): - # default sorting is unstable; numpy sorting changed - expected_tuples = [ + expected_index = MultiIndex.from_tuples( + [ ("FR", "male", "low"), ("FR", "female", "high"), ("FR", "male", "medium"), @@ -606,25 +572,7 @@ def test_categorical_single_grouper_with_only_observed_categories( ("US", "female", "medium"), ("US", "male", "high"), ("US", "male", "medium"), - ] - else: - expected_tuples = [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("FR", "male", "high"), - ("FR", "female", "low"), - ("FR", "female", "medium"), - ("US", "female", "high"), - ("US", "male", "low"), - ("US", "female", "low"), - ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), - ] - - expected_index = MultiIndex.from_tuples( - expected_tuples, + ], names=["country", "gender", "education"], ) @@ -703,37 +651,20 @@ def test_categorical_single_grouper_observed_true( ): # GH#46357 - if Version(np.__version__) < Version("1.25"): - # default sorting is unstable; numpy sorting changed - expected_index = [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("FR", "female", "low"), - ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), - ("US", "male", "low"), - ("US", "female", "low"), - ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), - ] - else: - expected_index = [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "high"), - ("FR", "male", "medium"), - ("FR", "female", "low"), - ("FR", "female", "medium"), - ("US", "female", "high"), - ("US", "male", "low"), - ("US", "female", "low"), - ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), - ] + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ] assert_categorical_single_grouper( education_df=education_df, @@ -790,49 +721,26 @@ def test_categorical_single_grouper_observed_false( ): # GH#46357 - if Version(np.__version__) < Version("1.25"): - # default sorting is unstable; numpy sorting changed - expected_index = [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("FR", "female", "low"), - ("FR", "male", "high"), - ("FR", "female", "medium"), - ("US", "female", "high"), - ("US", "male", "low"), - ("US", "male", "medium"), - ("US", "male", "high"), - ("US", "female", "medium"), - ("US", "female", "low"), - ("ASIA", "male", "low"), - ("ASIA", "male", "high"), - ("ASIA", "female", "medium"), - ("ASIA", "female", "low"), - ("ASIA", "female", "high"), - ("ASIA", "male", "medium"), - ] - else: - expected_index = [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("FR", "female", "low"), - ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), - ("US", "male", "low"), - ("US", "female", "low"), - ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), - ("ASIA", "female", "high"), - ("ASIA", "female", "low"), - ("ASIA", "female", "medium"), - ("ASIA", "male", "high"), - ("ASIA", "male", "low"), - ("ASIA", "male", "medium"), - ] + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "male", "high"), + ("FR", "female", "medium"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "male", "medium"), + ("US", "male", "high"), + ("US", "female", "medium"), + ("US", "female", "low"), + ("ASIA", "male", "low"), + ("ASIA", "male", "high"), + ("ASIA", "female", "medium"), + ("ASIA", "female", "low"), + ("ASIA", "female", "high"), + ("ASIA", "male", "medium"), + ] assert_categorical_single_grouper( education_df=education_df, @@ -961,37 +869,20 @@ def test_categorical_non_groupers( gp = education_df.groupby("country", as_index=as_index, observed=observed) result = gp.value_counts(normalize=normalize) - if Version(np.__version__) < Version("1.25"): - # default sorting is unstable; numpy sorting changed - expected_index = [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("FR", "female", "low"), - ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), - ("US", "male", "low"), - ("US", "female", "low"), - ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), - ] - else: - expected_index = [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("FR", "male", "high"), - ("FR", "female", "low"), - ("FR", "female", "medium"), - ("US", "female", "high"), - ("US", "male", "low"), - ("US", "female", "low"), - ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), - ] + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ] expected_series = Series( data=expected_data, index=MultiIndex.from_tuples( From 679dae4dbeef108378158b23f266da646acc26fa Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 7 Jun 2023 18:19:37 -0700 Subject: [PATCH 5/6] xfail the relevant tests --- pandas/tests/frame/methods/test_nlargest.py | 15 +++- .../tests/frame/methods/test_sort_values.py | 16 +++- pandas/tests/groupby/test_value_counts.py | 89 +++++++++++++++++-- 3 files changed, 111 insertions(+), 9 deletions(-) diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index b5c33a41dd780..17dea51263222 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -9,6 +9,7 @@ import pandas as pd import pandas._testing as tm +from pandas.util.version import Version @pytest.fixture @@ -155,7 +156,7 @@ def test_nlargest_n_identical_values(self): [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]], ) @pytest.mark.parametrize("n", range(1, 6)) - def test_nlargest_n_duplicate_index(self, df_duplicates, n, order): + def test_nlargest_n_duplicate_index(self, df_duplicates, n, order, request): # GH#13412 df = df_duplicates @@ -165,6 +166,18 @@ def test_nlargest_n_duplicate_index(self, df_duplicates, n, order): result = df.nlargest(n, order) expected = df.sort_values(order, ascending=False).head(n) + if Version(np.__version__) >= Version("1.25") and ( + (order == ["a"] and n in (1, 2, 3, 4)) or (order == ["a", "b"]) and n == 5 + ): + request.node.add_marker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) tm.assert_frame_equal(result, expected) def test_nlargest_duplicate_keep_all_ties(self): diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index e2877acbdd040..3440c73d19ecf 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -12,6 +12,7 @@ date_range, ) import pandas._testing as tm +from pandas.util.version import Version class TestDataFrameSortValues: @@ -849,9 +850,22 @@ def ascending(request): class TestSortValuesLevelAsStr: def test_sort_index_level_and_column_label( - self, df_none, df_idx, sort_names, ascending + self, df_none, df_idx, sort_names, ascending, request ): # GH#14353 + if ( + Version(np.__version__) >= Version("1.25") + and request.node.callspec.id == "df_idx0-inner-True" + ): + request.node.add_marker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) # Get index levels from df_idx levels = df_idx.index.names diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 5477ad75a56f7..78c8b6b236b65 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -21,6 +21,7 @@ to_datetime, ) import pandas._testing as tm +from pandas.util.version import Version def tests_value_counts_index_names_category_column(): @@ -246,8 +247,18 @@ def test_bad_subset(education_df): gp.value_counts(subset=["country"]) -def test_basic(education_df): +def test_basic(education_df, request): # gh43564 + if Version(np.__version__) >= Version("1.25"): + request.node.add_marker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) result = education_df.groupby("country")[["gender", "education"]].value_counts( normalize=True ) @@ -285,7 +296,7 @@ def _frame_value_counts(df, keys, normalize, sort, ascending): @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize("frame", [True, False]) def test_against_frame_and_seriesgroupby( - education_df, groupby, normalize, name, sort, ascending, as_index, frame + education_df, groupby, normalize, name, sort, ascending, as_index, frame, request ): # test all parameters: # - Use column, array or function as by= parameter @@ -295,6 +306,16 @@ def test_against_frame_and_seriesgroupby( # - 3-way compare against: # - apply with :meth:`~DataFrame.value_counts` # - `~SeriesGroupBy.value_counts` + if Version(np.__version__) >= Version("1.25") and frame and sort and normalize: + request.node.add_marker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) by = { "column": "country", "array": education_df["country"].values, @@ -456,8 +477,18 @@ def nulls_df(): ], ) def test_dropna_combinations( - nulls_df, group_dropna, count_dropna, expected_rows, expected_values + nulls_df, group_dropna, count_dropna, expected_rows, expected_values, request ): + if Version(np.__version__) >= Version("1.25") and not group_dropna: + request.node.add_marker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) gp = nulls_df.groupby(["A", "B"], dropna=group_dropna) result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna) columns = DataFrame() @@ -548,10 +579,20 @@ def test_data_frame_value_counts_dropna( ], ) def test_categorical_single_grouper_with_only_observed_categories( - education_df, as_index, observed, normalize, name, expected_data + education_df, as_index, observed, normalize, name, expected_data, request ): # Test single categorical grouper with only observed grouping categories # when non-groupers are also categorical + if Version(np.__version__) >= Version("1.25"): + request.node.add_marker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) gp = education_df.astype("category").groupby( "country", as_index=as_index, observed=observed @@ -647,10 +688,21 @@ def assert_categorical_single_grouper( ], ) def test_categorical_single_grouper_observed_true( - education_df, as_index, normalize, name, expected_data + education_df, as_index, normalize, name, expected_data, request ): # GH#46357 + if Version(np.__version__) >= Version("1.25"): + request.node.add_marker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + expected_index = [ ("FR", "male", "low"), ("FR", "female", "high"), @@ -717,10 +769,21 @@ def test_categorical_single_grouper_observed_true( ], ) def test_categorical_single_grouper_observed_false( - education_df, as_index, normalize, name, expected_data + education_df, as_index, normalize, name, expected_data, request ): # GH#46357 + if Version(np.__version__) >= Version("1.25"): + request.node.add_marker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + expected_index = [ ("FR", "male", "low"), ("FR", "female", "high"), @@ -858,10 +921,22 @@ def test_categorical_multiple_groupers( ], ) def test_categorical_non_groupers( - education_df, as_index, observed, normalize, name, expected_data + education_df, as_index, observed, normalize, name, expected_data, request ): # GH#46357 Test non-observed categories are included in the result, # regardless of `observed` + + if Version(np.__version__) >= Version("1.25"): + request.node.add_marker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + education_df = education_df.copy() education_df["gender"] = education_df["gender"].astype("category") education_df["education"] = education_df["education"].astype("category") From 3ee8d834edff48506d71be1a91029948e592b3e6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 8 Jun 2023 08:19:38 -0700 Subject: [PATCH 6/6] Add xfail to test_sort_column_level_and_index_label --- pandas/tests/frame/methods/test_sort_values.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 3440c73d19ecf..4c41632040dbe 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -881,7 +881,7 @@ def test_sort_index_level_and_column_label( tm.assert_frame_equal(result, expected) def test_sort_column_level_and_index_label( - self, df_none, df_idx, sort_names, ascending + self, df_none, df_idx, sort_names, ascending, request ): # GH#14353 @@ -900,6 +900,17 @@ def test_sort_column_level_and_index_label( # Compute result by transposing and sorting on axis=1. result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1) + if Version(np.__version__) >= Version("1.25"): + request.node.add_marker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + tm.assert_frame_equal(result, expected) def test_sort_values_validate_ascending_for_value_error(self):