# Patch summary (text before the first "diff --git" header is commentary and
# is not part of any hunk).
#
# Wraps sort-, factorize-, value_counts- and groupby-related test calls in
#   tm.maybe_produces_warning(PerformanceWarning, <condition>)
# where <condition> is `pa_version_under7p0` combined with the dtype's
# `storage` attribute being "pyarrow" (read via getattr with a "" default, so
# dtypes without a `storage` attribute never expect the warning).
# Two tests instead silence the warning with
#   @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning").
#
# Review fix: the TestMethods.test_argsort_missing_array override now forwards
# to super().test_argsort_missing_array(); it previously forwarded to
# super().test_argsort_missing(), which this same patch already overrides
# separately, so the array-level base test was never exercised.
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index 55a6cc48ebfc8..762fc060c0ffd 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -4,6 +4,9 @@
 import numpy as np
 import pytest
 
+from pandas.compat import pa_version_under7p0
+from pandas.errors import PerformanceWarning
+
 import pandas as pd
 from pandas import (
     DatetimeIndex,
@@ -36,8 +39,16 @@ def test_value_counts(index_or_series_obj):
     # TODO(GH#32514): Order of entries with the same count is inconsistent
     # on CI (gh-32449)
     if obj.duplicated().any():
-        result = result.sort_index()
-        expected = expected.sort_index()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
+        ):
+            result = result.sort_index()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
+        ):
+            expected = expected.sort_index()
     tm.assert_series_equal(result, expected)
 
 
@@ -70,8 +81,16 @@ def test_value_counts_null(null_obj, index_or_series_obj):
     if obj.duplicated().any():
         # TODO(GH#32514):
         # Order of entries with the same count is inconsistent on CI (gh-32449)
-        expected = expected.sort_index()
-        result = result.sort_index()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
+        ):
+            expected = expected.sort_index()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
+        ):
+            result = result.sort_index()
 
     if not isinstance(result.dtype, np.dtype):
         # i.e IntegerDtype
@@ -84,8 +103,16 @@ def test_value_counts_null(null_obj, index_or_series_obj):
     if obj.duplicated().any():
         # TODO(GH#32514):
         # Order of entries with the same count is inconsistent on CI (gh-32449)
-        expected = expected.sort_index()
-        result = result.sort_index()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
+        ):
+            expected = expected.sort_index()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
+        ):
+            result = result.sort_index()
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index e4293d6d70e38..fb7c0b32ff16d 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -18,7 +18,10 @@
 import numpy as np
 import pytest
 
-from pandas.compat import pa_version_under6p0
+from pandas.compat import (
+    pa_version_under6p0,
+    pa_version_under7p0,
+)
 from pandas.errors import PerformanceWarning
 
 import pandas as pd
@@ -167,6 +170,22 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
 
 
 class TestMethods(base.BaseMethodsTests):
+    def test_argsort(self, data_for_sorting):
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_for_sorting.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_argsort(data_for_sorting)
+
+    def test_argsort_missing(self, data_missing_for_sorting):
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_argsort_missing(data_missing_for_sorting)
+
     def test_argmin_argmax(
         self, data_for_sorting, data_missing_for_sorting, na_value, request
     ):
@@ -210,6 +229,89 @@ def test_argreduce_series(
             data_missing_for_sorting, op_name, skipna, expected
         )
 
+    @pytest.mark.parametrize("dropna", [True, False])
+    def test_value_counts(self, all_data, dropna, request):
+        all_data = all_data[:10]
+        if dropna:
+            other = all_data[~all_data.isna()]
+        else:
+            other = all_data
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(all_data.dtype, "storage", "") == "pyarrow"
+            and not (dropna and "data_missing" in request.node.nodeid),
+        ):
+            result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(other.dtype, "storage", "") == "pyarrow"
+            and not (dropna and "data_missing" in request.node.nodeid),
+        ):
+            expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
+
+        self.assert_series_equal(result, expected)
+
+    @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning")
+    def test_value_counts_with_normalize(self, data):
+        super().test_value_counts_with_normalize(data)
+
+    def test_argsort_missing_array(self, data_missing_for_sorting):
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_argsort_missing_array(data_missing_for_sorting)
+
+    @pytest.mark.parametrize(
+        "na_position, expected",
+        [
+            ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))),
+            ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))),
+        ],
+    )
+    def test_nargsort(self, data_missing_for_sorting, na_position, expected):
+        # GH 25439
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_nargsort(data_missing_for_sorting, na_position, expected)
+
+    @pytest.mark.parametrize("ascending", [True, False])
+    def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_for_sorting.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_sort_values(data_for_sorting, ascending, sort_by_key)
+
+    @pytest.mark.parametrize("ascending", [True, False])
+    def test_sort_values_missing(
+        self, data_missing_for_sorting, ascending, sort_by_key
+    ):
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_sort_values_missing(
+                data_missing_for_sorting, ascending, sort_by_key
+            )
+
+    @pytest.mark.parametrize("ascending", [True, False])
+    def test_sort_values_frame(self, data_for_sorting, ascending):
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_for_sorting.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_sort_values_frame(data_for_sorting, ascending)
+
 
 class TestCasting(base.BaseCastingTests):
     pass
@@ -236,8 +338,41 @@ class TestPrinting(base.BasePrintingTests):
 
 
 class TestGroupBy(base.BaseGroupbyTests):
-    def test_groupby_extension_transform(self, data_for_grouping, request):
-        super().test_groupby_extension_transform(data_for_grouping)
+    @pytest.mark.parametrize("as_index", [True, False])
+    def test_groupby_extension_agg(self, as_index, data_for_grouping):
+        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_for_grouping.dtype, "storage", "") == "pyarrow",
+        ):
+            result = df.groupby("B", as_index=as_index).A.mean()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_for_grouping.dtype, "storage", "") == "pyarrow",
+        ):
+            _, uniques = pd.factorize(data_for_grouping, sort=True)
+
+        if as_index:
+            index = pd.Index._with_infer(uniques, name="B")
+            expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A")
+            self.assert_series_equal(result, expected)
+        else:
+            expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0, 4.0]})
+            self.assert_frame_equal(result, expected)
+
+    def test_groupby_extension_transform(self, data_for_grouping):
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_for_grouping.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_groupby_extension_transform(data_for_grouping)
+
+    @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning")
+    def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
+        super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)
 
 
 class Test2DCompat(base.Dim2CompatTests):
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index def63c552e059..eb2b8be2d716b 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -9,6 +9,8 @@
     algos as libalgos,
     hashtable as ht,
 )
+from pandas.compat import pa_version_under7p0
+from pandas.errors import PerformanceWarning
 import pandas.util._test_decorators as td
 
 from pandas.core.dtypes.common import (
@@ -50,7 +52,13 @@ class TestFactorize:
     @pytest.mark.parametrize("sort", [True, False])
     def test_factorize(self, index_or_series_obj, sort):
         obj = index_or_series_obj
-        result_codes, result_uniques = obj.factorize(sort=sort)
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            sort
+            and pa_version_under7p0
+            and getattr(obj.dtype, "storage", "") == "pyarrow",
+        ):
+            result_codes, result_uniques = obj.factorize(sort=sort)
 
         constructor = Index
         if isinstance(obj, MultiIndex):
@@ -64,7 +72,11 @@ def test_factorize(self, index_or_series_obj, sort):
             expected_uniques = expected_uniques.astype(object)
 
         if sort:
-            expected_uniques = expected_uniques.sort_values()
+            with tm.maybe_produces_warning(
+                PerformanceWarning,
+                pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
+            ):
+                expected_uniques = expected_uniques.sort_values()
 
         # construct an integer ndarray so that
         # `expected_uniques.take(expected_codes)` is equal to `obj`