TST: Filter/test pyarrow PerformanceWarnings (#48093)

mroeschke · web-flow · commit 786c28fe929e · 2022-08-19T17:17:14.000-04:00
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
@@ -4,6 +4,9 @@
 import numpy as np
 import pytest
 
+from pandas.compat import pa_version_under7p0
+from pandas.errors import PerformanceWarning
+
 import pandas as pd
 from pandas import (
     DatetimeIndex,
@@ -36,8 +39,16 @@ def test_value_counts(index_or_series_obj):
     # TODO(GH#32514): Order of entries with the same count is inconsistent
     #  on CI (gh-32449)
     if obj.duplicated().any():
-        result = result.sort_index()
-        expected = expected.sort_index()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
+        ):
+            result = result.sort_index()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
+        ):
+            expected = expected.sort_index()
     tm.assert_series_equal(result, expected)
 
 
@@ -70,8 +81,16 @@ def test_value_counts_null(null_obj, index_or_series_obj):
     if obj.duplicated().any():
         # TODO(GH#32514):
         #  Order of entries with the same count is inconsistent on CI (gh-32449)
-        expected = expected.sort_index()
-        result = result.sort_index()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
+        ):
+            expected = expected.sort_index()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
+        ):
+            result = result.sort_index()
 
     if not isinstance(result.dtype, np.dtype):
         # i.e IntegerDtype
@@ -84,8 +103,16 @@ def test_value_counts_null(null_obj, index_or_series_obj):
     if obj.duplicated().any():
         # TODO(GH#32514):
         #  Order of entries with the same count is inconsistent on CI (gh-32449)
-        expected = expected.sort_index()
-        result = result.sort_index()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
+        ):
+            expected = expected.sort_index()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
+        ):
+            result = result.sort_index()
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -18,7 +18,10 @@
 import numpy as np
 import pytest
 
-from pandas.compat import pa_version_under6p0
+from pandas.compat import (
+    pa_version_under6p0,
+    pa_version_under7p0,
+)
 from pandas.errors import PerformanceWarning
 
 import pandas as pd
@@ -167,6 +170,22 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
 
 
 class TestMethods(base.BaseMethodsTests):
+    def test_argsort(self, data_for_sorting):
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_for_sorting.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_argsort(data_for_sorting)
+
+    def test_argsort_missing(self, data_missing_for_sorting):
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_argsort_missing(data_missing_for_sorting)
+
     def test_argmin_argmax(
         self, data_for_sorting, data_missing_for_sorting, na_value, request
     ):
@@ -210,6 +229,89 @@ def test_argreduce_series(
             data_missing_for_sorting, op_name, skipna, expected
         )
 
+    @pytest.mark.parametrize("dropna", [True, False])
+    def test_value_counts(self, all_data, dropna, request):
+        all_data = all_data[:10]
+        if dropna:
+            other = all_data[~all_data.isna()]
+        else:
+            other = all_data
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(all_data.dtype, "storage", "") == "pyarrow"
+            and not (dropna and "data_missing" in request.node.nodeid),
+        ):
+            result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(other.dtype, "storage", "") == "pyarrow"
+            and not (dropna and "data_missing" in request.node.nodeid),
+        ):
+            expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
+
+        self.assert_series_equal(result, expected)
+
+    @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning")
+    def test_value_counts_with_normalize(self, data):
+        super().test_value_counts_with_normalize(data)
+
+    def test_argsort_missing_array(self, data_missing_for_sorting):
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_argsort_missing(data_missing_for_sorting)
+
+    @pytest.mark.parametrize(
+        "na_position, expected",
+        [
+            ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))),
+            ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))),
+        ],
+    )
+    def test_nargsort(self, data_missing_for_sorting, na_position, expected):
+        # GH 25439
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_nargsort(data_missing_for_sorting, na_position, expected)
+
+    @pytest.mark.parametrize("ascending", [True, False])
+    def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_for_sorting.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_sort_values(data_for_sorting, ascending, sort_by_key)
+
+    @pytest.mark.parametrize("ascending", [True, False])
+    def test_sort_values_missing(
+        self, data_missing_for_sorting, ascending, sort_by_key
+    ):
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_missing_for_sorting.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_sort_values_missing(
+                data_missing_for_sorting, ascending, sort_by_key
+            )
+
+    @pytest.mark.parametrize("ascending", [True, False])
+    def test_sort_values_frame(self, data_for_sorting, ascending):
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_for_sorting.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_sort_values_frame(data_for_sorting, ascending)
+
 
 class TestCasting(base.BaseCastingTests):
     pass
@@ -236,8 +338,41 @@ class TestPrinting(base.BasePrintingTests):
 
 
 class TestGroupBy(base.BaseGroupbyTests):
-    def test_groupby_extension_transform(self, data_for_grouping, request):
-        super().test_groupby_extension_transform(data_for_grouping)
+    @pytest.mark.parametrize("as_index", [True, False])
+    def test_groupby_extension_agg(self, as_index, data_for_grouping):
+        df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_for_grouping.dtype, "storage", "") == "pyarrow",
+        ):
+            result = df.groupby("B", as_index=as_index).A.mean()
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_for_grouping.dtype, "storage", "") == "pyarrow",
+        ):
+            _, uniques = pd.factorize(data_for_grouping, sort=True)
+
+        if as_index:
+            index = pd.Index._with_infer(uniques, name="B")
+            expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A")
+            self.assert_series_equal(result, expected)
+        else:
+            expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0, 4.0]})
+            self.assert_frame_equal(result, expected)
+
+    def test_groupby_extension_transform(self, data_for_grouping):
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under7p0
+            and getattr(data_for_grouping.dtype, "storage", "") == "pyarrow",
+        ):
+            super().test_groupby_extension_transform(data_for_grouping)
+
+    @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning")
+    def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
+        super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)
 
 
 class Test2DCompat(base.Dim2CompatTests):
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -9,6 +9,8 @@
     algos as libalgos,
     hashtable as ht,
 )
+from pandas.compat import pa_version_under7p0
+from pandas.errors import PerformanceWarning
 import pandas.util._test_decorators as td
 
 from pandas.core.dtypes.common import (
@@ -50,7 +52,13 @@ class TestFactorize:
     @pytest.mark.parametrize("sort", [True, False])
     def test_factorize(self, index_or_series_obj, sort):
         obj = index_or_series_obj
-        result_codes, result_uniques = obj.factorize(sort=sort)
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            sort
+            and pa_version_under7p0
+            and getattr(obj.dtype, "storage", "") == "pyarrow",
+        ):
+            result_codes, result_uniques = obj.factorize(sort=sort)
 
         constructor = Index
         if isinstance(obj, MultiIndex):
@@ -64,7 +72,11 @@ def test_factorize(self, index_or_series_obj, sort):
             expected_uniques = expected_uniques.astype(object)
 
         if sort:
-            expected_uniques = expected_uniques.sort_values()
+            with tm.maybe_produces_warning(
+                PerformanceWarning,
+                pa_version_under7p0 and getattr(obj.dtype, "storage", "") == "pyarrow",
+            ):
+                expected_uniques = expected_uniques.sort_values()
 
         # construct an integer ndarray so that
         # `expected_uniques.take(expected_codes)` is equal to `obj`