From 2e55728f0fd268448cfd0a327790fae75e6bc98d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 9 May 2021 10:34:24 +0100 Subject: [PATCH 1/3] [ArrowStringArray] TST: parameterise str.split tests --- asv_bench/benchmarks/strings.py | 23 ++- pandas/tests/strings/test_split_partition.py | 187 +++++++++++-------- 2 files changed, 129 insertions(+), 81 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 45a9053954569..e77f74b90d6c8 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -230,17 +230,24 @@ def time_contains(self, dtype, regex): class Split: - params = [True, False] - param_names = ["expand"] + params = (["str", "string", "arrow_string"], [None, "-", "--"], [True, False]) + param_names = ["dtype", "pat", "expand"] - def setup(self, expand): - self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("--") + def setup(self, dtype, pat, expand): + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + if pat is None: + pat = " " + try: + self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join(pat) + except ImportError: + raise NotImplementedError - def time_split(self, expand): - self.s.str.split("--", expand=expand) + def time_split(self, dtype, pat, expand): + self.s.str.split(pat, expand=expand) - def time_rsplit(self, expand): - self.s.str.rsplit("--", expand=expand) + def time_rsplit(self, dtype, pat, expand): + self.s.str.rsplit(pat, expand=expand) class Dummies: diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 6df8fa805955d..f153554cc9057 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -13,22 +13,29 @@ ) -def test_split(): - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) +def test_split(any_string_dtype): + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) result = 
values.str.split("_") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) # more than one char - values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"]) + values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype) result = values.str.split("__") tm.assert_series_equal(result, exp) result = values.str.split("__", expand=False) tm.assert_series_equal(result, exp) - # mixed + # regex split + values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) + result = values.str.split("[,_]") + exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) + tm.assert_series_equal(result, exp) + + +def test_split_object_mixed(): mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) result = mixed.str.split("_") exp = Series( @@ -50,17 +57,10 @@ def test_split(): assert isinstance(result, Series) tm.assert_almost_equal(result, exp) - # regex split - values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"]) - result = values.str.split("[,_]") - exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - tm.assert_series_equal(result, exp) - -@pytest.mark.parametrize("dtype", [object, "string"]) @pytest.mark.parametrize("method", ["split", "rsplit"]) -def test_split_n(dtype, method): - s = Series(["a b", pd.NA, "b c"], dtype=dtype) +def test_split_n(any_string_dtype, method): + s = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype) expected = Series([["a", "b"], pd.NA, ["b", "c"]]) result = getattr(s.str, method)(" ", n=None) @@ -70,20 +70,34 @@ def test_split_n(dtype, method): tm.assert_series_equal(result, expected) -def test_rsplit(): - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) +def test_rsplit(any_string_dtype): + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) result = values.str.rsplit("_") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) 
tm.assert_series_equal(result, exp) # more than one char - values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"]) + values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype) result = values.str.rsplit("__") tm.assert_series_equal(result, exp) result = values.str.rsplit("__", expand=False) tm.assert_series_equal(result, exp) + # regex split is not supported by rsplit + values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) + result = values.str.rsplit("[,_]") + exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]]) + tm.assert_series_equal(result, exp) + + # setting max number of splits, make sure it's from reverse + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) + result = values.str.rsplit("_", n=1) + exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]]) + tm.assert_series_equal(result, exp) + + +def test_rsplit_object_mixed(): # mixed mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]) result = mixed.str.rsplit("_") @@ -106,27 +120,15 @@ def test_rsplit(): assert isinstance(result, Series) tm.assert_almost_equal(result, exp) - # regex split is not supported by rsplit - values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"]) - result = values.str.rsplit("[,_]") - exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]]) - tm.assert_series_equal(result, exp) - # setting max number of splits, make sure it's from reverse - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) - result = values.str.rsplit("_", n=1) - exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]]) - tm.assert_series_equal(result, exp) - - -def test_split_blank_string(): +def test_split_blank_string(any_string_dtype): # expand blank split GH 20067 - values = Series([""], name="test") + values = Series([""], name="test", dtype=any_string_dtype) result = values.str.split(expand=True) - exp = DataFrame([[]]) # NOTE: this is NOT an empty DataFrame + exp = DataFrame([[]], 
dtype=any_string_dtype) # NOTE: this is NOT an empty df tm.assert_frame_equal(result, exp) - values = Series(["a b c", "a b", "", " "], name="test") + values = Series(["a b c", "a b", "", " "], name="test", dtype=any_string_dtype) result = values.str.split(expand=True) exp = DataFrame( [ @@ -134,14 +136,15 @@ def test_split_blank_string(): ["a", "b", np.nan], [np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan], - ] + ], + dtype=any_string_dtype, ) tm.assert_frame_equal(result, exp) -def test_split_noargs(): +def test_split_noargs(any_string_dtype): # #1859 - s = Series(["Wes McKinney", "Travis Oliphant"]) + s = Series(["Wes McKinney", "Travis Oliphant"], dtype=any_string_dtype) result = s.str.split() expected = ["Travis", "Oliphant"] assert result[1] == expected @@ -149,44 +152,64 @@ def test_split_noargs(): assert result[1] == expected -def test_split_maxsplit(): +@pytest.mark.parametrize( + "data, pat", + [ + (["bd asdf jfg", "kjasdflqw asdfnfk"], None), + (["bd asdf jfg", "kjasdflqw asdfnfk"], "asdf"), + (["bd_asdf_jfg", "kjasdflqw_asdfnfk"], "_"), + ], +) +def test_split_maxsplit(data, pat, any_string_dtype): # re.split 0, str.split -1 - s = Series(["bd asdf jfg", "kjasdflqw asdfnfk"]) + s = Series(data, dtype=any_string_dtype) - result = s.str.split(n=-1) - xp = s.str.split() + result = s.str.split(pat=pat, n=-1) + xp = s.str.split(pat=pat) tm.assert_series_equal(result, xp) - result = s.str.split(n=0) - tm.assert_series_equal(result, xp) - - xp = s.str.split("asdf") - result = s.str.split("asdf", n=0) - tm.assert_series_equal(result, xp) - - result = s.str.split("asdf", n=-1) + result = s.str.split(pat=pat, n=0) tm.assert_series_equal(result, xp) -def test_split_no_pat_with_nonzero_n(): - s = Series(["split once", "split once too!"]) - result = s.str.split(n=1) - expected = Series({0: ["split", "once"], 1: ["split", "once too!"]}) +@pytest.mark.parametrize( + "data, pat, expected", + [ + ( + ["split once", "split once too!"], + None, + Series({0: ["split", 
"once"], 1: ["split", "once too!"]}), + ), + ( + ["split_once", "split_once_too!"], + "_", + Series({0: ["split", "once"], 1: ["split", "once_too!"]}), + ), + ], +) +def test_split_no_pat_with_nonzero_n(data, pat, expected, any_string_dtype): + s = Series(data, dtype=any_string_dtype) + result = s.str.split(pat=pat, n=1) tm.assert_series_equal(expected, result, check_index_type=False) -def test_split_to_dataframe(): - s = Series(["nosplit", "alsonosplit"]) +def test_split_to_dataframe(any_string_dtype): + s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype) result = s.str.split("_", expand=True) - exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}) + exp = DataFrame({0: Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)}) tm.assert_frame_equal(result, exp) - s = Series(["some_equal_splits", "with_no_nans"]) + s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype) result = s.str.split("_", expand=True) - exp = DataFrame({0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}) + exp = DataFrame( + {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}, + dtype=any_string_dtype, + ) tm.assert_frame_equal(result, exp) - s = Series(["some_unequal_splits", "one_of_these_things_is_not"]) + s = Series( + ["some_unequal_splits", "one_of_these_things_is_not"], dtype=any_string_dtype + ) result = s.str.split("_", expand=True) exp = DataFrame( { @@ -196,14 +219,19 @@ def test_split_to_dataframe(): 3: [np.nan, "things"], 4: [np.nan, "is"], 5: [np.nan, "not"], - } + }, + dtype=any_string_dtype, ) tm.assert_frame_equal(result, exp) - s = Series(["some_splits", "with_index"], index=["preserve", "me"]) + s = Series( + ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype + ) result = s.str.split("_", expand=True) exp = DataFrame( - {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"] + {0: ["some", "with"], 1: ["splits", "index"]}, + index=["preserve", "me"], + 
dtype=any_string_dtype, ) tm.assert_frame_equal(result, exp) @@ -250,15 +278,23 @@ def test_split_to_multiindex_expand(): idx.str.split("_", expand="not_a_boolean") -def test_rsplit_to_dataframe_expand(): - s = Series(["nosplit", "alsonosplit"]) +def test_rsplit_to_dataframe_expand(any_string_dtype, request): + if any_string_dtype != "object": + reason = 'Attribute "dtype" are different' + mark = pytest.mark.xfail(reason=reason, raises=AssertionError) + request.node.add_marker(mark) + + s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype) result = s.str.rsplit("_", expand=True) - exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}) + exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}, dtype=any_string_dtype) tm.assert_frame_equal(result, exp) - s = Series(["some_equal_splits", "with_no_nans"]) + s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype) result = s.str.rsplit("_", expand=True) - exp = DataFrame({0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}) + exp = DataFrame( + {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}, + dtype=any_string_dtype, + ) tm.assert_frame_equal(result, exp) result = s.str.rsplit("_", expand=True, n=2) @@ -297,30 +333,35 @@ def test_rsplit_to_multiindex_expand(): assert result.nlevels == 2 -def test_split_nan_expand(): +def test_split_nan_expand(any_string_dtype): # gh-18450 - s = Series(["foo,bar,baz", np.nan]) + s = Series(["foo,bar,baz", np.nan], dtype=any_string_dtype) result = s.str.split(",", expand=True) - exp = DataFrame([["foo", "bar", "baz"], [np.nan, np.nan, np.nan]]) + exp = DataFrame( + [["foo", "bar", "baz"], [np.nan, np.nan, np.nan]], dtype=any_string_dtype + ) tm.assert_frame_equal(result, exp) - # check that these are actually np.nan and not None + # check that these are actually np.nan/pd.NA and not None # TODO see GH 18463 # tm.assert_frame_equal does not differentiate - assert all(np.isnan(x) for x in result.iloc[1]) + if any_string_dtype == 
"object": + assert all(np.isnan(x) for x in result.iloc[1]) + else: + assert all(x is pd.NA for x in result.iloc[1]) -def test_split_with_name(): +def test_split_with_name(any_string_dtype): # GH 12617 # should preserve name - s = Series(["a,b", "c,d"], name="xxx") + s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype) res = s.str.split(",") exp = Series([["a", "b"], ["c", "d"]], name="xxx") tm.assert_series_equal(res, exp) res = s.str.split(",", expand=True) - exp = DataFrame([["a", "b"], ["c", "d"]]) + exp = DataFrame([["a", "b"], ["c", "d"]], dtype=any_string_dtype) tm.assert_frame_equal(res, exp) idx = Index(["a,b", "c,d"], name="xxx") From dbddda1cc1a8f09f03bbc8c48127a396b7fbb156 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 9 May 2021 11:08:57 +0100 Subject: [PATCH 2/3] fixup test_rsplit_to_dataframe_expand --- pandas/tests/strings/test_split_partition.py | 24 ++++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index f153554cc9057..bcd96ed04fbda 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -278,12 +278,7 @@ def test_split_to_multiindex_expand(): idx.str.split("_", expand="not_a_boolean") -def test_rsplit_to_dataframe_expand(any_string_dtype, request): - if any_string_dtype != "object": - reason = 'Attribute "dtype" are different' - mark = pytest.mark.xfail(reason=reason, raises=AssertionError) - request.node.add_marker(mark) - +def test_rsplit_to_dataframe_expand(any_string_dtype): s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype) result = s.str.rsplit("_", expand=True) exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}, dtype=any_string_dtype) @@ -298,17 +293,26 @@ def test_rsplit_to_dataframe_expand(any_string_dtype, request): tm.assert_frame_equal(result, exp) result = s.str.rsplit("_", expand=True, n=2) - exp = DataFrame({0: 
["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}) + exp = DataFrame( + {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}, + dtype=any_string_dtype, + ) tm.assert_frame_equal(result, exp) result = s.str.rsplit("_", expand=True, n=1) - exp = DataFrame({0: ["some_equal", "with_no"], 1: ["splits", "nans"]}) + exp = DataFrame( + {0: ["some_equal", "with_no"], 1: ["splits", "nans"]}, dtype=any_string_dtype + ) tm.assert_frame_equal(result, exp) - s = Series(["some_splits", "with_index"], index=["preserve", "me"]) + s = Series( + ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype + ) result = s.str.rsplit("_", expand=True) exp = DataFrame( - {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"] + {0: ["some", "with"], 1: ["splits", "index"]}, + index=["preserve", "me"], + dtype=any_string_dtype, ) tm.assert_frame_equal(result, exp) From 005d881eff02a28abbe29f2741cd98984a28bc73 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 10 May 2021 11:58:01 +0100 Subject: [PATCH 3/3] remove pat from benchmark for now --- asv_bench/benchmarks/strings.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index e77f74b90d6c8..79ea2a4fba284 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -230,24 +230,22 @@ def time_contains(self, dtype, regex): class Split: - params = (["str", "string", "arrow_string"], [None, "-", "--"], [True, False]) - param_names = ["dtype", "pat", "expand"] + params = (["str", "string", "arrow_string"], [True, False]) + param_names = ["dtype", "expand"] - def setup(self, dtype, pat, expand): + def setup(self, dtype, expand): from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - if pat is None: - pat = " " try: - self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join(pat) + self.s = Series(tm.makeStringIndex(10 ** 5), 
dtype=dtype).str.join("--") except ImportError: raise NotImplementedError - def time_split(self, dtype, pat, expand): - self.s.str.split(pat, expand=expand) + def time_split(self, dtype, expand): + self.s.str.split("--", expand=expand) - def time_rsplit(self, dtype, pat, expand): - self.s.str.rsplit(pat, expand=expand) + def time_rsplit(self, dtype, expand): + self.s.str.rsplit("--", expand=expand) class Dummies: