From ecf04f11f23efe1f1f6562df27e680cbaf34f0fa Mon Sep 17 00:00:00 2001 From: Artem Vorobyev Date: Tue, 18 Apr 2023 16:53:03 +0200 Subject: [PATCH 1/7] DF join cow tests --- pandas/tests/copy_view/test_functions.py | 90 ++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 53d72baf7da4e..caa9b700a69bd 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -310,3 +310,93 @@ def test_merge_copy_keyword(using_copy_on_write, copy): else: assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) + + +def test_join_on_key(using_copy_on_write): + """Test if DataFrame.join applies Copy-On-Write optimization. + + GIVEN two DataFrame instances + WHEN DataFrame.join is called for one of them + THEN check that the result DataFrame instance + shares the same memory with original dataframes until it is edited. 
+ """ + df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}) + df2 = DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]}) + df1_orig = df1.copy() + df2_orig = df2.copy() + + result = df1.join(df2.set_index("key"), on="key") + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + assert np.shares_memory(get_array(result, "key"), get_array(df1, "key")) + assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key")) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + + result.iloc[0, 1] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + + result.iloc[0, 2] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + tm.assert_frame_equal(df1, df1_orig) + tm.assert_frame_equal(df2, df2_orig) + + +def test_join_multiple_dataframes_on_key(using_copy_on_write): + """Test if DataFrame.join applies Copy-On-Write optimization. + + GIVEN a DataFrame instance and a list of DataFrame instances to be joined + WHEN DataFrame.join is called for original DataFrame instance + THEN check that the result DataFrame instance + shares the same memory with original dataframes until it is edited. 
+ """ + df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}).set_index("key") + dfs_list = [ + DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]}).set_index("key"), + DataFrame({"key": ["a", "b", "c"], "c": [7, 8, 9]}).set_index("key"), + ] + df1_orig = df1.copy() + dfs_list_orig = [df.copy() for df in dfs_list] + + result = df1.join(dfs_list) + + if using_copy_on_write: + assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b")) + assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c")) + assert np.shares_memory(get_array(result.index), get_array(df1.index)) + assert not np.shares_memory( + get_array(result.index), get_array(dfs_list[0].index) + ) + assert not np.shares_memory( + get_array(result.index), get_array(dfs_list[1].index) + ) + else: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert not np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b")) + assert not np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c")) + + result.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) + assert np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b")) + assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c")) + + result.iloc[0, 1] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b")) + assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c")) + + result.iloc[0, 2] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c")) + + tm.assert_frame_equal(df1, df1_orig) + for df, df_orig in zip(dfs_list, dfs_list_orig): + tm.assert_frame_equal(df, df_orig) From 14970640c94fbaa215137f182d98dcf40cd2c4f4 Mon Sep 17 00:00:00 2001 From: SecretLake Date: Thu, 20 Apr 2023 13:19:55 
+0000 Subject: [PATCH 2/7] PR feedback --- pandas/tests/copy_view/test_functions.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index caa9b700a69bd..a59d800e6bb06 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -3,6 +3,7 @@ from pandas import ( DataFrame, + Index, Series, concat, merge, @@ -313,13 +314,6 @@ def test_merge_copy_keyword(using_copy_on_write, copy): def test_join_on_key(using_copy_on_write): - """Test if DataFrame.join applies Copy-On-Write optimization. - - GIVEN two DataFrame instances - WHEN DataFrame.join is called for one of them - THEN check that the result DataFrame instance - shares the same memory with original dataframes until it is edited. - """ df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}) df2 = DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]}) df1_orig = df1.copy() @@ -349,17 +343,11 @@ def test_join_on_key(using_copy_on_write): def test_join_multiple_dataframes_on_key(using_copy_on_write): - """Test if DataFrame.join applies Copy-On-Write optimization. - - GIVEN a DataFrame instance and a list of DataFrame instances to be joined - WHEN DataFrame.join is called for original DataFrame instance - THEN check that the result DataFrame instance - shares the same memory with original dataframes until it is edited. 
- """ - df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}).set_index("key") + df_index = Index(["a", "b", "c"], name="key") + df1 = DataFrame({"a": [1, 2, 3]}, index=df_index) dfs_list = [ - DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]}).set_index("key"), - DataFrame({"key": ["a", "b", "c"], "c": [7, 8, 9]}).set_index("key"), + DataFrame({"b": [4, 5, 6]}, index=df_index), + DataFrame({"c": [7, 8, 9]}, index=df_index), ] df1_orig = df1.copy() dfs_list_orig = [df.copy() for df in dfs_list] From 4c6c1bf82a3d743a771b0c794bb43de00372b420 Mon Sep 17 00:00:00 2001 From: SecretLake Date: Thu, 20 Apr 2023 13:34:52 +0000 Subject: [PATCH 3/7] Integrate PR feedback for test_join_on_key --- pandas/tests/copy_view/test_functions.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index a59d800e6bb06..d2debd29a125b 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -314,12 +314,15 @@ def test_merge_copy_keyword(using_copy_on_write, copy): def test_join_on_key(using_copy_on_write): - df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]}) - df2 = DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]}) + df_index = Index(["a", "b", "c"], name="key") + + df1 = DataFrame({"a": [1, 2, 3]}, index=df_index) + df2 = DataFrame({"b": [4, 5, 6]}, index=df_index) + df1_orig = df1.copy() df2_orig = df2.copy() - result = df1.join(df2.set_index("key"), on="key") + result = df1.join(df2, on="key") if using_copy_on_write: assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) @@ -330,12 +333,12 @@ def test_join_on_key(using_copy_on_write): assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) - result.iloc[0, 1] = 0 + result.loc[0, 1] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(result, "a"), 
get_array(df1, "a")) assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) - result.iloc[0, 2] = 0 + result.loc[0, 2] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) tm.assert_frame_equal(df1, df1_orig) @@ -344,11 +347,13 @@ def test_join_on_key(using_copy_on_write): def test_join_multiple_dataframes_on_key(using_copy_on_write): df_index = Index(["a", "b", "c"], name="key") + df1 = DataFrame({"a": [1, 2, 3]}, index=df_index) dfs_list = [ DataFrame({"b": [4, 5, 6]}, index=df_index), DataFrame({"c": [7, 8, 9]}, index=df_index), ] + df1_orig = df1.copy() dfs_list_orig = [df.copy() for df in dfs_list] From 967c46c2182e07d7865bc0ae60789b758bbd026f Mon Sep 17 00:00:00 2001 From: SecretLake Date: Thu, 20 Apr 2023 15:48:43 +0000 Subject: [PATCH 4/7] Fix tests --- pandas/tests/copy_view/test_functions.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index d2debd29a125b..07cedae3b74b0 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -314,10 +314,8 @@ def test_merge_copy_keyword(using_copy_on_write, copy): def test_join_on_key(using_copy_on_write): - df_index = Index(["a", "b", "c"], name="key") - - df1 = DataFrame({"a": [1, 2, 3]}, index=df_index) - df2 = DataFrame({"b": [4, 5, 6]}, index=df_index) + df1 = DataFrame({"a": [1, 2, 3]}, index=Index(["a", "b", "c"], name="key")) + df2 = DataFrame({"b": [4, 5, 6]}, index=Index(["a", "b", "c"], name="key")) df1_orig = df1.copy() df2_orig = df2.copy() @@ -327,31 +325,31 @@ def test_join_on_key(using_copy_on_write): if using_copy_on_write: assert np.shares_memory(get_array(result, "a"), get_array(df1, "a")) assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) - assert np.shares_memory(get_array(result, "key"), get_array(df1, "key")) - assert not 
np.shares_memory(get_array(result, "key"), get_array(df2, "key")) + assert np.shares_memory(get_array(result.index), get_array(df1.index)) + assert not np.shares_memory(get_array(result.index), get_array(df2.index)) else: assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) - result.loc[0, 1] = 0 + result.iloc[0, 0] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a")) assert np.shares_memory(get_array(result, "b"), get_array(df2, "b")) - result.loc[0, 2] = 0 + result.iloc[0, 1] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) + tm.assert_frame_equal(df1, df1_orig) tm.assert_frame_equal(df2, df2_orig) def test_join_multiple_dataframes_on_key(using_copy_on_write): - df_index = Index(["a", "b", "c"], name="key") + df1 = DataFrame({"a": [1, 2, 3]}, index=Index(["a", "b", "c"], name="key")) - df1 = DataFrame({"a": [1, 2, 3]}, index=df_index) dfs_list = [ - DataFrame({"b": [4, 5, 6]}, index=df_index), - DataFrame({"c": [7, 8, 9]}, index=df_index), + DataFrame({"b": [4, 5, 6]}, index=Index(["a", "b", "c"], name="key")), + DataFrame({"c": [7, 8, 9]}, index=Index(["a", "b", "c"], name="key")), ] df1_orig = df1.copy() From 481ab5e2022dab04b56b470c639388af34e6894b Mon Sep 17 00:00:00 2001 From: SecretLake Date: Thu, 20 Apr 2023 15:54:58 +0000 Subject: [PATCH 5/7] Fix pre-commit --- pandas/tests/copy_view/test_functions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 07cedae3b74b0..09e44fee31c0d 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -319,7 +319,6 @@ def test_join_on_key(using_copy_on_write): df1_orig = df1.copy() df2_orig = df2.copy() - result = df1.join(df2, on="key") if using_copy_on_write: From 
c4e7f2ae26a90ff75f45a72000c8afd6aabfca16 Mon Sep 17 00:00:00 2001 From: SecretLake Date: Thu, 20 Apr 2023 15:55:39 +0000 Subject: [PATCH 6/7] Fix pre-commit --- pandas/tests/copy_view/test_functions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 09e44fee31c0d..d0cdb8af9b19b 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -319,6 +319,7 @@ def test_join_on_key(using_copy_on_write): df1_orig = df1.copy() df2_orig = df2.copy() + result = df1.join(df2, on="key") if using_copy_on_write: @@ -338,7 +339,7 @@ def test_join_on_key(using_copy_on_write): result.iloc[0, 1] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b")) - + tm.assert_frame_equal(df1, df1_orig) tm.assert_frame_equal(df2, df2_orig) From 7f6635c27b74f376785eb787dd8af018f758d00b Mon Sep 17 00:00:00 2001 From: SecretLake Date: Thu, 20 Apr 2023 16:02:34 +0000 Subject: [PATCH 7/7] Copy index instead of setting it multiple times --- pandas/tests/copy_view/test_functions.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index d0cdb8af9b19b..56e4b186350f2 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -314,8 +314,10 @@ def test_merge_copy_keyword(using_copy_on_write, copy): def test_join_on_key(using_copy_on_write): - df1 = DataFrame({"a": [1, 2, 3]}, index=Index(["a", "b", "c"], name="key")) - df2 = DataFrame({"b": [4, 5, 6]}, index=Index(["a", "b", "c"], name="key")) + df_index = Index(["a", "b", "c"], name="key") + + df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True)) + df2 = DataFrame({"b": [4, 5, 6]}, index=df_index.copy(deep=True)) df1_orig = df1.copy() df2_orig = df2.copy() @@ -345,11 +347,12 @@ def 
test_join_on_key(using_copy_on_write): def test_join_multiple_dataframes_on_key(using_copy_on_write): - df1 = DataFrame({"a": [1, 2, 3]}, index=Index(["a", "b", "c"], name="key")) + df_index = Index(["a", "b", "c"], name="key") + df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True)) dfs_list = [ - DataFrame({"b": [4, 5, 6]}, index=Index(["a", "b", "c"], name="key")), - DataFrame({"c": [7, 8, 9]}, index=Index(["a", "b", "c"], name="key")), + DataFrame({"b": [4, 5, 6]}, index=df_index.copy(deep=True)), + DataFrame({"c": [7, 8, 9]}, index=df_index.copy(deep=True)), ] df1_orig = df1.copy()