-
Notifications
You must be signed in to change notification settings - Fork 50
feat: df.join lsuffix and rsuffix support #1857
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
8e85a0b
515c985
481a6bb
e66a0a1
798d3d5
14a1c54
8c6630b
69fa715
9748b35
53ef0cc
8b09d10
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2816,12 +2816,99 @@ def test_join_different_table( | |
assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) | ||
|
||
|
||
def test_join_duplicate_columns_raises_not_implemented(scalars_dfs):
    """Check that an outer join with an overlapping column name raises.

    ``float64_col`` is selected on both sides of the join and no suffixes
    are passed; materializing the result is expected to raise
    ``NotImplementedError``.
    """
    # Only the first element of the fixture pair is used by this test.
    source_df, _unused = scalars_dfs
    left = source_df[["string_col", "float64_col"]]
    right = source_df[["float64_col"]]
    with pytest.raises(NotImplementedError):
        left.join(right, how="outer").to_pandas()
@all_joins
def test_join_different_table_with_duplicate_column_name(
    scalars_df_index, scalars_pandas_df_index, how
):
    """Index join of two frames that each carry a duplicated column label.

    On both sides ``int64_too`` is renamed to ``int64_col``, so each frame
    ends up with two columns labelled ``int64_col``. The join is run with
    ``lsuffix="_l"`` and ``rsuffix="_r"`` and the result is compared with
    what pandas produces for the same operation on the pandas fixture.

    Args:
        scalars_df_index: frame under test (provides ``.to_pandas()``).
        scalars_pandas_df_index: pandas frame used to build the expectation.
        how: join type supplied by the ``all_joins`` parametrization.
    """
    columns = ["string_col", "int64_col", "int64_too"]
    duplicate_label = {"int64_too": "int64_col"}

    bf_df_a = scalars_df_index[columns].rename(columns=duplicate_label)
    # The right-hand side drops rows with nulls before selecting columns.
    bf_df_b = scalars_df_index.dropna()[columns].rename(columns=duplicate_label)
    bf_result = bf_df_a.join(bf_df_b, how=how, lsuffix="_l", rsuffix="_r").to_pandas()

    pd_df_a = scalars_pandas_df_index[columns].rename(columns=duplicate_label)
    pd_df_b = scalars_pandas_df_index.dropna()[columns].rename(
        columns=duplicate_label
    )
    pd_result = pd_df_a.join(pd_df_b, how=how, lsuffix="_l", rsuffix="_r")

    # Debug print() calls were removed; the assertion below already reports
    # any mismatch between the two frames.
    pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
|
||
|
||
@all_joins
def test_join_param_on_with_duplicate_column_name_not_on_col(
    scalars_df_index, scalars_pandas_df_index, how
):
    """Join with ``on="int64_too"`` when other column labels are duplicated.

    ``timestamp_col`` is renamed to ``datetime_col`` on both sides, so each
    frame has a duplicated ``datetime_col`` label. The ``on`` column,
    ``int64_too``, is selected only on the left-hand frame. The result is
    compared with the pandas result for the same join.
    """
    # Duplicate column names are present, but the 'on' column is not duplicated.
    if how == "cross":
        return

    left_columns = ["string_col", "datetime_col", "timestamp_col", "int64_too"]
    right_columns = ["string_col", "datetime_col", "timestamp_col"]
    relabel = {"timestamp_col": "datetime_col"}
    join_kwargs = dict(on="int64_too", how=how, lsuffix="_l", rsuffix="_r")

    bf_left = scalars_df_index[left_columns].rename(columns=relabel)
    bf_right = scalars_df_index.dropna()[right_columns].rename(columns=relabel)
    actual = bf_left.join(bf_right, **join_kwargs).to_pandas()

    pd_left = scalars_pandas_df_index[left_columns].rename(columns=relabel)
    pd_right = scalars_pandas_df_index.dropna()[right_columns].rename(
        columns=relabel
    )
    expected = pd_left.join(pd_right, **join_kwargs)

    # Both frames are sorted by index before comparison; check_like=True
    # additionally makes the comparison ignore index/column ordering.
    pd.testing.assert_frame_equal(
        actual.sort_index(),
        expected.sort_index(),
        check_like=True,
        check_index_type=False,
    )
|
||
|
||
@pytest.mark.skipif(
    pandas.__version__.startswith("1."), reason="bad left join in pandas 1.x"
)
@all_joins
def test_join_param_on_with_duplicate_column_name_on_col(
    scalars_df_index, scalars_pandas_df_index, how
):
    """Join with ``on="int64_too"`` when the ``on`` column is on both sides.

    ``timestamp_col`` is renamed to ``datetime_col`` on both frames (giving
    each a duplicated ``datetime_col`` label), and both frames also select
    ``int64_too``, the column passed as ``on``. The join uses
    ``lsuffix="_l"`` / ``rsuffix="_r"`` and the result is compared with the
    pandas result for the same join.

    Args:
        scalars_df_index: frame under test (provides ``.to_pandas()``).
        scalars_pandas_df_index: pandas frame used to build the expectation.
        how: join type supplied by the ``all_joins`` parametrization.
    """
    # This test is for duplicate column names, and the 'on' column is duplicated.
    if how == "cross":
        return

    columns = ["string_col", "datetime_col", "timestamp_col", "int64_too"]
    relabel = {"timestamp_col": "datetime_col"}

    bf_df_a = scalars_df_index[columns].rename(columns=relabel)
    bf_df_b = scalars_df_index.dropna()[columns].rename(columns=relabel)
    bf_result = bf_df_a.join(
        bf_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r"
    ).to_pandas()

    pd_df_a = scalars_pandas_df_index[columns].rename(columns=relabel)
    pd_df_b = scalars_pandas_df_index.dropna()[columns].rename(columns=relabel)
    pd_result = pd_df_a.join(
        pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r"
    )

    # Debug print() calls were removed; the assertion reports any mismatch.
    # Frames are index-sorted first, and check_like=True makes the comparison
    # ignore index/column ordering.
    pd.testing.assert_frame_equal(
        bf_result.sort_index(),
        pd_result.sort_index(),
        check_like=True,
        check_index_type=False,
    )
|
||
|
||
@all_joins | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2443,12 +2443,36 @@ def test_join_different_table( | |
assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) | ||
|
||
|
||
def test_join_duplicate_columns_raises_not_implemented(scalars_dfs): | ||
@all_joins
def test_join_raise_when_param_on_duplicate_with_column(scalars_df_index, how):
    """Expect ``ValueError`` when the ``on`` label is ambiguous on the left.

    The left frame has two columns labelled ``string_col`` (``int64_col`` is
    renamed to it) and its index is also named ``string_col``; joining on
    ``"string_col"`` is expected to raise.
    """
    if how == "cross":
        return

    left = scalars_df_index[["string_col", "int64_col"]].rename(
        columns={"int64_col": "string_col"}
    )
    left.index.name = "string_col"
    right = scalars_df_index.dropna()["string_col"]

    join_kwargs = dict(on="string_col", how=how, lsuffix="_l", rsuffix="_r")
    with pytest.raises(ValueError):
        left.join(right, **join_kwargs)
|
||
|
||
def test_join_duplicate_columns_raises_value_error(scalars_dfs):
    """Check that joining frames with an overlapping column name raises.

    ``float64_col`` is selected on both sides of the join and no
    ``lsuffix``/``rsuffix`` is passed, so the call is expected to raise
    ``ValueError``.

    The earlier ``pytest.raises(NotImplementedError)`` block was removed: it
    contradicted this test's name, and a ``ValueError`` raised by ``join``
    would have escaped it and failed the test before the real assertion ran.
    """
    scalars_df, _ = scalars_dfs
    df_a = scalars_df[["string_col", "float64_col"]]
    df_b = scalars_df[["float64_col"]]
    # The error is expected from join() itself, so the result is not materialized.
    with pytest.raises(ValueError):
        df_a.join(df_b, how="outer")
|
||
|
||
@all_joins
def test_join_param_on_duplicate_with_index_raises_value_error(scalars_df_index, how):
    """Expect ``ValueError`` when ``on`` matches both a column and the index name.

    The left frame has a ``string_col`` column and its index is renamed to
    ``string_col`` as well; joining on ``"string_col"`` is expected to raise.
    """
    # NOTE(review): per the review thread, how="cross" reportedly raises a
    # different error, so it is not asserted here — TODO confirm and cover.
    if how == "cross":
        return

    left = scalars_df_index[["string_col"]]
    left.index.name = "string_col"
    right = scalars_df_index.dropna()["string_col"]

    join_kwargs = dict(on="string_col", how=how, lsuffix="_l", rsuffix="_r")
    with pytest.raises(ValueError):
        left.join(right, **join_kwargs)
|
||
|
||
@all_joins | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: This `if` block is getting pretty long. Might be time for a helper function.