From e19d7cff6a4e47f6897deacb2c998eb96e4e39bf Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Tue, 14 Mar 2023 23:30:26 +0100
Subject: [PATCH 1/4] BUG: astype_view check raising on minimum versions build

---
 pandas/core/dtypes/astype.py          | 5 ++++-
 pandas/tests/copy_view/test_astype.py | 8 ++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py
index cd91f06d7ff04..09e338d205bbc 100644
--- a/pandas/core/dtypes/astype.py
+++ b/pandas/core/dtypes/astype.py
@@ -263,6 +263,9 @@ def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool:
     -------
     True if new data is a view or not guaranteed to be a copy, False otherwise
     """
+    if isinstance(dtype, np.dtype) and not isinstance(new_dtype, np.dtype):
+        new_dtype, dtype = dtype, new_dtype
+
     if dtype == new_dtype:
         return True
 
@@ -290,7 +293,7 @@ def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool:
         numpy_dtype = dtype
 
     if new_numpy_dtype is None and isinstance(new_dtype, np.dtype):
-        numpy_dtype = new_dtype
+        new_numpy_dtype = new_dtype
 
     if numpy_dtype is not None and new_numpy_dtype is not None:
         # if both have NumPy dtype or one of them is a numpy dtype
diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
index 310e811c0c6d8..d16434d8671e7 100644
--- a/pandas/tests/copy_view/test_astype.py
+++ b/pandas/tests/copy_view/test_astype.py
@@ -3,6 +3,7 @@
 
 from pandas.compat import pa_version_under7p0
 
+import pandas as pd
 from pandas import (
     DataFrame,
     Series,
@@ -84,6 +85,13 @@ def test_astype_different_target_dtype(using_copy_on_write, dtype):
     tm.assert_frame_equal(df2, df_orig.astype(dtype))
 
 
+def test_astype_numpy_to_ea():
+    ser = Series([1, 2, 3])
+    with pd.option_context("mode.copy_on_write", True):
+        result = ser.astype("Int64")
+    assert np.shares_memory(get_array(ser), get_array(result))
+
+
 @pytest.mark.parametrize(
     "dtype, new_dtype", [("object", "string"), ("string", "object")]
 )

From 6722815e07a801de472f21ebfda65caec8d98f67 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Wed, 15 Mar 2023 01:07:55 +0100
Subject: [PATCH 2/4] Skip for array manager

---
 pandas/tests/copy_view/test_astype.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
index d16434d8671e7..16c060d004bc7 100644
--- a/pandas/tests/copy_view/test_astype.py
+++ b/pandas/tests/copy_view/test_astype.py
@@ -2,6 +2,7 @@
 import pytest
 
 from pandas.compat import pa_version_under7p0
+import pandas.util._test_decorators as td
 
 import pandas as pd
 from pandas import (
@@ -85,6 +86,7 @@ def test_astype_different_target_dtype(using_copy_on_write, dtype):
     tm.assert_frame_equal(df2, df_orig.astype(dtype))
 
 
+@td.skip_array_manager_invalid_test
 def test_astype_numpy_to_ea():
     ser = Series([1, 2, 3])
     with pd.option_context("mode.copy_on_write", True):
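
The two patches above normalize the argument order in astype_is_view before any comparison happens. Below is a minimal, self-contained sketch of that ordering trick; the names is_view_like and _storage_dtype are made up for illustration and this is not the pandas implementation:

import numpy as np
import pandas as pd


def _storage_dtype(dt):
    # NumPy dtypes are their own storage dtype; masked extension dtypes
    # such as pd.Int64Dtype expose theirs via the ``numpy_dtype`` attribute.
    if isinstance(dt, np.dtype):
        return dt
    return getattr(dt, "numpy_dtype", None)


def is_view_like(dtype, new_dtype) -> bool:
    # The ordering trick from PATCH 1: if only one side is a plain NumPy
    # dtype, swap so the extension dtype always ends up in ``dtype``.
    # The checks below only compare storage dtypes, so the swap cannot
    # change the answer; it just keeps the mixed NumPy/extension case on
    # one well-defined code path.
    if isinstance(dtype, np.dtype) and not isinstance(new_dtype, np.dtype):
        dtype, new_dtype = new_dtype, dtype
    if dtype == new_dtype:
        return True
    numpy_dtype = _storage_dtype(dtype)
    new_numpy_dtype = _storage_dtype(new_dtype)
    if numpy_dtype is not None and new_numpy_dtype is not None:
        return numpy_dtype == new_numpy_dtype
    # Not enough information: treat as "not guaranteed to be a copy".
    return True


# Both argument orders give the same answer once normalized.
print(is_view_like(np.dtype("int64"), pd.Int64Dtype()))  # True
print(is_view_like(pd.Int64Dtype(), np.dtype("int64")))  # True
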
From b8eff47b985e9c7d7c2af9ac794890bc84a80e54 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Wed, 15 Mar 2023 02:10:10 +0100
Subject: [PATCH 3/4] BUG: read_csv for arrow with mismatching dtypes does not work

---
 pandas/io/parsers/c_parser_wrapper.py    | 37 ++++---------
 .../io/parser/test_concatenate_chunks.py | 36 ++++++++++++++++++
 2 files changed, 43 insertions(+), 30 deletions(-)
 create mode 100644 pandas/tests/io/parser/test_concatenate_chunks.py

diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index 4b8bc5c402157..2b24e4e873a92 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -23,8 +23,10 @@
     is_categorical_dtype,
     pandas_dtype,
 )
-from pandas.core.dtypes.concat import union_categoricals
-from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.concat import (
+    concat_compat,
+    union_categoricals,
+)
 
 from pandas.core.indexes.api import ensure_index_from_sequences
 
@@ -379,40 +381,15 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
         arrs = [chunk.pop(name) for chunk in chunks]
         # Check each arr for consistent types.
         dtypes = {a.dtype for a in arrs}
-        # TODO: shouldn't we exclude all EA dtypes here?
         numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
-        if len(numpy_dtypes) > 1:
-            # error: Argument 1 to "find_common_type" has incompatible type
-            # "Set[Any]"; expected "Sequence[Union[dtype[Any], None, type,
-            # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
-            # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]]"
-            common_type = np.find_common_type(
-                numpy_dtypes,  # type: ignore[arg-type]
-                [],
-            )
-            if common_type == np.dtype(object):
-                warning_columns.append(str(name))
 
         dtype = dtypes.pop()
         if is_categorical_dtype(dtype):
             result[name] = union_categoricals(arrs, sort_categories=False)
-        elif isinstance(dtype, ExtensionDtype):
-            # TODO: concat_compat?
-            array_type = dtype.construct_array_type()
-            # error: Argument 1 to "_concat_same_type" of "ExtensionArray"
-            # has incompatible type "List[Union[ExtensionArray, ndarray]]";
-            # expected "Sequence[ExtensionArray]"
-            result[name] = array_type._concat_same_type(arrs)  # type: ignore[arg-type]
         else:
-            # error: Argument 1 to "concatenate" has incompatible
-            # type "List[Union[ExtensionArray, ndarray[Any, Any]]]"
-            # ; expected "Union[_SupportsArray[dtype[Any]],
-            # Sequence[_SupportsArray[dtype[Any]]],
-            # Sequence[Sequence[_SupportsArray[dtype[Any]]]],
-            # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]
-            # , Sequence[Sequence[Sequence[Sequence[
-            # _SupportsArray[dtype[Any]]]]]]]"
-            result[name] = np.concatenate(arrs)  # type: ignore[arg-type]
+            result[name] = concat_compat(arrs)
+            if len(numpy_dtypes) > 1 and result[name].dtype == np.dtype(object):
+                warning_columns.append(str(name))
 
     if warning_columns:
         warning_names = ",".join(warning_columns)
diff --git a/pandas/tests/io/parser/test_concatenate_chunks.py b/pandas/tests/io/parser/test_concatenate_chunks.py
new file mode 100644
index 0000000000000..1bae2317a2fc6
--- /dev/null
+++ b/pandas/tests/io/parser/test_concatenate_chunks.py
@@ -0,0 +1,36 @@
+import numpy as np
+import pytest
+
+from pandas.errors import DtypeWarning
+
+import pandas._testing as tm
+from pandas.core.arrays import ArrowExtensionArray
+
+from pandas.io.parsers.c_parser_wrapper import _concatenate_chunks
+
+
+def test_concatenate_chunks_pyarrow():
+    # GH#51876
+    pa = pytest.importorskip("pyarrow")
+    chunks = [
+        {0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
+        {0: ArrowExtensionArray(pa.array([1, 2]))},
+    ]
+    result = _concatenate_chunks(chunks)
+    expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0]))
+    tm.assert_extension_array_equal(result[0], expected)
+
+
+def test_concatenate_chunks_pyarrow_strings():
+    # GH#51876
+    pa = pytest.importorskip("pyarrow")
+    chunks = [
+        {0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
+        {0: ArrowExtensionArray(pa.array(["a", "b"]))},
+    ]
+    with tm.assert_produces_warning(DtypeWarning, match="have mixed types"):
+        result = _concatenate_chunks(chunks)
+    expected = np.concatenate(
+        [np.array([1.5, 2.5], dtype=object), np.array(["a", "b"])]
+    )
+    tm.assert_numpy_array_equal(result[0], expected)
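
The patch above funnels every non-categorical column through concat_compat instead of the dtype-specific branches; the removed branch called ExtensionArray._concat_same_type, which by contract expects every chunk to share one dtype. A rough usage sketch of the mismatching-dtype case from the new test follows; it relies on the private helper pandas.core.dtypes.concat.concat_compat and on pyarrow being installed, so details may shift between pandas versions:

import pyarrow as pa

from pandas.core.arrays import ArrowExtensionArray
from pandas.core.dtypes.concat import concat_compat

# Two CSV chunks whose parsed Arrow types disagree: double vs int64.
chunk_a = ArrowExtensionArray(pa.array([1.5, 2.5]))
chunk_b = ArrowExtensionArray(pa.array([1, 2]))

# concat_compat negotiates a common dtype across the chunks instead of
# assuming they already match.
result = concat_compat([chunk_a, chunk_b])
print(result.dtype)   # double[pyarrow]
print(list(result))   # [1.5, 2.5, 1.0, 2.0]
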
From e4d787393986d17b9665ac8883b283bb72f02f77 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Wed, 15 Mar 2023 02:11:09 +0100
Subject: [PATCH 4/4] Revert "BUG: read_csv for arrow with mismatching dtypes does not work"

This reverts commit b8eff47b985e9c7d7c2af9ac794890bc84a80e54.
---
 pandas/io/parsers/c_parser_wrapper.py    | 37 +++++++++++++----
 .../io/parser/test_concatenate_chunks.py | 36 ------------------
 2 files changed, 30 insertions(+), 43 deletions(-)
 delete mode 100644 pandas/tests/io/parser/test_concatenate_chunks.py

diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index 2b24e4e873a92..4b8bc5c402157 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -23,10 +23,8 @@
     is_categorical_dtype,
     pandas_dtype,
 )
-from pandas.core.dtypes.concat import (
-    concat_compat,
-    union_categoricals,
-)
+from pandas.core.dtypes.concat import union_categoricals
+from pandas.core.dtypes.dtypes import ExtensionDtype
 
 from pandas.core.indexes.api import ensure_index_from_sequences
 
@@ -381,15 +379,40 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
         arrs = [chunk.pop(name) for chunk in chunks]
         # Check each arr for consistent types.
         dtypes = {a.dtype for a in arrs}
+        # TODO: shouldn't we exclude all EA dtypes here?
         numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
+        if len(numpy_dtypes) > 1:
+            # error: Argument 1 to "find_common_type" has incompatible type
+            # "Set[Any]"; expected "Sequence[Union[dtype[Any], None, type,
+            # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
+            # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]]"
+            common_type = np.find_common_type(
+                numpy_dtypes,  # type: ignore[arg-type]
+                [],
+            )
+            if common_type == np.dtype(object):
+                warning_columns.append(str(name))
 
         dtype = dtypes.pop()
         if is_categorical_dtype(dtype):
             result[name] = union_categoricals(arrs, sort_categories=False)
+        elif isinstance(dtype, ExtensionDtype):
+            # TODO: concat_compat?
+            array_type = dtype.construct_array_type()
+            # error: Argument 1 to "_concat_same_type" of "ExtensionArray"
+            # has incompatible type "List[Union[ExtensionArray, ndarray]]";
+            # expected "Sequence[ExtensionArray]"
+            result[name] = array_type._concat_same_type(arrs)  # type: ignore[arg-type]
         else:
-            result[name] = concat_compat(arrs)
-            if len(numpy_dtypes) > 1 and result[name].dtype == np.dtype(object):
-                warning_columns.append(str(name))
+            # error: Argument 1 to "concatenate" has incompatible
+            # type "List[Union[ExtensionArray, ndarray[Any, Any]]]"
+            # ; expected "Union[_SupportsArray[dtype[Any]],
+            # Sequence[_SupportsArray[dtype[Any]]],
+            # Sequence[Sequence[_SupportsArray[dtype[Any]]]],
+            # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]
+            # , Sequence[Sequence[Sequence[Sequence[
+            # _SupportsArray[dtype[Any]]]]]]]"
+            result[name] = np.concatenate(arrs)  # type: ignore[arg-type]
 
     if warning_columns:
         warning_names = ",".join(warning_columns)
diff --git a/pandas/tests/io/parser/test_concatenate_chunks.py b/pandas/tests/io/parser/test_concatenate_chunks.py
deleted file mode 100644
index 1bae2317a2fc6..0000000000000
--- a/pandas/tests/io/parser/test_concatenate_chunks.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import numpy as np
-import pytest
-
-from pandas.errors import DtypeWarning
-
-import pandas._testing as tm
-from pandas.core.arrays import ArrowExtensionArray
-
-from pandas.io.parsers.c_parser_wrapper import _concatenate_chunks
-
-
-def test_concatenate_chunks_pyarrow():
-    # GH#51876
-    pa = pytest.importorskip("pyarrow")
-    chunks = [
-        {0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
-        {0: ArrowExtensionArray(pa.array([1, 2]))},
-    ]
-    result = _concatenate_chunks(chunks)
-    expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0]))
-    tm.assert_extension_array_equal(result[0], expected)
-
-
-def test_concatenate_chunks_pyarrow_strings():
-    # GH#51876
-    pa = pytest.importorskip("pyarrow")
-    chunks = [
-        {0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
-        {0: ArrowExtensionArray(pa.array(["a", "b"]))},
-    ]
-    with tm.assert_produces_warning(DtypeWarning, match="have mixed types"):
-        result = _concatenate_chunks(chunks)
-    expected = np.concatenate(
-        [np.array([1.5, 2.5], dtype=object), np.array(["a", "b"])]
-    )
-    tm.assert_numpy_array_equal(result[0], expected)
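
Because the final patch reverts the concat_compat change, the net effect of the series is the astype_is_view fix from the first two patches. Below is a rough end-to-end check of the behaviour that fix protects under Copy-on-Write; it peeks at the private IntegerArray._data buffer purely for illustration, so treat the exact attribute as an assumption rather than a stable API:

import numpy as np
import pandas as pd

with pd.option_context("mode.copy_on_write", True):
    ser = pd.Series([1, 2, 3])
    result = ser.astype("Int64")
    # With Copy-on-Write, casting int64 -> nullable Int64 can keep pointing
    # at the original buffer; the new test in PATCH 1 asserts this kind of
    # sharing. ``_data`` is private and may change between versions.
    print(np.shares_memory(ser.to_numpy(), result.array._data))  # expected: True
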