From 58284dd7db64d3a045929fa865e9e1104718d07f Mon Sep 17 00:00:00 2001 From: arthurlw Date: Wed, 6 Aug 2025 10:32:21 +0700 Subject: [PATCH 01/10] Added condition for CategoricalDtype --- pandas/core/arrays/base.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index bfa2309bb023a..178eaba1d447b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -757,6 +757,8 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: >>> arr2.dtype dtype('float64') """ + from pandas.api.types import CategoricalDtype + dtype = pandas_dtype(dtype) if dtype == self.dtype: if not copy: @@ -764,6 +766,11 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: else: return self.copy() + if isinstance(dtype, CategoricalDtype): + from pandas.core.arrays import Categorical + + return Categorical(self.to_numpy(), dtype=dtype) + if isinstance(dtype, ExtensionDtype): cls = dtype.construct_array_type() return cls._from_sequence(self, dtype=dtype, copy=copy) From 1896199d1a3c6ba3c35d58c4398b8098742b5683 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Wed, 6 Aug 2025 10:51:53 +0700 Subject: [PATCH 02/10] Added tests --- pandas/tests/arrays/categorical/test_astype.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index 7ed4da69f5a99..3c662616e3c89 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -7,14 +7,17 @@ CategoricalDtype, CategoricalIndex, DatetimeIndex, + Index, Interval, NaT, Period, Timestamp, array, + isna, to_datetime, ) import pandas._testing as tm +from pandas.core.arrays.arrow.array import ArrowExtensionArray class TestAstype: @@ -160,3 +163,18 @@ def test_astype_category_readonly_mask_values(self): result = arr.astype("category") expected = array([0, 1, 2], dtype="Int64").astype("category") tm.assert_extension_array_equal(result, expected) + + def test_arrow_array_astype_to_categorical_dtype_temporal(self): + arr = array( + ["2017-01-01", "2018-01-01", "2019-01-01"], dtype="date32[day][pyarrow]" + ) + cats = Index(["2017-01-01", "2018-01-01", "2019-01-01"], dtype="M8[s]") + dtype = CategoricalDtype(categories=cats, ordered=False) + + assert not all(isna(arr.astype(dtype))) + + arr = ArrowExtensionArray._from_sequence(["1h", "2h", "3h"]) + cats = Index(["1h", "2h", "3h"], dtype="m8[ns]") + dtype = CategoricalDtype(cats, ordered=False) + + assert not all(isna(arr.astype(dtype))) From 36b84cfea8226ed06ad1ca3f23dd65fc02992163 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Wed, 6 Aug 2025 10:53:47 +0700 Subject: [PATCH 03/10] whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ec5027840dfd5..392c48354256b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -693,7 +693,7 @@ Categorical - Bug in :meth:`Categorical.astype` where ``copy=False`` would still trigger a copy of the codes (:issue:`62000`) - Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`) - Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty :class:`CategoricalDtype` :class:`Series` raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`) -- +- Bug in :meth:`array.astype` where casting a pyarrow-backed array to a temporal :class:`CategoricalDtype` (e.g. with datetime or timedelta categories) raised or incorrectly converted values to all ``NaT`` (:issue:`62051`) Datetimelike ^^^^^^^^^^^^ From 2dc06987ccf142ccbb39079cf05fa3305a6d96c7 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Wed, 6 Aug 2025 10:54:37 +0700 Subject: [PATCH 04/10] Added GH ref --- pandas/tests/arrays/categorical/test_astype.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index 3c662616e3c89..12337afba58ea 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -165,6 +165,7 @@ def test_astype_category_readonly_mask_values(self): tm.assert_extension_array_equal(result, expected) def test_arrow_array_astype_to_categorical_dtype_temporal(self): + # GH#62051 arr = array( ["2017-01-01", "2018-01-01", "2019-01-01"], dtype="date32[day][pyarrow]" ) From 5987952f7d0e74f3555ee6c95da8cca95902b4a6 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Wed, 6 Aug 2025 11:26:16 +0700 Subject: [PATCH 05/10] Updated conditions --- pandas/core/arrays/base.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 178eaba1d447b..95f8205e8badc 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -757,7 +757,11 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: >>> arr2.dtype dtype('float64') """ - from pandas.api.types import CategoricalDtype + from pandas.api.types import ( + CategoricalDtype, + is_datetime64_any_dtype, + ) + from pandas.core.arrays import Categorical dtype = pandas_dtype(dtype) if dtype == self.dtype: @@ -766,9 +770,11 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: else: return self.copy() - if isinstance(dtype, CategoricalDtype): - from pandas.core.arrays import Categorical - + if ( + isinstance(self, Categorical) + and isinstance(dtype, CategoricalDtype) + and is_datetime64_any_dtype(self.categories) + ): return Categorical(self.to_numpy(), dtype=dtype) if isinstance(dtype, ExtensionDtype): From a5fab10c7b96058621d5cca9468b87d9e0717ccf Mon Sep 17 00:00:00 2001 From: arthurlw Date: Wed, 6 Aug 2025 11:59:59 +0700 Subject: [PATCH 06/10] Updated fix logic --- pandas/core/arrays/base.py | 13 ------------- pandas/core/arrays/categorical.py | 13 +++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 95f8205e8badc..bfa2309bb023a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -757,12 +757,6 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: >>> arr2.dtype dtype('float64') """ - from pandas.api.types import ( - CategoricalDtype, - is_datetime64_any_dtype, - ) - from pandas.core.arrays import Categorical - dtype = pandas_dtype(dtype) if dtype == self.dtype: if not copy: @@ -770,13 +764,6 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: else: return self.copy() - if ( - isinstance(self, Categorical) - and isinstance(dtype, CategoricalDtype) - and is_datetime64_any_dtype(self.categories) - ): - return Categorical(self.to_numpy(), dtype=dtype) - if isinstance(dtype, ExtensionDtype): cls = dtype.construct_array_type() return cls._from_sequence(self, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f2a401bd3687a..347c7a6b76d8b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -73,6 +73,7 @@ NDArrayBackedExtensionArray, ravel_compat, ) +from pandas.core.arrays.arrow.array import ArrowExtensionArray from pandas.core.base import ( ExtensionArray, NoNewAttributesMixin, @@ -483,6 +484,18 @@ def __init__( ) else: + if isinstance(values, ArrowExtensionArray): + from pandas.api.types import ( + is_datetime64_any_dtype, + is_timedelta64_dtype, + ) + + cat_dtype = dtype.categories.dtype + if is_datetime64_any_dtype(cat_dtype) or is_timedelta64_dtype( + cat_dtype + ): + values = values.to_numpy() + codes = _get_codes_for_values(values, dtype.categories) if null_mask.any(): From e6b7c64ce29f10233ebf8783405a2b6d527dd793 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Wed, 6 Aug 2025 12:26:49 +0700 Subject: [PATCH 07/10] importorskip --- pandas/tests/arrays/categorical/test_astype.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index 12337afba58ea..4b9906f020c4b 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -166,6 +166,7 @@ def test_astype_category_readonly_mask_values(self): def test_arrow_array_astype_to_categorical_dtype_temporal(self): # GH#62051 + pytest.importorskip("pyarrow") arr = array( ["2017-01-01", "2018-01-01", "2019-01-01"], dtype="date32[day][pyarrow]" ) From 95e331cc631cca296cfb5fd94cd0241ac4617e29 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Tue, 12 Aug 2025 14:05:15 +0700 Subject: [PATCH 08/10] Update fix logic --- pandas/core/arrays/categorical.py | 13 ------------- pandas/core/indexes/base.py | 20 ++++++++++++++++++++ pandas/core/indexes/datetimes.py | 12 ++++++++++++ 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 347c7a6b76d8b..f2a401bd3687a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -73,7 +73,6 @@ NDArrayBackedExtensionArray, ravel_compat, ) -from pandas.core.arrays.arrow.array import ArrowExtensionArray from pandas.core.base import ( ExtensionArray, NoNewAttributesMixin, @@ -484,18 +483,6 @@ def __init__( ) else: - if isinstance(values, ArrowExtensionArray): - from pandas.api.types import ( - is_datetime64_any_dtype, - is_timedelta64_dtype, - ) - - cat_dtype = dtype.categories.dtype - if is_datetime64_any_dtype(cat_dtype) or is_timedelta64_dtype( - cat_dtype - ): - values = values.to_numpy() - codes = _get_codes_for_values(values, dtype.categories) if null_mask.any(): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e8c5a03a6de50..2f01065a8c180 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3674,6 +3674,16 @@ def get_indexer( orig_target = target target = self._maybe_cast_listlike_indexer(target) + from pandas.api.types import is_timedelta64_dtype + if ( + self.dtype == "string[pyarrow]" and is_timedelta64_dtype(target.dtype) + ) or ( + target.dtype == "string[pyarrow]" and is_timedelta64_dtype(self.dtype) + ): + from pandas.core.arrays.timedeltas import sequence_to_td64ns + data, freq = sequence_to_td64ns(target, copy=False, unit=None) + target = type(target)(data) + self._check_indexing_method(method, limit, tolerance) if not self._index_as_unique: @@ -6273,6 +6283,16 @@ def _find_common_type_compat(self, target) -> DtypeObj: ): return _dtype_obj + # from pandas.api.types import is_timedelta64_dtype + # from pandas.core.arrays.timedeltas import sequence_to_td64ns + + # if ( + # self.dtype == "string[pyarrow]" and is_timedelta64_dtype(target_dtype) + # ) or ( + # target_dtype == "string[pyarrow]" and is_timedelta64_dtype(self.dtype) + # ): + # return np.dtype("m8[ns]") + dtype = find_result_type(self.dtype, target) dtype = common_dtype_categorical_compat([self, target], dtype) return dtype diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 9adbaadbdcdc8..d88fb536d868b 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -384,6 +384,18 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: if self.tz is not None: # If we have tz, we can compare to tzaware return isinstance(dtype, DatetimeTZDtype) + + from pandas import ArrowDtype + + if isinstance(dtype, ArrowDtype): + import pyarrow as pa + + return ( + pa.types.is_date32(dtype.pyarrow_dtype) + or pa.types.is_date64(dtype.pyarrow_dtype) + or pa.types.is_timestamp(dtype.pyarrow_dtype) + ) + # if we dont have tz, we can only compare to tznaive return lib.is_np_dtype(dtype, "M") From 34534872200b4d46122b736e4a62cea522c3ac07 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Tue, 12 Aug 2025 14:08:06 +0700 Subject: [PATCH 09/10] precommit and removed unnecessary comments --- pandas/core/indexes/base.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2f01065a8c180..004b0e9d32df3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3675,12 +3675,12 @@ def get_indexer( target = self._maybe_cast_listlike_indexer(target) from pandas.api.types import is_timedelta64_dtype - if ( - self.dtype == "string[pyarrow]" and is_timedelta64_dtype(target.dtype) - ) or ( + + if (self.dtype == "string[pyarrow]" and is_timedelta64_dtype(target.dtype)) or ( target.dtype == "string[pyarrow]" and is_timedelta64_dtype(self.dtype) ): from pandas.core.arrays.timedeltas import sequence_to_td64ns + data, freq = sequence_to_td64ns(target, copy=False, unit=None) target = type(target)(data) @@ -6283,16 +6283,6 @@ def _find_common_type_compat(self, target) -> DtypeObj: ): return _dtype_obj - # from pandas.api.types import is_timedelta64_dtype - # from pandas.core.arrays.timedeltas import sequence_to_td64ns - - # if ( - # self.dtype == "string[pyarrow]" and is_timedelta64_dtype(target_dtype) - # ) or ( - # target_dtype == "string[pyarrow]" and is_timedelta64_dtype(self.dtype) - # ): - # return np.dtype("m8[ns]") - dtype = find_result_type(self.dtype, target) dtype = common_dtype_categorical_compat([self, target], dtype) return dtype From d74714154f70cff64f15bbd96804e59b24bc00f1 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Tue, 12 Aug 2025 14:16:04 +0700 Subject: [PATCH 10/10] Update condition --- pandas/core/indexes/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 004b0e9d32df3..581181d510c81 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3676,9 +3676,7 @@ def get_indexer( from pandas.api.types import is_timedelta64_dtype - if (self.dtype == "string[pyarrow]" and is_timedelta64_dtype(target.dtype)) or ( - target.dtype == "string[pyarrow]" and is_timedelta64_dtype(self.dtype) - ): + if target.dtype == "string[pyarrow]" and is_timedelta64_dtype(self.dtype): from pandas.core.arrays.timedeltas import sequence_to_td64ns data, freq = sequence_to_td64ns(target, copy=False, unit=None)