From 8d92b110535225807e1427ffbaa50d58c79059e3 Mon Sep 17 00:00:00 2001 From: bzhaoop Date: Wed, 20 Jul 2022 11:32:56 +0800 Subject: [PATCH] [SPARK-39821][PYTHON] Fix error when using DatetimeIndex pandas disallows conversion between datetime and timedelta dtypes, as well as conversion of any datetime-like dtype to float. This raises an error in PySpark when we simply display a DatetimeIndex, so we need to avoid calling astype with datetime64. Note that pandas-on-Spark states that it does not support DatetimeTZDtype, so let's skip only the datetime64 type in the base __repr__ function of Index. --- python/pyspark/pandas/tests/indexes/test_datetime.py | 5 +++++ python/pyspark/sql/pandas/conversion.py | 7 +++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/python/pyspark/pandas/tests/indexes/test_datetime.py b/python/pyspark/pandas/tests/indexes/test_datetime.py index 85a2b21901774..053e7f341222a 100644 --- a/python/pyspark/pandas/tests/indexes/test_datetime.py +++ b/python/pyspark/pandas/tests/indexes/test_datetime.py @@ -240,6 +240,11 @@ def test_map(self): mapper_pser = pd.Series([1, 2, 3], index=pidx) self.assert_eq(psidx.map(mapper_pser), pidx.map(mapper_pser)) + def test_repr(self): + pidx_repr = pd.DatetimeIndex(['1970-01-01', '1970-02-01', '1970-03-01']).__repr__() + psidx_repr = ps.DatetimeIndex(['1970-01-01', '1970-02-01', '1970-03-01']).__repr__() + self.assert_eq(pidx_repr, psidx_repr) + if __name__ == "__main__": import unittest diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py index 119a9bf315caa..ce3c7886bd2fc 100644 --- a/python/pyspark/sql/pandas/conversion.py +++ b/python/pyspark/sql/pandas/conversion.py @@ -87,7 +87,7 @@ def toPandas(self) -> "PandasDataFrameLike": import numpy as np import pandas as pd - from pandas.core.dtypes.common import is_timedelta64_dtype + from pandas.core.dtypes.common import is_timedelta64_dtype, is_datetime64_dtype jconf = self.sparkSession._jconf timezone = jconf.sessionLocalTimeZone() @@ -244,7 
+244,10 @@ def toPandas(self) -> "PandasDataFrameLike": # No need to cast for non-empty series for timedelta. The type is already correct. should_check_timedelta = is_timedelta64_dtype(t) and len(pdf) == 0 - if (t is not None and not is_timedelta64_dtype(t)) or should_check_timedelta: + # Skip astype when t is a timedelta64 or a datetime64 dtype: pandas + # disallows some datetime-like conversions, so neither may be cast here. + skip_cast = is_timedelta64_dtype(t) or is_datetime64_dtype(t) + if (t is not None and not skip_cast) or should_check_timedelta: series = series.astype(t, copy=False) with catch_warnings():