Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 9 additions & 30 deletions python/pyspark/pandas/tests/computation/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,6 @@ def df_pair(self):
psdf = ps.from_pandas(pdf)
return pdf, psdf

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43556): Enable DataFrameSlowTests.test_describe for pandas 2.0.0.",
)
def test_describe(self):
pdf, psdf = self.df_pair

Expand Down Expand Up @@ -78,19 +74,10 @@ def test_describe(self):
}
)
pdf = psdf._to_pandas()
# NOTE: Set `datetime_is_numeric=True` for pandas:
# FutureWarning: Treating datetime data as categorical rather than numeric in
# `.describe` is deprecated and will be removed in a future version of pandas.
# Specify `datetime_is_numeric=True` to silence this
# warning and adopt the future behavior now.
# NOTE: Compare the result except percentiles, since we use approximate percentile
# so the result is different from pandas.
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
self.assert_eq(
psdf.describe().loc[["count", "mean", "min", "max"]],
pdf.describe(datetime_is_numeric=True)
.astype(str)
.loc[["count", "mean", "min", "max"]],
pdf.describe().astype(str).loc[["count", "mean", "min", "max"]],
)
else:
self.assert_eq(
Expand Down Expand Up @@ -136,17 +123,13 @@ def test_describe(self):
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
self.assert_eq(
psdf.describe().loc[["count", "mean", "min", "max"]],
pdf.describe(datetime_is_numeric=True)
.astype(str)
.loc[["count", "mean", "min", "max"]],
pdf.describe().astype(str).loc[["count", "mean", "min", "max"]],
)
psdf.A += psdf.A
pdf.A += pdf.A
self.assert_eq(
psdf.describe().loc[["count", "mean", "min", "max"]],
pdf.describe(datetime_is_numeric=True)
.astype(str)
.loc[["count", "mean", "min", "max"]],
pdf.describe().astype(str).loc[["count", "mean", "min", "max"]],
)
else:
expected_result = ps.DataFrame(
Expand Down Expand Up @@ -187,15 +170,15 @@ def test_describe(self):
)
pdf = psdf._to_pandas()
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
pandas_result = pdf.describe(datetime_is_numeric=True)
pandas_result = pdf.describe()
pandas_result.B = pandas_result.B.astype(str)
self.assert_eq(
psdf.describe().loc[["count", "mean", "min", "max"]],
pandas_result.loc[["count", "mean", "min", "max"]],
)
psdf.A += psdf.A
pdf.A += pdf.A
pandas_result = pdf.describe(datetime_is_numeric=True)
pandas_result = pdf.describe()
pandas_result.B = pandas_result.B.astype(str)
self.assert_eq(
psdf.describe().loc[["count", "mean", "min", "max"]],
Expand Down Expand Up @@ -252,7 +235,7 @@ def test_describe(self):
)
pdf = psdf._to_pandas()
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
pandas_result = pdf.describe(datetime_is_numeric=True)
pandas_result = pdf.describe()
pandas_result.b = pandas_result.b.astype(str)
self.assert_eq(
psdf.describe().loc[["count", "mean", "min", "max"]],
Expand Down Expand Up @@ -288,10 +271,6 @@ def test_describe(self):
with self.assertRaisesRegex(ValueError, msg):
psdf.describe()

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43556): Enable DataFrameSlowTests.test_describe for pandas 2.0.0.",
)
def test_describe_empty(self):
# Empty DataFrame
psdf = ps.DataFrame(columns=["A", "B"])
Expand Down Expand Up @@ -328,7 +307,7 @@ def test_describe_empty(self):
# For timestamp type, we should convert NaT to None in pandas result
# since pandas API on Spark doesn't support the NaT for object type.
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True)
pdf_result = pdf[pdf.a != pdf.a].describe()
self.assert_eq(
psdf[psdf.a != psdf.a].describe(),
pdf_result.where(pdf_result.notnull(), None).astype(str),
Expand Down Expand Up @@ -367,7 +346,7 @@ def test_describe_empty(self):
)
pdf = psdf._to_pandas()
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True)
pdf_result = pdf[pdf.a != pdf.a].describe()
pdf_result.b = pdf_result.b.where(pdf_result.b.notnull(), None).astype(str)
self.assert_eq(
psdf[psdf.a != psdf.a].describe(),
Expand Down Expand Up @@ -417,7 +396,7 @@ def test_describe_empty(self):
)
pdf = psdf._to_pandas()
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True)
pdf_result = pdf[pdf.a != pdf.a].describe()
self.assert_eq(
psdf[psdf.a != psdf.a].describe(),
pdf_result.where(pdf_result.notnull(), None).astype(str),
Expand Down