-
Notifications
You must be signed in to change notification settings - Fork 28.6k
[SPARK-46858][PYTHON][PS][BUILD] Upgrade Pandas to 2.2.0 #44881
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9ae857a
5caa678
e9a6445
edb3d9a
5440381
3e66505
8643ebd
a8237b4
836dcfe
d3c5f57
66f69a2
9d4e8a1
37300e8
ea57fdb
a3f3e91
8a24900
b727550
e92082f
5f62fcc
f235780
ad67735
4e6c77a
26b7bd6
4c84b2a
fbbaf88
0ca4aa6
7536263
b07e608
acd7b7f
d560825
6de7931
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2554,7 +2554,10 @@ def resolve_func(psdf, this_column_labels, that_column_labels): | |
if isinstance(obj, Series): | ||
num_series += 1 | ||
series_names.add(obj.name) | ||
new_objs.append(obj.to_frame(DEFAULT_SERIES_NAME)) | ||
if not ignore_index and not should_return_series: | ||
new_objs.append(obj.to_frame()) | ||
else: | ||
new_objs.append(obj.to_frame(DEFAULT_SERIES_NAME)) | ||
Comment on lines
-2557
to
+2560
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Related to pandas-dev/pandas#15047 |
||
else: | ||
assert isinstance(obj, DataFrame) | ||
new_objs.append(obj) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,11 +15,14 @@ | |
# limitations under the License. | ||
# | ||
|
||
from typing import final | ||
|
||
from pyspark.loose_version import LooseVersion | ||
|
||
import matplotlib as mat | ||
import numpy as np | ||
from matplotlib.axes._base import _process_plot_format # type: ignore[attr-defined] | ||
from matplotlib.figure import Figure | ||
from pandas.core.dtypes.inference import is_list_like | ||
from pandas.io.formats.printing import pprint_thing | ||
from pandas.plotting._matplotlib import ( # type: ignore[attr-defined] | ||
|
@@ -44,10 +47,29 @@ | |
unsupported_function, | ||
KdePlotBase, | ||
) | ||
from pyspark.pandas.series import Series, first_series | ||
|
||
_all_kinds = PlotAccessor._all_kinds # type: ignore[attr-defined] | ||
|
||
|
||
def _set_ticklabels(ax, labels, is_vertical, **kwargs) -> None: | ||
"""Set the tick labels of a given axis. | ||
|
||
Due to https://github.com/matplotlib/matplotlib/pull/17266, we need to handle the | ||
case of repeated ticks (due to `FixedLocator`) and thus we duplicate the number of | ||
labels. | ||
""" | ||
ticks = ax.get_xticks() if is_vertical else ax.get_yticks() | ||
if len(ticks) != len(labels): | ||
i, remainder = divmod(len(ticks), len(labels)) | ||
assert remainder == 0, remainder | ||
labels *= i | ||
if is_vertical: | ||
ax.set_xticklabels(labels, **kwargs) | ||
else: | ||
ax.set_yticklabels(labels, **kwargs) | ||
|
||
|
||
class PandasOnSparkBarPlot(PandasBarPlot, TopNPlotBase): | ||
_kind = "bar" | ||
|
||
|
@@ -231,10 +253,23 @@ def _plot(self, ax, bxpstats, column_num=None, return_type="axes", **kwds): | |
else: | ||
return ax, bp | ||
|
||
@final
def _ensure_frame(self, data):
    """Return ``data`` as a frame, converting it first when it is a ``Series``.

    The frame's column name is ``self.label`` when that is set. When both
    ``self.label`` and the series name are ``None`` the column is named ``""``.
    When only ``self.label`` is ``None``, ``to_frame()`` is called without a name.
    Anything that is not a ``Series`` is returned unchanged.
    """
    if not isinstance(data, Series):
        return data

    column_name = self.label
    if column_name is None:
        if data.name is not None:
            # No explicit label: let ``to_frame`` pick the column name itself.
            return data.to_frame()
        column_name = ""
    return data.to_frame(name=column_name)
|
||
def _compute_plot_data(self): | ||
colname = self.data.name | ||
spark_column_name = self.data._internal.spark_column_name_for(self.data._column_label) | ||
data = self.data | ||
data = first_series(data) if not isinstance(data, Series) else data | ||
colname = data.name | ||
spark_column_name = data._internal.spark_column_name_for(data._column_label) | ||
|
||
# Updates all props with the rc defaults from matplotlib | ||
self.kwds.update(PandasOnSparkBoxPlot.rc_defaults(**self.kwds)) | ||
|
@@ -277,7 +312,7 @@ def _compute_plot_data(self): | |
|
||
self.data = {labels[0]: stats} | ||
|
||
def _make_plot(self): | ||
def _make_plot(self, fig: Figure): | ||
bxpstats = list(self.data.values())[0] | ||
ax = self._get_ax(0) | ||
kwds = self.kwds.copy() | ||
|
@@ -303,7 +338,7 @@ def _make_plot(self): | |
labels = [pprint_thing(lbl) for lbl in labels] | ||
if not self.use_index: | ||
labels = [pprint_thing(key) for key in range(len(labels))] | ||
self._set_ticklabels(ax, labels) | ||
_set_ticklabels(ax, labels, self.orientation == "vertical") | ||
|
||
@staticmethod | ||
def rc_defaults( | ||
|
@@ -363,10 +398,32 @@ def _args_adjust(self): | |
if is_list_like(self.bottom): | ||
self.bottom = np.array(self.bottom) | ||
|
||
@final
def _ensure_frame(self, data):
    """Convert a ``Series`` input into a frame; pass other inputs through.

    ``self.label`` is used as the column name when it is not ``None``. If it is
    ``None`` and the series is unnamed as well, the empty string is used. If it
    is ``None`` and the series has a name, ``to_frame()`` is called with no
    arguments.
    """
    if isinstance(data, Series):
        chosen = self.label
        unnamed = chosen is None and data.name is None
        if unnamed:
            chosen = ""
        # Only pass ``name=`` when there is something to pass.
        frame_kwargs = {} if chosen is None else {"name": chosen}
        data = data.to_frame(**frame_kwargs)
    return data
|
||
def _calculate_bins(self, data, bins): | ||
return bins | ||
Comment on lines
+413
to
+414
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Pandas recently merged a couple of commits that refactor its internal plotting structure, such as pandas-dev/pandas#55850 and pandas-dev/pandas#55872, so we should also inherit a couple of internal methods to follow the latest Pandas behavior. |
||
|
||
def _compute_plot_data(self): | ||
self.data, self.bins = HistogramPlotBase.prepare_hist_data(self.data, self.bins) | ||
|
||
def _make_plot(self): | ||
def _make_plot_keywords(self, kwds, y): | ||
"""merge BoxPlot/KdePlot properties to passed kwds""" | ||
# y is required for KdePlot | ||
kwds["bottom"] = self.bottom | ||
kwds["bins"] = self.bins | ||
return kwds | ||
|
||
def _make_plot(self, fig: Figure): | ||
# TODO: this logic is similar to KdePlot. Might have to deduplicate it. | ||
# 'num_colors' requires to calculate `shape` which has to count all. | ||
# Use 1 for now to save the computation. | ||
|
@@ -423,9 +480,9 @@ class PandasOnSparkPiePlot(PandasPiePlot, TopNPlotBase): | |
def __init__(self, data, **kwargs): | ||
super().__init__(self.get_top_n(data), **kwargs) | ||
|
||
def _make_plot(self): | ||
def _make_plot(self, fig: Figure): | ||
self.set_result_text(self._get_ax(0)) | ||
super()._make_plot() | ||
super()._make_plot(fig) | ||
|
||
|
||
class PandasOnSparkAreaPlot(PandasAreaPlot, SampledPlotBase): | ||
|
@@ -434,9 +491,9 @@ class PandasOnSparkAreaPlot(PandasAreaPlot, SampledPlotBase): | |
def __init__(self, data, **kwargs): | ||
super().__init__(self.get_sampled(data), **kwargs) | ||
|
||
def _make_plot(self): | ||
def _make_plot(self, fig: Figure): | ||
self.set_result_text(self._get_ax(0)) | ||
super()._make_plot() | ||
super()._make_plot(fig) | ||
|
||
|
||
class PandasOnSparkLinePlot(PandasLinePlot, SampledPlotBase): | ||
|
@@ -445,9 +502,9 @@ class PandasOnSparkLinePlot(PandasLinePlot, SampledPlotBase): | |
def __init__(self, data, **kwargs): | ||
super().__init__(self.get_sampled(data), **kwargs) | ||
|
||
def _make_plot(self): | ||
def _make_plot(self, fig: Figure): | ||
self.set_result_text(self._get_ax(0)) | ||
super()._make_plot() | ||
super()._make_plot(fig) | ||
|
||
|
||
class PandasOnSparkBarhPlot(PandasBarhPlot, TopNPlotBase): | ||
|
@@ -456,9 +513,9 @@ class PandasOnSparkBarhPlot(PandasBarhPlot, TopNPlotBase): | |
def __init__(self, data, **kwargs): | ||
super().__init__(self.get_top_n(data), **kwargs) | ||
|
||
def _make_plot(self): | ||
def _make_plot(self, fig: Figure): | ||
self.set_result_text(self._get_ax(0)) | ||
super()._make_plot() | ||
super()._make_plot(fig) | ||
|
||
|
||
class PandasOnSparkScatterPlot(PandasScatterPlot, TopNPlotBase): | ||
|
@@ -467,9 +524,9 @@ class PandasOnSparkScatterPlot(PandasScatterPlot, TopNPlotBase): | |
def __init__(self, data, x, y, **kwargs): | ||
super().__init__(self.get_top_n(data), x, y, **kwargs) | ||
|
||
def _make_plot(self): | ||
def _make_plot(self, fig: Figure): | ||
self.set_result_text(self._get_ax(0)) | ||
super()._make_plot() | ||
super()._make_plot(fig) | ||
|
||
|
||
class PandasOnSparkKdePlot(PandasKdePlot, KdePlotBase): | ||
|
@@ -478,7 +535,12 @@ class PandasOnSparkKdePlot(PandasKdePlot, KdePlotBase): | |
def _compute_plot_data(self): | ||
self.data = KdePlotBase.prepare_kde_data(self.data) | ||
|
||
def _make_plot(self): | ||
def _make_plot_keywords(self, kwds, y): | ||
kwds["bw_method"] = self.bw_method | ||
kwds["ind"] = type(self)._get_ind(y, ind=self.ind) | ||
return kwds | ||
|
||
def _make_plot(self, fig: Figure): | ||
# 'num_colors' requires to calculate `shape` which has to count all. | ||
# Use 1 for now to save the computation. | ||
colors = self._get_colors(num_colors=1) | ||
|
@@ -515,8 +577,9 @@ def _make_plot(self): | |
self, "_append_legend_handles_labels" | ||
) else self._add_legend_handle(artists[0], label, index=i) | ||
|
||
def _get_ind(self, y): | ||
return KdePlotBase.get_ind(y, self.ind) | ||
@staticmethod | ||
def _get_ind(y, ind): | ||
return KdePlotBase.get_ind(y, ind) | ||
|
||
@classmethod | ||
def _plot( | ||
|
Uh oh!
There was an error while loading. Please reload this page.