FEAT-#1839: Update pandas dependency and pandas APIs to match #1840

Merged 1 commit on Sep 17, 2020.
2 changes: 2 additions & 0 deletions docs/supported_apis/dataframe_supported.rst
@@ -79,6 +79,8 @@ default to pandas.
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+
 | ``combine_first``          | `combine_first`_          | Y                      |                                                    |
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+
+| ``compare``                | `compare`_                | D                      |                                                    |
++----------------------------+---------------------------+------------------------+----------------------------------------------------+
 | ``copy``                   | `copy`_                   | Y                      |                                                    |
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+
 | ``corr``                   | `corr`_                   | D                      |                                                    |
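DataFrame.compare is new in pandas 1.1.0, which is why it enters this table with a D (defaults to pandas) rather than native support. A quick sketch of the pandas behavior being tracked, using toy data that is not part of this PR:

    import pandas

    a = pandas.DataFrame({"x": [1, 2], "y": [3, 4]})
    b = pandas.DataFrame({"x": [1, 9], "y": [3, 4]})

    # compare() keeps only the cells that differ, split into
    # "self" (left) and "other" (right) columns.
    print(a.compare(b))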
2 changes: 2 additions & 0 deletions docs/supported_apis/series_supported.rst
@@ -90,6 +90,8 @@ the related section on `Defaulting to pandas`_.
 +-----------------------------+---------------------------------+
 | ``combine_first``           | Y                               |
 +-----------------------------+---------------------------------+
+| ``compare``                 | D                               |
++-----------------------------+---------------------------------+
 | ``compress``                | D                               |
 +-----------------------------+---------------------------------+
 | ``copy``                    | Y                               |
2 changes: 1 addition & 1 deletion environment.yml
@@ -2,7 +2,7 @@ name: modin
 channels:
   - conda-forge
 dependencies:
-  - pandas==1.0.5
+  - pandas==1.1.2
   - numpy
   - pyarrow<0.17
   - dask[complete]>=2.12.0,<=2.19.0
11 changes: 7 additions & 4 deletions modin/backends/base/query_compiler.py
@@ -212,13 +212,16 @@ def from_arrow(cls, at, data_cls):
     # END From Arrow

     # To NumPy
-    def to_numpy(self):
-        """Converts Modin DataFrame to NumPy array.
-
-        Returns:
+    def to_numpy(self, **kwargs):
+        """
+        Converts Modin DataFrame to NumPy array.
+
+        Returns
+        -------
             NumPy array of the QueryCompiler.
         """
-        return DataFrameDefault.register(pandas.DataFrame.to_numpy)(self)
+        return DataFrameDefault.register(pandas.DataFrame.to_numpy)(self, **kwargs)

     # END To NumPy
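The new **kwargs parameter lets callers forward keyword arguments such as dtype and na_value (both accepted by pandas.DataFrame.to_numpy as of pandas 1.1) through the query compiler. A minimal sketch of the effect, shown with plain pandas since the default implementation simply delegates to it:

    import numpy as np
    import pandas

    df = pandas.DataFrame({"a": [1, 2], "b": [3.5, None]})

    # These keyword arguments now travel down to the underlying
    # pandas.DataFrame.to_numpy call instead of being dropped.
    arr = df.to_numpy(dtype="float64", na_value=np.nan)
    print(arr.dtype)  # float64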
38 changes: 25 additions & 13 deletions modin/backends/pandas/query_compiler.py
@@ -283,13 +283,15 @@ def free(self):
     # END Data Management Methods

     # To NumPy
-    def to_numpy(self):
-        """Converts Modin DataFrame to NumPy array.
+    def to_numpy(self, **kwargs):
+        """
+        Converts Modin DataFrame to NumPy array.

-        Returns:
+        Returns
+        -------
             NumPy array of the QueryCompiler.
         """
-        arr = self._modin_frame.to_numpy()
+        arr = self._modin_frame.to_numpy(**kwargs)
         ErrorMessage.catch_bugs_and_request_email(
             len(arr) != len(self.index) or len(arr[0]) != len(self.columns)
         )
@@ -632,12 +634,13 @@ def is_monotonic_decreasing(self):
         lambda df, **kwargs: df.apply(
             lambda x: (x.sum(skipna=kwargs.get("skipna", True)), x.count()),
             axis=kwargs.get("axis", 0),
-        ),
+            result_type="reduce",
+        ).set_axis(df.axes[kwargs.get("axis", 0) ^ 1], axis=0),
         lambda df, **kwargs: df.apply(
             lambda x: x.apply(lambda d: d[0]).sum(skipna=kwargs.get("skipna", True))
             / x.apply(lambda d: d[1]).sum(skipna=kwargs.get("skipna", True)),
             axis=kwargs.get("axis", 0),
-        ),
+        ).set_axis(df.axes[kwargs.get("axis", 0) ^ 1], axis=0),
     )

     def value_counts(self, **kwargs):
@@ -1874,6 +1877,7 @@ def melt(
         var_name=None,
         value_name="value",
         col_level=None,
+        ignore_index=True,
     ):
         ErrorMessage.missmatch_with_pandas(
             operation="melt", message="Order of rows could be different from pandas"
@@ -2283,14 +2287,20 @@ def dict_apply_builder(df, func_dict={}):
         )

     def _list_like_func(self, func, axis, *args, **kwargs):
-        """Apply list-like function across given axis.
+        """
+        Apply list-like function across given axis.

-        Args:
-            func: The function to apply.
-            axis: Target axis to apply the function along.
+        Parameters
+        ----------
+        func : list-like
+            The function to apply.
+        axis : 0 or 1 (0 - index, 1 - columns)
+            Target axis to apply the function along.

-        Returns:
-            A new PandasQueryCompiler.
+        Returns
+        -------
+        PandasQueryCompiler
+            A new QueryCompiler.
         """
         # When the function is list-like, the function names become the index/columns
         new_index = (
@@ -2357,7 +2367,9 @@ def _callable_func(self, func, axis, *args, **kwargs):
         lambda df, **kwargs: df.sum(**kwargs), lambda df, **kwargs: df.sum(**kwargs)
     )
     groupby_size = GroupbyReduceFunction.register(
-        lambda df, **kwargs: pandas.DataFrame(df.size()), lambda df, **kwargs: df.sum()
+        lambda df, **kwargs: pandas.DataFrame(df.size()),
+        lambda df, **kwargs: df.sum(),
+        method="size",
     )

     def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False):
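The result_type="reduce" addition pins down the shape DataFrame.apply returns: when the applied function yields list-likes such as tuples, pandas may expand them into a DataFrame, while "reduce" requests a Series. A standalone illustration of the pandas behavior the map phase above relies on:

    import pandas

    df = pandas.DataFrame({"a": [1, 2], "b": [3, 4]})

    # Each column reduces to a (sum, count) pair; result_type="reduce"
    # keeps the output a Series of tuples instead of expanding the
    # tuples into a DataFrame.
    partial = df.apply(
        lambda x: (x.sum(), x.count()), axis=0, result_type="reduce"
    )
    print(partial["a"])  # (3, 2)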
10 changes: 6 additions & 4 deletions modin/backends/pyarrow/query_compiler.py
@@ -190,10 +190,12 @@ def to_pandas(self):
"""
return self._modin_frame.to_pandas()

def to_numpy(self):
"""Converts Modin DataFrame to NumPy array.
def to_numpy(self, **kwargs):
"""
Converts Modin DataFrame to NumPy array.

Returns:
Returns
-------
NumPy array of the QueryCompiler.
"""
return self._modin_frame.to_numpy()
return self._modin_frame.to_numpy(**kwargs)
6 changes: 5 additions & 1 deletion modin/data_management/functions/groupby_function.py
@@ -132,7 +132,11 @@ def compute_reduce(df):
                 ]
                 return new_result
             else:
-                return result.drop(columns=by_part)
+                return (
+                    result.drop(columns=by_part)
+                    if call_kwds.get("method", None) != "size"
+                    else result
+                )
         return result

     try:
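The method == "size" special case appears to exist because size() reduces each group to a single column of counts, so the usual step of dropping the by columns from the reduced result would discard the only remaining data. The shape difference in plain pandas, with toy data:

    import pandas

    df = pandas.DataFrame({"key": ["x", "x", "y"], "val": [1, 2, 3]})

    # sum() keeps the data columns, so by columns mixed into the
    # intermediate result can be dropped safely.
    print(df.groupby("key").sum())

    # size() yields only per-group counts; there is nothing left
    # to drop, hence the branch above returns result unchanged.
    print(df.groupby("key").size())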
56 changes: 31 additions & 25 deletions modin/engines/base/frame/data.py
@@ -15,6 +15,7 @@
 import numpy as np
 import pandas
 from pandas.core.indexes.api import ensure_index, Index, RangeIndex
+from pandas.core.indexes.datetimes import DatetimeIndex
 from pandas.core.dtypes.common import is_numeric_dtype
 from typing import Union
@@ -271,31 +272,34 @@ def _validate_axis_equality(self, axis: int, force: bool = False):

         Parameters
         ----------
-        axis : int,
-            Axis to validate indices along
-        force : bool,
+        axis : 0 or 1
+            Axis to validate indices along (0 - index, 1 - columns).
+        force : boolean, default False
             Whether to update external indices with internal if their lengths
             do not match or raise an exception in that case.
         """
         internal_axis = self._frame_mgr_cls.get_indices(
             axis, self._partitions, lambda df: df.axes[axis]
         )
-        is_equals = self.axes[axis].equals(internal_axis)
-        is_lenghts_matches = len(self.axes[axis]) == len(internal_axis)
+        self_axis = self.axes[axis]
+        is_equals = self_axis.equals(internal_axis)
+        if (
+            isinstance(self_axis, DatetimeIndex)
+            and isinstance(internal_axis, DatetimeIndex)
+            and is_equals
+        ):
+            if getattr(self_axis, "freq") != getattr(internal_axis, "freq"):
+                is_equals = False
+                force = True
+        is_lenghts_matches = len(self_axis) == len(internal_axis)
         if not is_equals:
-            if force:
-                if not is_lenghts_matches:
-                    if axis:
-                        self._column_widths_cache = None
-                    else:
-                        self._row_lengths_cache = None
-                new_axis = self.axes[axis] if is_lenghts_matches else internal_axis
-                self._set_axis(axis, new_axis, cache_only=not is_lenghts_matches)
-            else:
-                self._set_axis(
-                    axis,
-                    self.axes[axis],
-                )
+            if not is_lenghts_matches:
+                if axis:
+                    self._column_widths_cache = None
+                else:
+                    self._row_lengths_cache = None
+            new_axis = self_axis if is_lenghts_matches and not force else internal_axis
+            self._set_axis(axis, new_axis, cache_only=not is_lenghts_matches)

     def _validate_internal_indices(self, mode=None, **kwargs):
         """
@@ -316,7 +320,6 @@ def _validate_internal_indices(self, mode=None, **kwargs):
             Whether to update external indices with internal if their lengths
             do not match or raise an exception in that case.
         """
-
         if isinstance(mode, bool):
             is_force = mode
             mode = "all"
@@ -329,7 +332,7 @@ def _validate_internal_indices(self, mode=None, **kwargs):
"reduced": {
"validate_index": self.index.equals(reduced_sample),
"validate_columns": self.columns.equals(reduced_sample),
"force": True,
"force": False,
},
"all": {
"validate_index": True,
@@ -560,6 +563,7 @@ def mask(
             new_row_lengths,
             new_col_widths,
             new_dtypes,
+            validate_axes="all" if new_partitions.size != 0 else False,
         )
         # Check if monotonically increasing, return if it is. Fast track code path for
         # common case to keep it fast.
@@ -1254,7 +1258,7 @@ def _apply_full_axis(
             None,
             None,
             dtypes,
-            validate_axes="reduced",
+            validate_axes="all" if new_partitions.size != 0 else False,
         )

     def _apply_full_axis_select_indices(
@@ -1818,13 +1822,15 @@ def to_pandas(self):

         return df

-    def to_numpy(self):
-        """Converts Modin DataFrame to a 2D NumPy array.
+    def to_numpy(self, **kwargs):
+        """
+        Converts Modin DataFrame to a 2D NumPy array.

-        Returns:
+        Returns
+        -------
             NumPy array.
         """
-        return self._frame_mgr_cls.to_numpy(self._partitions)
+        return self._frame_mgr_cls.to_numpy(self._partitions, **kwargs)

     def transpose(self):
         """Transpose the index and columns of this dataframe.
2 changes: 1 addition & 1 deletion modin/engines/base/frame/partition.py
@@ -79,7 +79,7 @@ def to_pandas(self):
"""
raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)

def to_numpy(self):
def to_numpy(self, **kwargs):
"""Convert the object stored in this partition to a NumPy array.

Note: If the underlying object is a Pandas DataFrame, this will return
12 changes: 8 additions & 4 deletions modin/engines/base/frame/partition_manager.py
@@ -360,13 +360,17 @@ def to_pandas(cls, partitions):
         return cls.concatenate(df_rows)

     @classmethod
-    def to_numpy(cls, partitions):
-        """Convert this object into a NumPy array from the partitions.
+    def to_numpy(cls, partitions, **kwargs):
+        """
+        Convert this object into a NumPy array from the partitions.

-        Returns:
+        Returns
+        -------
             A NumPy array
         """
-        return np.block([[block.to_numpy() for block in row] for row in partitions])
+        return np.block(
+            [[block.to_numpy(**kwargs) for block in row] for row in partitions]
+        )

     @classmethod
     def from_pandas(cls, df, return_dims=False):
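np.block stitches the per-partition 2D arrays back into one array while preserving the row/column layout of the partition grid. A toy sketch of the same call pattern with hand-built arrays:

    import numpy as np

    # A 2x2 grid of partition arrays, analogous to
    # [[block.to_numpy() for block in row] for row in partitions].
    parts = [
        [np.ones((2, 2)), np.zeros((2, 1))],
        [np.zeros((1, 2)), np.ones((1, 1))],
    ]
    print(np.block(parts).shape)  # (3, 3)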
7 changes: 7 additions & 0 deletions modin/engines/base/io/io.py
@@ -15,6 +15,7 @@
 from collections import OrderedDict
 from modin.error_message import ErrorMessage
 from modin.backends.base.query_compiler import BaseQueryCompiler
+from typing import Optional


 class BaseIO(object):
@@ -197,6 +198,7 @@ def read_json(
         lines=False,
         chunksize=None,
         compression="infer",
+        nrows: Optional[int] = None,
     ):
         ErrorMessage.default_to_pandas("`read_json`")
         kwargs = {
@@ -214,6 +216,7 @@
"lines": lines,
"chunksize": chunksize,
"compression": compression,
"nrows": nrows,
}
return cls.from_pandas(pandas.read_json(**kwargs))

@@ -234,6 +237,7 @@ def read_gbq(
         private_key=None,
         verbose=None,
         progress_bar_type=None,
+        max_results=None,
     ):
         ErrorMessage.default_to_pandas("`read_gbq`")
         return cls.from_pandas(
@@ -252,6 +256,7 @@
                 private_key=private_key,
                 verbose=verbose,
                 progress_bar_type=progress_bar_type,
+                max_results=max_results,
             )
         )
@@ -327,6 +332,7 @@ def read_excel(
         skipfooter=0,
         convert_float=True,
         mangle_dupe_cols=True,
+        na_filter=True,
         **kwds,
     ):
         if skip_footer != 0:
@@ -357,6 +363,7 @@
             skipfooter=skipfooter,
             convert_float=convert_float,
             mangle_dupe_cols=mangle_dupe_cols,
+            na_filter=na_filter,
             **kwds,
         )
         if isinstance(intermediate, (OrderedDict, dict)):
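These signature additions plumb through parameters from the pandas 1.1 API: nrows for read_json (new in pandas 1.1 and only valid with lines=True), max_results for read_gbq, and na_filter for read_excel. A sketch of the pandas calls being forwarded to; the file paths are placeholders:

    import pandas

    # Read only the first 1000 lines of a line-delimited JSON file.
    df = pandas.read_json("data.jsonl", lines=True, nrows=1000)

    # Skip NA detection entirely, which can speed up reading sheets
    # that contain no NA markers.
    sheet = pandas.read_excel("book.xlsx", na_filter=False)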
10 changes: 6 additions & 4 deletions modin/engines/dask/pandas_on_dask/frame/partition.py
@@ -123,13 +123,15 @@ def to_pandas(self):

         return dataframe

-    def to_numpy(self):
-        """Convert the object stored in this parition to a NumPy array.
+    def to_numpy(self, **kwargs):
+        """
+        Convert the object stored in this parition to a NumPy array.

-        Returns:
+        Returns
+        -------
             A NumPy array.
         """
-        return self.apply(lambda df: df.to_numpy()).get()
+        return self.apply(lambda df, **kwargs: df.to_numpy(**kwargs)).get()

     @classmethod
     def put(cls, obj):