FEAT-#1839: Update pandas dependency and pandas APIs to match #1840

Merged 1 commit on Sep 17, 2020.
2 changes: 2 additions & 0 deletions docs/supported_apis/dataframe_supported.rst
@@ -79,6 +79,8 @@ default to pandas.
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+
 | ``combine_first``          | `combine_first`_          | Y                      |                                                    |
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+
+| ``compare``                | `compare`_                | D                      |                                                    |
++----------------------------+---------------------------+------------------------+----------------------------------------------------+
 | ``copy``                   | `copy`_                   | Y                      |                                                    |
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+
 | ``corr``                   | `corr`_                   | D                      |                                                    |
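DataFrame.compare is new in pandas 1.1.0, which is why it enters this table with a D (defaults to pandas) rather than native support. A quick sketch of the pandas behavior being tracked, using toy data that is not part of this PR:

    import pandas

    a = pandas.DataFrame({"x": [1, 2], "y": [3, 4]})
    b = pandas.DataFrame({"x": [1, 9], "y": [3, 4]})

    # compare() keeps only the cells that differ, split into
    # "self" (left) and "other" (right) columns.
    print(a.compare(b))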
2 changes: 2 additions & 0 deletions docs/supported_apis/series_supported.rst
@@ -90,6 +90,8 @@ the related section on `Defaulting to pandas`_.
 +-----------------------------+---------------------------------+
 | ``combine_first``           | Y                               |
 +-----------------------------+---------------------------------+
+| ``compare``                 | D                               |
++-----------------------------+---------------------------------+
 | ``compress``                | D                               |
 +-----------------------------+---------------------------------+
 | ``copy``                    | Y                               |
2 changes: 1 addition & 1 deletion environment.yml
@@ -2,7 +2,7 @@ name: modin
 channels:
   - conda-forge
 dependencies:
-  - pandas==1.0.5
+  - pandas==1.1.2
   - numpy
   - pyarrow<0.17
   - dask[complete]>=2.12.0,<=2.19.0
11 changes: 7 additions & 4 deletions modin/backends/base/query_compiler.py
@@ -212,13 +212,16 @@ def from_arrow(cls, at, data_cls):
     # END From Arrow

     # To NumPy
-    def to_numpy(self):
-        """Converts Modin DataFrame to NumPy array.
-
-        Returns:
+    def to_numpy(self, **kwargs):
+        """
+        Converts Modin DataFrame to NumPy array.
+
+        Returns
+        -------
             NumPy array of the QueryCompiler.
         """
-        return DataFrameDefault.register(pandas.DataFrame.to_numpy)(self)
+        return DataFrameDefault.register(pandas.DataFrame.to_numpy)(self, **kwargs)

     # END To NumPy
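The new **kwargs parameter lets callers forward keyword arguments such as dtype and na_value (both accepted by pandas.DataFrame.to_numpy as of pandas 1.1) through the query compiler. A minimal sketch of the effect, shown with plain pandas since the default implementation simply delegates to it:

    import numpy as np
    import pandas

    df = pandas.DataFrame({"a": [1, 2], "b": [3.5, None]})

    # These keyword arguments now travel down to the underlying
    # pandas.DataFrame.to_numpy call instead of being dropped.
    arr = df.to_numpy(dtype="float64", na_value=np.nan)
    print(arr.dtype)  # float64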
38 changes: 25 additions & 13 deletions modin/backends/pandas/query_compiler.py
@@ -283,13 +283,15 @@ def free(self):
     # END Data Management Methods

     # To NumPy
-    def to_numpy(self):
-        """Converts Modin DataFrame to NumPy array.
+    def to_numpy(self, **kwargs):
+        """
+        Converts Modin DataFrame to NumPy array.

-        Returns:
+        Returns
+        -------
             NumPy array of the QueryCompiler.
         """
-        arr = self._modin_frame.to_numpy()
+        arr = self._modin_frame.to_numpy(**kwargs)
         ErrorMessage.catch_bugs_and_request_email(
             len(arr) != len(self.index) or len(arr[0]) != len(self.columns)
         )
@@ -632,12 +634,13 @@ def is_monotonic_decreasing(self):
         lambda df, **kwargs: df.apply(
             lambda x: (x.sum(skipna=kwargs.get("skipna", True)), x.count()),
             axis=kwargs.get("axis", 0),
-        ),
+            result_type="reduce",
+        ).set_axis(df.axes[kwargs.get("axis", 0) ^ 1], axis=0),
         lambda df, **kwargs: df.apply(
             lambda x: x.apply(lambda d: d[0]).sum(skipna=kwargs.get("skipna", True))
             / x.apply(lambda d: d[1]).sum(skipna=kwargs.get("skipna", True)),
             axis=kwargs.get("axis", 0),
-        ),
+        ).set_axis(df.axes[kwargs.get("axis", 0) ^ 1], axis=0),
     )

     def value_counts(self, **kwargs):
@@ -1874,6 +1877,7 @@ def melt(
         var_name=None,
         value_name="value",
         col_level=None,
+        ignore_index=True,
     ):
         ErrorMessage.missmatch_with_pandas(
             operation="melt", message="Order of rows could be different from pandas"
@@ -2283,14 +2287,20 @@ def dict_apply_builder(df, func_dict={}):
         )

     def _list_like_func(self, func, axis, *args, **kwargs):
-        """Apply list-like function across given axis.
+        """
+        Apply list-like function across given axis.

-        Args:
-            func: The function to apply.
-            axis: Target axis to apply the function along.
+        Parameters
+        ----------
+        func : list-like
+            The function to apply.
+        axis : 0 or 1 (0 - index, 1 - columns)
+            Target axis to apply the function along.

-        Returns:
-            A new PandasQueryCompiler.
+        Returns
+        -------
+        PandasQueryCompiler
+            A new QueryCompiler.
         """
         # When the function is list-like, the function names become the index/columns
         new_index = (
@@ -2357,7 +2367,9 @@ def _callable_func(self, func, axis, *args, **kwargs):
         lambda df, **kwargs: df.sum(**kwargs), lambda df, **kwargs: df.sum(**kwargs)
     )
     groupby_size = GroupbyReduceFunction.register(
-        lambda df, **kwargs: pandas.DataFrame(df.size()), lambda df, **kwargs: df.sum()
+        lambda df, **kwargs: pandas.DataFrame(df.size()),
+        lambda df, **kwargs: df.sum(),
+        method="size",
     )

     def groupby_dict_agg(self, by, func_dict, groupby_args, agg_args, drop=False):
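The result_type="reduce" addition pins down the shape DataFrame.apply returns: when the applied function yields list-likes such as tuples, pandas may expand them into a DataFrame, while "reduce" requests a Series. A standalone illustration of the pandas behavior the map phase above relies on:

    import pandas

    df = pandas.DataFrame({"a": [1, 2], "b": [3, 4]})

    # Each column reduces to a (sum, count) pair; result_type="reduce"
    # keeps the output a Series of tuples instead of expanding the
    # tuples into a DataFrame.
    partial = df.apply(
        lambda x: (x.sum(), x.count()), axis=0, result_type="reduce"
    )
    print(partial["a"])  # (3, 2)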
10 changes: 6 additions & 4 deletions modin/backends/pyarrow/query_compiler.py
@@ -190,10 +190,12 @@ def to_pandas(self):
"""
return self._modin_frame.to_pandas()

def to_numpy(self):
"""Converts Modin DataFrame to NumPy array.
def to_numpy(self, **kwargs):
"""
Converts Modin DataFrame to NumPy array.

Returns:
Returns
-------
NumPy array of the QueryCompiler.
"""
return self._modin_frame.to_numpy()
return self._modin_frame.to_numpy(**kwargs)
6 changes: 5 additions & 1 deletion modin/data_management/functions/groupby_function.py
@@ -132,7 +132,11 @@ def compute_reduce(df):
                 ]
                 return new_result
             else:
-                return result.drop(columns=by_part)
+                return (
+                    result.drop(columns=by_part)
+                    if call_kwds.get("method", None) != "size"
+                    else result
+                )
         return result

     try:
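The method == "size" special case appears to exist because size() reduces each group to a single column of counts, so the usual step of dropping the by columns from the reduced result would discard the only remaining data. The shape difference in plain pandas, with toy data:

    import pandas

    df = pandas.DataFrame({"key": ["x", "x", "y"], "val": [1, 2, 3]})

    # sum() keeps the data columns, so by columns mixed into the
    # intermediate result can be dropped safely.
    print(df.groupby("key").sum())

    # size() yields only per-group counts; there is nothing left
    # to drop, hence the branch above returns result unchanged.
    print(df.groupby("key").size())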
56 changes: 31 additions & 25 deletions modin/engines/base/frame/data.py
@@ -15,6 +15,7 @@
 import numpy as np
 import pandas
 from pandas.core.indexes.api import ensure_index, Index, RangeIndex
+from pandas.core.indexes.datetimes import DatetimeIndex
 from pandas.core.dtypes.common import is_numeric_dtype
 from typing import Union
@@ -271,31 +272,34 @@ def _validate_axis_equality(self, axis: int, force: bool = False):

         Parameters
         ----------
-        axis : int,
-            Axis to validate indices along
-        force : bool,
+        axis : 0 or 1
+            Axis to validate indices along (0 - index, 1 - columns).
+        force : boolean, default False
             Whether to update external indices with internal if their lengths
             do not match or raise an exception in that case.
         """
         internal_axis = self._frame_mgr_cls.get_indices(
             axis, self._partitions, lambda df: df.axes[axis]
         )
-        is_equals = self.axes[axis].equals(internal_axis)
-        is_lenghts_matches = len(self.axes[axis]) == len(internal_axis)
+        self_axis = self.axes[axis]
+        is_equals = self_axis.equals(internal_axis)
+        if (
+            isinstance(self_axis, DatetimeIndex)
+            and isinstance(internal_axis, DatetimeIndex)
+            and is_equals
+        ):
+            if getattr(self_axis, "freq") != getattr(internal_axis, "freq"):
+                is_equals = False
+                force = True
+        is_lenghts_matches = len(self_axis) == len(internal_axis)
         if not is_equals:
-            if force:
-                if not is_lenghts_matches:
-                    if axis:
-                        self._column_widths_cache = None
-                    else:
-                        self._row_lengths_cache = None
-                new_axis = self.axes[axis] if is_lenghts_matches else internal_axis
-                self._set_axis(axis, new_axis, cache_only=not is_lenghts_matches)
-            else:
-                self._set_axis(
-                    axis,
-                    self.axes[axis],
-                )
+            if not is_lenghts_matches:
+                if axis:
+                    self._column_widths_cache = None
+                else:
+                    self._row_lengths_cache = None
+            new_axis = self_axis if is_lenghts_matches and not force else internal_axis
+            self._set_axis(axis, new_axis, cache_only=not is_lenghts_matches)

     def _validate_internal_indices(self, mode=None, **kwargs):
         """
@@ -316,7 +320,6 @@ def _validate_internal_indices(self, mode=None, **kwargs):
             Whether to update external indices with internal if their lengths
             do not match or raise an exception in that case.
         """
-
         if isinstance(mode, bool):
             is_force = mode
             mode = "all"
@@ -329,7 +332,7 @@ def _validate_internal_indices(self, mode=None, **kwargs):
"reduced": {
"validate_index": self.index.equals(reduced_sample),
"validate_columns": self.columns.equals(reduced_sample),
"force": True,
"force": False,
},
"all": {
"validate_index": True,
@@ -560,6 +563,7 @@ def mask(
             new_row_lengths,
             new_col_widths,
             new_dtypes,
+            validate_axes="all" if new_partitions.size != 0 else False,
         )
         # Check if monotonically increasing, return if it is. Fast track code path for
         # common case to keep it fast.
@@ -1254,7 +1258,7 @@ def _apply_full_axis(
             None,
             None,
             dtypes,
-            validate_axes="reduced",
+            validate_axes="all" if new_partitions.size != 0 else False,
         )

     def _apply_full_axis_select_indices(
@@ -1818,13 +1822,15 @@ def to_pandas(self):

         return df

-    def to_numpy(self):
-        """Converts Modin DataFrame to a 2D NumPy array.
+    def to_numpy(self, **kwargs):
+        """
+        Converts Modin DataFrame to a 2D NumPy array.

-        Returns:
+        Returns
+        -------
             NumPy array.
         """
-        return self._frame_mgr_cls.to_numpy(self._partitions)
+        return self._frame_mgr_cls.to_numpy(self._partitions, **kwargs)

     def transpose(self):
         """Transpose the index and columns of this dataframe.
2 changes: 1 addition & 1 deletion modin/engines/base/frame/partition.py
@@ -79,7 +79,7 @@ def to_pandas(self):
"""
raise NotImplementedError(NOT_IMPLEMENTED_MESSAGE)

def to_numpy(self):
def to_numpy(self, **kwargs):
"""Convert the object stored in this partition to a NumPy array.

Note: If the underlying object is a Pandas DataFrame, this will return
12 changes: 8 additions & 4 deletions modin/engines/base/frame/partition_manager.py
@@ -360,13 +360,17 @@ def to_pandas(cls, partitions):
         return cls.concatenate(df_rows)

     @classmethod
-    def to_numpy(cls, partitions):
-        """Convert this object into a NumPy array from the partitions.
+    def to_numpy(cls, partitions, **kwargs):
+        """
+        Convert this object into a NumPy array from the partitions.

-        Returns:
+        Returns
+        -------
             A NumPy array
         """
-        return np.block([[block.to_numpy() for block in row] for row in partitions])
+        return np.block(
+            [[block.to_numpy(**kwargs) for block in row] for row in partitions]
+        )

     @classmethod
     def from_pandas(cls, df, return_dims=False):
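np.block stitches the per-partition 2D arrays back into one array while preserving the row/column layout of the partition grid. A toy sketch of the same call pattern with hand-built arrays:

    import numpy as np

    # A 2x2 grid of partition arrays, analogous to
    # [[block.to_numpy() for block in row] for row in partitions].
    parts = [
        [np.ones((2, 2)), np.zeros((2, 1))],
        [np.zeros((1, 2)), np.ones((1, 1))],
    ]
    print(np.block(parts).shape)  # (3, 3)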
7 changes: 7 additions & 0 deletions modin/engines/base/io/io.py
@@ -15,6 +15,7 @@
 from collections import OrderedDict
 from modin.error_message import ErrorMessage
 from modin.backends.base.query_compiler import BaseQueryCompiler
+from typing import Optional


 class BaseIO(object):
@@ -197,6 +198,7 @@ def read_json(
         lines=False,
         chunksize=None,
         compression="infer",
+        nrows: Optional[int] = None,
     ):
         ErrorMessage.default_to_pandas("`read_json`")
         kwargs = {
@@ -214,6 +216,7 @@
"lines": lines,
"chunksize": chunksize,
"compression": compression,
"nrows": nrows,
}
return cls.from_pandas(pandas.read_json(**kwargs))

@@ -234,6 +237,7 @@ def read_gbq(
         private_key=None,
         verbose=None,
         progress_bar_type=None,
+        max_results=None,
     ):
         ErrorMessage.default_to_pandas("`read_gbq`")
         return cls.from_pandas(
@@ -252,6 +256,7 @@
                 private_key=private_key,
                 verbose=verbose,
                 progress_bar_type=progress_bar_type,
+                max_results=max_results,
             )
         )
@@ -327,6 +332,7 @@ def read_excel(
         skipfooter=0,
         convert_float=True,
         mangle_dupe_cols=True,
+        na_filter=True,
         **kwds,
     ):
         if skip_footer != 0:
@@ -357,6 +363,7 @@
             skipfooter=skipfooter,
             convert_float=convert_float,
             mangle_dupe_cols=mangle_dupe_cols,
+            na_filter=na_filter,
             **kwds,
         )
         if isinstance(intermediate, (OrderedDict, dict)):
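These signature additions plumb through parameters from the pandas 1.1 API: nrows for read_json (new in pandas 1.1 and only valid with lines=True), max_results for read_gbq, and na_filter for read_excel. A sketch of the pandas calls being forwarded to; the file paths are placeholders:

    import pandas

    # Read only the first 1000 lines of a line-delimited JSON file.
    df = pandas.read_json("data.jsonl", lines=True, nrows=1000)

    # Skip NA detection entirely, which can speed up reading sheets
    # that contain no NA markers.
    sheet = pandas.read_excel("book.xlsx", na_filter=False)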
10 changes: 6 additions & 4 deletions modin/engines/dask/pandas_on_dask/frame/partition.py
@@ -123,13 +123,15 @@ def to_pandas(self):

         return dataframe

-    def to_numpy(self):
-        """Convert the object stored in this parition to a NumPy array.
+    def to_numpy(self, **kwargs):
+        """
+        Convert the object stored in this parition to a NumPy array.

-        Returns:
+        Returns
+        -------
             A NumPy array.
         """
-        return self.apply(lambda df: df.to_numpy()).get()
+        return self.apply(lambda df, **kwargs: df.to_numpy(**kwargs)).get()

     @classmethod
     def put(cls, obj):