From 4b06cab13369de78a637244b1281c1c417e2327c Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 29 Mar 2023 16:45:28 -0700 Subject: [PATCH 1/3] PERF: concat --- pandas/core/dtypes/concat.py | 28 +++++++++++++++------------- pandas/core/reshape/concat.py | 17 +++++++++-------- pandas/tests/extension/test_numpy.py | 13 +------------ 3 files changed, 25 insertions(+), 33 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 9917af9da7665..e8307141762f2 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -7,6 +7,8 @@ import numpy as np +from pandas._libs import lib + from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.cast import ( common_dtype_categorical_compat, @@ -29,6 +31,14 @@ from pandas.core.arrays import Categorical +def _is_nonempty(x, axis) -> bool: + # filter empty arrays + # 1-d dtypes always are included here + if x.ndim <= axis: + return True + return x.shape[axis] > 0 + + def concat_compat(to_concat, axis: AxisInt = 0, ea_compat_axis: bool = False): """ provide concatenation of an array of arrays each of which is a single @@ -48,21 +58,13 @@ def concat_compat(to_concat, axis: AxisInt = 0, ea_compat_axis: bool = False): ------- a single array, preserving the combined dtypes """ - - # filter empty arrays - # 1-d dtypes always are included here - def is_nonempty(x) -> bool: - if x.ndim <= axis: - return True - return x.shape[axis] > 0 - # If all arrays are empty, there's nothing to convert, just short-cut to # the concatenation, #3121. # # Creating an empty array directly is tempting, but the winnings would be # marginal given that it would still require shape & dtype calculation and # np.concatenate which has them both implemented is compiled. - non_empties = [x for x in to_concat if is_nonempty(x)] + non_empties = [x for x in to_concat if _is_nonempty(x, axis)] if non_empties and axis == 0 and not ea_compat_axis: # ea_compat_axis see GH#39574 to_concat = non_empties @@ -70,13 +72,13 @@ def is_nonempty(x) -> bool: dtypes = {obj.dtype for obj in to_concat} kinds = {obj.dtype.kind for obj in to_concat} contains_datetime = any( - isinstance(dtype, (np.dtype, DatetimeTZDtype)) and dtype.kind in ["m", "M"] + isinstance(dtype, (np.dtype, DatetimeTZDtype)) and dtype.kind in "mM" for dtype in dtypes ) or any(isinstance(obj, ABCExtensionArray) and obj.ndim > 1 for obj in to_concat) all_empty = not len(non_empties) - single_dtype = len({x.dtype for x in to_concat}) == 1 - any_ea = any(isinstance(x.dtype, ExtensionDtype) for x in to_concat) + single_dtype = len(dtypes) == 1 + any_ea = any(isinstance(x, ExtensionDtype) for x in dtypes) if contains_datetime: return _concat_datetime(to_concat, axis=axis) @@ -308,7 +310,7 @@ def _concat_datetime(to_concat, axis: AxisInt = 0): to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat] - single_dtype = len({x.dtype for x in to_concat}) == 1 + single_dtype = lib.dtypes_all_equal([x.dtype for x in to_concat]) # multiple types, need to coerce to object if not single_dtype: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index d3806c6850b7a..0e920876e09f2 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -480,9 +480,7 @@ def __init__( else: # filter out the empties if we have not multi-index possibilities # note to keep empty Series as it affect to result columns / name - non_empties = [ - obj for obj in objs if sum(obj.shape) > 0 or isinstance(obj, ABCSeries) - ] + non_empties = [obj for obj in 
objs if sum(obj.shape) > 0 or obj.ndim == 1] if len(non_empties) and ( keys is None and names is None and levels is None and not self.intersect @@ -495,19 +493,21 @@ def __init__( self.objs = objs # Standardize axis parameter to int - if isinstance(sample, ABCSeries): + if sample.ndim == 1: from pandas import DataFrame axis = DataFrame._get_axis_number(axis) + self._is_frame = False + self._is_series = True else: axis = sample._get_axis_number(axis) + self._is_frame = True + self._is_series = False # Need to flip BlockManager axis in the DataFrame special case - self._is_frame = isinstance(sample, ABCDataFrame) if self._is_frame: axis = sample._get_block_manager_axis(axis) - self._is_series = isinstance(sample, ABCSeries) if not 0 <= axis <= sample.ndim: raise AssertionError( f"axis must be between 0 and {sample.ndim}, input was {axis}" @@ -583,7 +583,8 @@ def get_result(self): arrs = [ser._values for ser in self.objs] res = concat_compat(arrs, axis=0) - result = cons(res, index=self.new_axes[0], name=name, dtype=res.dtype) + mgr = type(sample._mgr).from_array(res, index=self.new_axes[0]) + result = cons(mgr, name=name, fastpath=True) return result.__finalize__(self, method="concat") # combine as columns in a frame @@ -666,7 +667,7 @@ def _get_concat_axis(self) -> Index: num = 0 has_names = False for i, x in enumerate(self.objs): - if not isinstance(x, ABCSeries): + if x.ndim != 1: raise TypeError( f"Cannot concatenate type 'Series' with " f"object of type '{type(x).__name__}'" diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 9cf7a08357720..16b05be2e0bb9 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -346,18 +346,7 @@ def test_fillna_frame(self, data_missing): class TestReshaping(BaseNumPyTests, base.BaseReshapingTests): - @pytest.mark.parametrize( - "in_frame", - [ - True, - pytest.param( - False, - marks=pytest.mark.xfail(reason="PandasArray inconsistently extracted"), - ), - ], - ) - def test_concat(self, data, in_frame): - super().test_concat(data, in_frame) + pass class TestSetitem(BaseNumPyTests, base.BaseSetitemTests): From 009a0e8e9b938e451f2b67d5e5186a491cc3183a Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 30 Mar 2023 14:46:50 -0700 Subject: [PATCH 2/3] REF/PERF: pd.concat --- pandas/core/generic.py | 8 +- pandas/core/reshape/concat.py | 208 ++++++++++++++++++++-------------- 2 files changed, 129 insertions(+), 87 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bb3573d148e98..f7d6dd15a8ac7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9347,7 +9347,13 @@ def compare( else: axis = self._get_axis_number(align_axis) - diff = concat([self, other], axis=axis, keys=result_names) + # error: List item 0 has incompatible type "NDFrame"; expected + # "Union[Series, DataFrame]" + diff = concat( + [self, other], # type: ignore[list-item] + axis=axis, + keys=result_names, + ) if axis >= self.ndim: # No need to reorganize data if stacking on new axis diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 0e920876e09f2..0670b89a45d4a 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -56,7 +56,6 @@ DataFrame, Series, ) - from pandas.core.generic import NDFrame # --------------------------------------------------------------------- # Concatenate DataFrame objects @@ -98,7 +97,7 @@ def concat( @overload def concat( - objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], + objs: 
Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], *, axis: Literal[0, "index"] = ..., join: str = ..., @@ -115,7 +114,7 @@ def concat( @overload def concat( - objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], *, axis: Literal[1, "columns"], join: str = ..., @@ -132,7 +131,7 @@ def concat( @overload def concat( - objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], *, axis: Axis = ..., join: str = ..., @@ -148,7 +147,7 @@ def concat( def concat( - objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], *, axis: Axis = 0, join: str = "outer", @@ -395,7 +394,7 @@ class _Concatenator: def __init__( self, - objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], axis: Axis = 0, join: str = "outer", keys=None, @@ -421,6 +420,72 @@ def __init__( "Only can inner (intersect) or outer (union) join the other axis" ) + if not is_bool(sort): + raise ValueError( + f"The 'sort' keyword only accepts boolean values; {sort} was passed." + ) + # Incompatible types in assignment (expression has type "Union[bool, bool_]", + # variable has type "bool") + self.sort = sort # type: ignore[assignment] + + self.ignore_index = ignore_index + self.verify_integrity = verify_integrity + self.copy = copy + + objs, keys = self._clean_keys_and_objs(objs, keys) + + # figure out what our result ndim is going to be + ndims = self._get_ndims(objs) + sample, objs = self._get_sample_object(objs, ndims, keys, names, levels) + + # Standardize axis parameter to int + if sample.ndim == 1: + from pandas import DataFrame + + axis = DataFrame._get_axis_number(axis) + self._is_frame = False + self._is_series = True + else: + axis = sample._get_axis_number(axis) + self._is_frame = True + self._is_series = False + + # Need to flip BlockManager axis in the DataFrame special case + axis = sample._get_block_manager_axis(axis) + + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed + if len(ndims) > 1: + objs, sample = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis) + + self.objs = objs + + # note: this is the BlockManager axis (since DataFrame is transposed) + self.bm_axis = axis + self.axis = 1 - self.bm_axis if self._is_frame else 0 + self.keys = keys + self.names = names or getattr(keys, "names", None) + self.levels = levels + + def _get_ndims(self, objs: list[Series | DataFrame]) -> set[int]: + # figure out what our result ndim is going to be + ndims = set() + for obj in objs: + if not isinstance(obj, (ABCSeries, ABCDataFrame)): + msg = ( + f"cannot concatenate object of type '{type(obj)}'; " + "only Series and DataFrame objs are valid" + ) + raise TypeError(msg) + + ndims.add(obj.ndim) + return ndims + + def _clean_keys_and_objs( + self, + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], + keys, + ) -> tuple[list[Series | DataFrame], Index | None]: if isinstance(objs, abc.Mapping): if keys is None: keys = list(objs.keys()) @@ -434,7 +499,7 @@ def __init__( if keys is None: objs = list(com.not_none(*objs)) else: - # #1649 + # GH#1649 clean_keys = [] clean_objs = [] for k, v in zip(keys, objs): @@ -454,22 +519,20 @@ def __init__( if len(objs) == 0: raise ValueError("All objects passed were None") - # figure out what our 
result ndim is going to be - ndims = set() - for obj in objs: - if not isinstance(obj, (ABCSeries, ABCDataFrame)): - msg = ( - f"cannot concatenate object of type '{type(obj)}'; " - "only Series and DataFrame objs are valid" - ) - raise TypeError(msg) - - ndims.add(obj.ndim) + return objs, keys + def _get_sample_object( + self, + objs: list[Series | DataFrame], + ndims: set[int], + keys, + names, + levels, + ) -> tuple[Series | DataFrame, list[Series | DataFrame]]: # get the sample # want the highest ndim that we have, and must be non-empty # unless all objs are empty - sample: NDFrame | None = None + sample: Series | DataFrame | None = None if len(ndims) > 1: max_ndim = max(ndims) for obj in objs: @@ -490,82 +553,48 @@ def __init__( if sample is None: sample = objs[0] - self.objs = objs - - # Standardize axis parameter to int - if sample.ndim == 1: - from pandas import DataFrame - - axis = DataFrame._get_axis_number(axis) - self._is_frame = False - self._is_series = True - else: - axis = sample._get_axis_number(axis) - self._is_frame = True - self._is_series = False - - # Need to flip BlockManager axis in the DataFrame special case - if self._is_frame: - axis = sample._get_block_manager_axis(axis) - - if not 0 <= axis <= sample.ndim: - raise AssertionError( - f"axis must be between 0 and {sample.ndim}, input was {axis}" - ) + return sample, objs + def _sanitize_mixed_ndim( + self, + objs: list[Series | DataFrame], + sample: Series | DataFrame, + ignore_index: bool, + axis: AxisInt, + ) -> tuple[list[Series | DataFrame], Series | DataFrame]: # if we have mixed ndims, then convert to highest ndim # creating column numbers as needed - if len(ndims) > 1: - current_column = 0 - max_ndim = sample.ndim - self.objs, objs = [], self.objs - for obj in objs: - ndim = obj.ndim - if ndim == max_ndim: - pass - elif ndim != max_ndim - 1: - raise ValueError( - "cannot concatenate unaligned mixed " - "dimensional NDFrame objects" - ) + new_objs = [] - else: - name = getattr(obj, "name", None) - if ignore_index or name is None: - name = current_column - current_column += 1 + current_column = 0 + max_ndim = sample.ndim + for obj in objs: + ndim = obj.ndim + if ndim == max_ndim: + pass - # doing a row-wise concatenation so need everything - # to line up - if self._is_frame and axis == 1: - name = 0 - # mypy needs to know sample is not an NDFrame - sample = cast("DataFrame | Series", sample) - obj = sample._constructor({name: obj}, copy=False) + elif ndim != max_ndim - 1: + raise ValueError( + "cannot concatenate unaligned mixed dimensional NDFrame objects" + ) - self.objs.append(obj) + else: + name = getattr(obj, "name", None) + if ignore_index or name is None: + name = current_column + current_column += 1 - # note: this is the BlockManager axis (since DataFrame is transposed) - self.bm_axis = axis - self.axis = 1 - self.bm_axis if self._is_frame else 0 - self.keys = keys - self.names = names or getattr(keys, "names", None) - self.levels = levels + # doing a row-wise concatenation so need everything + # to line up + if self._is_frame and axis == 1: + name = 0 - if not is_bool(sort): - raise ValueError( - f"The 'sort' keyword only accepts boolean values; {sort} was passed." 
- ) - # Incompatible types in assignment (expression has type "Union[bool, bool_]", - # variable has type "bool") - self.sort = sort # type: ignore[assignment] + obj = sample._constructor({name: obj}, copy=False) - self.ignore_index = ignore_index - self.verify_integrity = verify_integrity - self.copy = copy + new_objs.append(obj) - self.new_axes = self._get_new_axes() + return new_objs, sample def get_result(self): cons: Callable[..., DataFrame | Series] @@ -583,7 +612,13 @@ def get_result(self): arrs = [ser._values for ser in self.objs] res = concat_compat(arrs, axis=0) - mgr = type(sample._mgr).from_array(res, index=self.new_axes[0]) + if self.ignore_index: + # We can avoid surprisingly-expensive _get_concat_axis + new_index = default_index(len(res)) + else: + new_index = self.new_axes[0] + + mgr = type(sample._mgr).from_array(res, index=new_index) result = cons(mgr, name=name, fastpath=True) return result.__finalize__(self, method="concat") @@ -634,7 +669,8 @@ def _get_result_dim(self) -> int: else: return self.objs[0].ndim - def _get_new_axes(self) -> list[Index]: + @cache_readonly + def new_axes(self) -> list[Index]: ndim = self._get_result_dim() return [ self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i) From f31df8d0e3346c83478f2da11be2d35b631aab3b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 30 Mar 2023 16:15:05 -0700 Subject: [PATCH 3/3] mypy fixup --- pandas/core/reshape/concat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 0670b89a45d4a..a74273cd3fac5 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -612,6 +612,8 @@ def get_result(self): arrs = [ser._values for ser in self.objs] res = concat_compat(arrs, axis=0) + + new_index: Index if self.ignore_index: # We can avoid surprisingly-expensive _get_concat_axis new_index = default_index(len(res))
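
For context, a minimal sketch of the workload these three patches are aimed at; the sizes and variable names below are illustrative assumptions, not taken from the patches themselves:

    # Axis-0 concatenation of many small Series with ignore_index=True.
    # With these commits, the Series values are concatenated once via
    # concat_compat, the result index is a cheap default_index rather than
    # the product of _get_concat_axis, and the result Series is built from
    # a ready-made manager via from_array.
    import numpy as np
    import pandas as pd

    pieces = [pd.Series(np.arange(10)) for _ in range(1_000)]
    out = pd.concat(pieces, ignore_index=True)
    assert len(out) == 10_000

With ignore_index=False the concatenated index still has to be built, which is why get_result only takes the default_index shortcut when ignore_index is set.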