diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bb3573d148e98..f7d6dd15a8ac7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9347,7 +9347,13 @@ def compare( else: axis = self._get_axis_number(align_axis) - diff = concat([self, other], axis=axis, keys=result_names) + # error: List item 0 has incompatible type "NDFrame"; expected + # "Union[Series, DataFrame]" + diff = concat( + [self, other], # type: ignore[list-item] + axis=axis, + keys=result_names, + ) if axis >= self.ndim: # No need to reorganize data if stacking on new axis diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 0e920876e09f2..cc2bde18a4cc2 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -56,7 +56,6 @@ DataFrame, Series, ) - from pandas.core.generic import NDFrame # --------------------------------------------------------------------- # Concatenate DataFrame objects @@ -98,7 +97,7 @@ def concat( @overload def concat( - objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], *, axis: Literal[0, "index"] = ..., join: str = ..., @@ -115,7 +114,7 @@ def concat( @overload def concat( - objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], *, axis: Literal[1, "columns"], join: str = ..., @@ -132,7 +131,7 @@ def concat( @overload def concat( - objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], *, axis: Axis = ..., join: str = ..., @@ -148,7 +147,7 @@ def concat( def concat( - objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], *, axis: Axis = 0, join: str = "outer", @@ -395,7 +394,7 @@ class _Concatenator: def __init__( self, - objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], axis: Axis = 0, join: str = "outer", keys=None, @@ -421,6 +420,72 @@ def __init__( "Only can inner (intersect) or outer (union) join the other axis" ) + if not is_bool(sort): + raise ValueError( + f"The 'sort' keyword only accepts boolean values; {sort} was passed." + ) + # Incompatible types in assignment (expression has type "Union[bool, bool_]", + # variable has type "bool") + self.sort = sort # type: ignore[assignment] + + self.ignore_index = ignore_index + self.verify_integrity = verify_integrity + self.copy = copy + + objs, keys = self._clean_keys_and_objs(objs, keys) + + # figure out what our result ndim is going to be + ndims = self._get_ndims(objs) + sample, objs = self._get_sample_object(objs, ndims, keys, names, levels) + + # Standardize axis parameter to int + if sample.ndim == 1: + from pandas import DataFrame + + axis = DataFrame._get_axis_number(axis) + self._is_frame = False + self._is_series = True + else: + axis = sample._get_axis_number(axis) + self._is_frame = True + self._is_series = False + + # Need to flip BlockManager axis in the DataFrame special case + axis = sample._get_block_manager_axis(axis) + + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed + if len(ndims) > 1: + objs, sample = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis) + + self.objs = objs + + # note: this is the BlockManager axis (since DataFrame is transposed) + self.bm_axis = axis + self.axis = 1 - self.bm_axis if self._is_frame else 0 + self.keys = keys + self.names = names or getattr(keys, "names", None) + self.levels = levels + + def _get_ndims(self, objs: list[Series | DataFrame]) -> set[int]: + # figure out what our result ndim is going to be + ndims = set() + for obj in objs: + if not isinstance(obj, (ABCSeries, ABCDataFrame)): + msg = ( + f"cannot concatenate object of type '{type(obj)}'; " + "only Series and DataFrame objs are valid" + ) + raise TypeError(msg) + + ndims.add(obj.ndim) + return ndims + + def _clean_keys_and_objs( + self, + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], + keys, + ) -> tuple[list[Series | DataFrame], Index | None]: if isinstance(objs, abc.Mapping): if keys is None: keys = list(objs.keys()) @@ -434,7 +499,7 @@ def __init__( if keys is None: objs = list(com.not_none(*objs)) else: - # #1649 + # GH#1649 clean_keys = [] clean_objs = [] for k, v in zip(keys, objs): @@ -454,22 +519,20 @@ def __init__( if len(objs) == 0: raise ValueError("All objects passed were None") - # figure out what our result ndim is going to be - ndims = set() - for obj in objs: - if not isinstance(obj, (ABCSeries, ABCDataFrame)): - msg = ( - f"cannot concatenate object of type '{type(obj)}'; " - "only Series and DataFrame objs are valid" - ) - raise TypeError(msg) - - ndims.add(obj.ndim) + return objs, keys + def _get_sample_object( + self, + objs: list[Series | DataFrame], + ndims: set[int], + keys, + names, + levels, + ) -> tuple[Series | DataFrame, list[Series | DataFrame]]: # get the sample # want the highest ndim that we have, and must be non-empty # unless all objs are empty - sample: NDFrame | None = None + sample: Series | DataFrame | None = None if len(ndims) > 1: max_ndim = max(ndims) for obj in objs: @@ -490,82 +553,48 @@ def __init__( if sample is None: sample = objs[0] - self.objs = objs - - # Standardize axis parameter to int - if sample.ndim == 1: - from pandas import DataFrame - - axis = DataFrame._get_axis_number(axis) - self._is_frame = False - self._is_series = True - else: - axis = sample._get_axis_number(axis) - self._is_frame = True - self._is_series = False - - # Need to flip BlockManager axis in the DataFrame special case - if self._is_frame: - axis = sample._get_block_manager_axis(axis) - - if not 0 <= axis <= sample.ndim: - raise AssertionError( - f"axis must be between 0 and {sample.ndim}, input was {axis}" - ) + return sample, objs + def _sanitize_mixed_ndim( + self, + objs: list[Series | DataFrame], + sample: Series | DataFrame, + ignore_index: bool, + axis: AxisInt, + ) -> tuple[list[Series | DataFrame], Series | DataFrame]: # if we have mixed ndims, then convert to highest ndim # creating column numbers as needed - if len(ndims) > 1: - current_column = 0 - max_ndim = sample.ndim - self.objs, objs = [], self.objs - for obj in objs: - ndim = obj.ndim - if ndim == max_ndim: - pass - elif ndim != max_ndim - 1: - raise ValueError( - "cannot concatenate unaligned mixed " - "dimensional NDFrame objects" - ) + new_objs = [] - else: - name = getattr(obj, "name", None) - if ignore_index or name is None: - name = current_column - current_column += 1 + current_column = 0 + max_ndim = sample.ndim + for obj in objs: + ndim = obj.ndim + if ndim == max_ndim: + pass - # doing a row-wise concatenation so need everything - # to line up - if self._is_frame and axis == 1: - name = 0 - # mypy needs to know sample is not an NDFrame - sample = cast("DataFrame | Series", sample) - obj = sample._constructor({name: obj}, copy=False) + elif ndim != max_ndim - 1: + raise ValueError( + "cannot concatenate unaligned mixed dimensional NDFrame objects" + ) - self.objs.append(obj) + else: + name = getattr(obj, "name", None) + if ignore_index or name is None: + name = current_column + current_column += 1 - # note: this is the BlockManager axis (since DataFrame is transposed) - self.bm_axis = axis - self.axis = 1 - self.bm_axis if self._is_frame else 0 - self.keys = keys - self.names = names or getattr(keys, "names", None) - self.levels = levels + # doing a row-wise concatenation so need everything + # to line up + if self._is_frame and axis == 1: + name = 0 - if not is_bool(sort): - raise ValueError( - f"The 'sort' keyword only accepts boolean values; {sort} was passed." - ) - # Incompatible types in assignment (expression has type "Union[bool, bool_]", - # variable has type "bool") - self.sort = sort # type: ignore[assignment] + obj = sample._constructor({name: obj}, copy=False) - self.ignore_index = ignore_index - self.verify_integrity = verify_integrity - self.copy = copy + new_objs.append(obj) - self.new_axes = self._get_new_axes() + return new_objs, sample def get_result(self): cons: Callable[..., DataFrame | Series] @@ -583,7 +612,16 @@ def get_result(self): arrs = [ser._values for ser in self.objs] res = concat_compat(arrs, axis=0) - mgr = type(sample._mgr).from_array(res, index=self.new_axes[0]) + + new_index: Index + if self.ignore_index: + # We can avoid surprisingly-expensive _get_concat_axis + new_index = default_index(len(res)) + else: + new_index = self.new_axes[0] + + mgr = type(sample._mgr).from_array(res, index=new_index) + result = cons(mgr, name=name, fastpath=True) return result.__finalize__(self, method="concat") @@ -634,7 +672,8 @@ def _get_result_dim(self) -> int: else: return self.objs[0].ndim - def _get_new_axes(self) -> list[Index]: + @cache_readonly + def new_axes(self) -> list[Index]: ndim = self._get_result_dim() return [ self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i)