From 4b06cab13369de78a637244b1281c1c417e2327c Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 29 Mar 2023 16:45:28 -0700 Subject: [PATCH 1/3] PERF: concat --- pandas/core/dtypes/concat.py | 28 +++++++++++++++------------- pandas/core/reshape/concat.py | 17 +++++++++-------- pandas/tests/extension/test_numpy.py | 13 +------------ 3 files changed, 25 insertions(+), 33 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 9917af9da7665..e8307141762f2 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -7,6 +7,8 @@ import numpy as np +from pandas._libs import lib + from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.cast import ( common_dtype_categorical_compat, @@ -29,6 +31,14 @@ from pandas.core.arrays import Categorical +def _is_nonempty(x, axis) -> bool: + # filter empty arrays + # 1-d dtypes always are included here + if x.ndim <= axis: + return True + return x.shape[axis] > 0 + + def concat_compat(to_concat, axis: AxisInt = 0, ea_compat_axis: bool = False): """ provide concatenation of an array of arrays each of which is a single @@ -48,21 +58,13 @@ def concat_compat(to_concat, axis: AxisInt = 0, ea_compat_axis: bool = False): ------- a single array, preserving the combined dtypes """ - - # filter empty arrays - # 1-d dtypes always are included here - def is_nonempty(x) -> bool: - if x.ndim <= axis: - return True - return x.shape[axis] > 0 - # If all arrays are empty, there's nothing to convert, just short-cut to # the concatenation, #3121. # # Creating an empty array directly is tempting, but the winnings would be # marginal given that it would still require shape & dtype calculation and # np.concatenate which has them both implemented is compiled. - non_empties = [x for x in to_concat if is_nonempty(x)] + non_empties = [x for x in to_concat if _is_nonempty(x, axis)] if non_empties and axis == 0 and not ea_compat_axis: # ea_compat_axis see GH#39574 to_concat = non_empties @@ -70,13 +72,13 @@ def is_nonempty(x) -> bool: dtypes = {obj.dtype for obj in to_concat} kinds = {obj.dtype.kind for obj in to_concat} contains_datetime = any( - isinstance(dtype, (np.dtype, DatetimeTZDtype)) and dtype.kind in ["m", "M"] + isinstance(dtype, (np.dtype, DatetimeTZDtype)) and dtype.kind in "mM" for dtype in dtypes ) or any(isinstance(obj, ABCExtensionArray) and obj.ndim > 1 for obj in to_concat) all_empty = not len(non_empties) - single_dtype = len({x.dtype for x in to_concat}) == 1 - any_ea = any(isinstance(x.dtype, ExtensionDtype) for x in to_concat) + single_dtype = len(dtypes) == 1 + any_ea = any(isinstance(x, ExtensionDtype) for x in dtypes) if contains_datetime: return _concat_datetime(to_concat, axis=axis) @@ -308,7 +310,7 @@ def _concat_datetime(to_concat, axis: AxisInt = 0): to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat] - single_dtype = len({x.dtype for x in to_concat}) == 1 + single_dtype = lib.dtypes_all_equal([x.dtype for x in to_concat]) # multiple types, need to coerce to object if not single_dtype: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index d3806c6850b7a..0e920876e09f2 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -480,9 +480,7 @@ def __init__( else: # filter out the empties if we have not multi-index possibilities # note to keep empty Series as it affect to result columns / name - non_empties = [ - obj for obj in objs if sum(obj.shape) > 0 or isinstance(obj, ABCSeries) - ] + non_empties = [obj for obj in 
objs if sum(obj.shape) > 0 or obj.ndim == 1] if len(non_empties) and ( keys is None and names is None and levels is None and not self.intersect @@ -495,19 +493,21 @@ def __init__( self.objs = objs # Standardize axis parameter to int - if isinstance(sample, ABCSeries): + if sample.ndim == 1: from pandas import DataFrame axis = DataFrame._get_axis_number(axis) + self._is_frame = False + self._is_series = True else: axis = sample._get_axis_number(axis) + self._is_frame = True + self._is_series = False # Need to flip BlockManager axis in the DataFrame special case - self._is_frame = isinstance(sample, ABCDataFrame) if self._is_frame: axis = sample._get_block_manager_axis(axis) - self._is_series = isinstance(sample, ABCSeries) if not 0 <= axis <= sample.ndim: raise AssertionError( f"axis must be between 0 and {sample.ndim}, input was {axis}" @@ -583,7 +583,8 @@ def get_result(self): arrs = [ser._values for ser in self.objs] res = concat_compat(arrs, axis=0) - result = cons(res, index=self.new_axes[0], name=name, dtype=res.dtype) + mgr = type(sample._mgr).from_array(res, index=self.new_axes[0]) + result = cons(mgr, name=name, fastpath=True) return result.__finalize__(self, method="concat") # combine as columns in a frame @@ -666,7 +667,7 @@ def _get_concat_axis(self) -> Index: num = 0 has_names = False for i, x in enumerate(self.objs): - if not isinstance(x, ABCSeries): + if x.ndim != 1: raise TypeError( f"Cannot concatenate type 'Series' with " f"object of type '{type(x).__name__}'" diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 9cf7a08357720..16b05be2e0bb9 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -346,18 +346,7 @@ def test_fillna_frame(self, data_missing): class TestReshaping(BaseNumPyTests, base.BaseReshapingTests): - @pytest.mark.parametrize( - "in_frame", - [ - True, - pytest.param( - False, - marks=pytest.mark.xfail(reason="PandasArray inconsistently extracted"), - ), - ], - ) - def test_concat(self, data, in_frame): - super().test_concat(data, in_frame) + pass class TestSetitem(BaseNumPyTests, base.BaseSetitemTests): From 009a0e8e9b938e451f2b67d5e5186a491cc3183a Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 30 Mar 2023 14:46:50 -0700 Subject: [PATCH 2/3] REF/PERF: pd.concat --- pandas/core/generic.py | 8 +- pandas/core/reshape/concat.py | 208 ++++++++++++++++++++-------------- 2 files changed, 129 insertions(+), 87 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bb3573d148e98..f7d6dd15a8ac7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9347,7 +9347,13 @@ def compare( else: axis = self._get_axis_number(align_axis) - diff = concat([self, other], axis=axis, keys=result_names) + # error: List item 0 has incompatible type "NDFrame"; expected + # "Union[Series, DataFrame]" + diff = concat( + [self, other], # type: ignore[list-item] + axis=axis, + keys=result_names, + ) if axis >= self.ndim: # No need to reorganize data if stacking on new axis diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 0e920876e09f2..0670b89a45d4a 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -56,7 +56,6 @@ DataFrame, Series, ) - from pandas.core.generic import NDFrame # --------------------------------------------------------------------- # Concatenate DataFrame objects @@ -98,7 +97,7 @@ def concat( @overload def concat( - objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], + objs: 
Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], *, axis: Literal[0, "index"] = ..., join: str = ..., @@ -115,7 +114,7 @@ def concat( @overload def concat( - objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], *, axis: Literal[1, "columns"], join: str = ..., @@ -132,7 +131,7 @@ def concat( @overload def concat( - objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], *, axis: Axis = ..., join: str = ..., @@ -148,7 +147,7 @@ def concat( def concat( - objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], *, axis: Axis = 0, join: str = "outer", @@ -395,7 +394,7 @@ class _Concatenator: def __init__( self, - objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame], + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], axis: Axis = 0, join: str = "outer", keys=None, @@ -421,6 +420,72 @@ def __init__( "Only can inner (intersect) or outer (union) join the other axis" ) + if not is_bool(sort): + raise ValueError( + f"The 'sort' keyword only accepts boolean values; {sort} was passed." + ) + # Incompatible types in assignment (expression has type "Union[bool, bool_]", + # variable has type "bool") + self.sort = sort # type: ignore[assignment] + + self.ignore_index = ignore_index + self.verify_integrity = verify_integrity + self.copy = copy + + objs, keys = self._clean_keys_and_objs(objs, keys) + + # figure out what our result ndim is going to be + ndims = self._get_ndims(objs) + sample, objs = self._get_sample_object(objs, ndims, keys, names, levels) + + # Standardize axis parameter to int + if sample.ndim == 1: + from pandas import DataFrame + + axis = DataFrame._get_axis_number(axis) + self._is_frame = False + self._is_series = True + else: + axis = sample._get_axis_number(axis) + self._is_frame = True + self._is_series = False + + # Need to flip BlockManager axis in the DataFrame special case + axis = sample._get_block_manager_axis(axis) + + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed + if len(ndims) > 1: + objs, sample = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis) + + self.objs = objs + + # note: this is the BlockManager axis (since DataFrame is transposed) + self.bm_axis = axis + self.axis = 1 - self.bm_axis if self._is_frame else 0 + self.keys = keys + self.names = names or getattr(keys, "names", None) + self.levels = levels + + def _get_ndims(self, objs: list[Series | DataFrame]) -> set[int]: + # figure out what our result ndim is going to be + ndims = set() + for obj in objs: + if not isinstance(obj, (ABCSeries, ABCDataFrame)): + msg = ( + f"cannot concatenate object of type '{type(obj)}'; " + "only Series and DataFrame objs are valid" + ) + raise TypeError(msg) + + ndims.add(obj.ndim) + return ndims + + def _clean_keys_and_objs( + self, + objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], + keys, + ) -> tuple[list[Series | DataFrame], Index | None]: if isinstance(objs, abc.Mapping): if keys is None: keys = list(objs.keys()) @@ -434,7 +499,7 @@ def __init__( if keys is None: objs = list(com.not_none(*objs)) else: - # #1649 + # GH#1649 clean_keys = [] clean_objs = [] for k, v in zip(keys, objs): @@ -454,22 +519,20 @@ def __init__( if len(objs) == 0: raise ValueError("All objects passed were None") - # figure out what our 
result ndim is going to be - ndims = set() - for obj in objs: - if not isinstance(obj, (ABCSeries, ABCDataFrame)): - msg = ( - f"cannot concatenate object of type '{type(obj)}'; " - "only Series and DataFrame objs are valid" - ) - raise TypeError(msg) - - ndims.add(obj.ndim) + return objs, keys + def _get_sample_object( + self, + objs: list[Series | DataFrame], + ndims: set[int], + keys, + names, + levels, + ) -> tuple[Series | DataFrame, list[Series | DataFrame]]: # get the sample # want the highest ndim that we have, and must be non-empty # unless all objs are empty - sample: NDFrame | None = None + sample: Series | DataFrame | None = None if len(ndims) > 1: max_ndim = max(ndims) for obj in objs: @@ -490,82 +553,48 @@ def __init__( if sample is None: sample = objs[0] - self.objs = objs - - # Standardize axis parameter to int - if sample.ndim == 1: - from pandas import DataFrame - - axis = DataFrame._get_axis_number(axis) - self._is_frame = False - self._is_series = True - else: - axis = sample._get_axis_number(axis) - self._is_frame = True - self._is_series = False - - # Need to flip BlockManager axis in the DataFrame special case - if self._is_frame: - axis = sample._get_block_manager_axis(axis) - - if not 0 <= axis <= sample.ndim: - raise AssertionError( - f"axis must be between 0 and {sample.ndim}, input was {axis}" - ) + return sample, objs + def _sanitize_mixed_ndim( + self, + objs: list[Series | DataFrame], + sample: Series | DataFrame, + ignore_index: bool, + axis: AxisInt, + ) -> tuple[list[Series | DataFrame], Series | DataFrame]: # if we have mixed ndims, then convert to highest ndim # creating column numbers as needed - if len(ndims) > 1: - current_column = 0 - max_ndim = sample.ndim - self.objs, objs = [], self.objs - for obj in objs: - ndim = obj.ndim - if ndim == max_ndim: - pass - elif ndim != max_ndim - 1: - raise ValueError( - "cannot concatenate unaligned mixed " - "dimensional NDFrame objects" - ) + new_objs = [] - else: - name = getattr(obj, "name", None) - if ignore_index or name is None: - name = current_column - current_column += 1 + current_column = 0 + max_ndim = sample.ndim + for obj in objs: + ndim = obj.ndim + if ndim == max_ndim: + pass - # doing a row-wise concatenation so need everything - # to line up - if self._is_frame and axis == 1: - name = 0 - # mypy needs to know sample is not an NDFrame - sample = cast("DataFrame | Series", sample) - obj = sample._constructor({name: obj}, copy=False) + elif ndim != max_ndim - 1: + raise ValueError( + "cannot concatenate unaligned mixed dimensional NDFrame objects" + ) - self.objs.append(obj) + else: + name = getattr(obj, "name", None) + if ignore_index or name is None: + name = current_column + current_column += 1 - # note: this is the BlockManager axis (since DataFrame is transposed) - self.bm_axis = axis - self.axis = 1 - self.bm_axis if self._is_frame else 0 - self.keys = keys - self.names = names or getattr(keys, "names", None) - self.levels = levels + # doing a row-wise concatenation so need everything + # to line up + if self._is_frame and axis == 1: + name = 0 - if not is_bool(sort): - raise ValueError( - f"The 'sort' keyword only accepts boolean values; {sort} was passed." 
- ) - # Incompatible types in assignment (expression has type "Union[bool, bool_]", - # variable has type "bool") - self.sort = sort # type: ignore[assignment] + obj = sample._constructor({name: obj}, copy=False) - self.ignore_index = ignore_index - self.verify_integrity = verify_integrity - self.copy = copy + new_objs.append(obj) - self.new_axes = self._get_new_axes() + return new_objs, sample def get_result(self): cons: Callable[..., DataFrame | Series] @@ -583,7 +612,13 @@ def get_result(self): arrs = [ser._values for ser in self.objs] res = concat_compat(arrs, axis=0) - mgr = type(sample._mgr).from_array(res, index=self.new_axes[0]) + if self.ignore_index: + # We can avoid surprisingly-expensive _get_concat_axis + new_index = default_index(len(res)) + else: + new_index = self.new_axes[0] + + mgr = type(sample._mgr).from_array(res, index=new_index) result = cons(mgr, name=name, fastpath=True) return result.__finalize__(self, method="concat") @@ -634,7 +669,8 @@ def _get_result_dim(self) -> int: else: return self.objs[0].ndim - def _get_new_axes(self) -> list[Index]: + @cache_readonly + def new_axes(self) -> list[Index]: ndim = self._get_result_dim() return [ self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i) From f31df8d0e3346c83478f2da11be2d35b631aab3b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 30 Mar 2023 16:15:05 -0700 Subject: [PATCH 3/3] mypy fixup --- pandas/core/reshape/concat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 0670b89a45d4a..a74273cd3fac5 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -612,6 +612,8 @@ def get_result(self): arrs = [ser._values for ser in self.objs] res = concat_compat(arrs, axis=0) + + new_index: Index if self.ignore_index: # We can avoid surprisingly-expensive _get_concat_axis new_index = default_index(len(res))
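
For context, a minimal sketch of the workload these three patches are aimed at; the sizes and variable names below are illustrative assumptions, not taken from the patches themselves:

    # Axis-0 concatenation of many small Series with ignore_index=True.
    # With these commits, the Series values are concatenated once via
    # concat_compat, the result index is a cheap default_index rather than
    # the product of _get_concat_axis, and the result Series is built from
    # a ready-made manager via from_array.
    import numpy as np
    import pandas as pd

    pieces = [pd.Series(np.arange(10)) for _ in range(1_000)]
    out = pd.concat(pieces, ignore_index=True)
    assert len(out) == 10_000

With ignore_index=False the concatenated index still has to be built, which is why get_result only takes the default_index shortcut when ignore_index is set.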