Merge branch 'main' into bug-agg-nonunique-col

luke396 · web-flow · commit 5daa349d9fbe · 2023-04-10T13:44:10.000+08:00
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -3303,13 +3303,9 @@ def post_processor(
 
         orig_scalar = is_scalar(q)
         if orig_scalar:
-            # error: Incompatible types in assignment (expression has type "List[
-            # Union[float, ExtensionArray, ndarray[Any, Any], Index, Series]]",
-            # variable has type "Union[float, Union[Union[ExtensionArray, ndarray[
-            # Any, Any]], Index, Series]]")
-            q = [q]  # type: ignore[assignment]
-
-        qs = np.array(q, dtype=np.float64)
+            qs = np.array([q], dtype=np.float64)
+        else:
+            qs = np.array(q, dtype=np.float64)
         ids, _, ngroups = self.grouper.group_info
         nqs = len(qs)
 
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
@@ -13,6 +13,7 @@
 from pandas._libs import (
     NaT,
     internals as libinternals,
+    lib,
 )
 from pandas._libs.missing import NA
 from pandas.util._decorators import cache_readonly
@@ -403,56 +404,41 @@ def __init__(self, block: Block, shape: Shape, indexers=None) -> None:
         # Note: block is None implies indexers is None, but not vice-versa
         if indexers is None:
             indexers = {}
+        # Otherwise we may have only {0: np.array(...)} and only non-negative
+        #  entries.
         self.block = block
         self.indexers = indexers
         self.shape = shape
 
     def __repr__(self) -> str:
         return f"{type(self).__name__}({repr(self.block)}, {self.indexers})"
 
-    @cache_readonly
-    def needs_filling(self) -> bool:
-        for indexer in self.indexers.values():
-            # FIXME: cache results of indexer == -1 checks.
-            if (indexer == -1).any():
-                return True
-
-        return False
-
-    @cache_readonly
-    def dtype(self) -> DtypeObj:
-        blk = self.block
-        if blk.values.dtype.kind == "V":
-            raise AssertionError("Block is None, no dtype")
-
-        if not self.needs_filling:
-            return blk.dtype
-        return ensure_dtype_can_hold_na(blk.dtype)
-
     def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
         """
         Check that we are all-NA of a type/dtype that is compatible with this dtype.
         Augments `self.is_na` with an additional check of the type of NA values.
         """
         if not self.is_na:
             return False
-        if self.block.dtype.kind == "V":
+
+        blk = self.block
+        if blk.dtype.kind == "V":
             return True
 
-        if self.dtype == object:
-            values = self.block.values
+        if blk.dtype == object:
+            values = blk.values
             return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K"))
 
-        na_value = self.block.fill_value
-        if na_value is NaT and not is_dtype_equal(self.dtype, dtype):
+        na_value = blk.fill_value
+        if na_value is NaT and not is_dtype_equal(blk.dtype, dtype):
             # e.g. we are dt64 and other is td64
-            # fill_values match but we should not cast self.block.values to dtype
+            # fill_values match but we should not cast blk.values to dtype
             # TODO: this will need updating if we ever have non-nano dt64/td64
             return False
 
         if na_value is NA and needs_i8_conversion(dtype):
             # FIXME: kludge; test_append_empty_frame_with_timedelta64ns_nat
-            #  e.g. self.dtype == "Int64" and dtype is td64, we dont want
+            #  e.g. blk.dtype == "Int64" and dtype is td64, we dont want
             #  to consider these as matching
             return False
 
@@ -663,9 +649,11 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
 
     has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
 
-    dtypes = [unit.dtype for unit in join_units if not unit.is_na]
+    dtypes = [unit.block.dtype for unit in join_units if not unit.is_na]
     if not len(dtypes):
-        dtypes = [unit.dtype for unit in join_units if unit.block.dtype.kind != "V"]
+        dtypes = [
+            unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
+        ]
 
     dtype = find_common_type(dtypes)
     if has_none_blocks:
@@ -712,7 +700,7 @@ def _is_uniform_reindex(join_units) -> bool:
     return (
         # TODO: should this be ju.block._can_hold_na?
         all(ju.block.is_extension for ju in join_units)
-        and len({ju.block.dtype.name for ju in join_units}) == 1
+        and lib.dtypes_all_equal([ju.block.dtype for ju in join_units])
     )
 
 
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -1004,7 +1004,7 @@ def _maybe_add_join_keys(
                     result_dtype = find_common_type([lvals.dtype, rvals.dtype])
 
                 if result._is_label_reference(name):
-                    result[name] = Series(
+                    result[name] = result._constructor_sliced(
                         key_col, dtype=result_dtype, index=result.index
                     )
                 elif result._is_level_reference(name):
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
@@ -263,7 +263,7 @@ def _add_margins(
     rows,
     cols,
     aggfunc,
-    observed=None,
+    observed: bool,
     margins_name: Hashable = "All",
     fill_value=None,
 ):
@@ -292,7 +292,7 @@ def _add_margins(
     if not values and isinstance(table, ABCSeries):
         # If there are no values and the table is a series, then there is only
         # one column in the data. Compute grand margin and return it.
-        return table._append(Series({key: grand_margin[margins_name]}))
+        return table._append(table._constructor({key: grand_margin[margins_name]}))
 
     elif values:
         marginal_result_set = _generate_marginal_results(
@@ -364,8 +364,16 @@ def _compute_grand_margin(
 
 
 def _generate_marginal_results(
-    table, data, values, rows, cols, aggfunc, observed, margins_name: Hashable = "All"
+    table,
+    data: DataFrame,
+    values,
+    rows,
+    cols,
+    aggfunc,
+    observed: bool,
+    margins_name: Hashable = "All",
 ):
+    margin_keys: list | Index
     if len(cols) > 0:
         # need to "interleave" the margins
         table_pieces = []
@@ -433,23 +441,24 @@ def _all_key(key):
         new_order = [len(cols)] + list(range(len(cols)))
         row_margin.index = row_margin.index.reorder_levels(new_order)
     else:
-        row_margin = Series(np.nan, index=result.columns)
+        row_margin = data._constructor_sliced(np.nan, index=result.columns)
 
     return result, margin_keys, row_margin
 
 
 def _generate_marginal_results_without_values(
     table: DataFrame,
-    data,
+    data: DataFrame,
     rows,
     cols,
     aggfunc,
-    observed,
+    observed: bool,
     margins_name: Hashable = "All",
 ):
+    margin_keys: list | Index
     if len(cols) > 0:
         # need to "interleave" the margins
-        margin_keys: list | Index = []
+        margin_keys = []
 
         def _all_key():
             if len(cols) == 1:
@@ -535,7 +544,9 @@ def pivot(
                     data.index.get_level_values(i) for i in range(data.index.nlevels)
                 ]
             else:
-                index_list = [Series(data.index, name=data.index.name)]
+                index_list = [
+                    data._constructor_sliced(data.index, name=data.index.name)
+                ]
         else:
             index_list = [data[idx] for idx in com.convert_to_list_like(index)]
 
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -46,7 +46,10 @@
 )
 
 if TYPE_CHECKING:
-    from pandas._typing import npt
+    from pandas._typing import (
+        Level,
+        npt,
+    )
 
     from pandas.core.arrays import ExtensionArray
     from pandas.core.indexes.frozen import FrozenList
@@ -98,9 +101,7 @@ class _Unstacker:
     unstacked : DataFrame
     """
 
-    def __init__(self, index: MultiIndex, level=-1, constructor=None) -> None:
-        if constructor is None:
-            constructor = DataFrame
+    def __init__(self, index: MultiIndex, level: Level, constructor) -> None:
         self.constructor = constructor
 
         self.index = index.remove_unused_levels()
@@ -374,13 +375,14 @@ def new_index(self) -> MultiIndex:
         )
 
 
-def _unstack_multiple(data, clocs, fill_value=None):
+def _unstack_multiple(data: Series | DataFrame, clocs, fill_value=None):
     if len(clocs) == 0:
         return data
 
     # NOTE: This doesn't deal with hierarchical columns yet
 
     index = data.index
+    index = cast(MultiIndex, index)  # caller is responsible for checking
 
     # GH 19966 Make sure if MultiIndexed index has tuple name, they will be
     # recognised as a whole
@@ -433,10 +435,10 @@ def _unstack_multiple(data, clocs, fill_value=None):
             return result
 
         # GH#42579 deep=False to avoid consolidating
-        dummy = data.copy(deep=False)
-        dummy.index = dummy_index
+        dummy_df = data.copy(deep=False)
+        dummy_df.index = dummy_index
 
-        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
+        unstacked = dummy_df.unstack("__placeholder__", fill_value=fill_value)
         if isinstance(unstacked, Series):
             unstcols = unstacked.index
         else:
@@ -497,7 +499,7 @@ def unstack(obj: Series | DataFrame, level, fill_value=None):
         )
 
 
-def _unstack_frame(obj: DataFrame, level, fill_value=None):
+def _unstack_frame(obj: DataFrame, level, fill_value=None) -> DataFrame:
     assert isinstance(obj.index, MultiIndex)  # checked by caller
     unstacker = _Unstacker(obj.index, level=level, constructor=obj._constructor)
 
@@ -617,7 +619,7 @@ def factorize(index):
     return frame._constructor_sliced(new_values, index=new_index)
 
 
-def stack_multiple(frame, level, dropna: bool = True):
+def stack_multiple(frame: DataFrame, level, dropna: bool = True):
     # If all passed levels match up to column names, no
     # ambiguity about what to do
     if all(lev in frame.columns.names for lev in level):
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -32,7 +32,10 @@
     is_scalar,
     is_timedelta64_dtype,
 )
-from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.dtypes import (
+    DatetimeTZDtype,
+    ExtensionDtype,
+)
 from pandas.core.dtypes.generic import ABCSeries
 from pandas.core.dtypes.missing import isna
 
@@ -47,7 +50,10 @@
 import pandas.core.algorithms as algos
 
 if TYPE_CHECKING:
-    from pandas._typing import IntervalLeftRight
+    from pandas._typing import (
+        DtypeObj,
+        IntervalLeftRight,
+    )
 
 
 def cut(
@@ -399,7 +405,7 @@ def _bins_to_cuts(
     labels=None,
     precision: int = 3,
     include_lowest: bool = False,
-    dtype=None,
+    dtype: DtypeObj | None = None,
     duplicates: str = "raise",
     ordered: bool = True,
 ):
@@ -481,7 +487,7 @@ def _coerce_to_type(x):
     this method converts it to numeric so that cut or qcut method can
     handle it
     """
-    dtype = None
+    dtype: DtypeObj | None = None
 
     if is_datetime64tz_dtype(x.dtype):
         dtype = x.dtype
@@ -508,7 +514,7 @@ def _coerce_to_type(x):
     return x, dtype
 
 
-def _convert_bin_to_numeric_type(bins, dtype):
+def _convert_bin_to_numeric_type(bins, dtype: DtypeObj | None):
     """
     if the passed bin is of datetime/timedelta type,
     this method converts it to integer
@@ -542,7 +548,7 @@ def _convert_bin_to_numeric_type(bins, dtype):
     return bins
 
 
-def _convert_bin_to_datelike_type(bins, dtype):
+def _convert_bin_to_datelike_type(bins, dtype: DtypeObj | None):
     """
     Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is
     datelike
@@ -557,22 +563,26 @@ def _convert_bin_to_datelike_type(bins, dtype):
     bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
            datelike
     """
-    if is_datetime64tz_dtype(dtype):
+    if isinstance(dtype, DatetimeTZDtype):
         bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz)
     elif is_datetime_or_timedelta_dtype(dtype):
         bins = Index(bins.astype(np.int64), dtype=dtype)
     return bins
 
 
 def _format_labels(
-    bins, precision: int, right: bool = True, include_lowest: bool = False, dtype=None
+    bins,
+    precision: int,
+    right: bool = True,
+    include_lowest: bool = False,
+    dtype: DtypeObj | None = None,
 ):
     """based on the dtype, return our labels"""
     closed: IntervalLeftRight = "right" if right else "left"
 
     formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]
 
-    if is_datetime64tz_dtype(dtype):
+    if isinstance(dtype, DatetimeTZDtype):
         formatter = lambda x: Timestamp(x, tz=dtype.tz)
         adjust = lambda x: x - Timedelta("1ns")
     elif is_datetime64_dtype(dtype):
diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py
diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1004,7 +1004,7 @@ def _maybe_add_join_keys(` |
| `1004` | `1004` | `result_dtype = find_common_type([lvals.dtype, rvals.dtype])` |
| `1005` | `1005` | |
| `1006` | `1006` | `if result._is_label_reference(name):` |
| `1007` | | `- result[name] = Series(` |
| | `1007` | `+ result[name] = result._constructor_sliced(` |
| `1008` | `1008` | `key_col, dtype=result_dtype, index=result.index` |
| `1009` | `1009` | `)` |
| `1010` | `1010` | `elif result._is_level_reference(name):` |