From edc54125e3156ead76453d51e64b8bd47ed4957d Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 29 Sep 2020 20:46:09 +0700 Subject: [PATCH 1/7] REF: extract func _select_upcast_cls_from_dtype --- pandas/core/internals/concat.py | 47 +++++++++++++++++---------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index f5d0c921e1006..426d0fdb06876 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -380,29 +380,7 @@ def _get_empty_dtype_and_na(join_units): if dtype is None: continue - if is_categorical_dtype(dtype): - upcast_cls = "category" - elif is_datetime64tz_dtype(dtype): - upcast_cls = "datetimetz" - - elif is_extension_array_dtype(dtype): - upcast_cls = "extension" - - elif issubclass(dtype.type, np.bool_): - upcast_cls = "bool" - elif issubclass(dtype.type, np.object_): - upcast_cls = "object" - elif is_datetime64_dtype(dtype): - upcast_cls = "datetime" - elif is_timedelta64_dtype(dtype): - upcast_cls = "timedelta" - elif is_sparse(dtype): - upcast_cls = dtype.subtype.name - elif is_float_dtype(dtype) or is_numeric_dtype(dtype): - upcast_cls = dtype.name - else: - upcast_cls = "float" - + upcast_cls = _select_upcast_cls_from_dtype(dtype) # Null blocks should not influence upcast class selection, unless there # are only null blocks, when same upcasting rules must be applied to # null upcast classes. @@ -458,6 +436,29 @@ def _get_empty_dtype_and_na(join_units): raise AssertionError(msg) +def _select_upcast_cls_from_dtype(dtype): + if is_categorical_dtype(dtype): + return "category" + elif is_datetime64tz_dtype(dtype): + return "datetimetz" + elif is_extension_array_dtype(dtype): + return "extension" + elif issubclass(dtype.type, np.bool_): + return "bool" + elif issubclass(dtype.type, np.object_): + return "object" + elif is_datetime64_dtype(dtype): + return "datetime" + elif is_timedelta64_dtype(dtype): + return "timedelta" + elif is_sparse(dtype): + return dtype.subtype.name + elif is_float_dtype(dtype) or is_numeric_dtype(dtype): + return dtype.name + else: + return "float" + + def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool: """ Check if the join units consist of blocks of uniform type that can From 2b57f0f6bcaf3d01c5b6993ee9a28e7907d87121 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 29 Sep 2020 20:51:08 +0700 Subject: [PATCH 2/7] REF: extract function _get_upcast_classes --- pandas/core/internals/concat.py | 39 +++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 426d0fdb06876..eea5b862d17eb 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -374,23 +374,8 @@ def _get_empty_dtype_and_na(join_units): else: dtypes[i] = unit.dtype - upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) - null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) - for dtype, unit in zip(dtypes, join_units): - if dtype is None: - continue - - upcast_cls = _select_upcast_cls_from_dtype(dtype) - # Null blocks should not influence upcast class selection, unless there - # are only null blocks, when same upcasting rules must be applied to - # null upcast classes. - if unit.is_na: - null_upcast_classes[upcast_cls].append(dtype) - else: - upcast_classes[upcast_cls].append(dtype) + upcast_classes = _get_upcast_classes(join_units, dtypes) - if not upcast_classes: - upcast_classes = null_upcast_classes # TODO: de-duplicate with maybe_promote? # create the result if "extension" in upcast_classes: @@ -436,6 +421,28 @@ def _get_empty_dtype_and_na(join_units): raise AssertionError(msg) +def _get_upcast_classes(join_units, dtypes): + upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) + null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) + for dtype, unit in zip(dtypes, join_units): + if dtype is None: + continue + + upcast_cls = _select_upcast_cls_from_dtype(dtype) + # Null blocks should not influence upcast class selection, unless there + # are only null blocks, when same upcasting rules must be applied to + # null upcast classes. + if unit.is_na: + null_upcast_classes[upcast_cls].append(dtype) + else: + upcast_classes[upcast_cls].append(dtype) + + if not upcast_classes: + upcast_classes = null_upcast_classes + + return upcast_classes + + def _select_upcast_cls_from_dtype(dtype): if is_categorical_dtype(dtype): return "category" From ee4e74507f6d5e51cda3994c6fae42b8fd654e22 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 29 Sep 2020 21:22:32 +0700 Subject: [PATCH 3/7] CLN: rename g -> common_dtype --- pandas/core/internals/concat.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index eea5b862d17eb..88cbd4582b909 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -404,18 +404,18 @@ def _get_empty_dtype_and_na(join_units): return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns") else: # pragma try: - g = np.find_common_type(upcast_classes, []) + common_dtype = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_), np.nan else: - if is_float_dtype(g): - return g, g.type(np.nan) - elif is_numeric_dtype(g): + if is_float_dtype(common_dtype): + return common_dtype, common_dtype.type(np.nan) + elif is_numeric_dtype(common_dtype): if has_none_blocks: return np.dtype(np.float64), np.nan else: - return g, None + return common_dtype, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg) From 658b88bf2a0b1a80f60739fd3edbf888ca415594 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 29 Sep 2020 22:44:47 +0700 Subject: [PATCH 4/7] TYP: type extracted functions --- pandas/core/internals/concat.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 88cbd4582b909..f2b67be0a8dc8 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,6 +1,6 @@ from collections import defaultdict import copy -from typing import Dict, List +from typing import Any, Dict, List, Sequence, Tuple import numpy as np @@ -344,7 +344,7 @@ def _concatenate_join_units(join_units, concat_axis, copy): return concat_values -def _get_empty_dtype_and_na(join_units): +def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, Any]: """ Return dtype and N/A values to use when concatenating specified units. @@ -421,7 +421,10 @@ def _get_empty_dtype_and_na(join_units): raise AssertionError(msg) -def _get_upcast_classes(join_units, dtypes): +def _get_upcast_classes( + join_units: Sequence[JoinUnit], + dtypes: Sequence[DtypeObj], +) -> Dict[str, List[DtypeObj]]: upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) for dtype, unit in zip(dtypes, join_units): @@ -443,7 +446,7 @@ def _get_upcast_classes(join_units, dtypes): return upcast_classes -def _select_upcast_cls_from_dtype(dtype): +def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str: if is_categorical_dtype(dtype): return "category" elif is_datetime64tz_dtype(dtype): @@ -459,7 +462,9 @@ def _select_upcast_cls_from_dtype(dtype): elif is_timedelta64_dtype(dtype): return "timedelta" elif is_sparse(dtype): - return dtype.subtype.name + # parent class ExtensionDtypes does not have attribute 'subtype', + # only some subclasses do (like SparseDtype), thus mypy error. + return dtype.subtype.name # type: ignore [union-attr] elif is_float_dtype(dtype) or is_numeric_dtype(dtype): return dtype.name else: From 2ed75047b08a2d60f4fbafbf8953e9437dd2d577 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 30 Sep 2020 01:07:11 +0700 Subject: [PATCH 5/7] DOC: add docstrings to extracted methods --- pandas/core/internals/concat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index f2b67be0a8dc8..64f84344a333e 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -425,6 +425,7 @@ def _get_upcast_classes( join_units: Sequence[JoinUnit], dtypes: Sequence[DtypeObj], ) -> Dict[str, List[DtypeObj]]: + """Create mapping between upcast class names and lists of dtypes.""" upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) for dtype, unit in zip(dtypes, join_units): @@ -447,6 +448,7 @@ def _get_upcast_classes( def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str: + """Select upcast class name based on dtype.""" if is_categorical_dtype(dtype): return "category" elif is_datetime64tz_dtype(dtype): From 15a7cc9f4daeb18e0b7fd5089e03f16020d3e754 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 30 Sep 2020 01:12:38 +0700 Subject: [PATCH 6/7] TYP: cast instead of ignoring mypy error --- pandas/core/internals/concat.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 64f84344a333e..7ad61fee029eb 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,6 +1,6 @@ from collections import defaultdict import copy -from typing import Any, Dict, List, Sequence, Tuple +from typing import Any, Dict, List, Sequence, Tuple, cast import numpy as np @@ -25,6 +25,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray, ExtensionArray +from pandas.core.arrays.sparse.dtype import SparseDtype from pandas.core.internals.blocks import make_block from pandas.core.internals.managers import BlockManager @@ -464,9 +465,8 @@ def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str: elif is_timedelta64_dtype(dtype): return "timedelta" elif is_sparse(dtype): - # parent class ExtensionDtypes does not have attribute 'subtype', - # only some subclasses do (like SparseDtype), thus mypy error. - return dtype.subtype.name # type: ignore [union-attr] + dtype = cast(SparseDtype, dtype) + return dtype.subtype.name elif is_float_dtype(dtype) or is_numeric_dtype(dtype): return dtype.name else: From f9afdaf4ca9e3423aca37a5836f423191cf3af1a Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 3 Oct 2020 22:12:01 +0700 Subject: [PATCH 7/7] CLN: import SparseDtype only for type checking --- pandas/core/internals/concat.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 7ad61fee029eb..7ad058cfeb83c 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,6 +1,6 @@ from collections import defaultdict import copy -from typing import Any, Dict, List, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, cast import numpy as np @@ -25,10 +25,12 @@ import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray, ExtensionArray -from pandas.core.arrays.sparse.dtype import SparseDtype from pandas.core.internals.blocks import make_block from pandas.core.internals.managers import BlockManager +if TYPE_CHECKING: + from pandas.core.arrays.sparse.dtype import SparseDtype + def concatenate_block_managers( mgrs_indexers, axes, concat_axis: int, copy: bool @@ -465,7 +467,7 @@ def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str: elif is_timedelta64_dtype(dtype): return "timedelta" elif is_sparse(dtype): - dtype = cast(SparseDtype, dtype) + dtype = cast("SparseDtype", dtype) return dtype.subtype.name elif is_float_dtype(dtype) or is_numeric_dtype(dtype): return dtype.name