From 2f5c450e4fb581560e67441f27da01dbfe62723f Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sat, 1 Aug 2020 23:27:01 -0700 Subject: [PATCH 01/11] CLN: Simplify get_flattened_iterator --- pandas/core/groupby/ops.py | 2 +- pandas/core/sorting.py | 43 +++++++++++--------------------------- 2 files changed, 13 insertions(+), 32 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 3aaeef3b63760..c0f441e674208 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -153,7 +153,7 @@ def _get_group_keys(self): comp_ids, _, ngroups = self.group_info # provide "flattened" iterator for multi-group setting - return get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes) + return list(get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes)) def apply(self, f: F, data: FrameOrSeries, axis: int = 0): mutated = self.mutated diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index ee73aa42701b0..0c1b7339f325b 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,5 +1,5 @@ """ miscellaneous sorting / groupby utilities """ -from typing import Callable, Optional +from typing import Callable, Generator, Optional, List, Tuple import numpy as np @@ -440,36 +440,17 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): return result -class _KeyMapper: - """ - Map compressed group id -> key tuple. - """ - - def __init__(self, comp_ids, ngroups: int, levels, labels): - self.levels = levels - self.labels = labels - self.comp_ids = comp_ids.astype(np.int64) - - self.k = len(labels) - self.tables = [hashtable.Int64HashTable(ngroups) for _ in range(self.k)] - - self._populate_tables() - - def _populate_tables(self): - for labs, table in zip(self.labels, self.tables): - table.map(self.comp_ids, labs.astype(np.int64)) - - def get_key(self, comp_id): - return tuple( - level[table.get_item(comp_id)] - for table, level in zip(self.tables, self.levels) - ) - - -def get_flattened_iterator(comp_ids, ngroups, levels, labels): - # provide "flattened" iterator for multi-group setting - mapper = _KeyMapper(comp_ids, ngroups, levels, labels) - return [mapper.get_key(i) for i in range(ngroups)] +def get_flattened_iterator(comp_ids: np.ndarray, ngroups: int, levels, labels: List[np.ndarray]) -> Generator[Tuple]: + """Map compressed group id -> key tuple.""" + comp_ids = comp_ids.astype(np.int64, copy=False) + tables = [] + for labs, level in zip(labels, levels): + table = hashtable.Int64HashTable(ngroups) + table.map(comp_ids, labs.astype(np.int64, copy=False)) + tables.append(table) + for i in range(ngroups): + for table, level in zip(tables, levels): + yield tuple(level[table.get_item(i)]) def get_indexer_dict(label_list, keys): From 071378d6543e4e2ff58c1623c7763ee9940e6f46 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 2 Aug 2020 00:02:34 -0700 Subject: [PATCH 02/11] Yield correct value --- pandas/core/groupby/ops.py | 4 +++- pandas/core/sorting.py | 7 ++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c0f441e674208..2c946b5915df8 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -153,7 +153,9 @@ def _get_group_keys(self): comp_ids, _, ngroups = self.group_info # provide "flattened" iterator for multi-group setting - return list(get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes)) + return list( + get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes) + ) def apply(self, f: F, data: FrameOrSeries, axis: int = 0): mutated = self.mutated diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 0c1b7339f325b..f944e0cc7261d 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -440,7 +440,9 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): return result -def get_flattened_iterator(comp_ids: np.ndarray, ngroups: int, levels, labels: List[np.ndarray]) -> Generator[Tuple]: +def get_flattened_iterator( + comp_ids: np.ndarray, ngroups: int, levels, labels: List[np.ndarray] +) -> Generator[Tuple, None, None]: """Map compressed group id -> key tuple.""" comp_ids = comp_ids.astype(np.int64, copy=False) tables = [] @@ -449,8 +451,7 @@ def get_flattened_iterator(comp_ids: np.ndarray, ngroups: int, levels, labels: L table.map(comp_ids, labs.astype(np.int64, copy=False)) tables.append(table) for i in range(ngroups): - for table, level in zip(tables, levels): - yield tuple(level[table.get_item(i)]) + yield tuple(level[table.get_item(i)] for table, level in zip(tables, levels)) def get_indexer_dict(label_list, keys): From b3af159eb65b87c9c386b4007b347a1d62953fb9 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 2 Aug 2020 00:36:21 -0700 Subject: [PATCH 03/11] isort --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index f944e0cc7261d..4449ebb4400aa 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,5 +1,5 @@ """ miscellaneous sorting / groupby utilities """ -from typing import Callable, Generator, Optional, List, Tuple +from typing import Callable, Generator, List, Optional, Tuple import numpy as np From 1cb2feda8389515099ed0d5b23958b1ee92d5e8a Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 2 Aug 2020 14:34:15 -0700 Subject: [PATCH 04/11] Rename to get_flattened_list --- pandas/core/groupby/ops.py | 6 ++---- pandas/core/sorting.py | 12 +++++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 2c946b5915df8..c6b0732b04c09 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -50,7 +50,7 @@ from pandas.core.sorting import ( compress_group_index, decons_obs_group_ids, - get_flattened_iterator, + get_flattened_list, get_group_index, get_group_index_sorter, get_indexer_dict, @@ -153,9 +153,7 @@ def _get_group_keys(self): comp_ids, _, ngroups = self.group_info # provide "flattened" iterator for multi-group setting - return list( - get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes) - ) + return get_flattened_list(comp_ids, ngroups, self.levels, self.codes) def apply(self, f: F, data: FrameOrSeries, axis: int = 0): mutated = self.mutated diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 4449ebb4400aa..31627951143a2 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,5 +1,5 @@ """ miscellaneous sorting / groupby utilities """ -from typing import Callable, Generator, List, Optional, Tuple +from typing import Callable, List, Optional, Tuple import numpy as np @@ -440,9 +440,9 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): return result -def get_flattened_iterator( +def get_flattened_list( comp_ids: np.ndarray, ngroups: int, levels, labels: List[np.ndarray] -) -> Generator[Tuple, None, None]: +) -> List[Tuple]: """Map compressed group id -> key tuple.""" comp_ids = comp_ids.astype(np.int64, copy=False) tables = [] @@ -450,8 +450,10 @@ def get_flattened_iterator( table = hashtable.Int64HashTable(ngroups) table.map(comp_ids, labs.astype(np.int64, copy=False)) tables.append(table) - for i in range(ngroups): - yield tuple(level[table.get_item(i)] for table, level in zip(tables, levels)) + return [ + tuple(level[table.get_item(i)] for table, level in zip(tables, levels)) + for i in range(ngroups) + ] def get_indexer_dict(label_list, keys): From 52b938c1fb5a73135fe7417fc4181640cd8b8415 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 4 Aug 2020 21:01:49 -0700 Subject: [PATCH 05/11] store intermediate arrays for performance --- pandas/core/sorting.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 31627951143a2..da2f17339e4fa 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,4 +1,5 @@ """ miscellaneous sorting / groupby utilities """ +from collections import defaultdict from typing import Callable, List, Optional, Tuple import numpy as np @@ -445,15 +446,13 @@ def get_flattened_list( ) -> List[Tuple]: """Map compressed group id -> key tuple.""" comp_ids = comp_ids.astype(np.int64, copy=False) - tables = [] + arrays = defaultdict(list) for labs, level in zip(labels, levels): table = hashtable.Int64HashTable(ngroups) table.map(comp_ids, labs.astype(np.int64, copy=False)) - tables.append(table) - return [ - tuple(level[table.get_item(i)] for table, level in zip(tables, levels)) - for i in range(ngroups) - ] + for i in range(ngroups): + arrays[i].append(level[table.get_item(i)]) + return [tuple(array) for array in arrays.values()] def get_indexer_dict(label_list, keys): From f889efd41b83602b8639cd2b9dddca271c47b6a5 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 4 Aug 2020 23:37:25 -0700 Subject: [PATCH 06/11] typing --- pandas/core/sorting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index da2f17339e4fa..60382560bdc3c 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,6 +1,6 @@ """ miscellaneous sorting / groupby utilities """ from collections import defaultdict -from typing import Callable, List, Optional, Tuple +from typing import Callable, Dict, List, Optional, Tuple import numpy as np @@ -446,7 +446,7 @@ def get_flattened_list( ) -> List[Tuple]: """Map compressed group id -> key tuple.""" comp_ids = comp_ids.astype(np.int64, copy=False) - arrays = defaultdict(list) + arrays: Dict[int, List] = defaultdict(list) for labs, level in zip(labels, levels): table = hashtable.Int64HashTable(ngroups) table.map(comp_ids, labs.astype(np.int64, copy=False)) From 8e83f5ad4f92505b7bf10d7345547317e0b1ae56 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 5 Aug 2020 09:45:30 -0700 Subject: [PATCH 07/11] Add better typing --- pandas/core/sorting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 60382560bdc3c..c328c872d978a 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,6 +1,6 @@ """ miscellaneous sorting / groupby utilities """ from collections import defaultdict -from typing import Callable, Dict, List, Optional, Tuple +from typing import Callable, DefaultDict, List, Optional, Tuple import numpy as np @@ -446,7 +446,7 @@ def get_flattened_list( ) -> List[Tuple]: """Map compressed group id -> key tuple.""" comp_ids = comp_ids.astype(np.int64, copy=False) - arrays: Dict[int, List] = defaultdict(list) + arrays: DefaultDict[int, List[int]] = defaultdict(list) for labs, level in zip(labels, levels): table = hashtable.Int64HashTable(ngroups) table.map(comp_ids, labs.astype(np.int64, copy=False)) From 0ce1136532e5c9399584c34999263317a25487cc Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 5 Aug 2020 11:07:36 -0700 Subject: [PATCH 08/11] type levels --- pandas/core/sorting.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index c328c872d978a..a3dbd6e2edbfc 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,6 +1,6 @@ """ miscellaneous sorting / groupby utilities """ from collections import defaultdict -from typing import Callable, DefaultDict, List, Optional, Tuple +from typing import TYPE_CHECKING, Callable, DefaultDict, List, Optional, Tuple import numpy as np @@ -19,6 +19,9 @@ import pandas.core.algorithms as algorithms from pandas.core.construction import extract_array +if TYPE_CHECKING: + from pandas.core.indexes.base import Index # noqa:F401 + _INT64_MAX = np.iinfo(np.int64).max @@ -442,7 +445,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): def get_flattened_list( - comp_ids: np.ndarray, ngroups: int, levels, labels: List[np.ndarray] + comp_ids: np.ndarray, ngroups: int, levels: List["Index"], labels: List[np.ndarray] ) -> List[Tuple]: """Map compressed group id -> key tuple.""" comp_ids = comp_ids.astype(np.int64, copy=False) From 4584c6d68fb10e271726077c27111d957db74d71 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 5 Aug 2020 11:54:32 -0700 Subject: [PATCH 09/11] Change noqa code --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index a3dbd6e2edbfc..c3627f0482ebc 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -20,7 +20,7 @@ from pandas.core.construction import extract_array if TYPE_CHECKING: - from pandas.core.indexes.base import Index # noqa:F401 + from pandas.core.indexes.base import Index # noqa:F811 _INT64_MAX = np.iinfo(np.int64).max From 68273aba14f28155cfaa87f7d48773c9108c33dd Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 5 Aug 2020 14:39:20 -0700 Subject: [PATCH 10/11] Add another noqa --- pandas/core/sorting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index c3627f0482ebc..6d0ddb78b501b 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -20,7 +20,7 @@ from pandas.core.construction import extract_array if TYPE_CHECKING: - from pandas.core.indexes.base import Index # noqa:F811 + from pandas.core.indexes.base import Index # noqa:F401 _INT64_MAX = np.iinfo(np.int64).max @@ -413,7 +413,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): levels : Optional[List], if values is a MultiIndex, list of levels to apply the key to. """ - from pandas.core.indexes.api import Index + from pandas.core.indexes.api import Index # noqa:F811 if not key: return values From 7793b9d9fb069ae1277043643cc46bebc5a5431f Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 6 Aug 2020 09:21:49 -0700 Subject: [PATCH 11/11] use iterable instead of list --- pandas/core/sorting.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 6d0ddb78b501b..8bdd466ae6f33 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,6 +1,6 @@ """ miscellaneous sorting / groupby utilities """ from collections import defaultdict -from typing import TYPE_CHECKING, Callable, DefaultDict, List, Optional, Tuple +from typing import TYPE_CHECKING, Callable, DefaultDict, Iterable, List, Optional, Tuple import numpy as np @@ -445,7 +445,10 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): def get_flattened_list( - comp_ids: np.ndarray, ngroups: int, levels: List["Index"], labels: List[np.ndarray] + comp_ids: np.ndarray, + ngroups: int, + levels: Iterable["Index"], + labels: Iterable[np.ndarray], ) -> List[Tuple]: """Map compressed group id -> key tuple.""" comp_ids = comp_ids.astype(np.int64, copy=False)