Skip to content

Commit adb8202

Browse files
committed
WIP
1 parent 1eae283 commit adb8202

File tree

1 file changed

+79
-17
lines changed

1 file changed

+79
-17
lines changed

flox/core.py

Lines changed: 79 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def _get_expected_groups(by, raise_if_dask=True) -> pd.Index | None:
6666
return None
6767
flatby = by.ravel()
6868
expected = np.unique(flatby[~isnull(flatby)])
69-
return _convert_expected_groups_to_index(expected, isbin=False)
69+
return _convert_expected_groups_to_index((expected,), isbin=(False,))[0]
7070

7171

7272
def _get_chunk_reduction(reduction_type: str) -> Callable:
@@ -385,7 +385,7 @@ def offset_labels(labels: np.ndarray, ngroups: int) -> tuple[np.ndarray, int]:
385385
return offset, size
386386

387387

388-
def factorize_(by: tuple, axis, expected_groups: tuple[pd.Index, ...] = None, reindex=False):
388+
def factorize_(by: tuple, axis, expected_groups: tuple[pd.Index, ...] = None, reindex=False, fastpath=False):
389389
if not isinstance(by, tuple):
390390
raise ValueError(f"Expected `by` to be a tuple. Received {type(by)} instead")
391391

@@ -420,10 +420,13 @@ def factorize_(by: tuple, axis, expected_groups: tuple[pd.Index, ...] = None, re
420420
grp_shape = tuple(len(grp) for grp in found_groups)
421421
ngroups = np.prod(grp_shape)
422422
if len(by) > 1:
423-
group_idx = np.ravel_multi_index(factorized, grp_shape).reshape(by[0].shape)
423+
group_idx = np.ravel_multi_index(factorized, grp_shape)
424424
else:
425425
group_idx = factorized[0]
426426

427+
if fastpath:
428+
return group_idx, found_groups, grp_shape
429+
427430
if np.isscalar(axis) and groupvar.ndim > 1:
428431
# Not reducing along all dimensions of by
429432
# this is OK because for 3D by and axis=(1,2),
@@ -1243,23 +1246,60 @@ def _assert_by_is_aligned(shape, by):
12431246
)
12441247

12451248

1246-
def _convert_expected_groups_to_index(expected_groups, isbin: bool) -> pd.Index | None:
1247-
if isinstance(expected_groups, pd.IntervalIndex) or (
1248-
isinstance(expected_groups, pd.Index) and not isbin
1249-
):
1250-
return expected_groups
1251-
if isbin:
1252-
return pd.IntervalIndex.from_arrays(expected_groups[:-1], expected_groups[1:])
1253-
elif expected_groups is not None:
1254-
return pd.Index(expected_groups)
1255-
return expected_groups
1249+
def _convert_expected_groups_to_index(expected_groups: tuple, isbin: bool) -> pd.Index | None:
1250+
out = []
1251+
for ex, isbin_ in zip(expected_groups, isbin):
1252+
if isinstance(ex, pd.IntervalIndex) or (isinstance(ex, pd.Index) and not isbin):
1253+
out.append(expected_groups)
1254+
elif ex is not None:
1255+
if isbin_:
1256+
out.append(pd.IntervalIndex.from_arrays(ex[:-1], ex[1:]))
1257+
else:
1258+
out.append(pd.Index(ex))
1259+
else:
1260+
assert ex is None
1261+
out.append(None)
1262+
return tuple(out)
1263+
1264+
1265+
def _lazy_factorize_wrapper(*by, **kwargs):
    """Block-wise wrapper around ``factorize_`` for ``dask.array.map_blocks``.

    ``map_blocks`` passes each grouping variable as a separate positional
    block; re-pack them into the tuple ``factorize_`` expects and return
    only the integer group codes for this block.
    """
    # BUGFIX: with fastpath=True (always set by _factorize_multiple),
    # factorize_ returns three values (group_idx, found_groups, grp_shape);
    # unpacking into two raised ValueError. Keep only the codes.
    group_idx, *_ = factorize_(by, **kwargs)
    return group_idx
1268+
1269+
1270+
def _factorize_multiple(by, expected_groups, by_is_dask):
    """Factorize multiple grouping variables down to one group-index array.

    Parameters
    ----------
    by : tuple of array-like
        Grouping variables (numpy or dask arrays).
    expected_groups : tuple
        One ``pd.Index`` (or ``None``) per grouping variable.
    by_is_dask : bool
        Whether any element of ``by`` is a dask array.

    Returns
    -------
    (group_idx,), final_groups, grp_shape
        Single-element tuple of flat group codes, the per-variable group
        labels, and the shape of the implied group grid.

    Raises
    ------
    ValueError
        If group labels cannot be determined for some variable (dask input
        without ``expected_groups``).
    """
    kwargs = dict(
        expected_groups=expected_groups,
        axis=None,  # always None, we offset later if necessary.
        fastpath=True,
    )
    if by_is_dask:
        import dask.array

        # Factorize lazily, one block at a time.
        group_idx = dask.array.map_blocks(
            _lazy_factorize_wrapper,
            *np.broadcast_arrays(*by),
            meta=np.array((), dtype=np.int64),
            **kwargs,
        )
        # Labels of a dask variable are unknowable without computing it.
        found_groups = tuple(None if is_duck_dask_array(b) else np.unique(b) for b in by)
    else:
        group_idx, found_groups, grp_shape = factorize_(by, **kwargs)

    # Prefer user-provided expected_groups; fall back to what we found.
    # BUGFIX: guard ``found is None`` so we never call pd.Index(None).
    final_groups = tuple(
        expect if expect is not None else (None if found is None else pd.Index(found))
        for found, expect in zip(found_groups, expected_groups)
    )

    # BUGFIX: a bare ``raise`` outside an except clause is a RuntimeError;
    # raise an informative error instead.
    if any(grp is None for grp in final_groups):
        raise ValueError("Please provide expected_groups when grouping by a dask array.")

    # BUGFIX: grp_shape was never assigned on the dask branch (NameError at
    # return); derive it from the now fully-known group labels.
    if by_is_dask:
        grp_shape = tuple(len(grp) for grp in final_groups)
    return (group_idx,), final_groups, grp_shape
12561297

12571298

12581299
def groupby_reduce(
12591300
array: np.ndarray | DaskArray,
1260-
by: np.ndarray | DaskArray,
1301+
*by: np.ndarray | DaskArray,
12611302
func: str | Aggregation,
1262-
*,
12631303
expected_groups: Sequence | np.ndarray | None = None,
12641304
sort: bool = True,
12651305
isbin: bool = False,
@@ -1373,6 +1413,7 @@ def groupby_reduce(
13731413
reindex = _validate_reindex(reindex, func, method, expected_groups)
13741414

13751415
by: tuple = tuple(np.asarray(b) if not is_duck_array(b) else b for b in by)
1416+
nby = len(by)
13761417
by_is_dask = any(is_duck_dask_array(b) for b in by)
13771418
if not is_duck_array(array):
13781419
array = np.asarray(array)
@@ -1392,6 +1433,20 @@ def groupby_reduce(
13921433
# (pd.IntervalIndex or not)
13931434
expected_groups = _convert_expected_groups_to_index(expected_groups, isbin)
13941435

1436+
# when grouping by multiple variables, we factorize early.
1437+
# TODO: could restrict this to dask-only
1438+
if nby > 1:
1439+
by, final_groups, grp_shape = _factorize_multiple(
1440+
by, expected_groups, by_is_dask=by_is_dask
1441+
)
1442+
expected_groups = (pd.RangeIndex(np.prod(grp_shape)),)
1443+
else:
1444+
final_groups = expected_groups
1445+
1446+
assert len(by) == 1
1447+
by = by[0]
1448+
expected_groups = expected_groups[0]
1449+
13951450
if axis is None:
13961451
axis = tuple(array.ndim + np.arange(-by.ndim, 0))
13971452
else:
@@ -1404,7 +1459,7 @@ def groupby_reduce(
14041459

14051460
# TODO: make sure expected_groups is unique
14061461
# TODO: hack
1407-
if len(axis) == 1 and by_ndim > 1 and expected_groups[0] is None:
1462+
if len(axis) == 1 and by.ndim > 1 and expected_groups is None:
14081463
# TODO: hack
14091464
if not by_is_dask:
14101465
expected_groups = _get_expected_groups(by)
@@ -1509,7 +1564,11 @@ def groupby_reduce(
15091564
array = rechunk_for_blockwise(array, axis=-1, labels=by)
15101565

15111566
result, *groups = partial_agg(
1512-
array, by, expected_groups=expected_groups, reindex=reindex, method=method
1567+
array,
1568+
by,
1569+
expected_groups=None if method == "blockwise" else expected_groups,
1570+
reindex=reindex,
1571+
method=method,
15131572
)
15141573

15151574
if sort and method != "map-reduce":
@@ -1518,4 +1577,7 @@ def groupby_reduce(
15181577
result = result[..., sorted_idx]
15191578
groups = (groups[0][sorted_idx],)
15201579

1580+
if nby > 1:
1581+
groups = final_groups
1582+
result = result.reshape(result.shape[:-1] + grp_shape)
15211583
return (result, *groups)

0 commit comments

Comments
 (0)