Skip to content

BUG: groupby sum, mean, var should always be floats #41139

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
May 21, 2021
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,36 @@ Preserve dtypes in :meth:`~pandas.DataFrame.combine_first`

combined.dtypes

Group by methods agg and transform no longer change return dtype for callables
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously the methods :meth:`.DataFrameGroupBy.aggregate`,
:meth:`.SeriesGroupBy.aggregate`, :meth:`.DataFrameGroupBy.transform`, and
:meth:`.SeriesGroupBy.transform` might cast the result dtype when the argument ``func``
is callable, possibly leading to undesirable results (:issue:`21240`). The cast would
occur if the result is numeric and casting back to the input dtype does not change any
values as measured by ``np.allclose``. Now no such casting occurs.

.. ipython:: python

df = pd.DataFrame({'key': [1, 1], 'a': [True, False], 'b': [True, True]})
df

*pandas 1.2.x*

.. code-block:: ipython

In [5]: df.groupby('key').agg(lambda x: x.sum())
Out[5]:
a b
key
1 True 2

*pandas 1.3.0*

.. ipython:: python

df.groupby('key').agg(lambda x: x.sum())

Try operating inplace when setting values with ``loc`` and ``iloc``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
11 changes: 2 additions & 9 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,6 @@
doc,
)

from pandas.core.dtypes.cast import (
find_common_type,
maybe_downcast_numeric,
)
from pandas.core.dtypes.common import (
ensure_int64,
is_bool,
Expand Down Expand Up @@ -588,8 +584,9 @@ def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):

def _transform_general(self, func, *args, **kwargs):
"""
Transform with a non-str `func`.
Transform with a callable `func`.
"""
assert callable(func)
klass = type(self._selected_obj)

results = []
Expand All @@ -613,10 +610,6 @@ def _transform_general(self, func, *args, **kwargs):
# we will only try to coerce the result type if
# we have a numeric dtype, as these are *always* user-defined funcs
# the cython take a different path (and casting)
if is_numeric_dtype(result.dtype):
common_dtype = find_common_type([self._selected_obj.dtype, result.dtype])
if common_dtype is result.dtype:
result = maybe_downcast_numeric(result, self._selected_obj.dtype)

result.name = self._selected_obj.name
return result
Expand Down
13 changes: 5 additions & 8 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1241,9 +1241,6 @@ def _python_agg_general(self, func, *args, **kwargs):
assert result is not None
key = base.OutputKey(label=name, position=idx)

if is_numeric_dtype(obj.dtype):
result = maybe_downcast_numeric(result, obj.dtype)

if self.grouper._filter_empty_groups:
mask = counts.ravel() > 0

Expand Down Expand Up @@ -1525,12 +1522,12 @@ def mean(self, numeric_only: bool = True):
Groupby two columns and return the mean of the remaining column.

>>> df.groupby(['A', 'B']).mean()
C
C
A B
1 2.0 2
4.0 1
2 3.0 1
5.0 2
1 2.0 2.0
4.0 1.0
2 3.0 1.0
5.0 2.0

Groupby one column and return the mean of only a particular column in
the group.
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,8 @@ class Grouper:
>>> df.groupby(pd.Grouper(key="Animal")).mean()
Speed
Animal
Falcon 200
Parrot 10
Falcon 200.0
Parrot 10.0

Specify a resample operation on the column 'Publish date'

Expand Down
11 changes: 7 additions & 4 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,10 +290,13 @@ def get_result_dtype(self, dtype: DtypeObj) -> DtypeObj:
return np.dtype(np.int64)
elif isinstance(dtype, (BooleanDtype, _IntegerDtype)):
return Int64Dtype()
elif how in ["mean", "median", "var"] and isinstance(
dtype, (BooleanDtype, _IntegerDtype)
):
return Float64Dtype()
elif how in ["mean", "median", "var"]:
if isinstance(dtype, (BooleanDtype, _IntegerDtype)):
return Float64Dtype()
elif is_float_dtype(dtype):
return dtype
elif is_numeric_dtype(dtype):
return np.dtype(np.float64)
return dtype

def uses_mask(self) -> bool:
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/extension/base/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping):
_, index = pd.factorize(data_for_grouping, sort=True)

index = pd.Index(index, name="B")
expected = pd.Series([3, 1, 4], index=index, name="A")
expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A")
if as_index:
self.assert_series_equal(result, expected)
else:
Expand Down Expand Up @@ -54,7 +54,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping):
_, index = pd.factorize(data_for_grouping, sort=False)

index = pd.Index(index, name="B")
expected = pd.Series([1, 3, 4], index=index, name="A")
expected = pd.Series([1.0, 3.0, 4.0], index=index, name="A")
self.assert_series_equal(result, expected)

def test_groupby_extension_transform(self, data_for_grouping):
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/extension/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping):
_, index = pd.factorize(data_for_grouping, sort=True)

index = pd.Index(index, name="B")
expected = pd.Series([3, 1], index=index, name="A")
expected = pd.Series([3.0, 1.0], index=index, name="A")
if as_index:
self.assert_series_equal(result, expected)
else:
Expand Down Expand Up @@ -301,7 +301,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping):
_, index = pd.factorize(data_for_grouping, sort=False)

index = pd.Index(index, name="B")
expected = pd.Series([1, 3], index=index, name="A")
expected = pd.Series([1.0, 3.0], index=index, name="A")
self.assert_series_equal(result, expected)

def test_groupby_extension_transform(self, data_for_grouping):
Expand Down
66 changes: 61 additions & 5 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,11 +234,10 @@ def test_aggregate_item_by_item(df):
K = len(result.columns)

# GH5782
# odd comparisons can result here, so cast to make easy
exp = Series(np.array([foo] * K), index=list("BCD"), dtype=np.float64, name="foo")
exp = Series(np.array([foo] * K), index=list("BCD"), name="foo")
tm.assert_series_equal(result.xs("foo"), exp)

exp = Series(np.array([bar] * K), index=list("BCD"), dtype=np.float64, name="bar")
exp = Series(np.array([bar] * K), index=list("BCD"), name="bar")
tm.assert_almost_equal(result.xs("bar"), exp)

def aggfun(ser):
Expand Down Expand Up @@ -442,6 +441,57 @@ def test_bool_agg_dtype(op):
assert is_integer_dtype(result)


@pytest.mark.parametrize(
"keys, agg_index",
[
(["a"], Index([1], name="a")),
(["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])),
],
)
@pytest.mark.parametrize(
"input_dtype", ["bool", "int32", "int64", "float32", "float64"]
)
@pytest.mark.parametrize(
"result_dtype", ["bool", "int32", "int64", "float32", "float64"]
)
@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"])
def test_callable_result_dtype_frame(
keys, agg_index, input_dtype, result_dtype, method
):
# GH 21240
df = DataFrame({"a": [1], "b": [2], "c": [True]})
df["c"] = df["c"].astype(input_dtype)
op = getattr(df.groupby(keys)[["c"]], method)
result = op(lambda x: x.astype(result_dtype).iloc[0])
expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index
expected = DataFrame({"c": [df["c"].iloc[0]]}, index=expected_index).astype(
result_dtype
)
if method == "apply":
expected.columns.names = [0]
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"keys, agg_index",
[
(["a"], Index([1], name="a")),
(["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])),
],
)
@pytest.mark.parametrize("input", [True, 1, 1.0])
@pytest.mark.parametrize("dtype", [bool, int, float])
@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"])
def test_callable_result_dtype_series(keys, agg_index, input, dtype, method):
# GH 21240
df = DataFrame({"a": [1], "b": [2], "c": [input]})
op = getattr(df.groupby(keys)["c"], method)
result = op(lambda x: x.astype(dtype).iloc[0])
expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index
expected = Series([df["c"].iloc[0]], index=expected_index, name="c").astype(dtype)
tm.assert_series_equal(result, expected)


def test_order_aggregate_multiple_funcs():
# GH 25692
df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]})
Expand All @@ -462,7 +512,9 @@ def test_uint64_type_handling(dtype, how):
expected = df.groupby("y").agg({"x": how})
df.x = df.x.astype(dtype)
result = df.groupby("y").agg({"x": how})
result.x = result.x.astype(np.int64)
if how not in ("mean", "median"):
# mean and median always result in floats
result.x = result.x.astype(np.int64)
tm.assert_frame_equal(result, expected, check_exact=True)


Expand Down Expand Up @@ -849,7 +901,11 @@ def test_multiindex_custom_func(func):
data = [[1, 4, 2], [5, 7, 1]]
df = DataFrame(data, columns=MultiIndex.from_arrays([[1, 1, 2], [3, 4, 3]]))
result = df.groupby(np.array([0, 1])).agg(func)
expected_dict = {(1, 3): {0: 1, 1: 5}, (1, 4): {0: 4, 1: 7}, (2, 3): {0: 2, 1: 1}}
expected_dict = {
(1, 3): {0: 1.0, 1: 5.0},
(1, 4): {0: 4.0, 1: 7.0},
(2, 3): {0: 2.0, 1: 1.0},
}
expected = DataFrame(expected_dict)
tm.assert_frame_equal(result, expected)

Expand Down
14 changes: 8 additions & 6 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,6 @@ def test_apply(ordered):
result = grouped.apply(lambda x: np.mean(x))
tm.assert_frame_equal(result, expected)

# we coerce back to ints
expected = expected.astype("int")
result = grouped.mean()
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -371,7 +369,7 @@ def test_observed(observed, using_array_manager):
result = groups_double_key.agg("mean")
expected = DataFrame(
{
"val": [10, 30, 20, 40],
"val": [10.0, 30.0, 20.0, 40.0],
"cat": Categorical(
["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True
),
Expand Down Expand Up @@ -418,7 +416,9 @@ def test_observed_codes_remap(observed):
groups_double_key = df.groupby([values, "C2"], observed=observed)

idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"])
expected = DataFrame({"C1": [3, 3, 4, 5], "C3": [10, 100, 200, 34]}, index=idx)
expected = DataFrame(
{"C1": [3.0, 3.0, 4.0, 5.0], "C3": [10.0, 100.0, 200.0, 34.0]}, index=idx
)
if not observed:
expected = cartesian_product_for_groupers(
expected, [values.values, [1, 2, 3, 4]], ["cat", "C2"]
Expand Down Expand Up @@ -1505,7 +1505,9 @@ def test_read_only_category_no_sort():
df = DataFrame(
{"a": [1, 3, 5, 7], "b": Categorical([1, 1, 2, 2], categories=Index(cats))}
)
expected = DataFrame(data={"a": [2, 6]}, index=CategoricalIndex([1, 2], name="b"))
expected = DataFrame(
data={"a": [2.0, 6.0]}, index=CategoricalIndex([1, 2], name="b")
)
result = df.groupby("b", sort=False).mean()
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -1597,7 +1599,7 @@ def test_aggregate_categorical_with_isnan():
index = MultiIndex.from_arrays([[1, 1], [1, 2]], names=("A", "B"))
expected = DataFrame(
data={
"numerical_col": [1.0, 0.0],
"numerical_col": [1, 0],
"object_col": [0, 0],
"categorical_col": [0, 0],
},
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ def test_ops_general(op, targop):
df = DataFrame(np.random.randn(1000))
labels = np.random.randint(0, 50, size=1000).astype(float)

result = getattr(df.groupby(labels), op)().astype(float)
result = getattr(df.groupby(labels), op)()
expected = df.groupby(labels).agg(targop)
tm.assert_frame_equal(result, expected)

Expand Down
11 changes: 5 additions & 6 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,10 +299,9 @@ def f(x):
return float(len(x))

agged = grouped.agg(f)
expected = Series([4, 2], index=["bar", "foo"])

tm.assert_series_equal(agged, expected, check_dtype=False)
assert issubclass(agged.dtype.type, np.dtype(dtype).type)
expected = Series([4.0, 2.0], index=["bar", "foo"])
tm.assert_series_equal(agged, expected)


def test_indices_concatenation_order():
Expand Down Expand Up @@ -1237,7 +1236,7 @@ def test_groupby_keys_same_size_as_index():
)
df = DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index)
result = df.groupby([Grouper(level=0, freq=freq), "metric"]).mean()
expected = df.set_index([df.index, "metric"])
expected = df.set_index([df.index, "metric"]).astype(float)

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -1330,7 +1329,7 @@ def test_groupby_2d_malformed():
d["ones"] = [1, 1]
d["label"] = ["l1", "l2"]
tmp = d.groupby(["group"]).mean()
res_values = np.array([[0, 1], [0, 1]], dtype=np.int64)
res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
tm.assert_numpy_array_equal(tmp.values, res_values)

Expand Down Expand Up @@ -2023,7 +2022,7 @@ def test_groupby_crash_on_nunique(axis):

def test_groupby_list_level():
# GH 9790
expected = DataFrame(np.arange(0, 9).reshape(3, 3))
expected = DataFrame(np.arange(0, 9).reshape(3, 3), dtype=float)
result = expected.groupby(level=[0]).mean()
tm.assert_frame_equal(result, expected)

Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def test_transform_bug():
# transforming on a datetime column
df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})
result = df.groupby("A")["B"].transform(lambda x: x.rank(ascending=False))
expected = Series(np.arange(5, 0, step=-1), name="B")
expected = Series(np.arange(5, 0, step=-1), name="B", dtype="float64")
tm.assert_series_equal(result, expected)


Expand Down Expand Up @@ -493,7 +493,7 @@ def test_groupby_transform_with_int():
)
with np.errstate(all="ignore"):
result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
expected = DataFrame({"B": np.nan, "C": [-1, 0, 1, -1, 0, 1]})
expected = DataFrame({"B": np.nan, "C": [-1.0, 0.0, 1.0, -1.0, 0.0, 1.0]})
tm.assert_frame_equal(result, expected)

# int that needs float conversion
Expand All @@ -509,9 +509,9 @@ def test_groupby_transform_with_int():
expected = DataFrame({"B": np.nan, "C": concat([s1, s2])})
tm.assert_frame_equal(result, expected)

# int downcasting
# int doesn't get downcast
result = df.groupby("A").transform(lambda x: x * 2 / 2)
expected = DataFrame({"B": 1, "C": [2, 3, 4, 10, 5, -1]})
expected = DataFrame({"B": 1.0, "C": [2.0, 3.0, 4.0, 10.0, 5.0, -1.0]})
tm.assert_frame_equal(result, expected)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/formats/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ def test_to_csv_date_format(self):
df_sec["B"] = 0
df_sec["C"] = 1

expected_rows = ["A,B,C", "2013-01-01,0,1"]
expected_rows = ["A,B,C", "2013-01-01,0,1.0"]
expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)

df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"])
Expand Down
Loading