pandas-dev
diff --git a/‎doc/source/whatsnew/v1.3.0.rst
+31 b/‎doc/source/whatsnew/v1.3.0.rst
+31
diff --git a/‎pandas/_libs/lib.pyx
+10-12 b/‎pandas/_libs/lib.pyx
+10-12
diff --git a/‎pandas/core/dtypes/cast.py
+39 b/‎pandas/core/dtypes/cast.py
+39
diff --git a/‎pandas/core/groupby/generic.py
+2-9 b/‎pandas/core/groupby/generic.py
+2-9
diff --git a/‎pandas/core/groupby/groupby.py
-3 b/‎pandas/core/groupby/groupby.py
-3
diff --git a/‎pandas/core/groupby/ops.py
+7-4 b/‎pandas/core/groupby/ops.py
+7-4
diff --git a/‎pandas/tests/extension/base/groupby.py
+2-2 b/‎pandas/tests/extension/base/groupby.py
+2-2
diff --git a/‎pandas/tests/extension/test_boolean.py
+2-2 b/‎pandas/tests/extension/test_boolean.py
+2-2
diff --git a/‎pandas/tests/groupby/aggregate/test_aggregate.py
+61-5 b/‎pandas/tests/groupby/aggregate/test_aggregate.py
+61-5
diff --git a/‎pandas/tests/groupby/test_categorical.py
+8-6 b/‎pandas/tests/groupby/test_categorical.py
+8-6
diff --git a/‎pandas/tests/groupby/test_function.py
+3-2 b/‎pandas/tests/groupby/test_function.py
+3-2
@@ -298,6 +298,36 @@ Preserve dtypes in  :meth:`~pandas.DataFrame.combine_first`
 
    combined.dtypes
 
+Group by methods agg and transform no longer change return dtype for callables
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Previously the methods :meth:`.DataFrameGroupBy.aggregate`,
+:meth:`.SeriesGroupBy.aggregate`, :meth:`.DataFrameGroupBy.transform`, and
+:meth:`.SeriesGroupBy.transform` might cast the result dtype when the argument ``func``
+is callable, possibly leading to undesirable results (:issue:`21240`). The cast would
+occur if the result is numeric and casting back to the input dtype does not change any
+values as measured by ``np.allclose``. Now no such casting occurs.
+
+.. ipython:: python
+
+    df = pd.DataFrame({'key': [1, 1], 'a': [True, False], 'b': [True, True]})
+    df
+
+*pandas 1.2.x*
+
+.. code-block:: ipython
+
+    In [5]: df.groupby('key').agg(lambda x: x.sum())
+    Out[5]:
+            a  b
+    key
+    1    True  2
+
+*pandas 1.3.0*
+
+.. ipython:: python
+
+    df.groupby('key').agg(lambda x: x.sum())
 
 Try operating inplace when setting values with ``loc`` and ``iloc``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -847,6 +877,7 @@ Groupby/resample/rolling
 - Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` incorrectly rounding integer values near the ``int64`` implementations bounds (:issue:`40767`)
 - Bug in :meth:`.GroupBy.rank` with nullable dtypes incorrectly raising ``TypeError`` (:issue:`41010`)
 - Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` computing wrong result with nullable data types too large to roundtrip when casting to float (:issue:`37493`)
+- Bug in :meth:`.GroupBy.mean`, :meth:`.GroupBy.median`, and :meth:`.GroupBy.var` returning integer dtype when the result happened to be integer-valued; these methods now always return floats (:issue:`41137`)
 
 Reshaping
 ^^^^^^^^^
 
@@ -2233,7 +2233,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
         Array of converted object values to more specific dtypes if applicable.
     """
     cdef:
-        Py_ssize_t i, n, itemsize_max = 0
+        Py_ssize_t i, n, itemsize = 0
         ndarray[float64_t] floats
         ndarray[complex128_t] complexes
         ndarray[int64_t] ints
@@ -2266,10 +2266,12 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
 
     for i in range(n):
         val = objects[i]
-        if itemsize_max != -1:
-            itemsize = get_itemsize(val)
-            if itemsize > itemsize_max or itemsize == -1:
-                itemsize_max = itemsize
+        if (
+            hasattr(val, "dtype")
+            and hasattr(val.dtype, "itemsize")
+            and val.dtype.itemsize > itemsize
+        ):
+            itemsize = val.dtype.itemsize
 
         if val is None:
             seen.null_ = True
@@ -2458,13 +2460,9 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
                                 result = ints
                 elif seen.is_bool and not seen.nan_:
                     result = bools.view(np.bool_)
-
-        if result is uints or result is ints or result is floats or result is complexes:
-            # cast to the largest itemsize when all values are NumPy scalars
-            if itemsize_max > 0 and itemsize_max != result.dtype.itemsize:
-                result = result.astype(result.dtype.kind + str(itemsize_max))
-            return result
-        elif result is not None:
+        if result is not None:
+            if itemsize > 0 and itemsize != result.dtype.itemsize:
+                result = result.astype(result.dtype.kind + str(itemsize))
             return result
 
     return objects
 
@@ -406,6 +406,45 @@ def maybe_cast_pointwise_result(
     return result
 
 
+def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj:
+    """
+    Get the desired dtype of a result based on the
+    input dtype and how it was computed.
+
+    Parameters
+    ----------
+    dtype : DtypeObj
+        Input dtype.
+    how : str
+        How the result was computed.
+
+    Returns
+    -------
+    DtypeObj
+        The desired dtype of the result.
+    """
+    from pandas.core.arrays.boolean import BooleanDtype
+    from pandas.core.arrays.floating import Float64Dtype
+    from pandas.core.arrays.integer import (
+        Int64Dtype,
+        _IntegerDtype,
+    )
+
+    if how in ["add", "cumsum", "sum", "prod"]:
+        if dtype == np.dtype(bool):
+            return np.dtype(np.int64)  # NumPy bool is widened to int64
+        elif isinstance(dtype, (BooleanDtype, _IntegerDtype)):
+            return Int64Dtype()  # BooleanDtype / integer dtypes widen to Int64
+    elif how in ["mean", "median", "var"]:  # numeric inputs always give a float result
+        if isinstance(dtype, (BooleanDtype, _IntegerDtype)):
+            return Float64Dtype()
+        elif is_float_dtype(dtype):
+            return dtype
+        elif is_numeric_dtype(dtype):
+            return np.dtype(np.float64)
+    return dtype  # no special case matched: keep the input dtype
+
+
 def maybe_cast_to_extension_array(
     cls: type[ExtensionArray], obj: ArrayLike, dtype: ExtensionDtype | None = None
 ) -> ArrayLike:
 
@@ -44,10 +44,6 @@
     doc,
 )
 
-from pandas.core.dtypes.cast import (
-    find_common_type,
-    maybe_downcast_numeric,
-)
 from pandas.core.dtypes.common import (
     ensure_int64,
     is_bool,
@@ -588,8 +584,9 @@ def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
 
     def _transform_general(self, func, *args, **kwargs):
         """
-        Transform with a non-str `func`.
+        Transform with a callable `func`.
         """
+        assert callable(func)
         klass = type(self._selected_obj)
 
         results = []
@@ -613,10 +610,6 @@ def _transform_general(self, func, *args, **kwargs):
         # we will only try to coerce the result type if
         # we have a numeric dtype, as these are *always* user-defined funcs
         # the cython take a different path (and casting)
-        if is_numeric_dtype(result.dtype):
-            common_dtype = find_common_type([self._selected_obj.dtype, result.dtype])
-            if common_dtype is result.dtype:
-                result = maybe_downcast_numeric(result, self._selected_obj.dtype)
 
         result.name = self._selected_obj.name
         return result
 
@@ -1241,9 +1241,6 @@ def _python_agg_general(self, func, *args, **kwargs):
             assert result is not None
             key = base.OutputKey(label=name, position=idx)
 
-            if is_numeric_dtype(obj.dtype):
-                result = maybe_downcast_numeric(result, obj.dtype)
-
             if self.grouper._filter_empty_groups:
                 mask = counts.ravel() > 0
 
 
@@ -290,10 +290,13 @@ def get_result_dtype(self, dtype: DtypeObj) -> DtypeObj:
                 return np.dtype(np.int64)
             elif isinstance(dtype, (BooleanDtype, _IntegerDtype)):
                 return Int64Dtype()
-        elif how in ["mean", "median", "var"] and isinstance(
-            dtype, (BooleanDtype, _IntegerDtype)
-        ):
-            return Float64Dtype()
+        elif how in ["mean", "median", "var"]:
+            if isinstance(dtype, (BooleanDtype, _IntegerDtype)):
+                return Float64Dtype()
+            elif is_float_dtype(dtype):
+                return dtype
+            elif is_numeric_dtype(dtype):
+                return np.dtype(np.float64)
         return dtype
 
     def uses_mask(self) -> bool:
 
@@ -25,7 +25,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping):
         _, index = pd.factorize(data_for_grouping, sort=True)
 
         index = pd.Index(index, name="B")
-        expected = pd.Series([3, 1, 4], index=index, name="A")
+        expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A")
         if as_index:
             self.assert_series_equal(result, expected)
         else:
@@ -54,7 +54,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping):
         _, index = pd.factorize(data_for_grouping, sort=False)
 
         index = pd.Index(index, name="B")
-        expected = pd.Series([1, 3, 4], index=index, name="A")
+        expected = pd.Series([1.0, 3.0, 4.0], index=index, name="A")
         self.assert_series_equal(result, expected)
 
     def test_groupby_extension_transform(self, data_for_grouping):
 
@@ -272,7 +272,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping):
         _, index = pd.factorize(data_for_grouping, sort=True)
 
         index = pd.Index(index, name="B")
-        expected = pd.Series([3, 1], index=index, name="A")
+        expected = pd.Series([3.0, 1.0], index=index, name="A")
         if as_index:
             self.assert_series_equal(result, expected)
         else:
@@ -301,7 +301,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping):
         _, index = pd.factorize(data_for_grouping, sort=False)
 
         index = pd.Index(index, name="B")
-        expected = pd.Series([1, 3], index=index, name="A")
+        expected = pd.Series([1.0, 3.0], index=index, name="A")
         self.assert_series_equal(result, expected)
 
     def test_groupby_extension_transform(self, data_for_grouping):
 
@@ -234,11 +234,10 @@ def test_aggregate_item_by_item(df):
     K = len(result.columns)
 
     # GH5782
-    # odd comparisons can result here, so cast to make easy
-    exp = Series(np.array([foo] * K), index=list("BCD"), dtype=np.float64, name="foo")
+    exp = Series(np.array([foo] * K), index=list("BCD"), name="foo")
     tm.assert_series_equal(result.xs("foo"), exp)
 
-    exp = Series(np.array([bar] * K), index=list("BCD"), dtype=np.float64, name="bar")
+    exp = Series(np.array([bar] * K), index=list("BCD"), name="bar")
     tm.assert_almost_equal(result.xs("bar"), exp)
 
     def aggfun(ser):
@@ -442,6 +441,57 @@ def test_bool_agg_dtype(op):
     assert is_integer_dtype(result)
 
 
+@pytest.mark.parametrize(
+    "keys, agg_index",
+    [
+        (["a"], Index([1], name="a")),
+        (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])),
+    ],
+)
+@pytest.mark.parametrize(
+    "input_dtype", ["bool", "int32", "int64", "float32", "float64"]
+)
+@pytest.mark.parametrize(
+    "result_dtype", ["bool", "int32", "int64", "float32", "float64"]
+)
+@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"])
+def test_callable_result_dtype_frame(
+    keys, agg_index, input_dtype, result_dtype, method
+):
+    # GH 21240: the result must have the dtype the callable returns
+    df = DataFrame({"a": [1], "b": [2], "c": [True]})
+    df["c"] = df["c"].astype(input_dtype)
+    op = getattr(df.groupby(keys)[["c"]], method)
+    result = op(lambda x: x.astype(result_dtype).iloc[0])
+    expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index
+    expected = DataFrame({"c": [df["c"].iloc[0]]}, index=expected_index).astype(
+        result_dtype
+    )
+    if method == "apply":
+        expected.columns.names = [0]
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "keys, agg_index",
+    [
+        (["a"], Index([1], name="a")),
+        (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])),
+    ],
+)
+@pytest.mark.parametrize("input", [True, 1, 1.0])
+@pytest.mark.parametrize("dtype", [bool, int, float])
+@pytest.mark.parametrize("method", ["apply", "aggregate", "transform"])
+def test_callable_result_dtype_series(keys, agg_index, input, dtype, method):
+    # GH 21240: the result must have the dtype the callable returns
+    df = DataFrame({"a": [1], "b": [2], "c": [input]})
+    op = getattr(df.groupby(keys)["c"], method)
+    result = op(lambda x: x.astype(dtype).iloc[0])
+    expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index
+    expected = Series([df["c"].iloc[0]], index=expected_index, name="c").astype(dtype)
+    tm.assert_series_equal(result, expected)
+
+
 def test_order_aggregate_multiple_funcs():
     # GH 25692
     df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]})
@@ -462,7 +512,9 @@ def test_uint64_type_handling(dtype, how):
     expected = df.groupby("y").agg({"x": how})
     df.x = df.x.astype(dtype)
     result = df.groupby("y").agg({"x": how})
-    result.x = result.x.astype(np.int64)
+    if how not in ("mean", "median"):
+        # mean and median always result in floats
+        result.x = result.x.astype(np.int64)
     tm.assert_frame_equal(result, expected, check_exact=True)
 
 
@@ -849,7 +901,11 @@ def test_multiindex_custom_func(func):
     data = [[1, 4, 2], [5, 7, 1]]
     df = DataFrame(data, columns=MultiIndex.from_arrays([[1, 1, 2], [3, 4, 3]]))
     result = df.groupby(np.array([0, 1])).agg(func)
-    expected_dict = {(1, 3): {0: 1, 1: 5}, (1, 4): {0: 4, 1: 7}, (2, 3): {0: 2, 1: 1}}
+    expected_dict = {
+        (1, 3): {0: 1.0, 1: 5.0},
+        (1, 4): {0: 4.0, 1: 7.0},
+        (2, 3): {0: 2.0, 1: 1.0},
+    }
     expected = DataFrame(expected_dict)
     tm.assert_frame_equal(result, expected)
 
 
@@ -285,8 +285,6 @@ def test_apply(ordered):
     result = grouped.apply(lambda x: np.mean(x))
     tm.assert_frame_equal(result, expected)
 
-    # we coerce back to ints
-    expected = expected.astype("int")
     result = grouped.mean()
     tm.assert_frame_equal(result, expected)
 
@@ -371,7 +369,7 @@ def test_observed(observed, using_array_manager):
     result = groups_double_key.agg("mean")
     expected = DataFrame(
         {
-            "val": [10, 30, 20, 40],
+            "val": [10.0, 30, 20, 40],
             "cat": Categorical(
                 ["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True
             ),
@@ -418,7 +416,9 @@ def test_observed_codes_remap(observed):
     groups_double_key = df.groupby([values, "C2"], observed=observed)
 
     idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"])
-    expected = DataFrame({"C1": [3, 3, 4, 5], "C3": [10, 100, 200, 34]}, index=idx)
+    expected = DataFrame(
+        {"C1": [3.0, 3.0, 4.0, 5.0], "C3": [10.0, 100.0, 200.0, 34.0]}, index=idx
+    )
     if not observed:
         expected = cartesian_product_for_groupers(
             expected, [values.values, [1, 2, 3, 4]], ["cat", "C2"]
@@ -1505,7 +1505,9 @@ def test_read_only_category_no_sort():
     df = DataFrame(
         {"a": [1, 3, 5, 7], "b": Categorical([1, 1, 2, 2], categories=Index(cats))}
     )
-    expected = DataFrame(data={"a": [2, 6]}, index=CategoricalIndex([1, 2], name="b"))
+    expected = DataFrame(
+        data={"a": [2.0, 6.0]}, index=CategoricalIndex([1, 2], name="b")
+    )
     result = df.groupby("b", sort=False).mean()
     tm.assert_frame_equal(result, expected)
 
@@ -1597,7 +1599,7 @@ def test_aggregate_categorical_with_isnan():
     index = MultiIndex.from_arrays([[1, 1], [1, 2]], names=("A", "B"))
     expected = DataFrame(
         data={
-            "numerical_col": [1.0, 0.0],
+            "numerical_col": [1, 0],
             "object_col": [0, 0],
             "categorical_col": [0, 0],
         },
 
@@ -408,7 +408,8 @@ def test_median_empty_bins(observed):
 
     result = df.groupby(bins, observed=observed).median()
     expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
-    tm.assert_frame_equal(result, expected)
+    # TODO: GH 41137
+    tm.assert_frame_equal(result, expected, check_dtype=False)
 
 
 @pytest.mark.parametrize(
@@ -588,7 +589,7 @@ def test_ops_general(op, targop):
     df = DataFrame(np.random.randn(1000))
     labels = np.random.randint(0, 50, size=1000).astype(float)
 
-    result = getattr(df.groupby(labels), op)().astype(float)
+    result = getattr(df.groupby(labels), op)()
     expected = df.groupby(labels).agg(targop)
     tm.assert_frame_equal(result, expected)