PERF: Slow performance of to_dict (#46470)

Roger Thomas · Roger Thomas · commit 4f0f872875ba · 2022-03-23T21:15:12.000Z
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -385,6 +385,7 @@ Performance improvements
 - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
 - Performance improvement in :func:`factorize` (:issue:`46109`)
 - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
+- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict`, especially when using non-mixed dtypes (:issue:`46470`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.bug_fixes:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1775,6 +1775,108 @@ def to_numpy(
 
         return result
 
+    def _to_dict_helper(self, orient, into_c, into):
+        """
+        Convert the frame into a dict-like according to ``orient``.
+
+        Does the main work of :meth:`DataFrame.to_dict`.
+
+        Parameters
+        ----------
+        orient : str
+            Fully spelled-out orientation: "dict", "list", "series", "split",
+            "tight", "records" or "index".
+        into_c : callable
+            Called with an iterable of ``(key, value)`` pairs to build every
+            mapping that is returned.
+        into : object
+            The ``into`` argument of :meth:`DataFrame.to_dict`; forwarded
+            unchanged to :meth:`Series.to_dict` for ``orient="dict"``.
+
+        Returns
+        -------
+        object
+            A list of ``into_c`` results for ``orient="records"``, otherwise a
+            single ``into_c`` result.
+
+        Raises
+        ------
+        ValueError
+            If ``orient`` is not understood, or if ``orient="index"`` and the
+            index is not unique.
+
+        Notes
+        -----
+        GH46470: ``maybe_box_native`` is slow, so it is only applied to columns
+        whose values may not already be native python scalars, i.e. object and
+        extension dtype columns.
+        """
+        # Function-level import so this block does not depend on the module's
+        # import list.
+        from pandas.api.extensions import ExtensionDtype
+
+        # These two orients never box per value here, so skip the dtype scan.
+        if orient == "dict":
+            return into_c((k, v.to_dict(into)) for k, v in self.items())
+        if orient == "series":
+            return into_c((k, v) for k, v in self.items())
+
+        # One flag per column, in column order. Object columns can hold anything
+        # and extension arrays are not guaranteed to yield native python scalars
+        # when iterated, so only those columns go through maybe_box_native.
+        box_flags = [
+            is_object_dtype(dtype) or isinstance(dtype, ExtensionDtype)
+            for dtype in self.dtypes
+        ]
+        any_boxed = any(box_flags)
+
+        def box_row(values):
+            # Box only the positions that belong to a flagged column.
+            return [
+                maybe_box_native(v) if flag else v
+                for flag, v in zip(box_flags, values)
+            ]
+
+        if orient == "list":
+            return into_c(
+                (k, list(map(maybe_box_native, v.tolist())) if flag else v.tolist())
+                for flag, (k, v) in zip(box_flags, self.items())
+            )
+
+        if orient in ("split", "tight"):
+            rows = self.itertuples(index=False, name=None)
+            data = [box_row(t) for t in rows] if any_boxed else [list(t) for t in rows]
+            pairs = (
+                ("index", self.index.tolist()),
+                ("columns", self.columns.tolist()),
+                ("data", data),
+            )
+            if orient == "tight":
+                pairs += (
+                    ("index_names", list(self.index.names)),
+                    ("column_names", list(self.columns.names)),
+                )
+            return into_c(pairs)
+
+        if orient == "records":
+            columns = self.columns.tolist()
+            rows = self.itertuples(index=False, name=None)
+            if any_boxed:
+                return [into_c(zip(columns, box_row(t))) for t in rows]
+            return [into_c(zip(columns, t)) for t in rows]
+
+        if orient == "index":
+            if not self.index.is_unique:
+                raise ValueError("DataFrame index must be unique for orient='index'.")
+            rows = self.itertuples(name=None)
+            if any_boxed:
+                return into_c(
+                    (t[0], dict(zip(self.columns, box_row(t[1:])))) for t in rows
+                )
+            return into_c((t[0], dict(zip(self.columns, t[1:]))) for t in rows)
+
+        raise ValueError(f"orient '{orient}' not understood")
+
     def to_dict(self, orient: str = "dict", into=dict):
         """
         Convert the DataFrame to a dictionary.
@@ -1913,67 +2036,7 @@ def to_dict(self, orient: str = "dict", into=dict):
             elif orient.startswith("i"):
                 orient = "index"
 
-        if orient == "dict":
-            return into_c((k, v.to_dict(into)) for k, v in self.items())
-
-        elif orient == "list":
-            return into_c((k, v.tolist()) for k, v in self.items())
-
-        elif orient == "split":
-            return into_c(
-                (
-                    ("index", self.index.tolist()),
-                    ("columns", self.columns.tolist()),
-                    (
-                        "data",
-                        [
-                            list(map(maybe_box_native, t))
-                            for t in self.itertuples(index=False, name=None)
-                        ],
-                    ),
-                )
-            )
-
-        elif orient == "tight":
-            return into_c(
-                (
-                    ("index", self.index.tolist()),
-                    ("columns", self.columns.tolist()),
-                    (
-                        "data",
-                        [
-                            list(map(maybe_box_native, t))
-                            for t in self.itertuples(index=False, name=None)
-                        ],
-                    ),
-                    ("index_names", list(self.index.names)),
-                    ("column_names", list(self.columns.names)),
-                )
-            )
-
-        elif orient == "series":
-            return into_c((k, v) for k, v in self.items())
-
-        elif orient == "records":
-            columns = self.columns.tolist()
-            rows = (
-                dict(zip(columns, row))
-                for row in self.itertuples(index=False, name=None)
-            )
-            return [
-                into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
-            ]
-
-        elif orient == "index":
-            if not self.index.is_unique:
-                raise ValueError("DataFrame index must be unique for orient='index'.")
-            return into_c(
-                (t[0], dict(zip(self.columns, t[1:])))
-                for t in self.itertuples(name=None)
-            )
-
-        else:
-            raise ValueError(f"orient '{orient}' not understood")
+        return self._to_dict_helper(orient, into_c, into)
 
     def to_gbq(
         self,
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1770,7 +1770,21 @@ def to_dict(self, into=dict):
         """
         # GH16122
         into_c = com.standardize_mapping(into)
-        return into_c((k, maybe_box_native(v)) for k, v in self.items())
+
+        # Function-level import so this block does not depend on the module's
+        # import list.
+        from pandas.api.extensions import ExtensionDtype
+
+        if is_object_dtype(self) or isinstance(self.dtype, ExtensionDtype):
+            # Object values can be anything and extension arrays are not
+            # guaranteed to yield native python scalars when iterated, so box
+            # each value.
+            return into_c((k, maybe_box_native(v)) for k, v in self.items())
+
+        # GH46470: a plain numpy dtype holds values of a single type and
+        # self.items() already yields them as native python scalars, so the
+        # comparatively slow maybe_box_native call is skipped.
+        return into_c((k, v) for k, v in self.items())
 
     def to_frame(self, name: Hashable = lib.no_default) -> DataFrame:
         """
diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py
@@ -344,3 +344,88 @@ def test_to_dict_orient_tight(self, index, columns):
         roundtrip = DataFrame.from_dict(df.to_dict(orient="tight"), orient="tight")
 
         tm.assert_frame_equal(df, roundtrip)
+
+    @pytest.mark.parametrize(
+        "orient",
+        ["dict", "list", "split", "records", "index", "tight"],
+    )
+    @pytest.mark.parametrize(
+        "data,expected_types",
+        (
+            (
+                {
+                    "a": [np.int64(1), 1, np.int64(3)],
+                    "b": [np.float64(1.0), 2.0, np.float64(3.0)],
+                    "c": [np.float64(1.0), 2, np.int64(3)],
+                    "d": [np.float64(1.0), "a", np.int64(3)],
+                    "e": [np.float64(1.0), ["a"], np.int64(3)],
+                    "f": [np.float64(1.0), ("a",), np.int64(3)],
+                },
+                {
+                    "a": [int, int, int],
+                    "b": [float, float, float],
+                    "c": [float, float, float],
+                    "d": [float, str, int],
+                    "e": [float, list, int],
+                    "f": [float, tuple, int],
+                },
+            ),
+            (
+                {
+                    "a": [1, 2, 3],
+                    "b": [1.1, 2.2, 3.3],
+                },
+                {
+                    "a": [int, int, int],
+                    "b": [float, float, float],
+                },
+            ),
+        ),
+    )
+    def test_to_dict_return_types(self, orient, data, expected_types):
+        # GH46470: every value must come back equal to the input and as the
+        # expected native python type, whichever orient is requested.
+        df = DataFrame(data)
+        result = df.to_dict(orient=orient)
+
+        # Flatten each result layout into (row position, column, value) triples
+        # so that a single assertion loop covers every orient.
+        if orient == "dict":
+            assertion_iterator = (
+                (i, key, value)
+                for key, index_value_map in result.items()
+                for i, value in index_value_map.items()
+            )
+        elif orient == "list":
+            assertion_iterator = (
+                (i, key, value)
+                for key, values in result.items()
+                for i, value in enumerate(values)
+            )
+        elif orient in {"split", "tight"}:
+            # Walk the data rows by position rather than indexing them with the
+            # index labels, which only coincide for a default integer index.
+            assertion_iterator = (
+                (i, key, value)
+                for i, row in enumerate(result["data"])
+                for key, value in zip(result["columns"], row)
+            )
+        elif orient == "records":
+            assertion_iterator = (
+                (i, key, value)
+                for i, record in enumerate(result)
+                for key, value in record.items()
+            )
+        elif orient == "index":
+            assertion_iterator = (
+                (i, key, value)
+                for i, record in result.items()
+                for key, value in record.items()
+            )
+        else:
+            # Fail loudly instead of hitting an unbound name further down.
+            raise AssertionError(f"orient {orient!r} is not handled by this test")
+
+        for i, key, value in assertion_iterator:
+            assert value == data[key][i]
+            assert type(value) is expected_types[key][i]