diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 20e99d007c798..8188916a06008 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -606,6 +606,7 @@ Performance improvements - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``observed=False`` (:issue:`49596`) - Performance improvement in :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default). Now the index will be a :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49745`) - Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`) +- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`) .. --------------------------------------------------------------------------- .. _whatsnew_200.bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7b181a3e8e391..acd8af0241e68 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1813,6 +1813,28 @@ def to_numpy( return result + def _create_data_for_split_and_tight_to_dict( + self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int] + ) -> list: + """ + Simple helper method to create data for ``to_dict(orient="split")`` and + ``to_dict(orient="tight")`` to create the main output data + """ + if are_all_object_dtype_cols: + data = [ + list(map(maybe_box_native, t)) + for t in self.itertuples(index=False, name=None) + ] + else: + data = [list(t) for t in self.itertuples(index=False, name=None)] + if object_dtype_indices: + # If we have object_dtype_cols, apply maybe_box_native after list + # comprehension for perf + for row in data: + for i in object_dtype_indices: + row[i] = maybe_box_native(row[i]) + return data + @overload def to_dict( self, @@ -1952,30 +1974,50 @@ def to_dict( 
"'index=False' is only valid when 'orient' is 'split' or 'tight'" ) + if orient == "series": + # GH46470 Return quickly if orient series to avoid creating dtype objects + return into_c((k, v) for k, v in self.items()) + + object_dtype_indices = [ + i + for i, col_dtype in enumerate(self.dtypes.values) + if is_object_dtype(col_dtype) + ] + are_all_object_dtype_cols = len(object_dtype_indices) == len(self.dtypes) + if orient == "dict": return into_c((k, v.to_dict(into)) for k, v in self.items()) elif orient == "list": + object_dtype_indices_as_set = set(object_dtype_indices) return into_c( - (k, list(map(maybe_box_native, v.tolist()))) for k, v in self.items() + ( + k, + list(map(maybe_box_native, v.tolist())) + if i in object_dtype_indices_as_set + else v.tolist(), + ) + for i, (k, v) in enumerate(self.items()) ) elif orient == "split": + data = self._create_data_for_split_and_tight_to_dict( + are_all_object_dtype_cols, object_dtype_indices + ) + return into_c( ((("index", self.index.tolist()),) if index else ()) + ( ("columns", self.columns.tolist()), - ( - "data", - [ - list(map(maybe_box_native, t)) - for t in self.itertuples(index=False, name=None) - ], - ), + ("data", data), ) ) elif orient == "tight": + data = self._create_data_for_split_and_tight_to_dict( + are_all_object_dtype_cols, object_dtype_indices + ) + return into_c( ((("index", self.index.tolist()),) if index else ()) + ( @@ -1992,26 +2034,65 @@ def to_dict( + (("column_names", list(self.columns.names)),) ) - elif orient == "series": - return into_c((k, v) for k, v in self.items()) - elif orient == "records": columns = self.columns.tolist() - rows = ( - dict(zip(columns, row)) - for row in self.itertuples(index=False, name=None) - ) - return [ - into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows - ] + if are_all_object_dtype_cols: + rows = ( + dict(zip(columns, row)) + for row in self.itertuples(index=False, name=None) + ) + return [ + into_c((k, maybe_box_native(v)) for k, v in 
row.items()) + for row in rows + ] + else: + data = [ + into_c(zip(columns, t)) + for t in self.itertuples(index=False, name=None) + ] + if object_dtype_indices: + object_dtype_indices_as_set = set(object_dtype_indices) + object_dtype_cols = { + col + for i, col in enumerate(self.columns) + if i in object_dtype_indices_as_set + } + for row in data: + for col in object_dtype_cols: + row[col] = maybe_box_native(row[col]) + return data elif orient == "index": if not self.index.is_unique: raise ValueError("DataFrame index must be unique for orient='index'.") - return into_c( - (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:])))) - for t in self.itertuples(name=None) - ) + columns = self.columns.tolist() + if are_all_object_dtype_cols: + return into_c( + (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:])))) + for t in self.itertuples(name=None) + ) + elif object_dtype_indices: + object_dtype_indices_as_set = set(object_dtype_indices) + is_object_dtype_by_index = [ + i in object_dtype_indices_as_set for i in range(len(self.columns)) + ] + return into_c( + ( + t[0], + { + columns[i]: maybe_box_native(v) + if is_object_dtype_by_index[i] + else v + for i, v in enumerate(t[1:]) + }, + ) + for t in self.itertuples(name=None) + ) + else: + return into_c( + (t[0], dict(zip(self.columns, t[1:]))) + for t in self.itertuples(name=None) + ) else: raise ValueError(f"orient '{orient}' not understood") diff --git a/pandas/core/series.py b/pandas/core/series.py index dceec40ae666a..e4c4bd305fc57 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1812,7 +1812,13 @@ def to_dict(self, into: type[dict] = dict) -> dict: """ # GH16122 into_c = com.standardize_mapping(into) - return into_c((k, maybe_box_native(v)) for k, v in self.items()) + + if is_object_dtype(self): + return into_c((k, maybe_box_native(v)) for k, v in self.items()) + else: + # Not an object dtype => all types will be the same so let the default + # indexer return native python type + return 
into_c((k, v) for k, v in self.items()) def to_frame(self, name: Hashable = lib.no_default) -> DataFrame: """ diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 521f6ead2e69e..c76699cafd481 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -379,6 +379,16 @@ def test_to_dict_orient_tight(self, index, columns): "b": [float, float, float], }, ), + ( # Make sure we have one df which is all object type cols + { + "a": [1, "hello", 3], + "b": [1.1, "world", 3.3], + }, + { + "a": [int, str, int], + "b": [float, str, float], + }, + ), ), ) def test_to_dict_returns_native_types(self, orient, data, expected_types):