Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,7 @@ Performance improvements
- Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
- Performance improvement in :func:`factorize` (:issue:`46109`)
- Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict`, especially when using non-mixed dtypes (:issue:`46470`)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.bug_fixes:
Expand Down
130 changes: 104 additions & 26 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1909,41 +1909,73 @@ def to_dict(self, orient: str = "dict", into=dict):
elif orient.startswith("i"):
orient = "index"

object_dtype_cols = {
col for col, dtype in self.dtypes.items() if is_object_dtype(dtype)
}
are_all_object_dtype_cols = len(object_dtype_cols) == len(self.dtypes)
if orient == "dict":
return into_c((k, v.to_dict(into)) for k, v in self.items())

elif orient == "list":
return into_c(
(k, list(map(maybe_box_native, v.tolist()))) for k, v in self.items()
(
k,
list(map(maybe_box_native, v.tolist()))
if k in object_dtype_cols
else v.tolist(),
)
for k, v in self.items()
)

elif orient == "split":
if are_all_object_dtype_cols:
data = [
list(map(maybe_box_native, t))
for t in self.itertuples(index=False, name=None)
]
elif object_dtype_cols:
# A number of approaches were tried here; this one proved to be the
# best in general
data = [list(t) for t in self.itertuples(index=False, name=None)]
object_type_indices = [
i for i, col in enumerate(self.columns) if col in object_dtype_cols
]
for row in data:
for i in object_type_indices:
row[i] = maybe_box_native(row[i])
else:
data = [list(t) for t in self.itertuples(index=False, name=None)]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you share code between any of these cases, e.g. by making a helper function?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback done

return into_c(
(
("index", self.index.tolist()),
("columns", self.columns.tolist()),
(
"data",
[
list(map(maybe_box_native, t))
for t in self.itertuples(index=False, name=None)
],
),
("data", data),
)
)

elif orient == "tight":
if are_all_object_dtype_cols:
data = [
list(map(maybe_box_native, t))
for t in self.itertuples(index=False, name=None)
]
elif object_dtype_cols:
# A number of approaches were tried here; this one proved to be the
# best in general
data = [list(t) for t in self.itertuples(index=False, name=None)]
object_type_indices = [
i for i, col in enumerate(self.columns) if col in object_dtype_cols
]
for row in data:
for i in object_type_indices:
row[i] = maybe_box_native(row[i])
else:
data = [list(t) for t in self.itertuples(index=False, name=None)]
return into_c(
(
("index", self.index.tolist()),
("columns", self.columns.tolist()),
(
"data",
[
list(map(maybe_box_native, t))
for t in self.itertuples(index=False, name=None)
],
),
("data", data),
("index_names", list(self.index.names)),
("column_names", list(self.columns.names)),
)
Expand All @@ -1954,21 +1986,67 @@ def to_dict(self, orient: str = "dict", into=dict):

elif orient == "records":
columns = self.columns.tolist()
rows = (
dict(zip(columns, row))
for row in self.itertuples(index=False, name=None)
)
return [
into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
]
if are_all_object_dtype_cols:
rows = (
dict(zip(columns, row))
for row in self.itertuples(index=False, name=None)
)
return [
into_c((k, maybe_box_native(v)) for k, v in row.items())
for row in rows
]
elif object_dtype_cols:
is_object_dtype_by_index = [col in object_dtype_cols for col in columns]
return [
into_c(
zip(
columns,
[
maybe_box_native(v)
if is_object_dtype_by_index[i]
else v
for i, v in enumerate(t)
],
)
)
for t in self.itertuples(index=False, name=None)
]
else:
return [
into_c(zip(columns, t))
for t in self.itertuples(index=False, name=None)
]

elif orient == "index":
if not self.index.is_unique:
raise ValueError("DataFrame index must be unique for orient='index'.")
return into_c(
(t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
for t in self.itertuples(name=None)
)
columns = self.columns.tolist()
if are_all_object_dtype_cols:
return into_c(
(t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
for t in self.itertuples(name=None)
)
elif object_dtype_cols:
is_object_dtype_by_index = [
col in object_dtype_cols for col in self.columns
]
return into_c(
(
t[0],
{
columns[i]: maybe_box_native(v)
if is_object_dtype_by_index[i]
else v
for i, v in enumerate(t[1:])
},
)
for t in self.itertuples(name=None)
)
else:
return into_c(
(t[0], dict(zip(self.columns, t[1:])))
for t in self.itertuples(name=None)
)

else:
raise ValueError(f"orient '{orient}' not understood")
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1771,7 +1771,13 @@ def to_dict(self, into=dict):
"""
# GH16122
into_c = com.standardize_mapping(into)
return into_c((k, maybe_box_native(v)) for k, v in self.items())

if is_object_dtype(self):
return into_c((k, maybe_box_native(v)) for k, v in self.items())
else:
# Not an object dtype => all types will be the same so let the default
# indexer return native python type
return into_c((k, v) for k, v in self.items())

def to_frame(self, name: Hashable = lib.no_default) -> DataFrame:
"""
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/frame/methods/test_to_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,16 @@ def test_to_dict_orient_tight(self, index, columns):
"b": [float, float, float],
},
),
( # Make sure we have one df which is all object type cols
{
"a": [1, "hello", 3],
"b": [1.1, "world", 3.3],
},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this hit all of the newly added code?

{
"a": [int, str, int],
"b": [float, str, float],
},
),
),
)
def test_to_dict_returns_native_types(self, orient, data, expected_types):
Expand Down