Skip to content

Commit 4f0f872

Browse files
author
Roger Thomas
committed
PERF: Slow performance of to_dict (#46470)
1 parent c68c626 commit 4f0f872

File tree

4 files changed

+208
-62
lines changed

4 files changed

+208
-62
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,7 @@ Performance improvements
385385
- Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
386386
- Performance improvement in :func:`factorize` (:issue:`46109`)
387387
- Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
388+
- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict`, especially when using non-mixed dtypes (:issue:`46470`)
388389

389390
.. ---------------------------------------------------------------------------
390391
.. _whatsnew_150.bug_fixes:

pandas/core/frame.py

+124-61
Original file line numberDiff line numberDiff line change
@@ -1775,6 +1775,129 @@ def to_numpy(
17751775

17761776
return result
17771777

1778+
def _to_dict_helper(self, orient, into_c, into):
1779+
"""Helper function to do main work to convert frame into dict based on
1780+
`orient` and `into`
1781+
1782+
As part of GH46470 also takes care in when to use maybe_box_native as this
1783+
function can perform badly and is not necessary for non object cols
1784+
"""
1785+
object_dtype_cols = {
1786+
col for col, dtype in self.dtypes.items() if is_object_dtype(dtype)
1787+
}
1788+
if orient == "dict":
1789+
return into_c((k, v.to_dict(into)) for k, v in self.items())
1790+
elif orient == "list":
1791+
return into_c(
1792+
(
1793+
k,
1794+
list(map(maybe_box_native, v.tolist()))
1795+
if k in object_dtype_cols
1796+
else v.tolist(),
1797+
)
1798+
for k, v in self.items()
1799+
)
1800+
elif orient == "split":
1801+
if object_dtype_cols:
1802+
is_object_dtype_by_index = [
1803+
col in object_dtype_cols for col in self.columns
1804+
]
1805+
data = [
1806+
[
1807+
maybe_box_native(v) if is_object_dtype_by_index[i] else v
1808+
for i, v in enumerate(t)
1809+
]
1810+
for t in self.itertuples(index=False, name=None)
1811+
]
1812+
else:
1813+
data = [list(t) for t in self.itertuples(index=False, name=None)]
1814+
return into_c(
1815+
(
1816+
("index", self.index.tolist()),
1817+
("columns", self.columns.tolist()),
1818+
("data", data),
1819+
)
1820+
)
1821+
elif orient == "series":
1822+
return into_c((k, v) for k, v in self.items())
1823+
elif orient == "records":
1824+
columns = self.columns.tolist()
1825+
if object_dtype_cols:
1826+
is_object_dtype_by_index = [col in object_dtype_cols for col in columns]
1827+
return [
1828+
into_c(
1829+
zip(
1830+
columns,
1831+
[
1832+
maybe_box_native(v)
1833+
if is_object_dtype_by_index[i]
1834+
else v
1835+
for i, v in enumerate(t)
1836+
],
1837+
)
1838+
)
1839+
for t in self.itertuples(index=False, name=None)
1840+
]
1841+
else:
1842+
return [
1843+
into_c(zip(columns, t))
1844+
for t in self.itertuples(index=False, name=None)
1845+
]
1846+
elif orient == "index":
1847+
if not self.index.is_unique:
1848+
raise ValueError("DataFrame index must be unique for orient='index'.")
1849+
if object_dtype_cols:
1850+
is_object_dtype_by_index = [
1851+
col in object_dtype_cols for col in self.columns
1852+
]
1853+
return into_c(
1854+
(
1855+
t[0],
1856+
dict(
1857+
zip(
1858+
self.columns,
1859+
[
1860+
maybe_box_native(v)
1861+
if is_object_dtype_by_index[i]
1862+
else v
1863+
for i, v in enumerate(t[1:])
1864+
],
1865+
)
1866+
),
1867+
)
1868+
for t in self.itertuples(name=None)
1869+
)
1870+
else:
1871+
return into_c(
1872+
(t[0], dict(zip(self.columns, t[1:])))
1873+
for t in self.itertuples(name=None)
1874+
)
1875+
elif orient == "tight":
1876+
if object_dtype_cols:
1877+
is_object_dtype_by_index = [
1878+
col in object_dtype_cols for col in self.columns
1879+
]
1880+
data = [
1881+
[
1882+
maybe_box_native(v) if is_object_dtype_by_index[i] else v
1883+
for i, v in enumerate(t)
1884+
]
1885+
for t in self.itertuples(index=False, name=None)
1886+
]
1887+
else:
1888+
data = [list(t) for t in self.itertuples(index=False, name=None)]
1889+
return into_c(
1890+
(
1891+
("index", self.index.tolist()),
1892+
("columns", self.columns.tolist()),
1893+
("data", data),
1894+
("index_names", list(self.index.names)),
1895+
("column_names", list(self.columns.names)),
1896+
)
1897+
)
1898+
else:
1899+
raise ValueError(f"orient '{orient}' not understood")
1900+
17781901
def to_dict(self, orient: str = "dict", into=dict):
17791902
"""
17801903
Convert the DataFrame to a dictionary.
@@ -1913,67 +2036,7 @@ def to_dict(self, orient: str = "dict", into=dict):
19132036
elif orient.startswith("i"):
19142037
orient = "index"
19152038

1916-
if orient == "dict":
1917-
return into_c((k, v.to_dict(into)) for k, v in self.items())
1918-
1919-
elif orient == "list":
1920-
return into_c((k, v.tolist()) for k, v in self.items())
1921-
1922-
elif orient == "split":
1923-
return into_c(
1924-
(
1925-
("index", self.index.tolist()),
1926-
("columns", self.columns.tolist()),
1927-
(
1928-
"data",
1929-
[
1930-
list(map(maybe_box_native, t))
1931-
for t in self.itertuples(index=False, name=None)
1932-
],
1933-
),
1934-
)
1935-
)
1936-
1937-
elif orient == "tight":
1938-
return into_c(
1939-
(
1940-
("index", self.index.tolist()),
1941-
("columns", self.columns.tolist()),
1942-
(
1943-
"data",
1944-
[
1945-
list(map(maybe_box_native, t))
1946-
for t in self.itertuples(index=False, name=None)
1947-
],
1948-
),
1949-
("index_names", list(self.index.names)),
1950-
("column_names", list(self.columns.names)),
1951-
)
1952-
)
1953-
1954-
elif orient == "series":
1955-
return into_c((k, v) for k, v in self.items())
1956-
1957-
elif orient == "records":
1958-
columns = self.columns.tolist()
1959-
rows = (
1960-
dict(zip(columns, row))
1961-
for row in self.itertuples(index=False, name=None)
1962-
)
1963-
return [
1964-
into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
1965-
]
1966-
1967-
elif orient == "index":
1968-
if not self.index.is_unique:
1969-
raise ValueError("DataFrame index must be unique for orient='index'.")
1970-
return into_c(
1971-
(t[0], dict(zip(self.columns, t[1:])))
1972-
for t in self.itertuples(name=None)
1973-
)
1974-
1975-
else:
1976-
raise ValueError(f"orient '{orient}' not understood")
2039+
return self._to_dict_helper(orient, into_c, into)
19772040

19782041
def to_gbq(
19792042
self,

pandas/core/series.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -1770,7 +1770,13 @@ def to_dict(self, into=dict):
17701770
"""
17711771
# GH16122
17721772
into_c = com.standardize_mapping(into)
1773-
return into_c((k, maybe_box_native(v)) for k, v in self.items())
1773+
1774+
if is_object_dtype(self):
1775+
return into_c((k, maybe_box_native(v)) for k, v in self.items())
1776+
else:
1777+
# Not an object dtype => all types will be the same so let the default
1778+
# indexer return native python type
1779+
return into_c((k, v) for k, v in self.items())
17741780

17751781
def to_frame(self, name: Hashable = lib.no_default) -> DataFrame:
17761782
"""

pandas/tests/frame/methods/test_to_dict.py

+76
Original file line numberDiff line numberDiff line change
@@ -344,3 +344,79 @@ def test_to_dict_orient_tight(self, index, columns):
344344
roundtrip = DataFrame.from_dict(df.to_dict(orient="tight"), orient="tight")
345345

346346
tm.assert_frame_equal(df, roundtrip)
347+
348+
@pytest.mark.parametrize(
    "orient",
    ["dict", "list", "split", "records", "index", "tight"],
)
@pytest.mark.parametrize(
    "data,expected_types",
    (
        (
            {
                "a": [np.int64(1), 1, np.int64(3)],
                "b": [np.float64(1.0), 2.0, np.float64(3.0)],
                "c": [np.float64(1.0), 2, np.int64(3)],
                "d": [np.float64(1.0), "a", np.int64(3)],
                "e": [np.float64(1.0), ["a"], np.int64(3)],
                "f": [np.float64(1.0), ("a",), np.int64(3)],
            },
            {
                "a": [int, int, int],
                "b": [float, float, float],
                "c": [float, float, float],
                "d": [float, str, int],
                "e": [float, list, int],
                "f": [float, tuple, int],
            },
        ),
        (
            {
                "a": [1, 2, 3],
                "b": [1.1, 2.2, 3.3],
            },
            {
                "a": [int, int, int],
                "b": [float, float, float],
            },
        ),
    ),
)
def test_to_dict_return_types(self, orient, data, expected_types):
    """
    Every cell returned by ``to_dict`` equals its input and has the
    expected Python type, for each supported ``orient`` (GH46470).
    """
    frame = DataFrame(data)
    converted = frame.to_dict(orient)

    # Flatten the orient-specific layout into (row, column, cell) triples
    # so that a single pair of assertions covers every orient.
    if orient == "dict":
        cells = [
            (row, column, cell)
            for column, cell_by_row in converted.items()
            for row, cell in cell_by_row.items()
        ]
    elif orient == "list":
        cells = [
            (row, column, cell)
            for column, column_cells in converted.items()
            for row, cell in enumerate(column_cells)
        ]
    elif orient in {"split", "tight"}:
        # The frame has a default integer index, so its labels double as
        # positions into the "data" rows.
        cells = [
            (row, column, converted["data"][row][pos])
            for row in converted["index"]
            for pos, column in enumerate(converted["columns"])
        ]
    elif orient == "records":
        cells = [
            (row, column, cell)
            for row, record in enumerate(converted)
            for column, cell in record.items()
        ]
    elif orient == "index":
        cells = [
            (row, column, cell)
            for row, record in converted.items()
            for column, cell in record.items()
        ]

    for row, column, cell in cells:
        assert cell == data[column][row]
        assert type(cell) is expected_types[column][row]

0 commit comments

Comments
 (0)