Skip to content

Commit 0ec42bb

Browse files
authored
ENH: Support ArrowDtype in interchange Column.dtype (#52792)
1 parent 88b5a91 commit 0ec42bb

File tree

3 files changed

+103
-1
lines changed

3 files changed

+103
-1
lines changed

pandas/core/interchange/column.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import pandas as pd
1313
from pandas.api.types import is_string_dtype
14+
from pandas.core.arrays.arrow.dtype import ArrowDtype
1415
from pandas.core.interchange.buffer import PandasBuffer
1516
from pandas.core.interchange.dataframe_protocol import (
1617
Column,
@@ -134,8 +135,12 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
134135
if kind is None:
135136
# Not a NumPy dtype. Check if it's a categorical maybe
136137
raise ValueError(f"Data type {dtype} not supported by interchange protocol")
138+
if isinstance(dtype, ArrowDtype):
139+
byteorder = dtype.numpy_dtype.byteorder
140+
else:
141+
byteorder = dtype.byteorder
137142

138-
return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), dtype.byteorder
143+
return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder
139144

140145
@property
141146
def describe_categorical(self):

pandas/core/interchange/utils.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,47 @@
1313

1414
from pandas.core.dtypes.dtypes import CategoricalDtype
1515

16+
from pandas.core.arrays.arrow.dtype import ArrowDtype
17+
1618
if typing.TYPE_CHECKING:
1719
from pandas._typing import DtypeObj
1820

1921

22+
# Maps str(pyarrow.DataType) -> Apache Arrow C data interface format string.
# Currently, no pyarrow API for this.
# For integer types the Arrow format uses a lowercase letter for the signed
# type and the matching uppercase letter for the unsigned type of that width.
PYARROW_CTYPES = {
    "null": "n",
    "bool": "b",
    "uint8": "C",
    "uint16": "S",
    "uint32": "I",
    "uint64": "L",
    "int8": "c",
    "int16": "s",  # signed 16-bit; "S" is reserved for uint16
    "int32": "i",
    "int64": "l",
    "halffloat": "e",  # float16
    "float": "f",  # float32
    "double": "g",  # float64
    "string": "u",
    "binary": "z",
    "time32[s]": "tts",
    "time32[ms]": "ttm",
    "time64[us]": "ttu",
    "time64[ns]": "ttn",
    "date32[day]": "tdD",
    "date64[ms]": "tdm",
    # Timezone-naive timestamps: the trailing ":" is followed by an empty tz.
    "timestamp[s]": "tss:",
    "timestamp[ms]": "tsm:",
    "timestamp[us]": "tsu:",
    "timestamp[ns]": "tsn:",
    "duration[s]": "tDs",
    "duration[ms]": "tDm",
    "duration[us]": "tDu",
    "duration[ns]": "tDn",
}
55+
56+
2057
class ArrowCTypes:
2158
"""
2259
Enum for Apache Arrow C type format strings.
@@ -78,6 +115,17 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
78115
return ArrowCTypes.INT64
79116
elif dtype == np.dtype("O"):
80117
return ArrowCTypes.STRING
118+
elif isinstance(dtype, ArrowDtype):
119+
import pyarrow as pa
120+
121+
pa_type = dtype.pyarrow_dtype
122+
if pa.types.is_decimal(pa_type):
123+
return f"d:{pa_type.precision},{pa_type.scale}"
124+
elif pa.types.is_timestamp(pa_type) and pa_type.tz is not None:
125+
return f"ts{pa_type.unit[0]}:{pa_type.tz}"
126+
format_str = PYARROW_CTYPES.get(str(pa_type), None)
127+
if format_str is not None:
128+
return format_str
81129

82130
format_str = getattr(ArrowCTypes, dtype.name.upper(), None)
83131
if format_str is not None:

pandas/tests/interchange/test_utils.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,52 @@
3838
def test_dtype_to_arrow_c_fmt(pandas_dtype, c_string): # PR01
3939
"""Test ``dtype_to_arrow_c_fmt`` utility function."""
4040
assert dtype_to_arrow_c_fmt(pandas_dtype) == c_string
41+
42+
43+
@pytest.mark.parametrize(
    "pa_dtype, args_kwargs, c_string",
    [
        ["null", {}, "n"],
        ["bool_", {}, "b"],
        ["uint8", {}, "C"],
        ["uint16", {}, "S"],
        ["uint32", {}, "I"],
        ["uint64", {}, "L"],
        ["int8", {}, "c"],
        # Arrow C data interface: "s" is signed int16, "S" is uint16.
        ["int16", {}, "s"],
        ["int32", {}, "i"],
        ["int64", {}, "l"],
        ["float16", {}, "e"],
        ["float32", {}, "f"],
        ["float64", {}, "g"],
        ["string", {}, "u"],
        ["binary", {}, "z"],
        ["time32", ("s",), "tts"],
        ["time32", ("ms",), "ttm"],
        ["time64", ("us",), "ttu"],
        ["time64", ("ns",), "ttn"],
        ["date32", {}, "tdD"],
        ["date64", {}, "tdm"],
        ["timestamp", {"unit": "s"}, "tss:"],
        ["timestamp", {"unit": "ms"}, "tsm:"],
        ["timestamp", {"unit": "us"}, "tsu:"],
        ["timestamp", {"unit": "ns"}, "tsn:"],
        ["timestamp", {"unit": "ns", "tz": "UTC"}, "tsn:UTC"],
        ["duration", ("s",), "tDs"],
        ["duration", ("ms",), "tDm"],
        ["duration", ("us",), "tDu"],
        ["duration", ("ns",), "tDn"],
        ["decimal128", {"precision": 4, "scale": 2}, "d:4,2"],
    ],
)
def test_dtype_to_arrow_c_fmt_arrowdtype(pa_dtype, args_kwargs, c_string):
    """Test ``dtype_to_arrow_c_fmt`` on ``ArrowDtype``-wrapped pyarrow types.

    ``pa_dtype`` names a pyarrow type factory; ``args_kwargs`` holds its
    arguments as a tuple (positional), a dict (keyword), or an empty dict
    (no arguments); ``c_string`` is the expected format string.
    """
    # GH 52323
    pa = pytest.importorskip("pyarrow")
    if not args_kwargs:
        pa_type = getattr(pa, pa_dtype)()
    elif isinstance(args_kwargs, tuple):
        pa_type = getattr(pa, pa_dtype)(*args_kwargs)
    else:
        pa_type = getattr(pa, pa_dtype)(**args_kwargs)
    arrow_type = pd.ArrowDtype(pa_type)
    assert dtype_to_arrow_c_fmt(arrow_type) == c_string

0 commit comments

Comments
 (0)