Skip to content

Commit 48c99f2

Browse files
authored
ENH: Implement arrow support for read_csv with engine=c (#51128)
* ENH: Implement arrow support for read_csv with engine=c
* Adjust docs
* Fix import
* Review
1 parent 4096733 commit 48c99f2

File tree

7 files changed

+30
-33
lines changed

7 files changed

+30
-33
lines changed

doc/source/whatsnew/v2.0.0.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,8 @@ The option will only work for functions with the keyword ``use_nullable_dtypes``
138138
Additionally, a new global configuration, ``mode.dtype_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
139139
to select the nullable dtypes implementation.
140140

141-
* :func:`read_csv` (with ``engine="pyarrow"`` or ``engine="python"``)
142-
* :func:`read_clipboard` (with ``engine="python"``)
141+
* :func:`read_csv`
142+
* :func:`read_clipboard`
143143
* :func:`read_fwf`
144144
* :func:`read_excel`
145145
* :func:`read_html`

pandas/_libs/parsers.pyx

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ from pandas.util._exceptions import find_stack_level
1515

1616
from pandas import StringDtype
1717
from pandas.core.arrays import (
18+
ArrowExtensionArray,
1819
BooleanArray,
1920
FloatingArray,
2021
IntegerArray,
@@ -341,6 +342,7 @@ cdef class TextReader:
341342
bint use_nullable_dtypes
342343
object usecols
343344
set unnamed_cols # set[str]
345+
str dtype_backend
344346

345347
def __cinit__(self, source,
346348
delimiter=b",", # bytes | str
@@ -377,7 +379,8 @@ cdef class TextReader:
377379
float_precision=None,
378380
bint skip_blank_lines=True,
379381
encoding_errors=b"strict",
380-
use_nullable_dtypes=False):
382+
use_nullable_dtypes=False,
383+
dtype_backend="pandas"):
381384

382385
# set encoding for native Python and C library
383386
if isinstance(encoding_errors, str):
@@ -499,6 +502,7 @@ cdef class TextReader:
499502
# - dict[Any, DtypeObj]
500503
self.dtype = dtype
501504
self.use_nullable_dtypes = use_nullable_dtypes
505+
self.dtype_backend = dtype_backend
502506

503507
self.noconvert = set()
504508

@@ -1054,7 +1058,9 @@ cdef class TextReader:
10541058
):
10551059
use_nullable_dtypes = self.use_nullable_dtypes and col_dtype is None
10561060
col_res = _maybe_upcast(
1057-
col_res, use_nullable_dtypes=use_nullable_dtypes
1061+
col_res,
1062+
use_nullable_dtypes=use_nullable_dtypes,
1063+
dtype_backend=self.dtype_backend,
10581064
)
10591065

10601066
if col_res is None:
@@ -1387,7 +1393,9 @@ STR_NA_VALUES = {
13871393
_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))
13881394

13891395

1390-
def _maybe_upcast(arr, use_nullable_dtypes: bool = False):
1396+
def _maybe_upcast(
1397+
arr, use_nullable_dtypes: bool = False, dtype_backend: str = "pandas"
1398+
):
13911399
"""Sets nullable dtypes or upcasts if nans are present.
13921400
13931401
Upcast, if use_nullable_dtypes is false and nans are present so that the
@@ -1440,6 +1448,13 @@ def _maybe_upcast(arr, use_nullable_dtypes: bool = False):
14401448
if use_nullable_dtypes:
14411449
arr = StringDtype().construct_array_type()._from_sequence(arr)
14421450

1451+
if use_nullable_dtypes and dtype_backend == "pyarrow":
1452+
import pyarrow as pa
1453+
if isinstance(arr, IntegerArray) and arr.isna().all():
1454+
# use null instead of int64 in pyarrow
1455+
arr = arr.to_numpy()
1456+
arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
1457+
14431458
return arr
14441459

14451460

pandas/io/clipboards.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,6 @@ def read_clipboard(
4343
numpy-backed nullable dtypes or
4444
``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
4545
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
46-
This is only implemented for the ``python``
47-
engine.
4846
4947
.. versionadded:: 2.0
5048

pandas/io/parsers/c_parser_wrapper.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,16 @@
1111

1212
import numpy as np
1313

14+
from pandas._config.config import get_option
15+
1416
from pandas._libs import parsers
1517
from pandas._typing import (
1618
ArrayLike,
1719
DtypeArg,
1820
DtypeObj,
1921
ReadCsvBuffer,
2022
)
23+
from pandas.compat._optional import import_optional_dependency
2124
from pandas.errors import DtypeWarning
2225
from pandas.util._exceptions import find_stack_level
2326

@@ -79,6 +82,11 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
7982
kwds.pop(key, None)
8083

8184
kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
85+
dtype_backend = get_option("mode.dtype_backend")
86+
kwds["dtype_backend"] = dtype_backend
87+
if dtype_backend == "pyarrow":
88+
# Fail here loudly instead of in cython after reading
89+
import_optional_dependency("pyarrow")
8290
self._reader = parsers.TextReader(src, **kwds)
8391

8492
self.unnamed_cols = self._reader.unnamed_cols

pandas/io/parsers/readers.py

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,7 @@
2424

2525
import numpy as np
2626

27-
from pandas._config import (
28-
get_option,
29-
using_nullable_dtypes,
30-
)
27+
from pandas._config import using_nullable_dtypes
3128

3229
from pandas._libs import lib
3330
from pandas._libs.parsers import STR_NA_VALUES
@@ -408,8 +405,6 @@
408405
numpy-backed nullable dtypes or
409406
``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
410407
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
411-
This is only implemented for the ``pyarrow`` or ``python``
412-
engines.
413408
414409
.. versionadded:: 2.0
415410
@@ -566,15 +561,6 @@ def _read(
566561
raise ValueError(
567562
"The 'chunksize' option is not supported with the 'pyarrow' engine"
568563
)
569-
elif (
570-
kwds.get("use_nullable_dtypes", False)
571-
and get_option("mode.dtype_backend") == "pyarrow"
572-
and kwds.get("engine") == "c"
573-
):
574-
raise NotImplementedError(
575-
f"use_nullable_dtypes=True and engine={kwds['engine']} with "
576-
"mode.dtype_backend set to 'pyarrow' is not implemented."
577-
)
578564
else:
579565
chunksize = validate_integer("chunksize", chunksize, 1)
580566

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -500,13 +500,6 @@ def test_use_nullable_dtypes_pyarrow_backend(all_parsers, request):
500500
3,4.5,False,b,6,7.5,True,a,12-31-2019,
501501
"""
502502
with pd.option_context("mode.dtype_backend", "pyarrow"):
503-
if engine == "c":
504-
request.node.add_marker(
505-
pytest.mark.xfail(
506-
raises=NotImplementedError,
507-
reason=f"Not implemented with engine={parser.engine}",
508-
)
509-
)
510503
result = parser.read_csv(
511504
StringIO(data), use_nullable_dtypes=True, parse_dates=["i"]
512505
)
@@ -520,7 +513,7 @@ def test_use_nullable_dtypes_pyarrow_backend(all_parsers, request):
520513
"f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"),
521514
"g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"),
522515
"h": pd.Series(
523-
[pd.NA if engine == "python" else "", "a"],
516+
[pd.NA if engine != "pyarrow" else "", "a"],
524517
dtype=pd.ArrowDtype(pa.string()),
525518
),
526519
"i": pd.Series([Timestamp("2019-12-31")] * 2),

pandas/tests/io/test_clipboard.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -426,9 +426,6 @@ def test_read_clipboard_nullable_dtypes(
426426
if string_storage == "pyarrow" or dtype_backend == "pyarrow":
427427
pa = pytest.importorskip("pyarrow")
428428

429-
if dtype_backend == "pyarrow" and engine == "c":
430-
pytest.skip(reason="c engine not yet supported")
431-
432429
if string_storage == "python":
433430
string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
434431
string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))

0 commit comments

Comments
 (0)