ENH: Implement arrow support for read_csv with engine=c (#51128)

phofl · web-flow · commit 48c99f21f86d · 2023-02-08T14:02:31.000-08:00
* ENH: Implement arrow support for read_csv with engine=c

* Adjust docs

* Fix import

* Address review comments
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -138,8 +138,8 @@ The option will only work for functions with the keyword ``use_nullable_dtypes``
 Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
 to select the nullable dtypes implementation.
 
-* :func:`read_csv` (with ``engine="pyarrow"`` or ``engine="python"``)
-* :func:`read_clipboard` (with ``engine="python"``)
+* :func:`read_csv`
+* :func:`read_clipboard`
 * :func:`read_fwf`
 * :func:`read_excel`
 * :func:`read_html`
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -15,6 +15,7 @@ from pandas.util._exceptions import find_stack_level
 
 from pandas import StringDtype
 from pandas.core.arrays import (
+    ArrowExtensionArray,
     BooleanArray,
     FloatingArray,
     IntegerArray,
@@ -341,6 +342,7 @@ cdef class TextReader:
         bint use_nullable_dtypes
         object usecols
         set unnamed_cols  # set[str]
+        str dtype_backend
 
     def __cinit__(self, source,
                   delimiter=b",",  # bytes | str
@@ -377,7 +379,8 @@ cdef class TextReader:
                   float_precision=None,
                   bint skip_blank_lines=True,
                   encoding_errors=b"strict",
-                  use_nullable_dtypes=False):
+                  use_nullable_dtypes=False,
+                  dtype_backend="pandas"):
 
         # set encoding for native Python and C library
         if isinstance(encoding_errors, str):
@@ -499,6 +502,7 @@ cdef class TextReader:
         # - dict[Any, DtypeObj]
         self.dtype = dtype
         self.use_nullable_dtypes = use_nullable_dtypes
+        self.dtype_backend = dtype_backend
 
         self.noconvert = set()
 
@@ -1054,7 +1058,9 @@ cdef class TextReader:
             ):
                 use_nullable_dtypes = self.use_nullable_dtypes and col_dtype is None
                 col_res = _maybe_upcast(
-                    col_res, use_nullable_dtypes=use_nullable_dtypes
+                    col_res,
+                    use_nullable_dtypes=use_nullable_dtypes,
+                    dtype_backend=self.dtype_backend,
                 )
 
             if col_res is None:
@@ -1387,7 +1393,9 @@ STR_NA_VALUES = {
 _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))
 
 
-def _maybe_upcast(arr, use_nullable_dtypes: bool = False):
+def _maybe_upcast(
+    arr, use_nullable_dtypes: bool = False, dtype_backend: str = "pandas"
+):
     """Sets nullable dtypes or upcasts if nans are present.
 
     Upcast, if use_nullable_dtypes is false and nans are present so that the
@@ -1440,6 +1448,13 @@ def _maybe_upcast(arr, use_nullable_dtypes: bool = False):
         if use_nullable_dtypes:
             arr = StringDtype().construct_array_type()._from_sequence(arr)
 
+    if use_nullable_dtypes and dtype_backend == "pyarrow":
+        import pyarrow as pa
+        if isinstance(arr, IntegerArray) and arr.isna().all():
+            # use null instead of int64 in pyarrow
+            arr = arr.to_numpy()
+        arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
+
     return arr
 
 
diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py
@@ -43,8 +43,6 @@ def read_clipboard(
             numpy-backed nullable dtypes or
             ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
             pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
-            This is only implemented for the ``python``
-            engine.
 
         .. versionadded:: 2.0
 
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
@@ -11,13 +11,16 @@
 
 import numpy as np
 
+from pandas._config.config import get_option
+
 from pandas._libs import parsers
 from pandas._typing import (
     ArrayLike,
     DtypeArg,
     DtypeObj,
     ReadCsvBuffer,
 )
+from pandas.compat._optional import import_optional_dependency
 from pandas.errors import DtypeWarning
 from pandas.util._exceptions import find_stack_level
 
@@ -79,6 +82,11 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
             kwds.pop(key, None)
 
         kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
+        dtype_backend = get_option("mode.dtype_backend")
+        kwds["dtype_backend"] = dtype_backend
+        if dtype_backend == "pyarrow":
+            # Fail here loudly instead of in cython after reading
+            import_optional_dependency("pyarrow")
         self._reader = parsers.TextReader(src, **kwds)
 
         self.unnamed_cols = self._reader.unnamed_cols
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -24,10 +24,7 @@
 
 import numpy as np
 
-from pandas._config import (
-    get_option,
-    using_nullable_dtypes,
-)
+from pandas._config import using_nullable_dtypes
 
 from pandas._libs import lib
 from pandas._libs.parsers import STR_NA_VALUES
@@ -408,8 +405,6 @@
         numpy-backed nullable dtypes or
         ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
         pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
-        This is only implemented for the ``pyarrow`` or ``python``
-        engines.
 
     .. versionadded:: 2.0
 
@@ -566,15 +561,6 @@ def _read(
             raise ValueError(
                 "The 'chunksize' option is not supported with the 'pyarrow' engine"
             )
-    elif (
-        kwds.get("use_nullable_dtypes", False)
-        and get_option("mode.dtype_backend") == "pyarrow"
-        and kwds.get("engine") == "c"
-    ):
-        raise NotImplementedError(
-            f"use_nullable_dtypes=True and engine={kwds['engine']} with "
-            "mode.dtype_backend set to 'pyarrow' is not implemented."
-        )
     else:
         chunksize = validate_integer("chunksize", chunksize, 1)
 
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -500,13 +500,6 @@ def test_use_nullable_dtypes_pyarrow_backend(all_parsers, request):
 3,4.5,False,b,6,7.5,True,a,12-31-2019,
 """
     with pd.option_context("mode.dtype_backend", "pyarrow"):
-        if engine == "c":
-            request.node.add_marker(
-                pytest.mark.xfail(
-                    raises=NotImplementedError,
-                    reason=f"Not implemented with engine={parser.engine}",
-                )
-            )
         result = parser.read_csv(
             StringIO(data), use_nullable_dtypes=True, parse_dates=["i"]
         )
@@ -520,7 +513,7 @@ def test_use_nullable_dtypes_pyarrow_backend(all_parsers, request):
                 "f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"),
                 "g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"),
                 "h": pd.Series(
-                    [pd.NA if engine == "python" else "", "a"],
+                    [pd.NA if engine != "pyarrow" else "", "a"],
                     dtype=pd.ArrowDtype(pa.string()),
                 ),
                 "i": pd.Series([Timestamp("2019-12-31")] * 2),
diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py
@@ -426,9 +426,6 @@ def test_read_clipboard_nullable_dtypes(
         if string_storage == "pyarrow" or dtype_backend == "pyarrow":
             pa = pytest.importorskip("pyarrow")
 
-        if dtype_backend == "pyarrow" and engine == "c":
-            pytest.skip(reason="c engine not yet supported")
-
         if string_storage == "python":
             string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
             string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))