pandas-dev · WillAyd · Mar 11, 2023 · Feb 18, 2023 · Feb 18, 2023 · Feb 18, 2023
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -58,3 +58,5 @@ prune pandas/tests/io/parser/data
 # Selectively re-add *.cxx files that were excluded above
 graft pandas/_libs/src
 graft pandas/_libs/tslibs/src
+include pandas/_libs/pd_parser.h
+include pandas/_libs/pd_parser.c
diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py
@@ -10,6 +10,11 @@
 ]
 
 
+# Below imports needs to happen first to ensure pandas top level
+# module gets monkeypatched with the pandas_datetime_CAPI
+# see pandas_datetime_exec in pd_datetime.c
+import pandas._libs.pandas_parser  # noqa # isort: skip # type: ignore[reportUnusedImport]
+import pandas._libs.pandas_datetime  # noqa # isort: skip # type: ignore[reportUnusedImport]
 from pandas._libs.interval import Interval
 from pandas._libs.tslibs import (
     NaT,

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -20,7 +20,12 @@ from pandas._libs.tslibs.nattype cimport c_NaT as NaT
 from pandas._libs.tslibs.np_datetime cimport (
     NPY_DATETIMEUNIT,
     get_unit_from_dtype,
+    import_pandas_datetime,
 )
+
+import_pandas_datetime()
+
+
 from pandas._libs.tslibs.period cimport is_period_object
 from pandas._libs.tslibs.timedeltas cimport _Timedelta
 from pandas._libs.tslibs.timestamps cimport _Timestamp

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -88,9 +88,11 @@ cdef extern from "numpy/arrayobject.h":
 cdef extern from "numpy/ndarrayobject.h":
     bint PyArray_CheckScalar(obj) nogil
 
-
-cdef extern from "src/parse_helper.h":
+cdef extern from "pd_parser.h":
     int floatify(object, float64_t *result, int *maybe_int) except -1
+    void PandasParser_IMPORT()
+
+PandasParser_IMPORT
 
 from pandas._libs cimport util
 from pandas._libs.util cimport (

diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
@@ -34,8 +34,11 @@ from pandas._libs.tslibs.np_datetime cimport (
     get_datetime64_unit,
     get_datetime64_value,
     get_timedelta64_value,
+    import_pandas_datetime,
 )
 
+import_pandas_datetime()
+
 from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
 
 cdef:

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -229,9 +229,9 @@ cdef extern from "parser/tokenizer.h":
         int64_t skip_first_N_rows
         int64_t skipfooter
         # pick one, depending on whether the converter requires GIL
-        float64_t (*double_converter)(const char *, char **,
-                                      char, char, char,
-                                      int, int *, int *) nogil
+        double (*double_converter)(const char *, char **,
+                                   char, char, char,
+                                   int, int *, int *) nogil
 
         #  error handling
         char *warn_msg
@@ -249,6 +249,16 @@ cdef extern from "parser/tokenizer.h":
         int seen_uint
         int seen_null
 
+    void COLITER_NEXT(coliter_t, const char *) nogil
+
+cdef extern from "pd_parser.h":
+    void *new_rd_source(object obj) except NULL
+
+    int del_rd_source(void *src)
+
+    void* buffer_rd_bytes(void *source, size_t nbytes,
+                          size_t *bytes_read, int *status, const char *encoding_errors)
+
     void uint_state_init(uint_state *self)
     int uint64_conflict(uint_state *self)
 
@@ -279,26 +289,49 @@ cdef extern from "parser/tokenizer.h":
     uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
                            uint64_t uint_max, int *error, char tsep) nogil
 
-    float64_t xstrtod(const char *p, char **q, char decimal,
+    double xstrtod(const char *p, char **q, char decimal,
+                   char sci, char tsep, int skip_trailing,
+                   int *error, int *maybe_int) nogil
+    double precise_xstrtod(const char *p, char **q, char decimal,
+                           char sci, char tsep, int skip_trailing,
+                           int *error, int *maybe_int) nogil
+    double round_trip(const char *p, char **q, char decimal,
                       char sci, char tsep, int skip_trailing,
                       int *error, int *maybe_int) nogil
-    float64_t precise_xstrtod(const char *p, char **q, char decimal,
-                              char sci, char tsep, int skip_trailing,
-                              int *error, int *maybe_int) nogil
-    float64_t round_trip(const char *p, char **q, char decimal,
-                         char sci, char tsep, int skip_trailing,
-                         int *error, int *maybe_int) nogil
 
     int to_boolean(const char *item, uint8_t *val) nogil
 
+    void PandasParser_IMPORT()
 
-cdef extern from "parser/io.h":
-    void *new_rd_source(object obj) except NULL
+PandasParser_IMPORT
 
-    int del_rd_source(void *src)
+# When not invoked directly but rather assigned as a function,
+# cdef extern'ed declarations seem to leave behind an undefined symbol
+cdef double xstrtod_wrapper(const char *p, char **q, char decimal,
+                            char sci, char tsep, int skip_trailing,
+                            int *error, int *maybe_int) nogil:
+    return xstrtod(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int)
 
-    void* buffer_rd_bytes(void *source, size_t nbytes,
-                          size_t *bytes_read, int *status, const char *encoding_errors)
+
+cdef double precise_xstrtod_wrapper(const char *p, char **q, char decimal,
+                                    char sci, char tsep, int skip_trailing,
+                                    int *error, int *maybe_int) nogil:
+    return precise_xstrtod(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int)
+
+
+cdef double round_trip_wrapper(const char *p, char **q, char decimal,
+                               char sci, char tsep, int skip_trailing,
+                               int *error, int *maybe_int) nogil:
+    return round_trip(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int)
+
+
+cdef void* buffer_rd_bytes_wrapper(void *source, size_t nbytes,
+                                   size_t *bytes_read, int *status,
+                                   const char *encoding_errors) noexcept:
+    return buffer_rd_bytes(source, nbytes, bytes_read, status, encoding_errors)
+
+cdef int del_rd_source_wrapper(void *src) noexcept:
+    return del_rd_source(src)
 
 
 cdef class TextReader:
@@ -487,11 +520,11 @@ cdef class TextReader:
 
         if float_precision == "round_trip":
             # see gh-15140
-            self.parser.double_converter = round_trip
+            self.parser.double_converter = round_trip_wrapper
         elif float_precision == "legacy":
-            self.parser.double_converter = xstrtod
+            self.parser.double_converter = xstrtod_wrapper
         elif float_precision == "high" or float_precision is None:
-            self.parser.double_converter = precise_xstrtod
+            self.parser.double_converter = precise_xstrtod_wrapper
         else:
             raise ValueError(f"Unrecognized float_precision option: "
                              f"{float_precision}")
@@ -610,8 +643,8 @@ cdef class TextReader:
 
         ptr = new_rd_source(source)
         self.parser.source = ptr
-        self.parser.cb_io = &buffer_rd_bytes
-        self.parser.cb_cleanup = &del_rd_source
+        self.parser.cb_io = buffer_rd_bytes_wrapper
+        self.parser.cb_cleanup = del_rd_source_wrapper
 
     cdef _get_header(self, list prelim_header):
         # header is now a list of lists, so field_count should use header[0]

diff --git a/pandas/_libs/pd_parser.c b/pandas/_libs/pd_parser.c
@@ -0,0 +1,178 @@
+/*
+
+Copyright (c) 2023, PyData Development Team
+All rights reserved.
+
+Distributed under the terms of the BSD Simplified License.
+
+*/
+#define _PANDAS_PARSER_IMPL
+
+#include "pd_parser.h"
+#include "src/parser/io.h"
+
+static int to_double(char *item, double *p_value, char sci, char decimal,
+                            int *maybe_int) {
+  char *p_end = NULL;
+  int error = 0;
+
+  /* Switch to precise xstrtod GH 31364 */
+  *p_value =
+      precise_xstrtod(item, &p_end, decimal, sci, '\0', 1, &error, maybe_int);
+
+  return (error == 0) && (!*p_end);
+}
+
+static int floatify(PyObject *str, double *result, int *maybe_int) {
+  int status;
+  char *data;
+  PyObject *tmp = NULL;
+  const char sci = 'E';
+  const char dec = '.';
+
+  if (PyBytes_Check(str)) {
+    data = PyBytes_AS_STRING(str);
+  } else if (PyUnicode_Check(str)) {
+    tmp = PyUnicode_AsUTF8String(str);
+    if (tmp == NULL) {
+      return -1;
+    }
+    data = PyBytes_AS_STRING(tmp);
+  } else {
+    PyErr_SetString(PyExc_TypeError, "Invalid object type");
+    return -1;
+  }
+
+  status = to_double(data, result, sci, dec, maybe_int);
+
+  if (!status) {
+    /* handle inf/-inf infinity/-infinity */
+    if (strlen(data) == 3) {
+      if (0 == strcasecmp(data, "inf")) {
+        *result = HUGE_VAL;
+        *maybe_int = 0;
+      } else {
+        goto parsingerror;
+      }
+    } else if (strlen(data) == 4) {
+      if (0 == strcasecmp(data, "-inf")) {
+        *result = -HUGE_VAL;
+        *maybe_int = 0;
+      } else if (0 == strcasecmp(data, "+inf")) {
+        *result = HUGE_VAL;
+        *maybe_int = 0;
+      } else {
+        goto parsingerror;
+      }
+    } else if (strlen(data) == 8) {
+      if (0 == strcasecmp(data, "infinity")) {
+        *result = HUGE_VAL;
+        *maybe_int = 0;
+      } else {
+        goto parsingerror;
+      }
+    } else if (strlen(data) == 9) {
+      if (0 == strcasecmp(data, "-infinity")) {
+        *result = -HUGE_VAL;
+        *maybe_int = 0;
+      } else if (0 == strcasecmp(data, "+infinity")) {
+        *result = HUGE_VAL;
+        *maybe_int = 0;
+      } else {
+        goto parsingerror;
+      }
+    } else {
+      goto parsingerror;
+    }
+  }
+
+  Py_XDECREF(tmp);
+  return 0;
+
+parsingerror:
+  PyErr_Format(PyExc_ValueError, "Unable to parse string \"%s\"", data);
+  Py_XDECREF(tmp);
+  return -1;
+}
+
+
+static void pandas_parser_destructor(PyObject *op) {
+  void *ptr = PyCapsule_GetPointer(op, PandasParser_CAPSULE_NAME);
+  PyMem_Free(ptr);
+}
+
+static int pandas_parser_exec(PyObject *module) {
+  PandasParser_CAPI *capi = PyMem_Malloc(sizeof(PandasParser_CAPI));
+  if (capi == NULL) {
+    PyErr_NoMemory();
+    return -1;
+  }
+
+  capi->to_double = to_double;
+  capi->floatify = floatify;
+  capi->new_rd_source = new_rd_source;
+  capi->del_rd_source = del_rd_source;
+  capi->buffer_rd_bytes = buffer_rd_bytes;
+  capi->uint_state_init = uint_state_init;
+  capi->uint64_conflict = uint64_conflict;
+  capi->coliter_setup = coliter_setup;
+  capi->parser_new = parser_new;
+  capi->parser_init = parser_init;
+  capi->parser_free = parser_free;
+  capi->parser_del = parser_del;
+  capi->parser_add_skiprow = parser_add_skiprow;
+  capi->parser_set_skipfirstnrows = parser_set_skipfirstnrows;
+  capi->parser_set_default_options = parser_set_default_options;
+  capi->parser_consume_rows = parser_consume_rows;
+  capi->parser_trim_buffers = parser_trim_buffers;
+  capi->tokenize_all_rows = tokenize_all_rows;
+  capi->tokenize_nrows = tokenize_nrows;
+  capi->str_to_int64 = str_to_int64;
+  capi->str_to_uint64 = str_to_uint64;
+  capi->xstrtod = xstrtod;
+  capi->precise_xstrtod = precise_xstrtod;
+  capi->round_trip = round_trip;
+  capi->to_boolean = to_boolean;
+
+  PyObject *capsule =
+      PyCapsule_New(capi, PandasParser_CAPSULE_NAME, pandas_parser_destructor);
+  if (capsule == NULL) {
+    PyMem_Free(capi);
+    return -1;
+  }
+
+  // Monkeypatch the top level pandas module to have an attribute for the
+  // C-API. This is required because Python capsules do not support setting
+  // this attribute on anything but the top level package. Ideally not
+  // done when cpython gh-6898 gets implemented
+  PyObject *pandas = PyImport_ImportModule("pandas");
+  if (!pandas) {
+    PyErr_SetString(PyExc_ImportError,
+                    "pd_parser.c could not import module pandas");
+    Py_DECREF(capsule);
+    return -1;
+  }
+
+  if (PyModule_AddObject(pandas, "_pandas_parser_CAPI", capsule) < 0) {
+    Py_DECREF(capsule);
+    return -1;
+  }
+
+  return 0;
+}
+
+static PyModuleDef_Slot pandas_parser_slots[] = {
+    {Py_mod_exec, pandas_parser_exec}, {0, NULL}};
+
+static struct PyModuleDef pandas_parsermodule = {
+    PyModuleDef_HEAD_INIT,
+    .m_name = "pandas._libs.pandas_parser",
+
+    .m_doc = "Internal module with parser support for other extensions",
+    .m_size = 0,
+    .m_methods = NULL,
+    .m_slots = pandas_parser_slots};
+
+PyMODINIT_FUNC PyInit_pandas_parser(void) {
+  return PyModuleDef_Init(&pandas_parsermodule);
+}