-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Use PyCapsule for internal datetime functions #51525
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
431495b
c3dca72
d77a2ed
4525e77
782b971
69abccd
69460bd
776f0f5
c4a05b5
a995ee2
05e28ad
d938374
df687b3
a26f312
96b6f96
7b28333
ccea2b3
2bf7264
418910d
a4f7e1a
ad1d149
5887254
138ea0d
679d03d
7c4e365
5aee18a
a0523be
554d701
726d93d
d2fe542
49a2739
3981ec2
fb75100
f51e7f4
709bf6c
f4dac4f
7fd0a49
12179c7
67f5445
9271ce3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -229,9 +229,9 @@ cdef extern from "parser/tokenizer.h": | |
int64_t skip_first_N_rows | ||
int64_t skipfooter | ||
# pick one, depending on whether the converter requires GIL | ||
float64_t (*double_converter)(const char *, char **, | ||
char, char, char, | ||
int, int *, int *) nogil | ||
double (*double_converter)(const char *, char **, | ||
char, char, char, | ||
int, int *, int *) nogil | ||
|
||
# error handling | ||
char *warn_msg | ||
|
@@ -249,6 +249,16 @@ cdef extern from "parser/tokenizer.h": | |
int seen_uint | ||
int seen_null | ||
|
||
void COLITER_NEXT(coliter_t, const char *) nogil | ||
|
||
cdef extern from "pd_parser.h": | ||
void *new_rd_source(object obj) except NULL | ||
|
||
int del_rd_source(void *src) | ||
|
||
void* buffer_rd_bytes(void *source, size_t nbytes, | ||
size_t *bytes_read, int *status, const char *encoding_errors) | ||
|
||
void uint_state_init(uint_state *self) | ||
int uint64_conflict(uint_state *self) | ||
|
||
|
@@ -279,26 +289,49 @@ cdef extern from "parser/tokenizer.h": | |
uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max, | ||
uint64_t uint_max, int *error, char tsep) nogil | ||
|
||
float64_t xstrtod(const char *p, char **q, char decimal, | ||
double xstrtod(const char *p, char **q, char decimal, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. similar comment as above on float64_t -> double change in declaration |
||
char sci, char tsep, int skip_trailing, | ||
int *error, int *maybe_int) nogil | ||
double precise_xstrtod(const char *p, char **q, char decimal, | ||
char sci, char tsep, int skip_trailing, | ||
int *error, int *maybe_int) nogil | ||
double round_trip(const char *p, char **q, char decimal, | ||
char sci, char tsep, int skip_trailing, | ||
int *error, int *maybe_int) nogil | ||
float64_t precise_xstrtod(const char *p, char **q, char decimal, | ||
char sci, char tsep, int skip_trailing, | ||
int *error, int *maybe_int) nogil | ||
float64_t round_trip(const char *p, char **q, char decimal, | ||
char sci, char tsep, int skip_trailing, | ||
int *error, int *maybe_int) nogil | ||
|
||
int to_boolean(const char *item, uint8_t *val) nogil | ||
|
||
void PandasParser_IMPORT() | ||
|
||
cdef extern from "parser/io.h": | ||
void *new_rd_source(object obj) except NULL | ||
PandasParser_IMPORT | ||
|
||
int del_rd_source(void *src) | ||
# When not invoked directly but rather assigned as a function, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would be nice to not have to do this. Working on an MRE for Cython upstream |
||
# cdef extern'ed declarations seem to leave behind an undefined symbol | ||
cdef double xstrtod_wrapper(const char *p, char **q, char decimal, | ||
char sci, char tsep, int skip_trailing, | ||
int *error, int *maybe_int) nogil: | ||
return xstrtod(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int) | ||
|
||
void* buffer_rd_bytes(void *source, size_t nbytes, | ||
size_t *bytes_read, int *status, const char *encoding_errors) | ||
|
||
cdef double precise_xstrtod_wrapper(const char *p, char **q, char decimal, | ||
char sci, char tsep, int skip_trailing, | ||
int *error, int *maybe_int) nogil: | ||
return precise_xstrtod(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int) | ||
|
||
|
||
cdef double round_trip_wrapper(const char *p, char **q, char decimal, | ||
char sci, char tsep, int skip_trailing, | ||
int *error, int *maybe_int) nogil: | ||
return round_trip(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int) | ||
|
||
|
||
cdef void* buffer_rd_bytes_wrapper(void *source, size_t nbytes, | ||
size_t *bytes_read, int *status, | ||
const char *encoding_errors) noexcept: | ||
return buffer_rd_bytes(source, nbytes, bytes_read, status, encoding_errors) | ||
|
||
cdef int del_rd_source_wrapper(void *src) noexcept: | ||
return del_rd_source(src) | ||
|
||
|
||
cdef class TextReader: | ||
|
@@ -487,11 +520,11 @@ cdef class TextReader: | |
|
||
if float_precision == "round_trip": | ||
# see gh-15140 | ||
self.parser.double_converter = round_trip | ||
self.parser.double_converter = round_trip_wrapper | ||
elif float_precision == "legacy": | ||
self.parser.double_converter = xstrtod | ||
self.parser.double_converter = xstrtod_wrapper | ||
elif float_precision == "high" or float_precision is None: | ||
self.parser.double_converter = precise_xstrtod | ||
self.parser.double_converter = precise_xstrtod_wrapper | ||
else: | ||
raise ValueError(f"Unrecognized float_precision option: " | ||
f"{float_precision}") | ||
|
@@ -610,8 +643,8 @@ cdef class TextReader: | |
|
||
ptr = new_rd_source(source) | ||
self.parser.source = ptr | ||
self.parser.cb_io = &buffer_rd_bytes | ||
self.parser.cb_cleanup = &del_rd_source | ||
self.parser.cb_io = buffer_rd_bytes_wrapper | ||
self.parser.cb_cleanup = del_rd_source_wrapper | ||
|
||
cdef _get_header(self, list prelim_header): | ||
# header is now a list of lists, so field_count should use header[0] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
/* | ||
|
||
Copyright (c) 2023, PyData Development Team | ||
All rights reserved. | ||
|
||
Distributed under the terms of the BSD Simplified License. | ||
|
||
*/ | ||
#define _PANDAS_PARSER_IMPL | ||
|
||
#include "pd_parser.h" | ||
#include "src/parser/io.h" | ||
|
||
static int to_double(char *item, double *p_value, char sci, char decimal, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This and floatify were moved from |
||
int *maybe_int) { | ||
char *p_end = NULL; | ||
int error = 0; | ||
|
||
/* Switch to precise xstrtod GH 31364 */ | ||
*p_value = | ||
precise_xstrtod(item, &p_end, decimal, sci, '\0', 1, &error, maybe_int); | ||
|
||
return (error == 0) && (!*p_end); | ||
} | ||
|
||
static int floatify(PyObject *str, double *result, int *maybe_int) { | ||
int status; | ||
char *data; | ||
PyObject *tmp = NULL; | ||
const char sci = 'E'; | ||
const char dec = '.'; | ||
|
||
if (PyBytes_Check(str)) { | ||
data = PyBytes_AS_STRING(str); | ||
} else if (PyUnicode_Check(str)) { | ||
tmp = PyUnicode_AsUTF8String(str); | ||
if (tmp == NULL) { | ||
return -1; | ||
} | ||
data = PyBytes_AS_STRING(tmp); | ||
} else { | ||
PyErr_SetString(PyExc_TypeError, "Invalid object type"); | ||
return -1; | ||
} | ||
|
||
status = to_double(data, result, sci, dec, maybe_int); | ||
|
||
if (!status) { | ||
/* handle inf/-inf infinity/-infinity */ | ||
if (strlen(data) == 3) { | ||
if (0 == strcasecmp(data, "inf")) { | ||
*result = HUGE_VAL; | ||
*maybe_int = 0; | ||
} else { | ||
goto parsingerror; | ||
} | ||
} else if (strlen(data) == 4) { | ||
if (0 == strcasecmp(data, "-inf")) { | ||
*result = -HUGE_VAL; | ||
*maybe_int = 0; | ||
} else if (0 == strcasecmp(data, "+inf")) { | ||
*result = HUGE_VAL; | ||
*maybe_int = 0; | ||
} else { | ||
goto parsingerror; | ||
} | ||
} else if (strlen(data) == 8) { | ||
if (0 == strcasecmp(data, "infinity")) { | ||
*result = HUGE_VAL; | ||
*maybe_int = 0; | ||
} else { | ||
goto parsingerror; | ||
} | ||
} else if (strlen(data) == 9) { | ||
if (0 == strcasecmp(data, "-infinity")) { | ||
*result = -HUGE_VAL; | ||
*maybe_int = 0; | ||
} else if (0 == strcasecmp(data, "+infinity")) { | ||
*result = HUGE_VAL; | ||
*maybe_int = 0; | ||
} else { | ||
goto parsingerror; | ||
} | ||
} else { | ||
goto parsingerror; | ||
} | ||
} | ||
|
||
Py_XDECREF(tmp); | ||
return 0; | ||
|
||
parsingerror: | ||
PyErr_Format(PyExc_ValueError, "Unable to parse string \"%s\"", data); | ||
Py_XDECREF(tmp); | ||
return -1; | ||
} | ||
|
||
|
||
static void pandas_parser_destructor(PyObject *op) { | ||
void *ptr = PyCapsule_GetPointer(op, PandasParser_CAPSULE_NAME); | ||
PyMem_Free(ptr); | ||
} | ||
|
||
static int pandas_parser_exec(PyObject *module) { | ||
PandasParser_CAPI *capi = PyMem_Malloc(sizeof(PandasParser_CAPI)); | ||
if (capi == NULL) { | ||
PyErr_NoMemory(); | ||
return -1; | ||
} | ||
|
||
capi->to_double = to_double; | ||
capi->floatify = floatify; | ||
capi->new_rd_source = new_rd_source; | ||
capi->del_rd_source = del_rd_source; | ||
capi->buffer_rd_bytes = buffer_rd_bytes; | ||
capi->uint_state_init = uint_state_init; | ||
capi->uint64_conflict = uint64_conflict; | ||
capi->coliter_setup = coliter_setup; | ||
capi->parser_new = parser_new; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it possible to separate actual parsing stuff from string to numeric parsing more? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea that's a great question. The vast majority of these are used in parsers.pyx; we could implement different capsules for different purposes. It probably would be paired well with a refactor of that Cython module. It is hard to tell from the history of parsers.pyx what was placed there as a matter of code organization versus tricks to prevent the build system from getting too heavy handed |
||
capi->parser_init = parser_init; | ||
capi->parser_free = parser_free; | ||
capi->parser_del = parser_del; | ||
capi->parser_add_skiprow = parser_add_skiprow; | ||
capi->parser_set_skipfirstnrows = parser_set_skipfirstnrows; | ||
capi->parser_set_default_options = parser_set_default_options; | ||
capi->parser_consume_rows = parser_consume_rows; | ||
capi->parser_trim_buffers = parser_trim_buffers; | ||
capi->tokenize_all_rows = tokenize_all_rows; | ||
capi->tokenize_nrows = tokenize_nrows; | ||
capi->str_to_int64 = str_to_int64; | ||
capi->str_to_uint64 = str_to_uint64; | ||
capi->xstrtod = xstrtod; | ||
capi->precise_xstrtod = precise_xstrtod; | ||
capi->round_trip = round_trip; | ||
capi->to_boolean = to_boolean; | ||
|
||
PyObject *capsule = | ||
PyCapsule_New(capi, PandasParser_CAPSULE_NAME, pandas_parser_destructor); | ||
if (capsule == NULL) { | ||
PyMem_Free(capi); | ||
return -1; | ||
} | ||
|
||
// Monkeypatch the top level pandas module to have an attribute for the | ||
// C-API. This is required because Python capsules do not support setting | ||
// this attribute on anything but the top level package. Ideally not | ||
// done when cpython gh-6898 gets implemented | ||
PyObject *pandas = PyImport_ImportModule("pandas"); | ||
if (!pandas) { | ||
PyErr_SetString(PyExc_ImportError, | ||
"pd_parser.c could not import module pandas"); | ||
Py_DECREF(capsule); | ||
return -1; | ||
} | ||
|
||
if (PyModule_AddObject(pandas, "_pandas_parser_CAPI", capsule) < 0) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm interested why the parser needs to have this exposed to Python (I can't see to ctrl+F anything useful out of the diff). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is just a requirement of the CPython implementation: https://docs.python.org/3/extending/extending.html#providing-a-c-api-for-an-extension-module Ideally we wouldn't add this to the pandas namespace and attach it solely to the extension module itself. From what I could see in the CPython source though I don't think that would be possible without python/cpython#6898 |
||
Py_DECREF(capsule); | ||
return -1; | ||
} | ||
|
||
return 0; | ||
} | ||
|
||
static PyModuleDef_Slot pandas_parser_slots[] = { | ||
{Py_mod_exec, pandas_parser_exec}, {0, NULL}}; | ||
|
||
static struct PyModuleDef pandas_parsermodule = { | ||
PyModuleDef_HEAD_INIT, | ||
.m_name = "pandas._libs.pandas_parser", | ||
|
||
.m_doc = "Internal module with parser support for other extensions", | ||
.m_size = 0, | ||
.m_methods = NULL, | ||
.m_slots = pandas_parser_slots}; | ||
|
||
PyMODINIT_FUNC PyInit_pandas_parser(void) { | ||
return PyModuleDef_Init(&pandas_parsermodule); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This function actually returns a double; not sure why Cython was OK with the previous declaration but had to explictly change it as part of this PR for things to cythonize