Skip to content

Use PyCapsule for internal datetime functions #51525

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 40 commits into from
Mar 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
431495b
initial file setup and build
WillAyd Feb 18, 2023
c3dca72
Building standalone impl
WillAyd Feb 18, 2023
d77a2ed
more build
WillAyd Feb 18, 2023
4525e77
linting
WillAyd Feb 18, 2023
782b971
more updates
WillAyd Feb 18, 2023
69abccd
Capsule location change
WillAyd Feb 19, 2023
69460bd
working import
WillAyd Feb 20, 2023
776f0f5
working imports?
WillAyd Feb 20, 2023
c4a05b5
moved lots
WillAyd Feb 20, 2023
a995ee2
Working impl
WillAyd Feb 21, 2023
05e28ad
styling
WillAyd Feb 21, 2023
d938374
import cleanups
WillAyd Feb 21, 2023
df687b3
revert init change
WillAyd Feb 21, 2023
a26f312
cleanups
WillAyd Feb 21, 2023
96b6f96
isort fixups
WillAyd Feb 21, 2023
7b28333
api test fix
WillAyd Feb 21, 2023
ccea2b3
removed unneeded tokenizer add
WillAyd Feb 21, 2023
2bf7264
Hacked together parser capsule
WillAyd Feb 26, 2023
418910d
Symbol cleanups
WillAyd Feb 26, 2023
a4f7e1a
Resolved all undefined symbols in parsers.pyx
WillAyd Feb 26, 2023
ad1d149
IO callbacks restored
WillAyd Feb 26, 2023
5887254
Fix build and test failures
WillAyd Feb 26, 2023
138ea0d
Try relative imports for MSFT compat
WillAyd Feb 26, 2023
679d03d
try py_ssize_t_clean macro
WillAyd Feb 26, 2023
7c4e365
Removed double tokenizer include
WillAyd Feb 26, 2023
5aee18a
removed unneeded include path
WillAyd Feb 26, 2023
a0523be
more cleanups
WillAyd Feb 27, 2023
554d701
noexcept
WillAyd Feb 27, 2023
726d93d
signature cleanup
WillAyd Feb 27, 2023
d2fe542
simplify parser impl
WillAyd Feb 28, 2023
49a2739
retain np_datetime_string license
WillAyd Feb 28, 2023
3981ec2
retained old file structure where possible
WillAyd Feb 28, 2023
fb75100
Commited C file
WillAyd Feb 28, 2023
f51e7f4
Removed erroneous comments
WillAyd Mar 2, 2023
709bf6c
Merge branch 'main' into np-datetime-capsule
WillAyd Mar 2, 2023
f4dac4f
graft pd_parser.c
WillAyd Mar 6, 2023
7fd0a49
Merge branch 'main' into np-datetime-capsule
WillAyd Mar 6, 2023
12179c7
graft -> include
WillAyd Mar 6, 2023
67f5445
Merge remote-tracking branch 'upstream/main' into np-datetime-capsule
WillAyd Mar 9, 2023
9271ce3
Merge branch 'main' into np-datetime-capsule
WillAyd Mar 11, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,5 @@ prune pandas/tests/io/parser/data
# Selectively re-add *.cxx files that were excluded above
graft pandas/_libs/src
graft pandas/_libs/tslibs/src
include pandas/_libs/pd_parser.h
include pandas/_libs/pd_parser.c
5 changes: 5 additions & 0 deletions pandas/_libs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@
]


# The imports below need to happen first to ensure pandas top level
# module gets monkeypatched with the pandas_datetime_CAPI
# see pandas_datetime_exec in pd_datetime.c
import pandas._libs.pandas_parser # noqa # isort: skip # type: ignore[reportUnusedImport]
import pandas._libs.pandas_datetime # noqa # isort: skip # type: ignore[reportUnusedImport]
from pandas._libs.interval import Interval
from pandas._libs.tslibs import (
NaT,
Expand Down
5 changes: 5 additions & 0 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@ from pandas._libs.tslibs.nattype cimport c_NaT as NaT
from pandas._libs.tslibs.np_datetime cimport (
NPY_DATETIMEUNIT,
get_unit_from_dtype,
import_pandas_datetime,
)

import_pandas_datetime()


from pandas._libs.tslibs.period cimport is_period_object
from pandas._libs.tslibs.timedeltas cimport _Timedelta
from pandas._libs.tslibs.timestamps cimport _Timestamp
Expand Down
6 changes: 4 additions & 2 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,11 @@ cdef extern from "numpy/arrayobject.h":
cdef extern from "numpy/ndarrayobject.h":
bint PyArray_CheckScalar(obj) nogil


cdef extern from "src/parse_helper.h":
cdef extern from "pd_parser.h":
int floatify(object, float64_t *result, int *maybe_int) except -1
void PandasParser_IMPORT()

PandasParser_IMPORT

from pandas._libs cimport util
from pandas._libs.util cimport (
Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/missing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,11 @@ from pandas._libs.tslibs.np_datetime cimport (
get_datetime64_unit,
get_datetime64_value,
get_timedelta64_value,
import_pandas_datetime,
)

import_pandas_datetime()

from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op

cdef:
Expand Down
73 changes: 53 additions & 20 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -229,9 +229,9 @@ cdef extern from "parser/tokenizer.h":
int64_t skip_first_N_rows
int64_t skipfooter
# pick one, depending on whether the converter requires GIL
float64_t (*double_converter)(const char *, char **,
char, char, char,
int, int *, int *) nogil
double (*double_converter)(const char *, char **,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function actually returns a double; not sure why Cython was OK with the previous declaration, but I had to explicitly change it as part of this PR for things to cythonize

char, char, char,
int, int *, int *) nogil

# error handling
char *warn_msg
Expand All @@ -249,6 +249,16 @@ cdef extern from "parser/tokenizer.h":
int seen_uint
int seen_null

void COLITER_NEXT(coliter_t, const char *) nogil

cdef extern from "pd_parser.h":
void *new_rd_source(object obj) except NULL

int del_rd_source(void *src)

void* buffer_rd_bytes(void *source, size_t nbytes,
size_t *bytes_read, int *status, const char *encoding_errors)

void uint_state_init(uint_state *self)
int uint64_conflict(uint_state *self)

Expand Down Expand Up @@ -279,26 +289,49 @@ cdef extern from "parser/tokenizer.h":
uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
uint64_t uint_max, int *error, char tsep) nogil

float64_t xstrtod(const char *p, char **q, char decimal,
double xstrtod(const char *p, char **q, char decimal,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

similar comment as above on float64_t -> double change in declaration

char sci, char tsep, int skip_trailing,
int *error, int *maybe_int) nogil
double precise_xstrtod(const char *p, char **q, char decimal,
char sci, char tsep, int skip_trailing,
int *error, int *maybe_int) nogil
double round_trip(const char *p, char **q, char decimal,
char sci, char tsep, int skip_trailing,
int *error, int *maybe_int) nogil
float64_t precise_xstrtod(const char *p, char **q, char decimal,
char sci, char tsep, int skip_trailing,
int *error, int *maybe_int) nogil
float64_t round_trip(const char *p, char **q, char decimal,
char sci, char tsep, int skip_trailing,
int *error, int *maybe_int) nogil

int to_boolean(const char *item, uint8_t *val) nogil

void PandasParser_IMPORT()

cdef extern from "parser/io.h":
void *new_rd_source(object obj) except NULL
PandasParser_IMPORT

int del_rd_source(void *src)
# When not invoked directly but rather assigned as a function,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be nice to not have to do this. Working on an MRE for Cython upstream

# cdef extern'ed declarations seem to leave behind an undefined symbol
# Thin nogil forwarder to the extern-declared xstrtod: every argument is
# passed through unchanged and the double result is returned as-is.
# TextReader assigns this wrapper (not xstrtod itself) to
# parser.double_converter when float_precision == "legacy".
cdef double xstrtod_wrapper(const char *p, char **q, char decimal,
char sci, char tsep, int skip_trailing,
int *error, int *maybe_int) nogil:
return xstrtod(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int)

void* buffer_rd_bytes(void *source, size_t nbytes,
size_t *bytes_read, int *status, const char *encoding_errors)

# Thin nogil forwarder to the extern-declared precise_xstrtod: every argument
# is passed through unchanged and the double result is returned as-is.
# TextReader assigns this wrapper to parser.double_converter when
# float_precision is "high" or None.
cdef double precise_xstrtod_wrapper(const char *p, char **q, char decimal,
char sci, char tsep, int skip_trailing,
int *error, int *maybe_int) nogil:
return precise_xstrtod(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int)


# Thin nogil forwarder to the extern-declared round_trip: every argument is
# passed through unchanged and the double result is returned as-is.
# TextReader assigns this wrapper to parser.double_converter when
# float_precision == "round_trip" (see gh-15140).
cdef double round_trip_wrapper(const char *p, char **q, char decimal,
char sci, char tsep, int skip_trailing,
int *error, int *maybe_int) nogil:
return round_trip(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int)


# Forwarder to the extern-declared buffer_rd_bytes, declared noexcept.
# All arguments are passed through unchanged and the returned pointer is
# handed back as-is. TextReader installs this wrapper as parser.cb_io.
cdef void* buffer_rd_bytes_wrapper(void *source, size_t nbytes,
size_t *bytes_read, int *status,
const char *encoding_errors) noexcept:
return buffer_rd_bytes(source, nbytes, bytes_read, status, encoding_errors)

# Forwarder to the extern-declared del_rd_source, declared noexcept; returns
# its int result unchanged. TextReader installs this wrapper as
# parser.cb_cleanup.
cdef int del_rd_source_wrapper(void *src) noexcept:
return del_rd_source(src)


cdef class TextReader:
Expand Down Expand Up @@ -487,11 +520,11 @@ cdef class TextReader:

if float_precision == "round_trip":
# see gh-15140
self.parser.double_converter = round_trip
self.parser.double_converter = round_trip_wrapper
elif float_precision == "legacy":
self.parser.double_converter = xstrtod
self.parser.double_converter = xstrtod_wrapper
elif float_precision == "high" or float_precision is None:
self.parser.double_converter = precise_xstrtod
self.parser.double_converter = precise_xstrtod_wrapper
else:
raise ValueError(f"Unrecognized float_precision option: "
f"{float_precision}")
Expand Down Expand Up @@ -610,8 +643,8 @@ cdef class TextReader:

ptr = new_rd_source(source)
self.parser.source = ptr
self.parser.cb_io = &buffer_rd_bytes
self.parser.cb_cleanup = &del_rd_source
self.parser.cb_io = buffer_rd_bytes_wrapper
self.parser.cb_cleanup = del_rd_source_wrapper

cdef _get_header(self, list prelim_header):
# header is now a list of lists, so field_count should use header[0]
Expand Down
178 changes: 178 additions & 0 deletions pandas/_libs/pd_parser.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/*

Copyright (c) 2023, PyData Development Team
All rights reserved.

Distributed under the terms of the BSD Simplified License.

*/
#define _PANDAS_PARSER_IMPL

#include "pd_parser.h"
#include "src/parser/io.h"

static int to_double(char *item, double *p_value, char sci, char decimal,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This and floatify were moved from parse_helper.h - they seem to fit more naturally in a module than in a header

int *maybe_int) {
char *p_end = NULL;
int error = 0;

/* Switch to precise xstrtod GH 31364 */
*p_value =
precise_xstrtod(item, &p_end, decimal, sci, '\0', 1, &error, maybe_int);

return (error == 0) && (!*p_end);
}

/*
 * Parse a Python bytes or str object as a double.
 *
 * str       - bytes object, or str object (encoded to UTF-8 before parsing).
 * result    - receives the parsed value.
 * maybe_int - forwarded to to_double(); set to 0 when the input is one of
 *             the accepted infinity spellings.
 *
 * Returns 0 on success. Returns -1 with a Python exception set when the
 * object is neither bytes nor str (TypeError), when UTF-8 encoding fails,
 * or when the text cannot be parsed (ValueError).
 */
static int floatify(PyObject *str, double *result, int *maybe_int) {
  /* Spellings of infinity (matched case-insensitively) that are accepted
     when to_double() rejects the text, with the sign of the result. */
  static const struct {
    const char *spelling;
    int negative;
  } infinities[] = {
      {"inf", 0},      {"-inf", 1},      {"+inf", 0},
      {"infinity", 0}, {"-infinity", 1}, {"+infinity", 0},
  };
  const size_t n_infinities = sizeof(infinities) / sizeof(infinities[0]);
  PyObject *utf8 = NULL; /* owned only when the input was a str object */
  char *text;
  size_t i;

  if (PyBytes_Check(str)) {
    text = PyBytes_AS_STRING(str);
  } else if (PyUnicode_Check(str)) {
    utf8 = PyUnicode_AsUTF8String(str);
    if (utf8 == NULL) {
      return -1;
    }
    text = PyBytes_AS_STRING(utf8);
  } else {
    PyErr_SetString(PyExc_TypeError, "Invalid object type");
    return -1;
  }

  /* Regular numeric parse with 'E' as exponent marker and '.' as decimal. */
  if (to_double(text, result, 'E', '.', maybe_int)) {
    Py_XDECREF(utf8);
    return 0;
  }

  /* handle inf/-inf infinity/-infinity */
  for (i = 0; i < n_infinities; i++) {
    if (strcasecmp(text, infinities[i].spelling) == 0) {
      *result = infinities[i].negative ? -HUGE_VAL : HUGE_VAL;
      *maybe_int = 0;
      Py_XDECREF(utf8);
      return 0;
    }
  }

  /* text may point into utf8, so format the message before releasing it. */
  PyErr_Format(PyExc_ValueError, "Unable to parse string \"%s\"", text);
  Py_XDECREF(utf8);
  return -1;
}


/*
 * Capsule destructor: looks up the pointer stored in the capsule under
 * PandasParser_CAPSULE_NAME and releases it with PyMem_Free.
 */
static void pandas_parser_destructor(PyObject *op) {
  PyMem_Free(PyCapsule_GetPointer(op, PandasParser_CAPSULE_NAME));
}

static int pandas_parser_exec(PyObject *module) {
PandasParser_CAPI *capi = PyMem_Malloc(sizeof(PandasParser_CAPI));
if (capi == NULL) {
PyErr_NoMemory();
return -1;
}

capi->to_double = to_double;
capi->floatify = floatify;
capi->new_rd_source = new_rd_source;
capi->del_rd_source = del_rd_source;
capi->buffer_rd_bytes = buffer_rd_bytes;
capi->uint_state_init = uint_state_init;
capi->uint64_conflict = uint64_conflict;
capi->coliter_setup = coliter_setup;
capi->parser_new = parser_new;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to separate actual parsing stuff from string to numeric parsing more?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea that's a great question. The vast majority of these are used in parsers.pyx; we could implement different capsules for different purposes. It probably would be paired well with a refactor of that Cython module.

It is hard to tell from the history of parsers.pyx what was placed there as a matter of code organization versus tricks to prevent the build system from getting too heavy handed

capi->parser_init = parser_init;
capi->parser_free = parser_free;
capi->parser_del = parser_del;
capi->parser_add_skiprow = parser_add_skiprow;
capi->parser_set_skipfirstnrows = parser_set_skipfirstnrows;
capi->parser_set_default_options = parser_set_default_options;
capi->parser_consume_rows = parser_consume_rows;
capi->parser_trim_buffers = parser_trim_buffers;
capi->tokenize_all_rows = tokenize_all_rows;
capi->tokenize_nrows = tokenize_nrows;
capi->str_to_int64 = str_to_int64;
capi->str_to_uint64 = str_to_uint64;
capi->xstrtod = xstrtod;
capi->precise_xstrtod = precise_xstrtod;
capi->round_trip = round_trip;
capi->to_boolean = to_boolean;

PyObject *capsule =
PyCapsule_New(capi, PandasParser_CAPSULE_NAME, pandas_parser_destructor);
if (capsule == NULL) {
PyMem_Free(capi);
return -1;
}

// Monkeypatch the top level pandas module to have an attribute for the
// C-API. This is required because Python capsules do not support setting
// this attribute on anything but the top level package. Ideally not
// done when cpython gh-6898 gets implemented
PyObject *pandas = PyImport_ImportModule("pandas");
if (!pandas) {
PyErr_SetString(PyExc_ImportError,
"pd_parser.c could not import module pandas");
Py_DECREF(capsule);
return -1;
}

if (PyModule_AddObject(pandas, "_pandas_parser_CAPI", capsule) < 0) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm interested why the parser needs to have this exposed to Python (I can't seem to ctrl+F anything useful out of the diff).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is just a requirement of the CPython implementation:

https://docs.python.org/3/extending/extending.html#providing-a-c-api-for-an-extension-module

Ideally we wouldn't add this to the pandas namespace and attach it solely to the extension module itself. From what I could see in the CPython source though I don't think that would be possible without python/cpython#6898

Py_DECREF(capsule);
return -1;
}

return 0;
}

/* Module slots: registers pandas_parser_exec under Py_mod_exec; the
   {0, NULL} entry terminates the array. */
static PyModuleDef_Slot pandas_parser_slots[] = {
{Py_mod_exec, pandas_parser_exec}, {0, NULL}};

/* Definition of the pandas._libs.pandas_parser extension module. It declares
   no methods (m_methods is NULL) and no per-module state (m_size is 0); its
   slots are supplied by pandas_parser_slots. */
static struct PyModuleDef pandas_parsermodule = {
PyModuleDef_HEAD_INIT,
.m_name = "pandas._libs.pandas_parser",

.m_doc = "Internal module with parser support for other extensions",
.m_size = 0,
.m_methods = NULL,
.m_slots = pandas_parser_slots};

/* Import entry point for the pandas_parser extension: returns the result of
   PyModuleDef_Init on the module definition. */
PyMODINIT_FUNC PyInit_pandas_parser(void) {
return PyModuleDef_Init(&pandas_parsermodule);
}
Loading