diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b97d0f5232f1e..703d6ede42583 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -565,7 +565,7 @@ I/O - Improved error message in :func:`read_excel` by including the offending sheet name when an exception is raised while reading a file (:issue:`48706`) - Bug when a pickling a subset PyArrow-backed data that would serialize the entire data instead of the subset (:issue:`42600`) - Bug in :func:`read_csv` for a single-line csv with fewer columns than ``names`` raised :class:`.errors.ParserError` with ``engine="c"`` (:issue:`47566`) -- +- Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`) Period ^^^^^^ diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 260f1ffb6165f..591dff72e3872 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -50,19 +50,18 @@ Numeric decoder derived from TCL library #include "date_conversions.h" #include "datetime.h" -static PyTypeObject *type_decimal; -static PyTypeObject *cls_dataframe; -static PyTypeObject *cls_series; -static PyTypeObject *cls_index; -static PyTypeObject *cls_nat; -static PyTypeObject *cls_na; -PyObject *cls_timedelta; - npy_int64 get_nat(void) { return NPY_MIN_INT64; } typedef char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti, size_t *_outLen); +int object_is_decimal_type(PyObject *obj); +int object_is_dataframe_type(PyObject *obj); +int object_is_series_type(PyObject *obj); +int object_is_index_type(PyObject *obj); +int object_is_nat_type(PyObject *obj); +int object_is_na_type(PyObject *obj); + typedef struct __NpyArrContext { PyObject *array; char *dataptr; @@ -146,44 +145,6 @@ enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; int PdBlock_iterNext(JSOBJ, JSONTypeContext *); -void *initObjToJSON(void) { - PyObject *mod_pandas; - 
PyObject *mod_nattype; - PyObject *mod_natype; - PyObject *mod_decimal = PyImport_ImportModule("decimal"); - type_decimal = - (PyTypeObject *)PyObject_GetAttrString(mod_decimal, "Decimal"); - Py_DECREF(mod_decimal); - - PyDateTime_IMPORT; - - mod_pandas = PyImport_ImportModule("pandas"); - if (mod_pandas) { - cls_dataframe = - (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "DataFrame"); - cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index"); - cls_series = - (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); - Py_DECREF(mod_pandas); - } - - mod_nattype = PyImport_ImportModule("pandas._libs.tslibs.nattype"); - if (mod_nattype) { - cls_nat = - (PyTypeObject *)PyObject_GetAttrString(mod_nattype, "NaTType"); - Py_DECREF(mod_nattype); - } - - mod_natype = PyImport_ImportModule("pandas._libs.missing"); - if (mod_natype) { - cls_na = (PyTypeObject *)PyObject_GetAttrString(mod_natype, "NAType"); - Py_DECREF(mod_natype); - } - - // GH 31463 - return NULL; -} - static TypeContext *createTypeContext(void) { TypeContext *pc; @@ -216,8 +177,7 @@ static TypeContext *createTypeContext(void) { static PyObject *get_values(PyObject *obj) { PyObject *values = NULL; - if (PyObject_TypeCheck(obj, cls_index) || - PyObject_TypeCheck(obj, cls_series)) { + if (object_is_index_type(obj) || object_is_series_type(obj)) { // The special cases to worry about are dt64tz and category[dt64tz]. // In both cases we want the UTC-localized datetime64 ndarray, // without going through and object array of Timestamps. 
@@ -1510,12 +1470,12 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->PyTypeToUTF8 = PyUnicodeToUTF8; tc->type = JT_UTF8; return; - } else if (PyObject_TypeCheck(obj, type_decimal)) { + } else if (object_is_decimal_type(obj)) { GET_TC(tc)->doubleValue = PyFloat_AsDouble(obj); tc->type = JT_DOUBLE; return; } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { - if (PyObject_TypeCheck(obj, cls_nat)) { + if (object_is_nat_type(obj)) { tc->type = JT_NULL; return; } @@ -1606,14 +1566,14 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { "%R (0d array) is not JSON serializable at the moment", obj); goto INVALID; - } else if (PyObject_TypeCheck(obj, cls_na)) { + } else if (object_is_na_type(obj)) { tc->type = JT_NULL; return; } ISITERABLE: - if (PyObject_TypeCheck(obj, cls_index)) { + if (object_is_index_type(obj)) { if (enc->outputFormat == SPLIT) { tc->type = JT_OBJECT; pc->iterBegin = Index_iterBegin; @@ -1637,7 +1597,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } return; - } else if (PyObject_TypeCheck(obj, cls_series)) { + } else if (object_is_series_type(obj)) { if (enc->outputFormat == SPLIT) { tc->type = JT_OBJECT; pc->iterBegin = Series_iterBegin; @@ -1701,7 +1661,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->iterGetValue = NpyArr_iterGetValue; pc->iterGetName = NpyArr_iterGetName; return; - } else if (PyObject_TypeCheck(obj, cls_dataframe)) { + } else if (object_is_dataframe_type(obj)) { if (enc->blkCtxtPassthru) { pc->pdblock = enc->blkCtxtPassthru; tc->type = @@ -1969,6 +1929,11 @@ char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, PyObject *kwargs) { + PyDateTime_IMPORT; + if (PyDateTimeAPI == NULL) { + return NULL; + } + static char *kwlist[] = {"obj", "ensure_ascii", "double_precision", diff --git a/pandas/_libs/src/ujson/python/ujson.c b/pandas/_libs/src/ujson/python/ujson.c index 
5d4a5693c0ff6..c12f88d2f9354 100644 --- a/pandas/_libs/src/ujson/python/ujson.c +++ b/pandas/_libs/src/ujson/python/ujson.c @@ -67,15 +67,385 @@ static PyMethodDef ujsonMethods[] = { {NULL, NULL, 0, NULL} /* Sentinel */ }; -static PyModuleDef moduledef = { - .m_base = PyModuleDef_HEAD_INIT, - .m_name = "_libjson", - .m_methods = ujsonMethods -}; +typedef struct { + PyObject *type_decimal; + PyObject *type_dataframe; + PyObject *type_series; + PyObject *type_index; + PyObject *type_nat; + PyObject *type_na; +} modulestate; + +#define modulestate(o) ((modulestate *)PyModule_GetState(o)) + +static int module_traverse(PyObject *m, visitproc visit, void *arg); +static int module_clear(PyObject *m); +static void module_free(void *module); + +static struct PyModuleDef moduledef = {.m_base = PyModuleDef_HEAD_INIT, + .m_name = "_libjson", + .m_methods = ujsonMethods, + .m_size = sizeof(modulestate), + .m_traverse = module_traverse, + .m_clear = module_clear, + .m_free = module_free}; + +#ifndef PYPY_VERSION +/* Used in objToJSON.c */ +int object_is_decimal_type(PyObject *obj) { + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_decimal = state->type_decimal; + if (type_decimal == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_decimal); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; +} +int object_is_dataframe_type(PyObject *obj) { + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_dataframe = state->type_dataframe; + if (type_dataframe == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_dataframe); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; +} + +int object_is_series_type(PyObject *obj) { + 
PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_series = state->type_series; + if (type_series == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_series); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; +} + +int object_is_index_type(PyObject *obj) { + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_index = state->type_index; + if (type_index == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_index); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; +} + +int object_is_nat_type(PyObject *obj) { + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_nat = state->type_nat; + if (type_nat == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_nat); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; +} + +int object_is_na_type(PyObject *obj) { + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_na = state->type_na; + if (type_na == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_na); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; +} +#else + /* Used in objToJSON.c. PyState_FindModule is not supported in PyPy, so each check below imports its type, tests obj against it and releases both references again. Every check returns 1 on a match and 0 otherwise; errors are cleared and reported as 0. */ +int object_is_decimal_type(PyObject *obj) { + PyObject *module = PyImport_ImportModule("decimal"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_decimal = PyObject_GetAttrString(module, "Decimal"); + if (type_decimal == NULL) { + 
Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_decimal); + Py_DECREF(module); /* the references are only needed for the check: release them on every path */ + Py_DECREF(type_decimal); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; +} + +int object_is_dataframe_type(PyObject *obj) { + PyObject *module = PyImport_ImportModule("pandas"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_dataframe = PyObject_GetAttrString(module, "DataFrame"); + if (type_dataframe == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_dataframe); + Py_DECREF(module); /* release on every path, not only on error */ + Py_DECREF(type_dataframe); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; +} + +int object_is_series_type(PyObject *obj) { + PyObject *module = PyImport_ImportModule("pandas"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_series = PyObject_GetAttrString(module, "Series"); + if (type_series == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_series); + Py_DECREF(module); /* release on every path, not only on error */ + Py_DECREF(type_series); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; +} + +int object_is_index_type(PyObject *obj) { + PyObject *module = PyImport_ImportModule("pandas"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_index = PyObject_GetAttrString(module, "Index"); + if (type_index == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_index); + Py_DECREF(module); /* release on every path, not only on error */ + Py_DECREF(type_index); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; +} + +int object_is_nat_type(PyObject *obj) { + PyObject *module = PyImport_ImportModule("pandas._libs.tslibs.nattype"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_nat = PyObject_GetAttrString(module, "NaTType"); + if (type_nat == NULL) { + Py_DECREF(module); + PyErr_Clear(); 
+ return 0; + } + int result = PyObject_IsInstance(obj, type_nat); + Py_DECREF(module); /* release on every path, not only on error */ + Py_DECREF(type_nat); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; +} + +int object_is_na_type(PyObject *obj) { + PyObject *module = PyImport_ImportModule("pandas._libs.missing"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_na = PyObject_GetAttrString(module, "NAType"); + if (type_na == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_na); + Py_DECREF(module); /* release on every path, not only on error */ + Py_DECREF(type_na); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; +} + +#endif + +static int module_traverse(PyObject *m, visitproc visit, void *arg) { + Py_VISIT(modulestate(m)->type_decimal); + Py_VISIT(modulestate(m)->type_dataframe); + Py_VISIT(modulestate(m)->type_series); + Py_VISIT(modulestate(m)->type_index); + Py_VISIT(modulestate(m)->type_nat); + Py_VISIT(modulestate(m)->type_na); + return 0; +} + +static int module_clear(PyObject *m) { + Py_CLEAR(modulestate(m)->type_decimal); + Py_CLEAR(modulestate(m)->type_dataframe); + Py_CLEAR(modulestate(m)->type_series); + Py_CLEAR(modulestate(m)->type_index); + Py_CLEAR(modulestate(m)->type_nat); + Py_CLEAR(modulestate(m)->type_na); + return 0; +} + +static void module_free(void *module) { module_clear((PyObject *)module); } PyMODINIT_FUNC PyInit_json(void) { - import_array() - initObjToJSON(); // TODO(username): clean up, maybe via tp_free? - return PyModuleDef_Init(&moduledef); + import_array() + PyObject *module; + +#ifndef PYPY_VERSION + // This function is not supported in PyPy. 
+ if ((module = PyState_FindModule(&moduledef)) != NULL) { + Py_INCREF(module); + return module; + } +#endif + + module = PyModule_Create(&moduledef); + if (module == NULL) { + return NULL; + } + +#ifndef PYPY_VERSION + PyObject *mod_decimal = PyImport_ImportModule("decimal"); /* each import below is best-effort: on failure the state slot is left unset and the error is dropped */ + if (mod_decimal) { + PyObject *type_decimal = PyObject_GetAttrString(mod_decimal, "Decimal"); + assert(type_decimal != NULL); + modulestate(module)->type_decimal = type_decimal; + Py_DECREF(mod_decimal); + } else { PyErr_Clear(); } /* do not carry a pending exception into the next C-API call */ + + PyObject *mod_pandas = PyImport_ImportModule("pandas"); + if (mod_pandas) { + PyObject *type_dataframe = + PyObject_GetAttrString(mod_pandas, "DataFrame"); + assert(type_dataframe != NULL); + modulestate(module)->type_dataframe = type_dataframe; + + PyObject *type_series = PyObject_GetAttrString(mod_pandas, "Series"); + assert(type_series != NULL); + modulestate(module)->type_series = type_series; + + PyObject *type_index = PyObject_GetAttrString(mod_pandas, "Index"); + assert(type_index != NULL); + modulestate(module)->type_index = type_index; + + Py_DECREF(mod_pandas); + } else { PyErr_Clear(); } /* same handling as the pandas._libs.missing import below */ + + PyObject *mod_nattype = + PyImport_ImportModule("pandas._libs.tslibs.nattype"); + if (mod_nattype) { + PyObject *type_nat = PyObject_GetAttrString(mod_nattype, "NaTType"); + assert(type_nat != NULL); + modulestate(module)->type_nat = type_nat; + + Py_DECREF(mod_nattype); + } else { PyErr_Clear(); } /* same handling as the pandas._libs.missing import below */ + + PyObject *mod_natype = PyImport_ImportModule("pandas._libs.missing"); + if (mod_natype) { + PyObject *type_na = PyObject_GetAttrString(mod_natype, "NAType"); + assert(type_na != NULL); + modulestate(module)->type_na = type_na; + + Py_DECREF(mod_natype); + } else { + PyErr_Clear(); + } +#endif + + /* Not vendored for now + JSONDecodeError = PyErr_NewException("ujson.JSONDecodeError", + PyExc_ValueError, NULL); Py_XINCREF(JSONDecodeError); if + (PyModule_AddObject(module, "JSONDecodeError", JSONDecodeError) < 0) + { + Py_XDECREF(JSONDecodeError); + Py_CLEAR(JSONDecodeError); + Py_DECREF(module); + return NULL; + } + */ + + return 
module; }