Skip to content

ENH: Enabling parsing ulonglong from json #44770

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ Other enhancements
- :meth:`.Rolling.var`, :meth:`.Expanding.var`, :meth:`.Rolling.std`, :meth:`.Expanding.std` now support `Numba <http://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`44461`)
- :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` (:issue:`5167`)
- :meth:`UInt64Index.map` now retains ``dtype`` where possible (:issue:`44609`)
- :func:`read_json` can now parse unsigned long long integers, i.e. values up to ``2**64 - 1`` (:issue:`26068`)
-


Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/src/ujson/lib/ultrajson.h
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ typedef struct __JSONObjectDecoder {
JSOBJ (*endArray)(void *prv, JSOBJ obj);
JSOBJ (*newInt)(void *prv, JSINT32 value);
JSOBJ (*newLong)(void *prv, JSINT64 value);
JSOBJ (*newUnsignedLong)(void *prv, JSUINT64 value);
JSOBJ (*newDouble)(void *prv, double value);
void (*releaseObject)(void *prv, JSOBJ obj, void *decoder);
JSPFN_MALLOC malloc;
Expand Down
30 changes: 15 additions & 15 deletions pandas/_libs/src/ujson/lib/ultrajsondec.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) {

JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
int intNeg = 1;
int mantSize = 0;
JSUINT64 intValue;
JSUINT64 prevIntValue;
int chr;
int decimalCount = 0;
double frcValue = 0.0;
Expand All @@ -134,10 +134,10 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
} else if (*(offset) == '-') {
offset++;
intNeg = -1;
overflowLimit = LLONG_MIN;
if (*(offset) == 'I') {
goto DECODE_INF;
}
overflowLimit = LLONG_MIN;
}

// Scan integer part
Expand All @@ -157,19 +157,18 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
case '7':
case '8':
case '9': {
// FIXME: Check for arithmetic overflow here
// PERF: Don't do 64-bit arithmetic here unless we know we have
// to
intValue = intValue * 10ULL + (JSLONG)(chr - 48);

if (intValue > overflowLimit) {
return SetError(ds, -1, overflowLimit == LLONG_MAX
? "Value is too big"
: "Value is too small");
// PERF: Don't do 64-bit arithmetic here unless we have to
prevIntValue = intValue;
intValue = intValue * 10ULL + (JSLONG) (chr - 48);

if (intNeg == 1 && prevIntValue > intValue) {
return SetError(ds, -1, "Value is too big!");
} else if (intNeg == -1 && intValue > overflowLimit) {
return SetError(ds, -1, overflowLimit == LLONG_MAX ?
"Value is too big!" : "Value is too small");
}

offset++;
mantSize++;
break;
}
case '.': {
Expand All @@ -196,11 +195,12 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
ds->lastType = JT_INT;
ds->start = offset;

if ((intValue >> 31)) {
if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0)
return ds->dec->newUnsignedLong(ds->prv, intValue);
else if ((intValue >> 31))
return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg));
} else {
else
return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg));
}

DECODE_FRACTION:

Expand Down
7 changes: 6 additions & 1 deletion pandas/_libs/src/ujson/python/JSONtoObj.c
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,10 @@ JSOBJ Object_newLong(void *prv, JSINT64 value) {
return PyLong_FromLongLong(value);
}

// Decoder callback for positive integers whose top (64th) bit is set, i.e.
// values that do not fit in a signed 64-bit integer: builds a Python int
// from the unsigned 64-bit `value`.
// `prv` (the decoder's private context pointer) is not used here.
// The result of PyLong_FromUnsignedLongLong is handed back unchanged.
JSOBJ Object_newUnsignedLong(void *prv, JSUINT64 value) {
return PyLong_FromUnsignedLongLong(value);
}

// Decoder callback for floating-point values: builds a Python float from
// the C double `value`.
// `prv` (the decoder's private context pointer) is not used here.
// The result of PyFloat_FromDouble is handed back unchanged.
JSOBJ Object_newDouble(void *prv, double value) {
return PyFloat_FromDouble(value);
}
Expand Down Expand Up @@ -508,7 +512,8 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) {
Object_newTrue, Object_newFalse, Object_newNull,
Object_newPosInf, Object_newNegInf, Object_newObject,
Object_endObject, Object_newArray, Object_endArray,
Object_newInteger, Object_newLong, Object_newDouble,
Object_newInteger, Object_newLong, Object_newUnsignedLong,
Object_newDouble,
Object_releaseObject, PyObject_Malloc, PyObject_Free,
PyObject_Realloc};

Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
DatetimeIndex,
Series,
Timestamp,
compat,
read_json,
)
import pandas._testing as tm
Expand Down Expand Up @@ -1275,11 +1274,9 @@ def test_to_json_large_numbers(self, bigNum):
expected = '{"0":{"articleId":' + str(bigNum) + "}}"
assert json == expected

@pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
@pytest.mark.skipif(not compat.IS64, reason="GH-35279")
@pytest.mark.parametrize("bigNum", [-(2 ** 63) - 1, 2 ** 64])
def test_read_json_large_numbers(self, bigNum):
# GH20599

# GH20599, 26068
json = StringIO('{"articleId":' + str(bigNum) + "}")
msg = r"Value is too small|Value is too big"
with pytest.raises(ValueError, match=msg):
Expand Down
35 changes: 19 additions & 16 deletions pandas/tests/io/json/test_ujson.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import locale
import math
import re
import sys
import time

import dateutil
Expand Down Expand Up @@ -599,24 +598,23 @@ def test_encode_list_long_conversion(self):
np.array(long_input), ujson.decode(output, numpy=True, dtype=np.int64)
)

def test_encode_long_conversion(self):
long_input = 9223372036854775807
@pytest.mark.parametrize("long_input", [9223372036854775807, 18446744073709551615])
def test_encode_long_conversion(self, long_input):
output = ujson.encode(long_input)

assert long_input == json.loads(output)
assert output == json.dumps(long_input)
assert long_input == ujson.decode(output)

@pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
@pytest.mark.xfail(not IS64, reason="GH-35288")
@pytest.mark.parametrize("bigNum", [2 ** 64, -(2 ** 63) - 1])
def test_dumps_ints_larger_than_maxsize(self, bigNum):
# GH34395
bigNum = sys.maxsize + 1
encoding = ujson.encode(bigNum)
assert str(bigNum) == encoding

# GH20599
with pytest.raises(ValueError, match="Value is too big"):
with pytest.raises(
ValueError,
match="Value is too big|Value is too small",
):
assert ujson.loads(encoding) == bigNum

@pytest.mark.parametrize(
Expand Down Expand Up @@ -1162,11 +1160,12 @@ def test_decode_array(self, arr):
def test_decode_extreme_numbers(self, extreme_num):
assert extreme_num == ujson.decode(str(extreme_num))

@pytest.mark.parametrize(
"too_extreme_num", ["9223372036854775808", "-90223372036854775809"]
)
@pytest.mark.parametrize("too_extreme_num", [f"{2**64}", f"{-2**63-1}"])
def test_decode_too_extreme_numbers(self, too_extreme_num):
with pytest.raises(ValueError, match="Value is too big|Value is too small"):
with pytest.raises(
ValueError,
match="Value is too big|Value is too small",
):
ujson.decode(too_extreme_num)

def test_decode_with_trailing_whitespaces(self):
Expand All @@ -1176,9 +1175,13 @@ def test_decode_with_trailing_non_whitespaces(self):
with pytest.raises(ValueError, match="Trailing data"):
ujson.decode("{}\n\t a")

def test_decode_array_with_big_int(self):
with pytest.raises(ValueError, match="Value is too big"):
ujson.loads("[18446098363113800555]")
@pytest.mark.parametrize("value", [f"{2**64}", f"{-2**63-1}"])
def test_decode_array_with_big_int(self, value):
with pytest.raises(
ValueError,
match="Value is too big|Value is too small",
):
ujson.loads(value)

@pytest.mark.parametrize(
"float_number",
Expand Down