diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2249790b7ff1b..944f7727ee559 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -228,6 +228,7 @@ Other enhancements - :meth:`.Rolling.var`, :meth:`.Expanding.var`, :meth:`.Rolling.std`, :meth:`.Expanding.std` now support `Numba <http://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`44461`) - :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` (:issue:`5167`) - :meth:`UInt64Index.map` now retains ``dtype`` where possible (:issue:`44609`) +- :meth:`read_json` can now parse unsigned long long integers (:issue:`26068`) - diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 757cabdbbc730..5b5995a671b2c 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -297,6 +297,7 @@ typedef struct __JSONObjectDecoder { JSOBJ (*endArray)(void *prv, JSOBJ obj); JSOBJ (*newInt)(void *prv, JSINT32 value); JSOBJ (*newLong)(void *prv, JSINT64 value); + JSOBJ (*newUnsignedLong)(void *prv, JSUINT64 value); JSOBJ (*newDouble)(void *prv, double value); void (*releaseObject)(void *prv, JSOBJ obj, void *decoder); JSPFN_MALLOC malloc; diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index 81327fd9efb06..fee552672b8b6 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -116,8 +116,8 @@ JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) { JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { int intNeg = 1; - int mantSize = 0; JSUINT64 intValue; + JSUINT64 prevIntValue; int chr; int decimalCount = 0; double frcValue = 0.0; @@ -134,10 +134,10 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { } else if (*(offset) == '-') { offset++; intNeg = -1; + overflowLimit = LLONG_MIN; if (*(offset) == 'I') { goto DECODE_INF; } - overflowLimit 
= LLONG_MIN; } // Scan integer part @@ -157,19 +157,18 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { case '7': case '8': case '9': { - // FIXME: Check for arithmetic overflow here - // PERF: Don't do 64-bit arithmetic here unless we know we have - // to - intValue = intValue * 10ULL + (JSLONG)(chr - 48); - - if (intValue > overflowLimit) { - return SetError(ds, -1, overflowLimit == LLONG_MAX - ? "Value is too big" - : "Value is too small"); + // PERF: Don't do 64-bit arithmetic here unless we have to + prevIntValue = intValue; + intValue = intValue * 10ULL + (JSLONG) (chr - 48); + + if (intNeg == 1 && prevIntValue > intValue) { + return SetError(ds, -1, "Value is too big!"); + } else if (intNeg == -1 && intValue > overflowLimit) { + return SetError(ds, -1, overflowLimit == LLONG_MAX ? + "Value is too big!" : "Value is too small"); } offset++; - mantSize++; break; } case '.': { @@ -196,11 +195,12 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { ds->lastType = JT_INT; ds->start = offset; - if ((intValue >> 31)) { + if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0) + return ds->dec->newUnsignedLong(ds->prv, intValue); + else if ((intValue >> 31)) return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg)); - } else { + else return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg)); - } DECODE_FRACTION: diff --git a/pandas/_libs/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c index 3db10237b2688..14683f4c28cbe 100644 --- a/pandas/_libs/src/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/ujson/python/JSONtoObj.c @@ -479,6 +479,10 @@ JSOBJ Object_newLong(void *prv, JSINT64 value) { return PyLong_FromLongLong(value); } +JSOBJ Object_newUnsignedLong(void *prv, JSUINT64 value) { + return PyLong_FromUnsignedLongLong(value); +} + JSOBJ Object_newDouble(void *prv, double value) { return PyFloat_FromDouble(value); } @@ -508,7 +512,8 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, 
PyObject *kwargs) { Object_newTrue, Object_newFalse, Object_newNull, Object_newPosInf, Object_newNegInf, Object_newObject, Object_endObject, Object_newArray, Object_endArray, - Object_newInteger, Object_newLong, Object_newDouble, + Object_newInteger, Object_newLong, Object_newUnsignedLong, + Object_newDouble, Object_releaseObject, PyObject_Malloc, PyObject_Free, PyObject_Realloc}; diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index f228c826bc795..75a92ee1b9a45 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -22,7 +22,6 @@ DatetimeIndex, Series, Timestamp, - compat, read_json, ) import pandas._testing as tm @@ -1275,11 +1274,9 @@ def test_to_json_large_numbers(self, bigNum): expected = '{"0":{"articleId":' + str(bigNum) + "}}" assert json == expected - @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) - @pytest.mark.skipif(not compat.IS64, reason="GH-35279") + @pytest.mark.parametrize("bigNum", [-(2 ** 63) - 1, 2 ** 64]) def test_read_json_large_numbers(self, bigNum): - # GH20599 - + # GH20599, 26068 json = StringIO('{"articleId":' + str(bigNum) + "}") msg = r"Value is too small|Value is too big" with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index b5c22e959b4d7..b4ae54d48dc68 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -5,7 +5,6 @@ import locale import math import re -import sys import time import dateutil @@ -599,24 +598,23 @@ def test_encode_list_long_conversion(self): np.array(long_input), ujson.decode(output, numpy=True, dtype=np.int64) ) - def test_encode_long_conversion(self): - long_input = 9223372036854775807 + @pytest.mark.parametrize("long_input", [9223372036854775807, 18446744073709551615]) + def test_encode_long_conversion(self, long_input): output = ujson.encode(long_input) assert long_input == json.loads(output) 
assert output == json.dumps(long_input) assert long_input == ujson.decode(output) - @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) - @pytest.mark.xfail(not IS64, reason="GH-35288") + @pytest.mark.parametrize("bigNum", [2 ** 64, -(2 ** 63) - 1]) def test_dumps_ints_larger_than_maxsize(self, bigNum): - # GH34395 - bigNum = sys.maxsize + 1 encoding = ujson.encode(bigNum) assert str(bigNum) == encoding - # GH20599 - with pytest.raises(ValueError, match="Value is too big"): + with pytest.raises( + ValueError, + match="Value is too big|Value is too small", + ): assert ujson.loads(encoding) == bigNum @pytest.mark.parametrize( @@ -1162,11 +1160,12 @@ def test_decode_array(self, arr): def test_decode_extreme_numbers(self, extreme_num): assert extreme_num == ujson.decode(str(extreme_num)) - @pytest.mark.parametrize( - "too_extreme_num", ["9223372036854775808", "-90223372036854775809"] - ) + @pytest.mark.parametrize("too_extreme_num", [f"{2**64}", f"{-2**63-1}"]) def test_decode_too_extreme_numbers(self, too_extreme_num): - with pytest.raises(ValueError, match="Value is too big|Value is too small"): + with pytest.raises( + ValueError, + match="Value is too big|Value is too small", + ): ujson.decode(too_extreme_num) def test_decode_with_trailing_whitespaces(self): @@ -1176,9 +1175,13 @@ def test_decode_with_trailing_non_whitespaces(self): with pytest.raises(ValueError, match="Trailing data"): ujson.decode("{}\n\t a") - def test_decode_array_with_big_int(self): - with pytest.raises(ValueError, match="Value is too big"): - ujson.loads("[18446098363113800555]") + @pytest.mark.parametrize("value", [f"{2**64}", f"{-2**63-1}"]) + def test_decode_array_with_big_int(self, value): + with pytest.raises( + ValueError, + match="Value is too big|Value is too small", + ): + ujson.loads(value) @pytest.mark.parametrize( "float_number",