From c094a6b3dc3cd275633a5b1e243b8b904fb264cc Mon Sep 17 00:00:00 2001 From: deponovo Date: Sun, 5 Dec 2021 16:38:40 +0100 Subject: [PATCH 01/12] ENH : adding support to parse unsigned long long from json (#26068) --- pandas/_libs/src/ujson/lib/ultrajson.h | 1 + pandas/_libs/src/ujson/lib/ultrajsondec.c | 47 +++++++++++++++-------- pandas/_libs/src/ujson/python/JSONtoObj.c | 6 ++- 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 757cabdbbc730..5b5995a671b2c 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -297,6 +297,7 @@ typedef struct __JSONObjectDecoder { JSOBJ (*endArray)(void *prv, JSOBJ obj); JSOBJ (*newInt)(void *prv, JSINT32 value); JSOBJ (*newLong)(void *prv, JSINT64 value); + JSOBJ (*newUnsignedLong)(void *prv, JSUINT64 value); JSOBJ (*newDouble)(void *prv, double value); void (*releaseObject)(void *prv, JSOBJ obj, void *decoder); JSPFN_MALLOC malloc; diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index 81327fd9efb06..9464ceb81fb85 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -116,7 +116,7 @@ JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) { JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { int intNeg = 1; - int mantSize = 0; + char charCount = 0; JSUINT64 intValue; int chr; int decimalCount = 0; @@ -125,7 +125,7 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { double expValue; char *offset = ds->start; - JSUINT64 overflowLimit = LLONG_MAX; + JSUINT64 signedOverflowLimit = LLONG_MAX; if (*(offset) == 'I') { goto DECODE_INF; @@ -137,7 +137,7 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { if (*(offset) == 'I') { goto DECODE_INF; } - overflowLimit = LLONG_MIN; + signedOverflowLimit = LLONG_MAX + 1; } // Scan integer part @@ -157,19 
+157,29 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { case '7': case '8': case '9': { - // FIXME: Check for arithmetic overflow here - // PERF: Don't do 64-bit arithmetic here unless we know we have - // to - intValue = intValue * 10ULL + (JSLONG)(chr - 48); - - if (intValue > overflowLimit) { - return SetError(ds, -1, overflowLimit == LLONG_MAX - ? "Value is too big" - : "Value is too small"); + JSUINT64 newContribution = (JSUINT64)(chr - 48); + + // check overflow when unsigned + if (charCount > 18 && intNeg == 1) { // 2**64 = 18_446_744_073_709_551_616 + if (intValue > (ULLONG_MAX - newContribution) / 10) + return SetError(ds, -1, "unsigned long long overflow: Value is too big"); } + intValue = intValue * 10ULL + newContribution; + + // check overflow when signed + if (charCount > 17 && intNeg == -1) { // 2**63 = 9_223_372_036_854_775_807 + if (intValue > signedOverflowLimit) { + return SetError(ds, -1, signedOverflowLimit == LLONG_MAX + ? "Value is too big" + : "Value is too small"); + } + } + + if (intValue != 0) + charCount++; + offset++; - mantSize++; break; } case '.': { @@ -197,9 +207,16 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { ds->start = offset; if ((intValue >> 31)) { - return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg)); + if (intNeg == 1) + return ds->dec->newUnsignedLong(ds->prv, intValue); + else + return ds->dec->newLong(ds->prv, (JSINT64)(-intValue)); } else { - return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg)); + if (intNeg == 1) + return ds->dec->newInt(ds->prv, intValue); + else + return ds->dec->newInt(ds->prv, (JSINT32)(-intValue)); + } DECODE_FRACTION: diff --git a/pandas/_libs/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c index 3db10237b2688..dfa91509914c5 100644 --- a/pandas/_libs/src/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/ujson/python/JSONtoObj.c @@ -479,6 +479,10 @@ JSOBJ Object_newLong(void *prv, JSINT64 value) { return 
PyLong_FromLongLong(value); } +JSOBJ Object_newUnsignedLong(void *prv, JSUINT64 value) { + return PyLong_FromUnsignedLongLong(value); +} + JSOBJ Object_newDouble(void *prv, double value) { return PyFloat_FromDouble(value); } @@ -508,7 +512,7 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) { Object_newTrue, Object_newFalse, Object_newNull, Object_newPosInf, Object_newNegInf, Object_newObject, Object_endObject, Object_newArray, Object_endArray, - Object_newInteger, Object_newLong, Object_newDouble, + Object_newInteger, Object_newLong, Object_newUnsignedLong, Object_newDouble, Object_releaseObject, PyObject_Malloc, PyObject_Free, PyObject_Realloc}; From bf81d32b3d2ea6155295c4236622283e85573af2 Mon Sep 17 00:00:00 2001 From: deponovo Date: Sun, 5 Dec 2021 21:25:28 +0100 Subject: [PATCH 02/12] TST: refactored tests where necessary and added new testing conditions (#26068) --- pandas/tests/io/json/test_ujson.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index b5c22e959b4d7..15b0547031c0f 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -600,23 +600,19 @@ def test_encode_list_long_conversion(self): ) def test_encode_long_conversion(self): - long_input = 9223372036854775807 - output = ujson.encode(long_input) + for long_input in [9223372036854775807, 18446744073709551615]: + output = ujson.encode(long_input) - assert long_input == json.loads(output) - assert output == json.dumps(long_input) - assert long_input == ujson.decode(output) + assert long_input == json.loads(output) + assert output == json.dumps(long_input) + assert long_input == ujson.decode(output) - @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) - @pytest.mark.xfail(not IS64, reason="GH-35288") + @pytest.mark.parametrize("bigNum", [2**64, -2**63-1]) def 
test_dumps_ints_larger_than_maxsize(self, bigNum): - # GH34395 - bigNum = sys.maxsize + 1 encoding = ujson.encode(bigNum) assert str(bigNum) == encoding - # GH20599 - with pytest.raises(ValueError, match="Value is too big"): + with pytest.raises(ValueError, match="unsigned long long overflow: Value is too big|Value is too small"): assert ujson.loads(encoding) == bigNum @pytest.mark.parametrize( @@ -1163,10 +1159,10 @@ def test_decode_extreme_numbers(self, extreme_num): assert extreme_num == ujson.decode(str(extreme_num)) @pytest.mark.parametrize( - "too_extreme_num", ["9223372036854775808", "-90223372036854775809"] + "too_extreme_num", [f"{2**64}", f"{-2**63-1}"] ) def test_decode_too_extreme_numbers(self, too_extreme_num): - with pytest.raises(ValueError, match="Value is too big|Value is too small"): + with pytest.raises(ValueError, match="unsigned long long overflow: Value is too big|Value is too small"): ujson.decode(too_extreme_num) def test_decode_with_trailing_whitespaces(self): @@ -1176,9 +1172,10 @@ def test_decode_with_trailing_non_whitespaces(self): with pytest.raises(ValueError, match="Trailing data"): ujson.decode("{}\n\t a") - def test_decode_array_with_big_int(self): - with pytest.raises(ValueError, match="Value is too big"): - ujson.loads("[18446098363113800555]") + @pytest.mark.parametrize("value", [f"{2**64}", f"{-2**63-1}"]) + def test_decode_array_with_big_int(self, value): + with pytest.raises(ValueError, match="unsigned long long overflow: Value is too big|Value is too small"): + ujson.loads(value) @pytest.mark.parametrize( "float_number", From a7d7cf254a37d9988c7b873ad757ef9d4e1df353 Mon Sep 17 00:00:00 2001 From: deponovo Date: Sun, 5 Dec 2021 21:47:50 +0100 Subject: [PATCH 03/12] CLN: PEP8 cleanups and pre-commit checked --- pandas/_libs/src/ujson/lib/ultrajsondec.c | 10 ++++++---- pandas/_libs/src/ujson/python/JSONtoObj.c | 3 ++- pandas/tests/io/json/test_ujson.py | 22 ++++++++++++++-------- 3 files changed, 22 insertions(+), 13 
deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index 9464ceb81fb85..2221ee4fdc74e 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -160,15 +160,18 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { JSUINT64 newContribution = (JSUINT64)(chr - 48); // check overflow when unsigned - if (charCount > 18 && intNeg == 1) { // 2**64 = 18_446_744_073_709_551_616 + // 2**64 = 18_446_744_073_709_551_616 + if (charCount > 18 && intNeg == 1) { if (intValue > (ULLONG_MAX - newContribution) / 10) - return SetError(ds, -1, "unsigned long long overflow: Value is too big"); + return SetError(ds, -1, + "unsigned long long overflow: Value is too big"); } intValue = intValue * 10ULL + newContribution; // check overflow when signed - if (charCount > 17 && intNeg == -1) { // 2**63 = 9_223_372_036_854_775_807 + // 2**63 = 9_223_372_036_854_775_807 + if (charCount > 17 && intNeg == -1) { if (intValue > signedOverflowLimit) { return SetError(ds, -1, signedOverflowLimit == LLONG_MAX ? 
"Value is too big" @@ -216,7 +219,6 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { return ds->dec->newInt(ds->prv, intValue); else return ds->dec->newInt(ds->prv, (JSINT32)(-intValue)); - } DECODE_FRACTION: diff --git a/pandas/_libs/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c index dfa91509914c5..14683f4c28cbe 100644 --- a/pandas/_libs/src/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/ujson/python/JSONtoObj.c @@ -512,7 +512,8 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) { Object_newTrue, Object_newFalse, Object_newNull, Object_newPosInf, Object_newNegInf, Object_newObject, Object_endObject, Object_newArray, Object_endArray, - Object_newInteger, Object_newLong, Object_newUnsignedLong, Object_newDouble, + Object_newInteger, Object_newLong, Object_newUnsignedLong, + Object_newDouble, Object_releaseObject, PyObject_Malloc, PyObject_Free, PyObject_Realloc}; diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 15b0547031c0f..626796477f776 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -5,7 +5,6 @@ import locale import math import re -import sys import time import dateutil @@ -607,12 +606,15 @@ def test_encode_long_conversion(self): assert output == json.dumps(long_input) assert long_input == ujson.decode(output) - @pytest.mark.parametrize("bigNum", [2**64, -2**63-1]) + @pytest.mark.parametrize("bigNum", [2 ** 64, -(2 ** 63) - 1]) def test_dumps_ints_larger_than_maxsize(self, bigNum): encoding = ujson.encode(bigNum) assert str(bigNum) == encoding - with pytest.raises(ValueError, match="unsigned long long overflow: Value is too big|Value is too small"): + with pytest.raises( + ValueError, + match="unsigned long long overflow: Value is too big|Value is too small", + ): assert ujson.loads(encoding) == bigNum @pytest.mark.parametrize( @@ -1158,11 +1160,12 @@ def test_decode_array(self, arr): def 
test_decode_extreme_numbers(self, extreme_num): assert extreme_num == ujson.decode(str(extreme_num)) - @pytest.mark.parametrize( - "too_extreme_num", [f"{2**64}", f"{-2**63-1}"] - ) + @pytest.mark.parametrize("too_extreme_num", [f"{2**64}", f"{-2**63-1}"]) def test_decode_too_extreme_numbers(self, too_extreme_num): - with pytest.raises(ValueError, match="unsigned long long overflow: Value is too big|Value is too small"): + with pytest.raises( + ValueError, + match="unsigned long long overflow: Value is too big|Value is too small", + ): ujson.decode(too_extreme_num) def test_decode_with_trailing_whitespaces(self): @@ -1174,7 +1177,10 @@ def test_decode_with_trailing_non_whitespaces(self): @pytest.mark.parametrize("value", [f"{2**64}", f"{-2**63-1}"]) def test_decode_array_with_big_int(self, value): - with pytest.raises(ValueError, match="unsigned long long overflow: Value is too big|Value is too small"): + with pytest.raises( + ValueError, + match="unsigned long long overflow: Value is too big|Value is too small", + ): ujson.loads(value) @pytest.mark.parametrize( From 61e51ac52b16260b83d8013969682482445d39d4 Mon Sep 17 00:00:00 2001 From: deponovo Date: Tue, 7 Dec 2021 00:11:37 +0100 Subject: [PATCH 04/12] CLN: cleanup according to PR review --- pandas/_libs/src/ujson/lib/ultrajsondec.c | 71 +++++++++-------------- pandas/tests/io/json/test_ujson.py | 18 +++--- 2 files changed, 36 insertions(+), 53 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index 2221ee4fdc74e..0ae71fe2e3082 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -116,8 +116,8 @@ JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) { JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { int intNeg = 1; - char charCount = 0; JSUINT64 intValue; + JSUINT64 prevIntValue; int chr; int decimalCount = 0; double frcValue = 0.0; @@ -125,19 +125,13 @@ JSOBJ FASTCALL_MSVC 
decode_numeric(struct DecoderState *ds) { double expValue; char *offset = ds->start; - JSUINT64 signedOverflowLimit = LLONG_MAX; + JSUINT64 overflowLimit = LLONG_MAX; - if (*(offset) == 'I') { - goto DECODE_INF; - } else if (*(offset) == 'N') { - goto DECODE_NAN; - } else if (*(offset) == '-') { - offset++; + if (*(offset) == '-') + { + offset ++; intNeg = -1; - if (*(offset) == 'I') { - goto DECODE_INF; - } - signedOverflowLimit = LLONG_MAX + 1; + overflowLimit = LLONG_MIN; } // Scan integer part @@ -157,32 +151,20 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { case '7': case '8': case '9': { - JSUINT64 newContribution = (JSUINT64)(chr - 48); + //PERF: Don't do 64-bit arithmetic here unless we know we have to + prevIntValue = intValue; + intValue = intValue * 10ULL + (JSLONG) (chr - 48); - // check overflow when unsigned - // 2**64 = 18_446_744_073_709_551_616 - if (charCount > 18 && intNeg == 1) { - if (intValue > (ULLONG_MAX - newContribution) / 10) - return SetError(ds, -1, - "unsigned long long overflow: Value is too big"); + if (intNeg == 1 && prevIntValue > intValue) + { + return SetError(ds, -1, "Value is too big!"); } - - intValue = intValue * 10ULL + newContribution; - - // check overflow when signed - // 2**63 = 9_223_372_036_854_775_807 - if (charCount > 17 && intNeg == -1) { - if (intValue > signedOverflowLimit) { - return SetError(ds, -1, signedOverflowLimit == LLONG_MAX - ? "Value is too big" - : "Value is too small"); - } + else if (intNeg == -1 && intValue > overflowLimit) + { + return SetError(ds, -1, overflowLimit == LLONG_MAX ? "Value is too big!" 
: "Value is too small"); } - if (intValue != 0) - charCount++; - - offset++; + offset ++; break; } case '.': { @@ -209,16 +191,17 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { ds->lastType = JT_INT; ds->start = offset; - if ((intValue >> 31)) { - if (intNeg == 1) - return ds->dec->newUnsignedLong(ds->prv, intValue); - else - return ds->dec->newLong(ds->prv, (JSINT64)(-intValue)); - } else { - if (intNeg == 1) - return ds->dec->newInt(ds->prv, intValue); - else - return ds->dec->newInt(ds->prv, (JSINT32)(-intValue)); + if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0) + { + return ds->dec->newUnsignedLong(ds->prv, intValue); + } + else if ((intValue >> 31)) + { + return ds->dec->newLong(ds->prv, (JSINT64) (intValue * (JSINT64) intNeg)); + } + else + { + return ds->dec->newInt(ds->prv, (JSINT32) (intValue * intNeg)); } DECODE_FRACTION: diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 626796477f776..b4ae54d48dc68 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -598,13 +598,13 @@ def test_encode_list_long_conversion(self): np.array(long_input), ujson.decode(output, numpy=True, dtype=np.int64) ) - def test_encode_long_conversion(self): - for long_input in [9223372036854775807, 18446744073709551615]: - output = ujson.encode(long_input) + @pytest.mark.parametrize("long_input", [9223372036854775807, 18446744073709551615]) + def test_encode_long_conversion(self, long_input): + output = ujson.encode(long_input) - assert long_input == json.loads(output) - assert output == json.dumps(long_input) - assert long_input == ujson.decode(output) + assert long_input == json.loads(output) + assert output == json.dumps(long_input) + assert long_input == ujson.decode(output) @pytest.mark.parametrize("bigNum", [2 ** 64, -(2 ** 63) - 1]) def test_dumps_ints_larger_than_maxsize(self, bigNum): @@ -613,7 +613,7 @@ def test_dumps_ints_larger_than_maxsize(self, bigNum): with 
pytest.raises( ValueError, - match="unsigned long long overflow: Value is too big|Value is too small", + match="Value is too big|Value is too small", ): assert ujson.loads(encoding) == bigNum @@ -1164,7 +1164,7 @@ def test_decode_extreme_numbers(self, extreme_num): def test_decode_too_extreme_numbers(self, too_extreme_num): with pytest.raises( ValueError, - match="unsigned long long overflow: Value is too big|Value is too small", + match="Value is too big|Value is too small", ): ujson.decode(too_extreme_num) @@ -1179,7 +1179,7 @@ def test_decode_with_trailing_non_whitespaces(self): def test_decode_array_with_big_int(self, value): with pytest.raises( ValueError, - match="unsigned long long overflow: Value is too big|Value is too small", + match="Value is too big|Value is too small", ): ujson.loads(value) From 4dbf09bd11baef0689d0822cd533145a110f1a68 Mon Sep 17 00:00:00 2001 From: deponovo Date: Tue, 7 Dec 2021 00:35:55 +0100 Subject: [PATCH 05/12] CLN: cleanup on pre-commit fails --- pandas/_libs/src/ujson/lib/ultrajsondec.c | 33 +++++++++-------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index 0ae71fe2e3082..c7d91d2728469 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -127,9 +127,8 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { JSUINT64 overflowLimit = LLONG_MAX; - if (*(offset) == '-') - { - offset ++; + if (*(offset) == '-') { + offset++; intNeg = -1; overflowLimit = LLONG_MIN; } @@ -151,20 +150,18 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { case '7': case '8': case '9': { - //PERF: Don't do 64-bit arithmetic here unless we know we have to + // PERF: Don't do 64-bit arithmetic here unless we have to prevIntValue = intValue; intValue = intValue * 10ULL + (JSLONG) (chr - 48); - if (intNeg == 1 && prevIntValue > intValue) - { - return SetError(ds, -1, "Value 
is too big!"); - } - else if (intNeg == -1 && intValue > overflowLimit) - { - return SetError(ds, -1, overflowLimit == LLONG_MAX ? "Value is too big!" : "Value is too small"); + if (intNeg == 1 && prevIntValue > intValue) { + return SetError(ds, -1, "Value is too big!"); + } else if (intNeg == -1 && intValue > overflowLimit) { + return SetError(ds, -1, overflowLimit == LLONG_MAX ? + "Value is too big!" : "Value is too small"); } - offset ++; + offset++; break; } case '.': { @@ -191,18 +188,12 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { ds->lastType = JT_INT; ds->start = offset; - if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0) - { + if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0) { return ds->dec->newUnsignedLong(ds->prv, intValue); - } else if ((intValue >> 31)) - { - return ds->dec->newLong(ds->prv, (JSINT64) (intValue * (JSINT64) intNeg)); - } + return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg)); else - { - return ds->dec->newInt(ds->prv, (JSINT32) (intValue * intNeg)); - } + return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg)); DECODE_FRACTION: From 8939c8c0360e15c98edaa6f5879907c6dc1debdf Mon Sep 17 00:00:00 2001 From: deponovo Date: Tue, 7 Dec 2021 00:38:58 +0100 Subject: [PATCH 06/12] CLN: minor cleanup --- pandas/_libs/src/ujson/lib/ultrajsondec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index c7d91d2728469..3edf7bab1d60d 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -188,7 +188,7 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { ds->lastType = JT_INT; ds->start = offset; - if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0) { + if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0) return ds->dec->newUnsignedLong(ds->prv, intValue); else if ((intValue >> 31)) return 
ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg)); From 7ce72e21e18ee4f42731037ffce87e63a9c57f08 Mon Sep 17 00:00:00 2001 From: deponovo Date: Tue, 7 Dec 2021 21:21:18 +0100 Subject: [PATCH 07/12] CLN: reset previous structure partially for passing build pandas workflow step --- pandas/_libs/src/ujson/lib/ultrajsondec.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index 3edf7bab1d60d..fee552672b8b6 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -127,10 +127,17 @@ JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { JSUINT64 overflowLimit = LLONG_MAX; - if (*(offset) == '-') { + if (*(offset) == 'I') { + goto DECODE_INF; + } else if (*(offset) == 'N') { + goto DECODE_NAN; + } else if (*(offset) == '-') { offset++; intNeg = -1; overflowLimit = LLONG_MIN; + if (*(offset) == 'I') { + goto DECODE_INF; + } } // Scan integer part From 2e11b3e7a79280a54e6f9ba997d83e4c73d336c4 Mon Sep 17 00:00:00 2001 From: deponovo Date: Wed, 8 Dec 2021 14:02:51 +0100 Subject: [PATCH 08/12] TST: fixed ujson test --- pandas/tests/io/json/test_pandas.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index f228c826bc795..77c770274ab67 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1275,11 +1275,8 @@ def test_to_json_large_numbers(self, bigNum): expected = '{"0":{"articleId":' + str(bigNum) + "}}" assert json == expected - @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) - @pytest.mark.skipif(not compat.IS64, reason="GH-35279") + @pytest.mark.parametrize("bigNum", [-2**63-1, 2**64]) def test_read_json_large_numbers(self, bigNum): - # GH20599 - json = StringIO('{"articleId":' + str(bigNum) + "}") msg = r"Value is too small|Value is 
too big" with pytest.raises(ValueError, match=msg): From cd762537318adc91dd3d5bfed11ff9cf5af51848 Mon Sep 17 00:00:00 2001 From: deponovo Date: Wed, 8 Dec 2021 14:20:54 +0100 Subject: [PATCH 09/12] CLN: pre-commit cleanup --- pandas/tests/io/json/test_pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 77c770274ab67..cd5484fc3a9e9 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1275,7 +1275,7 @@ def test_to_json_large_numbers(self, bigNum): expected = '{"0":{"articleId":' + str(bigNum) + "}}" assert json == expected - @pytest.mark.parametrize("bigNum", [-2**63-1, 2**64]) + @pytest.mark.parametrize("bigNum", [-(2 ** 63) - 1, 2 ** 64]) def test_read_json_large_numbers(self, bigNum): json = StringIO('{"articleId":' + str(bigNum) + "}") msg = r"Value is too small|Value is too big" with pytest.raises(ValueError, match=msg): From d2b6d7a6457afde0531e5bb860d07e270e8e8476 Mon Sep 17 00:00:00 2001 From: deponovo Date: Wed, 8 Dec 2021 15:29:26 +0100 Subject: [PATCH 10/12] CLN: removed unused import --- pandas/tests/io/json/test_pandas.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index cd5484fc3a9e9..c85fdaabbb3ed 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -22,7 +22,6 @@ DatetimeIndex, Series, Timestamp, - compat, read_json, ) import pandas._testing as tm From 86003007dd1127fbaef82cc549af3e515cc04171 Mon Sep 17 00:00:00 2001 From: deponovo Date: Thu, 9 Dec 2021 17:04:06 +0100 Subject: [PATCH 11/12] TST: re-added references to GH issues on test file --- pandas/tests/io/json/test_pandas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c85fdaabbb3ed..75a92ee1b9a45 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@
-1276,6 +1276,7 @@ def test_to_json_large_numbers(self, bigNum): @pytest.mark.parametrize("bigNum", [-(2 ** 63) - 1, 2 ** 64]) def test_read_json_large_numbers(self, bigNum): + # GH20599, 26068 json = StringIO('{"articleId":' + str(bigNum) + "}") msg = r"Value is too small|Value is too big" with pytest.raises(ValueError, match=msg): From 08f84e416384f128ddc0f48e112f2a5bd2a36a20 Mon Sep 17 00:00:00 2001 From: deponovo Date: Fri, 10 Dec 2021 13:01:12 +0100 Subject: [PATCH 12/12] DOC: added entry to the whatsnew --- doc/source/whatsnew/v1.4.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2249790b7ff1b..944f7727ee559 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -228,6 +228,7 @@ Other enhancements - :meth:`.Rolling.var`, :meth:`.Expanding.var`, :meth:`.Rolling.std`, :meth:`.Expanding.std` now support `Numba `_ execution with the ``engine`` keyword (:issue:`44461`) - :meth:`Series.info` has been added, for compatibility with :meth:`DataFrame.info` (:issue:`5167`) - :meth:`UInt64Index.map` now retains ``dtype`` where possible (:issue:`44609`) +- :meth:`read_json` can now parse unsigned long long integers (:issue:`26068`) -