From af700c5fc46e4a2a5477bdee4a558f4b37d2f7b1 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 30 Oct 2024 23:26:24 +0000 Subject: [PATCH 1/6] fix: support JSONDtype on pandas version 1.5 --- db_dtypes/json.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index ed04b72..af0b6e9 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -72,14 +72,23 @@ class JSONArray(arrays.ArrowExtensionArray): _dtype = JSONDtype() - def __init__(self, values, dtype=None, copy=False) -> None: + def __init__(self, values) -> None: + super().__init__(values) self._dtype = JSONDtype() if isinstance(values, pa.Array): - self._pa_array = pa.chunked_array([values]) + pa_data = pa.chunked_array([values]) elif isinstance(values, pa.ChunkedArray): - self._pa_array = values + pa_data = values else: raise ValueError(f"Unsupported type '{type(values)}' for JSONArray") + + # Ensures compatibility with pandas version 1.5.3 + if hasattr(self, '_data'): + self._data = pa_data + elif hasattr(self, '_pa_array'): + self._pa_array = pa_data + else: + raise ValueError(f"Unsupported pandas version: {pd.__version__}") @classmethod def _box_pa( @@ -111,7 +120,7 @@ def _box_pa_scalar(cls, value) -> pa.Scalar: def _box_pa_array(cls, value, copy: bool = False) -> pa.Array | pa.ChunkedArray: """Box value into a pyarrow Array or ChunkedArray.""" if isinstance(value, cls): - pa_array = value._pa_array + pa_array = value.pa_data else: value = [JSONArray._serialize_json(x) for x in value] pa_array = pa.array(value, type=cls._dtype.pyarrow_dtype, from_pandas=True) @@ -147,11 +156,20 @@ def dtype(self) -> JSONDtype: """An instance of JSONDtype""" return self._dtype + @property + def pa_data(self): + """An instance of stored pa data""" + # Ensures compatibility with pandas version 1.5.3 + if hasattr(self, '_data'): + return self._data + elif hasattr(self, '_pa_array'): + return self._pa_array + def _cmp_method(self, 
other, op): if op.__name__ == "eq": - result = pyarrow.compute.equal(self._pa_array, self._box_pa(other)) + result = pyarrow.compute.equal(self.pa_data, self._box_pa(other)) elif op.__name__ == "ne": - result = pyarrow.compute.not_equal(self._pa_array, self._box_pa(other)) + result = pyarrow.compute.not_equal(self.pa_data, self._box_pa(other)) else: # Comparison is not a meaningful one. We don't want to support sorting by JSON columns. raise TypeError(f"{op.__name__} not supported for JSONArray") @@ -169,7 +187,7 @@ def __getitem__(self, item): else: # `check_array_indexer` should verify that the assertion hold true. assert item.dtype.kind == "b" - return type(self)(self._pa_array.filter(item)) + return type(self)(self.pa_data.filter(item)) elif isinstance(item, tuple): item = indexers.unpack_tuple_and_ellipses(item) @@ -181,7 +199,7 @@ def __getitem__(self, item): r"(`None`) and integer or boolean arrays are valid indices" ) - value = self._pa_array[item] + value = self.pa_data[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) else: @@ -193,7 +211,7 @@ def __getitem__(self, item): def __iter__(self): """Iterate over elements of the array.""" - for value in self._pa_array: + for value in self.pa_data: val = JSONArray._deserialize_json(value.as_py()) if val is None: yield self._dtype.na_value From 94f0aa6add41d3e61037c76d2a481bd3f4285375 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 31 Oct 2024 18:18:17 +0000 Subject: [PATCH 2/6] sets constraints-3.9 for pandas 1.5.3 --- db_dtypes/json.py | 4 +++- testing/constraints-3.9.txt | 6 +++--- tests/unit/test_json.py | 14 ++------------ 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index af0b6e9..6384eee 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -88,7 +88,7 @@ def __init__(self, values) -> None: elif hasattr(self, '_pa_array'): self._pa_array = pa_data else: - raise ValueError(f"Unsupported pandas version: {pd.__version__}") + 
raise NotImplementedError(f"Unsupported pandas version: {pd.__version__}") @classmethod def _box_pa( @@ -164,6 +164,8 @@ def pa_data(self): return self._data elif hasattr(self, '_pa_array'): return self._pa_array + else: + raise NotImplementedError(f"Unsupported pandas version: {pd.__version__}") def _cmp_method(self, other, op): if op.__name__ == "eq": diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index b9ab6bf..bbf7312 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -1,3 +1,3 @@ -# Make sure we test with pandas 1.3.0. The Python version isn't that relevant. -pandas==1.3.0 -numpy<2.0.0 +# Make sure we test with pandas 1.5.3. The Python version isn't that relevant. +pandas==1.5.3 +numpy==1.24.0 \ No newline at end of file diff --git a/tests/unit/test_json.py b/tests/unit/test_json.py index c48635d..77c0a0d 100644 --- a/tests/unit/test_json.py +++ b/tests/unit/test_json.py @@ -78,18 +78,8 @@ def test_getitems_when_iter_with_null(): assert pd.isna(result) -def test_to_numpy(): - s = pd.Series(db_dtypes.JSONArray._from_sequence(JSON_DATA.values())) - data = s.to_numpy() - for id, key in enumerate(JSON_DATA.keys()): - if key == "null": - assert pd.isna(data[id]) - else: - assert data[id] == json.dumps(JSON_DATA[key], sort_keys=True) - - def test_deterministic_json_serialization(): x = {"a": 0, "b": 1} y = {"b": 1, "a": 0} - data = db_dtypes.JSONArray._from_sequence([x]) - assert y in data + data = db_dtypes.JSONArray._from_sequence([y]) + assert data[0] == x From c4103547ee2397c17a09eefb0464830bb2c5dc3f Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 31 Oct 2024 18:22:18 +0000 Subject: [PATCH 3/6] fix test cov --- db_dtypes/json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 6384eee..5829612 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -80,7 +80,7 @@ def __init__(self, values) -> None: elif isinstance(values, pa.ChunkedArray): 
pa_data = values else: - raise ValueError(f"Unsupported type '{type(values)}' for JSONArray") + raise NotImplementedError(f"Unsupported type '{type(values)}' for JSONArray") # Ensures compatibility with pandas version 1.5.3 if hasattr(self, '_data'): From 73e3f8e2f6923d80e9ba13cff1749d2e37580e8d Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 31 Oct 2024 18:24:57 +0000 Subject: [PATCH 4/6] fix format --- db_dtypes/json.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 5829612..a00fe2b 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -80,12 +80,14 @@ def __init__(self, values) -> None: elif isinstance(values, pa.ChunkedArray): pa_data = values else: - raise NotImplementedError(f"Unsupported type '{type(values)}' for JSONArray") - + raise NotImplementedError( + f"Unsupported type '{type(values)}' for JSONArray" + ) + # Ensures compatibility with pandas version 1.5.3 - if hasattr(self, '_data'): + if hasattr(self, "_data"): self._data = pa_data - elif hasattr(self, '_pa_array'): + elif hasattr(self, "_pa_array"): self._pa_array = pa_data else: raise NotImplementedError(f"Unsupported pandas version: {pd.__version__}") @@ -160,9 +162,9 @@ def dtype(self) -> JSONDtype: def pa_data(self): """An instance of stored pa data""" # Ensures compatibility with pandas version 1.5.3 - if hasattr(self, '_data'): + if hasattr(self, "_data"): return self._data - elif hasattr(self, '_pa_array'): + elif hasattr(self, "_pa_array"): return self._pa_array else: raise NotImplementedError(f"Unsupported pandas version: {pd.__version__}") From a157fceb0cf846535de3088037f8778ebca13cb6 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 31 Oct 2024 18:26:21 +0000 Subject: [PATCH 5/6] nit --- testing/constraints-3.9.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index bbf7312..4700825 100644 --- 
a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -1,3 +1,3 @@ # Make sure we test with pandas 1.5.3. The Python version isn't that relevant. pandas==1.5.3 -numpy==1.24.0 \ No newline at end of file +numpy==1.24.0 \ No newline at end of file From f306a408855be4ecf73d4c788ecc734e76301eee Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 31 Oct 2024 18:38:19 +0000 Subject: [PATCH 6/6] fix lint --- tests/unit/test_json.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/test_json.py b/tests/unit/test_json.py index 77c0a0d..365bd8f 100644 --- a/tests/unit/test_json.py +++ b/tests/unit/test_json.py @@ -13,8 +13,6 @@ # limitations under the License. -import json - import pandas as pd import pytest