Skip to content

Commit a7546b7

Browse files
authored
Coerce data to text for JSON parsing (#429)
* Rewrite `ensure_str` to be `ensure_text_type`: To simplify the branching required for Python 2/3 compatibility, rename `ensure_str` to `ensure_text_type` and rework the code to coerce data that is `bytes` or `bytes`-like to `bytes` and then to text data. It appears JSON on Python 2 or Python 3 handles this just fine. This should make handling these two cases a bit more straightforward. * Use `ensure_text_type` elsewhere in `meta` * Rework `convenience` to use `ensure_text_type` * Use `ensure_text_type` in `n5` * Use `ensure_text_type` in attribute tests * Drop Python 2 workaround for `bson.Binary`: `MongoDBStore` inherited the behavior of `pymongo` with respect to returning `bson.Binary` for blob values on Python 2. As this caused some issues on Python 2 when parsing JSON content (as the parser was unable to work with objects that were not `bytes` type, i.e. `bson.Binary`), a workaround was needed to coerce `bson.Binary` to `bytes` on Python 2. It's worth noting that this workaround is not needed for loading binary data from chunks as we use the buffer protocol there. As we have now fixed our handling of JSON data to coerce data to text on Python 2/3 and leverage the buffer protocol in the effort, we no longer need this workaround in `MongoDBStore`. Hence we go ahead and drop it. * Simplify `MongoDBStore`'s `__getitem__`'s `return` * Drop unused import of `binary_type` in `storage` * Add a helper function for loading JSON: Much as we have a helper function for writing JSON, this adds a helper function for loading JSON. Mainly it ensures data is coerced to text before handing it off to the JSON parser. This should simplify code that is loading JSON. * Rewrite code to use `json_loads` directly: Changes other library code to use `json_loads` for handling text encoding and JSON parsing. This should simplify things a bit and avoid having some errors sneak in. * Note JSON changes in release notes
1 parent f6ced1e commit a7546b7

File tree

6 files changed

+37
-37
lines changed

6 files changed

+37
-37
lines changed

docs/release.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,18 @@
11
Release notes
22
=============
33

4+
.. _release_2.3.2:
5+
6+
2.3.2
7+
-----
8+
9+
Bug fixes
10+
~~~~~~~~~
11+
12+
* Coerce data to text for JSON parsing.
13+
By :user:`John Kirkham <jakirkham>`; :issue:`429`
14+
15+
416
.. _release_2.3.1:
517

618
2.3.1

zarr/convenience.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from zarr.errors import err_path_not_found, CopyError
1616
from zarr.util import normalize_storage_path, TreeViewer, buffer_size
1717
from zarr.compat import PY2, text_type
18-
from zarr.meta import ensure_str, json_dumps
18+
from zarr.meta import json_dumps, json_loads
1919

2020

2121
# noinspection PyShadowingBuiltins
@@ -1112,8 +1112,6 @@ def consolidate_metadata(store, metadata_key='.zmetadata'):
11121112
open_consolidated
11131113
11141114
"""
1115-
import json
1116-
11171115
store = normalize_store_arg(store)
11181116

11191117
def is_zarr_key(key):
@@ -1123,7 +1121,7 @@ def is_zarr_key(key):
11231121
out = {
11241122
'zarr_consolidated_format': 1,
11251123
'metadata': {
1126-
key: json.loads(ensure_str(store[key]))
1124+
key: json_loads(store[key])
11271125
for key in store if is_zarr_key(key)
11281126
}
11291127
}

zarr/meta.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,24 @@
22
from __future__ import absolute_import, print_function, division
33
import json
44
import base64
5+
import codecs
56

67

78
import numpy as np
8-
from numcodecs.compat import ensure_bytes
9+
from numcodecs.compat import ensure_contiguous_ndarray
910

1011

11-
from zarr.compat import PY2, Mapping
12+
from zarr.compat import PY2, Mapping, text_type
1213
from zarr.errors import MetadataError
1314

1415

1516
ZARR_FORMAT = 2
1617

1718

18-
def ensure_str(s):
19-
if not isinstance(s, str):
20-
s = ensure_bytes(s)
21-
if not PY2: # pragma: py2 no cover
22-
s = s.decode('ascii')
19+
def ensure_text_type(s):
20+
if not isinstance(s, text_type):
21+
s = ensure_contiguous_ndarray(s)
22+
s = codecs.decode(s, 'ascii')
2323
return s
2424

2525

@@ -29,6 +29,11 @@ def json_dumps(o):
2929
separators=(',', ': '))
3030

3131

32+
def json_loads(s):
33+
"""Read JSON in a consistent way."""
34+
return json.loads(ensure_text_type(s))
35+
36+
3237
def parse_metadata(s):
3338

3439
# Here we allow that a store may return an already-parsed metadata object,
@@ -42,8 +47,7 @@ def parse_metadata(s):
4247

4348
else:
4449
# assume metadata needs to be parsed as JSON
45-
s = ensure_str(s)
46-
meta = json.loads(s)
50+
meta = json_loads(s)
4751

4852
return meta
4953

zarr/n5.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"""This module contains a storage class and codec to support the N5 format.
33
"""
44
from __future__ import absolute_import, division
5-
from .meta import ZARR_FORMAT, ensure_str, json_dumps
5+
from .meta import ZARR_FORMAT, json_dumps, json_loads
66
from .storage import (
77
NestedDirectoryStore,
88
group_meta_key as zarr_group_meta_key,
@@ -12,7 +12,6 @@
1212
from numcodecs.abc import Codec
1313
from numcodecs.compat import ndarray_copy
1414
from numcodecs.registry import register_codec, get_codec
15-
import json
1615
import numpy as np
1716
import struct
1817
import sys
@@ -103,29 +102,26 @@ def __setitem__(self, key, value):
103102

104103
key = key.replace(zarr_group_meta_key, n5_attrs_key)
105104

106-
value = ensure_str(value)
107105
n5_attrs = self._load_n5_attrs(key)
108-
n5_attrs.update(**group_metadata_to_n5(json.loads(value)))
106+
n5_attrs.update(**group_metadata_to_n5(json_loads(value)))
109107

110108
value = json_dumps(n5_attrs).encode('ascii')
111109

112110
elif key.endswith(zarr_array_meta_key):
113111

114112
key = key.replace(zarr_array_meta_key, n5_attrs_key)
115113

116-
value = ensure_str(value)
117114
n5_attrs = self._load_n5_attrs(key)
118-
n5_attrs.update(**array_metadata_to_n5(json.loads(value)))
115+
n5_attrs.update(**array_metadata_to_n5(json_loads(value)))
119116

120117
value = json_dumps(n5_attrs).encode('ascii')
121118

122119
elif key.endswith(zarr_attrs_key):
123120

124121
key = key.replace(zarr_attrs_key, n5_attrs_key)
125122

126-
value = ensure_str(value)
127123
n5_attrs = self._load_n5_attrs(key)
128-
zarr_attrs = json.loads(value)
124+
zarr_attrs = json_loads(value)
129125

130126
for k in n5_keywords:
131127
if k in zarr_attrs.keys():
@@ -246,8 +242,7 @@ def listdir(self, path=None):
246242
def _load_n5_attrs(self, path):
247243
try:
248244
s = super(N5Store, self).__getitem__(path)
249-
s = ensure_str(s)
250-
return json.loads(s)
245+
return json_loads(s)
251246
except KeyError:
252247
return {}
253248

zarr/storage.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
normalize_storage_path, buffer_size,
3939
normalize_fill_value, nolock, normalize_dtype)
4040
from zarr.meta import encode_array_metadata, encode_group_metadata
41-
from zarr.compat import PY2, OrderedDict_move_to_end, binary_type
41+
from zarr.compat import PY2, OrderedDict_move_to_end
4242
from numcodecs.registry import codec_registry
4343
from numcodecs.compat import ensure_bytes, ensure_contiguous_ndarray
4444
from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor,
@@ -2296,15 +2296,7 @@ def __getitem__(self, key):
22962296
if doc is None:
22972297
raise KeyError(key)
22982298
else:
2299-
value = doc[self._value]
2300-
2301-
# Coerce `bson.Binary` to `bytes` type on Python 2.
2302-
# PyMongo handles this conversion for us on Python 3.
2303-
# ref: http://api.mongodb.com/python/current/python3.html#id3
2304-
if PY2: # pragma: py3 no cover
2305-
value = binary_type(value)
2306-
2307-
return value
2299+
return doc[self._value]
23082300

23092301
def __setitem__(self, key, value):
23102302
value = ensure_bytes(value)

zarr/tests/test_core.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import unittest
44
from tempfile import mkdtemp, mktemp
55
import atexit
6-
import json
76
import shutil
87
import pickle
98
import os
@@ -26,7 +25,7 @@
2625
from zarr.core import Array
2726
from zarr.errors import PermissionError
2827
from zarr.compat import PY2, text_type, binary_type, zip_longest
29-
from zarr.meta import ensure_str
28+
from zarr.meta import json_loads
3029
from zarr.util import buffer_size
3130
from zarr.n5 import n5_keywords, N5Store
3231
from numcodecs import (Delta, FixedScaleOffset, LZ4, GZip, Zlib, Blosc, BZ2, MsgPack, Pickle,
@@ -1273,10 +1272,10 @@ def test_endian(self):
12731272
def test_attributes(self):
12741273
a = self.create_array(shape=10, chunks=10, dtype='i8')
12751274
a.attrs['foo'] = 'bar'
1276-
attrs = json.loads(ensure_str(a.store[a.attrs.key]))
1275+
attrs = json_loads(a.store[a.attrs.key])
12771276
assert 'foo' in attrs and attrs['foo'] == 'bar'
12781277
a.attrs['bar'] = 'foo'
1279-
attrs = json.loads(ensure_str(a.store[a.attrs.key]))
1278+
attrs = json_loads(a.store[a.attrs.key])
12801279
assert 'foo' in attrs and attrs['foo'] == 'bar'
12811280
assert 'bar' in attrs and attrs['bar'] == 'foo'
12821281

0 commit comments

Comments
 (0)