Skip to content

Coerce data to text for JSON parsing #429

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Apr 19, 2019
4 changes: 2 additions & 2 deletions zarr/convenience.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from zarr.errors import err_path_not_found, CopyError
from zarr.util import normalize_storage_path, TreeViewer, buffer_size
from zarr.compat import PY2, text_type
from zarr.meta import ensure_str, json_dumps
from zarr.meta import ensure_text_type, json_dumps


# noinspection PyShadowingBuiltins
Expand Down Expand Up @@ -1123,7 +1123,7 @@ def is_zarr_key(key):
out = {
'zarr_consolidated_format': 1,
'metadata': {
key: json.loads(ensure_str(store[key]))
key: json.loads(ensure_text_type(store[key]))
for key in store if is_zarr_key(key)
}
}
Expand Down
16 changes: 8 additions & 8 deletions zarr/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,24 @@
from __future__ import absolute_import, print_function, division
import json
import base64
import codecs


import numpy as np
from numcodecs.compat import ensure_bytes
from numcodecs.compat import ensure_contiguous_ndarray


from zarr.compat import PY2, Mapping
from zarr.compat import PY2, Mapping, text_type
from zarr.errors import MetadataError


ZARR_FORMAT = 2


def ensure_str(s):
if not isinstance(s, str):
s = ensure_bytes(s)
if not PY2: # pragma: py2 no cover
s = s.decode('ascii')
def ensure_text_type(s):
if not isinstance(s, text_type):
s = ensure_contiguous_ndarray(s)
s = codecs.decode(s, 'ascii')
return s


Expand All @@ -42,7 +42,7 @@ def parse_metadata(s):

else:
# assume metadata needs to be parsed as JSON
s = ensure_str(s)
s = ensure_text_type(s)
meta = json.loads(s)

return meta
Expand Down
10 changes: 5 additions & 5 deletions zarr/n5.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""This module contains a storage class and codec to support the N5 format.
"""
from __future__ import absolute_import, division
from .meta import ZARR_FORMAT, ensure_str, json_dumps
from .meta import ZARR_FORMAT, ensure_text_type, json_dumps
from .storage import (
NestedDirectoryStore,
group_meta_key as zarr_group_meta_key,
Expand Down Expand Up @@ -103,7 +103,7 @@ def __setitem__(self, key, value):

key = key.replace(zarr_group_meta_key, n5_attrs_key)

value = ensure_str(value)
value = ensure_text_type(value)
n5_attrs = self._load_n5_attrs(key)
n5_attrs.update(**group_metadata_to_n5(json.loads(value)))

Expand All @@ -113,7 +113,7 @@ def __setitem__(self, key, value):

key = key.replace(zarr_array_meta_key, n5_attrs_key)

value = ensure_str(value)
value = ensure_text_type(value)
n5_attrs = self._load_n5_attrs(key)
n5_attrs.update(**array_metadata_to_n5(json.loads(value)))

Expand All @@ -123,7 +123,7 @@ def __setitem__(self, key, value):

key = key.replace(zarr_attrs_key, n5_attrs_key)

value = ensure_str(value)
value = ensure_text_type(value)
n5_attrs = self._load_n5_attrs(key)
zarr_attrs = json.loads(value)

Expand Down Expand Up @@ -246,7 +246,7 @@ def listdir(self, path=None):
def _load_n5_attrs(self, path):
try:
s = super(N5Store, self).__getitem__(path)
s = ensure_str(s)
s = ensure_text_type(s)
return json.loads(s)
except KeyError:
return {}
Expand Down
12 changes: 2 additions & 10 deletions zarr/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
normalize_storage_path, buffer_size,
normalize_fill_value, nolock, normalize_dtype)
from zarr.meta import encode_array_metadata, encode_group_metadata
from zarr.compat import PY2, OrderedDict_move_to_end, binary_type
from zarr.compat import PY2, OrderedDict_move_to_end
from numcodecs.registry import codec_registry
from numcodecs.compat import ensure_bytes, ensure_contiguous_ndarray
from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor,
Expand Down Expand Up @@ -2296,15 +2296,7 @@ def __getitem__(self, key):
if doc is None:
raise KeyError(key)
else:
value = doc[self._value]

# Coerce `bson.Binary` to `bytes` type on Python 2.
# PyMongo handles this conversion for us on Python 3.
# ref: http://api.mongodb.com/python/current/python3.html#id3
if PY2: # pragma: py3 no cover
value = binary_type(value)

return value
return doc[self._value]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes me very happy to see.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Likewise. 🙂

FWIW it turns out this is not Python 2 specific. Previously we only decoded before parsing JSON on Python 3 (hence avoiding the issue there). With this change we always decode to text before parsing JSON. Here's a short reproducer.

>>> import json
>>> json.loads(b"{}")
{}
>>> json.loads(b"{\x00}")
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Users/jkirkham/miniconda/lib/python3.7/json/__init__.py", line 348, in loads
    return _default_decoder.decode(s)
  File "/Users/jkirkham/miniconda/lib/python3.7/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/Users/jkirkham/miniconda/lib/python3.7/json/decoder.py", line 353, in raw_decode
    obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)


def __setitem__(self, key, value):
value = ensure_bytes(value)
Expand Down
6 changes: 3 additions & 3 deletions zarr/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from zarr.core import Array
from zarr.errors import PermissionError
from zarr.compat import PY2, text_type, binary_type, zip_longest
from zarr.meta import ensure_str
from zarr.meta import ensure_text_type
from zarr.util import buffer_size
from zarr.n5 import n5_keywords, N5Store
from numcodecs import (Delta, FixedScaleOffset, LZ4, GZip, Zlib, Blosc, BZ2, MsgPack, Pickle,
Expand Down Expand Up @@ -1273,10 +1273,10 @@ def test_endian(self):
def test_attributes(self):
a = self.create_array(shape=10, chunks=10, dtype='i8')
a.attrs['foo'] = 'bar'
attrs = json.loads(ensure_str(a.store[a.attrs.key]))
attrs = json.loads(ensure_text_type(a.store[a.attrs.key]))
assert 'foo' in attrs and attrs['foo'] == 'bar'
a.attrs['bar'] = 'foo'
attrs = json.loads(ensure_str(a.store[a.attrs.key]))
attrs = json.loads(ensure_text_type(a.store[a.attrs.key]))
assert 'foo' in attrs and attrs['foo'] == 'bar'
assert 'bar' in attrs and attrs['bar'] == 'foo'

Expand Down