diff --git a/docs/release.rst b/docs/release.rst index 96ac7c8f2f..335ec58fb3 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -22,6 +22,11 @@ Enhancements Maintenance ~~~~~~~~~~~ +* The required version of the `numcodecs <http://numcodecs.rtfd.io>`_ package has been upgraded + to 0.6.2, which has enabled some code simplification and fixes a failing test involving + msgpack encoding. By :user:`John Kirkham <jakirkham>`, :issue:`352`, :issue:`355`, + :issue:`324`. + * CI and test environments have been upgraded to include Python 3.7, drop Python 3.4, and upgrade all pinned package requirements. :issue:`308`. diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 606b5acef5..29ce8b0935 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -178,8 +178,8 @@ print some diagnostics, e.g.:: : blocksize=0) Store type : builtins.dict No. bytes : 400000000 (381.5M) - No. bytes stored : 3242241 (3.1M) - Storage ratio : 123.4 + No. bytes stored : 3379344 (3.2M) + Storage ratio : 118.4 Chunks initialized : 100/100 If you don't specify a compressor, by default Zarr uses the Blosc diff --git a/requirements_dev.txt b/requirements_dev.txt index 2ad18f372c..03eaa8e871 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,3 +1,3 @@ asciitree==0.3.3 fasteners==0.14.1 -numcodecs==0.5.5 +numcodecs==0.6.2 diff --git a/setup.py b/setup.py index a5e8334e43..b6d237fe0a 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ 'asciitree', 'numpy>=1.7', 'fasteners', - 'numcodecs>=0.5.3', + 'numcodecs>=0.6.2', ], package_dir={'': '.'}, packages=['zarr', 'zarr.tests'], diff --git a/zarr/core.py b/zarr/core.py index 97d1bdc0f8..65bfff3cbb 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -8,6 +8,7 @@ import numpy as np +from numcodecs.compat import ensure_ndarray from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args, @@ -1743,18 +1744,22 @@ def _decode_chunk(self, cdata): for f in self._filters[::-1]: chunk = f.decode(chunk) - # view as correct dtype - if self._dtype == object: - if 
isinstance(chunk, np.ndarray): - chunk = chunk.astype(self._dtype) - else: - raise RuntimeError('cannot read object array without object codec') - elif isinstance(chunk, np.ndarray): + # view as numpy array with correct dtype + chunk = ensure_ndarray(chunk) + # special case object dtype, because incorrect handling can lead to + # segfaults and other bad things happening + if self._dtype != object: chunk = chunk.view(self._dtype) - else: - chunk = np.frombuffer(chunk, dtype=self._dtype) - - # reshape + elif chunk.dtype != object: + # If we end up here, someone must have hacked around with the filters. + # We cannot deal with object arrays unless there is an object + # codec in the filter chain, i.e., a filter that converts from object + # array to something else during encoding, and converts back to object + # array during decoding. + raise RuntimeError('cannot read object array without object codec') + + # ensure correct chunk shape + chunk = chunk.reshape(-1, order='A') chunk = chunk.reshape(self._chunks, order=self._order) return chunk diff --git a/zarr/meta.py b/zarr/meta.py index 9ce580eff2..7984efb701 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -5,9 +5,10 @@ import numpy as np +from numcodecs.compat import ensure_bytes -from zarr.compat import PY2, binary_type, Mapping +from zarr.compat import PY2, Mapping from zarr.errors import MetadataError @@ -15,14 +16,9 @@ def ensure_str(s): - if PY2: # pragma: py3 no cover - # noinspection PyUnresolvedReferences - if isinstance(s, buffer): # noqa - s = str(s) - else: # pragma: py2 no cover - if isinstance(s, memoryview): - s = s.tobytes() - if isinstance(s, binary_type): + if not isinstance(s, str): + s = ensure_bytes(s) + if not PY2: # pragma: py2 no cover s = s.decode('ascii') return s diff --git a/zarr/storage.py b/zarr/storage.py index 65afc58539..82ba1d308a 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -31,15 +31,13 @@ import warnings -import numpy as np - - from zarr.util import (normalize_shape, 
normalize_chunks, normalize_order, normalize_storage_path, buffer_size, normalize_fill_value, nolock, normalize_dtype) from zarr.meta import encode_array_metadata, encode_group_metadata -from zarr.compat import PY2, binary_type, OrderedDict_move_to_end +from zarr.compat import PY2, OrderedDict_move_to_end from numcodecs.registry import codec_registry +from numcodecs.compat import ensure_bytes, ensure_contiguous_ndarray from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor, err_fspath_exists_notdir, err_read_only, MetadataError) @@ -444,23 +442,6 @@ def _init_group_metadata(store, overwrite=False, path=None, chunk_store=None): store[key] = encode_group_metadata(meta) -def ensure_bytes(s): - if isinstance(s, binary_type): - return s - if isinstance(s, np.ndarray): - if PY2: # pragma: py3 no cover - # noinspection PyArgumentList - return s.tostring(order='A') - else: # pragma: py2 no cover - # noinspection PyArgumentList - return s.tobytes(order='A') - if hasattr(s, 'tobytes'): - return s.tobytes() - if PY2 and hasattr(s, 'tostring'): # pragma: py3 no cover - return s.tostring() - return memoryview(s).tobytes() - - def _dict_store_keys(d, prefix='', cls=dict): for k in d.keys(): v = d[k] @@ -741,9 +722,8 @@ def __getitem__(self, key): def __setitem__(self, key, value): - # handle F-contiguous numpy arrays - if isinstance(value, np.ndarray) and value.flags.f_contiguous: - value = ensure_bytes(value) + # coerce to flat, contiguous array (ideally without copying) + value = ensure_contiguous_ndarray(value) # destination path for key file_path = os.path.join(self.path, key) @@ -1192,7 +1172,7 @@ def __getitem__(self, key): def __setitem__(self, key, value): if self.mode == 'r': err_read_only() - value = ensure_bytes(value) + value = ensure_contiguous_ndarray(value) with self.mutex: self.zf.writestr(key, value) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 11891f8fe9..544ec95c41 100644 --- a/zarr/tests/test_core.py +++ 
b/zarr/tests/test_core.py @@ -982,7 +982,7 @@ def test_object_arrays(self): z[0] = 'foo' assert z[0] == 'foo' z[1] = b'bar' - assert z[1] == 'bar' # msgpack gets this wrong + assert z[1] == b'bar' z[2] = 1 assert z[2] == 1 z[3] = [2, 4, 6, 'baz'] diff --git a/zarr/util.py b/zarr/util.py index b79865bfe8..ad882c41d5 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, division -import operator from textwrap import TextWrapper, dedent import numbers import uuid @@ -10,10 +9,11 @@ from asciitree import BoxStyle, LeftAligned from asciitree.traversal import Traversal import numpy as np +from numcodecs.compat import ensure_ndarray from numcodecs.registry import codec_registry -from zarr.compat import PY2, reduce, text_type, binary_type +from zarr.compat import PY2, text_type, binary_type # codecs to use for object dtype convenience API @@ -314,17 +314,7 @@ def normalize_storage_path(path): def buffer_size(v): - from array import array as _stdlib_array - if PY2 and isinstance(v, _stdlib_array): # pragma: py3 no cover - # special case array.array because does not support buffer - # interface in PY2 - return v.buffer_info()[1] * v.itemsize - else: # pragma: py2 no cover - v = memoryview(v) - if v.shape: - return reduce(operator.mul, v.shape) * v.itemsize - else: - return v.itemsize + return ensure_ndarray(v).nbytes def info_text_report(items):