Bump Numcodecs requirement to 0.6.2 #352

Status: Merged · 15 commits · Dec 4, 2018
5 changes: 5 additions & 0 deletions docs/release.rst
@@ -22,6 +22,11 @@ Enhancements
 Maintenance
 ~~~~~~~~~~~
 
+* The required version of the `numcodecs <http://numcodecs.rtfd.io>`_ package has been upgraded
+  to 0.6.2, which has enabled some code simplification and fixes a failing test involving
+  msgpack encoding. By :user:`John Kirkham <jakirkham>`, :issue:`352`, :issue:`355`,
+  :issue:`324`.
+
 * CI and test environments have been upgraded to include Python 3.7, drop Python 3.4, and
   upgrade all pinned package requirements. :issue:`308`.
 
4 changes: 2 additions & 2 deletions docs/tutorial.rst
@@ -178,8 +178,8 @@ print some diagnostics, e.g.::
 :                  blocksize=0)
 Store type : builtins.dict
 No. bytes : 400000000 (381.5M)
-No. bytes stored : 3242241 (3.1M)
-Storage ratio : 123.4
+No. bytes stored : 3379344 (3.2M)
+Storage ratio : 118.4
 Chunks initialized : 100/100
 
 If you don't specify a compressor, by default Zarr uses the Blosc
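
For context, the diagnostics above come from the tutorial's array example. A hedged reconstruction of that snippet (the shape, chunks and dtype are inferred from the "No. bytes" and "Chunks initialized" values shown; the exact "No. bytes stored" and "Storage ratio" figures depend on the installed zarr/numcodecs/Blosc versions, which is why this diff updates them)::

    import zarr

    z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4')
    z[:] = 42
    # prints a report like the one above, including the "No. bytes stored"
    # and "Storage ratio" lines whose values shift with the compressor version
    print(z.info)
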
2 changes: 1 addition & 1 deletion requirements_dev.txt
@@ -1,3 +1,3 @@
 asciitree==0.3.3
 fasteners==0.14.1
-numcodecs==0.5.5
+numcodecs==0.6.2
2 changes: 1 addition & 1 deletion setup.py
@@ -26,7 +26,7 @@
         'asciitree',
         'numpy>=1.7',
         'fasteners',
-        'numcodecs>=0.5.3',
+        'numcodecs>=0.6.2',
     ],
     package_dir={'': '.'},
     packages=['zarr', 'zarr.tests'],
27 changes: 16 additions & 11 deletions zarr/core.py
@@ -8,6 +8,7 @@
 
 
 import numpy as np
+from numcodecs.compat import ensure_ndarray
 
 
 from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args,
@@ -1743,18 +1744,22 @@ def _decode_chunk(self, cdata):
             for f in self._filters[::-1]:
                 chunk = f.decode(chunk)
 
-        # view as correct dtype
-        if self._dtype == object:
-            if isinstance(chunk, np.ndarray):
-                chunk = chunk.astype(self._dtype)
-            else:
-                raise RuntimeError('cannot read object array without object codec')
-        elif isinstance(chunk, np.ndarray):
+        # view as numpy array with correct dtype
+        chunk = ensure_ndarray(chunk)
+        # special case object dtype, because incorrect handling can lead to
+        # segfaults and other bad things happening
+        if self._dtype != object:
             chunk = chunk.view(self._dtype)
-        else:
-            chunk = np.frombuffer(chunk, dtype=self._dtype)
-
-        # reshape
+        elif chunk.dtype != object:
+            # If we end up here, someone must have hacked around with the filters.
+            # We cannot deal with object arrays unless there is an object
+            # codec in the filter chain, i.e., a filter that converts from object
+            # array to something else during encoding, and converts back to object
+            # array during decoding.
+            raise RuntimeError('cannot read object array without object codec')
+
+        # ensure correct chunk shape
+        chunk = chunk.reshape(-1, order='A')
         chunk = chunk.reshape(self._chunks, order=self._order)
 
         return chunk
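
A minimal sketch of why ensure_ndarray lets the dtype handling collapse into one path (illustrative only, not part of the diff; the '<i4' dtype and the (2, 3) shape are made-up stand-ins for an array's dtype and chunk shape): it wraps any buffer-like payload as a NumPy array without copying, so the same view/reshape sequence works for bytes, bytearray, memoryview and ndarray inputs::

    import numpy as np
    from numcodecs.compat import ensure_ndarray

    raw = bytes(np.arange(6, dtype='<i4'))    # stand-in for a decoded chunk payload
    chunk = ensure_ndarray(raw)               # zero-copy uint8 view of the buffer
    chunk = chunk.view('<i4')                 # reinterpret with the array's dtype
    chunk = chunk.reshape(-1, order='A')      # flatten without copying
    chunk = chunk.reshape((2, 3), order='C')  # then apply the chunk shape
    print(chunk)
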
14 changes: 5 additions & 9 deletions zarr/meta.py
@@ -5,24 +5,20 @@
 
 
 import numpy as np
+from numcodecs.compat import ensure_bytes
 
 
-from zarr.compat import PY2, binary_type, Mapping
+from zarr.compat import PY2, Mapping
 from zarr.errors import MetadataError
 
 
 ZARR_FORMAT = 2
 
 
 def ensure_str(s):
-    if PY2:  # pragma: py3 no cover
-        # noinspection PyUnresolvedReferences
-        if isinstance(s, buffer):  # noqa
-            s = str(s)
-    else:  # pragma: py2 no cover
-        if isinstance(s, memoryview):
-            s = s.tobytes()
-        if isinstance(s, binary_type):
+    if not isinstance(s, str):
+        s = ensure_bytes(s)
+        if not PY2:  # pragma: py2 no cover
             s = s.decode('ascii')
     return s

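A hedged usage sketch of the simplified helper (Python 3 branch only; the function is redefined here purely for illustration): metadata read back from a store may arrive as bytes or a memoryview and is normalised to str before JSON parsing::

    import json
    from numcodecs.compat import ensure_bytes

    def ensure_str(s):
        # mirrors the Python 3 path of the helper above
        if not isinstance(s, str):
            s = ensure_bytes(s).decode('ascii')
        return s

    meta = ensure_str(memoryview(b'{"zarr_format": 2}'))
    print(json.loads(meta)['zarr_format'])   # prints 2
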
30 changes: 5 additions & 25 deletions zarr/storage.py
@@ -31,15 +31,13 @@
 import warnings
 
 
-import numpy as np
-
-
 from zarr.util import (normalize_shape, normalize_chunks, normalize_order,
                        normalize_storage_path, buffer_size,
                        normalize_fill_value, nolock, normalize_dtype)
 from zarr.meta import encode_array_metadata, encode_group_metadata
-from zarr.compat import PY2, binary_type, OrderedDict_move_to_end
+from zarr.compat import PY2, OrderedDict_move_to_end
 from numcodecs.registry import codec_registry
+from numcodecs.compat import ensure_bytes, ensure_contiguous_ndarray
 from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor,
                          err_fspath_exists_notdir, err_read_only, MetadataError)
 
@@ -444,23 +442,6 @@ def _init_group_metadata(store, overwrite=False, path=None, chunk_store=None):
     store[key] = encode_group_metadata(meta)
 
 
-def ensure_bytes(s):
-    if isinstance(s, binary_type):
-        return s
-    if isinstance(s, np.ndarray):
-        if PY2:  # pragma: py3 no cover
-            # noinspection PyArgumentList
-            return s.tostring(order='A')
-        else:  # pragma: py2 no cover
-            # noinspection PyArgumentList
-            return s.tobytes(order='A')
-    if hasattr(s, 'tobytes'):
-        return s.tobytes()
-    if PY2 and hasattr(s, 'tostring'):  # pragma: py3 no cover
-        return s.tostring()
-    return memoryview(s).tobytes()
-
-
 def _dict_store_keys(d, prefix='', cls=dict):
     for k in d.keys():
         v = d[k]
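
The local ensure_bytes helper removed above is superseded by the version imported from numcodecs.compat. A small sketch of the cases it is assumed to cover (assumption based on numcodecs 0.6.2, not spelled out in the diff)::

    import numpy as np
    from numcodecs.compat import ensure_bytes

    print(ensure_bytes(b'abc'))                    # bytes pass through unchanged
    print(ensure_bytes(memoryview(b'xyz')))        # buffer-like objects are copied to bytes
    print(ensure_bytes(np.arange(3, dtype='u1')))  # arrays are serialised via their buffer
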
@@ -741,9 +722,8 @@ def __getitem__(self, key):
 
     def __setitem__(self, key, value):
 
-        # handle F-contiguous numpy arrays
-        if isinstance(value, np.ndarray) and value.flags.f_contiguous:
-            value = ensure_bytes(value)
+        # coerce to flat, contiguous array (ideally without copying)
+        value = ensure_contiguous_ndarray(value)
 
         # destination path for key
        file_path = os.path.join(self.path, key)
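
The behaviour relied on here is an assumption based on numcodecs 0.6.2 rather than something stated in the diff: ensure_contiguous_ndarray exposes any buffer-like value as a flat ndarray without copying, flattening C- or F-contiguous arrays in 'A' order and rejecting non-contiguous input, so the old F-contiguous special case and its byte copy are no longer needed::

    import numpy as np
    from numcodecs.compat import ensure_contiguous_ndarray

    c = np.arange(6, dtype='<i4').reshape(2, 3)  # C-contiguous
    f = np.asfortranarray(c)                     # F-contiguous copy of the same data
    flat = ensure_contiguous_ndarray(f)          # flat view over f's memory, no copy
    print(flat.shape, flat.tobytes() == f.tobytes(order='A'))   # (6,) True
    print(ensure_contiguous_ndarray(b'abc'))     # bytes are wrapped as a uint8 array
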
@@ -1192,7 +1172,7 @@ def __getitem__(self, key):
     def __setitem__(self, key, value):
         if self.mode == 'r':
             err_read_only()
-        value = ensure_bytes(value)
+        value = ensure_contiguous_ndarray(value)
         with self.mutex:
             self.zf.writestr(key, value)
 
2 changes: 1 addition & 1 deletion zarr/tests/test_core.py
@@ -982,7 +982,7 @@ def test_object_arrays(self):
         z[0] = 'foo'
         assert z[0] == 'foo'
         z[1] = b'bar'
-        assert z[1] == 'bar'  # msgpack gets this wrong
+        assert z[1] == b'bar'
         z[2] = 1
         assert z[2] == 1
         z[3] = [2, 4, 6, 'baz']
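
The assertion change reflects the msgpack fix mentioned in the release note: with numcodecs 0.6.2 the object codec round-trips bytes and str distinctly, so b'bar' no longer comes back as 'bar'. A hedged illustration outside of zarr itself (assumes the MsgPack codec and the msgpack package are available; the test constructs the array through a zarr object array instead)::

    import numpy as np
    from numcodecs import MsgPack

    codec = MsgPack()
    arr = np.array(['foo', b'bar', 1], dtype=object)
    out = codec.decode(codec.encode(arr))
    print(out[1], out[1] == b'bar')   # expected: b'bar' True with numcodecs >= 0.6.2
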
16 changes: 3 additions & 13 deletions zarr/util.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import, print_function, division
-import operator
 from textwrap import TextWrapper, dedent
 import numbers
 import uuid
@@ -10,10 +9,11 @@
 from asciitree import BoxStyle, LeftAligned
 from asciitree.traversal import Traversal
 import numpy as np
+from numcodecs.compat import ensure_ndarray
 from numcodecs.registry import codec_registry
 
 
-from zarr.compat import PY2, reduce, text_type, binary_type
+from zarr.compat import PY2, text_type, binary_type
 
 
 # codecs to use for object dtype convenience API
@@ -314,17 +314,7 @@ def normalize_storage_path(path):
 
 
 def buffer_size(v):
-    from array import array as _stdlib_array
-    if PY2 and isinstance(v, _stdlib_array):  # pragma: py3 no cover
-        # special case array.array because does not support buffer
-        # interface in PY2
-        return v.buffer_info()[1] * v.itemsize
-    else:  # pragma: py2 no cover
-        v = memoryview(v)
-        if v.shape:
-            return reduce(operator.mul, v.shape) * v.itemsize
-        else:
-            return v.itemsize
+    return ensure_ndarray(v).nbytes


def info_text_report(items):
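
A quick sketch of what the simplified helper computes (illustrative only): ensure_ndarray gives a no-copy ndarray view of a buffer-like object, so .nbytes replaces the manual memoryview and array.array size arithmetic::

    import array
    import numpy as np
    from numcodecs.compat import ensure_ndarray

    print(ensure_ndarray(b'abcd').nbytes)                        # 4
    print(ensure_ndarray(np.zeros((3, 5), dtype='f8')).nbytes)   # 120
    print(ensure_ndarray(array.array('d', [1.0, 2.0])).nbytes)   # 16
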