Skip to content

Fix structured arrays that contain objects #806 #813

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Aug 30, 2021
Merged
13 changes: 11 additions & 2 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,17 @@ Release notes
Unreleased
----------

.. _release_2.9.4:

2.9.4
-----

Bug fixes
~~~~~~~~~

* Fix structured arrays that contain objects.
By :user:`Attila Bergou <abergou>`; :issue:`806`.

.. _release_2.9.3:

2.9.3
Expand All @@ -31,7 +42,6 @@ Maintenance
* Correct conda-forge deployment of Zarr by fixing some Zarr tests.
By :user:`Ben Williams <benjaminhwilliams>`; :issue:`821`.


.. _release_2.9.1:

2.9.1
Expand Down Expand Up @@ -92,7 +102,6 @@ Maintenance
* TST: add missing assert in test_hexdigest.
By :user:`Greggory Lee <grlee77>`; :issue:`801`.


.. _release_2.8.3:

2.8.3
Expand Down
1 change: 1 addition & 0 deletions requirements_dev_optional.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ ipytree==0.2.1
azure-storage-blob==12.8.1 # pyup: ignore
redis==3.5.3
types-redis
types-setuptools
pymongo==3.12.0
# optional test requirements
tox==3.24.3
Expand Down
2 changes: 1 addition & 1 deletion zarr/codecs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# flake8: noqa
from numcodecs import *
from numcodecs import get_codec, Blosc, Zlib, Delta, AsType, BZ2
from numcodecs import get_codec, Blosc, Pickle, Zlib, Delta, AsType, BZ2
from numcodecs.registry import codec_registry
38 changes: 31 additions & 7 deletions zarr/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,14 @@ def decode_array_metadata(s: Union[MappingType, str]) -> MappingType[str, Any]:
# extract array metadata fields
try:
dtype = decode_dtype(meta['dtype'])
fill_value = decode_fill_value(meta['fill_value'], dtype)

if dtype.hasobject:
import numcodecs
object_codec = numcodecs.get_codec(meta['filters'][0])
else:
object_codec = None

fill_value = decode_fill_value(meta['fill_value'], dtype, object_codec)
meta = dict(
zarr_format=meta['zarr_format'],
shape=tuple(meta['shape']),
Expand All @@ -66,14 +73,18 @@ def encode_array_metadata(meta: MappingType[str, Any]) -> bytes:
dtype, sdshape = dtype.subdtype

dimension_separator = meta.get('dimension_separator')

if dtype.hasobject:
import numcodecs
object_codec = numcodecs.get_codec(meta['filters'][0])
else:
object_codec = None
meta = dict(
zarr_format=ZARR_FORMAT,
shape=meta['shape'] + sdshape,
chunks=meta['chunks'],
dtype=encode_dtype(dtype),
compressor=meta['compressor'],
fill_value=encode_fill_value(meta['fill_value'], dtype),
fill_value=encode_fill_value(meta['fill_value'], dtype, object_codec),
order=meta['order'],
filters=meta['filters'],
)
Expand Down Expand Up @@ -132,10 +143,17 @@ def encode_group_metadata(meta=None) -> bytes:
}


def decode_fill_value(v, dtype):
def decode_fill_value(v, dtype, object_codec=None):
# early out
if v is None:
return v
if dtype.kind == 'V' and dtype.hasobject:
if object_codec is None:
raise ValueError('missing object_codec for object array')
v = base64.standard_b64decode(v)
v = object_codec.decode(v)
v = np.array(v, dtype=dtype)[()]
return v
if dtype.kind == 'f':
if v == 'NaN':
return np.nan
Expand Down Expand Up @@ -171,10 +189,16 @@ def decode_fill_value(v, dtype):
return np.array(v, dtype=dtype)[()]


def encode_fill_value(v: Any, dtype: np.dtype) -> Any:
def encode_fill_value(v: Any, dtype: np.dtype, object_codec: Any = None) -> Any:
# early out
if v is None:
return v
if dtype.kind == 'V' and dtype.hasobject:
if object_codec is None:
raise ValueError('missing object_codec for object array')
v = object_codec.encode(v)
v = str(base64.standard_b64encode(v), 'ascii')
return v
if dtype.kind == 'f':
if np.isnan(v):
return 'NaN'
Expand All @@ -190,8 +214,8 @@ def encode_fill_value(v: Any, dtype: np.dtype) -> Any:
return bool(v)
elif dtype.kind in 'c':
c = cast(np.complex128, np.dtype(complex).type())
v = (encode_fill_value(v.real, c.real.dtype),
encode_fill_value(v.imag, c.imag.dtype))
v = (encode_fill_value(v.real, c.real.dtype, object_codec),
encode_fill_value(v.imag, c.imag.dtype, object_codec))
return v
elif dtype.kind in 'SV':
v = str(base64.standard_b64encode(v), 'ascii')
Expand Down
2 changes: 1 addition & 1 deletion zarr/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,7 @@ def _init_array_metadata(
filters_config = []

# deal with object encoding
if dtype == object:
if dtype.hasobject:
if object_codec is None:
if not filters:
# there are no filters so we can be sure there is no object codec
Expand Down
59 changes: 59 additions & 0 deletions zarr/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from numcodecs.compat import ensure_bytes, ensure_ndarray
from numcodecs.tests.common import greetings
from numpy.testing import assert_array_almost_equal, assert_array_equal
from pkg_resources import parse_version

from zarr.core import Array
from zarr.meta import json_loads
Expand Down Expand Up @@ -1362,6 +1363,44 @@ def test_object_codec_warnings(self):
if hasattr(z.store, 'close'):
z.store.close()

@unittest.skipIf(parse_version(np.__version__) < parse_version('1.14.0'),
                 "unsupported numpy version")
def test_structured_array_contain_object(self):
    # Exercises creation, whole-array round trip and item/field indexing of a
    # structured array whose dtype has an object field.

    # test classes whose name contains "PartialRead" opt out
    if "PartialRead" in type(self).__name__:
        pytest.skip("partial reads of object arrays not supported")

    # ----------- creation --------------

    record_dtype = [('c_obj', object), ('c_int', int)]
    source = np.array([(b'aaa', 1), (b'bbb', 2)], dtype=record_dtype)

    # without an object codec, creating the array must be rejected
    with pytest.raises(ValueError):
        self.create_array(shape=source.shape, dtype=record_dtype)

    # with a codec, creation succeeds and the numpy data can be written
    stored = self.create_array(shape=source.shape,
                               dtype=record_dtype,
                               object_codec=Pickle())
    stored[:] = source

    # what we read back must match what we wrote
    assert_array_equal(source, stored[:])

    # ---------- indexing ---------------

    assert stored[0] == source[0]

    # single-item and slice assignment
    stored[0] = (b'ccc', 3)
    # TODO: the slice assignment below does not work with a plain list
    stored[1:2] = np.array([(b'ddd', 4)], dtype=record_dtype)
    after_rows = np.array([(b'ccc', 3), (b'ddd', 4)], dtype=record_dtype)
    assert_array_equal(stored[:], after_rows)

    # field assignment, for the whole field and for one element of it
    stored['c_obj'] = [b'eee', b'fff']
    stored['c_obj', 0] = b'ggg'
    after_fields = np.array([(b'ggg', 3), (b'fff', 4)], dtype=record_dtype)
    assert_array_equal(stored[:], after_fields)
    assert stored['c_obj', 0] == b'ggg'
    assert stored[1, 'c_int'] == 4

def test_iteration_exceptions(self):
# zero d array
a = np.array(1, dtype=int)
Expand Down Expand Up @@ -1490,6 +1529,14 @@ def test_attributes(self):
if hasattr(a.store, 'close'):
a.store.close()

def test_structured_with_object(self):
    # Create a structured array with a float field and an object field,
    # giving it a tuple fill value, then check what the first element
    # reads back as.
    record_dtype = [('x', float), ('y', object)]
    arr = self.create_array(shape=10,
                            chunks=10,
                            dtype=record_dtype,
                            fill_value=(0.0, None),
                            object_codec=Pickle())
    first = tuple(arr[0])
    assert first == (0.0, None)


class TestArrayWithPath(TestArray):

Expand Down Expand Up @@ -1893,6 +1940,14 @@ def test_object_arrays_danger(self):
# Cannot hacking out object codec as N5 doesn't allow object codecs
pass

def test_structured_with_object(self):
    # Intentionally a no-op override: the object codec cannot be hacked out
    # here because N5 doesn't allow object codecs.
    pass

def test_structured_array_contain_object(self):
    # Intentionally a no-op override: the object codec cannot be hacked out
    # here because N5 doesn't allow object codecs.
    pass

def test_attrs_n5_keywords(self):
z = self.create_array(shape=(1050,), chunks=100, dtype='i4')
for k in n5_keywords:
Expand Down Expand Up @@ -2326,6 +2381,10 @@ def test_object_arrays_danger(self):
# skip this one, cannot use delta with objects
pass

def test_structured_array_contain_object(self):
    # Intentionally a no-op override: skip this one, since delta cannot be
    # used on a structured array.
    pass


# custom store, does not support getsize()
class CustomMapping(object):
Expand Down
71 changes: 69 additions & 2 deletions zarr/tests/test_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
import numpy as np
import pytest

from zarr.codecs import Blosc, Delta, Zlib
from zarr.codecs import Blosc, Delta, Pickle, Zlib
from zarr.errors import MetadataError
from zarr.meta import (ZARR_FORMAT, decode_array_metadata, decode_dtype,
decode_group_metadata, encode_array_metadata,
encode_dtype)
encode_dtype, encode_fill_value, decode_fill_value)
from zarr.util import normalize_dtype, normalize_fill_value


def assert_json_equal(expect, actual):
Expand Down Expand Up @@ -435,3 +436,69 @@ def test_decode_group():
}''' % (ZARR_FORMAT - 1)
with pytest.raises(MetadataError):
decode_group_metadata(b)


@pytest.mark.parametrize(
    "fill_value,dtype,object_codec,result",
    [
        (
            (0.0, None),
            [('x', float), ('y', object)],
            Pickle(),
            True,  # Pass
        ),
        (
            (0.0, None),
            [('x', float), ('y', object)],
            None,
            False,  # Fail
        ),
    ],
)
def test_encode_fill_value(fill_value, dtype, object_codec, result):
    # Check encode_fill_value for a structured dtype with an object field.
    #
    # ``result`` says whether encoding is expected to succeed (True) or to
    # raise ValueError because no object codec was supplied (False).

    # normalize metadata (copied from _init_array_metadata)
    dtype, object_codec = normalize_dtype(dtype, object_codec)
    dtype = dtype.base
    fill_value = normalize_fill_value(fill_value, dtype)

    # test
    if result:
        encoded = encode_fill_value(fill_value, dtype, object_codec)
        # The encoded fill value is stored in the array metadata, so the
        # object-dtype branch must yield non-empty text (base64 of the codec
        # output) rather than raw bytes or the original tuple.
        assert isinstance(encoded, str)
        assert encoded
    else:
        with pytest.raises(ValueError):
            encode_fill_value(fill_value, dtype, object_codec)


@pytest.mark.parametrize(
    "fill_value,dtype,object_codec,result",
    [
        (
            (0.0, None),
            [('x', float), ('y', object)],
            Pickle(),
            True,  # Pass
        ),
        (
            (0.0, None),
            [('x', float), ('y', object)],
            None,
            False,  # Fail
        ),
    ],
)
def test_decode_fill_value(fill_value, dtype, object_codec, result):
    # Check decode_fill_value for a structured dtype with an object field.
    #
    # ``result`` says whether decoding is expected to succeed (True) or to
    # raise ValueError because no object codec was supplied (False).

    # normalize metadata (copied from _init_array_metadata)
    dtype, object_codec = normalize_dtype(dtype, object_codec)
    dtype = dtype.base
    fill_value = normalize_fill_value(fill_value, dtype)

    # test
    if result:
        encoded = encode_fill_value(fill_value, dtype, object_codec)
        decoded = decode_fill_value(encoded, dtype, object_codec)
        # An encode/decode round trip must give back the original field
        # values; merely not raising is not enough to show decoding works.
        assert tuple(decoded) == tuple(fill_value)
    else:
        with pytest.raises(ValueError):
            # No encoding is possible without a codec, so the raw fill value
            # is passed straight to the decoder.
            decode_fill_value(fill_value, dtype, object_codec)
3 changes: 1 addition & 2 deletions zarr/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,10 +253,9 @@ def normalize_dimension_separator(sep: Optional[str]) -> Optional[str]:

def normalize_fill_value(fill_value, dtype: np.dtype):

if fill_value is None:
if fill_value is None or dtype.hasobject:
# no fill value
pass

elif fill_value == 0:
# this should be compatible across numpy versions for any array type, including
# structured arrays
Expand Down