diff --git a/docs/release.rst b/docs/release.rst
index 87710717a3..7b199a1c6d 100644
--- a/docs/release.rst
+++ b/docs/release.rst
@@ -95,6 +95,13 @@ Enhancements
 * **Arrays with one or more zero-length dimensions** are now fully supported; by
   :user:`Prakhar Goel `, :issue:`150`, :issue:`154`, :issue:`160`.
 
+* **The .zattrs key is now optional** and will only be created when the first
+  custom attribute is set; :issue:`121`, :issue:`200`.
+
+* **New Group.move() method** supports moving a sub-group or array to a different
+  location within the same hierarchy. By :user:`John Kirkham `,
+  :issue:`191`, :issue:`193`, :issue:`196`.
+
 Bug fixes
 ~~~~~~~~~
 
@@ -120,13 +127,16 @@ Documentation
 ~~~~~~~~~~~~~
 
 * Some changes have been made to the :ref:`spec_v2` document to clarify
-  ambiguities and add some missing information. These changes do not modify any
-  of the material previously implemented, and so the changes have been made
+  ambiguities and add some missing information. These changes do not break compatibility
+  with any of the material as previously implemented, and so the changes have been made
   in-place in the document without incrementing the document version number. The
   specification now describes how bytes fill values should be encoded and
   decoded for arrays with a fixed-length byte string data type (:issue:`165`,
-  :issue:`176`). The specification now also clarifies that datetime64 and
-  timedelta64 data types are not supported in this version (:issue:`85`).
+  :issue:`176`). The specification now clarifies that datetime64 and
+  timedelta64 data types are not supported in this version (:issue:`85`). The
+  specification now clarifies that the '.zattrs' key does not have to be present for
+  either arrays or groups, and if absent then custom attributes should be treated as
+  empty.
 * A new :ref:`tutorial_indexing` section has been added to the tutorial.
 * A new :ref:`tutorial_strings` section has been added to the tutorial
   (:issue:`135`, :issue:`175`).
diff --git a/docs/spec/v2.rst b/docs/spec/v2.rst
index f34a8e3179..e1dcc31601 100644
--- a/docs/spec/v2.rst
+++ b/docs/spec/v2.rst
@@ -275,7 +275,8 @@ Attributes
 An array or group can be associated with custom attributes, which are simple
 key/value items with application-specific meaning. Custom attributes are
 encoded as a JSON object and stored under the ".zattrs" key within an array
-store.
+store. The ".zattrs" key does not have to be present, and if it is absent the
+attributes should be treated as empty.
 
 For example, the JSON object below encodes three attributes named "foo", "bar"
 and "baz"::
@@ -308,7 +309,7 @@ have been set in the store::
 
     >>> import os
     >>> sorted(os.listdir('data/example.zarr'))
-    ['.zarray', '.zattrs']
+    ['.zarray']
 
 Inspect the array metadata::
 
@@ -333,23 +334,18 @@ Inspect the array metadata::
         "zarr_format": 2
     }
 
-Inspect the array attributes::
-
-    >>> print(open('data/example.zarr/.zattrs').read())
-    {}
-
 Chunks are initialized on demand. E.g., set some data::
 
     >>> a[0:10, 0:10] = 1
     >>> sorted(os.listdir('data/example.zarr'))
-    ['.zarray', '.zattrs', '0.0']
+    ['.zarray', '0.0']
 
 Set some more data::
 
     >>> a[0:10, 10:20] = 2
     >>> a[10:20, :] = 3
    >>> sorted(os.listdir('data/example.zarr'))
-    ['.zarray', '.zattrs', '0.0', '0.1', '1.0', '1.1']
+    ['.zarray', '0.0', '0.1', '1.0', '1.1']
 
 Manually decompress a single chunk for illustration::
 
@@ -369,6 +365,8 @@ Modify the array attributes::
     >>> a.attrs['foo'] = 42
     >>> a.attrs['bar'] = 'apples'
     >>> a.attrs['baz'] = [1, 2, 3, 4]
+    >>> sorted(os.listdir('data/example.zarr'))
+    ['.zarray', '.zattrs', '0.0', '0.1', '1.0', '1.1']
     >>> print(open('data/example.zarr/.zattrs').read())
     {
         "bar": "apples",
@@ -398,12 +396,11 @@ Create the root group::
 
     >>> root_grp = zarr.group(store, overwrite=True)
 
-The metadata resource for the root group has been created, as well as a custom
-attributes resource::
+The metadata resource for the root group has been created::
 
     >>> import os
     >>> sorted(os.listdir('data/group.zarr'))
-    ['.zattrs', '.zgroup']
+    ['.zgroup']
 
 Inspect the group metadata::
 
@@ -412,11 +409,6 @@ Inspect the group metadata::
         "zarr_format": 2
     }
 
-Inspect the group attributes::
-
-    >>> print(open('data/group.zarr/.zattrs').read())
-    {}
-
 Create a sub-group::
 
     >>> sub_grp = root_grp.create_group('foo')
@@ -424,21 +416,25 @@ Create a sub-group::
 What has been stored::
 
     >>> sorted(os.listdir('data/group.zarr'))
-    ['.zattrs', '.zgroup', 'foo']
+    ['.zgroup', 'foo']
     >>> sorted(os.listdir('data/group.zarr/foo'))
-    ['.zattrs', '.zgroup']
+    ['.zgroup']
 
 Create an array within the sub-group::
 
     >>> a = sub_grp.create_dataset('bar', shape=(20, 20), chunks=(10, 10))
    >>> a[:] = 42
 
+Set a custom attribute::
+
+    >>> a.attrs['comment'] = 'answer to life, the universe and everything'
+
 What has been stored::
 
     >>> sorted(os.listdir('data/group.zarr'))
-    ['.zattrs', '.zgroup', 'foo']
+    ['.zgroup', 'foo']
     >>> sorted(os.listdir('data/group.zarr/foo'))
-    ['.zattrs', '.zgroup', 'bar']
+    ['.zgroup', 'bar']
     >>> sorted(os.listdir('data/group.zarr/foo/bar'))
     ['.zarray', '.zattrs', '0.0', '0.1', '1.0', '1.1']
 
@@ -449,6 +445,7 @@ Here is the same example using a Zip file as storage::
     >>> sub_grp = root_grp.create_group('foo')
     >>> a = sub_grp.create_dataset('bar', shape=(20, 20), chunks=(10, 10))
     >>> a[:] = 42
+    >>> a.attrs['comment'] = 'answer to life, the universe and everything'
     >>> store.close()
 
 What has been stored::
@@ -457,9 +454,7 @@ What has been stored::
     >>> import zipfile
     >>> zf = zipfile.ZipFile('data/group.zip', mode='r')
     >>> for name in sorted(zf.namelist()):
     ...     print(name)
-    .zattrs
     .zgroup
-    foo/.zattrs
     foo/.zgroup
     foo/bar/.zarray
     foo/bar/.zattrs
@@ -471,9 +466,29 @@ What has been stored::
 Changes
 -------
 
+Version 2 clarifications
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following changes have been made to the version 2 specification since it was
+initially published to clarify ambiguities and add some missing information.
+
+* The specification now describes how bytes fill values should be encoded and
+  decoded for arrays with a fixed-length byte string data type (:issue:`165`,
+  :issue:`176`).
+
+* The specification now clarifies that datetime64 and timedelta64 data types are not
+  supported in this version (:issue:`85`).
+
+* The specification now clarifies that the '.zattrs' key does not have to be present for
+  either arrays or groups, and if absent then custom attributes should be treated as
+  empty.
+
+
 Changes in version 2
 ~~~~~~~~~~~~~~~~~~~~
 
+The following changes were made between version 1 and version 2 of this specification:
+
 * Added support for storing multiple arrays in the same store and organising
   arrays into hierarchies using groups.
 * Array metadata is now stored under the ".zarray" key instead of the "meta"
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index dfb61bea29..a889ee7081 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -178,7 +178,7 @@ print some diagnostics, e.g.::
                         : blocksize=0)
     Store type         : builtins.dict
     No. bytes          : 400000000 (381.5M)
-    No. bytes stored   : 4565055 (4.4M)
+    No. bytes stored   : 4565053 (4.4M)
     Storage ratio      : 87.6
     Chunks initialized : 100/100
 
@@ -270,7 +270,7 @@ Here is an example using a delta filter with the Blosc compressor::
     Compressor         : Blosc(cname='zstd', clevel=1, shuffle=SHUFFLE, blocksize=0)
     Store type         : builtins.dict
     No. bytes          : 400000000 (381.5M)
-    No. bytes stored   : 648607 (633.4K)
+    No. bytes stored   : 648605 (633.4K)
     Storage ratio      : 616.7
     Chunks initialized : 100/100
 
@@ -394,7 +394,7 @@ property. E.g.::
     Compressor         : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
     Store type         : zarr.storage.DictStore
     No. bytes          : 8000000 (7.6M)
-    No. bytes stored   : 37482 (36.6K)
+    No. bytes stored   : 37480 (36.6K)
     Storage ratio      : 213.4
     Chunks initialized : 10/10
 
@@ -409,7 +409,7 @@ property. E.g.::
     Compressor         : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
     Store type         : zarr.storage.DictStore
     No. bytes          : 4000000 (3.8M)
-    No. bytes stored   : 23245 (22.7K)
+    No. bytes stored   : 23243 (22.7K)
     Storage ratio      : 172.1
     Chunks initialized : 100/100
 
@@ -898,7 +898,7 @@ ratios, depending on the correlation structure within the data. E.g.::
     Compressor         : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
     Store type         : builtins.dict
     No. bytes          : 400000000 (381.5M)
-    No. bytes stored   : 26805737 (25.6M)
+    No. bytes stored   : 26805735 (25.6M)
     Storage ratio      : 14.9
     Chunks initialized : 100/100
     >>> f = zarr.array(a, chunks=(1000, 1000), order='F')
@@ -912,7 +912,7 @@ ratios, depending on the correlation structure within the data. E.g.::
     Compressor         : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
     Store type         : builtins.dict
     No. bytes          : 400000000 (381.5M)
-    No. bytes stored   : 9633603 (9.2M)
+    No. bytes stored   : 9633601 (9.2M)
     Storage ratio      : 41.5
     Chunks initialized : 100/100
 
@@ -1099,3 +1099,7 @@ behaviour, set the value of the ``blosc.use_threads`` variable to ``True``
 (Blosc always uses multiple internal threads) or ``False`` (Blosc always runs
 in single-threaded contextual mode). To re-enable automatic switching, set
 ``blosc.use_threads`` to ``None``.
+
+Please note that if Zarr is being used within a multi-process program, Blosc may not
+be safe to use in multi-threaded mode and may cause the program to hang. If using Blosc
+in a multi-process program then it is recommended to set ``blosc.use_threads = False``.
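The guidance added at the end of the tutorial change above can be sketched as follows. This is an illustration rather than part of the patch: the store path, shape and chunking are invented, and it assumes the flag is reachable as ``numcodecs.blosc.use_threads`` (the tutorial refers to it simply as ``blosc.use_threads``; the exact import location depends on the Zarr/numcodecs versions in use)::

    import multiprocessing as mp

    import numpy as np
    import zarr
    import numcodecs.blosc  # assumption: the use_threads flag lives here


    def fill_row(i):
        # Force Blosc into single-threaded ("contextual") mode inside each
        # worker process, per the note added to the tutorial above.
        numcodecs.blosc.use_threads = False
        z = zarr.open_array('data/mp_example.zarr', mode='r+')
        # Each worker writes one row, i.e. one whole chunk, so writes do not
        # overlap between processes.
        z[i, :] = np.arange(z.shape[1])


    if __name__ == '__main__':
        zarr.open_array('data/mp_example.zarr', mode='w', shape=(4, 1000),
                        chunks=(1, 1000), dtype='i4')
        with mp.Pool(4) as pool:
            pool.map(fill_row, range(4))

Setting the flag inside each worker, rather than once in the parent process, also covers pools that use the ``spawn`` start method, where module-level state is not inherited.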
diff --git a/zarr/storage.py b/zarr/storage.py
index d7c3b50604..1c080ce07c 100644
--- a/zarr/storage.py
+++ b/zarr/storage.py
@@ -10,7 +10,6 @@
 from collections import MutableMapping
 import os
 import tempfile
-import json
 import zipfile
 import shutil
 import atexit
@@ -212,7 +211,7 @@ def init_array(store, shape, chunks=True, dtype=None, compressor='default',
         >>> store = dict()
         >>> init_array(store, shape=(10000, 10000), chunks=(1000, 1000))
         >>> sorted(store.keys())
-        ['.zarray', '.zattrs']
+        ['.zarray']
 
     Array metadata is stored as JSON::
 
@@ -240,17 +239,12 @@ def init_array(store, shape, chunks=True, dtype=None, compressor='default',
         "zarr_format": 2
     }
 
-    User-defined attributes are also stored as JSON, initially empty::
-
-        >>> print(store['.zattrs'].decode())
-        {}
-
     Initialize an array using a storage path::
 
         >>> store = dict()
         >>> init_array(store, shape=100000000, chunks=1000000, dtype='i1', path='foo')
        >>> sorted(store.keys())
-        ['.zattrs', '.zgroup', 'foo/.zarray', 'foo/.zattrs']
+        ['.zgroup', 'foo/.zarray']
         >>> print(store['foo/.zarray'].decode())
         {
             "chunks": [
@@ -276,8 +270,7 @@ def init_array(store, shape, chunks=True, dtype=None, compressor='default',
     Notes
     -----
     The initialisation process involves normalising all array metadata, encoding
-    as JSON and storing under the '.zarray' key. User attributes are also
-    initialized and stored as JSON under the '.zattrs' key.
+    as JSON and storing under the '.zarray' key.
 
     """
 
@@ -349,10 +342,6 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, compressor='defa
     key = _path_to_prefix(path) + array_meta_key
     store[key] = encode_array_metadata(meta)
 
-    # initialize attributes
-    key = _path_to_prefix(path) + attrs_key
-    store[key] = json.dumps(dict()).encode('ascii')
-
 
 # backwards compatibility
 init_store = init_array
@@ -408,10 +397,6 @@ def _init_group_metadata(store, overwrite=False, path=None, chunk_store=None):
     key = _path_to_prefix(path) + group_meta_key
     store[key] = encode_group_metadata(meta)
 
-    # initialize attributes
-    key = _path_to_prefix(path) + attrs_key
-    store[key] = json.dumps(dict()).encode('ascii')
-
 
 def ensure_bytes(s):
     if isinstance(s, binary_type):
@@ -654,7 +639,7 @@ class DirectoryStore(MutableMapping):
 
         >>> import os
         >>> sorted(os.listdir('data/array.zarr'))
-        ['.zarray', '.zattrs', '0.0', '0.1', '1.0', '1.1']
+        ['.zarray', '0.0', '0.1', '1.0', '1.1']
 
     Store a group::
 
@@ -668,11 +653,11 @@ class DirectoryStore(MutableMapping):
     directories on the file system, i.e.::
 
         >>> sorted(os.listdir('data/group.zarr'))
-        ['.zattrs', '.zgroup', 'foo']
+        ['.zgroup', 'foo']
         >>> sorted(os.listdir('data/group.zarr/foo'))
-        ['.zattrs', '.zgroup', 'bar']
+        ['.zgroup', 'bar']
         >>> sorted(os.listdir('data/group.zarr/foo/bar'))
-        ['.zarray', '.zattrs', '0.0', '0.1', '1.0', '1.1']
+        ['.zarray', '0.0', '0.1', '1.0', '1.1']
 
     Notes
     -----
@@ -909,7 +894,7 @@ class NestedDirectoryStore(DirectoryStore):
 
         >>> import os
         >>> sorted(os.listdir('data/array.zarr'))
-        ['.zarray', '.zattrs', '0', '1']
+        ['.zarray', '0', '1']
         >>> sorted(os.listdir('data/array.zarr/0'))
         ['0', '1']
         >>> sorted(os.listdir('data/array.zarr/1'))
@@ -927,11 +912,11 @@ class NestedDirectoryStore(DirectoryStore):
     directories on the file system, i.e.::
 
         >>> sorted(os.listdir('data/group.zarr'))
-        ['.zattrs', '.zgroup', 'foo']
+        ['.zgroup', 'foo']
         >>> sorted(os.listdir('data/group.zarr/foo'))
-        ['.zattrs', '.zgroup', 'bar']
+        ['.zgroup', 'bar']
         >>> sorted(os.listdir('data/group.zarr/foo/bar'))
-        ['.zarray', '.zattrs', '0', '1']
+        ['.zarray', '0', '1']
         >>> sorted(os.listdir('data/group.zarr/foo/bar/0'))
         ['0', '1']
         >>> sorted(os.listdir('data/group.zarr/foo/bar/1'))
@@ -1487,7 +1472,9 @@ def _lmdb_decode_key_bytes(key):
 
 
 class LMDBStore(MutableMapping):
-    """Storage class using LMDB.
+    """Storage class using LMDB. Requires the `lmdb `_
+    package to be installed.
+
 
     Parameters
     ----------
@@ -1499,10 +1486,6 @@ class LMDBStore(MutableMapping):
     **kwargs
         Keyword arguments passed through to the `lmdb.open` function.
 
-    Notes
-    -----
-    Requires the `lmdb `_ package to be installed.
-
     Examples
     --------
     Store a single array::
diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py
index c5a773c71a..f87f0cb629 100644
--- a/zarr/tests/test_hierarchy.py
+++ b/zarr/tests/test_hierarchy.py
@@ -18,7 +18,7 @@
 from zarr.storage import (DictStore, DirectoryStore, ZipStore, init_group,
                           init_array,
-                          attrs_key, array_meta_key, group_meta_key, atexit_rmtree,
+                          array_meta_key, group_meta_key, atexit_rmtree,
                           NestedDirectoryStore, DBMStore, LMDBStore)
 from zarr.core import Array
 from zarr.compat import PY2, text_type
@@ -66,6 +66,8 @@ def test_group_init_1(self):
         eq('/', g.name)
         eq('', g.basename)
         assert_is_instance(g.attrs, Attributes)
+        g.attrs['foo'] = 'bar'
+        assert g.attrs['foo'] == 'bar'
         assert_is_instance(g.info, InfoReporter)
         assert_is_instance(repr(g.info), str)
         assert_is_instance(g.info._repr_html_(), str)
@@ -940,8 +942,7 @@ def test_chunk_store(self):
         assert_array_equal(np.arange(100), a[:])
 
         # check store keys
-        expect = sorted([attrs_key, group_meta_key, 'foo/' + attrs_key,
-                         'foo/' + array_meta_key])
+        expect = sorted([group_meta_key, 'foo/' + array_meta_key])
         actual = sorted(store.keys())
         eq(expect, actual)
         expect = ['foo/' + str(i) for i in range(10)]
diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py
index 34892da750..75be6253c3 100644
--- a/zarr/tests/test_storage.py
+++ b/zarr/tests/test_storage.py
@@ -22,7 +22,7 @@
                          NestedDirectoryStore, default_compressor, DBMStore, LMDBStore)
 from zarr.meta import (decode_array_metadata, encode_array_metadata, ZARR_FORMAT,
                        decode_group_metadata, encode_group_metadata)
-from zarr.compat import text_type, PY2
+from zarr.compat import PY2
 from zarr.codecs import Zlib, Blosc, BZ2
 from zarr.errors import PermissionError
 from zarr.hierarchy import group
@@ -309,10 +309,6 @@ def test_init_array(self):
         eq(default_compressor.get_config(), meta['compressor'])
         assert_is_none(meta['fill_value'])
 
-        # check attributes
-        assert attrs_key in store
-        eq(dict(), json.loads(text_type(store[attrs_key], 'ascii')))
-
     def test_init_array_overwrite(self):
         # setup
         store = self.create_store()
@@ -360,11 +356,6 @@ def test_init_array_path(self):
         eq(default_compressor.get_config(), meta['compressor'])
         assert_is_none(meta['fill_value'])
 
-        # check attributes
-        key = path + '/' + attrs_key
-        assert key in store
-        eq(dict(), json.loads(text_type(store[key], 'ascii')))
-
     def test_init_array_overwrite_path(self):
         # setup
         path = 'foo/bar'
@@ -476,10 +467,6 @@ def test_init_group(self):
         meta = decode_group_metadata(store[group_meta_key])
         eq(ZARR_FORMAT, meta['zarr_format'])
 
-        # check attributes
-        assert attrs_key in store
-        eq(dict(), json.loads(text_type(store[attrs_key], 'ascii')))
-
     def test_init_group_overwrite(self):
         # setup
         store = self.create_store()
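Taken together, the changes above mean that neither ``init_array()`` nor ``init_group()`` writes a '.zattrs' key any more: a missing '.zattrs' reads as empty attributes, and the key only appears once the first custom attribute is set. A minimal sketch of the resulting behaviour, not part of the patch (the attribute name used is arbitrary)::

    import zarr

    store = dict()
    z = zarr.zeros((10, 10), chunks=(5, 5), store=store)

    print(sorted(store.keys()))  # ['.zarray'] -- no '.zattrs' is written up front
    print(dict(z.attrs))         # {} -- an absent '.zattrs' reads as empty attributes

    z.attrs['units'] = 'metres'  # the first custom attribute creates '.zattrs'
    print(sorted(store.keys()))  # ['.zarray', '.zattrs']

Code that consumes Zarr stores should therefore not assume '.zattrs' is present, which is exactly what the specification clarification above now spells out.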