diff --git a/.gitignore b/.gitignore index 6d0c005da0..f397966371 100644 --- a/.gitignore +++ b/.gitignore @@ -66,3 +66,5 @@ zarr/version.py *.zarr *~ *.zip +example* +doesnotexist diff --git a/docs/api.rst b/docs/api.rst index aff5ecf91c..8db174da69 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -6,6 +6,7 @@ API reference api/creation api/core + api/hierarchy api/storage api/compressors api/sync diff --git a/docs/api/creation.rst b/docs/api/creation.rst index 6bc24d8c89..66422c0670 100644 --- a/docs/api/creation.rst +++ b/docs/api/creation.rst @@ -7,7 +7,7 @@ Array creation (``zarr.creation``) .. autofunction:: ones .. autofunction:: full .. autofunction:: array -.. autofunction:: open +.. autofunction:: open_array .. autofunction:: empty_like .. autofunction:: zeros_like .. autofunction:: ones_like diff --git a/docs/api/hierarchy.rst b/docs/api/hierarchy.rst new file mode 100644 index 0000000000..73db5bbc34 --- /dev/null +++ b/docs/api/hierarchy.rst @@ -0,0 +1,33 @@ +Groups (``zarr.hierarchy``) +=========================== +.. module:: zarr.hierarchy + +.. autofunction:: group +.. autofunction:: open_group + +.. autoclass:: Group + + .. automethod:: __len__ + .. automethod:: __iter__ + .. automethod:: __contains__ + .. automethod:: __getitem__ + .. automethod:: group_keys + .. automethod:: groups + .. automethod:: array_keys + .. automethod:: arrays + .. automethod:: create_group + .. automethod:: require_group + .. automethod:: create_groups + .. automethod:: require_groups + .. automethod:: create_dataset + .. automethod:: require_dataset + .. automethod:: create + .. automethod:: empty + .. automethod:: zeros + .. automethod:: ones + .. automethod:: full + .. automethod:: array + .. automethod:: empty_like + .. automethod:: zeros_like + .. automethod:: ones_like + .. automethod:: full_like diff --git a/docs/api/storage.rst b/docs/api/storage.rst index b382a2408f..7d2594b335 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -2,11 +2,13 @@ Storage (``zarr.storage``) ========================== .. module:: zarr.storage -This module contains a single :class:`DirectoryStore` class providing -a ``MutableMapping`` interface to a directory on the file -system. However, note that any object implementing the -``MutableMapping`` interface can be used as a Zarr array store. +This module contains storage classes for use with Zarr arrays and groups. +However, note that any object implementing the ``MutableMapping`` interface +can be used as a Zarr array store. -.. autofunction:: init_store +.. autofunction:: init_array +.. autofunction:: init_group +.. autoclass:: DictStore .. autoclass:: DirectoryStore +.. autoclass:: ZipStore diff --git a/docs/api/sync.rst b/docs/api/sync.rst index faef87c90b..a139805e78 100644 --- a/docs/api/sync.rst +++ b/docs/api/sync.rst @@ -4,4 +4,3 @@ Synchronization (``zarr.sync``) .. autoclass:: ThreadSynchronizer .. autoclass:: ProcessSynchronizer -.. autoclass:: SynchronizedArray diff --git a/docs/index.rst b/docs/index.rst index 47fb71a772..9298deac5c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,6 +16,7 @@ Highlights * Store arrays in memory, on disk, inside a Zip file, on S3, ... * Read an array concurrently from multiple threads or processes. * Write to an array concurrently from multiple threads or processes. +* Organize arrays into hierarchies via groups. 
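+ +For example, the new group support allows arrays to be organized into nested groups and accessed by path (a minimal sketch; see the tutorial section on groups for details):: + + >>> import zarr + >>> root = zarr.group() + >>> foo = root.create_group('foo') + >>> bar = foo.create_dataset('bar', shape=(10000, 10000), + ... chunks=(1000, 1000)) + >>> a = root['foo/bar'] # members are accessed with item notation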
Status ------ diff --git a/docs/release.rst b/docs/release.rst index 7e0941097f..e8432c7e12 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -1,6 +1,21 @@ Release notes ============= +.. _release_2.0.0: + +2.0.0 +----- + +Hierarchies +~~~~~~~~~~~ + +Support has been added for organizing arrays into hierarchies via groups. See +the tutorial section on :ref:`tutorial_groups` and the :mod:`zarr.hierarchy` +API docs for more information. + +To accommodate support for hierarchies, the Zarr format has been modified. See +the :ref:`spec_v2` for more information. + .. _release_1.1.0: 1.1.0 diff --git a/docs/spec.rst b/docs/spec.rst index 4f60e4d467..765dcd782a 100644 --- a/docs/spec.rst +++ b/docs/spec.rst @@ -1,3 +1,5 @@ +.. _spec: + Specifications ============== @@ -5,3 +7,4 @@ Specifications :maxdepth: 3 spec/v1 + spec/v2 diff --git a/docs/spec/v1.rst b/docs/spec/v1.rst index 173797bc18..dd13fb7d39 100644 --- a/docs/spec/v1.rst +++ b/docs/spec/v1.rst @@ -10,6 +10,11 @@ NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", interpreted as described in `RFC 2119 <https://www.ietf.org/rfc/rfc2119.txt>`_. +Status +------ + +This specification is deprecated. See :ref:`spec` for the latest version. + Storage ------- diff --git a/docs/spec/v2.rst b/docs/spec/v2.rst new file mode 100644 index 0000000000..66202928e2 --- /dev/null +++ b/docs/spec/v2.rst @@ -0,0 +1,457 @@ +.. _spec_v2: + +Zarr storage specification version 2 +==================================== + +This document provides a technical specification of the protocol and format +used for storing a Zarr array. The key words "MUST", "MUST NOT", "REQUIRED", +"SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and +"OPTIONAL" in this document are to be interpreted as described in `RFC 2119 +<https://www.ietf.org/rfc/rfc2119.txt>`_. + +Status +------ + +This specification is the latest version. See :ref:`spec` for previous +versions. + +Storage +------- + +A Zarr array can be stored in any storage system that provides a key/value +interface, where a key is an ASCII string and a value is an arbitrary sequence +of bytes, and the supported operations are read (get the sequence of bytes +associated with a given key), write (set the sequence of bytes associated with +a given key) and delete (remove a key/value pair). + +For example, a directory in a file system can provide this interface, where +keys are file names, values are file contents, and files can be read, written +or deleted via the operating system. Equally, an S3 bucket can provide this +interface, where keys are resource names, values are resource contents, and +resources can be read, written or deleted via HTTP. + +Below, an "array store" refers to any system implementing this interface. + +Arrays +------ + +Metadata +~~~~~~~~ + +Each array requires essential configuration metadata to be stored, enabling +correct interpretation of the stored data. This metadata is encoded using JSON +and stored as the value of the ".zarray" key within an array store. + +The metadata resource is a JSON object. The following keys MUST be present +within the object: + +zarr_format + An integer defining the version of the storage specification to which the + array store adheres. +shape + A list of integers defining the length of each dimension of the array. +chunks + A list of integers defining the length of each dimension of a chunk of the + array. Note that all chunks within a Zarr array have the same shape. +dtype + A string or list defining a valid data type for the array. See also + the subsection below on data type encoding.
+compression + A string identifying the primary compression library used to compress + each chunk of the array. +compression_opts + An integer, string or dictionary providing options to the primary + compression library. +fill_value + A scalar value providing the default value to use for uninitialized + portions of the array. +order + Either "C" or "F", defining the layout of bytes within each chunk of the + array. "C" means row-major order, i.e., the last dimension varies fastest; + "F" means column-major order, i.e., the first dimension varies fastest. + +Other keys MUST NOT be present within the metadata object. + +For example, the JSON object below defines a 2-dimensional array of 64-bit +little-endian floating point numbers with 10000 rows and 10000 columns, divided +into chunks of 1000 rows and 1000 columns (so there will be 100 chunks in total +arranged in a 10 by 10 grid). Within each chunk the data are laid out in C +contiguous order, and each chunk is compressed using the Blosc compression +library:: + + { + "chunks": [ + 1000, + 1000 + ], + "compression": "blosc", + "compression_opts": { + "clevel": 5, + "cname": "lz4", + "shuffle": 1 + }, + "dtype": "<f8", + "fill_value": null, + "order": "C", + "shape": [ + 10000, + 10000 + ], + "zarr_format": 2 + } + +Data type encoding +~~~~~~~~~~~~~~~~~~ + +Simple data types are encoded within the array metadata as a string, following +the `NumPy array protocol type string (typestr) format +<https://numpy.org/doc/stable/reference/arrays.interface.html>`_. The format +consists of 3 parts: + +* One character describing the byteorder of the data (``"<"``: little-endian; + ``">"``: big-endian; ``"|"``: not-relevant) +* One character code giving the basic type of the array (``"b"``: Boolean (integer + type where all values are only True or False); ``"i"``: integer; ``"u"``: unsigned + integer; ``"f"``: floating point; ``"c"``: complex floating point; ``"m"``: timedelta; + ``"M"``: datetime; ``"S"``: string (fixed-length sequence of char); ``"U"``: unicode + (fixed-length sequence of Py_UNICODE); ``"V"``: other (void * – each item is a + fixed-size chunk of memory)) +* An integer specifying the number of bytes the type uses. + +The byte order MUST be specified. E.g., ``"<i4"``, ``"|b1"`` and +``"|S12"`` are valid data type encodings. + +Structured data types (i.e., with multiple named fields) are encoded as a list +of two-element lists, following `NumPy array protocol type descriptions (descr) +<https://numpy.org/doc/stable/reference/arrays.interface.html>`_. For +example, the JSON list ``[["r", "|u1"], ["g", "|u1"], ["b", "|u1"]]`` defines a +data type composed of three single-byte unsigned integers labelled "r", "g" and +"b". + +Fill value encoding +~~~~~~~~~~~~~~~~~~~ + +For simple floating point data types, the following table MUST be used to +encode values of the "fill_value" field: + +================= =============== +Value JSON encoding +================= =============== +Not a Number ``"NaN"`` +Positive Infinity ``"Infinity"`` +Negative Infinity ``"-Infinity"`` +================= =============== + + +Chunks +~~~~~~ + +Each chunk of the array is compressed by passing the raw bytes for the chunk +through the primary compression library to obtain a new sequence of bytes +comprising the compressed chunk data. No header is added to the compressed +bytes or any other modification made. The internal structure of the compressed +bytes will depend on which primary compressor was used. For example, the `Blosc +compressor <http://www.blosc.org/>`_ +produces a sequence of bytes that begins with a 16-byte header followed by +compressed data. + +The compressed sequence of bytes for each chunk is stored under a key formed +from the index of the chunk within the grid of chunks representing the array. +To form a string key for a chunk, the indices are converted to strings and +concatenated with the period character (".") separating each index.
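+ +Expressed as code, the key construction rule amounts to the following +illustrative (non-normative) sketch, where the function name is hypothetical:: + + >>> def chunk_key(cidx): + ... # join the chunk grid indices with '.', per the rule above + ... return '.'.join(str(i) for i in cidx) + >>> chunk_key((2, 4)) + '2.4'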
For +example, given an array with shape (10000, 10000) and chunk shape (1000, 1000) +there will be 100 chunks laid out in a 10 by 10 grid. The chunk with indices +(0, 0) provides data for rows 0-1000 and columns 0-1000 and is stored under the +key "0.0"; the chunk with indices (2, 4) provides data for rows 2000-3000 and +columns 4000-5000 and is stored under the key "2.4"; etc. + +There is no need for all chunks to be present within an array store. If a chunk +is not present then it is considered to be in an uninitialized state. An +uninitialized chunk MUST be treated as if it were uniformly filled with the value +of the "fill_value" field in the array metadata. If the "fill_value" field is +``null`` then the contents of the chunk are undefined. + +Note that all chunks in an array have the same shape. If the length of any +array dimension is not exactly divisible by the length of the corresponding +chunk dimension then some chunks will overhang the edge of the array. The +contents of any chunk region falling outside the array are undefined. + +Hierarchies +----------- + +Logical storage paths +~~~~~~~~~~~~~~~~~~~~~ + +Multiple arrays can be stored in the same array store by associating each array +with a different logical path. A logical path is simply an ASCII string. The +logical path is used to form a prefix for keys used by the array. For example, +if an array is stored at logical path "foo/bar" then the array metadata will be +stored under the key "foo/bar/.zarray", the user-defined attributes will be +stored under the key "foo/bar/.zattrs", and the chunks will be stored under +keys like "foo/bar/0.0", "foo/bar/0.1", etc. + +To ensure consistent behaviour across different storage systems, logical paths +MUST be normalized as follows: + +* Replace all backward slash characters ("\\") with forward slash characters + ("/") +* Strip any leading "/" characters +* Strip any trailing "/" characters +* Collapse any sequence of more than one "/" character into a single "/" + character + +The key prefix is then obtained by appending a single "/" character to the +normalized logical path. + +After normalization, if splitting a logical path by the "/" character results +in any path segment equal to the string "." or the string ".." then an error +MUST be raised. + +N.B., how the underlying array store processes requests to store values under +keys containing the "/" character is entirely up to the store implementation +and is not constrained by this specification. E.g., an array store could simply +treat all keys as opaque ASCII strings; equally, an array store could map +logical paths onto some kind of hierarchical storage (e.g., directories on a +file system). + +Groups +~~~~~~ + +Arrays can be organized into groups which can also contain other groups. A +group is created by storing group metadata under the ".zgroup" key under some +logical path. E.g., a group exists at the root of an array store if the +".zgroup" key exists in the store, and a group exists at logical path "foo/bar" +if the "foo/bar/.zgroup" key exists in the store. + +If the user requests a group to be created under some logical path, then groups +MUST also be created at all ancestor paths. E.g., if the user requests group +creation at path "foo/bar" then groups MUST be created at path "foo" and the +root of the store, if they don't already exist. + +If the user requests an array to be created under some logical path, then +groups MUST also be created at all ancestor paths.
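+ +The required ancestor paths can be derived mechanically from a normalized +logical path. The following is an illustrative (non-normative) sketch, where +the function name is hypothetical and the empty string denotes the root of the +store:: + + >>> def ancestor_paths(path): + ... # e.g. 'foo/bar/baz' -> ['', 'foo', 'foo/bar'] + ... segments = path.split('/') + ... return ['/'.join(segments[:i]) for i in range(len(segments))] + >>> ancestor_paths('foo/bar/baz') + ['', 'foo', 'foo/bar']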
E.g., if the user requests +array creation at path "foo/bar/baz" then groups MUST be created at path +"foo/bar", path "foo", and the root of the store, if they don't already exist. + +The group metadata resource is a JSON object. The following keys MUST be present +within the object: + +zarr_format + An integer defining the version of the storage specification to which the + array store adheres. + +Other keys MUST NOT be present within the metadata object. + +The members of a group are arrays and groups stored under logical paths that +are direct children of the parent group's logical path. E.g., if groups exist +under the logical paths "foo" and "foo/bar" and an array exists at logical path +"foo/baz" then the members of the group at path "foo" are the group at path +"foo/bar" and the array at path "foo/baz". + +Attributes +---------- + +An array or group can be associated with custom attributes, which are simple +key/value items with application-specific meaning. Custom attributes are +encoded as a JSON object and stored under the ".zattrs" key within an array +store. + +For example, the JSON object below encodes three attributes named +"foo", "bar" and "baz":: + + { + "foo": 42, + "bar": "apples", + "baz": [1, 2, 3, 4] + } + +Examples +-------- + +Storing a single array +~~~~~~~~~~~~~~~~~~~~~~ + +Below is an example of storing a Zarr array, using a directory on the +local file system as storage. + +Create an array:: + + >>> import zarr + >>> store = zarr.DirectoryStore('example') + >>> a = zarr.create(shape=(20, 20), chunks=(10, 10), dtype='i4', + ... fill_value=42, compression='zlib', compression_opts=1, + ... store=store, overwrite=True) + +No chunks are initialized yet, so only the ".zarray" and ".zattrs" keys +have been set in the store:: + + >>> import os + >>> sorted(os.listdir('example')) + ['.zarray', '.zattrs'] + +Inspect the array metadata:: + + >>> print(open('example/.zarray').read()) + { + "chunks": [ + 10, + 10 + ], + "compression": "zlib", + "compression_opts": 1, + "dtype": "<i4", + "fill_value": 42, + "order": "C", + "shape": [ + 20, + 20 + ], + "zarr_format": 2 + } + +Inspect the array attributes:: + + >>> print(open('example/.zattrs').read()) + {} + +Chunks are initialized on demand. E.g., set some data:: + + >>> a[0:10, 0:10] = 1 + >>> sorted(os.listdir('example')) + ['.zarray', '.zattrs', '0.0'] + +Set some more data:: + + >>> a[0:10, 10:20] = 2 + >>> a[10:20, :] = 3 + >>> sorted(os.listdir('example')) + ['.zarray', '.zattrs', '0.0', '0.1', '1.0', '1.1'] + +Manually decompress a single chunk for illustration:: + + >>> import zlib + >>> buf = zlib.decompress(open('example/0.0', 'rb').read()) + >>> import numpy as np + >>> chunk = np.frombuffer(buf, dtype='<i4') + >>> chunk + array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) + +Modify the array attributes:: + + >>> a.attrs['foo'] = 42 + >>> a.attrs['bar'] = 'apples' + >>> a.attrs['baz'] = [1, 2, 3, 4] + >>> print(open('example/.zattrs').read()) + { + "bar": "apples", + "baz": [ + 1, + 2, + 3, + 4 + ], + "foo": 42 + } + +Storing multiple arrays in a hierarchy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Below is an example of storing multiple Zarr arrays organized into a group +hierarchy, using a directory on the local file system as storage.
This storage +implementation maps logical paths onto directory paths on the file system; +however, this is an implementation choice and is not required. + +Set up the store:: + + >>> import zarr + >>> store = zarr.DirectoryStore('example_hierarchy') + +Create the root group:: + + >>> root_grp = zarr.group(store, overwrite=True) + +The metadata resource for the root group has been created, as well as a custom +attributes resource:: + + >>> import os + >>> sorted(os.listdir('example_hierarchy')) + ['.zattrs', '.zgroup'] + +Inspect the group metadata:: + + >>> print(open('example_hierarchy/.zgroup').read()) + { + "zarr_format": 2 + } + +Inspect the group attributes:: + + >>> print(open('example_hierarchy/.zattrs').read()) + {} + +Create a sub-group:: + + >>> sub_grp = root_grp.create_group('foo') + +What has been stored:: + + >>> sorted(os.listdir('example_hierarchy')) + ['.zattrs', '.zgroup', 'foo'] + >>> sorted(os.listdir('example_hierarchy/foo')) + ['.zattrs', '.zgroup'] + +Create an array within the sub-group:: + + >>> a = sub_grp.create_dataset('bar', shape=(20, 20), chunks=(10, 10)) + >>> a[:] = 42 + +What has been stored:: + + >>> sorted(os.listdir('example_hierarchy')) + ['.zattrs', '.zgroup', 'foo'] + >>> sorted(os.listdir('example_hierarchy/foo')) + ['.zattrs', '.zgroup', 'bar'] + >>> sorted(os.listdir('example_hierarchy/foo/bar')) + ['.zarray', '.zattrs', '0.0', '0.1', '1.0', '1.1'] + +Here is the same example using a Zip file as storage:: + + >>> store = zarr.ZipStore('example_hierarchy.zip', mode='w') + >>> root_grp = zarr.group(store) + >>> sub_grp = root_grp.create_group('foo') + >>> a = sub_grp.create_dataset('bar', shape=(20, 20), chunks=(10, 10)) + >>> a[:] = 42 + +What has been stored:: + + >>> import zipfile + >>> zf = zipfile.ZipFile('example_hierarchy.zip', mode='r') + >>> for name in sorted(zf.namelist()): + ... print(name) + .zattrs + .zgroup + foo/.zattrs + foo/.zgroup + foo/bar/.zarray + foo/bar/.zattrs + foo/bar/0.0 + foo/bar/0.1 + foo/bar/1.0 + foo/bar/1.1 + +Changes +------- + +Changes in version 2 +~~~~~~~~~~~~~~~~~~~~ + +* Added support for storing multiple arrays in the same store and organizing + arrays into hierarchies using groups. +* Array metadata is now stored under the ".zarray" key instead of the "meta" + key. +* Custom attributes are now stored under the ".zattrs" key instead of the + "attrs" key. diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 49c299bf97..7be0bc8093 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -178,7 +178,7 @@ and can be configured in a variety of ways to improve the compression ratio for different types of data. Blosc is in fact a "meta-compressor", which means that it can use a number of different compression algorithms internally to compress the data. Blosc also -provides highly optimised implementations of byte and bit shuffle +provides highly optimized implementations of byte and bit shuffle filters, which can significantly improve compression ratios for some data. @@ -276,10 +276,11 @@ array with thread synchronization:: >>> z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4', ...
synchronizer=zarr.ThreadSynchronizer()) >>> z - zarr.sync.SynchronizedArray((10000, 10000), int32, chunks=(1000, 1000), order=C) + zarr.core.Array((10000, 10000), int32, chunks=(1000, 1000), order=C) compression: blosc; compression_opts: {'clevel': 5, 'cname': 'lz4', 'shuffle': 1} nbytes: 381.5M; nbytes_stored: 313; ratio: 1277955.3; initialized: 0/100 - store: builtins.dict; synchronizer: zarr.sync.ThreadSynchronizer + store: builtins.dict + synchronizer: zarr.sync.ThreadSynchronizer This array is safe to read or write within a multi-threaded program. @@ -291,10 +292,11 @@ provided that all processes have access to a shared file system. E.g.:: ... chunks=(1000, 1000), dtype='i4', ... synchronizer=synchronizer) >>> z - zarr.sync.SynchronizedArray((10000, 10000), int32, chunks=(1000, 1000), order=C) + zarr.core.Array((10000, 10000), int32, chunks=(1000, 1000), order=C) compression: blosc; compression_opts: {'clevel': 5, 'cname': 'lz4', 'shuffle': 1} nbytes: 381.5M; nbytes_stored: 313; ratio: 1277955.3; initialized: 0/100 - store: zarr.storage.DirectoryStore; synchronizer: zarr.sync.ProcessSynchronizer + store: zarr.storage.DirectoryStore + synchronizer: zarr.sync.ProcessSynchronizer This array is safe to read or write from multiple processes. @@ -321,6 +323,80 @@ for associating an array with application-specific metadata. For example:: Internally Zarr uses JSON to store array attributes, so attribute values must be JSON serializable. +.. _tutorial_groups: + +Groups +------ + +Zarr supports hierarchical organization of arrays via groups. As with arrays, +groups can be stored in memory, on disk, or via other storage systems that +support a similar interface. + +To create a group, use the :func:`zarr.hierarchy.group` function:: + + >>> root_group = zarr.group() + >>> root_group + zarr.hierarchy.Group(/, 0) + store: zarr.storage.DictStore + +Groups have a similar API to the Group class from `h5py <http://www.h5py.org/>`_. +For example, groups can contain other groups:: + + >>> foo_group = root_group.create_group('foo') + >>> bar_group = foo_group.create_group('bar') + +Groups can also contain arrays, also known as "datasets" in HDF5 terminology. +For compatibility with h5py, Zarr groups implement the +:func:`zarr.hierarchy.Group.create_dataset` method, e.g.:: + + >>> z = bar_group.create_dataset('baz', shape=(10000, 10000), + ... chunks=(1000, 1000), dtype='i4', + ...
fill_value=0) + >>> z + zarr.core.Array(/foo/bar/baz, (10000, 10000), int32, chunks=(1000, 1000), order=C) + compression: blosc; compression_opts: {'clevel': 5, 'cname': 'lz4', 'shuffle': 1} + nbytes: 381.5M; nbytes_stored: 313; ratio: 1277955.3; initialized: 0/100 + store: zarr.storage.DictStore + +Members of a group can be accessed using square-bracket (item access) +notation, e.g.:: + + >>> root_group['foo'] + zarr.hierarchy.Group(/foo, 1) + groups: 1; bar + store: zarr.storage.DictStore + +The '/' character can be used to access multiple levels of the hierarchy, +e.g.:: + + >>> root_group['foo/bar'] + zarr.hierarchy.Group(/foo/bar, 1) + arrays: 1; baz + store: zarr.storage.DictStore + >>> root_group['foo/bar/baz'] + zarr.core.Array(/foo/bar/baz, (10000, 10000), int32, chunks=(1000, 1000), order=C) + compression: blosc; compression_opts: {'clevel': 5, 'cname': 'lz4', 'shuffle': 1} + nbytes: 381.5M; nbytes_stored: 313; ratio: 1277955.3; initialized: 0/100 + store: zarr.storage.DictStore + +The :func:`zarr.hierarchy.open_group` function provides a convenient way to create or +re-open a group stored in a directory on the file system, with sub-groups +stored in sub-directories, e.g.:: + + >>> persistent_group = zarr.open_group('example', mode='w') + >>> persistent_group + zarr.hierarchy.Group(/, 0) + store: zarr.storage.DirectoryStore + >>> z = persistent_group.create_dataset('foo/bar/baz', shape=(10000, 10000), + ... chunks=(1000, 1000), dtype='i4', + ... fill_value=0) + >>> z + zarr.core.Array(/foo/bar/baz, (10000, 10000), int32, chunks=(1000, 1000), order=C) + compression: blosc; compression_opts: {'clevel': 5, 'cname': 'lz4', 'shuffle': 1} + nbytes: 381.5M; nbytes_stored: 313; ratio: 1277955.3; initialized: 0/100 + store: zarr.storage.DirectoryStore + +For more information on groups see the :mod:`zarr.hierarchy` API docs. + .. _tutorial_tips: Tips and tricks @@ -380,33 +456,35 @@ Storage alternatives Zarr can use any object that implements the ``MutableMapping`` interface as the store for an array. -Here is an example storing an array directly into a Zip file via the -`zict <http://zict.readthedocs.io/>`_ package:: +Here is an example storing an array directly into a Zip file:: - >>> import zict - >>> import os - >>> store = zict.Zip('example.zip', mode='w') + >>> store = zarr.ZipStore('example.zip', mode='w') >>> z = zarr.zeros((1000, 1000), chunks=(100, 100), dtype='i4', ...
compression='zlib', compression_opts=1, store=store) >>> z zarr.core.Array((1000, 1000), int32, chunks=(100, 100), order=C) compression: zlib; compression_opts: 1 - nbytes: 3.8M; initialized: 0/100 - store: zict.zip.Zip + nbytes: 3.8M; nbytes_stored: 236; ratio: 16949.2; initialized: 0/100 + store: zarr.storage.ZipStore >>> z[:] = 42 - >>> store.flush() # only required for zict.Zip + >>> z + zarr.core.Array((1000, 1000), int32, chunks=(100, 100), order=C) + compression: zlib; compression_opts: 1 + nbytes: 3.8M; nbytes_stored: 21.9K; ratio: 178.3; initialized: 100/100 + store: zarr.storage.ZipStore + >>> import os >>> os.path.getsize('example.zip') - 30828 + 30838 Re-open and check that data have been written:: - >>> store = zict.Zip('example.zip', mode='r') + >>> store = zarr.ZipStore('example.zip', mode='r') >>> z = zarr.Array(store) >>> z zarr.core.Array((1000, 1000), int32, chunks=(100, 100), order=C) compression: zlib; compression_opts: 1 - nbytes: 3.8M; initialized: 100/100 - store: zict.zip.Zip + nbytes: 3.8M; nbytes_stored: 21.9K; ratio: 178.3; initialized: 100/100 + store: zarr.storage.ZipStore >>> z[:] array([[42, 42, 42, ..., 42, 42, 42], [42, 42, 42, ..., 42, 42, 42], diff --git a/tox.ini b/tox.ini index f83adba099..6a6f3986e7 100644 --- a/tox.ini +++ b/tox.ini @@ -13,7 +13,7 @@ commands = python setup.py build_ext --inplace py27: nosetests -v zarr py34,py35: nosetests -v --with-coverage --cover-erase --cover-min-percentage=100 --cover-package=zarr --with-doctest --doctest-options=+NORMALIZE_WHITESPACE zarr - py34,py35: python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst docs/spec/v1.rst + py34,py35: python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst docs/spec/v2.rst py35: flake8 zarr python setup.py bdist_wheel deps = diff --git a/zarr/__init__.py b/zarr/__init__.py index 47bca94dbe..32f4fab1d0 100644 --- a/zarr/__init__.py +++ b/zarr/__init__.py @@ -5,12 +5,13 @@ import atexit -from zarr.creation import create, array, empty, zeros, ones, full, open, \ - empty_like, zeros_like, ones_like, full_like, open_like -from zarr.storage import init_store, DirectoryStore from zarr.core import Array -from zarr.sync import ThreadSynchronizer, ProcessSynchronizer, \ - SynchronizedArray +from zarr.creation import create, array, empty, zeros, ones, full, open, \ + empty_like, zeros_like, ones_like, full_like, open_like, open_array +from zarr.storage import DictStore, DirectoryStore, ZipStore, init_array, \ + init_group, init_store +from zarr.hierarchy import group, open_group, Group +from zarr.sync import ThreadSynchronizer, ProcessSynchronizer from zarr.version import version as __version__ diff --git a/zarr/attrs.py b/zarr/attrs.py index 5fe5de50dc..7ad21bafc8 100644 --- a/zarr/attrs.py +++ b/zarr/attrs.py @@ -10,12 +10,12 @@ class Attributes(MutableMapping): - def __init__(self, store, key='attrs', readonly=False): - if key not in store: - store[key] = json.dumps(dict()).encode('ascii') + def __init__(self, store, key='.zattrs', read_only=False, + synchronizer=None): self.store = store self.key = key - self.readonly = readonly + self.read_only = read_only + self.synchronizer = synchronizer def __contains__(self, x): return x in self.asdict() @@ -23,35 +23,41 @@ def __contains__(self, x): def __getitem__(self, item): return self.asdict()[item] - def put(self, d): - - # guard conditions - if self.readonly: - raise ReadOnlyError('attributes are read-only') - + def _put(self, d): s = json.dumps(d, indent=4, sort_keys=True, ensure_ascii=True) 
self.store[self.key] = s.encode('ascii') - def __setitem__(self, key, value): + def _write_op(self, f, *args, **kwargs): - # guard conditions - if self.readonly: + # guard condition + if self.read_only: raise ReadOnlyError('attributes are read-only') + # synchronization + if self.synchronizer is None: + return f(*args, **kwargs) + else: + with self.synchronizer[self.key]: + return f(*args, **kwargs) + + def __setitem__(self, item, value): + self._write_op(self._setitem_nosync, item, value) + + def _setitem_nosync(self, item, value): + # load existing data d = self.asdict() # set key value - d[key] = value + d[item] = value - # put modified data - self.put(d) + # _put modified data + self._put(d) - def __delitem__(self, key): + def __delitem__(self, item): + self._write_op(self._delitem_nosync, item) - # guard conditions - if self.readonly: - raise ReadOnlyError('mapping is read-only') + def _delitem_nosync(self, key): # load existing data d = self.asdict() @@ -59,18 +65,20 @@ def __delitem__(self, key): # delete key value del d[key] - # put modified data - self.put(d) + # _put modified data + self._put(d) def asdict(self): - return json.loads(text_type(self.store[self.key], 'ascii')) + if self.key in self.store: + return json.loads(text_type(self.store[self.key], 'ascii')) + else: + return dict() def update(self, *args, **kwargs): # override to provide update in a single write + self._write_op(self._update_nosync, *args, **kwargs) - # guard conditions - if self.readonly: - raise ReadOnlyError('mapping is read-only') + def _update_nosync(self, *args, **kwargs): # load existing data d = self.asdict() @@ -78,20 +86,11 @@ def update(self, *args, **kwargs): # update d.update(*args, **kwargs) - # put modified data - self.put(d) + # _put modified data + self._put(d) def __iter__(self): return iter(self.asdict()) def __len__(self): return len(self.asdict()) - - def keys(self): - return self.asdict().keys() - - def values(self): - return self.asdict().values() - - def items(self): - return self.asdict().items() diff --git a/zarr/blosc.c b/zarr/blosc.c index 8074e31e6a..b0e4330716 100644 --- a/zarr/blosc.c +++ b/zarr/blosc.c @@ -1,4 +1,4 @@ -/* Generated by Cython 0.24 */ +/* Generated by Cython 0.24.1 */ /* BEGIN: Cython Metadata { @@ -42,7 +42,8 @@ "c-blosc/internal-complibs/zstd-0.7.4/common", "c-blosc/internal-complibs/zstd-0.7.4/decompress" ] - } + }, + "module_name": "zarr.blosc" } END: Cython Metadata */ @@ -53,7 +54,7 @@ END: Cython Metadata */ #elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03020000) #error Cython requires Python 2.6+ or Python 3.2+. 
#else -#define CYTHON_ABI "0_24" +#define CYTHON_ABI "0_24_1" #include #ifndef offsetof #define offsetof(type, member) ( (size_t) & ((type*)0) -> member ) @@ -156,6 +157,9 @@ END: Cython Metadata */ #if CYTHON_COMPILING_IN_PYPY && !defined(PyUnicode_Contains) #define PyUnicode_Contains(u, s) PySequence_Contains(u, s) #endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyByteArray_Check) + #define PyByteArray_Check(obj) PyObject_TypeCheck(obj, &PyByteArray_Type) +#endif #if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Format) #define PyObject_Format(obj, fmt) PyObject_CallMethod(obj, "__format__", "O", fmt) #endif @@ -281,6 +285,11 @@ static CYTHON_INLINE float __PYX_NAN() { return value; } #endif +#if defined(__CYGWIN__) && defined(_LDBL_EQ_DBL) +#define __Pyx_truncl trunc +#else +#define __Pyx_truncl truncl +#endif #define __PYX_ERR(f_index, lineno, Ln_error) \ @@ -526,10 +535,10 @@ static const char *__pyx_filename; static const char *__pyx_f[] = { "zarr/blosc.pyx", - "array.pxd", - "type.pxd", - "bool.pxd", - "complex.pxd", + ".tox/py35/lib/python3.5/site-packages/Cython/Includes/cpython/array.pxd", + ".tox/py35/lib/python3.5/site-packages/Cython/Includes/cpython/type.pxd", + ".tox/py35/lib/python3.5/site-packages/Cython/Includes/cpython/bool.pxd", + ".tox/py35/lib/python3.5/site-packages/Cython/Includes/cpython/complex.pxd", }; /*--- Type declarations ---*/ diff --git a/zarr/compat.py b/zarr/compat.py index c35315d32c..11a1db6ddf 100644 --- a/zarr/compat.py +++ b/zarr/compat.py @@ -9,18 +9,14 @@ if PY2: # pragma: no cover - def itervalues(d, **kw): - return d.itervalues(**kw) - text_type = unicode binary_type = str integer_types = (int, long) + reduce = reduce else: - def itervalues(d, **kw): - return iter(d.values(**kw)) - text_type = str binary_type = bytes integer_types = int, + from functools import reduce diff --git a/zarr/compressors.py b/zarr/compressors.py index dbd12748d9..881165e8ce 100644 --- a/zarr/compressors.py +++ b/zarr/compressors.py @@ -138,7 +138,7 @@ def normalize_opts(cls, compression_opts): if shuffle not in [0, 1, 2]: raise ValueError('invalid shuffle: %s' % shuffle) - # construct normalised options + # construct normalized options compression_opts = dict( cname=cname, clevel=clevel, shuffle=shuffle ) diff --git a/zarr/core.py b/zarr/core.py index b230abf488..ea64e1f06e 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, division -from functools import reduce # TODO PY2 compatibility import operator import itertools @@ -10,27 +9,39 @@ from zarr.compressors import get_compressor_cls from zarr.util import is_total_slice, normalize_array_selection, \ - get_chunk_range, human_readable_size, normalize_resize_args -from zarr.meta import decode_metadata, encode_metadata + get_chunk_range, human_readable_size, normalize_resize_args, \ + normalize_storage_path +from zarr.storage import array_meta_key, attrs_key, listdir, getsize +from zarr.meta import decode_array_metadata, encode_array_metadata from zarr.attrs import Attributes -from zarr.compat import itervalues from zarr.errors import ReadOnlyError +from zarr.compat import reduce class Array(object): - """Instantiate an array from an initialised store. + """Instantiate an array from an initialized store. Parameters ---------- store : MutableMapping - Array store, already initialised. - readonly : bool, optional + Array store, already initialized. + path : string, optional + Storage path. 
+ read_only : bool, optional True if array should be protected against modification. + chunk_store : MutableMapping, optional + Separate storage for chunks. If not provided, `store` will be used + for storage of both chunks and metadata. + synchronizer : object, optional + Array synchronizer. Attributes ---------- store - readonly + path + name + read_only + chunk_store shape chunks dtype @@ -38,6 +49,7 @@ class Array(object): compression_opts fill_value order + synchronizer attrs size itemsize @@ -53,35 +65,34 @@ class Array(object): resize append - Examples - -------- - >>> import zarr - >>> store = dict() - >>> zarr.init_store(store, shape=(10000, 10000), chunks=(1000, 1000)) - >>> z = zarr.Array(store) - >>> z - zarr.core.Array((10000, 10000), float64, chunks=(1000, 1000), order=C) - compression: blosc; compression_opts: {'clevel': 5, 'cname': 'lz4', 'shuffle': 1} - nbytes: 762.9M; nbytes_stored: 316; ratio: 2531645.6; initialized: 0/100 - store: builtins.dict - """ # flake8: noqa - def __init__(self, store, readonly=False): - # N.B., expect at this point store is fully initialised with all - # configuration metadata fully specified and normalised + def __init__(self, store, path=None, read_only=False, chunk_store=None, + synchronizer=None): + # N.B., expect at this point store is fully initialized with all + # configuration metadata fully specified and normalized - #: store docstring - self._store = store #: inline docstring - self._readonly = readonly + self._store = store + self._path = normalize_storage_path(path) + if self._path: + self._key_prefix = self._path + '/' + else: + self._key_prefix = '' + self._read_only = read_only + if chunk_store is None: + self._chunk_store = store + else: + self._chunk_store = chunk_store + self._synchronizer = synchronizer - # initialise metadata + # initialize metadata try: - meta_bytes = store['meta'] + mkey = self._key_prefix + array_meta_key + meta_bytes = store[mkey] except KeyError: raise ValueError('store has no metadata') else: - meta = decode_metadata(meta_bytes) + meta = decode_array_metadata(meta_bytes) self._meta = meta self._shape = meta['shape'] self._chunks = meta['chunks'] @@ -93,15 +104,18 @@ def __init__(self, store, readonly=False): compressor_cls = get_compressor_cls(self._compression) self._compressor = compressor_cls(self._compression_opts) - # initialise attributes - self._attrs = Attributes(store, readonly=readonly) + # initialize attributes + akey = self._key_prefix + attrs_key + self._attrs = Attributes(store, key=akey, read_only=read_only, + synchronizer=synchronizer) - def flush_metadata(self): + def _flush_metadata(self): meta = dict(shape=self._shape, chunks=self._chunks, dtype=self._dtype, compression=self._compression, compression_opts=self._compression_opts, fill_value=self._fill_value, order=self._order) - self._store['meta'] = encode_metadata(meta) + mkey = self._key_prefix + array_meta_key + self._store[mkey] = encode_array_metadata(meta) @property def store(self): @@ -109,9 +123,31 @@ def store(self): return self._store @property - def readonly(self): - """A boolean, True if write operations are not permitted.""" - return self._readonly + def path(self): + """Storage path.""" + return self._path + + @property + def name(self): + """Array name following h5py convention.""" + if self.path: + # follow h5py convention: add leading slash + name = self.path + if name[0] != '/': + name = '/' + name + return name + return None + + @property + def read_only(self): + """A boolean, True if modification operations are not 
permitted.""" + return self._read_only + + @property + def chunk_store(self): + """A MutableMapping providing the underlying storage for array + chunks.""" + return self._chunk_store @property def shape(self): @@ -153,6 +189,11 @@ def order(self): chunks of the array.""" return self._order + @property + def synchronizer(self): + """TODO doc me""" + return self._synchronizer + @property def attrs(self): """A MutableMapping containing user-defined attributes. Note that @@ -179,21 +220,22 @@ def nbytes(self): def nbytes_stored(self): """The total number of stored bytes of data for the array. This includes storage required for configuration metadata and user - attributes encoded as JSON.""" - if hasattr(self._store, 'size'): - # pass through - return self._store.size - elif isinstance(self._store, dict): - # cheap to compute by summing length of values - return sum(len(v) for v in itervalues(self._store)) + attributes.""" + m = getsize(self._store, self._path) + if self._store == self._chunk_store: + return m else: - return -1 + n = getsize(self._chunk_store, self._path) + if m < 0 or n < 0: + return -1 + else: + return m + n @property def initialized(self): """The number of chunks that have been initialized with some data.""" - # N.B., expect 'meta' and 'attrs' keys in store also, so subtract 2 - return len(self._store) - 2 + return sum(1 for k in listdir(self._chunk_store, self._path) + if k not in [array_meta_key, attrs_key]) @property def cdata_shape(self): @@ -203,6 +245,16 @@ def cdata_shape(self): int(np.ceil(s / c)) for s, c in zip(self._shape, self._chunks) ) + def __eq__(self, other): + return ( + isinstance(other, Array) and + self.store == other.store and + self.read_only == other.read_only and + self.path == other.path + # N.B., no need to compare other properties, should be covered by + # store comparison + ) + def __array__(self): return self[:] @@ -401,7 +453,7 @@ def __setitem__(self, key, value): """ # guard conditions - if self._readonly: + if self._read_only: raise ReadOnlyError('array is read-only') # normalize selection @@ -473,8 +525,8 @@ def _chunk_getitem(self, cidx, item, dest): try: # obtain compressed data for chunk - ckey = '.'.join(map(str, cidx)) - cdata = self._store[ckey] + ckey = self._chunk_key(cidx) + cdata = self._chunk_store[ckey] except KeyError: @@ -522,6 +574,20 @@ def _chunk_setitem(self, cidx, key, value): """ + # synchronization + if self._synchronizer is None: + self._chunk_setitem_nosync(cidx, key, value) + else: + # synchronize on the chunk + ckey = self._chunk_key(cidx) + with self._synchronizer[ckey]: + self._chunk_setitem_nosync(cidx, key, value) + + def _chunk_setitem_nosync(self, cidx, key, value): + + # obtain key for chunk storage + ckey = self._chunk_key(cidx) + if is_total_slice(key, self._chunks): # optimisation: we are completely replacing the chunk, so no need @@ -548,8 +614,7 @@ def _chunk_setitem(self, cidx, key, value): try: # obtain compressed data for chunk - ckey = '.'.join(map(str, cidx)) - cdata = self._store[ckey] + cdata = self._chunk_store[ckey] except KeyError: @@ -573,18 +638,22 @@ def _chunk_setitem(self, cidx, key, value): cdata = self._compressor.compress(chunk) # store - ckey = '.'.join(map(str, cidx)) - self._store[ckey] = cdata + self._chunk_store[ckey] = cdata + + def _chunk_key(self, cidx): + return self._key_prefix + '.'.join(map(str, cidx)) def __repr__(self): r = '%s.%s(' % (type(self).__module__, type(self).__name__) - r += '%s' % str(self._shape) - r += ', %s' % str(self._dtype) - r += ', chunks=%s' % 
str(self._chunks) - r += ', order=%s' % self._order + if self.name: + r += '%s, ' % self.name + r += '%s, ' % str(self.shape) + r += '%s, ' % str(self.dtype) + r += 'chunks=%s, ' % str(self.chunks) + r += 'order=%s' % self.order r += ')' - r += '\n compression: %s' % self._compression - r += '; compression_opts: %s' % str(self._compression_opts) + r += '\n compression: %s' % self.compression + r += '; compression_opts: %s' % str(self.compression_opts) r += '\n nbytes: %s' % human_readable_size(self.nbytes) if self.nbytes_stored > 0: r += '; nbytes_stored: %s' % human_readable_size( @@ -592,16 +661,40 @@ def __repr__(self): r += '; ratio: %.1f' % (self.nbytes / self.nbytes_stored) n_chunks = reduce(operator.mul, self.cdata_shape) r += '; initialized: %s/%s' % (self.initialized, n_chunks) - r += '\n store: %s.%s' % (type(self._store).__module__, - type(self._store).__name__) + r += '\n store: %s.%s' % (type(self.store).__module__, + type(self.store).__name__) + if self._store != self._chunk_store: + r += '\n chunk_store: %s.%s' % \ + (type(self._chunk_store).__module__, + type(self._chunk_store).__name__) + if self._synchronizer is not None: + r += ('\n synchronizer: %s.%s' % + (type(self._synchronizer).__module__, + type(self._synchronizer).__name__)) return r def __getstate__(self): - return self._store, self._readonly + return self._store, self._path, self._read_only, self._chunk_store, \ + self._synchronizer def __setstate__(self, state): self.__init__(*state) + def _write_op(self, f, *args, **kwargs): + + # guard condition + if self._read_only: + raise ReadOnlyError('array is read-only') + + # synchronization + if self._synchronizer is None: + return f(*args, **kwargs) + else: + # synchronize on the array + mkey = self._key_prefix + array_meta_key + with self._synchronizer[mkey]: + return f(*args, **kwargs) + def resize(self, *args): """Change the shape of the array by growing or shrinking one or more dimensions. @@ -637,9 +730,9 @@ def resize(self, *args): """ # flake8: noqa - # guard conditions - if self._readonly: - raise ReadOnlyError('array is read-only') + return self._write_op(self._resize_nosync, *args) + + def _resize_nosync(self, *args): # normalize new shape argument old_shape = self._shape @@ -651,17 +744,17 @@ def resize(self, *args): for s, c in zip(new_shape, chunks)) # remove any chunks not within range - for key in list(self._store): - if key not in ['meta', 'attrs']: + for key in listdir(self._chunk_store, self._path): + if key not in [array_meta_key, attrs_key]: cidx = map(int, key.split('.')) if all(i < c for i, c in zip(cidx, new_cdata_shape)): pass # keep the chunk else: - del self._store[key] + del self._chunk_store[self._key_prefix + key] # update metadata self._shape = new_shape - self.flush_metadata() + self._flush_metadata() def append(self, data, axis=0): """Append `data` to `axis`. 
@@ -703,10 +796,9 @@ def append(self, data, axis=0): store: builtins.dict """ + return self._write_op(self._append_nosync, data, axis=axis) - # guard conditions - if self._readonly: - raise ReadOnlyError('array is read-only') + def _append_nosync(self, data, axis=0): # ensure data is array-like if not hasattr(data, 'shape') or not hasattr(data, 'dtype'): @@ -730,7 +822,7 @@ def append(self, data, axis=0): ) # resize - self.resize(new_shape) + self._resize_nosync(new_shape) # store data # noinspection PyTypeChecker diff --git a/zarr/creation.py b/zarr/creation.py index 0240ecfcae..83dc52aa05 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -1,27 +1,26 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, division -import os import numpy as np from zarr.core import Array -from zarr.sync import SynchronizedArray -from zarr.storage import DirectoryStore, init_store +from zarr.storage import DirectoryStore, init_array, contains_array, \ + contains_group -def create(shape, chunks, dtype=None, compression='default', +def create(shape, chunks=None, dtype=None, compression='default', compression_opts=None, fill_value=None, order='C', store=None, - synchronizer=None, overwrite=False): + synchronizer=None, overwrite=False, path=None, chunk_store=None): """Create an array. Parameters ---------- shape : int or tuple of ints Array shape. - chunks : int or tuple of ints - Chunk shape. + chunks : int or tuple of ints, optional + Chunk shape. If not provided, will be guessed from `shape` and `dtype`. dtype : string or dtype, optional NumPy dtype. compression : string, optional @@ -31,17 +30,22 @@ def create(shape, chunks, dtype=None, compression='default', Options to primary compressor. E.g., for blosc, provide a dictionary with keys 'cname', 'clevel' and 'shuffle'. fill_value : object - Default value to use for uninitialised portions of the array. + Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional Memory layout to be used within each chunk. store : MutableMapping, optional Array storage. If not provided, a Python dict will be used, meaning array data will be stored in memory. - synchronizer : zarr.sync.ArraySynchronizer, optional + synchronizer : object, optional Array synchronizer. overwrite : bool, optional If True, delete all pre-existing data in `store` before creating the array. + path : string, optional + Path under which array is stored. + chunk_store : MutableMapping, optional + Separate storage for chunks. If not provided, `store` will be used + for storage of both chunks and metadata. 
Returns ------- @@ -62,24 +66,26 @@ def create(shape, chunks, dtype=None, compression='default', """ # flake8: noqa - # initialise store + # initialize store if store is None: store = dict() - init_store(store, shape=shape, chunks=chunks, dtype=dtype, + + # initialize array metadata + init_array(store, shape=shape, chunks=chunks, dtype=dtype, compression=compression, compression_opts=compression_opts, - fill_value=fill_value, order=order, overwrite=overwrite) + fill_value=fill_value, order=order, overwrite=overwrite, + path=path, chunk_store=chunk_store) # instantiate array - if synchronizer is not None: - z = SynchronizedArray(store, synchronizer) - else: - z = Array(store) + z = Array(store, path=path, chunk_store=chunk_store, + synchronizer=synchronizer) return z -def empty(shape, chunks, dtype=None, compression='default', - compression_opts=None, order='C', store=None, synchronizer=None): +def empty(shape, chunks=None, dtype=None, compression='default', + compression_opts=None, order='C', store=None, synchronizer=None, + path=None, overwrite=False, chunk_store=None): """Create an empty array. For parameter definitions see :func:`zarr.creation.create`. @@ -94,13 +100,15 @@ def empty(shape, chunks, dtype=None, compression='default', return create(shape=shape, chunks=chunks, dtype=dtype, compression=compression, compression_opts=compression_opts, fill_value=None, order=order, store=store, - synchronizer=synchronizer) + synchronizer=synchronizer, path=path, overwrite=overwrite, + chunk_store=chunk_store) -def zeros(shape, chunks, dtype=None, compression='default', - compression_opts=None, order='C', store=None, synchronizer=None): +def zeros(shape, chunks=None, dtype=None, compression='default', + compression_opts=None, order='C', store=None, synchronizer=None, + path=None, overwrite=False, chunk_store=None): """Create an array, with zero being used as the default value for - uninitialised portions of the array. + uninitialized portions of the array. For parameter definitions see :func:`zarr.creation.create`. @@ -122,13 +130,15 @@ def zeros(shape, chunks, dtype=None, compression='default', return create(shape=shape, chunks=chunks, dtype=dtype, compression=compression, compression_opts=compression_opts, fill_value=0, order=order, - store=store, synchronizer=synchronizer) + store=store, synchronizer=synchronizer, path=path, + overwrite=overwrite, chunk_store=chunk_store) -def ones(shape, chunks, dtype=None, compression='default', - compression_opts=None, order='C', store=None, synchronizer=None): +def ones(shape, chunks=None, dtype=None, compression='default', + compression_opts=None, order='C', store=None, synchronizer=None, + path=None, overwrite=False, chunk_store=None): """Create an array, with one being used as the default value for - uninitialised portions of the array. + uninitialized portions of the array. For parameter definitions see :func:`zarr.creation.create`. 
@@ -150,13 +160,15 @@ def ones(shape, chunks, dtype=None, compression='default', return create(shape=shape, chunks=chunks, dtype=dtype, compression=compression, compression_opts=compression_opts, fill_value=1, order=order, store=store, - synchronizer=synchronizer) + synchronizer=synchronizer, path=path, overwrite=overwrite, + chunk_store=chunk_store) -def full(shape, chunks, fill_value, dtype=None, compression='default', - compression_opts=None, order='C', store=None, synchronizer=None): +def full(shape, fill_value, chunks=None, dtype=None, compression='default', + compression_opts=None, order='C', store=None, synchronizer=None, + path=None, overwrite=False, chunk_store=None): """Create an array, with `fill_value` being used as the default value for - uninitialised portions of the array. + uninitialized portions of the array. For parameter definitions see :func:`zarr.creation.create`. @@ -178,12 +190,13 @@ def full(shape, chunks, fill_value, dtype=None, compression='default', return create(shape=shape, chunks=chunks, dtype=dtype, compression=compression, compression_opts=compression_opts, fill_value=fill_value, order=order, store=store, - synchronizer=synchronizer) + synchronizer=synchronizer, path=path, overwrite=overwrite, + chunk_store=chunk_store) def array(data, chunks=None, dtype=None, compression='default', compression_opts=None, fill_value=None, order='C', store=None, - synchronizer=None): + synchronizer=None, path=None, overwrite=False, chunk_store=None): """Create an array filled with `data`. The `data` argument should be a NumPy array or array-like object. For @@ -230,7 +243,8 @@ def array(data, chunks=None, dtype=None, compression='default', z = create(shape=shape, chunks=chunks, dtype=dtype, compression=compression, compression_opts=compression_opts, fill_value=fill_value, order=order, store=store, - synchronizer=synchronizer) + synchronizer=synchronizer, path=path, overwrite=overwrite, + chunk_store=chunk_store) # fill with data z[:] = data @@ -238,25 +252,25 @@ def array(data, chunks=None, dtype=None, compression='default', return z -# noinspection PyShadowingBuiltins -def open(path, mode='a', shape=None, chunks=None, dtype=None, - compression='default', compression_opts=None, fill_value=0, order='C', - synchronizer=None): - """Open an array stored in a directory on the file system. +def open_array(path, mode='a', shape=None, chunks=None, dtype=None, + compression='default', compression_opts=None, fill_value=0, + order='C', synchronizer=None): + """Convenience function to instantiate an array stored in a + directory on the file system. Parameters ---------- path : string - Path to directory in which to store the array. + Path to directory in file system in which to store the array. mode : {'r', 'r+', 'a', 'w', 'w-'} - Persistence mode: 'r' means readonly (must exist); 'r+' means + Persistence mode: 'r' means read only (must exist); 'r+' means read/write (must exist); 'a' means read/write (create if doesn't exist); 'w' means create (overwrite if exists); 'w-' means create (fail if exists). shape : int or tuple of ints Array shape. - chunks : int or tuple of ints - Chunk shape. + chunks : int or tuple of ints, optional + Chunk shape. If not provided, will be guessed from `shape` and `dtype`. dtype : string or dtype, optional NumPy dtype. compression : string, optional @@ -266,10 +280,10 @@ def open(path, mode='a', shape=None, chunks=None, dtype=None, Options to primary compressor. E.g., for blosc, provide a dictionary with keys 'cname', 'clevel' and 'shuffle'. 
fill_value : object - Default value to use for uninitialised portions of the array. + Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional Memory layout to be used within each chunk. - synchronizer : zarr.sync.ArraySynchronizer, optional + synchronizer : object, optional Array synchronizer. Returns @@ -280,15 +294,15 @@ def open(path, mode='a', shape=None, chunks=None, dtype=None, -------- >>> import numpy as np >>> import zarr - >>> z1 = zarr.open('example.zarr', mode='w', shape=(10000, 10000), - ... chunks=(1000, 1000), fill_value=0) + >>> z1 = zarr.open_array('example.zarr', mode='w', shape=(10000, 10000), + ... chunks=(1000, 1000), fill_value=0) >>> z1[:] = np.arange(100000000).reshape(10000, 10000) >>> z1 zarr.core.Array((10000, 10000), float64, chunks=(1000, 1000), order=C) compression: blosc; compression_opts: {'clevel': 5, 'cname': 'lz4', 'shuffle': 1} nbytes: 762.9M; nbytes_stored: 24.8M; ratio: 30.8; initialized: 100/100 store: zarr.storage.DirectoryStore - >>> z2 = zarr.open('example.zarr', mode='r') + >>> z2 = zarr.open_array('example.zarr', mode='r') >>> z2 zarr.core.Array((10000, 10000), float64, chunks=(1000, 1000), order=C) compression: blosc; compression_opts: {'clevel': 5, 'cname': 'lz4', 'shuffle': 1} @@ -306,45 +320,61 @@ def open(path, mode='a', shape=None, chunks=None, dtype=None, # use same mode semantics as h5py, although N.B., here `path` is a # directory: - # r : readonly, must exist + # r : read only, must exist # r+ : read/write, must exist # w : create, delete if exists # w- or x : create, fail if exists # a : read/write if exists, create otherwise (default) - # ensure directory exists - if not os.path.exists(path): - if mode in ['w', 'w-', 'x', 'a']: - os.makedirs(path) - elif mode in ['r', 'r+']: - raise ValueError('path does not exist: %r' % path) - # setup store store = DirectoryStore(path) - exists = 'meta' in store # use metadata key as indicator of existence # ensure store is initialized - if mode in ['r', 'r+'] and not exists: - raise ValueError('array does not exist') - elif mode in ['w-', 'x'] and exists: - raise ValueError('array exists') - elif mode == 'w' or (mode in ['a', 'w-', 'x'] and not exists): - init_store(store, shape=shape, chunks=chunks, dtype=dtype, + + if mode in ['r', 'r+']: + if contains_group(store): + raise ValueError('store contains group') + elif not contains_array(store): + raise ValueError('array does not exist') + + elif mode == 'w': + init_array(store, shape=shape, chunks=chunks, dtype=dtype, compression=compression, compression_opts=compression_opts, fill_value=fill_value, order=order, overwrite=True) - # determine readonly status - readonly = mode == 'r' + elif mode == 'a': + if contains_group(store): + raise ValueError('store contains group') + elif not contains_array(store): + init_array(store, shape=shape, chunks=chunks, dtype=dtype, + compression=compression, + compression_opts=compression_opts, + fill_value=fill_value, order=order) + + elif mode in ['w-', 'x']: + if contains_group(store): + raise ValueError('store contains group') + elif contains_array(store): + raise ValueError('store contains array') + else: + init_array(store, shape=shape, chunks=chunks, dtype=dtype, + compression=compression, + compression_opts=compression_opts, + fill_value=fill_value, order=order) + + # determine read only status + read_only = mode == 'r' - # handle optional synchronizer - if synchronizer is not None: - z = SynchronizedArray(store, synchronizer, readonly=readonly) - else: - z = Array(store, 
readonly=readonly) + # instantiate array + z = Array(store, read_only=read_only, synchronizer=synchronizer) return z +# backwards compatibility +open = open_array + + def _like_args(a, shape, chunks, dtype, compression, compression_opts, order): if shape is None: shape = a.shape @@ -352,7 +382,8 @@ def _like_args(a, shape, chunks, dtype, compression, compression_opts, order): try: chunks = a.chunks except AttributeError: - raise ValueError('chunks must be specified') + # use auto-chunking + pass if dtype is None: dtype = a.dtype if compression is None: @@ -375,43 +406,49 @@ def _like_args(a, shape, chunks, dtype, compression, compression_opts, order): def empty_like(a, shape=None, chunks=None, dtype=None, compression=None, compression_opts=None, order=None, store=None, - synchronizer=None): + synchronizer=None, path=None, overwrite=False, + chunk_store=None): """Create an empty array like `a`.""" shape, chunks, dtype, compression, compression_opts, order = \ _like_args(a, shape, chunks, dtype, compression, compression_opts, order) - return empty(shape, chunks, dtype=dtype, compression=compression, + return empty(shape, chunks=chunks, dtype=dtype, compression=compression, compression_opts=compression_opts, order=order, - store=store, synchronizer=synchronizer) + store=store, synchronizer=synchronizer, path=path, + overwrite=overwrite, chunk_store=chunk_store) def zeros_like(a, shape=None, chunks=None, dtype=None, compression=None, compression_opts=None, order=None, store=None, - synchronizer=None): + synchronizer=None, path=None, overwrite=False, + chunk_store=None): """Create an array of zeros like `a`.""" shape, chunks, dtype, compression, compression_opts, order = \ _like_args(a, shape, chunks, dtype, compression, compression_opts, order) - return zeros(shape, chunks, dtype=dtype, compression=compression, + return zeros(shape, chunks=chunks, dtype=dtype, compression=compression, compression_opts=compression_opts, order=order, - store=store, synchronizer=synchronizer) + store=store, synchronizer=synchronizer, path=path, + overwrite=overwrite, chunk_store=chunk_store) def ones_like(a, shape=None, chunks=None, dtype=None, compression=None, compression_opts=None, order=None, store=None, - synchronizer=None): + synchronizer=None, path=None, overwrite=False, chunk_store=None): """Create an array of ones like `a`.""" shape, chunks, dtype, compression, compression_opts, order = \ _like_args(a, shape, chunks, dtype, compression, compression_opts, order) - return ones(shape, chunks, dtype=dtype, compression=compression, + return ones(shape, chunks=chunks, dtype=dtype, compression=compression, compression_opts=compression_opts, order=order, - store=store, synchronizer=synchronizer) + store=store, synchronizer=synchronizer, path=path, + overwrite=overwrite, chunk_store=chunk_store) def full_like(a, shape=None, chunks=None, fill_value=None, dtype=None, compression=None, compression_opts=None, order=None, - store=None, synchronizer=None): + store=None, synchronizer=None, path=None, overwrite=False, + chunk_store=None): """Create a filled array like `a`.""" shape, chunks, dtype, compression, compression_opts, order = \ _like_args(a, shape, chunks, dtype, compression, compression_opts, @@ -421,9 +458,10 @@ def full_like(a, shape=None, chunks=None, fill_value=None, dtype=None, fill_value = a.fill_value except AttributeError: raise ValueError('fill_value must be specified') - return full(shape, chunks, fill_value, dtype=dtype, + return full(shape, chunks=chunks, fill_value=fill_value, dtype=dtype, 
                 compression=compression, compression_opts=compression_opts,
-                order=order, store=store, synchronizer=synchronizer)
+                order=order, store=store, synchronizer=synchronizer,
+                path=path, overwrite=overwrite, chunk_store=chunk_store)
 
 
 def open_like(a, path, mode='a', shape=None, chunks=None, dtype=None,
@@ -439,6 +477,8 @@ def open_like(a, path, mode='a', shape=None, chunks=None, dtype=None,
     except AttributeError:
         # leave empty
         pass
-    return open(path, mode=mode, shape=shape, chunks=chunks, dtype=dtype,
-                compression=compression, compression_opts=compression_opts,
-                fill_value=fill_value, order=order, synchronizer=synchronizer)
+    return open_array(path, mode=mode, shape=shape, chunks=chunks, dtype=dtype,
+                      compression=compression,
+                      compression_opts=compression_opts,
+                      fill_value=fill_value, order=order,
+                      synchronizer=synchronizer)
diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py
new file mode 100644
index 0000000000..b9a2246498
--- /dev/null
+++ b/zarr/hierarchy.py
@@ -0,0 +1,916 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, print_function, division
+from collections import Mapping
+from warnings import warn
+
+
+import numpy as np
+
+
+from zarr.attrs import Attributes
+from zarr.core import Array
+from zarr.storage import contains_array, contains_group, init_group, \
+    DictStore, DirectoryStore, group_meta_key, attrs_key, listdir
+from zarr.creation import array, create, empty, zeros, ones, full, \
+    empty_like, zeros_like, ones_like, full_like
+from zarr.util import normalize_storage_path, normalize_shape
+from zarr.errors import ReadOnlyError
+from zarr.meta import decode_group_metadata
+
+
+class Group(Mapping):
+    """Instantiate a group from an initialized store.
+
+    Parameters
+    ----------
+    store : MutableMapping
+        Group store, already initialized.
+    path : string, optional
+        Storage path.
+    read_only : bool, optional
+        True if the group should be protected against modification.
+    chunk_store : MutableMapping, optional
+        Separate storage for chunks. If not provided, `store` will be used
+        for storage of both chunks and metadata.
+    synchronizer : object, optional
+        Array synchronizer.
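The constructor expects a store that has already been initialized; passing an uninitialized store raises ValueError, as the guard conditions below show. A sketch of the low-level path, using the in-memory DictStore introduced in this change::

    from zarr.storage import DictStore, init_group
    from zarr.hierarchy import Group

    store = DictStore()
    init_group(store)                     # writes the .zgroup and .zattrs keys
    g = Group(store)                      # wrap the initialized store
    g_ro = Group(store, read_only=True)   # protected against modification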
+
+    Attributes
+    ----------
+    store
+    path
+    name
+    read_only
+    chunk_store
+    synchronizer
+    attrs
+
+    Methods
+    -------
+    __len__
+    __iter__
+    __contains__
+    __getitem__
+    group_keys
+    groups
+    array_keys
+    arrays
+    create_group
+    require_group
+    create_groups
+    require_groups
+    create_dataset
+    require_dataset
+    create
+    empty
+    zeros
+    ones
+    full
+    array
+    empty_like
+    zeros_like
+    ones_like
+    full_like
+
+    """
+
+    def __init__(self, store, path=None, read_only=False, chunk_store=None,
+                 synchronizer=None):
+
+        self._store = store
+        self._path = normalize_storage_path(path)
+        if self._path:
+            self._key_prefix = self._path + '/'
+        else:
+            self._key_prefix = ''
+        self._read_only = read_only
+        if chunk_store is None:
+            self._chunk_store = store
+        else:
+            self._chunk_store = chunk_store
+        self._synchronizer = synchronizer
+
+        # guard conditions
+        if contains_array(store, path=self._path):
+            raise ValueError('store contains an array')
+
+        # initialize metadata
+        try:
+            mkey = self._key_prefix + group_meta_key
+            meta_bytes = store[mkey]
+        except KeyError:
+            raise ValueError('store has no metadata')
+        else:
+            meta = decode_group_metadata(meta_bytes)
+            self._meta = meta
+
+        # setup attributes
+        akey = self._key_prefix + attrs_key
+        self._attrs = Attributes(store, key=akey, read_only=read_only,
+                                 synchronizer=synchronizer)
+
+    @property
+    def store(self):
+        """A MutableMapping providing the underlying storage for the group."""
+        return self._store
+
+    @property
+    def path(self):
+        """Storage path."""
+        return self._path
+
+    @property
+    def name(self):
+        """Group name following h5py convention."""
+        if self._path:
+            # follow h5py convention: add leading slash
+            name = self._path
+            if name[0] != '/':
+                name = '/' + name
+            return name
+        return '/'
+
+    @property
+    def read_only(self):
+        """A boolean, True if modification operations are not permitted."""
+        return self._read_only
+
+    @property
+    def chunk_store(self):
+        """A MutableMapping providing the underlying storage for array
+        chunks."""
+        return self._chunk_store
+
+    @property
+    def synchronizer(self):
+        """Object used to synchronize write access to the group and its
+        members."""
+        return self._synchronizer
+
+    @property
+    def attrs(self):
+        """A MutableMapping containing user-defined attributes. Note that
+        attribute values must be JSON serializable."""
+        return self._attrs
+
+    def __eq__(self, other):
+        return (
+            isinstance(other, Group) and
+            self._store == other.store and
+            self._read_only == other.read_only and
+            self._path == other.path
+            # N.B., no need to compare attributes, should be covered by
+            # store comparison
+        )
+
+    def __iter__(self):
+        """Return an iterator over group member names.
+
+        Examples
+        --------
+        >>> import zarr
+        >>> g1 = zarr.group()
+        >>> g2 = g1.create_group('foo')
+        >>> g3 = g1.create_group('bar')
+        >>> d1 = g1.create_dataset('baz', shape=100, chunks=10)
+        >>> d2 = g1.create_dataset('quux', shape=200, chunks=20)
+        >>> for name in g1:
+        ...
print(name) + bar + baz + foo + quux + + """ + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if (contains_array(self._store, path) or + contains_group(self._store, path)): + yield key + + def __len__(self): + """Number of members.""" + return sum(1 for _ in self) + + def __repr__(self): + r = '%s.%s(' % (type(self).__module__, type(self).__name__) + r += self.name + ', ' + r += str(len(self)) + r += ')' + array_keys = list(self.array_keys()) + if array_keys: + arrays_line = '\n arrays: %s; %s' % \ + (len(array_keys), ', '.join(array_keys)) + if len(arrays_line) > 80: + arrays_line = arrays_line[:77] + '...' + r += arrays_line + group_keys = list(self.group_keys()) + if group_keys: + groups_line = '\n groups: %s; %s' % \ + (len(group_keys), ', '.join(group_keys)) + if len(groups_line) > 80: + groups_line = groups_line[:77] + '...' + r += groups_line + r += '\n store: %s.%s' % (type(self._store).__module__, + type(self._store).__name__) + if self._store != self._chunk_store: + r += '\n chunk_store: %s.%s' % \ + (type(self._chunk_store).__module__, + type(self._chunk_store).__name__) + if self._synchronizer is not None: + r += ('\n synchronizer: %s.%s' % + (type(self._synchronizer).__module__, + type(self._synchronizer).__name__)) + return r + + def __getstate__(self): + return self._store, self._path, self._read_only, self._chunk_store, \ + self._synchronizer + + def __setstate__(self, state): + self.__init__(*state) + + def _item_path(self, item): + if item and item[0] == '/': + # absolute path + path = normalize_storage_path(item) + else: + # relative path + path = normalize_storage_path(item) + if self._path: + path = self._key_prefix + path + return path + + def __contains__(self, item): + """Test for group membership. + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> d1 = g1.create_dataset('bar', shape=100, chunks=10) + >>> 'foo' in g1 + True + >>> 'bar' in g1 + True + >>> 'baz' in g1 + False + + """ + path = self._item_path(item) + return contains_array(self._store, path) or \ + contains_group(self._store, path) + + def __getitem__(self, item): + """Obtain a group member. + + Parameters + ---------- + item : string + Member name or path. + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> d1 = g1.create_dataset('foo/bar/baz', shape=100, chunks=10) + >>> g1['foo'] + zarr.hierarchy.Group(/foo, 1) + groups: 1; bar + store: zarr.storage.DictStore + >>> g1['foo/bar'] + zarr.hierarchy.Group(/foo/bar, 1) + arrays: 1; baz + store: zarr.storage.DictStore + >>> g1['foo/bar/baz'] + zarr.core.Array(/foo/bar/baz, (100,), float64, chunks=(10,), order=C) + compression: blosc; compression_opts: {'clevel': 5, 'cname': 'lz4', 'shuffle': 1} + nbytes: 800; nbytes_stored: 283; ratio: 2.8; initialized: 0/10 + store: zarr.storage.DictStore + + """ # flake8: noqa + path = self._item_path(item) + if contains_array(self._store, path): + return Array(self._store, read_only=self._read_only, path=path, + chunk_store=self._chunk_store, + synchronizer=self._synchronizer) + elif contains_group(self._store, path): + return Group(self._store, read_only=self._read_only, path=path, + chunk_store=self._chunk_store, + synchronizer=self._synchronizer) + else: + raise KeyError(item) + + def group_keys(self): + """Return an iterator over member names for groups only. 
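Because ``__getstate__`` reduces a group to ``(store, path, read_only, chunk_store, synchronizer)``, groups can be pickled whenever the underlying store is itself picklable. A sketch, noting that for an in-memory store the unpickled copy is independent of the original::

    import pickle
    import zarr

    g = zarr.group()
    g2 = pickle.loads(pickle.dumps(g))   # rebuilt via __setstate__
    assert g == g2                       # compares store, path and read_only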
+ + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> d1 = g1.create_dataset('baz', shape=100, chunks=10) + >>> d2 = g1.create_dataset('quux', shape=200, chunks=20) + >>> sorted(g1.group_keys()) + ['bar', 'foo'] + + """ + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_group(self._store, path): + yield key + + def groups(self): + """Return an iterator over (name, value) pairs for groups only. + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> d1 = g1.create_dataset('baz', shape=100, chunks=10) + >>> d2 = g1.create_dataset('quux', shape=200, chunks=20) + >>> for n, v in g1.groups(): + ... print(n, type(v)) + bar + foo + + """ + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_group(self._store, path): + yield key, Group(self._store, path=path, + read_only=self._read_only, + chunk_store=self._chunk_store, + synchronizer=self._synchronizer) + + def array_keys(self): + """Return an iterator over member names for arrays only. + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> d1 = g1.create_dataset('baz', shape=100, chunks=10) + >>> d2 = g1.create_dataset('quux', shape=200, chunks=20) + >>> sorted(g1.array_keys()) + ['baz', 'quux'] + + """ + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_array(self._store, path): + yield key + + def arrays(self): + """Return an iterator over (name, value) pairs for arrays only. + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> d1 = g1.create_dataset('baz', shape=100, chunks=10) + >>> d2 = g1.create_dataset('quux', shape=200, chunks=20) + >>> for n, v in g1.arrays(): + ... print(n, type(v)) + baz + quux + + """ + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_array(self._store, path): + yield key, Array(self._store, path=path, + read_only=self._read_only, + chunk_store=self._chunk_store, + synchronizer=self._synchronizer) + + def _write_op(self, f, *args, **kwargs): + + # guard condition + if self._read_only: + raise ReadOnlyError('group is read-only') + + # synchronization + if self._synchronizer is None: + return f(*args, **kwargs) + else: + # synchronize on the root group + with self._synchronizer[group_meta_key]: + return f(*args, **kwargs) + + def create_group(self, name): + """Create a sub-group. + + Parameters + ---------- + name : string + Group name. 
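The plural convenience methods ``create_groups`` and ``require_groups``, defined further below, simply map their singular counterparts over several names and return a tuple; a sketch::

    import zarr

    g1 = zarr.group()
    g2, g3 = g1.create_groups('foo', 'bar')    # tuple of new groups
    g4, g5 = g1.require_groups('foo', 'baz')   # idempotent counterpart
    assert g2 == g4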
+ + Returns + ------- + g : zarr.hierarchy.Group + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.create_group('foo') + >>> g3 = g1.create_group('bar') + >>> g4 = g1.create_group('baz/quux') + + """ + + return self._write_op(self._create_group_nosync, name) + + def _create_group_nosync(self, name): + + path = self._item_path(name) + + # require intermediate groups + segments = path.split('/') + for i in range(len(segments)): + p = '/'.join(segments[:i]) + if contains_array(self._store, p): + raise KeyError(name) + elif not contains_group(self._store, p): + init_group(self._store, path=p, chunk_store=self._chunk_store) + + # create terminal group + if contains_array(self._store, path): + raise KeyError(name) + if contains_group(self._store, path): + raise KeyError(name) + else: + init_group(self._store, path=path, chunk_store=self._chunk_store) + return Group(self._store, path=path, read_only=self._read_only, + chunk_store=self._chunk_store, + synchronizer=self._synchronizer) + + def create_groups(self, *names): + """Convenience method to create multiple groups in a single call.""" + return tuple(self.create_group(name) for name in names) + + def require_group(self, name): + """Obtain a sub-group, creating one if it doesn't exist. + + Parameters + ---------- + name : string + Group name. + + Returns + ------- + g : zarr.hierarchy.Group + + Examples + -------- + >>> import zarr + >>> g1 = zarr.group() + >>> g2 = g1.require_group('foo') + >>> g3 = g1.require_group('foo') + >>> g2 == g3 + True + + """ + + return self._write_op(self._require_group_nosync, name) + + def _require_group_nosync(self, name): + + path = self._item_path(name) + + # require all intermediate groups + segments = path.split('/') + for i in range(len(segments) + 1): + p = '/'.join(segments[:i]) + if contains_array(self._store, p): + raise KeyError(name) + elif not contains_group(self._store, p): + init_group(self._store, path=p, chunk_store=self._chunk_store) + + return Group(self._store, path=path, read_only=self._read_only, + chunk_store=self._chunk_store, + synchronizer=self._synchronizer) + + def require_groups(self, *names): + """Convenience method to require multiple groups in a single call.""" + return tuple(self.require_group(name) for name in names) + + def _require_parent_group(self, path): + segments = path.split('/') + for i in range(len(segments)): + p = '/'.join(segments[:i]) + if contains_array(self._store, p): + raise KeyError(path) + elif not contains_group(self._store, p): + init_group(self._store, path=p, chunk_store=self._chunk_store) + + def create_dataset(self, name, data=None, shape=None, chunks=None, + dtype=None, compression='default', + compression_opts=None, fill_value=None, order='C', + synchronizer=None, **kwargs): + """Create an array. + + Parameters + ---------- + name : string + Array name. + data : array_like, optional + Initial data. + shape : int or tuple of ints + Array shape. + chunks : int or tuple of ints, optional + Chunk shape. If not provided, will be guessed from `shape` and + `dtype`. + dtype : string or dtype, optional + NumPy dtype. + compression : string, optional + Name of primary compression library, e.g., 'blosc', 'zlib', 'bz2', + 'lzma'. + compression_opts : object, optional + Options to primary compressor. E.g., for blosc, provide a dictionary + with keys 'cname', 'clevel' and 'shuffle'. + fill_value : object + Default value to use for uninitialized portions of the array. 
+        order : {'C', 'F'}, optional
+            Memory layout to be used within each chunk.
+        synchronizer : object, optional
+            Array synchronizer.
+
+        Returns
+        -------
+        a : zarr.core.Array
+
+        Examples
+        --------
+        >>> import zarr
+        >>> g1 = zarr.group()
+        >>> d1 = g1.create_dataset('foo', shape=(10000, 10000),
+        ...                        chunks=(1000, 1000))
+        >>> d1
+        zarr.core.Array(/foo, (10000, 10000), float64, chunks=(1000, 1000), order=C)
+          compression: blosc; compression_opts: {'clevel': 5, 'cname': 'lz4', 'shuffle': 1}
+          nbytes: 762.9M; nbytes_stored: 316; ratio: 2531645.6; initialized: 0/100
+          store: zarr.storage.DictStore
+
+        """  # flake8: noqa
+
+        # N.B., additional kwargs are included in method signature to
+        # improve compatibility for users familiar with h5py and adapting
+        # code that previously used h5py. These keyword arguments are
+        # ignored here but we issue a warning to let the user know.
+        for k in kwargs:
+            if k == 'fillvalue':
+                warn("ignoring keyword argument %r; please use 'fill_value' "
+                     "instead" % k)
+            else:
+                warn('ignoring keyword argument %r' % k)
+
+        return self._write_op(self._create_dataset_nosync, name, data=data,
+                              shape=shape, chunks=chunks, dtype=dtype,
+                              compression=compression,
+                              compression_opts=compression_opts,
+                              fill_value=fill_value, order=order,
+                              synchronizer=synchronizer)
+
+    def _create_dataset_nosync(self, name, data=None, shape=None, chunks=None,
+                               dtype=None, compression='default',
+                               compression_opts=None, fill_value=None,
+                               order='C', synchronizer=None):
+
+        path = self._item_path(name)
+        self._require_parent_group(path)
+
+        # guard conditions
+        if contains_array(self._store, path):
+            raise KeyError(name)
+        if contains_group(self._store, path):
+            raise KeyError(name)
+
+        # determine synchronizer
+        if synchronizer is None:
+            synchronizer = self._synchronizer
+
+        if data is not None:
+            a = array(data, chunks=chunks, dtype=dtype,
+                      compression=compression,
+                      compression_opts=compression_opts,
+                      fill_value=fill_value, order=order,
+                      synchronizer=synchronizer, store=self._store,
+                      path=path, chunk_store=self._chunk_store)
+
+        else:
+            a = create(shape=shape, chunks=chunks, dtype=dtype,
+                       compression=compression,
+                       compression_opts=compression_opts,
+                       fill_value=fill_value, order=order,
+                       synchronizer=synchronizer, store=self._store,
+                       path=path, chunk_store=self._chunk_store)
+
+        return a
+
+    def require_dataset(self, name, shape, dtype=None, exact=False, **kwargs):
+        """Obtain an array, creating if it doesn't exist. Other `kwargs` are
+        as per :func:`zarr.hierarchy.Group.create_dataset`.
+
+        Parameters
+        ----------
+        name : string
+            Array name.
+        shape : int or tuple of ints
+            Array shape.
+        dtype : string or dtype, optional
+            NumPy dtype.
+        exact : bool, optional
+            If True, require `dtype` to match exactly. If False, require
+            that `dtype` can be cast safely to the array dtype.
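A sketch of the dtype checking described above: a second call with a compatible dtype returns the existing array, while an incompatible one raises TypeError::

    import zarr

    g = zarr.group()
    d1 = g.require_dataset('foo', shape=100, chunks=10, dtype='f4')
    d2 = g.require_dataset('foo', shape=100, chunks=10, dtype='f4')  # existing array
    # g.require_dataset('foo', shape=100, chunks=10, dtype='i8') would raise
    # TypeError, because 'i8' cannot be cast safely to 'f4'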
+ + """ + + return self._write_op(self._require_dataset_nosync, name, shape=shape, + dtype=dtype, exact=exact, **kwargs) + + def _require_dataset_nosync(self, name, shape, dtype=None, exact=False, + **kwargs): + + path = self._item_path(name) + + if contains_array(self._store, path): + synchronizer = kwargs.get('synchronizer', self._synchronizer) + a = Array(self._store, path=path, read_only=self._read_only, + chunk_store=self._chunk_store, synchronizer=synchronizer) + shape = normalize_shape(shape) + if shape != a.shape: + raise TypeError('shapes do not match') + dtype = np.dtype(dtype) + if exact: + if dtype != a.dtype: + raise TypeError('dtypes do not match exactly') + else: + if not np.can_cast(dtype, a.dtype): + raise TypeError('dtypes cannot be safely cast') + return a + + else: + return self._create_dataset_nosync(name, shape=shape, dtype=dtype, + **kwargs) + + def create(self, name, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.create`.""" + return self._write_op(self._create_nosync, name, **kwargs) + + def _create_nosync(self, name, **kwargs): + path = self._item_path(name) + self._require_parent_group(path) + kwargs.setdefault('synchronizer', self._synchronizer) + return create(store=self._store, path=path, + chunk_store=self._chunk_store, **kwargs) + + def empty(self, name, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.empty`.""" + return self._write_op(self._empty_nosync, name, **kwargs) + + def _empty_nosync(self, name, **kwargs): + path = self._item_path(name) + self._require_parent_group(path) + kwargs.setdefault('synchronizer', self._synchronizer) + return empty(store=self._store, path=path, + chunk_store=self._chunk_store, **kwargs) + + def zeros(self, name, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.zeros`.""" + return self._write_op(self._zeros_nosync, name, **kwargs) + + def _zeros_nosync(self, name, **kwargs): + path = self._item_path(name) + self._require_parent_group(path) + kwargs.setdefault('synchronizer', self._synchronizer) + return zeros(store=self._store, path=path, + chunk_store=self._chunk_store, **kwargs) + + def ones(self, name, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.ones`.""" + return self._write_op(self._ones_nosync, name, **kwargs) + + def _ones_nosync(self, name, **kwargs): + path = self._item_path(name) + self._require_parent_group(path) + kwargs.setdefault('synchronizer', self._synchronizer) + return ones(store=self._store, path=path, + chunk_store=self._chunk_store, **kwargs) + + def full(self, name, fill_value, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.full`.""" + return self._write_op(self._full_nosync, name, fill_value, **kwargs) + + def _full_nosync(self, name, fill_value, **kwargs): + path = self._item_path(name) + self._require_parent_group(path) + kwargs.setdefault('synchronizer', self._synchronizer) + return full(store=self._store, path=path, + chunk_store=self._chunk_store, + fill_value=fill_value, **kwargs) + + def array(self, name, data, **kwargs): + """Create an array. 
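The wrapper methods above forward to the corresponding functions in ``zarr.creation``, filling in the group's store, path prefix, chunk store and synchronizer; a sketch::

    import zarr

    g = zarr.group()
    a = g.zeros('a', shape=(100,), chunks=(10,))
    b = g.full('b', fill_value=42, shape=(100,), chunks=(10,))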
Keyword arguments as per + :func:`zarr.creation.array`.""" + return self._write_op(self._array_nosync, name, data, **kwargs) + + def _array_nosync(self, name, data, **kwargs): + path = self._item_path(name) + self._require_parent_group(path) + kwargs.setdefault('synchronizer', self._synchronizer) + return array(data, store=self._store, path=path, + chunk_store=self._chunk_store, **kwargs) + + def empty_like(self, name, data, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.empty_like`.""" + return self._write_op(self._empty_like_nosync, name, data, **kwargs) + + def _empty_like_nosync(self, name, data, **kwargs): + path = self._item_path(name) + self._require_parent_group(path) + kwargs.setdefault('synchronizer', self._synchronizer) + return empty_like(data, store=self._store, path=path, + chunk_store=self._chunk_store, **kwargs) + + def zeros_like(self, name, data, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.zeros_like`.""" + return self._write_op(self._zeros_like_nosync, name, data, **kwargs) + + def _zeros_like_nosync(self, name, data, **kwargs): + path = self._item_path(name) + self._require_parent_group(path) + kwargs.setdefault('synchronizer', self._synchronizer) + return zeros_like(data, store=self._store, path=path, + chunk_store=self._chunk_store, **kwargs) + + def ones_like(self, name, data, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.ones_like`.""" + return self._write_op(self._ones_like_nosync, name, data, **kwargs) + + def _ones_like_nosync(self, name, data, **kwargs): + path = self._item_path(name) + self._require_parent_group(path) + kwargs.setdefault('synchronizer', self._synchronizer) + return ones_like(data, store=self._store, path=path, + chunk_store=self._chunk_store, **kwargs) + + def full_like(self, name, data, **kwargs): + """Create an array. Keyword arguments as per + :func:`zarr.creation.full_like`.""" + return self._write_op(self._full_like_nosync, name, data, **kwargs) + + def _full_like_nosync(self, name, data, **kwargs): + path = self._item_path(name) + self._require_parent_group(path) + kwargs.setdefault('synchronizer', self._synchronizer) + return full_like(data, store=self._store, path=path, + chunk_store=self._chunk_store, **kwargs) + + +def group(store=None, overwrite=False, chunk_store=None, synchronizer=None): + """Create a group. + + Parameters + ---------- + store : MutableMapping, optional + Group storage. If not provided, a DictStore will be used, meaning + that data will be stored in memory. + overwrite : bool, optional + If True, delete any pre-existing data in `store` at `path` before + creating the group. + chunk_store : MutableMapping, optional + Separate storage for chunks. If not provided, `store` will be used + for storage of both chunks and metadata. + synchronizer : object, optional + Array synchronizer. 
+ + Returns + ------- + g : zarr.hierarchy.Group + + Examples + -------- + + Create a group in memory:: + + >>> import zarr + >>> g = zarr.group() + >>> g + zarr.hierarchy.Group(/, 0) + store: zarr.storage.DictStore + + Create a group with a different store:: + + >>> store = zarr.DirectoryStore('example') + >>> g = zarr.group(store=store, overwrite=True) + >>> g + zarr.hierarchy.Group(/, 0) + store: zarr.storage.DirectoryStore + + """ + + # ensure store + if store is None: + store = DictStore() + + # require group + if overwrite: + init_group(store, overwrite=True, chunk_store=chunk_store) + elif contains_array(store): + raise ValueError('store contains an array') + elif not contains_group(store): + init_group(store, chunk_store=chunk_store) + + return Group(store, read_only=False, chunk_store=chunk_store, + synchronizer=synchronizer) + + +def open_group(path, mode='a', synchronizer=None): + """Convenience function to instantiate a group stored in a directory on + the file system. + + Parameters + ---------- + path : string + Path to directory in file system in which to store the group. + mode : {'r', 'r+', 'a', 'w', 'w-'} + Persistence mode: 'r' means read only (must exist); 'r+' means + read/write (must exist); 'a' means read/write (create if doesn't + exist); 'w' means create (overwrite if exists); 'w-' means create + (fail if exists). + synchronizer : object, optional + Array synchronizer. + + Returns + ------- + g : zarr.hierarchy.Group + + Examples + -------- + >>> import zarr + >>> root = zarr.open_group('example', mode='w') + >>> foo = root.create_group('foo') + >>> bar = root.create_group('bar') + >>> root + zarr.hierarchy.Group(/, 2) + groups: 2; bar, foo + store: zarr.storage.DirectoryStore + >>> root2 = zarr.open_group('example', mode='a') + >>> root2 + zarr.hierarchy.Group(/, 2) + groups: 2; bar, foo + store: zarr.storage.DirectoryStore + >>> root == root2 + True + + """ + + # setup store + store = DirectoryStore(path) + + # ensure store is initialized + + if mode in ['r', 'r+']: + if contains_array(store): + raise ValueError('store contains array') + elif not contains_group(store): + raise ValueError('group does not exist') + + elif mode == 'w': + init_group(store, overwrite=True) + + elif mode == 'a': + if contains_array(store): + raise ValueError('store contains array') + elif not contains_group(store): + init_group(store) + + elif mode in ['w-', 'x']: + if contains_array(store): + raise ValueError('store contains array') + elif contains_group(store): + raise ValueError('store contains group') + else: + init_group(store) + + # determine read only status + read_only = mode == 'r' + + return Group(store, read_only=read_only, synchronizer=synchronizer) diff --git a/zarr/meta.py b/zarr/meta.py index 05a055cc76..15f3d729cb 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -6,25 +6,31 @@ import numpy as np -from zarr.compat import PY2, text_type +from zarr.compat import PY2, text_type, binary_type from zarr.errors import MetadataError -def decode_metadata(b): - s = text_type(b, 'ascii') +ZARR_FORMAT = 2 + + +def decode_array_metadata(s): + if isinstance(s, binary_type): + s = text_type(s, 'ascii') meta = json.loads(s) zarr_format = meta.get('zarr_format', None) - if zarr_format != 1: + if zarr_format != ZARR_FORMAT: raise MetadataError('unsupported zarr format: %s' % zarr_format) try: + dtype = decode_dtype(meta['dtype']) + fill_value = decode_fill_value(meta['fill_value'], dtype) meta = dict( zarr_format=meta['zarr_format'], shape=tuple(meta['shape']), 
chunks=tuple(meta['chunks']), - dtype=decode_dtype(meta['dtype']), + dtype=dtype, compression=meta['compression'], compression_opts=meta['compression_opts'], - fill_value=meta['fill_value'], + fill_value=fill_value, order=meta['order'], ) except Exception as e: @@ -33,15 +39,15 @@ def decode_metadata(b): return meta -def encode_metadata(meta): +def encode_array_metadata(meta): meta = dict( - zarr_format=1, + zarr_format=ZARR_FORMAT, shape=meta['shape'], chunks=meta['chunks'], dtype=encode_dtype(meta['dtype']), compression=meta['compression'], compression_opts=meta['compression_opts'], - fill_value=meta['fill_value'], + fill_value=encode_fill_value(meta['fill_value']), order=meta['order'], ) s = json.dumps(meta, indent=4, sort_keys=True, ensure_ascii=True) @@ -72,3 +78,60 @@ def _decode_dtype_descr(d): def decode_dtype(d): d = _decode_dtype_descr(d) return np.dtype(d) + + +def decode_group_metadata(s): + if isinstance(s, binary_type): + s = text_type(s, 'ascii') + meta = json.loads(s) + zarr_format = meta.get('zarr_format', None) + if zarr_format != ZARR_FORMAT: + raise MetadataError('unsupported zarr format: %s' % zarr_format) + meta = dict( + zarr_format=ZARR_FORMAT, + ) + return meta + + +def encode_group_metadata(meta=None): + meta = dict( + zarr_format=ZARR_FORMAT, + ) + s = json.dumps(meta, indent=4, sort_keys=True, ensure_ascii=True) + b = s.encode('ascii') + return b + + +FLOAT_FILLS = { + 'NaN': np.nan, + 'Infinity': np.PINF, + '-Infinity': np.NINF +} + + +def decode_fill_value(v, dtype): + if dtype.kind == 'f': + if v == 'NaN': + return np.nan + elif v == 'Infinity': + return np.PINF + elif v == '-Infinity': + return np.NINF + else: + return v + else: + return v + + +def encode_fill_value(v): + try: + if np.isnan(v): + return 'NaN' + elif np.isposinf(v): + return 'Infinity' + elif np.isneginf(v): + return '-Infinity' + else: + return v + except TypeError: + return v diff --git a/zarr/storage.py b/zarr/storage.py index 691df02b60..24e954611f 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -4,29 +4,131 @@ import os import tempfile import json +import zipfile +import shutil +import operator import numpy as np -from zarr.util import normalize_shape, normalize_chunks, normalize_order +from zarr.util import normalize_shape, normalize_chunks, normalize_order, \ + normalize_storage_path from zarr.compressors import get_compressor_cls -from zarr.meta import encode_metadata - - -def init_store(store, shape, chunks, dtype=None, compression='default', - compression_opts=None, fill_value=None, - order='C', overwrite=False): - """Initialise an array store with the given configuration. 
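Non-finite floats have no JSON representation, so the new metadata layer maps them to the sentinel strings shown above; finite values pass through unchanged. A sketch::

    import numpy as np
    from zarr.meta import encode_fill_value, decode_fill_value

    encode_fill_value(np.nan)                      # -> 'NaN'
    encode_fill_value(0)                           # -> 0, passed through
    decode_fill_value('Infinity', np.dtype('f8'))  # -> np.inf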
+from zarr.meta import encode_array_metadata, encode_group_metadata +from zarr.compat import PY2, binary_type, reduce + + +array_meta_key = '.zarray' +group_meta_key = '.zgroup' +attrs_key = '.zattrs' + + +def _path_to_prefix(path): + # assume path already normalized + if path: + prefix = path + '/' + else: + prefix = '' + return prefix + + +def contains_array(store, path=None): + """Return True if the store contains an array at the given logical path.""" + path = normalize_storage_path(path) + prefix = _path_to_prefix(path) + key = prefix + array_meta_key + return key in store + + +def contains_group(store, path=None): + """Return True if the store contains a group at the given logical path.""" + path = normalize_storage_path(path) + prefix = _path_to_prefix(path) + key = prefix + group_meta_key + return key in store + + +def _rmdir_from_keys(store, path=None): + # assume path already normalized + prefix = _path_to_prefix(path) + for key in set(store.keys()): + if key.startswith(prefix): + del store[key] + + +def rmdir(store, path=None): + """Remove all items under the given path.""" + path = normalize_storage_path(path) + if hasattr(store, 'rmdir'): + # pass through + store.rmdir(path) + else: + # slow version, delete one key at a time + _rmdir_from_keys(store, path) + + +def _listdir_from_keys(store, path=None): + # assume path already normalized + prefix = _path_to_prefix(path) + children = set() + for key in store.keys(): + if key.startswith(prefix) and len(key) > len(prefix): + suffix = key[len(prefix):] + child = suffix.split('/')[0] + children.add(child) + return sorted(children) + + +def listdir(store, path=None): + """Obtain a directory listing for the given path.""" + path = normalize_storage_path(path) + if hasattr(store, 'listdir'): + # pass through + return store.listdir(path) + else: + # slow version, iterate through all keys + return _listdir_from_keys(store, path) + + +def getsize(store, path=None): + """Compute size of stored items for a given path.""" + path = normalize_storage_path(path) + if hasattr(store, 'getsize'): + # pass through + return store.getsize(path) + elif isinstance(store, dict): + # compute from size of values + prefix = _path_to_prefix(path) + size = 0 + for k in listdir(store, path): + try: + v = store[prefix + k] + except KeyError: + pass + else: + try: + size += buffersize(v) + except TypeError: + return -1 + return size + else: + return -1 + + +def init_array(store, shape, chunks=None, dtype=None, compression='default', + compression_opts=None, fill_value=None, order='C', + overwrite=False, path=None, chunk_store=None): + """initialize an array store with the given configuration. Parameters ---------- store : MutableMapping - A mapping that supports string keys and byte sequence values. + A mapping that supports string keys and bytes-like values. shape : int or tuple of ints Array shape. - chunks : int or tuple of ints - Chunk shape. + chunks : int or tuple of ints, optional + Chunk shape. If not provided, will be guessed from `shape` and `dtype`. dtype : string or dtype, optional NumPy dtype. compression : string, optional @@ -36,60 +138,108 @@ def init_store(store, shape, chunks, dtype=None, compression='default', Options to primary compressor. E.g., for blosc, provide a dictionary with keys 'cname', 'clevel' and 'shuffle'. fill_value : object - Default value to use for uninitialised portions of the array. + Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional Memory layout to be used within each chunk. 
overwrite : bool, optional If True, erase all data in `store` prior to initialisation. + path : string, optional + Path under which array is stored. + chunk_store : MutableMapping, optional + Separate storage for chunks. If not provided, `store` will be used + for storage of both chunks and metadata. Examples -------- - >>> import zarr - >>> store = dict() - >>> zarr.init_store(store, shape=(10000, 10000), chunks=(1000, 1000)) - >>> sorted(store.keys()) - ['attrs', 'meta'] - >>> print(str(store['meta'], 'ascii')) - { - "chunks": [ - 1000, - 1000 - ], - "compression": "blosc", - "compression_opts": { - "clevel": 5, - "cname": "lz4", - "shuffle": 1 - }, - "dtype": ">> print(str(store['attrs'], 'ascii')) - {} + Initialize an array store:: + + >>> from zarr.storage import init_array + >>> store = dict() + >>> init_array(store, shape=(10000, 10000), chunks=(1000, 1000)) + >>> sorted(store.keys()) + ['.zarray', '.zattrs'] + + Array metadata is stored as JSON:: + + >>> print(str(store['.zarray'], 'ascii')) + { + "chunks": [ + 1000, + 1000 + ], + "compression": "blosc", + "compression_opts": { + "clevel": 5, + "cname": "lz4", + "shuffle": 1 + }, + "dtype": ">> print(str(store['.zattrs'], 'ascii')) + {} + + Initialize an array using a storage path:: + + >>> init_array(store, shape=100000000, chunks=1000000, dtype='i1', + ... path='foo/bar') + >>> sorted(store.keys()) + ['.zarray', '.zattrs', 'foo/bar/.zarray', 'foo/bar/.zattrs'] + >>> print(str(store['foo/bar/.zarray'], 'ascii')) + { + "chunks": [ + 1000000 + ], + "compression": "blosc", + "compression_opts": { + "clevel": 5, + "cname": "lz4", + "shuffle": 1 + }, + "dtype": "|i1", + "fill_value": null, + "order": "C", + "shape": [ + 100000000 + ], + "zarr_format": 2 + } Notes ----- The initialisation process involves normalising all array metadata, - encoding as JSON and storing under the 'meta' key. User attributes are also - initialised and stored as JSON under the 'attrs' key. + encoding as JSON and storing under the '.zarray' key. User attributes are + also initialized and stored as JSON under the '.zattrs' key. 
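The path-aware helper functions defined above compose with ``init_array``; a sketch using a plain dict as the store::

    from zarr.storage import (init_array, contains_array, listdir, rmdir,
                              getsize)

    store = dict()
    init_array(store, shape=10000, chunks=1000, path='foo/bar')
    contains_array(store, 'foo/bar')   # True
    listdir(store, 'foo')              # ['bar']
    getsize(store, 'foo/bar')          # total size of the stored values
    rmdir(store, 'foo')                # deletes every key under 'foo/'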
""" + # normalize path + path = normalize_storage_path(path) + # guard conditions - empty = len(store) == 0 - if not empty and not overwrite: - raise ValueError('store is not empty') - - # normalise metadata + if overwrite: + # attempt to delete any pre-existing items in store + rmdir(store, path) + if chunk_store is not None and chunk_store != store: + rmdir(chunk_store, path) + elif contains_array(store, path): + raise ValueError('store contains an array') + elif contains_group(store, path): + raise ValueError('store contains a group') + + # normalize metadata shape = normalize_shape(shape) - chunks = normalize_chunks(chunks, shape) dtype = np.dtype(dtype) + chunks = normalize_chunks(chunks, shape, dtype.itemsize) compressor_cls = get_compressor_cls(compression) compression = compressor_cls.canonical_name compression_opts = compressor_cls.normalize_opts( @@ -97,17 +247,260 @@ def init_store(store, shape, chunks, dtype=None, compression='default', ) order = normalize_order(order) - # delete any pre-existing items in store - store.clear() - - # initialise metadata + # initialize metadata meta = dict(shape=shape, chunks=chunks, dtype=dtype, compression=compression, compression_opts=compression_opts, fill_value=fill_value, order=order) - store['meta'] = encode_metadata(meta) + key = _path_to_prefix(path) + array_meta_key + store[key] = encode_array_metadata(meta) + + # initialize attributes + key = _path_to_prefix(path) + attrs_key + store[key] = json.dumps(dict()).encode('ascii') + + +# backwards compatibility +init_store = init_array + + +def init_group(store, overwrite=False, path=None, chunk_store=None): + """initialize a group store. + + Parameters + ---------- + store : MutableMapping + A mapping that supports string keys and byte sequence values. + overwrite : bool, optional + If True, erase all data in `store` prior to initialisation. + path : string, optional + Path under which array is stored. + chunk_store : MutableMapping, optional + Separate storage for chunks. If not provided, `store` will be used + for storage of both chunks and metadata. 
+ + """ + + # normalize path + path = normalize_storage_path(path) + + # guard conditions + if overwrite: + # attempt to delete any pre-existing items in store + rmdir(store, path) + if chunk_store is not None and chunk_store != store: + rmdir(chunk_store, path) + elif contains_array(store, path): + raise ValueError('store contains an array') + elif contains_group(store, path): + raise ValueError('store contains a group') + + # initialize metadata + # N.B., currently no metadata properties are needed, however there may + # be in future + meta = dict() + key = _path_to_prefix(path) + group_meta_key + store[key] = encode_group_metadata(meta) + + # initialize attributes + key = _path_to_prefix(path) + attrs_key + store[key] = json.dumps(dict()).encode('ascii') + + +def ensure_bytes(s): + if isinstance(s, binary_type): + return s + if hasattr(s, 'tobytes'): + return s.tobytes() + if PY2 and hasattr(s, 'tostring'): # pragma: no cover + return s.tostring() + return memoryview(s).tobytes() + + +def _dict_store_keys(d, prefix='', cls=dict): + for k in d.keys(): + v = d[k] + if isinstance(v, cls): + for sk in _dict_store_keys(v, prefix + k + '/', cls): + yield sk + else: + yield prefix + k + + +def buffersize(v): + from array import array as _stdlib_array + if PY2 and isinstance(v, _stdlib_array): # pragma: no cover + # special case array.array because does not support buffer + # interface in PY2 + return v.buffer_info()[1] * v.itemsize + else: + v = memoryview(v) + return reduce(operator.mul, v.shape) * v.itemsize + + +class DictStore(MutableMapping): + """Extended mutable mapping interface to a hierarchy of dicts. + + Examples + -------- + >>> import zarr + >>> store = zarr.DictStore() + >>> store['foo'] = b'bar' + >>> store['foo'] + b'bar' + >>> store['a/b/c'] = b'xxx' + >>> store['a/b/c'] + b'xxx' + >>> sorted(store.keys()) + ['a/b/c', 'foo'] + >>> store.listdir() + ['a', 'foo'] + >>> store.listdir('a/b') + ['c'] + >>> store.rmdir('a') + >>> sorted(store.keys()) + ['foo'] + + """ # flake8: noqa + + def __init__(self, cls=dict): + self.root = cls() + self.cls = cls + + def _get_parent(self, item): + parent = self.root + # split the item + segments = item.split('/') + # find the parent container + for k in segments[:-1]: + parent = parent[k] + if not isinstance(parent, self.cls): + raise KeyError(item) + return parent, segments[-1] + + def _require_parent(self, item): + parent = self.root + # split the item + segments = item.split('/') + # require the parent container + for k in segments[:-1]: + try: + parent = parent[k] + except KeyError: + parent[k] = self.cls() + parent = parent[k] + else: + if not isinstance(parent, self.cls): + raise KeyError(item) + return parent, segments[-1] + + def __getitem__(self, item): + parent, key = self._get_parent(item) + try: + value = parent[key] + except KeyError: + raise KeyError(item) + else: + if isinstance(value, self.cls): + raise KeyError(item) + else: + return value + + def __setitem__(self, item, value): + parent, key = self._require_parent(item) + parent[key] = value + + def __delitem__(self, item): + parent, key = self._get_parent(item) + try: + del parent[key] + except KeyError: + raise KeyError(item) + + def __contains__(self, item): + try: + parent, key = self._get_parent(item) + value = parent[key] + except KeyError: + return False + else: + return not isinstance(value, self.cls) + + def __eq__(self, other): + return ( + isinstance(other, DictStore) and + self.root == other.root and + self.cls == other.cls + ) + + def keys(self): + for k in 
_dict_store_keys(self.root, cls=self.cls): + yield k + + def __iter__(self): + return self.keys() - # initialise attributes - store['attrs'] = json.dumps(dict()).encode('ascii') + def __len__(self): + return sum(1 for _ in self.keys()) + + def listdir(self, path=None): + path = normalize_storage_path(path) + if path: + try: + parent, key = self._get_parent(path) + value = parent[key] + except KeyError: + return [] + else: + value = self.root + if isinstance(value, self.cls): + return sorted(value.keys()) + else: + return [] + + def rmdir(self, path=None): + path = normalize_storage_path(path) + if path: + try: + parent, key = self._get_parent(path) + value = parent[key] + except KeyError: + return + else: + if isinstance(value, self.cls): + del parent[key] + else: + # clear out root + self.root = self.cls() + + def getsize(self, path=None): + path = normalize_storage_path(path) + + # obtain value to return size of + if path: + try: + parent, key = self._get_parent(path) + value = parent[key] + except KeyError: + raise ValueError('path not found: %r' % path) + else: + value = self.root + + # obtain size of value + if isinstance(value, self.cls): + # total size for directory + size = 0 + for v in value.values(): + if not isinstance(v, self.cls): + try: + size += buffersize(v) + except TypeError: + return -1 + return size + else: + try: + return buffersize(value) + except TypeError: + return -1 class DirectoryStore(MutableMapping): @@ -122,52 +515,29 @@ class DirectoryStore(MutableMapping): Examples -------- >>> import zarr - >>> store = zarr.DirectoryStore('example.zarr') - >>> zarr.init_store(store, shape=(10000, 10000), chunks=(1000, 1000), - ... fill_value=0, overwrite=True) + >>> store = zarr.DirectoryStore('example_store') + >>> store['foo'] = b'bar' + >>> store['foo'] + b'bar' + >>> open('example_store/foo', 'rb').read() + b'bar' + >>> store['a/b/c'] = b'xxx' + >>> store['a/b/c'] + b'xxx' + >>> open('example_store/a/b/c', 'rb').read() + b'xxx' + >>> sorted(store.keys()) + ['a/b/c', 'foo'] + >>> store.listdir() + ['a', 'foo'] + >>> store.listdir('a/b') + ['c'] + >>> store.rmdir('a') + >>> sorted(store.keys()) + ['foo'] >>> import os - >>> sorted(os.listdir('example.zarr')) - ['attrs', 'meta'] - >>> print(open('example.zarr/meta').read()) - { - "chunks": [ - 1000, - 1000 - ], - "compression": "blosc", - "compression_opts": { - "clevel": 5, - "cname": "lz4", - "shuffle": 1 - }, - "dtype": ">> print(open('example.zarr/attrs').read()) - {} - >>> z = zarr.Array(store) - >>> z - zarr.core.Array((10000, 10000), float64, chunks=(1000, 1000), order=C) - compression: blosc; compression_opts: {'clevel': 5, 'cname': 'lz4', 'shuffle': 1} - nbytes: 762.9M; nbytes_stored: 313; ratio: 2555910.5; initialized: 0/100 - store: zarr.storage.DirectoryStore - >>> z[:] = 1 - >>> len(os.listdir('example.zarr')) - 102 - >>> sorted(os.listdir('example.zarr'))[0:5] - ['0.0', '0.1', '0.2', '0.3', '0.4'] - >>> print(open('example.zarr/0.0', 'rb').read(10)) - b'\\x02\\x01!\\x08\\x00\\x12z\\x00\\x00\\x80' - - See Also - -------- - zarr.creation.open + >>> os.path.exists('example_store/a') + False """ # flake8: noqa @@ -175,56 +545,82 @@ def __init__(self, path): # guard conditions path = os.path.abspath(path) - if not os.path.exists(path): - raise ValueError('path does not exist') - elif not os.path.isdir(path): - raise ValueError('path is not a directory') + if os.path.exists(path) and not os.path.isdir(path): + raise ValueError('path exists but is not a directory') self.path = path def __getitem__(self, key): - - # 
guard conditions - if key not in self: + filepath = os.path.join(self.path, key) + if os.path.isfile(filepath): + with open(filepath, 'rb') as f: + return f.read() + else: raise KeyError(key) - with open(os.path.join(self.path, key), 'rb') as f: - return f.read() - def __setitem__(self, key, value): - # accept any value that can be written to a file # destination path for key - dest_path = os.path.join(self.path, key) + file_path = os.path.join(self.path, key) + + # ensure there is no directory in the way + if os.path.isdir(file_path): + shutil.rmtree(file_path) + + # ensure containing directory exists + dir_path, file_name = os.path.split(file_path) + if os.path.isfile(dir_path): + raise KeyError(key) + if not os.path.exists(dir_path): + try: + os.makedirs(dir_path) + except Exception: + raise KeyError(key) # write to temporary file with tempfile.NamedTemporaryFile(mode='wb', delete=False, - dir=self.path, - prefix=key + '.', + dir=dir_path, + prefix=file_name + '.', suffix='.partial') as f: f.write(value) temp_path = f.name # move temporary file into place - if os.path.exists(dest_path): - os.remove(dest_path) - os.rename(temp_path, dest_path) + if os.path.exists(file_path): + os.remove(file_path) + os.rename(temp_path, file_path) def __delitem__(self, key): - - # guard conditions - if key not in self: + path = os.path.join(self.path, key) + if os.path.isfile(path): + os.remove(path) + elif os.path.isdir(path): + # include support for deleting directories, even though strictly + # speaking these do not exist as keys in the store + shutil.rmtree(path) + else: raise KeyError(key) - os.remove(os.path.join(self.path, key)) - def __contains__(self, key): - return os.path.isfile(os.path.join(self.path, key)) + file_path = os.path.join(self.path, key) + return os.path.isfile(file_path) + + def __eq__(self, other): + return ( + isinstance(other, DirectoryStore) and + self.path == other.path + ) def keys(self): - for key in os.listdir(self.path): - if os.path.isfile(os.path.join(self.path, key)): - yield key + directories = [(self.path, '')] + while directories: + dir_name, prefix = directories.pop() + for name in os.listdir(dir_name): + path = os.path.join(dir_name, name) + if os.path.isfile(path): + yield prefix + name + elif os.path.isdir(path): + directories.append((path, prefix + name + '/')) def __iter__(self): return self.keys() @@ -232,8 +628,134 @@ def __iter__(self): def __len__(self): return sum(1 for _ in self.keys()) - @property - def size(self): - """Total size of all values in number of bytes.""" - return sum(os.path.getsize(os.path.join(self.path, key)) - for key in self.keys()) + def listdir(self, path=None): + store_path = normalize_storage_path(path) + dir_path = self.path + if store_path: + dir_path = os.path.join(dir_path, store_path) + if os.path.isdir(dir_path): + return sorted(os.listdir(dir_path)) + else: + return [] + + def rmdir(self, path=None): + store_path = normalize_storage_path(path) + dir_path = self.path + if store_path: + dir_path = os.path.join(dir_path, store_path) + if os.path.isdir(dir_path): + shutil.rmtree(dir_path) + + def getsize(self, path=None): + store_path = normalize_storage_path(path) + fs_path = self.path + if store_path: + fs_path = os.path.join(fs_path, store_path) + if os.path.isfile(fs_path): + return os.path.getsize(fs_path) + elif os.path.isdir(fs_path): + children = os.listdir(fs_path) + size = 0 + for child in children: + child_fs_path = os.path.join(fs_path, child) + if os.path.isfile(child_fs_path): + size += 
os.path.getsize(child_fs_path)
+            return size
+        else:
+            raise ValueError('path not found: %r' % path)
+
+
+# noinspection PyPep8Naming
+class ZipStore(MutableMapping):
+    """MutableMapping interface to a Zip file, where keys are member names
+    within the Zip archive and values are bytes. Note that keys cannot be
+    deleted (``__delitem__`` raises NotImplementedError)."""
+
+    def __init__(self, path, compression=zipfile.ZIP_STORED,
+                 allowZip64=True, mode='a'):
+
+        # ensure zip file exists
+        path = os.path.abspath(path)
+        with zipfile.ZipFile(path, mode=mode):
+            pass
+
+        self.path = path
+        self.compression = compression
+        self.allowZip64 = allowZip64
+
+    def __getitem__(self, key):
+        with zipfile.ZipFile(self.path) as zf:
+            with zf.open(key) as f:  # will raise KeyError
+                return f.read()
+
+    def __setitem__(self, key, value):
+        value = ensure_bytes(value)
+        with zipfile.ZipFile(self.path, mode='a',
+                             compression=self.compression,
+                             allowZip64=self.allowZip64) as zf:
+            zf.writestr(key, value)
+
+    def __delitem__(self, key):
+        raise NotImplementedError
+
+    def __eq__(self, other):
+        return (
+            isinstance(other, ZipStore) and
+            self.path == other.path and
+            self.compression == other.compression and
+            self.allowZip64 == other.allowZip64
+        )
+
+    def keylist(self):
+        with zipfile.ZipFile(self.path) as zf:
+            keylist = sorted(zf.namelist())
+        return keylist
+
+    def keys(self):
+        for key in self.keylist():
+            yield key
+
+    def __iter__(self):
+        return self.keys()
+
+    def __len__(self):
+        return sum(1 for _ in self.keys())
+
+    def __contains__(self, key):
+        with zipfile.ZipFile(self.path) as zf:
+            try:
+                zf.getinfo(key)
+            except KeyError:
+                return False
+            else:
+                return True
+
+    def listdir(self, path=None):
+        path = normalize_storage_path(path)
+        return _listdir_from_keys(self, path)
+
+    def getsize(self, path=None):
+        path = normalize_storage_path(path)
+        children = self.listdir(path)
+        with zipfile.ZipFile(self.path) as zf:
+            if children:
+                size = 0
+                for child in children:
+                    if path:
+                        name = path + '/' + child
+                    else:
+                        name = child
+                    try:
+                        info = zf.getinfo(name)
+                    except KeyError:
+                        pass
+                    else:
+                        size += info.compress_size
+                return size
+            elif path:
+                try:
+                    info = zf.getinfo(path)
+                    return info.compress_size
+                except KeyError:
+                    raise ValueError('path not found: %r' % path)
+            else:
+                return 0
diff --git a/zarr/sync.py b/zarr/sync.py
index 2fa3bffd2d..33d53bbf1c 100644
--- a/zarr/sync.py
+++ b/zarr/sync.py
@@ -10,6 +10,7 @@
 
 from zarr.core import Array
 from zarr.attrs import Attributes
+from zarr.storage import attrs_key
 
 
 class ThreadSynchronizer(object):
@@ -17,22 +18,18 @@ class ThreadSynchronizer(object):
 
     def __init__(self):
         self.mutex = Lock()
-        self.attrs_lock = Lock()
-        self.chunk_locks = defaultdict(Lock)
+        self.locks = defaultdict(Lock)
 
-    def chunk_lock(self, ckey):
+    def __getitem__(self, item):
         with self.mutex:
-            lock = self.chunk_locks[ckey]
-            return lock
+            return self.locks[item]
 
     def __getstate__(self):
         return dict()
 
     def __setstate__(self, d):
-        # reinitialise from scratch
-        self.mutex = Lock()
-        self.attrs_lock = Lock()
-        self.chunk_locks = defaultdict(Lock)
+        # reinitialize from scratch
+        self.__init__()
 
 
 class ProcessSynchronizer(object):
@@ -50,97 +47,10 @@ class ProcessSynchronizer(object):
 
     def __init__(self, path):
         self.path = path
 
-    @property
-    def attrs_lock(self):
-        return fasteners.InterProcessLock(
-            os.path.join(self.path, 'attrs.lock')
-        )
-
-    def chunk_lock(self, ckey):
+    def __getitem__(self, item):
         lock = fasteners.InterProcessLock(
-            os.path.join(self.path, '%s.lock' % ckey)
+            os.path.join(self.path, '%s.lock' % item)
         )
         return lock
 
-
-class SynchronizedArray(Array):
-    """Instantiate a synchronized array.
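With SynchronizedArray removed, a synchronizer is now any object that maps a key to a lock-like context manager, and it is passed directly to arrays and groups; a minimal sketch::

    import zarr

    sync = zarr.ThreadSynchronizer()
    with sync['some-key']:             # any key yields a lock
        pass                           # guarded section
    z = zarr.zeros(1000, chunks=100, synchronizer=sync)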
- - Parameters - ---------- - store : MutableMapping - Array store, already initialised. - synchronizer : object - Array synchronizer. - readonly : bool, optional - True if array should be protected against modification. - - Examples - -------- - >>> import zarr - >>> store = dict() - >>> zarr.init_store(store, shape=1000, chunks=100) - >>> synchronizer = zarr.ThreadSynchronizer() - >>> z = zarr.SynchronizedArray(store, synchronizer) - >>> z - zarr.sync.SynchronizedArray((1000,), float64, chunks=(100,), order=C) - compression: blosc; compression_opts: {'clevel': 5, 'cname': 'lz4', 'shuffle': 1} - nbytes: 7.8K; nbytes_stored: 285; ratio: 28.1; initialized: 0/10 - store: builtins.dict; synchronizer: zarr.sync.ThreadSynchronizer - - Notes - ----- - Only writing data to the array via the __setitem__() method and - modification of user attributes are synchronized. Neither append() nor - resize() are synchronized. - - Writing to the array is synchronized at the chunk level. I.e., - the array supports concurrent write operations via the __setitem__() - method, but these will only exclude each other if they both require - modification of the same chunk. - - """ # flake8: noqa - - def __init__(self, store, synchronizer, readonly=False): - super(SynchronizedArray, self).__init__(store, readonly=readonly) - self.synchronizer = synchronizer - self._attrs = SynchronizedAttributes(store, synchronizer, - readonly=readonly) - - def _chunk_setitem(self, cidx, key, value): - ckey = '.'.join(map(str, cidx)) - with self.synchronizer.chunk_lock(ckey): - super(SynchronizedArray, self)._chunk_setitem(cidx, key, value) - - def __repr__(self): - r = super(SynchronizedArray, self).__repr__() - r += ('; synchronizer: %s.%s' % - (type(self.synchronizer).__module__, - type(self.synchronizer).__name__)) - return r - - def __getstate__(self): - return self._store, self.synchronizer, self._readonly - - def __setstate__(self, state): - self.__init__(*state) - - -class SynchronizedAttributes(Attributes): - - def __init__(self, store, synchronizer, key='attrs', readonly=False): - super(SynchronizedAttributes, self).__init__(store, key=key, - readonly=readonly) - self.synchronizer = synchronizer - - def __setitem__(self, key, value): - with self.synchronizer.attrs_lock: - super(SynchronizedAttributes, self).__setitem__(key, value) - - def __delitem__(self, key): - with self.synchronizer.attrs_lock: - super(SynchronizedAttributes, self).__delitem__(key) - - def update(self, *args, **kwargs): - with self.synchronizer.attrs_lock: - super(SynchronizedAttributes, self).update(*args, **kwargs) + # pickling and unpickling should be handled automatically diff --git a/zarr/tests/test_attrs.py b/zarr/tests/test_attrs.py index 5748e00f35..01b5341af2 100644 --- a/zarr/tests/test_attrs.py +++ b/zarr/tests/test_attrs.py @@ -14,21 +14,21 @@ class TestAttributes(unittest.TestCase): - def init_attributes(self, store, readonly=False): - return Attributes(store, readonly=readonly) + def init_attributes(self, store, read_only=False): + return Attributes(store, key='attrs', read_only=read_only) def test_storage(self): store = dict() - a = self.init_attributes(store) - assert 'attrs' in store - assert isinstance(store['attrs'], binary_type) - d = json.loads(text_type(store['attrs'], 'ascii')) - eq(dict(), d) + a = Attributes(store=store, key='attrs') + assert 'foo' not in a + assert 'bar' not in a + eq(dict(), a.asdict()) a['foo'] = 'bar' a['baz'] = 42 - + assert 'attrs' in store + assert isinstance(store['attrs'], binary_type) d = 
json.loads(text_type(store['attrs'], 'ascii')) eq(dict(foo='bar', baz=42), d) @@ -74,11 +74,10 @@ def test_iterators(self): eq({'bar', 42}, set(a.values())) eq({('foo', 'bar'), ('baz', 42)}, set(a.items())) - def test_readonly(self): - + def test_read_only(self): store = dict() + a = self.init_attributes(store, read_only=True) store['attrs'] = json.dumps(dict(foo='bar', baz=42)).encode('ascii') - a = self.init_attributes(store, readonly=True) eq(a['foo'], 'bar') eq(a['baz'], 42) with assert_raises(ReadOnlyError): @@ -87,5 +86,3 @@ def test_readonly(self): del a['foo'] with assert_raises(ReadOnlyError): a.update(foo='quux') - with assert_raises(ReadOnlyError): - a.put(dict()) diff --git a/zarr/tests/test_compression.py b/zarr/tests/test_compression.py index a05a780b1e..860811b4f6 100644 --- a/zarr/tests/test_compression.py +++ b/zarr/tests/test_compression.py @@ -41,7 +41,7 @@ def test_compress_decompress_default(self): try: from zarr import blosc # flake8: noqa -except ImportError: +except ImportError: # pragma: no cover print('Blosc not available, skipping Blosc compressor tests') else: @@ -104,7 +104,7 @@ def test_normalize_opts(self): try: import lzma -except ImportError: +except ImportError: # pragma: no cover print('LZMA not available, skipping LZMA compressor tests') else: diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index b62a473bdc..9e13772cb7 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -5,63 +5,137 @@ import atexit import shutil import pickle +import os import numpy as np from numpy.testing import assert_array_equal from nose.tools import eq_ as eq, assert_is_instance, \ - assert_raises, assert_true, assert_false -import zict -from zarr.storage import DirectoryStore, init_store + assert_raises, assert_true, assert_false, assert_is, assert_is_none +from zarr.storage import DirectoryStore, ZipStore, init_array, init_group, \ + buffersize from zarr.core import Array from zarr.errors import ReadOnlyError +from zarr.compat import PY2 -def test_array_init(): - - store = dict() # store not initialised - with assert_raises(ValueError): - Array(store) - - -def test_nbytes_stored(): - - store = dict() - init_store(store, shape=1000, chunks=100) - z = Array(store) - eq(sum(len(v) for v in store.values()), z.nbytes_stored) - z[:] = 42 - eq(sum(len(v) for v in store.values()), z.nbytes_stored) - - # store supporting size determination - path = mkdtemp() - atexit.register(shutil.rmtree, path) - store = DirectoryStore(path) - init_store(store, shape=1000, chunks=100) - z = Array(store) - eq(sum(len(v) for v in store.values()), z.nbytes_stored) - z[:] = 42 - eq(sum(len(v) for v in store.values()), z.nbytes_stored) - - # custom store, doesn't support size determination - store = zict.Zip('test.zip', mode='w') - init_store(store, shape=1000, chunks=100, compression='zlib', - compression_opts=1) - z = Array(store) - eq(-1, z.nbytes_stored) - z[:] = 42 - eq(-1, z.nbytes_stored) +class TestArray(unittest.TestCase): + def test_array_init(self): + + # normal initialization + store = dict() + init_array(store, shape=100, chunks=10) + a = Array(store) + assert_is_instance(a, Array) + eq((100,), a.shape) + eq((10,), a.chunks) + eq('', a.path) + assert_is_none(a.name) + assert_is(store, a.store) + + # initialize at path + store = dict() + init_array(store, shape=100, chunks=10, path='foo/bar') + a = Array(store, path='foo/bar') + assert_is_instance(a, Array) + eq((100,), a.shape) + eq((10,), a.chunks) + eq('foo/bar', a.path) + eq('/foo/bar', a.name) + 
assert_is(store, a.store) + + # store not initialized + store = dict() + with assert_raises(ValueError): + Array(store) -class TestArray(unittest.TestCase): + # group is in the way + store = dict() + init_group(store, path='baz') + with assert_raises(ValueError): + Array(store, path='baz') - def create_array(self, store=None, readonly=False, **kwargs): + def create_array(self, store=None, path=None, read_only=False, + chunk_store=None, **kwargs): if store is None: store = dict() - init_store(store, **kwargs) - return Array(store, readonly=readonly) + init_array(store, path=path, chunk_store=chunk_store, **kwargs) + return Array(store, path=path, read_only=read_only, + chunk_store=chunk_store) + + def test_nbytes_stored(self): + + # custom store, does not implement getsize() + class CustomMapping(object): + def __init__(self): + self.inner = dict() + + def __getitem__(self, item): + return self.inner[item] + + def __setitem__(self, item, value): + self.inner[item] = value + + def __contains__(self, item): + return item in self.inner + + store = CustomMapping() + z = self.create_array(store=store, shape=1000, chunks=100) + eq(-1, z.nbytes_stored) + z[:] = 42 + eq(-1, z.nbytes_stored) + + store = dict() + chunk_store = CustomMapping() + z = self.create_array(store=store, chunk_store=chunk_store, + shape=1000, chunks=100) + eq(-1, z.nbytes_stored) + z[:] = 42 + eq(-1, z.nbytes_stored) + + # dict as store + store = dict() + z = self.create_array(store=store, shape=1000, chunks=100) + expect_nbytes_stored = sum(buffersize(v) for v in z.store.values()) + if z.store != z.chunk_store: + expect_nbytes_stored += sum(buffersize(v) for v in + z.chunk_store.values()) + eq(expect_nbytes_stored, z.nbytes_stored) + z[:] = 42 + expect_nbytes_stored = sum(buffersize(v) for v in z.store.values()) + if z.store != z.chunk_store: + expect_nbytes_stored += sum(buffersize(v) for v in + z.chunk_store.values()) + eq(expect_nbytes_stored, z.nbytes_stored) + # mess with store + store[z._key_prefix + 'foo'] = list(range(10)) + eq(-1, z.nbytes_stored) + + # for comparison + z = self.create_array(store=dict(), shape=1000, chunks=100, + compression='zlib', compression_opts=1) + z[:] = 42 + + # DirectoryStore + path = mkdtemp() + atexit.register(shutil.rmtree, path) + store = DirectoryStore(path) + zz = self.create_array(store=store, shape=1000, chunks=100, + compression='zlib', compression_opts=1) + zz[:] = 42 + eq(z.nbytes_stored, zz.nbytes_stored) + + # ZipStore + if os.path.exists('test.zip'): + os.remove('test.zip') + store = ZipStore('test.zip') + zz = self.create_array(store=store, shape=1000, chunks=100, + compression='zlib', compression_opts=1) + zz[:] = 42 + eq(z.nbytes_stored, zz.nbytes_stored) def test_array_1d(self): @@ -93,7 +167,11 @@ def test_array_1d(self): # check properties eq(a.nbytes, z.nbytes) - eq(sum(len(v) for v in z.store.values()), z.nbytes_stored) + expect_nbytes_stored = sum(buffersize(v) for v in z.store.values()) + if z.store != z.chunk_store: + expect_nbytes_stored += sum(buffersize(v) for v in + z.chunk_store.values()) + eq(expect_nbytes_stored, z.nbytes_stored) eq(11, z.initialized) # check slicing @@ -170,7 +248,10 @@ def test_array_2d(self): # check properties eq(a.nbytes, z.nbytes) - eq(sum(len(v) for v in z.store.values()), z.nbytes_stored) + expect_nbytes_stored = sum(buffersize(v) for v in z.store.values()) + if z.store != z.chunk_store: + expect_nbytes_stored += sum(buffersize(v) for v in + z.chunk_store.values()) eq(50, z.initialized) # check slicing @@ -421,13 +502,13 @@ def 
test_append_bad_shape(self): with assert_raises(ValueError): z.append(b) - def test_readonly(self): + def test_read_only(self): z = self.create_array(shape=1000, chunks=100) - assert_false(z.readonly) + assert_false(z.read_only) - z = self.create_array(shape=1000, chunks=100, readonly=True) - assert_true(z.readonly) + z = self.create_array(shape=1000, chunks=100, read_only=True) + assert_true(z.read_only) with assert_raises(ReadOnlyError): z[:] = 42 with assert_raises(ReadOnlyError): @@ -454,3 +535,73 @@ def test_pickle(self): eq(z.compression_opts, z2.compression_opts) eq(z.fill_value, z2.fill_value) assert_array_equal(z[:], z2[:]) + + def test_repr(self): + if not PY2: + + z = self.create_array(shape=100, chunks=10, dtype='f4', + compression='zlib', compression_opts=1) + expect = """zarr.core.Array((100,), float32, chunks=(10,), order=C) + compression: zlib; compression_opts: 1 + nbytes: 400; nbytes_stored: 210; ratio: 1.9; initialized: 0/10 + store: builtins.dict +""" + actual = repr(z) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + +class TestArrayWithPath(TestArray): + + @staticmethod + def create_array(store=None, read_only=False, chunk_store=None, **kwargs): + if store is None: + store = dict() + init_array(store, path='foo/bar', chunk_store=chunk_store, **kwargs) + return Array(store, path='foo/bar', read_only=read_only, + chunk_store=chunk_store) + + def test_repr(self): + if not PY2: + + z = self.create_array(shape=100, chunks=10, dtype='f4', + compression='zlib', compression_opts=1) + # flake8: noqa + expect = """zarr.core.Array(/foo/bar, (100,), float32, chunks=(10,), order=C) + compression: zlib; compression_opts: 1 + nbytes: 400; nbytes_stored: 210; ratio: 1.9; initialized: 0/10 + store: builtins.dict +""" + actual = repr(z) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + +class TestArrayWithChunkStore(TestArray): + + @staticmethod + def create_array(store=None, read_only=False, chunk_store=None, **kwargs): + if store is None: + store = dict() + if chunk_store is None: + # separate chunk store + chunk_store = dict() + init_array(store, path='foo/bar', chunk_store=chunk_store, **kwargs) + return Array(store, path='foo/bar', read_only=read_only, + chunk_store=chunk_store) + + def test_repr(self): + if not PY2: + + z = self.create_array(shape=100, chunks=10, dtype='f4', + compression='zlib', compression_opts=1) + # flake8: noqa + expect = """zarr.core.Array(/foo/bar, (100,), float32, chunks=(10,), order=C) + compression: zlib; compression_opts: 1 + nbytes: 400; nbytes_stored: 210; ratio: 1.9; initialized: 0/10 + store: builtins.dict + chunk_store: builtins.dict +""" + actual = repr(z) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index c2fbdd5737..3b4f46c08f 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -3,7 +3,6 @@ import tempfile import shutil import atexit -import os import numpy as np @@ -12,11 +11,13 @@ from numpy.testing import assert_array_equal -from zarr.creation import array, empty, zeros, ones, full, open, empty_like, \ - zeros_like, ones_like, full_like, open_like, create -from zarr.sync import ThreadSynchronizer, SynchronizedArray +from zarr.creation import array, empty, zeros, ones, full, open_array, \ + empty_like, zeros_like, ones_like, full_like, open_like, create +from zarr.sync import ThreadSynchronizer from zarr.core import Array -from zarr.storage import DirectoryStore, 
init_store +from zarr.storage import DirectoryStore +from zarr.hierarchy import open_group +from zarr.errors import ReadOnlyError def test_array(): @@ -67,77 +68,112 @@ def __getitem__(self, item): def test_empty(): - z = empty(100, 10) + z = empty(100, chunks=10) eq((100,), z.shape) eq((10,), z.chunks) def test_zeros(): - z = zeros(100, 10) + z = zeros(100, chunks=10) eq((100,), z.shape) eq((10,), z.chunks) assert_array_equal(np.zeros(100), z[:]) def test_ones(): - z = ones(100, 10) + z = ones(100, chunks=10) eq((100,), z.shape) eq((10,), z.chunks) assert_array_equal(np.ones(100), z[:]) def test_full(): - z = full(100, 10, fill_value=42, dtype='i4') + z = full(100, chunks=10, fill_value=42, dtype='i4') eq((100,), z.shape) eq((10,), z.chunks) assert_array_equal(np.full(100, fill_value=42, dtype='i4'), z[:]) + # nan + z = full(100, chunks=10, fill_value=np.nan, dtype='f8') + assert np.all(np.isnan(z[:])) -def test_open(): + # "NaN" + z = full(100, chunks=10, fill_value='NaN', dtype='U3') + assert np.all(z[:] == 'NaN') - path = tempfile.mktemp() - atexit.register( - lambda: shutil.rmtree(path) if os.path.exists(path) else None - ) - z = open(path, mode='w', shape=100, chunks=10, dtype='i4') + +def test_open_array(): + + path = 'example' + + # mode == 'w' + z = open_array(path, mode='w', shape=100, chunks=10) z[:] = 42 + assert_is_instance(z, Array) + assert_is_instance(z.store, DirectoryStore) eq((100,), z.shape) eq((10,), z.chunks) - assert_array_equal(np.full(100, fill_value=42, dtype='i4'), z[:]) - z2 = open(path, mode='r') - eq((100,), z2.shape) - eq((10,), z2.chunks) - assert_array_equal(z[:], z2[:]) - - # path does not exist - path = 'doesnotexist' - with assert_raises(ValueError): - open(path, mode='r') - - # path exists but store not initialised - path = tempfile.mkdtemp() - atexit.register(shutil.rmtree, path) - with assert_raises(ValueError): - open(path, mode='r') - with assert_raises(ValueError): - open(path, mode='r+') + assert_array_equal(np.full(100, fill_value=42), z[:]) + + # mode in 'r', 'r+' + open_group('example_group', mode='w') + for mode in 'r', 'r+': + with assert_raises(ValueError): + open_array('doesnotexist', mode=mode) + with assert_raises(ValueError): + open_array('example_group', mode=mode) + z = open_array(path, mode='r') + assert_is_instance(z, Array) + assert_is_instance(z.store, DirectoryStore) + eq((100,), z.shape) + eq((10,), z.chunks) + assert_array_equal(np.full(100, fill_value=42), z[:]) + with assert_raises(ReadOnlyError): + z[:] = 43 + z = open_array(path, mode='r+') + assert_is_instance(z, Array) + assert_is_instance(z.store, DirectoryStore) + eq((100,), z.shape) + eq((10,), z.chunks) + assert_array_equal(np.full(100, fill_value=42), z[:]) + z[:] = 43 + assert_array_equal(np.full(100, fill_value=43), z[:]) - # store initialised, mode w- - store = DirectoryStore(path) - init_store(store, shape=100, chunks=10) - with assert_raises(ValueError): - open(path, mode='w-') + # mode == 'a' + shutil.rmtree(path) + z = open_array(path, mode='a', shape=100, chunks=10) + z[:] = 42 + assert_is_instance(z, Array) + assert_is_instance(z.store, DirectoryStore) + eq((100,), z.shape) + eq((10,), z.chunks) + assert_array_equal(np.full(100, fill_value=42), z[:]) with assert_raises(ValueError): - open(path, mode='x') + open_array('example_group', mode='a') + + # mode in 'w-', 'x' + for mode in 'w-', 'x': + shutil.rmtree(path) + z = open_array(path, mode=mode, shape=100, chunks=10) + z[:] = 42 + assert_is_instance(z, Array) + assert_is_instance(z.store, DirectoryStore) + 
eq((100,), z.shape) + eq((10,), z.chunks) + assert_array_equal(np.full(100, fill_value=42), z[:]) + with assert_raises(ValueError): + open_array(path, mode=mode) + with assert_raises(ValueError): + open_array('example_group', mode=mode) # with synchronizer - z = open(path, synchronizer=ThreadSynchronizer()) - assert_is_instance(z, SynchronizedArray) + z = open_array(path, synchronizer=ThreadSynchronizer()) + assert_is_instance(z, Array) def test_empty_like(): # zarr array - z = empty(100, 10, dtype='f4', compression='zlib', + z = empty(100, chunks=10, dtype='f4', compression='zlib', compression_opts=5, order='F') z2 = empty_like(z) eq(z.shape, z2.shape) @@ -149,19 +185,16 @@ def test_empty_like(): eq(z.order, z2.order) # numpy array a = np.empty(100, dtype='f4') - z3 = empty_like(a, chunks=10) + z3 = empty_like(a) eq(a.shape, z3.shape) - eq((10,), z3.chunks) + eq((100,), z3.chunks) eq(a.dtype, z3.dtype) assert_is_none(z3.fill_value) - with assert_raises(ValueError): - # chunks missing - empty_like(a) def test_zeros_like(): # zarr array - z = zeros(100, 10, dtype='f4', compression='zlib', + z = zeros(100, chunks=10, dtype='f4', compression='zlib', compression_opts=5, order='F') z2 = zeros_like(z) eq(z.shape, z2.shape) @@ -178,14 +211,11 @@ def test_zeros_like(): eq((10,), z3.chunks) eq(a.dtype, z3.dtype) eq(0, z3.fill_value) - with assert_raises(ValueError): - # chunks missing - zeros_like(a) def test_ones_like(): # zarr array - z = ones(100, 10, dtype='f4', compression='zlib', + z = ones(100, chunks=10, dtype='f4', compression='zlib', compression_opts=5, order='F') z2 = ones_like(z) eq(z.shape, z2.shape) @@ -202,13 +232,10 @@ def test_ones_like(): eq((10,), z3.chunks) eq(a.dtype, z3.dtype) eq(1, z3.fill_value) - with assert_raises(ValueError): - # chunks missing - ones_like(a) def test_full_like(): - z = full(100, 10, dtype='f4', compression='zlib', + z = full(100, chunks=10, dtype='f4', compression='zlib', compression_opts=5, fill_value=42, order='F') z2 = full_like(z) eq(z.shape, z2.shape) @@ -225,9 +252,6 @@ def test_full_like(): eq((10,), z3.chunks) eq(a.dtype, z3.dtype) eq(42, z3.fill_value) - with assert_raises(ValueError): - # chunks missing - full_like(a) with assert_raises(ValueError): # fill_value missing full_like(a, chunks=10) @@ -237,7 +261,7 @@ def test_open_like(): # zarr array path = tempfile.mktemp() atexit.register(shutil.rmtree, path) - z = full(100, 10, dtype='f4', compression='zlib', + z = full(100, chunks=10, dtype='f4', compression='zlib', compression_opts=5, fill_value=42, order='F') z2 = open_like(z, path) eq(z.shape, z2.shape) @@ -256,24 +280,22 @@ def test_open_like(): eq((10,), z3.chunks) eq(a.dtype, z3.dtype) assert_is_none(z3.fill_value) - with assert_raises(ValueError): - # chunks missing - open_like(a, path) def test_create(): # defaults - z = create(100, 10) + z = create(100) assert_is_instance(z, Array) eq((100,), z.shape) - eq((10,), z.chunks) + eq((100,), z.chunks) # auto-chunks eq(np.dtype(None), z.dtype) eq('blosc', z.compression) assert_is_none(z.fill_value) # all specified - z = create(100, 10, dtype='i4', compression='zlib', compression_opts=1, + z = create(100, chunks=10, dtype='i4', compression='zlib', + compression_opts=1, fill_value=42, order='F') assert_is_instance(z, Array) eq((100,), z.shape) @@ -286,8 +308,8 @@ def test_create(): # with synchronizer synchronizer = ThreadSynchronizer() - z = create(100, 10, synchronizer=synchronizer) - assert_is_instance(z, SynchronizedArray) + z = create(100, chunks=10, synchronizer=synchronizer) + 
assert_is_instance(z, Array) eq((100,), z.shape) eq((10,), z.chunks) assert synchronizer is z.synchronizer diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py new file mode 100644 index 0000000000..94d93dcc2a --- /dev/null +++ b/zarr/tests/test_hierarchy.py @@ -0,0 +1,789 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, print_function, division +import unittest +import tempfile +import atexit +import shutil +import os +import pickle + + +from nose.tools import assert_raises, eq_ as eq, assert_is, assert_true, \ + assert_is_instance, assert_false, assert_is_none +import numpy as np +from numpy.testing import assert_array_equal + + +from zarr.storage import DictStore, DirectoryStore, ZipStore, init_group, \ + init_array, attrs_key, array_meta_key, group_meta_key +from zarr.core import Array +from zarr.hierarchy import Group, group, open_group +from zarr.attrs import Attributes +from zarr.errors import ReadOnlyError +from zarr.creation import open_array +from zarr.compat import PY2 +from zarr.sync import ThreadSynchronizer, ProcessSynchronizer + + +# noinspection PyStatementEffect +class TestGroup(unittest.TestCase): + + @staticmethod + def create_store(): + # can be overridden in sub-classes + return dict(), None + + def create_group(self, store=None, path=None, read_only=False, + chunk_store=None, synchronizer=None): + # can be overridden in sub-classes + if store is None: + store, chunk_store = self.create_store() + init_group(store, path=path, chunk_store=chunk_store) + g = Group(store, path=path, read_only=read_only, + chunk_store=chunk_store, synchronizer=synchronizer) + return g + + def test_group_init_1(self): + store, chunk_store = self.create_store() + g = self.create_group(store, chunk_store=chunk_store) + assert_is(store, g.store) + assert_false(g.read_only) + eq('', g.path) + eq('/', g.name) + assert_is_instance(g.attrs, Attributes) + + def test_group_init_2(self): + store, chunk_store = self.create_store() + g = self.create_group(store, chunk_store=chunk_store, + path='/foo/bar/', read_only=True) + assert_is(store, g.store) + assert_true(g.read_only) + eq('foo/bar', g.path) + eq('/foo/bar', g.name) + assert_is_instance(g.attrs, Attributes) + + def test_group_init_errors_1(self): + store, chunk_store = self.create_store() + # group metadata not initialized + with assert_raises(ValueError): + Group(store, chunk_store=chunk_store) + + def test_group_init_errors_2(self): + store, chunk_store = self.create_store() + init_array(store, shape=1000, chunks=100, chunk_store=chunk_store) + # array blocks group + with assert_raises(ValueError): + Group(store, chunk_store=chunk_store) + + def test_create_group(self): + g1 = self.create_group() + + # check root group + eq('', g1.path) + eq('/', g1.name) + + # create level 1 child group + g2 = g1.create_group('foo') + assert_is_instance(g2, Group) + eq('foo', g2.path) + eq('/foo', g2.name) + + # create level 2 child group + g3 = g2.create_group('bar') + assert_is_instance(g3, Group) + eq('foo/bar', g3.path) + eq('/foo/bar', g3.name) + + # create level 3 child group + g4 = g1.create_group('foo/bar/baz') + assert_is_instance(g4, Group) + eq('foo/bar/baz', g4.path) + eq('/foo/bar/baz', g4.name) + + # create level 3 group via root + g5 = g4.create_group('/a/b/c/') + assert_is_instance(g5, Group) + eq('a/b/c', g5.path) + eq('/a/b/c', g5.name) + + # test bad keys + with assert_raises(KeyError): + g1.create_group('foo') # already exists + with assert_raises(KeyError): + g1.create_group('a/b/c') # already 
exists + with assert_raises(KeyError): + g4.create_group('/a/b/c') # already exists + with assert_raises(KeyError): + g1.create_group('') + with assert_raises(KeyError): + g1.create_group('/') + with assert_raises(KeyError): + g1.create_group('//') + + # multi + g6, g7 = g1.create_groups('y', 'z') + assert_is_instance(g6, Group) + eq(g6.path, 'y') + assert_is_instance(g7, Group) + eq(g7.path, 'z') + + def test_require_group(self): + g1 = self.create_group() + + # test creation + g2 = g1.require_group('foo') + assert_is_instance(g2, Group) + eq('foo', g2.path) + g3 = g2.require_group('bar') + assert_is_instance(g3, Group) + eq('foo/bar', g3.path) + g4 = g1.require_group('foo/bar/baz') + assert_is_instance(g4, Group) + eq('foo/bar/baz', g4.path) + g5 = g4.require_group('/a/b/c/') + assert_is_instance(g5, Group) + eq('a/b/c', g5.path) + + # test when already created + g2a = g1.require_group('foo') + eq(g2, g2a) + assert_is(g2.store, g2a.store) + g3a = g2a.require_group('bar') + eq(g3, g3a) + assert_is(g3.store, g3a.store) + g4a = g1.require_group('foo/bar/baz') + eq(g4, g4a) + assert_is(g4.store, g4a.store) + g5a = g4a.require_group('/a/b/c/') + eq(g5, g5a) + assert_is(g5.store, g5a.store) + + # test path normalization + eq(g1.require_group('quux'), g1.require_group('/quux/')) + + # multi + g6, g7 = g1.require_groups('y', 'z') + assert_is_instance(g6, Group) + eq(g6.path, 'y') + assert_is_instance(g7, Group) + eq(g7.path, 'z') + + def test_create_dataset(self): + g = self.create_group() + + # create as immediate child + d1 = g.create_dataset('foo', shape=1000, chunks=100) + assert_is_instance(d1, Array) + eq((1000,), d1.shape) + eq((100,), d1.chunks) + eq('foo', d1.path) + eq('/foo', d1.name) + assert_is(g.store, d1.store) + + # create as descendant + d2 = g.create_dataset('/a/b/c/', shape=2000, chunks=200, dtype='i1', + compression='zlib', compression_opts=9, + fill_value=42, order='F') + assert_is_instance(d2, Array) + eq((2000,), d2.shape) + eq((200,), d2.chunks) + eq(np.dtype('i1'), d2.dtype) + eq('zlib', d2.compression) + eq(9, d2.compression_opts) + eq(42, d2.fill_value) + eq('F', d2.order) + eq('a/b/c', d2.path) + eq('/a/b/c', d2.name) + assert_is(g.store, d2.store) + + # create with data + data = np.arange(3000, dtype='u2') + d3 = g.create_dataset('bar', data=data, chunks=300) + assert_is_instance(d3, Array) + eq((3000,), d3.shape) + eq((300,), d3.chunks) + eq(np.dtype('u2'), d3.dtype) + assert_array_equal(data, d3[:]) + eq('bar', d3.path) + eq('/bar', d3.name) + assert_is(g.store, d3.store) + + def test_require_dataset(self): + g = self.create_group() + + # create + d1 = g.require_dataset('foo', shape=1000, chunks=100, dtype='f4') + d1[:] = np.arange(1000) + assert_is_instance(d1, Array) + eq((1000,), d1.shape) + eq((100,), d1.chunks) + eq(np.dtype('f4'), d1.dtype) + eq('foo', d1.path) + eq('/foo', d1.name) + assert_is(g.store, d1.store) + assert_array_equal(np.arange(1000), d1[:]) + + # require + d2 = g.require_dataset('foo', shape=1000, chunks=100, dtype='f4') + assert_is_instance(d2, Array) + eq((1000,), d2.shape) + eq((100,), d2.chunks) + eq(np.dtype('f4'), d2.dtype) + eq('foo', d2.path) + eq('/foo', d2.name) + assert_is(g.store, d2.store) + assert_array_equal(np.arange(1000), d2[:]) + eq(d1, d2) + + # bad shape - use TypeError for h5py compatibility + with assert_raises(TypeError): + g.require_dataset('foo', shape=2000, chunks=100, dtype='f4') + + # dtype matching + # can cast + d3 = g.require_dataset('foo', shape=1000, chunks=100, dtype='i2') + eq(np.dtype('f4'), d3.dtype) + 
eq(d1, d3)
+        with assert_raises(TypeError):
+            # cannot cast
+            g.require_dataset('foo', shape=1000, chunks=100, dtype='i4')
+        with assert_raises(TypeError):
+            # can cast but not exact match
+            g.require_dataset('foo', shape=1000, chunks=100, dtype='i2',
+                              exact=True)
+
+    def test_create_errors(self):
+        g = self.create_group()
+
+        # array obstructs group and array creation
+        g.create_dataset('foo', shape=100, chunks=10)
+        with assert_raises(KeyError):
+            g.create_group('foo/bar')
+        with assert_raises(KeyError):
+            g.require_group('foo/bar')
+        with assert_raises(KeyError):
+            g.create_dataset('foo/bar', shape=100, chunks=10)
+        with assert_raises(KeyError):
+            g.require_dataset('foo/bar', shape=100, chunks=10)
+
+        # array obstructs group and array creation
+        g.create_dataset('a/b', shape=100, chunks=10)
+        with assert_raises(KeyError):
+            g.create_group('a/b')
+        with assert_raises(KeyError):
+            g.require_group('a/b')
+        with assert_raises(KeyError):
+            g.create_dataset('a/b', shape=100, chunks=10)
+
+        # group obstructs array creation
+        g.create_group('c/d')
+        with assert_raises(KeyError):
+            g.create_dataset('c', shape=100, chunks=10)
+        with assert_raises(KeyError):
+            g.require_dataset('c', shape=100, chunks=10)
+        with assert_raises(KeyError):
+            g.create_dataset('c/d', shape=100, chunks=10)
+        with assert_raises(KeyError):
+            g.require_dataset('c/d', shape=100, chunks=10)
+
+        # h5py compatibility - accept but ignore some keyword args
+        d = g.create_dataset('x', shape=100, chunks=10, fillvalue=1)
+        assert_is_none(d.fill_value)
+        d = g.create_dataset('y', shape=100, chunks=10, shuffle=True)
+        assert not hasattr(d, 'shuffle')
+
+        # read-only
+        g = self.create_group(read_only=True)
+        with assert_raises(ReadOnlyError):
+            g.create_group('zzz')
+        with assert_raises(ReadOnlyError):
+            g.require_group('zzz')
+        with assert_raises(ReadOnlyError):
+            g.create_dataset('zzz', shape=100, chunks=10)
+        with assert_raises(ReadOnlyError):
+            g.require_dataset('zzz', shape=100, chunks=10)
+
+    def test_getitem_contains_iterators(self):
+        # setup
+        g1 = self.create_group()
+        g2 = g1.create_group('foo/bar')
+        d1 = g2.create_dataset('/a/b/c', shape=1000, chunks=100)
+        d1[:] = np.arange(1000)
+        d2 = g1.create_dataset('foo/baz', shape=3000, chunks=300)
+        d2[:] = np.arange(3000)
+
+        # test __getitem__
+        assert_is_instance(g1['foo'], Group)
+        assert_is_instance(g1['foo']['bar'], Group)
+        assert_is_instance(g1['foo/bar'], Group)
+        assert_is_instance(g1['/foo/bar/'], Group)
+        assert_is_instance(g1['foo/baz'], Array)
+        eq(g2, g1['foo/bar'])
+        eq(g1['foo']['bar'], g1['foo/bar'])
+        eq(d2, g1['foo/baz'])
+        assert_array_equal(d2[:], g1['foo/baz'])
+        assert_is_instance(g1['a'], Group)
+        assert_is_instance(g1['a']['b'], Group)
+        assert_is_instance(g1['a/b'], Group)
+        assert_is_instance(g1['a']['b']['c'], Array)
+        assert_is_instance(g1['a/b/c'], Array)
+        eq(d1, g1['a/b/c'])
+        eq(g1['a']['b']['c'], g1['a/b/c'])
+        assert_array_equal(d1[:], g1['a/b/c'][:])
+
+        # test __contains__
+        assert 'foo' in g1
+        assert 'foo/bar' in g1
+        assert 'foo/baz' in g1
+        assert 'bar' in g1['foo']
+        assert 'a' in g1
+        assert 'a/b' in g1
+        assert 'a/b/c' in g1
+        assert 'baz' not in g1
+        assert 'a/b/c/d' not in g1
+        assert 'a/z' not in g1
+        assert 'quux' not in g1['foo']
+
+        # test key errors
+        with assert_raises(KeyError):
+            g1['baz']
+        with assert_raises(KeyError):
+            g1['x/y/z']
+
+        # test __len__
+        eq(2, len(g1))
+        eq(2, len(g1['foo']))
+        eq(0, len(g1['foo/bar']))
+        eq(1, len(g1['a']))
+        eq(1, len(g1['a/b']))
+
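The iteration tests that follow assume child names come back sorted alphabetically, regardless of creation order. A small sketch of that behaviour, assuming the in-memory store used by the group() convenience function::

    from zarr.hierarchy import group

    g = group()
    g.create_group('foo')
    g.create_dataset('bar', shape=100, chunks=10)

    # __iter__ and keys() yield child names in sorted order
    assert list(g) == ['bar', 'foo']
    # groups() and arrays() yield (name, object) pairs, also sorted
    assert [k for k, _ in g.groups()] == ['foo']
    assert [k for k, _ in g.arrays()] == ['bar']

+        # test __iter__, keys()
+        # currently assumes sorted by key
+
+        eq(['a', 'foo'],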
list(g1)) + eq(['a', 'foo'], list(g1.keys())) + eq(['bar', 'baz'], list(g1['foo'])) + eq(['bar', 'baz'], list(g1['foo'].keys())) + eq([], sorted(g1['foo/bar'])) + eq([], sorted(g1['foo/bar'].keys())) + + # test items(), values() + # currently assumes sorted by key + + items = list(g1.items()) + values = list(g1.values()) + eq('a', items[0][0]) + eq(g1['a'], items[0][1]) + eq(g1['a'], values[0]) + eq('foo', items[1][0]) + eq(g1['foo'], items[1][1]) + eq(g1['foo'], values[1]) + + items = list(g1['foo'].items()) + values = list(g1['foo'].values()) + eq('bar', items[0][0]) + eq(g1['foo']['bar'], items[0][1]) + eq(g1['foo']['bar'], values[0]) + eq('baz', items[1][0]) + eq(g1['foo']['baz'], items[1][1]) + eq(g1['foo']['baz'], values[1]) + + # test array_keys(), arrays(), group_keys(), groups() + # currently assumes sorted by key + + eq(['a', 'foo'], list(g1.group_keys())) + groups = list(g1.groups()) + arrays = list(g1.arrays()) + eq('a', groups[0][0]) + eq(g1['a'], groups[0][1]) + eq('foo', groups[1][0]) + eq(g1['foo'], groups[1][1]) + eq([], list(g1.array_keys())) + eq([], arrays) + + eq(['bar'], list(g1['foo'].group_keys())) + eq(['baz'], list(g1['foo'].array_keys())) + groups = list(g1['foo'].groups()) + arrays = list(g1['foo'].arrays()) + eq('bar', groups[0][0]) + eq(g1['foo']['bar'], groups[0][1]) + eq('baz', arrays[0][0]) + eq(g1['foo']['baz'], arrays[0][1]) + + def test_empty_getitem_contains_iterators(self): + # setup + g = self.create_group() + + # test + eq([], list(g)) + eq([], list(g.keys())) + eq(0, len(g)) + assert 'foo' not in g + + def test_group_repr(self): + g = self.create_group() + store_class = '%s.%s' % (dict.__module__, dict.__name__) + expect = 'zarr.hierarchy.Group(/, 0)\n store: %s' % store_class + actual = repr(g) + eq(expect, actual) + g.create_group('foo') + g.create_group('bar') + g.create_group('y'*80) + g.create_dataset('baz', shape=100, chunks=10) + g.create_dataset('quux', shape=100, chunks=10) + g.create_dataset('z'*80, shape=100, chunks=10) + expect = \ + 'zarr.hierarchy.Group(/, 6)\n' \ + ' arrays: 3; baz, quux, ' \ + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz...\n' \ + ' groups: 3; bar, foo, ' \ + 'yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy...\n' \ + ' store: %s' % store_class + actual = repr(g) + eq(expect, actual) + + def test_setitem(self): + g = self.create_group() + with assert_raises(TypeError): + g['foo'] = 'bar' + + def test_array_creation(self): + grp = self.create_group() + + a = grp.create('a', shape=100, chunks=10) + assert_is_instance(a, Array) + b = grp.empty('b', shape=100, chunks=10) + assert_is_instance(b, Array) + assert_is_none(b.fill_value) + c = grp.zeros('c', shape=100, chunks=10) + assert_is_instance(c, Array) + eq(0, c.fill_value) + d = grp.ones('d', shape=100, chunks=10) + assert_is_instance(d, Array) + eq(1, d.fill_value) + e = grp.full('e', shape=100, chunks=10, fill_value=42) + assert_is_instance(e, Array) + eq(42, e.fill_value) + + f = grp.empty_like('f', a) + assert_is_instance(f, Array) + assert_is_none(f.fill_value) + g = grp.zeros_like('g', a) + assert_is_instance(g, Array) + eq(0, g.fill_value) + h = grp.ones_like('h', a) + assert_is_instance(h, Array) + eq(1, h.fill_value) + i = grp.full_like('i', e) + assert_is_instance(i, Array) + eq(42, i.fill_value) + + j = grp.array('j', data=np.arange(100), chunks=10) + assert_is_instance(j, Array) + assert_array_equal(np.arange(100), j[:]) + + grp = self.create_group(read_only=True) + with assert_raises(ReadOnlyError): + grp.create('aa', shape=100, chunks=10) + 
with assert_raises(ReadOnlyError): + grp.empty('aa', shape=100, chunks=10) + with assert_raises(ReadOnlyError): + grp.zeros('aa', shape=100, chunks=10) + with assert_raises(ReadOnlyError): + grp.ones('aa', shape=100, chunks=10) + with assert_raises(ReadOnlyError): + grp.full('aa', shape=100, chunks=10, fill_value=42) + with assert_raises(ReadOnlyError): + grp.array('aa', data=np.arange(100), chunks=10) + with assert_raises(ReadOnlyError): + grp.create('aa', shape=100, chunks=10) + with assert_raises(ReadOnlyError): + grp.empty_like('aa', a) + with assert_raises(ReadOnlyError): + grp.zeros_like('aa', a) + with assert_raises(ReadOnlyError): + grp.ones_like('aa', a) + with assert_raises(ReadOnlyError): + grp.full_like('aa', a) + + def test_paths(self): + g1 = self.create_group() + g2 = g1.create_group('foo/bar') + + eq(g1, g1['/']) + eq(g1, g1['//']) + eq(g1, g1['///']) + eq(g1, g2['/']) + eq(g1, g2['//']) + eq(g1, g2['///']) + eq(g2, g1['foo/bar']) + eq(g2, g1['/foo/bar']) + eq(g2, g1['foo/bar/']) + eq(g2, g1['//foo/bar']) + eq(g2, g1['//foo//bar//']) + eq(g2, g1['///foo///bar///']) + eq(g2, g2['/foo/bar']) + + with assert_raises(ValueError): + g1['.'] + with assert_raises(ValueError): + g1['..'] + with assert_raises(ValueError): + g1['foo/.'] + with assert_raises(ValueError): + g1['foo/..'] + with assert_raises(ValueError): + g1['foo/./bar'] + with assert_raises(ValueError): + g1['foo/../bar'] + + def test_pickle(self): + # setup + g = self.create_group() + d = g.create_dataset('foo/bar', shape=100, chunks=10) + d[:] = np.arange(100) + + # pickle round trip + g2 = pickle.loads(pickle.dumps(g)) + eq(g.path, g2.path) + eq(g.name, g2.name) + eq(len(g), len(g2)) + eq(list(g), list(g2)) + eq(g['foo'], g2['foo']) + eq(g['foo/bar'], g2['foo/bar']) + + +class TestGroupWithDictStore(TestGroup): + + @staticmethod + def create_store(): + return DictStore(), None + + def test_group_repr(self): + g = self.create_group() + expect = 'zarr.hierarchy.Group(/, 0)\n store: zarr.storage.DictStore' + actual = repr(g) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + +def rmtree(p, f=shutil.rmtree, g=os.path.isdir): # pragma: no cover + """Version of rmtree that will work atexit and only remove if directory.""" + if g(p): + f(p) + + +class TestGroupWithDirectoryStore(TestGroup): + + @staticmethod + def create_store(): + path = tempfile.mkdtemp() + atexit.register(rmtree, path) + store = DirectoryStore(path) + return store, None + + def test_group_repr(self): + g = self.create_group() + expect = 'zarr.hierarchy.Group(/, 0)\n' \ + ' store: zarr.storage.DirectoryStore' + actual = repr(g) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + +class TestGroupWithZipStore(TestGroup): + + @staticmethod + def create_store(): + path = tempfile.mktemp(suffix='.zip') + atexit.register(os.remove, path) + store = ZipStore(path) + return store, None + + def test_group_repr(self): + g = self.create_group() + expect = 'zarr.hierarchy.Group(/, 0)\n' \ + ' store: zarr.storage.ZipStore' + actual = repr(g) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + +class TestGroupWithChunkStore(TestGroup): + + @staticmethod + def create_store(): + return dict(), dict() + + def test_group_repr(self): + if not PY2: + g = self.create_group() + expect = 'zarr.hierarchy.Group(/, 0)\n' \ + ' store: builtins.dict\n' \ + ' chunk_store: builtins.dict' + actual = repr(g) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + def 
test_chunk_store(self): + # setup + store, chunk_store = self.create_store() + g = self.create_group(store, chunk_store=chunk_store) + + # check attributes + assert_is(store, g.store) + assert_is(chunk_store, g.chunk_store) + + # create array + a = g.zeros('foo', shape=100, chunks=10) + assert_is(store, a.store) + assert_is(chunk_store, a.chunk_store) + a[:] = np.arange(100) + assert_array_equal(np.arange(100), a[:]) + + # check store keys + expect = sorted([attrs_key, group_meta_key, 'foo/' + attrs_key, + 'foo/' + array_meta_key]) + actual = sorted(store.keys()) + eq(expect, actual) + expect = ['foo/' + str(i) for i in range(10)] + actual = sorted(chunk_store.keys()) + eq(expect, actual) + + +class TestGroupWithThreadSynchronizer(TestGroup): + + def create_group(self, store=None, path=None, read_only=False, + chunk_store=None, synchronizer=None): + if store is None: + store, chunk_store = self.create_store() + init_group(store, path=path, chunk_store=chunk_store) + synchronizer = ThreadSynchronizer() + g = Group(store, path=path, read_only=read_only, + chunk_store=chunk_store, synchronizer=synchronizer) + return g + + def test_group_repr(self): + if not PY2: + g = self.create_group() + expect = 'zarr.hierarchy.Group(/, 0)\n' \ + ' store: builtins.dict\n' \ + ' synchronizer: zarr.sync.ThreadSynchronizer' + actual = repr(g) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + def test_synchronizer_property(self): + g = self.create_group() + assert_is_instance(g.synchronizer, ThreadSynchronizer) + + +class TestGroupWithProcessSynchronizer(TestGroup): + + def create_group(self, store=None, path=None, read_only=False, + chunk_store=None, synchronizer=None): + if store is None: + store, chunk_store = self.create_store() + init_group(store, path=path, chunk_store=chunk_store) + sync_path = tempfile.mkdtemp() + atexit.register(shutil.rmtree, sync_path) + synchronizer = ProcessSynchronizer(sync_path) + g = Group(store, path=path, read_only=read_only, + chunk_store=chunk_store, synchronizer=synchronizer) + return g + + def test_group_repr(self): + if not PY2: + g = self.create_group() + expect = 'zarr.hierarchy.Group(/, 0)\n' \ + ' store: builtins.dict\n' \ + ' synchronizer: zarr.sync.ProcessSynchronizer' + actual = repr(g) + for l1, l2 in zip(expect.split('\n'), actual.split('\n')): + eq(l1, l2) + + def test_synchronizer_property(self): + g = self.create_group() + assert_is_instance(g.synchronizer, ProcessSynchronizer) + + +def test_group(): + # test the group() convenience function + + # basic usage + g = group() + assert_is_instance(g, Group) + eq('', g.path) + eq('/', g.name) + + # usage with custom store + store = dict() + g = group(store=store) + assert_is_instance(g, Group) + assert_is(store, g.store) + + # overwrite behaviour + store = dict() + init_array(store, shape=100, chunks=10) + with assert_raises(ValueError): + group(store) + g = group(store, overwrite=True) + assert_is_instance(g, Group) + assert_is(store, g.store) + + +def test_open_group(): + # test the open_group() convenience function + + path = 'example' + + # mode == 'w' + g = open_group(path, mode='w') + assert_is_instance(g, Group) + assert_is_instance(g.store, DirectoryStore) + eq(0, len(g)) + g.create_groups('foo', 'bar') + eq(2, len(g)) + + # mode in 'r', 'r+' + open_array('example_array', shape=100, chunks=10, mode='w') + for mode in 'r', 'r+': + with assert_raises(ValueError): + open_group('doesnotexist', mode=mode) + with assert_raises(ValueError): + open_group('example_array', mode=mode) + g 
= open_group(path, mode='r') + assert_is_instance(g, Group) + eq(2, len(g)) + with assert_raises(ReadOnlyError): + g.create_group('baz') + g = open_group(path, mode='r+') + assert_is_instance(g, Group) + eq(2, len(g)) + g.create_groups('baz', 'quux') + eq(4, len(g)) + + # mode == 'a' + shutil.rmtree(path) + g = open_group(path, mode='a') + assert_is_instance(g, Group) + assert_is_instance(g.store, DirectoryStore) + eq(0, len(g)) + g.create_groups('foo', 'bar') + eq(2, len(g)) + with assert_raises(ValueError): + open_group('example_array', mode='a') + + # mode in 'w-', 'x' + for mode in 'w-', 'x': + shutil.rmtree(path) + g = open_group(path, mode=mode) + assert_is_instance(g, Group) + assert_is_instance(g.store, DirectoryStore) + eq(0, len(g)) + g.create_groups('foo', 'bar') + eq(2, len(g)) + with assert_raises(ValueError): + open_group(path, mode=mode) + with assert_raises(ValueError): + open_group('example_array', mode=mode) diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py index ec8323172d..0152e55639 100644 --- a/zarr/tests/test_meta.py +++ b/zarr/tests/test_meta.py @@ -7,73 +7,171 @@ import numpy as np -from zarr.meta import decode_metadata, encode_dtype, decode_dtype +from zarr.compat import binary_type, text_type +from zarr.meta import decode_array_metadata, encode_dtype, decode_dtype, \ + ZARR_FORMAT, decode_group_metadata, encode_array_metadata from zarr.errors import MetadataError -def test_decode(): +def assert_json_eq(expect, actual): # pragma: no cover + if isinstance(expect, binary_type): + expect = text_type(expect, 'ascii') + if isinstance(actual, binary_type): + actual = text_type(actual, 'ascii') + ej = json.loads(expect) + aj = json.loads(actual) + eq(ej, aj) - # typical - b = b'''{ - "zarr_format": 1, - "shape": [100], + +def test_encode_decode_array_1(): + + meta = dict( + shape=(100,), + chunks=(10,), + dtype=np.dtype('f8'), + compression='zlib', + compression_opts=1, + fill_value=None, + order='C' + ) + + meta_json = '''{ "chunks": [10], - "dtype": " CHUNK_MAX: + target_size = CHUNK_MAX + elif target_size < CHUNK_MIN: + target_size = CHUNK_MIN + + idx = 0 + while True: + # Repeatedly loop over the axes, dividing them by 2. Stop when: + # 1a. We're smaller than the target chunk size, OR + # 1b. We're within 50% of the target chunk size, AND + # 2. The chunk is smaller than the maximum chunk size + + chunk_bytes = np.product(chunks)*typesize + + if (chunk_bytes < target_size or + abs(chunk_bytes-target_size)/target_size < 0.5) and \ + chunk_bytes < CHUNK_MAX: + break + + if np.product(chunks) == 1: + break # Element size larger than CHUNK_MAX + + chunks[idx % ndims] = np.ceil(chunks[idx % ndims] / 2.0) + idx += 1 + + return tuple(int(x) for x in chunks) + + +def normalize_chunks(chunks, shape, typesize): """Convenience function to normalize the `chunks` argument for an array with the given `shape`.""" # N.B., expect shape already normalized + # handle auto-chunking + if chunks is None or chunks is True: + return guess_chunks(shape, typesize) + # handle 1D convenience form if isinstance(chunks, integer_types): chunks = (int(chunks),) @@ -49,7 +104,7 @@ def normalize_chunks(chunks, shape): # noinspection PyTypeChecker def is_total_slice(item, shape): """Determine whether `item` specifies a complete slice of array with the - given `shape`. Used to optimise __setitem__ operations on the Chunk + given `shape`. 
Used to optimize __setitem__ operations on the Chunk class.""" # N.B., assume shape is normalized @@ -187,3 +242,39 @@ def normalize_order(order): if order not in ['C', 'F']: raise ValueError("order must be either 'C' or 'F', found: %r" % order) return order + + +def normalize_storage_path(path): + if path: + + # convert backslash to forward slash + path = path.replace('\\', '/') + + # ensure no leading slash + while len(path) > 0 and path[0] == '/': + path = path[1:] + + # ensure no trailing slash + while len(path) > 0 and path[-1] == '/': + path = path[:-1] + + # collapse any repeated slashes + previous_char = None + collapsed = '' + for char in path: + if char == '/' and previous_char == '/': + pass + else: + collapsed += char + previous_char = char + path = collapsed + + # don't allow path segments with just '.' or '..' + segments = path.split('/') + if any([s in {'.', '..'} for s in segments]): + raise ValueError("path containing '.' or '..' segment not allowed") + + else: + path = '' + + return path