Skip to content

Commit dca87fc

Browse files
d-v-b and joshmoore authored
N5FSStore (#793)
* Drop skip_if_nested_chunks from test_storage.py * Add failing nested test * Make DirectoryStore dimension_separator aware * Migrate key logic to core rather than storage Previous tests (now commented out) used logic in the store classes to convert "0/0" keys into "0.0" keys, forcing the store to be aware of array details. This tries to swap the logic so that stores are responsible for passing dimension separator values down to the arrays only. Since arrays can also get the dimension_separator value from a .zarray file they are now in charge. * Fix linting in new test * Extend the test suite for dim_sep * add n5fsstore and tests * slightly smarter kwarg interception * remove outdated unittest ref and fix the name of a test func * fix massive string block and fix default key_separator kwarg for FSStore * flake8 * promote n5store to toplevel import and fix examples in docstring * Try fsspec 2021.7 (see #802) * Revert "Try fsspec 2021.7 (see #802)" This reverts commit 68adca5. * Add missing core tests for N5FSStore, and rchanges required for making them pass * tmp: debug * uncomment N5 chunk ordering test * more commented tests get uncommented * add dimension_separator to array metadata adaptor * Revert "tmp: debug" This reverts commit ee9cdbc. * Attempt failed: keeping '.' and switching * Revert "Attempt failed: keeping '.' and switching" This reverts commit 51b3109. * regex: attempt failed due to slight diff in files * Revert "regex: attempt failed due to slight diff in files" This reverts commit 3daea7c. * N5: use "." internally for dimension separation This allows N5 to detect the split between key and chunks and pre-process them (re-ordering and changing the separator). 
see: #773 #793 * move FSSpec import guard * remove os.path.sep concatenation in listdir that was erroring a test, and add a mea culpa docstring about the dimension_separator for n5 stores * resolve merge conflicts in favor of upstream * make listdir implementation for n5fsstore look more like fsstore's listdir, and add crucial lstrip * Update hexdigest tests for N5Stores to account for the presence of the dimension_separator keyword now present in metadata * Add tests for dimension_separator in array meta for N5Stores * N5FSStore: try to increase code coverage * Adds a test for the dimension_separator warning * uses the parent test_complex for listdir * "nocover" the import error since fsspec is ever present * flake8 * add chunk nesting test to N5FSStore test suite * make array_meta_key, group_meta_key, attrs_key private * N5FSStore: Remove ImportError test FSStore only throws ModuleNotFoundError on initialization rather than on import. Therefore N5FSStore does the same. If this *weren't* the case, then the import in zarr/init would need to test the import as well, which isn't the case. Co-authored-by: jmoore <[email protected]> Co-authored-by: Josh Moore <[email protected]>
1 parent be3d657 commit dca87fc

File tree

5 files changed

+466
-31
lines changed

5 files changed

+466
-31
lines changed

zarr/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
zeros_like)
1010
from zarr.errors import CopyError, MetadataError
1111
from zarr.hierarchy import Group, group, open_group
12-
from zarr.n5 import N5Store
12+
from zarr.n5 import N5Store, N5FSStore
1313
from zarr.storage import (ABSStore, DBMStore, DictStore, DirectoryStore,
1414
LMDBStore, LRUStoreCache, MemoryStore, MongoDBStore,
1515
NestedDirectoryStore, RedisStore, SQLiteStore,

zarr/n5.py

Lines changed: 291 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
from numcodecs.registry import get_codec, register_codec
1212

1313
from .meta import ZARR_FORMAT, json_dumps, json_loads
14-
from .storage import NestedDirectoryStore, _prog_ckey, _prog_number
14+
from .storage import FSStore
15+
from .storage import NestedDirectoryStore, _prog_ckey, _prog_number, normalize_storage_path
1516
from .storage import array_meta_key as zarr_array_meta_key
1617
from .storage import attrs_key as zarr_attrs_key
1718
from .storage import group_meta_key as zarr_group_meta_key
@@ -281,12 +282,298 @@ def _contains_attrs(self, path):
281282
return len(attrs) > 0
282283

283284

285+
class N5FSStore(FSStore):
286+
"""Implementation of the N5 format (https://github.com/saalfeldlab/n5) using `fsspec`,
287+
which allows storage on a variety of filesystems. Based on `zarr.N5Store`.
288+
Parameters
289+
----------
290+
path : string
291+
Location of directory to use as the root of the storage hierarchy.
292+
normalize_keys : bool, optional
293+
If True, all store keys will be normalized to use lower case characters
294+
(e.g. 'foo' and 'FOO' will be treated as equivalent). This can be
295+
useful to avoid potential discrepancies between case-sensitive and
296+
case-insensitive file systems. Default value is True.
297+
298+
Examples
299+
--------
300+
Store a single array::
301+
302+
>>> import zarr
303+
>>> store = zarr.N5FSStore('data/array.n5', auto_mkdir=True)
304+
>>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
305+
>>> z[...] = 42
306+
307+
Store a group::
308+
309+
>>> store = zarr.N5FSStore('data/group.n5', auto_mkdir=True)
310+
>>> root = zarr.group(store=store, overwrite=True)
311+
>>> foo = root.create_group('foo')
312+
>>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5))
313+
>>> bar[...] = 42
314+
315+
Notes
316+
-----
317+
This is an experimental feature.
318+
Safe to write in multiple threads or processes.
319+
320+
Be advised that the `_dimension_separator` property of this store
321+
(and arrays it creates) is ".", but chunks saved by this store will
322+
in fact be "/" separated, as prescribed by the N5 format.
323+
324+
This is counter-intuitive (to say the least), but not arbitrary.
325+
Chunks in N5 format are stored with reversed dimension order
326+
relative to Zarr chunks: a chunk of a 3D Zarr array would be stored
327+
on a file system as `/0/1/2`, but in N5 the same chunk would be
328+
stored as `/2/1/0`. Therefore, stores targeting N5 must intercept
329+
chunk keys and flip the order of the dimensions before writing to
330+
storage, and this procedure requires chunk keys with "." separated
331+
dimensions, hence the Zarr arrays targeting N5 have the deceptive
332+
"." dimension separator.
333+
"""
334+
_array_meta_key = 'attributes.json'
335+
_group_meta_key = 'attributes.json'
336+
_attrs_key = 'attributes.json'
337+
338+
def __init__(self, *args, **kwargs):
    """Create the store with the dimension separator pinned to ``"."``.

    Parameters
    ----------
    *args, **kwargs
        Forwarded unchanged to ``FSStore.__init__``, except for the
        separator keywords described below.

    Notes
    -----
    ``dimension_separator`` is discarded with a warning because this store
    relies on "."-separated chunk keys to detect and reorder chunk
    coordinates. ``key_separator`` is discarded as well: ``FSStore`` treats
    it as a backwards-compatible alias that overrides
    ``dimension_separator``, so passing it through would defeat the pin.
    """
    if 'dimension_separator' in kwargs:
        kwargs.pop('dimension_separator')
        # stacklevel=2 attributes the warning to the code constructing the store
        warnings.warn('Keyword argument `dimension_separator` will be ignored',
                      stacklevel=2)
    if kwargs.pop('key_separator', None) is not None:
        warnings.warn('Keyword argument `key_separator` will be ignored',
                      stacklevel=2)
    super().__init__(*args, dimension_separator=".", **kwargs)
344+
345+
def _swap_separator(self, key):
    """Rewrite a "."-separated chunk key into reversed, "/"-separated form.

    Only the final "/"-delimited segment of `key` is inspected. When it
    matches the chunk-key pattern, its "."-separated coordinates are
    reversed and joined with "/"; any other key is returned unchanged.
    """
    parent, slash, tail = key.rpartition('/')
    if _prog_ckey.match(tail):
        tail = '/'.join(reversed(tail.split('.')))
    return parent + slash + tail
355+
356+
def _normalize_key(self, key):
    """Convert a store key into the path fragment used by this store.

    Chunk keys first have their coordinates inverted. The key is then
    passed through `normalize_storage_path`, stripped of leading "/", and
    every "." in its final segment is turned into "/" unless that segment
    is one of the store's metadata file names. The result is lower-cased
    when `self.normalize_keys` is set.
    """
    if is_chunk_key(key):
        key = invert_chunk_coords(key)

    key = normalize_storage_path(key).lstrip("/")
    if key:
        head, slash, tail = key.rpartition("/")
        meta_names = (self._array_meta_key, self._group_meta_key, self._attrs_key)
        if tail not in meta_names:
            tail = tail.replace(".", "/")
        key = head + slash + tail

    if self.normalize_keys:
        return key.lower()
    return key
368+
369+
def __getitem__(self, key):
    """Return the value stored under a Zarr-style `key`.

    Requests for Zarr group, array or attribute metadata are answered from
    the N5 attributes file at the corresponding location, converted to the
    Zarr representation and JSON-encoded. Chunk keys have their separator
    swapped before being delegated to the parent store.

    Raises
    ------
    KeyError
        If user attributes are requested but none are present.
    """
    # NOTE(review): str.replace swaps every occurrence of the metadata name
    # in the key, not only the trailing one.
    if key.endswith(zarr_group_meta_key):
        n5_key = key.replace(zarr_group_meta_key, self._group_meta_key)
        return json_dumps(group_metadata_to_zarr(self._load_n5_attrs(n5_key)))

    if key.endswith(zarr_array_meta_key):
        n5_key = key.replace(zarr_array_meta_key, self._array_meta_key)
        return json_dumps(array_metadata_to_zarr(self._load_n5_attrs(n5_key)))

    if key.endswith(zarr_attrs_key):
        n5_key = key.replace(zarr_attrs_key, self._attrs_key)
        user_attrs = attrs_to_zarr(self._load_n5_attrs(n5_key))
        if len(user_attrs) == 0:
            raise KeyError(n5_key)
        return json_dumps(user_attrs)

    if is_chunk_key(key):
        key = self._swap_separator(key)
    return super().__getitem__(key)
398+
399+
def __setitem__(self, key, value):
    """Store `value` under a Zarr-style `key`.

    Zarr group and array metadata are converted to N5 form and merged into
    the attributes file already present at that location. User attributes
    replace every non-reserved entry of that file. Chunk keys have their
    separator swapped. The resulting key/value pair is written through the
    parent store.

    Raises
    ------
    ValueError
        If the user attributes contain a reserved N5 keyword.
    """
    if key.endswith(zarr_group_meta_key):
        key = key.replace(zarr_group_meta_key, self._group_meta_key)
        merged = self._load_n5_attrs(key)
        merged.update(**group_metadata_to_n5(json_loads(value)))
        value = json_dumps(merged)

    elif key.endswith(zarr_array_meta_key):
        key = key.replace(zarr_array_meta_key, self._array_meta_key)
        merged = self._load_n5_attrs(key)
        merged.update(**array_metadata_to_n5(json_loads(value)))
        value = json_dumps(merged)

    elif key.endswith(zarr_attrs_key):
        key = key.replace(zarr_attrs_key, self._attrs_key)
        stored = self._load_n5_attrs(key)
        incoming = json_loads(value)

        for reserved in n5_keywords:
            if reserved in incoming:
                raise ValueError(
                    "Can not set attribute %s, this is a reserved N5 keyword" % reserved
                )

        # keep only the reserved entries, then layer the new user attributes on top
        merged = {name: val for name, val in stored.items() if name in n5_keywords}
        merged.update(**incoming)
        value = json_dumps(merged)

    elif is_chunk_key(key):
        key = self._swap_separator(key)

    super().__setitem__(key, value)
445+
446+
def __delitem__(self, key):
    """Delete the entry for a Zarr-style `key`.

    A key ending in a Zarr metadata name is redirected to the matching N5
    attributes file name; otherwise a chunk key has its separator swapped.
    The translated key is then deleted through the parent store.
    """
    translations = (
        (zarr_group_meta_key, self._group_meta_key),
        (zarr_array_meta_key, self._array_meta_key),
        (zarr_attrs_key, self._attrs_key),
    )
    for zarr_name, n5_name in translations:
        if key.endswith(zarr_name):  # pragma: no cover
            key = key.replace(zarr_name, n5_name)
            break
    else:
        # no metadata suffix matched
        if is_chunk_key(key):
            key = self._swap_separator(key)

    super().__delitem__(key)
458+
459+
def __contains__(self, key):
    """Report whether a Zarr-style `key` is present in the store.

    Group metadata counts as present when the N5 attributes file exists and
    has no 'dimensions' entry; array metadata when the attributes contain
    'dimensions'; user attributes when `_contains_attrs` says so. Chunk
    keys have their separator swapped before the parent store is asked.
    """
    if key.endswith(zarr_group_meta_key):
        n5_key = key.replace(zarr_group_meta_key, self._group_meta_key)
        # group if not a dataset (attributes do not contain 'dimensions')
        return n5_key in self and "dimensions" not in self._load_n5_attrs(n5_key)

    if key.endswith(zarr_array_meta_key):
        n5_key = key.replace(zarr_array_meta_key, self._array_meta_key)
        # array if attributes contain 'dimensions'
        return "dimensions" in self._load_n5_attrs(n5_key)

    if key.endswith(zarr_attrs_key):
        return self._contains_attrs(key.replace(zarr_attrs_key, self._attrs_key))

    if is_chunk_key(key):
        key = self._swap_separator(key)
    return super().__contains__(key)
483+
484+
def __eq__(self, other):
    """Two stores are equal when both are `N5FSStore` with the same `path`."""
    if not isinstance(other, N5FSStore):
        return False
    return self.path == other.path
486+
487+
def listdir(self, path=None):
    """List the children of `path` using Zarr-style names.

    The parent store's listing is adjusted so that the N5 attributes file
    is reported as the Zarr metadata file(s) it stands for. For an array
    directory, nested numeric chunk directories are flattened back into
    "."-separated chunk keys with their coordinates inverted.
    """
    if path is not None:
        path = invert_chunk_coords(path)

    # We can't use NestedDirectoryStore's listdir, as it requires
    # array_meta_key to be present in array directories, which this store
    # doesn't provide.
    children = super().listdir(path=path)

    if self._is_array(path):
        # replace n5 attribute file with respective zarr attribute files
        children.remove(self._array_meta_key)
        children.append(zarr_array_meta_key)
        if self._contains_attrs(path):
            children.append(zarr_attrs_key)

        # special handling of directories containing an array to map
        # inverted nested chunk keys back to standard chunk keys
        listing = []
        base_dir = self.dir_path(path)
        for name in children:
            candidate = os.path.join(base_dir, name)
            if not (_prog_number.match(name) and self.fs.isdir(candidate)):
                listing.append(name)
                continue
            for found in self.fs.find(candidate):
                joined = os.path.join(base_dir, found)
                relative = joined.split(base_dir)[1].lstrip('/')
                listing.append(invert_chunk_coords(relative.replace('/', ".")))
        return sorted(listing)

    if self._is_group(path):
        # replace n5 attribute file with respective zarr attribute files
        children.remove(self._group_meta_key)
        children.append(zarr_group_meta_key)
        if self._contains_attrs(path):  # pragma: no cover
            children.append(zarr_attrs_key)
        return sorted(children)

    return children
529+
530+
def _load_n5_attrs(self, path):
    """Read and JSON-decode the entry at `path` via the parent store.

    Returns an empty dict when the lookup raises `KeyError`.
    """
    try:
        return json_loads(super().__getitem__(path))
    except KeyError:
        return {}
536+
537+
def _is_group(self, path):
    """Return True when `path` has non-empty N5 attributes without 'dimensions'.

    `path=None` refers to the attributes file at the store root.
    """
    attrs_key = self._attrs_key if path is None else os.path.join(path, self._attrs_key)
    n5_attrs = self._load_n5_attrs(attrs_key)
    if len(n5_attrs) == 0:
        return False
    return "dimensions" not in n5_attrs
546+
547+
def _is_array(self, path):
    """Return True when the N5 attributes at `path` contain 'dimensions'.

    `path=None` refers to the attributes file at the store root.
    """
    attrs_key = self._attrs_key if path is None else os.path.join(path, self._attrs_key)
    return "dimensions" in self._load_n5_attrs(attrs_key)
555+
556+
def _contains_attrs(self, path):
    """Return True when user attributes exist at `path`.

    `path` may be None (store root), a directory-like key, or a key that
    already ends with the attributes file name. The N5 attributes are
    passed through `attrs_to_zarr` and the result is checked for entries.
    """
    if path is None:
        attrs_key = self._attrs_key
    elif path.endswith(self._attrs_key):  # pragma: no cover
        attrs_key = path
    else:
        attrs_key = os.path.join(path, self._attrs_key)

    user_attrs = attrs_to_zarr(self._load_n5_attrs(attrs_key))
    return len(user_attrs) > 0
568+
569+
284570
def is_chunk_key(key):
    """Test whether the last "/"-delimited segment of `key` is a chunk key.

    Returns the result of matching that segment against the chunk-key
    pattern, i.e. a match object when it matches and None otherwise.
    """
    last_segment = key.rsplit('/', 1)[-1]
    return _prog_ckey.match(last_segment)
290577

291578

292579
def invert_chunk_coords(key):
@@ -373,6 +660,7 @@ def array_metadata_to_zarr(array_metadata):
373660
array_metadata['fill_value'] = 0 # also if None was requested
374661
array_metadata['order'] = 'C'
375662
array_metadata['filters'] = []
663+
array_metadata['dimension_separator'] = '.'
376664

377665
compressor_config = array_metadata['compressor']
378666
compressor_config = compressor_config_to_zarr(compressor_config)

zarr/storage.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,22 +1065,28 @@ class FSStore(MutableMapping):
10651065
Separator placed between the dimensions of a chunk.
10661066
storage_options : passed to the fsspec implementation
10671067
"""
1068+
_array_meta_key = array_meta_key
1069+
_group_meta_key = group_meta_key
1070+
_attrs_key = attrs_key
10681071

1069-
_META_KEYS = (attrs_key, group_meta_key, array_meta_key)
1070-
1071-
def __init__(self, url, normalize_keys=False, key_separator=None,
1072+
def __init__(self, url, normalize_keys=True, key_separator=None,
10721073
mode='w',
10731074
exceptions=(KeyError, PermissionError, IOError),
10741075
dimension_separator=None,
10751076
**storage_options):
10761077
import fsspec
10771078
self.normalize_keys = normalize_keys
1079+
1080+
protocol, _ = fsspec.core.split_protocol(url)
1081+
# set auto_mkdir to True for local file system
1082+
if protocol in (None, "file") and not storage_options.get("auto_mkdir"):
1083+
storage_options["auto_mkdir"] = True
1084+
10781085
self.map = fsspec.get_mapper(url, **storage_options)
10791086
self.fs = self.map.fs # for direct operations
10801087
self.path = self.fs._strip_protocol(url)
10811088
self.mode = mode
10821089
self.exceptions = exceptions
1083-
10841090
# For backwards compatibility. Guaranteed to be non-None
10851091
if key_separator is not None:
10861092
dimension_separator = key_separator
@@ -1091,7 +1097,6 @@ def __init__(self, url, normalize_keys=False, key_separator=None,
10911097

10921098
# Pass attributes to array creation
10931099
self._dimension_separator = dimension_separator
1094-
10951100
if self.fs.exists(self.path) and not self.fs.isdir(self.path):
10961101
raise FSPathExistNotDir(url)
10971102

@@ -1100,7 +1105,7 @@ def _normalize_key(self, key):
11001105
if key:
11011106
*bits, end = key.split('/')
11021107

1103-
if end not in FSStore._META_KEYS:
1108+
if end not in (self._array_meta_key, self._group_meta_key, self._attrs_key):
11041109
end = end.replace('.', self.key_separator)
11051110
key = '/'.join(bits + [end])
11061111

@@ -1178,7 +1183,7 @@ def listdir(self, path=None):
11781183
if self.key_separator != "/":
11791184
return children
11801185
else:
1181-
if array_meta_key in children:
1186+
if self._array_meta_key in children:
11821187
# special handling of directories containing an array to map nested chunk
11831188
# keys back to standard chunk keys
11841189
new_children = []

0 commit comments

Comments
 (0)