From a860a320eee5f5b5c25f31fbde3e7799edd321f9 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 May 2019 02:41:37 -0400 Subject: [PATCH 1/9] Compare test data's content generally Instead of strictly expecting values returned from a store to be `bytes` instances only in the tests, relax this to anything that supports the buffer protocol and has the expected content. This makes it possible to return things like NumPy `memmap` objects and still have the tests work correctly. --- zarr/tests/test_storage.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 90461b9db4..a7bd32dd8d 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -21,6 +21,7 @@ except ImportError: # pragma: no cover asb = None +from numcodecs.compat import ensure_bytes from zarr.storage import (init_array, array_meta_key, attrs_key, DictStore, DirectoryStore, ZipStore, init_group, group_meta_key, @@ -65,7 +66,7 @@ def test_get_set_del_contains(self): store['foo'] store['foo'] = b'bar' assert 'foo' in store - assert b'bar' == store['foo'] + assert b'bar' == ensure_bytes(store['foo']) # test __delitem__ (optional) try: @@ -103,10 +104,10 @@ def test_pop(self): store['baz'] = b'qux' assert len(store) == 2 v = store.pop('foo') - assert v == b'bar' + assert ensure_bytes(v) == b'bar' assert len(store) == 1 v = store.pop('baz') - assert v == b'qux' + assert ensure_bytes(v) == b'qux' assert len(store) == 0 with pytest.raises(KeyError): store.pop('xxx') @@ -122,7 +123,7 @@ def test_popitem(self): store['foo'] = b'bar' k, v = store.popitem() assert k == 'foo' - assert v == b'bar' + assert ensure_bytes(v) == b'bar' assert len(store) == 0 with pytest.raises(KeyError): store.popitem() @@ -141,8 +142,8 @@ def test_update(self): assert 'foo' not in store assert 'baz' not in store store.update(foo=b'bar', baz=b'quux') - assert b'bar' == store['foo'] - assert b'quux' == store['baz'] + assert b'bar' == ensure_bytes(store['foo']) + assert b'quux' == ensure_bytes(store['baz']) def test_iterators(self): store = self.create_store() @@ -164,9 +165,9 @@ def test_iterators(self): assert 4 == len(store) assert {'a', 'b', 'c/d', 'c/e/f'} == set(store) assert {'a', 'b', 'c/d', 'c/e/f'} == set(store.keys()) - assert {b'aaa', b'bbb', b'ddd', b'fff'} == set(store.values()) + assert {b'aaa', b'bbb', b'ddd', b'fff'} == set(map(ensure_bytes, store.values())) assert ({('a', b'aaa'), ('b', b'bbb'), ('c/d', b'ddd'), ('c/e/f', b'fff')} == - set(store.items())) + set(map(lambda kv: (kv[0], ensure_bytes(kv[1])), store.items()))) def test_pickle(self): @@ -190,8 +191,8 @@ def test_pickle(self): # verify assert n == len(store2) assert keys == sorted(store2.keys()) - assert b'bar' == store2['foo'] - assert b'quux' == store2['baz'] + assert b'bar' == ensure_bytes(store2['foo']) + assert b'quux' == ensure_bytes(store2['baz']) def test_getsize(self): store = self.create_store() @@ -671,10 +672,10 @@ def setdel_hierarchy_checks(store): # test __setitem__ overwrite level store['x/y/z'] = b'xxx' store['x/y'] = b'yyy' - assert b'yyy' == store['x/y'] + assert b'yyy' == ensure_bytes(store['x/y']) assert 'x/y/z' not in store store['x'] = b'zzz' - assert b'zzz' == store['x'] + assert b'zzz' == ensure_bytes(store['x']) assert 'x/y' not in store # test __delitem__ overwrite level @@ -736,7 +737,7 @@ def test_pickle_ext(self): # check point to same underlying directory assert 'xxx' not in store store2['xxx'] = b'yyy' - assert b'yyy' == store['xxx'] + assert b'yyy' == ensure_bytes(store['xxx']) def test_setdel(self): store = self.create_store() From 8f7d342db1ceb3920fde730f254a6471d611379e Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 May 2019 02:44:26 -0400 Subject: [PATCH 2/9] Optionally memory-mapping data in `DirectoryStore` This provides users the option to memory-map the data from the `DirectoryStore` instead of reading it all into memory at once. This can be useful if the chunks are especially large. Also can be useful in combination with zero-copy changes made to Numcodecs to decompress data straight from disk into a decompressed buffer. In cases where the data is not compressed, this can make accessing pieces of chunks pretty efficient. To make resource management simple, we leverage NumPy's `memmap`, which cleans up after itself once no more references are held (e.g. RAII). So there is no need to explicitly close the file. This way if the memory-mapped file is only used for a single operation and tossed, it will automatically be cleaned up without needing other code outside the store to handle this. So as to ensure no inadvertent changes are made to the store, the memory-mapped file is opened in read-only mode. Thus we disallow accidental overwrites of data on disk. This constraint applies to any views taken of the `memmap` (e.g. NumPy `ndarray`s or `memoryview`s). Of course users can still use `__setitem__` to overwrite the data on disk. At present, memory-mapping only occurs when reading data from disk. It is not used to write data to disk. With reading data from disk, there are clear benefits (as noted above). However it is unclear what the benefits are for writing data to disk using memory-mapping. Particularly as we completely overwrite data on disk in an atomic fashion, which would be difficult to reproduce using a memory-mapped file. Other stores inheriting from the `DirectoryStore` can benefit from this change if they also pass along a `memmap` keyword argument to the constructor and reuse its `__getitem__` either as is or in their own subclass implementation of the method. No further changes are required. Currently memory-mapping is optional and is opt-in. If it is seen to be generally useful, we may make it opt-out instead. Though the option remains useful should users find this does or does not make sense in different contexts. Users could hack the value of `memmap` should they need to on a case-by-case basis (e.g. only used for specific values), but it is not necessarily recommended. --- zarr/storage.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index d30b0da6df..ff51d26df6 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -32,6 +32,7 @@ import glob import warnings +import numpy as np from zarr.util import (json_loads, normalize_shape, normalize_chunks, normalize_order, normalize_storage_path, buffer_size, @@ -709,7 +710,7 @@ class DirectoryStore(MutableMapping): """ - def __init__(self, path): + def __init__(self, path, memmap=False): # guard conditions path = os.path.abspath(path) @@ -717,12 +718,16 @@ def __init__(self, path): err_fspath_exists_notdir(path) self.path = path + self.memmap = memmap def __getitem__(self, key): filepath = os.path.join(self.path, key) if os.path.isfile(filepath): - with open(filepath, 'rb') as f: - return f.read() + if self.memmap: + return np.memmap(filepath, mode='r') + else: + with open(filepath, 'rb') as f: + return f.read() else: raise KeyError(key) From 909903293d0f1468dfa796f985a9bcd250232128 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 May 2019 02:44:27 -0400 Subject: [PATCH 3/9] Add `memmap` flag to `DirectoryStore` subclasses Ensure that `TempStore` and `NestedDirectoryStore` can also set the `memmap` flag and thus benefit from its behavior. --- zarr/storage.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index ff51d26df6..ce364adb3f 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -901,10 +901,10 @@ class TempStore(DirectoryStore): """ # noinspection PyShadowingBuiltins - def __init__(self, suffix='', prefix='zarr', dir=None): + def __init__(self, suffix='', prefix='zarr', dir=None, memmap=False): path = tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=dir) atexit.register(atexit_rmtree, path) - super(TempStore, self).__init__(path) + super(TempStore, self).__init__(path, memmap=memmap) _prog_ckey = re.compile(r'^(\d+)(\.\d+)+$') @@ -987,8 +987,8 @@ class NestedDirectoryStore(DirectoryStore): """ - def __init__(self, path): - super(NestedDirectoryStore, self).__init__(path) + def __init__(self, path, memmap=False): + super(NestedDirectoryStore, self).__init__(path, memmap=memmap) def __getitem__(self, key): key = _nested_map_ckey(key) From 960453f26d982f24e5abe4ad9f2ad0ebddfa22ac Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 May 2019 02:44:28 -0400 Subject: [PATCH 4/9] Test memory-mapping support in `DirectoryStore` Simply extend all of the tests applied to `TestDirectoryStore` except set the `memmap` flag to `True` instead of its default value, `False`. --- zarr/tests/test_storage.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index a7bd32dd8d..f36895fcd2 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -744,6 +744,15 @@ def test_setdel(self): setdel_hierarchy_checks(store) +class TestDirectoryStoreMemmap(TestDirectoryStore, unittest.TestCase): + + def create_store(self): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = DirectoryStore(path, memmap=True) + return store + + class TestNestedDirectoryStore(TestDirectoryStore, unittest.TestCase): def create_store(self): From d177b4101629430555c39622326848bc305c85aa Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 May 2019 02:44:29 -0400 Subject: [PATCH 5/9] Implement `pop` and `popitem` for `DirectoryStore` As memory-mapped files will become invalid once deleted, `DirectoryStore`'s `pop` and `popitem` need to be implemented such that they don't hold invalid references to files removed from the disk. To handle this, simply call `ensure_bytes` on any value retrieved to ensure it is copied into memory. This will be a no-op if memory-mapping was disabled as the value would already be a `bytes` instance. --- zarr/storage.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/zarr/storage.py b/zarr/storage.py index ce364adb3f..c978c47b97 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -810,6 +810,30 @@ def __iter__(self): def __len__(self): return sum(1 for _ in self.keys()) + __marker = object() + + def pop(self, key, default=__marker): + try: + value = ensure_bytes(self[key]) + except KeyError: + if default is self.__marker: + raise + else: + return default + else: + del self[key] + return value + + def popitem(self): + try: + key = next(self.keys()) + except StopIteration: + raise KeyError("Store empty") + else: + value = ensure_bytes(self[key]) + del self[key] + return (key, value) + def dir_path(self, path=None): store_path = normalize_storage_path(path) dir_path = self.path From 9e4e6791cb348f582514ec9264338df05033d18b Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 May 2019 02:44:30 -0400 Subject: [PATCH 6/9] Coerce NumPy `memmap` object to a `memoryview` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As stores have various properties like needing to be able to compare equal, NumPy `memmap` objects are not ideal for this purpose. Not to mention they subclass NumPy `ndarray`'s, which makes it hard to get a proper NumPy `ndarray` that views the data from things like `ensure_ndarray`. Plus being a less commonly used object in NumPy, users are less familiar with it. Finally NumPy's docs state, "...[`memmap`] doesn’t quite fit properly as a subclass." However `memoryview`s bypass these issues while also providing a convenient container for constructing a NumPy `ndarray` with `ensure_ndarray` later. Hence we go ahead and coerce the NumPy `memmap` object to a `memoryview`. ref: https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.memmap.html --- zarr/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index c978c47b97..59153ebc35 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -724,7 +724,7 @@ def __getitem__(self, key): filepath = os.path.join(self.path, key) if os.path.isfile(filepath): if self.memmap: - return np.memmap(filepath, mode='r') + return memoryview(np.memmap(filepath, mode='r')) else: with open(filepath, 'rb') as f: return f.read() From ad8858703c3daf681d59589a88e9638bd6eb1bc1 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 May 2019 02:44:31 -0400 Subject: [PATCH 7/9] Simplify `DirectoryStore`'s `pop` method Instead of indirectly using things like `__getitem__` and `__delitem__` in the `DirectoryStore` implementation of `pop`, make direct usage of the file system. --- zarr/storage.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 59153ebc35..f1051d44c1 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -813,16 +813,16 @@ def __len__(self): __marker = object() def pop(self, key, default=__marker): - try: - value = ensure_bytes(self[key]) - except KeyError: - if default is self.__marker: - raise - else: - return default - else: - del self[key] + filepath = os.path.join(self.path, key) + if os.path.isfile(filepath): + with open(filepath, 'rb') as f: + value = f.read() + os.remove(filepath) return value + elif default is self.__marker: + raise KeyError(key) + else: + return default def popitem(self): try: From 60470bda736967c2e54938b37446c85801430903 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 May 2019 02:44:32 -0400 Subject: [PATCH 8/9] Use `pop` in `popitem` implementation Just simplifies things a bit. --- zarr/storage.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index f1051d44c1..bcecea3a8a 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -830,8 +830,7 @@ def popitem(self): except StopIteration: raise KeyError("Store empty") else: - value = ensure_bytes(self[key]) - del self[key] + value = self.pop(key) return (key, value) def dir_path(self, path=None): From 08f98ddeb9b119023cffddcb767174a54ea9ebbc Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 May 2019 02:44:34 -0400 Subject: [PATCH 9/9] Override `pop` in `NestedDirectoryStore` Corrects how the path is determined. --- zarr/storage.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/zarr/storage.py b/zarr/storage.py index bcecea3a8a..2a4e6ed86a 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1035,6 +1035,10 @@ def __eq__(self, other): self.path == other.path ) + def pop(self, key, *args, **kwargs): + key = _nested_map_ckey(key) + return super(NestedDirectoryStore, self).pop(key, *args, **kwargs) + def listdir(self, path=None): children = super(NestedDirectoryStore, self).listdir(path=path) if array_meta_key in children: