def hexdigest(self, hashname="sha1"):
    """
    Compute a hex-encoded checksum over the array's stored state.

    The digest is fed, in this order, with: the raw bytes held in the
    chunk store for every chunk of the chunk grid (a chunk that is absent
    from the store contributes an empty byte string), then the array
    metadata document, then the attributes document. Missing metadata or
    attributes likewise contribute an empty byte string.

    Parameters
    ----------
    hashname : str, optional
        Name of the hash algorithm, passed straight to ``hashlib.new``.
        Defaults to ``"sha1"``.

    Returns
    -------
    checksum : str
        Hexadecimal digest as returned by the hash object's
        ``hexdigest()``.

    Examples
    --------
    >>> import zarr
    >>> z = zarr.empty(shape=(10000, 10000), chunks=(1000, 1000))
    >>> z.hexdigest()
    '041f90bc7a571452af4f850a8ca2c6cddfa8a1ac'
    >>> z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000))
    >>> z.hexdigest()
    '7162d416d26a68063b66ed1f30e0a866e4abed60'
    >>> z = zarr.zeros(shape=(10000, 10000), dtype="u1", chunks=(1000, 1000))
    >>> z.hexdigest()
    'cb387af37410ae5a3222e893cf3373e4e4f22816'
    """

    hasher = hashlib.new(hashname)

    # Chunk payloads first: walk every index of the chunk grid, with the
    # last dimension varying fastest (itertools.product ordering).
    chunk_grid = itertools.product(*(range(n) for n in self.cdata_shape))
    for chunk_index in chunk_grid:
        chunk_key = self._chunk_key(chunk_index)
        hasher.update(self.chunk_store.get(chunk_key, b""))

    # Then the array metadata document ...
    meta_key = self._key_prefix + array_meta_key
    hasher.update(self.store.get(meta_key, b""))

    # ... and finally the user attributes document.
    hasher.update(self.store.get(self.attrs.key, b""))

    return hasher.hexdigest()
def test_hexdigest(self):
    """Pin the hexdigest of arrays built by this class's create_array."""
    # Freshly created arrays that are never written to; only the
    # shape/chunks/dtype differ between cases.
    untouched_cases = [
        # basic 1-D array
        (dict(shape=(1050,), chunks=100, dtype='i4'),
         '063b02ff8d9d3bab6da932ad5828b506ef0a6578'),
        # basic 1-D array with a different type
        (dict(shape=(1050,), chunks=100, dtype='f4'),
         'f97b84dc9ffac807415f750100108764e837bb82'),
        # basic 2-D array
        (dict(shape=(20, 35,), chunks=10, dtype='i4'),
         '4f797d7bdad0fa1c9fa8c80832efb891a68de104'),
    ]
    for array_kwargs, expected in untouched_cases:
        eq(expected, self.create_array(**array_kwargs).hexdigest())

    # basic 1-D array with some data written
    with_data = self.create_array(shape=(1050,), chunks=100, dtype='i4')
    with_data[200:400] = np.arange(200, 400, dtype='i4')
    eq('14470724dca6c1837edddedc490571b6a7f270bc', with_data.hexdigest())

    # basic 1-D array with an attribute set
    with_attrs = self.create_array(shape=(1050,), chunks=100, dtype='i4')
    with_attrs.attrs['foo'] = 'bar'
    eq('2a1046dd99b914459b3e86be9dde05027a07d209', with_attrs.hexdigest())
def test_hexdigest(self):
    """Pin the hexdigest of arrays built by this class's create_array."""
    # Freshly created arrays that are never written to; only the
    # shape/chunks/dtype differ between cases.
    untouched_cases = [
        # basic 1-D array
        (dict(shape=(1050,), chunks=100, dtype='i4'),
         'f710da18d45d38d4aaf2afd7fb822fdd73d02957'),
        # basic 1-D array with a different type
        (dict(shape=(1050,), chunks=100, dtype='f4'),
         '1437428e69754b1e1a38bd7fc9e43669577620db'),
        # basic 2-D array
        (dict(shape=(20, 35,), chunks=10, dtype='i4'),
         'dde44c72cc530bd6aae39b629eb15a2da627e5f9'),
    ]
    for array_kwargs, expected in untouched_cases:
        eq(expected, self.create_array(**array_kwargs).hexdigest())

    # basic 1-D array with some data written
    with_data = self.create_array(shape=(1050,), chunks=100, dtype='i4')
    with_data[200:400] = np.arange(200, 400, dtype='i4')
    eq('4c0a76fb1222498e09dcd92f7f9221d6cea8b40e', with_data.hexdigest())

    # basic 1-D array with an attribute set
    with_attrs = self.create_array(shape=(1050,), chunks=100, dtype='i4')
    with_attrs.attrs['foo'] = 'bar'
    eq('05b0663ffe1785f38d3a459dec17e57a18f254af', with_attrs.hexdigest())
def test_hexdigest(self):
    """Pin the hexdigest of arrays built by this class's create_array."""
    # Freshly created arrays that are never written to; only the
    # shape/chunks/dtype differ between cases.
    untouched_cases = [
        # basic 1-D array
        (dict(shape=(1050,), chunks=100, dtype='i4'),
         '33141032439fb1df5e24ad9891a7d845b6c668c8'),
        # basic 1-D array with a different type
        (dict(shape=(1050,), chunks=100, dtype='f4'),
         '44d719da065c88a412d609a5500ff41e07b331d6'),
        # basic 2-D array
        (dict(shape=(20, 35,), chunks=10, dtype='i4'),
         'f57a9a73a4004490fe1b871688651b8a298a5db7'),
    ]
    for array_kwargs, expected in untouched_cases:
        eq(expected, self.create_array(**array_kwargs).hexdigest())

    # basic 1-D array with some data written
    with_data = self.create_array(shape=(1050,), chunks=100, dtype='i4')
    with_data[200:400] = np.arange(200, 400, dtype='i4')
    eq('1e1bcaac63e4ef3c4a68f11672537131c627f168', with_data.hexdigest())

    # basic 1-D array with an attribute set
    with_attrs = self.create_array(shape=(1050,), chunks=100, dtype='i4')
    with_attrs.attrs['foo'] = 'bar'
    eq('86d7b9bf22dccbeaa22f340f38be506b55e76ff2', with_attrs.hexdigest())
def test_hexdigest(self):
    """Pin the hexdigest of arrays built by this class's create_array."""
    # Freshly created arrays that are never written to; only the
    # shape/chunks/dtype differ between cases.
    untouched_cases = [
        # basic 1-D array
        (dict(shape=(1050,), chunks=100, dtype='i4'),
         '93ecaa530a1162a9d48a3c1dcee4586ccfc59bae'),
        # basic 1-D array with a different type
        (dict(shape=(1050,), chunks=100, dtype='f4'),
         '04a9755a0cd638683531b7816c7fa4fbb6f577f2'),
        # basic 2-D array
        (dict(shape=(20, 35,), chunks=10, dtype='i4'),
         'b93b163a21e8500519250a6defb821d03eb5d9e0'),
    ]
    for array_kwargs, expected in untouched_cases:
        eq(expected, self.create_array(**array_kwargs).hexdigest())

    # basic 1-D array with some data written
    with_data = self.create_array(shape=(1050,), chunks=100, dtype='i4')
    with_data[200:400] = np.arange(200, 400, dtype='i4')
    eq('cde499f3dc945b4e97197ff8e3cf8188a1262c35', with_data.hexdigest())

    # basic 1-D array with an attribute set
    with_attrs = self.create_array(shape=(1050,), chunks=100, dtype='i4')
    with_attrs.attrs['foo'] = 'bar'
    eq('e2cf3afbf66ad0e28a2b6b68b1b07817c69aaee2', with_attrs.hexdigest())
def get(self, item, default=None):
    """Return ``self.inner[item]``, or `default` if that lookup raises KeyError."""
    try:
        value = self.inner[item]
    except KeyError:
        # Key is not present in the wrapped store: fall back to the default.
        value = default
    return value
def test_hexdigest(self):
    """Pin the hexdigest of arrays built by this class's create_array."""
    # Freshly created arrays that are never written to; only the
    # shape/chunks/dtype differ between cases.
    untouched_cases = [
        # basic 1-D array
        (dict(shape=(1050,), chunks=100, dtype='i4'),
         'f710da18d45d38d4aaf2afd7fb822fdd73d02957'),
        # basic 1-D array with a different type
        (dict(shape=(1050,), chunks=100, dtype='f4'),
         '1437428e69754b1e1a38bd7fc9e43669577620db'),
        # basic 2-D array
        (dict(shape=(20, 35,), chunks=10, dtype='i4'),
         'dde44c72cc530bd6aae39b629eb15a2da627e5f9'),
    ]
    for array_kwargs, expected in untouched_cases:
        eq(expected, self.create_array(**array_kwargs).hexdigest())

    # basic 1-D array with some data written
    with_data = self.create_array(shape=(1050,), chunks=100, dtype='i4')
    with_data[200:400] = np.arange(200, 400, dtype='i4')
    eq('4c0a76fb1222498e09dcd92f7f9221d6cea8b40e', with_data.hexdigest())

    # basic 1-D array with an attribute set
    with_attrs = self.create_array(shape=(1050,), chunks=100, dtype='i4')
    with_attrs.attrs['foo'] = 'bar'
    eq('05b0663ffe1785f38d3a459dec17e57a18f254af', with_attrs.hexdigest())