Skip to content

Commit 4941f97

Browse files
authored
Merge pull request #203 from jakirkham/support_hexdigest
Method to find Zarr Array checksum
2 parents 12ff586 + d36dfe1 commit 4941f97

File tree

7 files changed, with 277 additions and 2 deletions.

docs/api/core.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ The Array class (``zarr.core``)
1414
.. automethod:: set_coordinate_selection
1515
.. automethod:: get_orthogonal_selection
1616
.. automethod:: set_orthogonal_selection
17+
.. automethod:: hexdigest
1718
.. automethod:: resize
1819
.. automethod:: append
1920
.. automethod:: view

docs/release.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,9 @@ Enhancements
104104

105105
* **ZipStore is now thread-safe**; :issue:`194`, :issue:`192`.
106106

107+
* **New Array.hexdigest() method** computes an ``Array``'s hash with ``hashlib``.
108+
By :user:`John Kirkham <jakirkham>`, :issue:`98`, :issue:`203`.
109+
107110

108111
Bug fixes
109112
~~~~~~~~~

zarr/attrs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def __getitem__(self, item):
2424
return self.asdict()[item]
2525

2626
def _put(self, d):
27-
s = json.dumps(d, indent=4, sort_keys=True, ensure_ascii=True)
27+
s = json.dumps(d, indent=4, sort_keys=True, ensure_ascii=True, separators=(',', ': '))
2828
self.store[self.key] = s.encode('ascii')
2929

3030
def _write_op(self, f, *args, **kwargs):

zarr/core.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from __future__ import absolute_import, print_function, division
33
import operator
44
import itertools
5+
import hashlib
56
import re
67

78

@@ -87,6 +88,7 @@ class Array(object):
8788
set_mask_selection
8889
get_coordinate_selection
8990
set_coordinate_selection
91+
hexdigest
9092
resize
9193
append
9294
view
@@ -1835,6 +1837,37 @@ def bytestr(n):
18351837

18361838
return items
18371839

1840+
def hexdigest(self, hashname="sha1"):
    """
    Compute a checksum for the data. Default uses sha1 for speed.

    The digest is fed, in this order, with: the stored bytes of every
    chunk key in the chunk grid (a key absent from the chunk store
    contributes nothing), the array metadata document, and the
    attributes document.

    Parameters
    ----------
    hashname : str, optional
        Algorithm name handed to ``hashlib.new``.

    Returns
    -------
    checksum : str
        Hexadecimal digest string.

    Examples
    --------
    >>> import zarr
    >>> z = zarr.empty(shape=(10000, 10000), chunks=(1000, 1000))
    >>> z.hexdigest()
    '041f90bc7a571452af4f850a8ca2c6cddfa8a1ac'
    >>> z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000))
    >>> z.hexdigest()
    '7162d416d26a68063b66ed1f30e0a866e4abed60'
    >>> z = zarr.zeros(shape=(10000, 10000), dtype="u1", chunks=(1000, 1000))
    >>> z.hexdigest()
    'cb387af37410ae5a3222e893cf3373e4e4f22816'
    """

    digest = hashlib.new(hashname)

    # Walk the chunk grid with the last index varying fastest.
    per_axis_indices = [range(n_chunks) for n_chunks in self.cdata_shape]
    for chunk_coords in itertools.product(*per_axis_indices):
        stored = self.chunk_store.get(self._chunk_key(chunk_coords), b"")
        digest.update(stored)

    # Metadata and attributes follow the chunk bytes.
    digest.update(self.store.get(self._key_prefix + array_meta_key, b""))
    digest.update(self.store.get(self.attrs.key, b""))

    return digest.hexdigest()
1870+
18381871
def __getstate__(self):
18391872
return (self._store, self._path, self._read_only, self._chunk_store,
18401873
self._synchronizer, self._cache_metadata)

zarr/meta.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def encode_array_metadata(meta):
6464
order=meta['order'],
6565
filters=meta['filters'],
6666
)
67-
s = json.dumps(meta, indent=4, sort_keys=True, ensure_ascii=True)
67+
s = json.dumps(meta, indent=4, sort_keys=True, ensure_ascii=True, separators=(',', ': '))
6868
b = s.encode('ascii')
6969
return b
7070

zarr/tests/test_core.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ def test_array_init(self):
4040
assert_is_none(a.name)
4141
assert_is_none(a.basename)
4242
assert_is(store, a.store)
43+
eq("8fecb7a17ea1493d9c1430d04437b4f5b0b34985", a.hexdigest())
4344

4445
# initialize at path
4546
store = dict()
@@ -52,6 +53,7 @@ def test_array_init(self):
5253
eq('/foo/bar', a.name)
5354
eq('bar', a.basename)
5455
assert_is(store, a.store)
56+
eq("8fecb7a17ea1493d9c1430d04437b4f5b0b34985", a.hexdigest())
5557

5658
# store not initialized
5759
store = dict()
@@ -440,6 +442,29 @@ def test_setitem_data_not_shared(self):
440442
a[:] = 0
441443
assert_array_equal(z[:], np.arange(20, dtype='i4'))
442444

445+
def test_hexdigest(self):
    """Compare ``hexdigest()`` (default algorithm) with recorded digests."""

    def write_range(arr):
        arr[200:400] = np.arange(200, 400, dtype='i4')

    def set_attr(arr):
        arr.attrs['foo'] = 'bar'

    # Each case: create_array kwargs, an optional mutation applied before
    # hashing, and the digest expected for this class's array setup.
    cases = [
        # Check basic 1-D array
        (dict(shape=(1050,), chunks=100, dtype='i4'), None,
         '063b02ff8d9d3bab6da932ad5828b506ef0a6578'),
        # Check basic 1-D array with different type
        (dict(shape=(1050,), chunks=100, dtype='f4'), None,
         'f97b84dc9ffac807415f750100108764e837bb82'),
        # Check basic 2-D array
        (dict(shape=(20, 35,), chunks=10, dtype='i4'), None,
         '4f797d7bdad0fa1c9fa8c80832efb891a68de104'),
        # Check basic 1-D array with some data
        (dict(shape=(1050,), chunks=100, dtype='i4'), write_range,
         '14470724dca6c1837edddedc490571b6a7f270bc'),
        # Check basic 1-D array with attributes
        (dict(shape=(1050,), chunks=100, dtype='i4'), set_attr,
         '2a1046dd99b914459b3e86be9dde05027a07d209'),
    ]

    for kwargs, mutate, expected in cases:
        z = self.create_array(**kwargs)
        if mutate is not None:
            mutate(z)
        eq(expected, z.hexdigest())
467+
443468
def test_resize_1d(self):
444469

445470
z = self.create_array(shape=105, chunks=10, dtype='i4',
@@ -848,6 +873,29 @@ def create_array(read_only=False, **kwargs):
848873
init_array(store, path='foo/bar', **kwargs)
849874
return Array(store, path='foo/bar', read_only=read_only)
850875

876+
def test_hexdigest(self):
    """Compare ``hexdigest()`` (default algorithm) with recorded digests."""

    def write_range(arr):
        arr[200:400] = np.arange(200, 400, dtype='i4')

    def set_attr(arr):
        arr.attrs['foo'] = 'bar'

    # Each case: create_array kwargs, an optional mutation applied before
    # hashing, and the digest expected for this class's array setup.
    cases = [
        # Check basic 1-D array
        (dict(shape=(1050,), chunks=100, dtype='i4'), None,
         'f710da18d45d38d4aaf2afd7fb822fdd73d02957'),
        # Check basic 1-D array with different type
        (dict(shape=(1050,), chunks=100, dtype='f4'), None,
         '1437428e69754b1e1a38bd7fc9e43669577620db'),
        # Check basic 2-D array
        (dict(shape=(20, 35,), chunks=10, dtype='i4'), None,
         'dde44c72cc530bd6aae39b629eb15a2da627e5f9'),
        # Check basic 1-D array with some data
        (dict(shape=(1050,), chunks=100, dtype='i4'), write_range,
         '4c0a76fb1222498e09dcd92f7f9221d6cea8b40e'),
        # Check basic 1-D array with attributes
        (dict(shape=(1050,), chunks=100, dtype='i4'), set_attr,
         '05b0663ffe1785f38d3a459dec17e57a18f254af'),
    ]

    for kwargs, mutate, expected in cases:
        z = self.create_array(**kwargs)
        if mutate is not None:
            mutate(z)
        eq(expected, z.hexdigest())
898+
851899
def test_nbytes_stored(self):
852900

853901
# dict as store
@@ -877,6 +925,29 @@ def create_array(read_only=False, **kwargs):
877925
init_array(store, chunk_store=chunk_store, **kwargs)
878926
return Array(store, read_only=read_only, chunk_store=chunk_store)
879927

928+
def test_hexdigest(self):
    """Compare ``hexdigest()`` (default algorithm) with recorded digests."""

    def write_range(arr):
        arr[200:400] = np.arange(200, 400, dtype='i4')

    def set_attr(arr):
        arr.attrs['foo'] = 'bar'

    # Each case: create_array kwargs, an optional mutation applied before
    # hashing, and the digest expected for this class's array setup.
    cases = [
        # Check basic 1-D array
        (dict(shape=(1050,), chunks=100, dtype='i4'), None,
         'f710da18d45d38d4aaf2afd7fb822fdd73d02957'),
        # Check basic 1-D array with different type
        (dict(shape=(1050,), chunks=100, dtype='f4'), None,
         '1437428e69754b1e1a38bd7fc9e43669577620db'),
        # Check basic 2-D array
        (dict(shape=(20, 35,), chunks=10, dtype='i4'), None,
         'dde44c72cc530bd6aae39b629eb15a2da627e5f9'),
        # Check basic 1-D array with some data
        (dict(shape=(1050,), chunks=100, dtype='i4'), write_range,
         '4c0a76fb1222498e09dcd92f7f9221d6cea8b40e'),
        # Check basic 1-D array with attributes
        (dict(shape=(1050,), chunks=100, dtype='i4'), set_attr,
         '05b0663ffe1785f38d3a459dec17e57a18f254af'),
    ]

    for kwargs, mutate, expected in cases:
        z = self.create_array(**kwargs)
        if mutate is not None:
            mutate(z)
        eq(expected, z.hexdigest())
950+
880951
def test_nbytes_stored(self):
881952

882953
z = self.create_array(shape=1000, chunks=100)
@@ -1009,6 +1080,29 @@ def create_array(self, read_only=False, **kwargs):
10091080
init_array(store, **kwargs)
10101081
return Array(store, read_only=read_only)
10111082

1083+
def test_hexdigest(self):
    """Compare ``hexdigest()`` (default algorithm) with recorded digests."""

    def write_range(arr):
        arr[200:400] = np.arange(200, 400, dtype='i4')

    def set_attr(arr):
        arr.attrs['foo'] = 'bar'

    # Each case: create_array kwargs, an optional mutation applied before
    # hashing, and the digest expected for this class's array setup.
    cases = [
        # Check basic 1-D array
        (dict(shape=(1050,), chunks=100, dtype='i4'), None,
         'd3da3d485de4a5fcc6d91f9dfc6a7cba9720c561'),
        # Check basic 1-D array with different type
        (dict(shape=(1050,), chunks=100, dtype='f4'), None,
         '443b8dee512e42946cb63ff01d28e9bee8105a5f'),
        # Check basic 2-D array
        (dict(shape=(20, 35,), chunks=10, dtype='i4'), None,
         'de841ca276042993da53985de1e7769f5d0fc54d'),
        # Check basic 1-D array with some data
        (dict(shape=(1050,), chunks=100, dtype='i4'), write_range,
         '42b6ae0d50ec361628736ab7e68fe5fefca22136'),
        # Check basic 1-D array with attributes
        (dict(shape=(1050,), chunks=100, dtype='i4'), set_attr,
         'a0535f31c130f5e5ac66ba0713d1c1ceaebd089b'),
    ]

    for kwargs, mutate, expected in cases:
        z = self.create_array(**kwargs)
        if mutate is not None:
            mutate(z)
        eq(expected, z.hexdigest())
1105+
10121106

10131107
class TestArrayWithBZ2Compressor(TestArray):
10141108

@@ -1019,6 +1113,29 @@ def create_array(self, read_only=False, **kwargs):
10191113
init_array(store, **kwargs)
10201114
return Array(store, read_only=read_only)
10211115

1116+
def test_hexdigest(self):
    """Compare ``hexdigest()`` (default algorithm) with recorded digests."""

    def write_range(arr):
        arr[200:400] = np.arange(200, 400, dtype='i4')

    def set_attr(arr):
        arr.attrs['foo'] = 'bar'

    # Each case: create_array kwargs, an optional mutation applied before
    # hashing, and the digest expected for this class's array setup.
    cases = [
        # Check basic 1-D array
        (dict(shape=(1050,), chunks=100, dtype='i4'), None,
         '33141032439fb1df5e24ad9891a7d845b6c668c8'),
        # Check basic 1-D array with different type
        (dict(shape=(1050,), chunks=100, dtype='f4'), None,
         '44d719da065c88a412d609a5500ff41e07b331d6'),
        # Check basic 2-D array
        (dict(shape=(20, 35,), chunks=10, dtype='i4'), None,
         'f57a9a73a4004490fe1b871688651b8a298a5db7'),
        # Check basic 1-D array with some data
        (dict(shape=(1050,), chunks=100, dtype='i4'), write_range,
         '1e1bcaac63e4ef3c4a68f11672537131c627f168'),
        # Check basic 1-D array with attributes
        (dict(shape=(1050,), chunks=100, dtype='i4'), set_attr,
         '86d7b9bf22dccbeaa22f340f38be506b55e76ff2'),
    ]

    for kwargs, mutate, expected in cases:
        z = self.create_array(**kwargs)
        if mutate is not None:
            mutate(z)
        eq(expected, z.hexdigest())
1138+
10221139

10231140
class TestArrayWithBloscCompressor(TestArray):
10241141

@@ -1029,6 +1146,29 @@ def create_array(self, read_only=False, **kwargs):
10291146
init_array(store, **kwargs)
10301147
return Array(store, read_only=read_only)
10311148

1149+
def test_hexdigest(self):
    """Compare ``hexdigest()`` (default algorithm) with recorded digests."""

    def write_range(arr):
        arr[200:400] = np.arange(200, 400, dtype='i4')

    def set_attr(arr):
        arr.attrs['foo'] = 'bar'

    # Each case: create_array kwargs, an optional mutation applied before
    # hashing, and the digest expected for this class's array setup.
    cases = [
        # Check basic 1-D array
        (dict(shape=(1050,), chunks=100, dtype='i4'), None,
         '7ff2ae8511eac915fad311647c168ccfe943e788'),
        # Check basic 1-D array with different type
        (dict(shape=(1050,), chunks=100, dtype='f4'), None,
         '962705c861863495e9ccb7be7735907aa15e85b5'),
        # Check basic 2-D array
        (dict(shape=(20, 35,), chunks=10, dtype='i4'), None,
         'deb675ff91dd26dba11b65aab5f19a1f21a5645b'),
        # Check basic 1-D array with some data
        (dict(shape=(1050,), chunks=100, dtype='i4'), write_range,
         '90e30bdab745a9641cd0eb605356f531bc8ec1c3'),
        # Check basic 1-D array with attributes
        (dict(shape=(1050,), chunks=100, dtype='i4'), set_attr,
         '95d40c391f167db8b1290e3c39d9bf741edacdf6'),
    ]

    for kwargs, mutate, expected in cases:
        z = self.create_array(**kwargs)
        if mutate is not None:
            mutate(z)
        eq(expected, z.hexdigest())
1171+
10321172

10331173
# TODO can we rely on backports and remove the PY2 exclusion?
10341174
if not PY2: # pragma: py2 no cover
@@ -1044,6 +1184,29 @@ def create_array(self, read_only=False, **kwargs):
10441184
init_array(store, **kwargs)
10451185
return Array(store, read_only=read_only)
10461186

1187+
def test_hexdigest(self):
    """Compare ``hexdigest()`` (default algorithm) with recorded digests."""

    def write_range(arr):
        arr[200:400] = np.arange(200, 400, dtype='i4')

    def set_attr(arr):
        arr.attrs['foo'] = 'bar'

    # Each case: create_array kwargs, an optional mutation applied before
    # hashing, and the digest expected for this class's array setup.
    cases = [
        # Check basic 1-D array
        (dict(shape=(1050,), chunks=100, dtype='i4'), None,
         '93ecaa530a1162a9d48a3c1dcee4586ccfc59bae'),
        # Check basic 1-D array with different type
        (dict(shape=(1050,), chunks=100, dtype='f4'), None,
         '04a9755a0cd638683531b7816c7fa4fbb6f577f2'),
        # Check basic 2-D array
        (dict(shape=(20, 35,), chunks=10, dtype='i4'), None,
         'b93b163a21e8500519250a6defb821d03eb5d9e0'),
        # Check basic 1-D array with some data
        (dict(shape=(1050,), chunks=100, dtype='i4'), write_range,
         'cde499f3dc945b4e97197ff8e3cf8188a1262c35'),
        # Check basic 1-D array with attributes
        (dict(shape=(1050,), chunks=100, dtype='i4'), set_attr,
         'e2cf3afbf66ad0e28a2b6b68b1b07817c69aaee2'),
    ]

    for kwargs, mutate, expected in cases:
        z = self.create_array(**kwargs)
        if mutate is not None:
            mutate(z)
        eq(expected, z.hexdigest())
1209+
10471210

10481211
class TestArrayWithFilters(TestArray):
10491212

@@ -1061,6 +1224,29 @@ def create_array(read_only=False, **kwargs):
10611224
init_array(store, **kwargs)
10621225
return Array(store, read_only=read_only)
10631226

1227+
def test_hexdigest(self):
    """Compare ``hexdigest()`` (default algorithm) with recorded digests."""

    def write_range(arr):
        arr[200:400] = np.arange(200, 400, dtype='i4')

    def set_attr(arr):
        arr.attrs['foo'] = 'bar'

    # Each case: create_array kwargs, an optional mutation applied before
    # hashing, and the digest expected for this class's array setup.
    cases = [
        # Check basic 1-D array
        (dict(shape=(1050,), chunks=100, dtype='i4'), None,
         'b80367c5599d47110d42bd8886240c2f46620dba'),
        # Check basic 1-D array with different type
        (dict(shape=(1050,), chunks=100, dtype='f4'), None,
         '95a7b2471225e73199c9716d21e8d3dd6e5f6f2a'),
        # Check basic 2-D array
        (dict(shape=(20, 35,), chunks=10, dtype='i4'), None,
         '9abf3ad54413ab11855d88a5e0087cd416657e02'),
        # Check basic 1-D array with some data
        (dict(shape=(1050,), chunks=100, dtype='i4'), write_range,
         'c649ad229bc5720258b934ea958570c2f354c2eb'),
        # Check basic 1-D array with attributes
        (dict(shape=(1050,), chunks=100, dtype='i4'), set_attr,
         '62fc9236d78af18a5ec26c12eea1d33bce52501e'),
    ]

    for kwargs, mutate, expected in cases:
        z = self.create_array(**kwargs)
        if mutate is not None:
            mutate(z)
        eq(expected, z.hexdigest())
1249+
10641250
def test_astype_no_filters(self):
10651251
shape = (100,)
10661252
dtype = np.dtype(np.int8)
@@ -1113,6 +1299,12 @@ def __init__(self):
11131299
def keys(self):
11141300
return self.inner.keys()
11151301

1302+
def get(self, item, default=None):
    """Return ``self.inner[item]``, or ``default`` when that lookup raises KeyError."""
    try:
        value = self.inner[item]
    except KeyError:
        value = default
    return value
1307+
11161308
def __getitem__(self, item):
11171309
return self.inner[item]
11181310

0 commit comments

Comments
 (0)