Skip to content

Commit 3c933cf

Browse files
authored
(feat): typesize declared with constructor for Blosc (#713)
* (feat): `typesize` declared with constructor * (chore): add docstring * (chore): relnote * (chore): format * (fix): add check for `typesize<1` * (chore): no cover for internal `ValueError` * (fix): test internal `compress` error
1 parent 8168e15 commit 3c933cf

File tree

3 files changed

+41
-4
lines changed

3 files changed

+41
-4
lines changed

docs/release.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ Improvements
2828
~~~~~~~~~~~~
2929
* Raise a custom `UnknownCodecError` when trying to retrieve an unavailable codec.
3030
By :user:`Cas Wognum <cwognum>`.
31+
* Add ``typesize`` argument to ``Blosc`` so that buffers passed to ``encode``
32+
use that information. zarr v3 currently has its Blosc codec as bytes-to-bytes but does retain
33+
the size information so using it here allows for massive compression ratio gains.
34+
By :user:`Ilan Gold <ilan-gold>`.
3135

3236
Fixes
3337
~~~~~

numcodecs/blosc.pyx

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ def _err_bad_cname(cname):
235235
err_bad_cname = deprecated(_err_bad_cname)
236236

237237
def compress(source, char* cname, int clevel, int shuffle=SHUFFLE,
238-
int blocksize=AUTOBLOCKS):
238+
int blocksize=AUTOBLOCKS, typesize=None):
239239
"""Compress data.
240240
241241
Parameters
@@ -279,7 +279,12 @@ def compress(source, char* cname, int clevel, int shuffle=SHUFFLE,
279279
source_buffer = Buffer(source, PyBUF_ANY_CONTIGUOUS)
280280
source_ptr = source_buffer.ptr
281281
nbytes = source_buffer.nbytes
282-
itemsize = source_buffer.itemsize
282+
if isinstance(typesize, int):
283+
if typesize < 1:
284+
raise ValueError(f"Cannot use typesize {typesize} less than 1.")
285+
itemsize = typesize
286+
else:
287+
itemsize = source_buffer.itemsize
283288

284289
# determine shuffle
285290
if shuffle == AUTOSHUFFLE:
@@ -552,6 +557,8 @@ class Blosc(Codec):
552557
blocksize : int
553558
The requested size of the compressed blocks. If 0 (default), an automatic
554559
blocksize will be used.
560+
typesize : int, optional
561+
The size in bytes of uncompressed array elements. If not given, the item size of the buffer being encoded is used.
555562
556563
See Also
557564
--------
@@ -566,7 +573,9 @@ class Blosc(Codec):
566573
AUTOSHUFFLE = AUTOSHUFFLE
567574
max_buffer_size = 2**31 - 1
568575

569-
def __init__(self, cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=AUTOBLOCKS):
576+
def __init__(self, cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=AUTOBLOCKS, typesize=None):
577+
if isinstance(typesize, int) and typesize < 1:
578+
raise ValueError(f"Cannot use typesize {typesize} less than 1.")
570579
self.cname = cname
571580
if isinstance(cname, str):
572581
self._cname_bytes = cname.encode('ascii')
@@ -575,10 +584,11 @@ class Blosc(Codec):
575584
self.clevel = clevel
576585
self.shuffle = shuffle
577586
self.blocksize = blocksize
587+
self.typesize = typesize
578588

579589
def encode(self, buf):
580590
buf = ensure_contiguous_ndarray(buf, self.max_buffer_size)
581-
return compress(buf, self._cname_bytes, self.clevel, self.shuffle, self.blocksize)
591+
return compress(buf, self._cname_bytes, self.clevel, self.shuffle, self.blocksize, self.typesize)
582592

583593
def decode(self, buf, out=None):
584594
buf = ensure_contiguous_ndarray(buf, self.max_buffer_size)

numcodecs/tests/test_blosc.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,3 +273,26 @@ def test_max_buffer_size():
273273
_skip_null(codec)
274274
assert codec.max_buffer_size == 2**31 - 1
275275
check_max_buffer_size(codec)
276+
277+
278+
def test_typesize_explicit():
279+
arr = np.arange(100).astype("int64")
280+
itemsize = arr.itemsize
281+
codec_no_type_size = Blosc(shuffle=Blosc.SHUFFLE)
282+
codec_itemsize = Blosc(shuffle=Blosc.SHUFFLE, typesize=itemsize)
283+
encoded_without_itemsize = codec_no_type_size.encode(arr.tobytes())
284+
encoded_with_itemsize = codec_itemsize.encode(arr.tobytes())
285+
# the byte at index 3 (i.e. the fourth byte) of the header encodes the `typesize`
286+
assert encoded_without_itemsize[3] == 1 # inferred from bytes i.e., 1
287+
assert encoded_with_itemsize[3] == itemsize # given as a constructor argument
288+
289+
290+
def test_typesize_less_than_1():
291+
with pytest.raises(ValueError, match=r"Cannot use typesize"):
292+
Blosc(shuffle=Blosc.SHUFFLE, typesize=0)
293+
compressor = Blosc(shuffle=Blosc.SHUFFLE)
294+
# not really something that should be done in practice, but good for testing.
295+
compressor.typesize = 0
296+
arr = np.arange(100)
297+
with pytest.raises(ValueError, match=r"Cannot use typesize"):
298+
compressor.encode(arr.tobytes())

0 commit comments

Comments
 (0)