Skip to content

Commit c05b9d1

Browse files
committed
add legacy vlen-utf8 codec
1 parent b8baa68 commit c05b9d1

File tree

4 files changed

+101
-1
lines changed

4 files changed

+101
-1
lines changed

src/zarr/buffer.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,10 @@ class NDBuffer:
283283

284284
def __init__(self, array: NDArrayLike):
285285
# assert array.ndim > 0
286-
assert array.dtype != object
286+
287+
# The dtype assertion below is disabled because string arrays have dtype object and would be rejected by it.
288+
# TODO: decide how to handle strings (e.g. numpy 2.0 StringDtype)
289+
# assert array.dtype != object
287290
self._data = array
288291

289292
@classmethod

src/zarr/codecs/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from zarr.codecs.bytes import BytesCodec, Endian
55
from zarr.codecs.crc32c_ import Crc32cCodec
66
from zarr.codecs.gzip import GzipCodec
7+
from zarr.codecs.legacy_vlen import VLenUTF8Codec
78
from zarr.codecs.pipeline import BatchedCodecPipeline
89
from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
910
from zarr.codecs.transpose import TransposeCodec
@@ -21,5 +22,6 @@
2122
"ShardingCodec",
2223
"ShardingCodecIndexLocation",
2324
"TransposeCodec",
25+
"VLenUTF8Codec",
2426
"ZstdCodec",
2527
]

src/zarr/codecs/legacy_vlen.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass
4+
from typing import TYPE_CHECKING
5+
6+
from numcodecs.vlen import VLenUTF8
7+
8+
from zarr.abc.codec import ArrayBytesCodec
9+
from zarr.array_spec import ArraySpec
10+
from zarr.buffer import Buffer, NDBuffer
11+
from zarr.codecs.registry import register_codec
12+
from zarr.common import JSON, parse_named_configuration
13+
14+
if TYPE_CHECKING:
15+
from typing_extensions import Self
16+
17+
18+
# A single module-level instance can be shared because VLenUTF8 takes no parameters.
19+
vlen_utf8_codec = VLenUTF8()
20+
21+
22+
@dataclass(frozen=True)
class VLenUTF8Codec(ArrayBytesCodec):
    """Array-to-bytes codec registered under the name ``"vlen-utf8"``.

    Encoding and decoding are delegated to the shared module-level
    ``vlen_utf8_codec`` (a numcodecs ``VLenUTF8`` instance). The codec takes
    no configuration, so instances carry no state.
    """

    def __init__(self) -> None:
        """Create the codec; there is nothing to configure."""

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        """Build the codec from its named-configuration dictionary.

        ``data`` is checked by ``parse_named_configuration`` against the name
        ``"vlen-utf8"``; the configuration entry is optional.
        """
        _, configuration_parsed = parse_named_configuration(
            data, "vlen-utf8", require_configuration=False
        )
        configuration_parsed = configuration_parsed or {}
        # ``__init__`` accepts no arguments, so any configuration key that is
        # present surfaces as a TypeError from this call.
        return cls(**configuration_parsed)

    def to_dict(self) -> dict[str, JSON]:
        """Return the serialized form of the codec: its name only."""
        return {"name": "vlen-utf8"}

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        """Return ``self`` unchanged; ``array_spec`` is not consulted."""
        return self

    async def _decode_single(
        self,
        chunk_bytes: Buffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer:
        """Decode one encoded chunk into an ``NDBuffer`` of ``chunk_spec.shape``."""
        assert isinstance(chunk_bytes, Buffer)

        raw_bytes = chunk_bytes.as_array_like()
        decoded = vlen_utf8_codec.decode(raw_bytes)
        # Use reshape() rather than assigning to ``decoded.shape``: NumPy
        # discourages in-place shape assignment, and reshape() leaves the
        # array returned by the codec untouched.
        reshaped = decoded.reshape(chunk_spec.shape)
        return chunk_spec.prototype.nd_buffer.from_numpy_array(reshaped)

    async def _encode_single(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> Buffer | None:
        """Encode one chunk array into a ``Buffer`` built from the codec's output."""
        assert isinstance(chunk_array, NDBuffer)
        return chunk_spec.prototype.buffer.from_bytes(
            vlen_utf8_codec.encode(chunk_array.as_numpy_array())
        )

    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
        """Always raise: the encoded size is not computed for this codec.

        Raises:
            NotImplementedError: unconditionally.
        """
        # TODO: open question - what does ``input_byte_length`` mean for an
        # object dtype?
        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
66+
67+
68+
register_codec("vlen-utf8", VLenUTF8Codec)

tests/v3/test_codecs/test_vlen.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import numpy as np
2+
import pytest
3+
4+
from zarr.abc.store import Store
5+
from zarr.array import Array
6+
from zarr.codecs import VLenUTF8Codec
7+
from zarr.store.core import StorePath
8+
9+
10+
@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
def test_arrow_vlen_string(store: Store) -> None:
    """Round-trip a 2x3 array of strings through ``VLenUTF8Codec``.

    The array is created with a single chunk covering the whole shape,
    written once, and read back for comparison with the input.
    """
    strings = ["hello", "world", "this", "is", "a", "test"]
    data = np.array(strings).reshape((2, 3))

    a = Array.create(
        StorePath(store, path="arrow"),
        shape=data.shape,
        chunk_shape=data.shape,
        dtype=data.dtype,
        # NOTE(review): integer fill value with a string dtype - confirm this
        # is intended.
        fill_value=0,
        codecs=[VLenUTF8Codec()],
    )

    a[:, :] = data
    # Leftover debug prints removed; the assertion is the only check.
    assert np.array_equal(data, a[:, :])

0 commit comments

Comments
 (0)