Skip to content

Commit c90aeca

Browse files
authored
Add support for fixed-size blob attributes (#2252)
1 parent f678d8d commit c90aeca

File tree

5 files changed

+111
-14
lines changed

5 files changed

+111
-14
lines changed

tiledb/attribute.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,11 @@ def __init__(
4747
if (var is None and dtype == "ascii") or np.issubdtype(dt.np_dtype, np.str_):
4848
var = True
4949
elif np.issubdtype(dt.np_dtype, np.bytes_):
50-
if dt.np_dtype.itemsize > 0 and var:
50+
if dtype == "blob":
51+
if var is None:
52+
# The default for blob is var-length
53+
var = True
54+
elif dt.np_dtype.itemsize > 0 and var:
5155
warnings.warn(
5256
f"Attr given `var=True` but `dtype` `{dtype}` is fixed; "
5357
"setting `dtype=S0`. Hint: set `var=True` with `dtype=S0`, "

tiledb/core.cc

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1378,6 +1378,10 @@ class PyQuery {
13781378
auto dtype = buffer_dtype(name);
13791379
bool is_unicode = dtype.is(py::dtype("U"));
13801380
bool is_str = dtype.is(py::dtype("S"));
1381+
1382+
auto buffer_type_info = buffer_type(name);
1383+
bool is_blob = (buffer_type_info.first == TILEDB_BLOB);
1384+
13811385
if (is_unicode || is_str) {
13821386
dtype = py::dtype("O");
13831387
}
@@ -1438,7 +1442,11 @@ class PyQuery {
14381442
} else {
14391443
o = py::bytes(data_ptr, size);
14401444
}
1441-
else {
1445+
else if (is_blob) {
1446+
// Zero-copy: create memoryview directly for blob data
1447+
auto arr = py::array(py::dtype("int8"), size, data_ptr);
1448+
o = py::memoryview(arr);
1449+
} else {
14421450
o = py::array(py::dtype("uint8"), size, data_ptr);
14431451
o.attr("dtype") = dtype;
14441452
}

tiledb/dense_array.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,16 @@ def _read_dense_subarray(
336336
# <TODO> sanity check the TileDB buffer size against schema?
337337
# <TODO> add assert to verify np.require doesn't copy?
338338
arr = results[name][0]
339-
arr.dtype = dtype
339+
340+
# Handle the case of fixed-length blob attribute.
341+
if (
342+
self.schema.has_attr(name)
343+
and self.attr(name)._tiledb_dtype == lt.DataType.BLOB
344+
):
345+
# Note: fixed blobs are always 1 byte per cell
346+
arr = arr.view("S1")
347+
else:
348+
arr.dtype = dtype
340349
if len(arr) == 0:
341350
# special case: the C API returns 0 len for blank arrays
342351
arr = np.zeros(output_shape, dtype=dtype)

tiledb/sparse_array.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -601,20 +601,30 @@ def _read_sparse_subarray(self, subarray, attr_names: list, cond, layout):
601601
arr.dtype = self.schema.attr_or_dim_dtype(name)
602602
out[final_name] = arr
603603
else:
604-
if self.schema.domain.has_dim(name):
605-
el_dtype = self.schema.domain.dim(name).dtype
606-
else:
607-
el_dtype = self.attr(name).dtype
608604
arr = results[name][0]
609605

610-
# this is a work-around for NumPy restrictions removed in 1.16
611-
if el_dtype == np.dtype("S0"):
612-
out[final_name] = b""
613-
elif el_dtype == np.dtype("U0"):
614-
out[final_name] = ""
606+
# Handle the case of fixed-length blob attribute.
607+
if (
608+
self.schema.has_attr(name)
609+
and self.attr(name)._tiledb_dtype == lt.DataType.BLOB
610+
):
611+
# Note: fixed blobs are always 1 byte per cell
612+
out[final_name] = arr.view("S1")
613+
615614
else:
616-
arr.dtype = el_dtype
617-
out[final_name] = arr
615+
if self.schema.domain.has_dim(name):
616+
el_dtype = self.schema.domain.dim(name).dtype
617+
else:
618+
el_dtype = self.attr(name).dtype
619+
620+
# this is a work-around for NumPy restrictions removed in 1.16
621+
if el_dtype == np.dtype("S0"):
622+
out[final_name] = b""
623+
elif el_dtype == np.dtype("U0"):
624+
out[final_name] = ""
625+
else:
626+
arr.dtype = el_dtype
627+
out[final_name] = arr
618628

619629
if self.schema.has_attr(final_name) and self.attr(final_name).isnullable:
620630
out[final_name] = np.ma.array(

tiledb/tests/test_attribute.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,13 @@ def test_blob_attribute(self):
195195
attr = tiledb.Attr(name="foo", dtype="blob")
196196
self.assertEqual(attr, attr)
197197
self.assertEqual(attr.dtype, np.bytes_)
198+
self.assertTrue(attr.isvar) # for blobs var is True if not specified
199+
200+
attr1 = tiledb.Attr(name="foo", dtype="blob", var=True)
201+
self.assertTrue(attr1.isvar)
202+
203+
attr2 = tiledb.Attr(name="foo", dtype="blob", var=False)
204+
self.assertFalse(attr2.isvar)
198205

199206
def test_blob_attribute_dump(self, capfd):
200207
attr = tiledb.Attr(name="foo", dtype="blob")
@@ -248,6 +255,65 @@ def test_ascii_attribute(self, sparse, capfd):
248255
assert A.schema.attr("A").isascii
249256
assert_array_equal(A[:]["A"], np.asarray(ascii_data, dtype=np.bytes_))
250257

258+
@pytest.mark.parametrize("sparse", [True, False])
259+
def test_fixed_size_blob_attribute(self, sparse):
260+
path = self.path("test_fixed_blob")
261+
dom = tiledb.Domain(
262+
tiledb.Dim(name="d", domain=(1, 4), tile=1, dtype=np.uint32)
263+
)
264+
attrs = [tiledb.Attr(name="a", dtype="blob", var=False)]
265+
266+
schema = tiledb.ArraySchema(domain=dom, attrs=attrs, sparse=sparse)
267+
tiledb.Array.create(path, schema)
268+
269+
# A fixed-size blob attribute stores exactly one byte per cell
270+
blob_data = [b"a", b"b", b"c", b"d"]
271+
272+
with tiledb.open(path, "w") as A:
273+
if sparse:
274+
A[np.arange(1, 5)] = blob_data
275+
else:
276+
A[:] = np.asarray(blob_data, dtype=np.bytes_)
277+
278+
with tiledb.open(path, "r") as A:
279+
assert A.schema.nattr == 1
280+
assert A.schema.attr("a").ncells == 1
281+
assert not A.schema.attr("a").isvar
282+
assert A.schema.attr("a").dtype == np.dtype("|S0") # numpy representation
283+
assert (
284+
A.schema.attr("a")._tiledb_dtype == tiledb.libtiledb.DataType.BLOB
285+
) # TileDB type
286+
assert_array_equal(A[:]["a"], np.asarray(blob_data, dtype=np.bytes_))
287+
288+
@pytest.mark.parametrize("sparse", [True, False])
289+
def test_var_blob_attribute(self, sparse):
290+
path = self.path("test_var_blob")
291+
dom = tiledb.Domain(
292+
tiledb.Dim(name="d", domain=(1, 4), tile=1, dtype=np.uint32)
293+
)
294+
attrs = [tiledb.Attr(name="a", dtype="blob", var=True)]
295+
296+
schema = tiledb.ArraySchema(domain=dom, attrs=attrs, sparse=sparse)
297+
tiledb.Array.create(path, schema)
298+
299+
# A variable-length blob attribute can store a differently sized blob in each cell
300+
blob_data = [b"a", b"bb", b"ccc", b"dddd"]
301+
302+
with tiledb.open(path, "w") as A:
303+
if sparse:
304+
A[np.arange(1, 5)] = blob_data
305+
else:
306+
A[:] = blob_data
307+
308+
with tiledb.open(path, "r") as A:
309+
assert A.schema.nattr == 1
310+
assert A.schema.attr("a").isvar
311+
assert A.schema.attr("a").dtype == np.dtype("|S0") # numpy representation
312+
assert (
313+
A.schema.attr("a")._tiledb_dtype == tiledb.libtiledb.DataType.BLOB
314+
) # TileDB type
315+
assert_array_equal(A[:]["a"], np.array(blob_data, dtype=np.bytes_))
316+
251317
def test_modify_attribute_in_schema(self):
252318
path = self.path("test_modify_attribute_in_schema")
253319
tiledb.from_numpy(path, np.random.rand(10))

0 commit comments

Comments
 (0)