Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion tiledb/attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,11 @@ def __init__(
if (var is None and dtype == "ascii") or np.issubdtype(dt.np_dtype, np.str_):
var = True
elif np.issubdtype(dt.np_dtype, np.bytes_):
if dt.np_dtype.itemsize > 0 and var:
if dtype == "blob":
if var is None:
# The default for blob is var-length
var = True
elif dt.np_dtype.itemsize > 0 and var:
warnings.warn(
f"Attr given `var=True` but `dtype` `{dtype}` is fixed; "
"setting `dtype=S0`. Hint: set `var=True` with `dtype=S0`, "
Expand Down
10 changes: 9 additions & 1 deletion tiledb/core.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1378,6 +1378,10 @@ class PyQuery {
auto dtype = buffer_dtype(name);
bool is_unicode = dtype.is(py::dtype("U"));
bool is_str = dtype.is(py::dtype("S"));

auto buffer_type_info = buffer_type(name);
bool is_blob = (buffer_type_info.first == TILEDB_BLOB);

if (is_unicode || is_str) {
dtype = py::dtype("O");
}
Expand Down Expand Up @@ -1438,7 +1442,11 @@ class PyQuery {
} else {
o = py::bytes(data_ptr, size);
}
else {
else if (is_blob) {
// Zero-copy: create memoryview directly for blob data
auto arr = py::array(py::dtype("int8"), size, data_ptr);
o = py::memoryview(arr);
} else {
o = py::array(py::dtype("uint8"), size, data_ptr);
o.attr("dtype") = dtype;
}
Expand Down
11 changes: 10 additions & 1 deletion tiledb/dense_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,16 @@ def _read_dense_subarray(
# <TODO> sanity check the TileDB buffer size against schema?
# <TODO> add assert to verify np.require doesn't copy?
arr = results[name][0]
arr.dtype = dtype

# Handle the case of fixed-length blob attribute.
if (
self.schema.has_attr(name)
and self.attr(name)._tiledb_dtype == lt.DataType.BLOB
):
# Note: fixed blobs are always 1 byte per cell
arr = arr.view("S1")
else:
arr.dtype = dtype
if len(arr) == 0:
# special case: the C API returns 0 len for blank arrays
arr = np.zeros(output_shape, dtype=dtype)
Expand Down
32 changes: 21 additions & 11 deletions tiledb/sparse_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,20 +601,30 @@ def _read_sparse_subarray(self, subarray, attr_names: list, cond, layout):
arr.dtype = self.schema.attr_or_dim_dtype(name)
out[final_name] = arr
else:
if self.schema.domain.has_dim(name):
el_dtype = self.schema.domain.dim(name).dtype
else:
el_dtype = self.attr(name).dtype
arr = results[name][0]

# this is a work-around for NumPy restrictions removed in 1.16
if el_dtype == np.dtype("S0"):
out[final_name] = b""
elif el_dtype == np.dtype("U0"):
out[final_name] = ""
# Handle the case of fixed-length blob attribute.
if (
self.schema.has_attr(name)
and self.attr(name)._tiledb_dtype == lt.DataType.BLOB
):
# Note: fixed blobs are always 1 byte per cell
out[final_name] = arr.view("S1")

else:
arr.dtype = el_dtype
out[final_name] = arr
if self.schema.domain.has_dim(name):
el_dtype = self.schema.domain.dim(name).dtype
else:
el_dtype = self.attr(name).dtype

# this is a work-around for NumPy restrictions removed in 1.16
if el_dtype == np.dtype("S0"):
out[final_name] = b""
elif el_dtype == np.dtype("U0"):
out[final_name] = ""
else:
arr.dtype = el_dtype
out[final_name] = arr

if self.schema.has_attr(final_name) and self.attr(final_name).isnullable:
out[final_name] = np.ma.array(
Expand Down
66 changes: 66 additions & 0 deletions tiledb/tests/test_attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,13 @@ def test_blob_attribute(self):
attr = tiledb.Attr(name="foo", dtype="blob")
self.assertEqual(attr, attr)
self.assertEqual(attr.dtype, np.bytes_)
self.assertTrue(attr.isvar) # for blobs var is True if not specified

attr1 = tiledb.Attr(name="foo", dtype="blob", var=True)
self.assertTrue(attr1.isvar)

attr2 = tiledb.Attr(name="foo", dtype="blob", var=False)
self.assertFalse(attr2.isvar)

def test_blob_attribute_dump(self, capfd):
attr = tiledb.Attr(name="foo", dtype="blob")
Expand Down Expand Up @@ -248,6 +255,65 @@ def test_ascii_attribute(self, sparse, capfd):
assert A.schema.attr("A").isascii
assert_array_equal(A[:]["A"], np.asarray(ascii_data, dtype=np.bytes_))

@pytest.mark.parametrize("sparse", [True, False])
def test_fixed_size_blob_attribute(self, sparse):
    """Round-trip a fixed-length (``var=False``) blob attribute.

    Creates a 1-D array over dimension ``d`` in [1, 4] with a single blob
    attribute ``a``, writes one single-byte value per cell, then verifies
    the schema metadata and the values read back. Parametrized to run
    against both sparse and dense arrays.
    """
    uri = self.path("test_fixed_blob")
    domain = tiledb.Domain(
        tiledb.Dim(name="d", domain=(1, 4), tile=1, dtype=np.uint32)
    )
    schema = tiledb.ArraySchema(
        domain=domain,
        attrs=[tiledb.Attr(name="a", dtype="blob", var=False)],
        sparse=sparse,
    )
    tiledb.Array.create(uri, schema)

    # One byte per cell: that is all a fixed-size blob attribute holds.
    cells = [b"a", b"b", b"c", b"d"]

    with tiledb.open(uri, "w") as A:
        if not sparse:
            # Dense write: whole-domain assignment from a bytes ndarray.
            A[:] = np.asarray(cells, dtype=np.bytes_)
        else:
            # Sparse write: explicit coordinates 1..4, raw list of bytes.
            A[np.arange(1, 5)] = cells

    with tiledb.open(uri, "r") as A:
        assert A.schema.nattr == 1

        attr = A.schema.attr("a")
        assert attr.ncells == 1
        assert not attr.isvar
        # NumPy-side representation of the attribute type
        assert attr.dtype == np.dtype("|S0")
        # TileDB-side type
        assert attr._tiledb_dtype == tiledb.libtiledb.DataType.BLOB

        expected = np.asarray(cells, dtype=np.bytes_)
        assert_array_equal(A[:]["a"], expected)

@pytest.mark.parametrize("sparse", [True, False])
def test_var_blob_attribute(self, sparse):
    """Round-trip a variable-length (``var=True``) blob attribute.

    Creates a 1-D array over dimension ``d`` in [1, 4] with a single blob
    attribute ``a``, writes a differently sized bytes value into each
    cell, then verifies the schema metadata and the values read back.
    Parametrized to run against both sparse and dense arrays.
    """
    uri = self.path("test_var_blob")
    domain = tiledb.Domain(
        tiledb.Dim(name="d", domain=(1, 4), tile=1, dtype=np.uint32)
    )
    schema = tiledb.ArraySchema(
        domain=domain,
        attrs=[tiledb.Attr(name="a", dtype="blob", var=True)],
        sparse=sparse,
    )
    tiledb.Array.create(uri, schema)

    # Each cell gets a blob of a different length.
    cells = [b"a", b"bb", b"ccc", b"dddd"]

    with tiledb.open(uri, "w") as A:
        if not sparse:
            # Dense write: whole-domain assignment from the raw list.
            A[:] = cells
        else:
            # Sparse write: explicit coordinates 1..4.
            A[np.arange(1, 5)] = cells

    with tiledb.open(uri, "r") as A:
        assert A.schema.nattr == 1

        attr = A.schema.attr("a")
        assert attr.isvar
        # NumPy-side representation of the attribute type
        assert attr.dtype == np.dtype("|S0")
        # TileDB-side type
        assert attr._tiledb_dtype == tiledb.libtiledb.DataType.BLOB

        expected = np.array(cells, dtype=np.bytes_)
        assert_array_equal(A[:]["a"], expected)

def test_modify_attribute_in_schema(self):
path = self.path("test_modify_attribute_in_schema")
tiledb.from_numpy(path, np.random.rand(10))
Expand Down
Loading