diff --git a/tiledb/attribute.py b/tiledb/attribute.py index 2ba713abaa..f0ca8fc5b3 100644 --- a/tiledb/attribute.py +++ b/tiledb/attribute.py @@ -47,7 +47,11 @@ def __init__( if (var is None and dtype == "ascii") or np.issubdtype(dt.np_dtype, np.str_): var = True elif np.issubdtype(dt.np_dtype, np.bytes_): - if dt.np_dtype.itemsize > 0 and var: + if dtype == "blob": + if var is None: + # The default for blob is var-length + var = True + elif dt.np_dtype.itemsize > 0 and var: warnings.warn( f"Attr given `var=True` but `dtype` `{dtype}` is fixed; " "setting `dtype=S0`. Hint: set `var=True` with `dtype=S0`, " diff --git a/tiledb/core.cc b/tiledb/core.cc index 2116d93e61..98c0320cba 100644 --- a/tiledb/core.cc +++ b/tiledb/core.cc @@ -1378,6 +1378,10 @@ class PyQuery { auto dtype = buffer_dtype(name); bool is_unicode = dtype.is(py::dtype("U")); bool is_str = dtype.is(py::dtype("S")); + + auto buffer_type_info = buffer_type(name); + bool is_blob = (buffer_type_info.first == TILEDB_BLOB); + if (is_unicode || is_str) { dtype = py::dtype("O"); } @@ -1438,7 +1442,11 @@ class PyQuery { } else { o = py::bytes(data_ptr, size); } - else { + else if (is_blob) { + // Wrap the blob bytes in a memoryview. NOTE: py::array copies data_ptr when no base handle is passed, so this is not zero-copy from the query buffer. + auto arr = py::array(py::dtype("int8"), size, data_ptr); + o = py::memoryview(arr); + } else { o = py::array(py::dtype("uint8"), size, data_ptr); o.attr("dtype") = dtype; } diff --git a/tiledb/dense_array.py b/tiledb/dense_array.py index 000256f4e3..71e086aaa4 100644 --- a/tiledb/dense_array.py +++ b/tiledb/dense_array.py @@ -336,7 +336,16 @@ def _read_dense_subarray( # sanity check the TileDB buffer size against schema? # add assert to verify np.require doesn't copy? arr = results[name][0] - arr.dtype = dtype + + # Handle the case of fixed-length blob attribute. 
+ if ( + self.schema.has_attr(name) + and self.attr(name)._tiledb_dtype == lt.DataType.BLOB + ): + # Note: fixed blobs are always 1 byte per cell + arr = arr.view("S1") + else: + arr.dtype = dtype if len(arr) == 0: # special case: the C API returns 0 len for blank arrays arr = np.zeros(output_shape, dtype=dtype) diff --git a/tiledb/sparse_array.py b/tiledb/sparse_array.py index 122fe46501..70be2c86a8 100644 --- a/tiledb/sparse_array.py +++ b/tiledb/sparse_array.py @@ -601,20 +601,30 @@ def _read_sparse_subarray(self, subarray, attr_names: list, cond, layout): arr.dtype = self.schema.attr_or_dim_dtype(name) out[final_name] = arr else: - if self.schema.domain.has_dim(name): - el_dtype = self.schema.domain.dim(name).dtype - else: - el_dtype = self.attr(name).dtype arr = results[name][0] - # this is a work-around for NumPy restrictions removed in 1.16 - if el_dtype == np.dtype("S0"): - out[final_name] = b"" - elif el_dtype == np.dtype("U0"): - out[final_name] = "" + # Handle the case of fixed-length blob attribute. 
+ if ( + self.schema.has_attr(name) + and self.attr(name)._tiledb_dtype == lt.DataType.BLOB + ): + # Note: fixed blobs are always 1 byte per cell + out[final_name] = arr.view("S1") + else: - arr.dtype = el_dtype - out[final_name] = arr + if self.schema.domain.has_dim(name): + el_dtype = self.schema.domain.dim(name).dtype + else: + el_dtype = self.attr(name).dtype + + # this is a work-around for NumPy restrictions removed in 1.16 + if el_dtype == np.dtype("S0"): + out[final_name] = b"" + elif el_dtype == np.dtype("U0"): + out[final_name] = "" + else: + arr.dtype = el_dtype + out[final_name] = arr if self.schema.has_attr(final_name) and self.attr(final_name).isnullable: out[final_name] = np.ma.array( diff --git a/tiledb/tests/test_attribute.py b/tiledb/tests/test_attribute.py index ff67b17c9b..40fd8e5b85 100644 --- a/tiledb/tests/test_attribute.py +++ b/tiledb/tests/test_attribute.py @@ -195,6 +195,13 @@ def test_blob_attribute(self): attr = tiledb.Attr(name="foo", dtype="blob") self.assertEqual(attr, attr) self.assertEqual(attr.dtype, np.bytes_) + self.assertTrue(attr.isvar) # for blobs var is True if not specified + + attr1 = tiledb.Attr(name="foo", dtype="blob", var=True) + self.assertTrue(attr1.isvar) + + attr2 = tiledb.Attr(name="foo", dtype="blob", var=False) + self.assertFalse(attr2.isvar) def test_blob_attribute_dump(self, capfd): attr = tiledb.Attr(name="foo", dtype="blob") @@ -248,6 +255,65 @@ def test_ascii_attribute(self, sparse, capfd): assert A.schema.attr("A").isascii assert_array_equal(A[:]["A"], np.asarray(ascii_data, dtype=np.bytes_)) + @pytest.mark.parametrize("sparse", [True, False]) + def test_fixed_size_blob_attribute(self, sparse): + path = self.path("test_fixed_blob") + dom = tiledb.Domain( + tiledb.Dim(name="d", domain=(1, 4), tile=1, dtype=np.uint32) + ) + attrs = [tiledb.Attr(name="a", dtype="blob", var=False)] + + schema = tiledb.ArraySchema(domain=dom, attrs=attrs, sparse=sparse) + tiledb.Array.create(path, schema) + + # Fixed-size blob 
attribute stores single bytes per cell + blob_data = [b"a", b"b", b"c", b"d"] + + with tiledb.open(path, "w") as A: + if sparse: + A[np.arange(1, 5)] = blob_data + else: + A[:] = np.asarray(blob_data, dtype=np.bytes_) + + with tiledb.open(path, "r") as A: + assert A.schema.nattr == 1 + assert A.schema.attr("a").ncells == 1 + assert not A.schema.attr("a").isvar + assert A.schema.attr("a").dtype == np.dtype("|S0") # numpy representation + assert ( + A.schema.attr("a")._tiledb_dtype == tiledb.libtiledb.DataType.BLOB + ) # TileDB type + assert_array_equal(A[:]["a"], np.asarray(blob_data, dtype=np.bytes_)) + + @pytest.mark.parametrize("sparse", [True, False]) + def test_var_blob_attribute(self, sparse): + path = self.path("test_var_blob") + dom = tiledb.Domain( + tiledb.Dim(name="d", domain=(1, 4), tile=1, dtype=np.uint32) + ) + attrs = [tiledb.Attr(name="a", dtype="blob", var=True)] + + schema = tiledb.ArraySchema(domain=dom, attrs=attrs, sparse=sparse) + tiledb.Array.create(path, schema) + + # Variable-length blob attribute can store different sized blobs per cell + blob_data = [b"a", b"bb", b"ccc", b"dddd"] + + with tiledb.open(path, "w") as A: + if sparse: + A[np.arange(1, 5)] = blob_data + else: + A[:] = blob_data + + with tiledb.open(path, "r") as A: + assert A.schema.nattr == 1 + assert A.schema.attr("a").isvar + assert A.schema.attr("a").dtype == np.dtype("|S0") # numpy representation + assert ( + A.schema.attr("a")._tiledb_dtype == tiledb.libtiledb.DataType.BLOB + ) # TileDB type + assert_array_equal(A[:]["a"], np.array(blob_data, dtype=np.bytes_)) + def test_modify_attribute_in_schema(self): path = self.path("test_modify_attribute_in_schema") tiledb.from_numpy(path, np.random.rand(10))