diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 32e342d..b927d42 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,7 +21,7 @@ repos:
         language: system
         pass_filenames: false
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.7.2
+    rev: v0.11.9
     hooks:
       - id: ruff
         args: ["--fix"]
diff --git a/Cargo.toml b/Cargo.toml
index 8b9c944..b61cb02 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,7 +10,7 @@ crate-type = ["cdylib", "rlib"]
 
 [dependencies]
 pyo3 = { version = "0.23.2", features = ["abi3-py311"] }
-zarrs = { version = "0.19.0", features = ["async"] }
+zarrs = { version = "0.20.0", features = ["async", "zlib", "pcodec", "bz2"] }
 rayon_iter_concurrent_limit = "0.2.0"
 rayon = "1.10.0"
 # fix for https://stackoverflow.com/questions/76593417/package-openssl-was-not-found-in-the-pkg-config-search-path
@@ -19,10 +19,9 @@ numpy = "0.23.0"
 unsafe_cell_slice = "0.2.0"
 serde_json = "1.0.128"
 pyo3-stub-gen = "0.7.0"
-opendal = { version = "0.51.0", features = ["services-http"] }
+opendal = { version = "0.53.0", features = ["services-http"] }
 tokio = { version = "1.41.1", features = ["rt-multi-thread"] }
-zarrs_opendal = "0.5.0"
-zarrs_metadata = "0.3.7" # require recent zarr-python compatibility fixes (remove with zarrs 0.20)
+zarrs_opendal = "0.7.2"
 itertools = "0.9.0"
 
 [profile.release]
diff --git a/pyproject.toml b/pyproject.toml
index e24f777..89556af 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -100,7 +100,7 @@ select = [
     "W", # Warning detected by Pycodestyle
     "UP", # pyupgrade
     "I", # isort
-    "TCH", # manage type checking blocks
+    "TC", # manage type checking blocks
     "TID251", # Banned imports
     "ICN", # Follow import conventions
     "PTH", # Pathlib instead of os.path
diff --git a/src/chunk_item.rs b/src/chunk_item.rs
index 7bdea4b..feac5cb 100644
--- a/src/chunk_item.rs
+++ b/src/chunk_item.rs
@@ -10,7 +10,7 @@ use pyo3_stub_gen::derive::{gen_stub_pyclass, gen_stub_pymethods};
 use zarrs::{
     array::{ChunkRepresentation, DataType, FillValue},
     array_subset::ArraySubset,
-    metadata::v3::{array::data_type::DataTypeMetadataV3, MetadataV3},
+    metadata::v3::MetadataV3,
     storage::StoreKey,
 };
 
@@ -146,9 +146,11 @@ fn get_chunk_representation(
     fill_value: Vec<u8>,
 ) -> PyResult<ChunkRepresentation> {
     // Get the chunk representation
-    let data_type =
-        DataType::from_metadata(&DataTypeMetadataV3::from_metadata(&MetadataV3::new(dtype)))
-            .map_py_err::<PyRuntimeError>()?;
+    let data_type = DataType::from_metadata(
+        &MetadataV3::new(dtype),
+        zarrs::config::global_config().data_type_aliases_v3(),
+    )
+    .map_py_err::<PyRuntimeError>()?;
     let chunk_shape = chunk_shape
         .into_iter()
         .map(|x| NonZeroU64::new(x).expect("chunk shapes should always be non-zero"))
diff --git a/src/lib.rs b/src/lib.rs
index a38be9b..c750f78 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -22,7 +22,8 @@ use zarrs::array::codec::{
     ArrayPartialDecoderTraits, ArrayToBytesCodecTraits, CodecOptions, CodecOptionsBuilder,
 };
 use zarrs::array::{
-    copy_fill_value_into, update_array_bytes, ArrayBytes, ArraySize, CodecChain, FillValue,
+    copy_fill_value_into, update_array_bytes, ArrayBytes, ArrayBytesFixedDisjointView, ArraySize,
+    CodecChain, FillValue,
 };
 use zarrs::array_subset::ArraySubset;
 use zarrs::metadata::v3::MetadataV3;
@@ -114,7 +115,7 @@ impl CodecPipelineImpl {
         codec_options: &CodecOptions,
     ) -> PyResult<()> {
         let array_shape = item.representation().shape_u64();
-        if !chunk_subset.inbounds(&array_shape) {
+        if !chunk_subset.inbounds_shape(&array_shape) {
             return Err(PyErr::new::<PyIndexError, _>(format!(
                 "chunk subset ({chunk_subset}) is out of bounds for array shape ({array_shape:?})"
             )));
@@ -134,20 +135,14 @@ impl CodecPipelineImpl {
         let chunk_bytes_old = self.retrieve_chunk_bytes(item, codec_chain, codec_options)?;
 
         // Update the chunk
-        let chunk_bytes_new = unsafe {
-            // SAFETY:
-            // - chunk_bytes_old is compatible with the chunk shape and data type size (validated on decoding)
-            // - chunk_subset is compatible with chunk_subset_bytes and the data type size (validated above)
-            // - chunk_subset is within the bounds of the chunk shape (validated above)
-            // - output bytes and output subset bytes are compatible (same data type)
-            update_array_bytes(
-                chunk_bytes_old,
-                &array_shape,
-                chunk_subset,
-                &chunk_subset_bytes,
-                data_type_size,
-            )
-        };
+        let chunk_bytes_new = update_array_bytes(
+            chunk_bytes_old,
+            &array_shape,
+            chunk_subset,
+            &chunk_subset_bytes,
+            data_type_size,
+        )
+        .map_py_err::<PyRuntimeError>()?;
 
         // Store the updated chunk
         self.store_chunk_bytes(item, codec_chain, chunk_bytes_new, codec_options)
@@ -279,8 +274,8 @@ impl CodecPipelineImpl {
             .unique_by(|item| item.key())
             .collect::<Vec<_>>();
         let mut partial_decoder_cache: HashMap<StoreKey, Arc<dyn ArrayPartialDecoderTraits>> =
-            HashMap::new().into();
-        if partial_chunk_descriptions.len() > 0 {
+            HashMap::new();
+        if !partial_chunk_descriptions.is_empty() {
             let key_decoder_pairs = iter_concurrent_limit!(
                 chunk_concurrent_limit,
                 partial_chunk_descriptions,
@@ -308,59 +303,61 @@ impl CodecPipelineImpl {
         // For variable length data types, need a codepath with non `_into` methods.
         // Collect all the subsets and copy into value on the Python side?
         let update_chunk_subset = |item: chunk_item::WithSubset| {
+            let chunk_item::WithSubset {
+                item,
+                subset,
+                chunk_subset,
+            } = item;
+            let mut output_view = unsafe {
+                // TODO: Is the following correct?
+                // can we guarantee that when this function is called from Python with arbitrary arguments?
+                // SAFETY: chunks represent disjoint array subsets
+                ArrayBytesFixedDisjointView::new(
+                    output,
+                    // TODO: why is data_type in `item`, it should be derived from `output`, no?
+                    item.representation()
+                        .data_type()
+                        .fixed_size()
+                        .ok_or("variable length data type not supported")
+                        .map_py_err::<PyRuntimeError>()?,
+                    &output_shape,
+                    subset,
+                )
+                .map_py_err::<PyRuntimeError>()?
+            };
+
             // See zarrs::array::Array::retrieve_chunk_subset_into
-            if is_whole_chunk(&item) {
+            if chunk_subset.start().iter().all(|&o| o == 0)
+                && chunk_subset.shape() == item.representation().shape_u64()
+            {
                 // See zarrs::array::Array::retrieve_chunk_into
                 if let Some(chunk_encoded) = self.stores.get(&item)? {
                     // Decode the encoded data into the output buffer
                     let chunk_encoded: Vec<u8> = chunk_encoded.into();
-                    unsafe {
-                        // SAFETY:
-                        // - output is an array with output_shape elements of the item.representation data type,
-                        // - item.subset is within the bounds of output_shape.
-                        self.codec_chain.decode_into(
-                            Cow::Owned(chunk_encoded),
-                            item.representation(),
-                            &output,
-                            &output_shape,
-                            &item.subset,
-                            &codec_options,
-                        )
-                    }
+                    self.codec_chain.decode_into(
+                        Cow::Owned(chunk_encoded),
+                        item.representation(),
+                        &mut output_view,
+                        &codec_options,
+                    )
                 } else {
                     // The chunk is missing, write the fill value
-                    unsafe {
-                        // SAFETY:
-                        // - data type and fill value are confirmed to be compatible when the ChunkRepresentation is created,
-                        // - output is an array with output_shape elements of the item.representation data type,
-                        // - item.subset is within the bounds of output_shape.
-                        copy_fill_value_into(
-                            item.representation().data_type(),
-                            item.representation().fill_value(),
-                            &output,
-                            &output_shape,
-                            &item.subset,
-                        )
-                    }
+                    copy_fill_value_into(
+                        item.representation().data_type(),
+                        item.representation().fill_value(),
+                        &mut output_view,
+                    )
                 }
             } else {
                 let key = item.key();
                 let partial_decoder = partial_decoder_cache.get(key).ok_or_else(|| {
                     PyRuntimeError::new_err(format!("Partial decoder not found for key: {key}"))
                 })?;
-                unsafe {
-                    // SAFETY:
-                    // - output is an array with output_shape elements of the item.representation data type,
-                    // - item.subset is within the bounds of output_shape.
-                    // - item.chunk_subset has the same number of elements as item.subset.
-                    partial_decoder.partial_decode_into(
-                        &item.chunk_subset,
-                        &output,
-                        &output_shape,
-                        &item.subset,
-                        &codec_options,
-                    )
-                }
+                partial_decoder.partial_decode_into(
+                    &chunk_subset,
+                    &mut output_view,
+                    &codec_options,
+                )
             }
             .map_py_err::<PyRuntimeError>()
         };
diff --git a/src/metadata_v2.rs b/src/metadata_v2.rs
index de14123..7714e57 100644
--- a/src/metadata_v2.rs
+++ b/src/metadata_v2.rs
@@ -1,7 +1,7 @@
 use pyo3::{exceptions::PyRuntimeError, pyfunction, PyErr, PyResult};
 use zarrs::metadata::{
-    v2::{array::ArrayMetadataV2Order, MetadataV2},
-    v3::array::data_type::DataTypeMetadataV3,
+    v2::{ArrayMetadataV2Order, MetadataV2},
+    v3::MetadataV3,
 };
 
 #[pyfunction]
@@ -35,13 +35,15 @@ pub fn codec_metadata_v2_to_v3(
     // FIXME: The array order, dimensionality, data type, and endianness are needed to exhaustively support all Zarr V2 data that zarrs can handle.
     // However, CodecPipeline.from_codecs does not supply this information, and CodecPipeline.evolve_from_array_spec is seemingly never called.
-    let metadata = zarrs::metadata::v2_to_v3::codec_metadata_v2_to_v3(
+    let metadata = zarrs::metadata_ext::v2_to_v3::codec_metadata_v2_to_v3(
         ArrayMetadataV2Order::C,
-        0,                         // unused with C order
-        &DataTypeMetadataV3::Bool, // FIXME
+        0,                        // unused with C order
+        &MetadataV3::new("bool"), // FIXME
         None,
         &filters,
         &compressor,
+        zarrs::config::global_config().codec_aliases_v2(),
+        zarrs::config::global_config().codec_aliases_v3(),
     )
     .map_err(|err| {
         // TODO: More informative error messages from zarrs for ArrayMetadataV2ToV3ConversionError
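
Note on the `src/chunk_item.rs` hunk: in zarrs 0.20 the intermediate `DataTypeMetadataV3` step is gone, and `DataType::from_metadata` resolves the data type name through an explicitly passed alias table. A minimal standalone sketch of the new lookup (the `"float32"` name and the printing are illustrative, not part of this diff):

```rust
use zarrs::array::DataType;
use zarrs::config::global_config;
use zarrs::metadata::v3::MetadataV3;

fn main() {
    // Resolve a Zarr V3 data type name via the global alias map,
    // mirroring the call now made in get_chunk_representation().
    let metadata = MetadataV3::new("float32"); // illustrative dtype name
    match DataType::from_metadata(&metadata, global_config().data_type_aliases_v3()) {
        Ok(data_type) => println!("resolved data type: {data_type:?}"),
        Err(err) => eprintln!("unknown data type: {err}"),
    }
}
```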
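Similarly for the store-subset path in `src/lib.rs`: `update_array_bytes` is no longer an `unsafe fn` in zarrs 0.20; it validates its arguments and returns a `Result`. A rough sketch of the safe call, assuming the function takes a `DataTypeSize` (the diff only shows an opaque `data_type_size` variable) and that `ArrayBytes::new_flen`/`ArraySubset::new_with_ranges` behave as in zarrs' public API; the 2x2 `u8` chunk is made up for illustration:

```rust
use zarrs::array::{update_array_bytes, ArrayBytes, DataTypeSize};
use zarrs::array_subset::ArraySubset;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A 2x2 chunk of u8, initialised to zero (fixed-length bytes, row-major).
    let chunk = ArrayBytes::new_flen(vec![0u8; 4]);
    // Overwrite the first row with [1, 2].
    let subset = ArraySubset::new_with_ranges(&[0..1, 0..2]);
    let subset_bytes = ArrayBytes::new_flen(vec![1u8, 2]);
    // Safe and fallible since zarrs 0.20: shape/subset mismatches come back
    // as an Err instead of being undefined behaviour behind an unsafe block.
    let updated = update_array_bytes(
        chunk,
        &[2, 2],
        &subset,
        &subset_bytes,
        DataTypeSize::Fixed(1), // assumed parameter type; the diff passes `data_type_size`
    )?;
    match updated {
        ArrayBytes::Fixed(bytes) => assert_eq!(bytes.as_ref(), [1u8, 2, 0, 0].as_slice()),
        ArrayBytes::Variable(..) => unreachable!("input was fixed-length"),
    }
    Ok(())
}
```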
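And for the new `ArrayBytesFixedDisjointView` in the decode path: the view bundles the (shape, subset, element size) bookkeeping that the old raw `&output`/`&output_shape`/`&item.subset` argument triples carried, so only the disjointness of concurrent views remains `unsafe`. A sketch of filling one column of a small buffer through such a view, assuming `UnsafeCellSlice::new` from the `unsafe_cell_slice` crate (already a dependency above) and `DataType::UInt8`/`FillValue::new` as in zarrs' API; single-threaded here purely for illustration:

```rust
use unsafe_cell_slice::UnsafeCellSlice;
use zarrs::array::{copy_fill_value_into, ArrayBytesFixedDisjointView, DataType, FillValue};
use zarrs::array_subset::ArraySubset;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A 2x2 u8 output buffer; in the pipeline this is the NumPy output array.
    let mut output = vec![0u8; 4];
    {
        let output_slice = UnsafeCellSlice::new(&mut output);
        let mut view = unsafe {
            // SAFETY: this is the only view over these bytes (the pipeline's
            // guarantee is that concurrent chunk views are disjoint).
            ArrayBytesFixedDisjointView::new(
                output_slice,
                1, // element size in bytes (u8)
                &[2, 2],
                ArraySubset::new_with_ranges(&[0..2, 0..1]), // first column
            )?
        };
        // Write the fill value through the view, as the missing-chunk path does.
        copy_fill_value_into(&DataType::UInt8, &FillValue::new(vec![7]), &mut view)?;
    }
    assert_eq!(output, [7, 0, 7, 0]);
    Ok(())
}
```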