Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ repos:
language: system
pass_filenames: false
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.7.2
rev: v0.11.9
hooks:
- id: ruff
args: ["--fix"]
Expand Down
7 changes: 3 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ crate-type = ["cdylib", "rlib"]

[dependencies]
pyo3 = { version = "0.23.2", features = ["abi3-py311"] }
zarrs = { version = "0.19.0", features = ["async"] }
zarrs = { version = "0.20.0", features = ["async", "zlib", "pcodec", "bz2"] }
rayon_iter_concurrent_limit = "0.2.0"
rayon = "1.10.0"
# fix for https://stackoverflow.com/questions/76593417/package-openssl-was-not-found-in-the-pkg-config-search-path
Expand All @@ -19,10 +19,9 @@ numpy = "0.23.0"
unsafe_cell_slice = "0.2.0"
serde_json = "1.0.128"
pyo3-stub-gen = "0.7.0"
opendal = { version = "0.51.0", features = ["services-http"] }
opendal = { version = "0.53.0", features = ["services-http"] }
tokio = { version = "1.41.1", features = ["rt-multi-thread"] }
zarrs_opendal = "0.5.0"
zarrs_metadata = "0.3.7" # require recent zarr-python compatibility fixes (remove with zarrs 0.20)
zarrs_opendal = "0.7.2"
itertools = "0.9.0"

[profile.release]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ select = [
"W", # Warning detected by Pycodestyle
"UP", # pyupgrade
"I", # isort
"TCH", # manage type checking blocks
"TC", # manage type checking blocks
"TID251", # Banned imports
"ICN", # Follow import conventions
"PTH", # Pathlib instead of os.path
Expand Down
10 changes: 6 additions & 4 deletions src/chunk_item.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use pyo3_stub_gen::derive::{gen_stub_pyclass, gen_stub_pymethods};
use zarrs::{
array::{ChunkRepresentation, DataType, FillValue},
array_subset::ArraySubset,
metadata::v3::{array::data_type::DataTypeMetadataV3, MetadataV3},
metadata::v3::MetadataV3,
storage::StoreKey,
};

Expand Down Expand Up @@ -146,9 +146,11 @@ fn get_chunk_representation(
fill_value: Vec<u8>,
) -> PyResult<ChunkRepresentation> {
// Get the chunk representation
let data_type =
DataType::from_metadata(&DataTypeMetadataV3::from_metadata(&MetadataV3::new(dtype)))
.map_py_err::<PyRuntimeError>()?;
let data_type = DataType::from_metadata(
&MetadataV3::new(dtype),
zarrs::config::global_config().data_type_aliases_v3(),
)
.map_py_err::<PyRuntimeError>()?;
let chunk_shape = chunk_shape
.into_iter()
.map(|x| NonZeroU64::new(x).expect("chunk shapes should always be non-zero"))
Expand Down
113 changes: 55 additions & 58 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ use zarrs::array::codec::{
ArrayPartialDecoderTraits, ArrayToBytesCodecTraits, CodecOptions, CodecOptionsBuilder,
};
use zarrs::array::{
copy_fill_value_into, update_array_bytes, ArrayBytes, ArraySize, CodecChain, FillValue,
copy_fill_value_into, update_array_bytes, ArrayBytes, ArrayBytesFixedDisjointView, ArraySize,
CodecChain, FillValue,
};
use zarrs::array_subset::ArraySubset;
use zarrs::metadata::v3::MetadataV3;
Expand Down Expand Up @@ -114,7 +115,7 @@ impl CodecPipelineImpl {
codec_options: &CodecOptions,
) -> PyResult<()> {
let array_shape = item.representation().shape_u64();
if !chunk_subset.inbounds(&array_shape) {
if !chunk_subset.inbounds_shape(&array_shape) {
return Err(PyErr::new::<PyValueError, _>(format!(
"chunk subset ({chunk_subset}) is out of bounds for array shape ({array_shape:?})"
)));
Expand All @@ -134,20 +135,14 @@ impl CodecPipelineImpl {
let chunk_bytes_old = self.retrieve_chunk_bytes(item, codec_chain, codec_options)?;

// Update the chunk
let chunk_bytes_new = unsafe {
// SAFETY:
// - chunk_bytes_old is compatible with the chunk shape and data type size (validated on decoding)
// - chunk_subset is compatible with chunk_subset_bytes and the data type size (validated above)
// - chunk_subset is within the bounds of the chunk shape (validated above)
// - output bytes and output subset bytes are compatible (same data type)
update_array_bytes(
chunk_bytes_old,
&array_shape,
chunk_subset,
&chunk_subset_bytes,
data_type_size,
)
};
let chunk_bytes_new = update_array_bytes(
chunk_bytes_old,
&array_shape,
chunk_subset,
&chunk_subset_bytes,
data_type_size,
)
.map_py_err::<PyRuntimeError>()?;

// Store the updated chunk
self.store_chunk_bytes(item, codec_chain, chunk_bytes_new, codec_options)
Expand Down Expand Up @@ -279,8 +274,8 @@ impl CodecPipelineImpl {
.unique_by(|item| item.key())
.collect::<Vec<_>>();
let mut partial_decoder_cache: HashMap<StoreKey, Arc<dyn ArrayPartialDecoderTraits>> =
HashMap::new().into();
if partial_chunk_descriptions.len() > 0 {
HashMap::new();
if !partial_chunk_descriptions.is_empty() {
let key_decoder_pairs = iter_concurrent_limit!(
chunk_concurrent_limit,
partial_chunk_descriptions,
Expand Down Expand Up @@ -308,59 +303,61 @@ impl CodecPipelineImpl {
// For variable length data types, need a codepath with non `_into` methods.
// Collect all the subsets and copy into value on the Python side?
let update_chunk_subset = |item: chunk_item::WithSubset| {
let chunk_item::WithSubset {
item,
subset,
chunk_subset,
} = item;
let mut output_view = unsafe {
// TODO: Is the following correct?
// can we guarantee that when this function is called from Python with arbitrary arguments?
// SAFETY: chunks represent disjoint array subsets
Comment on lines +312 to +314
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is the big question: is it a good idea to blindly trust subsets coming from Python?

Copy link
Collaborator Author

@flying-sheep flying-sheep Feb 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, one option would be to add a _unsafe_skip_validation parameter (default False, obviously).

When we call the function with chunks from the zarr Python lib, we set _unsafe_skip_validation=True because we know we can trust zarr, but users that are tempted to use the CodecPipeline directly need to set it to get the speed boost of not validating the chunks.

When they do set a parameter called _unsafe_..., it’s on them to use it correctly.

But I don‘t think anyone should offer a regular Python API that can cause segfaults when used wrong.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but users that are tempted to use the CodecPipeline

If I understand you right, I think we explicitly say not to instantiate your own pipeline class or use it as an object.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is the big question: is it a good idea to blindly trust subsets coming from Python?

Looks like we can't rely on the subsets being disjoint. zarr-developers/zarr-python#2851 (comment). Based on that comment, I suppose we would just have to iterate over overlapping subsets sequentially to match numpy-like behaviour.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So zarrs-python is currently unsound, lovely! Good that we caught that. I added an issue: #89

Should we first merge this PR (which would make fixing the issue easier) or will it take time until zarrs 0.20 is released?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So zarrs-python is currently unsound, lovely!

Why is this true? The previous unsafe comments seem to be different than what's being discussed here. Also as Lachlan said in the issue, it's possible that zarr-python will fix this issue so that the safety assumption here would be correct.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I could get a release out soon, but maybe hold off on merging for now. Just in case we need any more hotfixes for zarr-python changes in the meantime.

Copy link
Collaborator Author

@flying-sheep flying-sheep Feb 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this true?

unless I misunderstood @LDeakin, because

Looks like we can't rely on the subsets being disjoint

which I interpreted as “the chunks coming from zarr aren‘t necessarily nonoverlapping”.

if that’s correct, our current behavior

  1. is unsound, as our parallel writers can end up simultaneously writing the same memory regions, which is UB
  2. even if we used fine-grained locking to avoid UB, we wouldn’t guarantee that the last data written is the one zarr expects us to write, so we’d still be wrong.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, ok, understood. I remember our discussion in the kitchen a few weeks ago now. Got it.

ArrayBytesFixedDisjointView::new(
output,
// TODO: why is data_type in `item`, it should be derived from `output`, no?
item.representation()
.data_type()
.fixed_size()
.ok_or("variable length data type not supported")
.map_py_err::<PyTypeError>()?,
Comment on lines +317 to +322
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another question: Each individual item having its own data type makes no sense.

We should probably pass in the data type only once. If we can rely on the output having the correct one, that would be easy, otherwise, we could make chunk_descriptions a struct containing the dtype and the chunk items.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #90

&output_shape,
subset,
)
.map_py_err::<PyRuntimeError>()?
};

// See zarrs::array::Array::retrieve_chunk_subset_into
if is_whole_chunk(&item) {
if chunk_subset.start().iter().all(|&o| o == 0)
&& chunk_subset.shape() == item.representation().shape_u64()
{
// See zarrs::array::Array::retrieve_chunk_into
if let Some(chunk_encoded) = self.stores.get(&item)? {
// Decode the encoded data into the output buffer
let chunk_encoded: Vec<u8> = chunk_encoded.into();
unsafe {
// SAFETY:
// - output is an array with output_shape elements of the item.representation data type,
// - item.subset is within the bounds of output_shape.
self.codec_chain.decode_into(
Cow::Owned(chunk_encoded),
item.representation(),
&output,
&output_shape,
&item.subset,
&codec_options,
)
}
self.codec_chain.decode_into(
Cow::Owned(chunk_encoded),
item.representation(),
&mut output_view,
&codec_options,
)
} else {
// The chunk is missing, write the fill value
unsafe {
// SAFETY:
// - data type and fill value are confirmed to be compatible when the ChunkRepresentation is created,
// - output is an array with output_shape elements of the item.representation data type,
// - item.subset is within the bounds of output_shape.
copy_fill_value_into(
item.representation().data_type(),
item.representation().fill_value(),
&output,
&output_shape,
&item.subset,
)
}
copy_fill_value_into(
item.representation().data_type(),
item.representation().fill_value(),
&mut output_view,
)
}
} else {
let key = item.key();
let partial_decoder = partial_decoder_cache.get(key).ok_or_else(|| {
PyRuntimeError::new_err(format!("Partial decoder not found for key: {key}"))
})?;
unsafe {
// SAFETY:
// - output is an array with output_shape elements of the item.representation data type,
// - item.subset is within the bounds of output_shape.
// - item.chunk_subset has the same number of elements as item.subset.
partial_decoder.partial_decode_into(
&item.chunk_subset,
&output,
&output_shape,
&item.subset,
&codec_options,
)
}
partial_decoder.partial_decode_into(
&chunk_subset,
&mut output_view,
&codec_options,
)
}
.map_py_err::<PyValueError>()
};
Expand Down
12 changes: 7 additions & 5 deletions src/metadata_v2.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use pyo3::{exceptions::PyRuntimeError, pyfunction, PyErr, PyResult};
use zarrs::metadata::{
v2::{array::ArrayMetadataV2Order, MetadataV2},
v3::array::data_type::DataTypeMetadataV3,
v2::{ArrayMetadataV2Order, MetadataV2},
v3::MetadataV3,
};

#[pyfunction]
Expand Down Expand Up @@ -35,13 +35,15 @@ pub fn codec_metadata_v2_to_v3(

// FIXME: The array order, dimensionality, data type, and endianness are needed to exhaustively support all Zarr V2 data that zarrs can handle.
// However, CodecPipeline.from_codecs does not supply this information, and CodecPipeline.evolve_from_array_spec is seemingly never called.
let metadata = zarrs::metadata::v2_to_v3::codec_metadata_v2_to_v3(
let metadata = zarrs::metadata_ext::v2_to_v3::codec_metadata_v2_to_v3(
ArrayMetadataV2Order::C,
0, // unused with C order
&DataTypeMetadataV3::Bool, // FIXME
0, // unused with C order
&MetadataV3::new("bool"), // FIXME
None,
&filters,
&compressor,
zarrs::config::global_config().codec_aliases_v2(),
zarrs::config::global_config().codec_aliases_v3(),
)
.map_err(|err| {
// TODO: More informative error messages from zarrs for ArrayMetadataV2ToV3ConversionError
Expand Down
Loading