
Commit de66dae

Authored by: mraspaud, dcherian, pre-commit-ci[bot], max-sixty, Illviljan
Implement preferred_chunks for netcdf 4 backends (#7948)
* Write failing test
* Add preferred chunks to netcdf 4 backends
* Add unit tests for preferred chunking
* Fix formatting
* Require dask for a couple of chunking tests
* Use xarray's interface to create a test chunked nc file
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* Fix type annotations
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* Import Generator
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* Use roundtrip
* Add news about the new feature
* Update xarray/tests/test_backends.py
* Update xarray/tests/test_backends.py
* Move whats new line

---------

Co-authored-by: Deepak Cherian <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Maximilian Roos <[email protected]>
Co-authored-by: Illviljan <[email protected]>
1 parent: 3edd997
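What this commit enables, in short: netCDF variables that are chunked on disk now expose that layout through encoding["preferred_chunks"], and xarray's chunked-open modes honour it. A minimal sketch of the new behaviour, assuming dask is installed and the netcdf4 engine is available; the file name and sizes are illustrative only:

# Sketch of the behaviour added by this commit; "chunked.nc" is a
# hypothetical file created here just for illustration.
import numpy as np
import xarray as xr

ds = xr.Dataset({"image": (("y", "x"), np.zeros((1000, 1000), dtype=np.int16))})
ds["image"].encoding = {"chunksizes": (100, 100)}  # chunk the variable on disk
ds.to_netcdf("chunked.nc", engine="netcdf4")

with xr.open_dataset("chunked.nc", chunks={}) as opened:
    # The backend now reports the on-disk chunking ...
    print(opened["image"].encoding["preferred_chunks"])  # {'y': 100, 'x': 100}
    # ... and chunks={} adopts it as the default dask chunking.
    print(opened["image"].data.chunks)  # ((100,) * 10, (100,) * 10)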

File tree: 4 files changed, +82 −1 lines changed

doc/whats-new.rst

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ New Features
   different collections of coordinates prior to assign them to a Dataset or
   DataArray (:pull:`8102`) at once.
   By `Benoît Bovy <https://github.com/benbovy>`_.
+- Provide `preferred_chunks` for data read from netcdf files (:issue:`1440`, :pull:`7948`)

 Breaking changes
 ~~~~~~~~~~~~~~~~

xarray/backends/h5netcdf_.py

Lines changed: 2 additions & 0 deletions

@@ -198,6 +198,8 @@ def open_store_variable(self, name, var):
             "fletcher32": var.fletcher32,
             "shuffle": var.shuffle,
         }
+        if var.chunks:
+            encoding["preferred_chunks"] = dict(zip(var.dimensions, var.chunks))
         # Convert h5py-style compression options to NetCDF4-Python
         # style, if possible
         if var.compression == "gzip":
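The added mapping pairs each dimension name with its on-disk chunk length; h5netcdf reports var.chunks as None for contiguous variables, hence the `if var.chunks:` guard. With hypothetical values:

# Illustration of the added line, using made-up values.
dimensions = ("t", "y", "x")
chunks = (1, 1000, 10)
print(dict(zip(dimensions, chunks)))  # {'t': 1, 'y': 1000, 'x': 10}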

xarray/backends/netCDF4_.py

Lines changed: 1 addition & 0 deletions

@@ -426,6 +426,7 @@ def open_store_variable(self, name, var):
         else:
             encoding["contiguous"] = False
             encoding["chunksizes"] = tuple(chunking)
+            encoding["preferred_chunks"] = dict(zip(var.dimensions, chunking))
         # TODO: figure out how to round-trip "endian-ness" without raising
         # warnings from netCDF4
         # encoding['endian'] = var.endian()
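Here `chunking` comes from netCDF4-python's Variable.chunking(), which returns the string "contiguous" or a per-dimension list of chunk sizes, so the new key only appears for variables chunked on disk. One way to see the resulting encoding, reusing the hypothetical chunked.nc from the earlier sketch:

# Sketch: inspect the chunk-related encoding keys after this change.
import xarray as xr

with xr.open_dataset("chunked.nc", engine="netcdf4") as ds:
    enc = ds["image"].encoding
    print(enc["contiguous"])        # False: the variable is chunked on disk
    print(enc["chunksizes"])        # (100, 100)
    print(enc["preferred_chunks"])  # {'y': 100, 'x': 100} (new in this commit)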

xarray/tests/test_backends.py

Lines changed: 78 additions & 1 deletion

@@ -13,7 +13,7 @@
 import tempfile
 import uuid
 import warnings
-from collections.abc import Iterator
+from collections.abc import Generator, Iterator
 from contextlib import ExitStack
 from io import BytesIO
 from os import listdir

@@ -1536,6 +1536,83 @@ def test_keep_chunksizes_if_no_original_shape(self) -> None:
                 ds["x"].encoding["chunksizes"], actual["x"].encoding["chunksizes"]
             )

+    def test_preferred_chunks_is_present(self) -> None:
+        ds = Dataset({"x": [1, 2, 3]})
+        chunksizes = (2,)
+        ds.variables["x"].encoding = {"chunksizes": chunksizes}
+
+        with self.roundtrip(ds) as actual:
+            assert actual["x"].encoding["preferred_chunks"] == {"x": 2}
+
+    @requires_dask
+    def test_auto_chunking_is_based_on_disk_chunk_sizes(self) -> None:
+        x_size = y_size = 1000
+        y_chunksize = y_size
+        x_chunksize = 10
+
+        with dask.config.set({"array.chunk-size": "100KiB"}):
+            with self.chunked_roundtrip(
+                (1, y_size, x_size),
+                (1, y_chunksize, x_chunksize),
+                open_kwargs={"chunks": "auto"},
+            ) as ds:
+                t_chunks, y_chunks, x_chunks = ds["image"].data.chunks
+                assert all(np.asanyarray(y_chunks) == y_chunksize)
+                # Check that the chunk size is a multiple of the file chunk size
+                assert all(np.asanyarray(x_chunks) % x_chunksize == 0)
+
+    @requires_dask
+    def test_base_chunking_uses_disk_chunk_sizes(self) -> None:
+        x_size = y_size = 1000
+        y_chunksize = y_size
+        x_chunksize = 10
+
+        with self.chunked_roundtrip(
+            (1, y_size, x_size),
+            (1, y_chunksize, x_chunksize),
+            open_kwargs={"chunks": {}},
+        ) as ds:
+            for chunksizes, expected in zip(
+                ds["image"].data.chunks, (1, y_chunksize, x_chunksize)
+            ):
+                assert all(np.asanyarray(chunksizes) == expected)
+
+    @contextlib.contextmanager
+    def chunked_roundtrip(
+        self,
+        array_shape: tuple[int, int, int],
+        chunk_sizes: tuple[int, int, int],
+        open_kwargs: dict[str, Any] | None = None,
+    ) -> Generator[Dataset, None, None]:
+        t_size, y_size, x_size = array_shape
+        t_chunksize, y_chunksize, x_chunksize = chunk_sizes
+
+        image = xr.DataArray(
+            np.arange(t_size * x_size * y_size, dtype=np.int16).reshape(
+                (t_size, y_size, x_size)
+            ),
+            dims=["t", "y", "x"],
+        )
+        image.encoding = {"chunksizes": (t_chunksize, y_chunksize, x_chunksize)}
+        dataset = xr.Dataset(dict(image=image))
+
+        with self.roundtrip(dataset, open_kwargs=open_kwargs) as ds:
+            yield ds
+
+    def test_preferred_chunks_are_disk_chunk_sizes(self) -> None:
+        x_size = y_size = 1000
+        y_chunksize = y_size
+        x_chunksize = 10
+
+        with self.chunked_roundtrip(
+            (1, y_size, x_size), (1, y_chunksize, x_chunksize)
+        ) as ds:
+            assert ds["image"].encoding["preferred_chunks"] == {
+                "t": 1,
+                "y": y_chunksize,
+                "x": x_chunksize,
+            }
+
     def test_encoding_chunksizes_unlimited(self) -> None:
         # regression test for GH1225
         ds = Dataset({"x": [1, 2, 3], "y": ("x", [2, 3, 4])})
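The chunks="auto" case above is worth a standalone illustration: dask still targets its configured chunk size, but chunk boundaries stay on multiples of the on-disk chunks. A sketch under the same assumptions as the tests (dask installed, netCDF4 backend); the file name is hypothetical:

# Standalone sketch mirroring test_auto_chunking_is_based_on_disk_chunk_sizes.
import dask
import numpy as np
import xarray as xr

image = xr.DataArray(np.zeros((1, 1000, 1000), dtype=np.int16), dims=["t", "y", "x"])
image.encoding = {"chunksizes": (1, 1000, 10)}  # on-disk chunking
xr.Dataset({"image": image}).to_netcdf("auto_chunks.nc")

with dask.config.set({"array.chunk-size": "100KiB"}):
    with xr.open_dataset("auto_chunks.nc", chunks="auto") as ds:
        _, y_chunks, x_chunks = ds["image"].data.chunks
        assert all(c == 1000 for c in y_chunks)    # full rows, as stored
        assert all(c % 10 == 0 for c in x_chunks)  # multiples of the disk chunk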
