Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

10x speedup loading Zarrs by using chunks=auto! #459

Merged
Merged 1 commit on Nov 18, 2021.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion nowcasting_dataset/data_sources/nwp/nwp_data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,11 @@ def open_nwp(zarr_path: str, consolidated: bool) -> xr.DataArray:
_LOG.debug("Opening NWP data: %s", zarr_path)
utils.set_fsspec_for_multiprocess()
nwp = xr.open_dataset(
- zarr_path, engine="zarr", consolidated=consolidated, mode="r", chunks=None
+ zarr_path,
+ engine="zarr",
+ consolidated=consolidated,
+ mode="r",
+ chunks="auto", # See issue #456 for why we use "auto".
)

# Select the "UKV" DataArray from the "nwp" Dataset.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -233,18 +233,12 @@ def open_sat_data(zarr_path: str, consolidated: bool) -> xr.DataArray:
"""
_LOG.debug("Opening satellite data: %s", zarr_path)

- # We load using chunks=None so xarray *doesn't* use Dask to
- # load the Zarr chunks from disk. Using Dask to load the data
- # seems to slow things down a lot if the Zarr store has more than
- # about a million chunks.
- # See https://github.com/openclimatefix/nowcasting_dataset/issues/23

# If we are opening multiple Zarr stores (i.e. one for each month of the year) we load them
# together and create a single dataset from them. open_mfdataset also works if zarr_path
# points to a specific zarr directory (with no wildcards).
dataset = xr.open_mfdataset(
zarr_path,
- chunks=None,
+ chunks="auto", # See issue #456 for why we use "auto".
mode="r",
engine="zarr",
concat_dim="time",
Expand Down