diff --git a/doc/api.rst b/doc/api.rst index d7c2370d348..67c81aaf601 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1591,6 +1591,8 @@ Tutorial tutorial.open_dataset tutorial.load_dataset + tutorial.open_datatree + tutorial.load_datatree Testing ======= diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 05e03869553..e3022fed50d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,6 +21,8 @@ v2025.02.0 (unreleased) New Features ~~~~~~~~~~~~ +- Added :py:meth:`tutorial.open_datatree` and :py:meth:`tutorial.load_datatree` + By `Eni Awowale `_. - Added :py:meth:`DataTree.filter_like` to conveniently restructure a DataTree like another DataTree (:issue:`10096`, :pull:`10097`). By `Kobe Vandelanotte `_. - Added :py:meth:`Coordinates.from_xindex` as convenience for creating a new :py:class:`Coordinates` object diff --git a/xarray/tests/test_tutorial.py b/xarray/tests/test_tutorial.py index 9d59219c204..e271da6863b 100644 --- a/xarray/tests/test_tutorial.py +++ b/xarray/tests/test_tutorial.py @@ -1,20 +1,15 @@ from __future__ import annotations -import pytest - -from xarray import DataArray, tutorial -from xarray.tests import assert_identical, network +from xarray import DataArray, DataTree, tutorial +from xarray.testing import assert_identical +from xarray.tests import network @network class TestLoadDataset: - @pytest.fixture(autouse=True) - def setUp(self): - self.testfile = "tiny" - def test_download_from_github(self, tmp_path) -> None: cache_dir = tmp_path / tutorial._default_cache_dir_name - ds = tutorial.open_dataset(self.testfile, cache_dir=cache_dir).load() + ds = tutorial.open_dataset("tiny", cache_dir=cache_dir).load() tiny = DataArray(range(5), name="tiny").to_dataset() assert_identical(ds, tiny) @@ -24,7 +19,27 @@ def test_download_from_github_load_without_cache( cache_dir = tmp_path / tutorial._default_cache_dir_name ds_nocache = tutorial.open_dataset( - self.testfile, cache=False, cache_dir=cache_dir + "tiny", cache=False, cache_dir=cache_dir + 
).load() + ds_cache = tutorial.open_dataset("tiny", cache_dir=cache_dir).load() + assert_identical(ds_cache, ds_nocache) + + +@network +class TestLoadDataTree: + def test_download_from_github(self, tmp_path) -> None: + cache_dir = tmp_path / tutorial._default_cache_dir_name + ds = tutorial.open_datatree("tiny", cache_dir=cache_dir).load() + tiny = DataTree.from_dict({"/": DataArray(range(5), name="tiny").to_dataset()}) + assert_identical(ds, tiny) + + def test_download_from_github_load_without_cache( + self, tmp_path, monkeypatch + ) -> None: + cache_dir = tmp_path / tutorial._default_cache_dir_name + + ds_nocache = tutorial.open_datatree( + "tiny", cache=False, cache_dir=cache_dir ).load() - ds_cache = tutorial.open_dataset(self.testfile, cache_dir=cache_dir).load() + ds_cache = tutorial.open_datatree("tiny", cache_dir=cache_dir).load() assert_identical(ds_cache, ds_nocache) diff --git a/xarray/tutorial.py b/xarray/tutorial.py index cfc6a5147d3..ec832694a99 100644 --- a/xarray/tutorial.py +++ b/xarray/tutorial.py @@ -16,8 +16,10 @@ import numpy as np from xarray.backends.api import open_dataset as _open_dataset +from xarray.backends.api import open_datatree as _open_datatree from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset +from xarray.core.datatree import DataTree if TYPE_CHECKING: from xarray.backends.api import T_Engine @@ -248,3 +250,140 @@ def scatter_example_dataset(*, seed: None | int = None) -> Dataset: ds.B.attrs["units"] = "Bunits" return ds + + +def open_datatree( + name: str, + cache: bool = True, + cache_dir: None | str | os.PathLike = None, + *, + engine: T_Engine = None, + **kws, +) -> DataTree: + """ + Open a dataset as a `DataTree` from the online repository (requires internet). + + If a local copy is found then always use that to avoid network traffic. 
+ + Available datasets: + + * ``"imerghh_730"``: GPM IMERG Final Precipitation L3 Half Hourly 0.1 degree x 0.1 degree V07 from 2021-08-29T07:30:00.000Z + * ``"imerghh_830"``: GPM IMERG Final Precipitation L3 Half Hourly 0.1 degree x 0.1 degree V07 from 2021-08-29T08:30:00.000Z + * ``"air_temperature"``: NCEP reanalysis subset + * ``"air_temperature_gradient"``: NCEP reanalysis subset with approximate x,y gradients + * ``"basin_mask"``: Dataset with ocean basins marked using integers + * ``"ASE_ice_velocity"``: MEaSUREs InSAR-Based Ice Velocity of the Amundsen Sea Embayment, Antarctica, Version 1 + * ``"rasm"``: Output of the Regional Arctic System Model (RASM) + * ``"ROMS_example"``: Regional Ocean Model System (ROMS) output + * ``"tiny"``: small synthetic dataset with a 1D data variable + * ``"era5-2mt-2019-03-uk.grib"``: ERA5 temperature data over the UK + * ``"eraint_uvz"``: data from ERA-Interim reanalysis, monthly averages of upper level data + * ``"ersstv5"``: NOAA's Extended Reconstructed Sea Surface Temperature monthly averages + + Parameters + ---------- + name : str + Name of the file containing the dataset. + e.g. 'air_temperature' + cache_dir : path-like, optional + The directory in which to search for and write cached data. + cache : bool, optional + If True, then cache data locally for use on subsequent calls + **kws : dict, optional + Passed to xarray.open_datatree + + See Also + -------- + tutorial.load_datatree + open_datatree + """ + try: + import pooch + except ImportError as e: + raise ImportError( + "tutorial.open_datatree depends on pooch to download and manage datasets." + " To proceed please install pooch." 
+ ) from e + + logger = pooch.get_logger() + logger.setLevel("WARNING") + + cache_dir = _construct_cache_dir(cache_dir) + if name in external_urls: + url = external_urls[name] + else: + path = pathlib.Path(name) + if not path.suffix: + # process the name + default_extension = ".nc" + if engine is None: + _check_netcdf_engine_installed(name) + path = path.with_suffix(default_extension) + elif path.suffix == ".grib": + if engine is None: + engine = "cfgrib" + try: + import cfgrib # noqa: F401 + except ImportError as e: + raise ImportError( + "Reading this tutorial dataset requires the cfgrib package." + ) from e + + url = f"{base_url}/raw/{version}/{path.name}" + + headers = {"User-Agent": f"xarray {sys.modules['xarray'].__version__}"} + downloader = pooch.HTTPDownloader(headers=headers) + + # retrieve the file + filepath = pooch.retrieve( + url=url, known_hash=None, path=cache_dir, downloader=downloader + ) + ds = _open_datatree(filepath, engine=engine, **kws) + if not cache: + ds = ds.load() + pathlib.Path(filepath).unlink() + + return ds + + +def load_datatree(*args, **kwargs) -> DataTree: + """ + Open, load into memory (as a `DataTree`), and close a dataset from the online repository + (requires internet). + + If a local copy is found then always use that to avoid network traffic. 
+ + Available datasets: + + * ``"imerghh_730"``: GPM IMERG Final Precipitation L3 Half Hourly 0.1 degree x 0.1 degree V07 from 2021-08-29T07:30:00.000Z + * ``"imerghh_830"``: GPM IMERG Final Precipitation L3 Half Hourly 0.1 degree x 0.1 degree V07 from 2021-08-29T08:30:00.000Z + * ``"air_temperature"``: NCEP reanalysis subset + * ``"air_temperature_gradient"``: NCEP reanalysis subset with approximate x,y gradients + * ``"basin_mask"``: Dataset with ocean basins marked using integers + * ``"ASE_ice_velocity"``: MEaSUREs InSAR-Based Ice Velocity of the Amundsen Sea Embayment, Antarctica, Version 1 + * ``"rasm"``: Output of the Regional Arctic System Model (RASM) + * ``"ROMS_example"``: Regional Ocean Model System (ROMS) output + * ``"tiny"``: small synthetic dataset with a 1D data variable + * ``"era5-2mt-2019-03-uk.grib"``: ERA5 temperature data over the UK + * ``"eraint_uvz"``: data from ERA-Interim reanalysis, monthly averages of upper level data + * ``"ersstv5"``: NOAA's Extended Reconstructed Sea Surface Temperature monthly averages + + Parameters + ---------- + name : str + Name of the file containing the dataset. + e.g. 'air_temperature' + cache_dir : path-like, optional + The directory in which to search for and write cached data. + cache : bool, optional + If True, then cache data locally for use on subsequent calls + **kws : dict, optional + Passed to xarray.open_datatree + + See Also + -------- + tutorial.open_datatree + open_datatree + """ + with open_datatree(*args, **kwargs) as ds: + return ds.load()