From 9a423796d6ccf78029a4f403c9e753d496277704 Mon Sep 17 00:00:00 2001 From: Olufunke Awowale Date: Thu, 27 Feb 2025 14:35:59 -0500 Subject: [PATCH 1/5] added tutorial.open_datatree and tutorial.load_datatree --- xarray/tests/test_tutorial.py | 41 +++++++--- xarray/tutorial.py | 138 ++++++++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+), 10 deletions(-) diff --git a/xarray/tests/test_tutorial.py b/xarray/tests/test_tutorial.py index 9d59219c204..f26a6ad8110 100644 --- a/xarray/tests/test_tutorial.py +++ b/xarray/tests/test_tutorial.py @@ -2,29 +2,50 @@ import pytest -from xarray import DataArray, tutorial +from xarray import DataArray, DataTree, tutorial from xarray.tests import assert_identical, network +@pytest.fixture(autouse=True) +def setUp(name="testfile"): + yield "tiny" + + @network class TestLoadDataset: - @pytest.fixture(autouse=True) - def setUp(self): - self.testfile = "tiny" - - def test_download_from_github(self, tmp_path) -> None: + def test_download_from_github(self, testfile, tmp_path) -> None: cache_dir = tmp_path / tutorial._default_cache_dir_name - ds = tutorial.open_dataset(self.testfile, cache_dir=cache_dir).load() + ds = tutorial.open_dataset(testfile, cache_dir=cache_dir).load() tiny = DataArray(range(5), name="tiny").to_dataset() assert_identical(ds, tiny) def test_download_from_github_load_without_cache( - self, tmp_path, monkeypatch + self, testfile, tmp_path, monkeypatch ) -> None: cache_dir = tmp_path / tutorial._default_cache_dir_name ds_nocache = tutorial.open_dataset( - self.testfile, cache=False, cache_dir=cache_dir + testfile, cache=False, cache_dir=cache_dir + ).load() + ds_cache = tutorial.open_dataset(testfile, cache_dir=cache_dir).load() + assert_identical(ds_cache, ds_nocache) + + +@network +class TestLoadDataTree: + def test_download_from_github(self, testfile, tmp_path) -> None: + cache_dir = tmp_path / tutorial._default_cache_dir_name + ds = tutorial.open_datatree(testfile, cache_dir=cache_dir).load() + tiny 
= DataTree.from_dict({"/": DataArray(range(5), name="tiny").to_dataset()}) + assert_identical(ds, tiny) + + def test_download_from_github_load_without_cache( + self, testfile, tmp_path, monkeypatch + ) -> None: + cache_dir = tmp_path / tutorial._default_cache_dir_name + + ds_nocache = tutorial.open_datatree( + testfile, cache=False, cache_dir=cache_dir ).load() - ds_cache = tutorial.open_dataset(self.testfile, cache_dir=cache_dir).load() + ds_cache = tutorial.open_datatree(testfile, cache_dir=cache_dir).load() assert_identical(ds_cache, ds_nocache) diff --git a/xarray/tutorial.py b/xarray/tutorial.py index cfc6a5147d3..88ee026eccc 100644 --- a/xarray/tutorial.py +++ b/xarray/tutorial.py @@ -16,8 +16,10 @@ import numpy as np from xarray.backends.api import open_dataset as _open_dataset +from xarray.backends.api import open_datatree as _open_datatree from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset +from xarray.core.datatree import DataTree if TYPE_CHECKING: from xarray.backends.api import T_Engine @@ -248,3 +250,139 @@ def scatter_example_dataset(*, seed: None | int = None) -> Dataset: ds.B.attrs["units"] = "Bunits" return ds + + +def open_datatree( + name: str, + cache: bool = True, + cache_dir: None | str | os.PathLike = None, + *, + engine: T_Engine = None, + **kws, +) -> DataTree: + """ + Open a dataset as a `DataTree` from the online repository (requires internet). + + If a local copy is found then always use that to avoid network traffic. 
+ + Available datasets: + * ``imerghh_730.HDF5`` IMERGHH_07 product from 2021-08-29T07:30:00.000Z + * ``imerghh_830.HDF5`` IMERGHH_07 product from 2021-08-29T08:30:00.000Z + * ``"air_temperature"``: NCEP reanalysis subset + * ``"air_temperature_gradient"``: NCEP reanalysis subset with approximate x,y gradients + * ``"basin_mask"``: Dataset with ocean basins marked using integers + * ``"ASE_ice_velocity"``: MEaSUREs InSAR-Based Ice Velocity of the Amundsen Sea Embayment, Antarctica, Version 1 + * ``"rasm"``: Output of the Regional Arctic System Model (RASM) + * ``"ROMS_example"``: Regional Ocean Model System (ROMS) output + * ``"tiny"``: small synthetic dataset with a 1D data variable + * ``"era5-2mt-2019-03-uk.grib"``: ERA5 temperature data over the UK + * ``"eraint_uvz"``: data from ERA-Interim reanalysis, monthly averages of upper level data + * ``"ersstv5"``: NOAA's Extended Reconstructed Sea Surface Temperature monthly averages + + Parameters + ---------- + name : str + Name of the file containing the dataset. + e.g. 'air_temperature' + cache_dir : path-like, optional + The directory in which to search for and write cached data. + cache : bool, optional + If True, then cache data locally for use on subsequent calls + **kws : dict, optional + Passed to xarray.open_datatree + + See Also + -------- + tutorial.load_datatree + open_datatree + """ + try: + import pooch + except ImportError as e: + raise ImportError( + "tutorial.open_dataset depends on pooch to download and manage datasets." + " To proceed please install pooch." 
+ ) from e + + logger = pooch.get_logger() + logger.setLevel("WARNING") + + cache_dir = _construct_cache_dir(cache_dir) + if name in external_urls: + url = external_urls[name] + else: + path = pathlib.Path(name) + if not path.suffix: + # process the name + default_extension = ".nc" + if engine is None: + _check_netcdf_engine_installed(name) + path = path.with_suffix(default_extension) + elif path.suffix == ".grib": + if engine is None: + engine = "cfgrib" + try: + import cfgrib # noqa: F401 + except ImportError as e: + raise ImportError( + "Reading this tutorial dataset requires the cfgrib package." + ) from e + + url = f"{base_url}/raw/{version}/{path.name}" + + headers = {"User-Agent": f"xarray {sys.modules['xarray'].__version__}"} + downloader = pooch.HTTPDownloader(headers=headers) + + # retrieve the file + filepath = pooch.retrieve( + url=url, known_hash=None, path=cache_dir, downloader=downloader + ) + ds = _open_datatree(filepath, engine=engine, **kws) + if not cache: + ds = ds.load() + pathlib.Path(filepath).unlink() + + return ds + + +def load_datatree(*args, **kwargs) -> DataTree: + """ + Open, load into memory (as a `DataTree`), and close a dataset from the online repository + (requires internet). + + If a local copy is found then always use that to avoid network traffic. 
+ + Available datasets: + * ``imerghh_730.HDF5`` IMERGHH_07 product from 2021-08-29T07:30:00.000Z + * ``imerghh_830.HDF5`` IMERGHH_07 product from 2021-08-29T08:30:00.000Z + * ``"air_temperature"``: NCEP reanalysis subset + * ``"air_temperature_gradient"``: NCEP reanalysis subset with approximate x,y gradients + * ``"basin_mask"``: Dataset with ocean basins marked using integers + * ``"ASE_ice_velocity"``: MEaSUREs InSAR-Based Ice Velocity of the Amundsen Sea Embayment, Antarctica, Version 1 + * ``"rasm"``: Output of the Regional Arctic System Model (RASM) + * ``"ROMS_example"``: Regional Ocean Model System (ROMS) output + * ``"tiny"``: small synthetic dataset with a 1D data variable + * ``"era5-2mt-2019-03-uk.grib"``: ERA5 temperature data over the UK + * ``"eraint_uvz"``: data from ERA-Interim reanalysis, monthly averages of upper level data + * ``"ersstv5"``: NOAA's Extended Reconstructed Sea Surface Temperature monthly averages + + Parameters + ---------- + name : str + Name of the file containing the dataset. + e.g. 'air_temperature' + cache_dir : path-like, optional + The directory in which to search for and write cached data. 
+ cache : bool, optional + If True, then cache data locally for use on subsequent calls + **kws : dict, optional + Passed to xarray.open_datatree + + See Also + -------- + tutorial.open_datatree + open_datatree + load_datatree + """ + with open_datatree(*args, **kwargs) as ds: + return ds.load() From 0caf54a766e221f21a24863ca8e33a3fe6805963 Mon Sep 17 00:00:00 2001 From: Olufunke Awowale Date: Thu, 27 Feb 2025 15:09:55 -0500 Subject: [PATCH 2/5] updated tests to use fixture --- xarray/tests/test_tutorial.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_tutorial.py b/xarray/tests/test_tutorial.py index f26a6ad8110..95c30199cb4 100644 --- a/xarray/tests/test_tutorial.py +++ b/xarray/tests/test_tutorial.py @@ -3,11 +3,12 @@ import pytest from xarray import DataArray, DataTree, tutorial -from xarray.tests import assert_identical, network +from xarray.testing import assert_identical +from xarray.tests import network -@pytest.fixture(autouse=True) -def setUp(name="testfile"): +@pytest.fixture(autouse=True, name="testfile") +def setUp(): yield "tiny" From e092710e03d5104e1715510ec866fa381e7c8c33 Mon Sep 17 00:00:00 2001 From: Olufunke Awowale Date: Thu, 27 Feb 2025 15:45:52 -0500 Subject: [PATCH 3/5] added whats-new.rst and api.rst --- doc/api.rst | 2 ++ doc/whats-new.rst | 2 ++ 2 files changed, 4 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index 3aa07830655..08686fc4ed0 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1590,6 +1590,8 @@ Tutorial tutorial.open_dataset tutorial.load_dataset + tutorial.open_datatree + tutorial.load_datatree Testing ======= diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 94ab5832f2a..756d9c3bc6a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,6 +21,8 @@ v2025.02.0 (unreleased) New Features ~~~~~~~~~~~~ +- Added :py:meth:`tutorial.open_datatree` and :py:meth:`tutorial.load_datatree` + By `Eni Awowale <https://github.com/eni-awowale>`_. 
- Added :py:meth:`Coordinates.from_xindex` as convenience for creating a new :py:class:`Coordinates` object directly from an existing Xarray index object if the latter supports it (:pull:`10000`) By `Benoit Bovy <https://github.com/benbovy>`_. From 37d5cc0b8f9809d3cbdabf043f185ef0289f047b Mon Sep 17 00:00:00 2001 From: Olufunke Awowale Date: Thu, 27 Feb 2025 16:15:43 -0500 Subject: [PATCH 4/5] added suggestions fixed formatting for docs --- xarray/tests/test_tutorial.py | 27 ++++++++++----------------- xarray/tutorial.py | 7 ++++--- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/xarray/tests/test_tutorial.py b/xarray/tests/test_tutorial.py index 95c30199cb4..e271da6863b 100644 --- a/xarray/tests/test_tutorial.py +++ b/xarray/tests/test_tutorial.py @@ -1,52 +1,45 @@ from __future__ import annotations -import pytest - from xarray import DataArray, DataTree, tutorial from xarray.testing import assert_identical from xarray.tests import network -@pytest.fixture(autouse=True, name="testfile") -def setUp(): - yield "tiny" - - @network class TestLoadDataset: - def test_download_from_github(self, testfile, tmp_path) -> None: + def test_download_from_github(self, tmp_path) -> None: cache_dir = tmp_path / tutorial._default_cache_dir_name - ds = tutorial.open_dataset(testfile, cache_dir=cache_dir).load() + ds = tutorial.open_dataset("tiny", cache_dir=cache_dir).load() tiny = DataArray(range(5), name="tiny").to_dataset() assert_identical(ds, tiny) def test_download_from_github_load_without_cache( - self, testfile, tmp_path, monkeypatch + self, tmp_path, monkeypatch ) -> None: cache_dir = tmp_path / tutorial._default_cache_dir_name ds_nocache = tutorial.open_dataset( - testfile, cache=False, cache_dir=cache_dir + "tiny", cache=False, cache_dir=cache_dir ).load() - ds_cache = tutorial.open_dataset(testfile, cache_dir=cache_dir).load() + ds_cache = tutorial.open_dataset("tiny", cache_dir=cache_dir).load() assert_identical(ds_cache, ds_nocache) @network class TestLoadDataTree: - def 
test_download_from_github(self, testfile, tmp_path) -> None: + def test_download_from_github(self, tmp_path) -> None: cache_dir = tmp_path / tutorial._default_cache_dir_name - ds = tutorial.open_datatree(testfile, cache_dir=cache_dir).load() + ds = tutorial.open_datatree("tiny", cache_dir=cache_dir).load() tiny = DataTree.from_dict({"/": DataArray(range(5), name="tiny").to_dataset()}) assert_identical(ds, tiny) def test_download_from_github_load_without_cache( - self, testfile, tmp_path, monkeypatch + self, tmp_path, monkeypatch ) -> None: cache_dir = tmp_path / tutorial._default_cache_dir_name ds_nocache = tutorial.open_datatree( - testfile, cache=False, cache_dir=cache_dir + "tiny", cache=False, cache_dir=cache_dir ).load() - ds_cache = tutorial.open_datatree(testfile, cache_dir=cache_dir).load() + ds_cache = tutorial.open_datatree("tiny", cache_dir=cache_dir).load() assert_identical(ds_cache, ds_nocache) diff --git a/xarray/tutorial.py b/xarray/tutorial.py index 88ee026eccc..0b7adb8ac04 100644 --- a/xarray/tutorial.py +++ b/xarray/tutorial.py @@ -266,6 +266,7 @@ def open_datatree( If a local copy is found then always use that to avoid network traffic. Available datasets: + * ``imerghh_730.HDF5`` IMERGHH_07 product from 2021-08-29T07:30:00.000Z * ``imerghh_830.HDF5`` IMERGHH_07 product from 2021-08-29T08:30:00.000Z * ``"air_temperature"``: NCEP reanalysis subset @@ -353,8 +354,9 @@ def load_datatree(*args, **kwargs) -> DataTree: If a local copy is found then always use that to avoid network traffic. 
Available datasets: - * ``imerghh_730.HDF5`` IMERGHH_07 product from 2021-08-29T07:30:00.000Z - * ``imerghh_830.HDF5`` IMERGHH_07 product from 2021-08-29T08:30:00.000Z + + * ``imerghh_730.HDF5``: GPM_3IMERGHH_07 product from 2021-08-29T07:30:00.000Z + * ``imerghh_830.HDF5``: GPM_3IMERGHH_07 product from 2021-08-29T08:30:00.000Z * ``"air_temperature"``: NCEP reanalysis subset * ``"air_temperature_gradient"``: NCEP reanalysis subset with approximate x,y gradients * ``"basin_mask"``: Dataset with ocean basins marked using integers @@ -382,7 +384,6 @@ def load_datatree(*args, **kwargs) -> DataTree: -------- tutorial.open_datatree open_datatree - load_datatree """ with open_datatree(*args, **kwargs) as ds: return ds.load() From d15e1a9ddcfb59dca55b4cf0e46fdd1a07912ce0 Mon Sep 17 00:00:00 2001 From: Olufunke Awowale Date: Thu, 27 Feb 2025 16:42:42 -0500 Subject: [PATCH 5/5] Added longname of GPM_3IMERGHH_07 --- xarray/tutorial.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/tutorial.py b/xarray/tutorial.py index 0b7adb8ac04..ec832694a99 100644 --- a/xarray/tutorial.py +++ b/xarray/tutorial.py @@ -267,8 +267,8 @@ def open_datatree( Available datasets: - * ``imerghh_730.HDF5`` IMERGHH_07 product from 2021-08-29T07:30:00.000Z - * ``imerghh_830.HDF5`` IMERGHH_07 product from 2021-08-29T08:30:00.000Z + * ``"imerghh_730"``: GPM IMERG Final Precipitation L3 Half Hourly 0.1 degree x 0.1 degree V07 from 2021-08-29T07:30:00.000Z + * ``"imerghh_830"``: GPM IMERG Final Precipitation L3 Half Hourly 0.1 degree x 0.1 degree V07 from 2021-08-29T08:30:00.000Z * ``"air_temperature"``: NCEP reanalysis subset * ``"air_temperature_gradient"``: NCEP reanalysis subset with approximate x,y gradients * ``"basin_mask"``: Dataset with ocean basins marked using integers @@ -355,8 +355,8 @@ def load_datatree(*args, **kwargs) -> DataTree: Available datasets: - * ``imerghh_730.HDF5``: GPM_3IMERGHH_07 product from 2021-08-29T07:30:00.000Z - * ``imerghh_830.HDF5``: 
GPM_3IMERGHH_07 product from 2021-08-29T08:30:00.000Z + * ``"imerghh_730"``: GPM IMERG Final Precipitation L3 Half Hourly 0.1 degree x 0.1 degree V07 from 2021-08-29T07:30:00.000Z + * ``"imerghh_830"``: GPM IMERG Final Precipitation L3 Half Hourly 0.1 degree x 0.1 degree V07 from 2021-08-29T08:30:00.000Z * ``"air_temperature"``: NCEP reanalysis subset * ``"air_temperature_gradient"``: NCEP reanalysis subset with approximate x,y gradients * ``"basin_mask"``: Dataset with ocean basins marked using integers