From 8f042bdfd30bf5f4959e749004c343b08f29a386 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Mon, 8 Aug 2022 09:15:51 -0500
Subject: [PATCH] [POC]: Added "pc" fsspec filesystem

This adds a "pc" fsspec filesystem implementation, which lets us insert
"pc::" in an fsspec URL and automatically sign it when loading it
with an fsspec client.

The primary motivation is integration with fsspec's reference filesystem,
where users would need to call `planetary_computer.sign` in multiple places:

1. Once for loading the index JSON files
2. Once for signing the reference filesystem templates

Which lets us replace this:

```python
>>> result = xr.open_dataset(
...     fsspec.get_mapper(
...         "reference://",
...         fo=planetary_computer.sign(requests.get(planetary_computer.sign("https://deltaresreservoirssa.blob.core.windows.net/references/reservoirs/chirps.json")).json()),
...     ),
...     engine="zarr",
...     consolidated=False,
... )
```

With this:

```python
>>> result = xr.open_dataset(
...     "pc::reference::pc::https://deltaresreservoirssa.blob.core.windows.net/references/reservoirs/CHIRPS.json",
...     engine="zarr",
...     consolidated=False,
... )
```
---
 planetary_computer/_pc_fs.py | 104 +++++++++++++++++++++++++++++++++++
 setup.cfg                    |   7 +++
 2 files changed, 111 insertions(+)
 create mode 100644 planetary_computer/_pc_fs.py

diff --git a/planetary_computer/_pc_fs.py b/planetary_computer/_pc_fs.py
new file mode 100644
index 0000000..ec71b6d
--- /dev/null
+++ b/planetary_computer/_pc_fs.py
@@ -0,0 +1,104 @@
+import fsspec
+from fsspec.implementations.reference import ReferenceFileSystem
+
+import planetary_computer
+
+
+class PCFileSystem(fsspec.AbstractFileSystem):
+    """
+    Planetary Computer filesystem for fsspec.
+
+    This file system is solely a convenience for automatically
+    signing assets in fsspec URLs. It uses fsspec's
+    `URL chaining <https://filesystem-spec.readthedocs.io/en/latest/features.html#url-chaining>`_
+    and :func:`planetary_computer.sign` to transform URLs like
+    ``pc::https://<storage-account>.blob.core.windows.net/container/asset`` to the signed version.
+
+    Parameters
+    ----------
+    target_protocol : str, optional
+        The protocol used to load the actual asset (e.g. 'https').
+    target_options : dict, optional
+        Additional keywords to use for the target protocol's fsspec filesystem.
+    fo : str, optional
+        The target path. When given it is signed once, at construction time,
+        and :meth:`open` opens it regardless of the path it is called with.
+    **kwargs
+        Passed on to ``fsspec.AbstractFileSystem.__init__``.
+
+    Examples
+    --------
+    This example loads a Kerchunk index file from Azure Blob Storage. The index file is in a private blob
+    storage container and so needs to be signed. The ``pc`` in the URL will automatically sign the
+    asset before attempting to load it.
+
+    >>> import xarray as xr
+    >>> url = "reference::pc::https://deltaresreservoirssa.blob.core.windows.net/references/reservoirs/CHIRPS.json"
+    >>> result = xr.open_dataset(url, engine="zarr", consolidated=False)
+    <xarray.Dataset>
+    Dimensions:      (time: 13515, GrandID: 2951, ksathorfrac: 5)
+    Coordinates:
+      * GrandID      (GrandID) float64 nan nan nan nan nan ... nan nan nan nan nan
+      * ksathorfrac  (ksathorfrac) float64 5.0 20.0 50.0 100.0 250.0
+      * time         (time) datetime64[ns] NaT NaT NaT NaT NaT ... NaT NaT NaT NaT
+    Data variables: (12/14)
+        ETa          (time, GrandID, ksathorfrac) float32 dask.array<...>
+        Ea_res       (time, GrandID, ksathorfrac) float32 dask.array<...>
+        FracFull     (time, GrandID, ksathorfrac) float32 dask.array<...>
+        Melt         (time, GrandID, ksathorfrac) float32 dask.array<...>
+        P            (time, GrandID, ksathorfrac) float32 dask.array<...>
+        PET          (time, GrandID, ksathorfrac) float32 dask.array<...>
+        ...           ...
+        Qout_res     (time, GrandID, ksathorfrac) float32 dask.array<...>
+        S_res        (time, GrandID, ksathorfrac) float32 dask.array<...>
+        Snow         (time, GrandID, ksathorfrac) float32 dask.array<...>
+        Temp         (time, GrandID, ksathorfrac) float32 dask.array<...>
+        latitude     (GrandID) float32 dask.array<...>
+        longitude    (GrandID) float32 dask.array<...>
+    """
+
+    def __init__(
+        self,
+        target_protocol=None,
+        target_options=None,
+        fo=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.target_protocol = target_protocol
+        # The default is None, which cannot be ``**``-unpacked below.
+        self.target_options = target_options or {}
+        self.fo = planetary_computer.sign(fo) if fo else fo
+        self.target_fs = fsspec.filesystem(self.target_protocol, **self.target_options)
+        if isinstance(self.target_fs, ReferenceFileSystem):
+            self._sign_references()
+
+    def _sign_references(self):
+        """Sign the URL of every file-backed reference of the target filesystem."""
+        # HACK: the references have to be signed after they have been loaded.
+        # ReferenceFileSystem.__init__ does some processing of its templates,
+        # so signing the templates at this point would be too late.
+        references = self.target_fs.references
+        for key, ref in references.items():
+            # File-backed references are ``[url]`` or ``[url, offset, length]``;
+            # inline data and references without a URL are left untouched.
+            if isinstance(ref, list) and ref and isinstance(ref[0], str):
+                # Only the value of an existing key is replaced, so the dict
+                # keeps its size while it is being iterated.
+                references[key] = [planetary_computer.sign(ref[0]), *ref[1:]]
+
+    def open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
+        """Open a file on the target filesystem; a signed ``fo`` overrides ``path``."""
+        if self.fo:
+            path = self.fo
+        return self.target_fs.open(
+            path,
+            mode=mode,
+            block_size=block_size,
+            cache_options=cache_options,
+            **kwargs,
+        )
+
+    def ls(self, path, detail=True, **kwargs):
+        """List ``path`` on the target filesystem; the path is not signed."""
+        return self.target_fs.ls(path, detail=detail, **kwargs)
diff --git a/setup.cfg b/setup.cfg
index d934a93..442740f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -19,6 +19,13 @@ install_requires =
     pytz>=2020.5
     requests>=2.25.1
 
+[options.extras_require]
+fsspec =
+    fsspec
+
 [options.entry_points]
 console_scripts =
     planetarycomputer = planetary_computer.scripts.cli:app
+
+fsspec.specs =
+    pc = planetary_computer._pc_fs.PCFileSystem
\ No newline at end of file