This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Get nowcasting_dataset running on OCF's on-premises server #152

Merged: 19 commits into main from jack/on-premises, Sep 28, 2021
Commits
5f1ca67: copying prepare_ml_training_data.py to prepare_ml_training_data_on_pr… (JackKelly, Sep 22, 2021)
a4984c6: Tweaking config arguments, and making a start on preparing_ml_data on… (JackKelly, Sep 22, 2021)
c2d04f3: nowcasting_dataset appears to be running on leonardo! (JackKelly, Sep 22, 2021)
2945094: add fsspec and pathy to requirements and environment.yml (JackKelly, Sep 23, 2021)
a70e737: set num_workers=1 to see if that speeds up processing on leonardo (JackKelly, Sep 23, 2021)
f983669: changing back to num_workers=4 #155 (JackKelly, Sep 27, 2021)
ae2ae15: merging main into this branch (JackKelly, Sep 27, 2021)
4322255: test_datamodule now passes (JackKelly, Sep 27, 2021)
78887d8: fix test_split_week_random (JackKelly, Sep 27, 2021)
d7d7c0f: OK, I think all tests should pass now :) (JackKelly, Sep 27, 2021)
ff08180: merging main into this branch (JackKelly, Sep 27, 2021)
eb671ae: add s3fs to requirements (peterdudfield, Sep 27, 2021)
caa7459: add fsspec to requirements (peterdudfield, Sep 27, 2021)
b300a13: pip install pathy (peterdudfield, Sep 27, 2021)
4db102f: updating on_premises.yaml and prepare_ml_data.py so it actually runs … (JackKelly, Sep 27, 2021)
e89f85e: Merge commit '4db102f8c917a69438ca755343683499ad6534a3' into origin/p… (peterdudfield, Sep 27, 2021)
2bfee06: Merge pull request #167 from openclimatefix/origin/peter/on-premises (JackKelly, Sep 27, 2021)
2a195c9: Merge branch 'main' into jack/on-premises (JackKelly, Sep 28, 2021)
32420dd: implement code review suggestions. (Thanks Peter & Jacob) (JackKelly, Sep 28, 2021)
conftest.py: 2 changes (1 addition & 1 deletion)
@@ -64,7 +64,7 @@ def gsp_data_source():

@pytest.fixture
def configuration():
-    filename = os.path.join(os.path.dirname(nowcasting_dataset.__file__), "config", "example.yaml")
Review comment from JackKelly (Member, Author): This PR deletes example.yaml (see the 'conversation' tab!)

+    filename = os.path.join(os.path.dirname(nowcasting_dataset.__file__), "config", "gcp.yaml")
    config = load_yaml_configuration(filename)

    return config
environment.yml: 19 changes (11 additions & 8 deletions)
@@ -4,7 +4,7 @@ channels:
  - pvlib
  - conda-forge
dependencies:
-  - python>=3.8
+  - python>=3.9
  - pip

  # Scientific Python
@@ -14,17 +14,20 @@ dependencies:
  - zarr
  - xarray
  - ipykernel
-  - h5netcdf  # For opening NetCDF files from cloud buckets.
Review comment from JackKelly (Member, Author): black made this change (and a bunch more like this). Turns out that black violates the pep8 recommendation to have two spaces before a comment, and I can't see how to force black to allow two spaces before comments, but ho-hum :)
+  - h5netcdf # For opening NetCDF files from cloud buckets.

  # Cloud & distributed compute
  - gcsfs
+  - s3fs
+  - fsspec
+  - pathy

  # Images & optical flow
-  - conda-forge::opencv  # also run `apt install libgl1-mesa-glx`
+  - conda-forge::opencv # also run `apt install libgl1-mesa-glx`
  - scikit-image

  # Machine learning
-  - pytorch::pytorch  # explicitly specify pytorch channel to prevent conda from using conda-forge for pytorch, and hence installing the CPU-only version.
+  - pytorch::pytorch # explicitly specify pytorch channel to prevent conda from using conda-forge for pytorch, and hence installing the CPU-only version.
  - pytorch-lightning

  # PV & Geospatial
@@ -40,5 +43,5 @@ dependencies:
  - pre-commit

  - pip:
-    - neptune-client[pytorch-lightning]
-    - tilemapbase
+      - neptune-client[pytorch-lightning]
+      - tilemapbase
nowcasting_dataset/cloud/gcp.py: 34 changes (19 additions & 15 deletions)
@@ -1,6 +1,6 @@
import logging
from pathlib import Path
-from typing import List
+from typing import List, Union

import gcsfs

@@ -9,17 +9,19 @@
_LOG = logging.getLogger(__name__)


-def check_path_exists(path: Path):
+def check_path_exists(path: Union[str, Path]):
"""
Check that the path exists in GCS
@param path: the path in GCS that is checked
Check that the path exists in GCS.

Args:
path: the path in GCS that is checked
"""
gcs = gcsfs.GCSFileSystem()
if not gcs.exists(path):
raise RuntimeError(f"{path} does not exist!")


-def gcp_upload_and_delete_local_files(dst_path: str, local_path: Path):
+def gcp_upload_and_delete_local_files(dst_path: str, local_path: Union[str, Path]):
    """
    Upload the files in a local path, to a path in gcs
    """
@@ -32,11 +34,12 @@ def gcp_upload_and_delete_local_files(dst_path: str, local_path: Path):
def gcp_download_to_local(
    remote_filename: str, local_filename: str, gcs: gcsfs.GCSFileSystem = None
):
-    """
-    Download file from gcs
-    @param remote_filename: the gcs file name, should start with gs://
-    @param local_filename:
-    @param gcs: gcsfs.GCSFileSystem connection, means a new one doesn't have to be made every time.
+    """Download file from gcs.
+
+    Args:
+        remote_filename: the gcs file name, should start with gs://
+        local_filename:
+        gcs: gcsfs.GCSFileSystem connection, means a new one doesn't have to be made every time.
    """

    _LOG.debug(f"Downloading from GCP {remote_filename} to {local_filename}")
@@ -46,14 +49,16 @@ def gcp_download_to_local(
    gcs.get(remote_filename, local_filename)


-def get_all_filenames_in_path(remote_path) -> List[str]:
+def get_all_filenames_in_path(remote_path: str) -> List[str]:
    """
    Get all the files names from one folder in gcp
-    @param remote_path: the path that we should look in
-    @return: a list of strings, of files names
+
+    Args:
+        remote_path: the path that we should look in
+
+    Returns: a list of file names represented as strings.
    """
    gcs = gcsfs.GCSFileSystem()
-
    return gcs.ls(remote_path)


@@ -67,5 +72,4 @@ def rename_file(remote_file: str, new_filename: str):

"""
gcs = gcsfs.GCSFileSystem()

gcs.mv(remote_file, new_filename)
nowcasting_dataset/cloud/local.py: 11 changes (9 additions & 2 deletions)
@@ -1,12 +1,12 @@
import glob
import os
import shutil
-
+from typing import Union

import logging
from pathlib import Path

-_LOG = logging.getLogger("nowcasting_dataset")
+_LOG = logging.getLogger(__name__)


def delete_all_files_and_folder_in_temp_path(path: str):
@@ -32,3 +32,10 @@ def delete_all_files_in_temp_path(path: Path):
    _LOG.info(f"Deleting {len(files)} files from {path}.")
    for f in files:
        os.remove(f)


+def check_path_exists(path: Union[str, Path]):
Review comment from a Contributor: add doc string?

"""Raises a RuntimeError if `path` does not exist in the local filesystem."""
path = Path(path)
if not path.exists():
raise RuntimeError(f"{path} does not exist!")
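A quick usage sketch of the new helper (the missing path below is hypothetical):

```python
from pathlib import Path

from nowcasting_dataset.cloud.local import check_path_exists

check_path_exists(Path.home())  # exists, so returns without raising
try:
    check_path_exists("/no/such/dir")  # hypothetical missing path
except RuntimeError as err:
    print(err)  # -> /no/such/dir does not exist!
```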
nowcasting_dataset/cloud/utils.py: 36 changes (36 additions & 0 deletions)
@@ -2,6 +2,7 @@
from pathlib import Path
import gcsfs
import tempfile
+import fsspec

from nowcasting_dataset.cloud.aws import aws_upload_and_delete_local_files, upload_one_file
from nowcasting_dataset.cloud.gcp import gcp_upload_and_delete_local_files, gcp_download_to_local
@@ -43,3 +44,38 @@ def gcp_to_aws(gcp_filename: str, gcs: gcsfs.GCSFileSystem, aws_filename: str, a
    upload_one_file(
        remote_filename=aws_filename, bucket=aws_bucket, local_filename=local_filename
    )


+def get_maximum_batch_id(path: str):
Review comment from JackKelly (Member, Author): Moved from nowcasting_dataset/utils.py (and slightly modified)

"""
Get the last batch ID. Works with GCS, AWS, and local.

Args:
path: the path folder to look in. Begin with 'gs://' for GCS.

Returns: the maximum batch id of data in `path`.
"""
_LOG.debug(f"Looking for maximum batch id in {path}")

filesystem = fsspec.open(path).fs
filenames = filesystem.ls(path)

# just take filename
filenames = [filename.split("/")[-1] for filename in filenames]

# remove suffix
filenames = [filename.split(".")[0] for filename in filenames]

# change to integer
batch_indexes = [int(filename) for filename in filenames if len(filename) > 0]

# if there is no files, return None
if len(batch_indexes) == 0:
_LOG.debug(f"Did not find any files in {path}")
return None

# get the maximum batch id
maximum_batch_id = max(batch_indexes)
_LOG.debug(f"Found maximum of batch it of {maximum_batch_id} in {path}")

return maximum_batch_id
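A sketch of how this helper behaves, assuming batch files are named `<batch_id>.<suffix>` (e.g. `7.nc`), which is the layout the filename-splitting above implies:

```python
import pathlib
import tempfile

from nowcasting_dataset.cloud.utils import get_maximum_batch_id

with tempfile.TemporaryDirectory() as tmp:
    # create hypothetical batch files 0.nc, 1.nc and 7.nc
    for batch_id in (0, 1, 7):
        (pathlib.Path(tmp) / f"{batch_id}.nc").touch()

    print(get_maximum_batch_id(tmp))  # -> 7
    # an empty folder would return None instead
```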
nowcasting_dataset/config/README.md: 9 changes (6 additions & 3 deletions)
@@ -2,19 +2,22 @@

Configuration for the data set.

-
Decided to go for a 'Pydantic' data class. It's slightly more complicated that just having yaml files, but the
'Pydantic' feature I think outweigh this. There is a load from yaml file also.

+See `model.py` for documentation of the expected configuration fields.
+
+See either `gcp.yaml` or `on_premises.yaml` for example config files.
+
# Example

-```
+```python
# import the load function
from nowcasting_dataset.config.load import load_yaml_configuration

# load the configuration
confgiruation = load_yaml_configuration(filename)

# get the batch size
-batch_size = confgiruation.process.batch_size
+batch_size = configuration.process.batch_size
```
nowcasting_dataset/config/example.yaml: 44 changes (0 additions & 44 deletions)

This file was deleted.

nowcasting_dataset/config/gcp.yaml: 21 changes (10 additions & 11 deletions)
@@ -1,18 +1,19 @@
general:
-  description: example configuration
-  name: example
+  name: gcp
+  description: Configuration for Google Cloud
input_data:
-  bucket: solar-pv-nowcasting-data
-  npw_base_path: NWP/UK_Met_Office/UKV__2018-01_to_2019-12__chunks__variable10__init_time1__step1__x548__y704__.zarr
-  satelite_filename: satellite/EUMETSAT/SEVIRI_RSS/OSGB36/all_zarr_int16_single_timestep.zarr
-  solar_pv_data_filename: UK_PV_timeseries_batch.nc
-  solar_pv_metadata_filename: UK_PV_metadata.csv
-  solar_pv_path: PV/PVOutput.org
+  nwp_zarr_path: gs://solar-pv-nowcasting-data/NWP/UK_Met_Office/UKV__2018-01_to_2019-12__chunks__variable10__init_time1__step1__x548__y704__.zarr
+  satellite_zarr_path: gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/OSGB36/all_zarr_int16_single_timestep.zarr
+  solar_pv_data_filename: gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_timeseries_batch.nc
+  solar_pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_metadata.csv
+  gsp_zarr_path: gs://solar-pv-nowcasting-data/PV/PVOutput.org/PV/GSP/v0/pv_gsp.zarr
output_data:
-  filepath: solar-pv-nowcasting-data/prepared_ML_training_data/v6/
+  filepath: gs://solar-pv-nowcasting-data/prepared_ML_training_data/v6/
process:
+  local_temp_path: ~/temp/
  seed: 1234
  batch_size: 32
+  upload_every_n_batches: 16
  forecast_minutes: 60
  history_minutes: 30
  satellite_image_size_pixels: 64
@@ -28,7 +29,6 @@ process:
    - lcc
    - mcc
    - hcc
-  prcesion: 16
Review comment from JackKelly (Member, Author): precision is left-over from when nowcasting_dataset was used during on-the-fly ML training. precision is only needed during ML training (not for creating batches), so removing this field

  sat_channels:
    - HRV
    - IR_016
@@ -42,4 +42,3 @@ process:
    - VIS008
    - WV_062
    - WV_073
-  val_check_interval: 1000
Review comment from JackKelly (Member, Author): val_check_interval is left-over from when nowcasting_dataset was used during on-the-fly ML training.
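Because every path in the new gcp.yaml is a fully-qualified gs:// URL, downstream code can hand a value straight to fsspec-aware libraries rather than joining a bucket name with a relative path. A hedged sketch, not code from this PR (assumes gcsfs is installed, the caller has read access, and that the store was written with consolidated metadata):

```python
import xarray as xr

# zarr resolves the gs:// prefix through fsspec/gcsfs under the hood
gsp = xr.open_zarr(
    "gs://solar-pv-nowcasting-data/PV/PVOutput.org/PV/GSP/v0/pv_gsp.zarr",
    consolidated=True,  # assumption about how the store was written
)
print(gsp)
```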

nowcasting_dataset/config/load.py: 10 changes (7 additions & 3 deletions)
@@ -4,19 +4,23 @@
import io
import yaml
from nowcasting_dataset.config.model import Configuration
+from pathy import Pathy
+from typing import Union
+import fsspec

logger = logging.getLogger(__name__)


-def load_yaml_configuration(filename: str) -> Configuration:
+def load_yaml_configuration(filename: Union[str, Pathy]) -> Configuration:
"""
Load a yaml file which has a configuration in it
filename: the file name that you want to load
filename: the file name that you want to load. Will load from local, AWS, or GCP
depending on the protocol suffix (e.g. 's3://bucket/config.yaml').
Returns: pydantic class
"""

    # load the file to a dictionary
-    with open(filename, "r") as stream:
+    with fsspec.open(filename, mode="r") as stream:
Review comment from JackKelly (Member, Author): fsspec.open() will open from local, GCP or AWS (it's magic!)

        configuration = yaml.safe_load(stream)

    # turn into pydantic class
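To make the review comment above concrete: fsspec.open() picks the filesystem backend from the URL prefix, so the same call reads local files, GCS objects, or S3 objects. A small sketch (the bucket names are hypothetical; gs:// needs gcsfs installed, s3:// needs s3fs):

```python
import fsspec

for url in (
    "nowcasting_dataset/config/on_premises.yaml",  # local path -> LocalFileSystem
    "gs://my-bucket/config.yaml",  # hypothetical GCS object -> gcsfs
    "s3://my-bucket/config.yaml",  # hypothetical S3 object -> s3fs
):
    with fsspec.open(url, mode="r") as stream:
        print(stream.read()[:80])  # print the start of each file
```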