This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit 72e39c8

Finally, a full complete draft of #213. Not yet tested
1 parent 04d4fbb commit 72e39c8

3 files changed: +86 -32 lines changed

nowcasting_dataset/data_sources/data_source.py

Lines changed: 76 additions & 11 deletions
@@ -3,12 +3,17 @@
 import logging
 from dataclasses import InitVar, dataclass
 from numbers import Number
+from pathlib import Path
 from typing import Iterable, List, Tuple
 
 import pandas as pd
 import xarray as xr
 
+import nowcasting_dataset.filesystem.utils as nd_fs_utils
+
+# nowcasting_dataset imports
 import nowcasting_dataset.time as nd_time
+import nowcasting_dataset.utils as nd_utils
 from nowcasting_dataset import square
 from nowcasting_dataset.data_sources.datasource_output import DataSourceOutput
 from nowcasting_dataset.dataset.xr_utils import join_dataset_to_batch_dataset
@@ -99,8 +104,7 @@ def sample_period_minutes(self) -> int:
         """
         This is the default sample period in minutes.
 
-        This functions may be overwritten if
-        the sample period of the data source is not 5 minutes.
+        This functions may be overwritten if the sample period of the data source is not 5 minutes.
         """
         logging.debug(
             "Getting sample_period_minutes default of 5 minutes. "
@@ -112,13 +116,79 @@ def open(self):
         """Open the data source, if necessary.
 
         Called from each worker process. Useful for data sources where the
-        underlying data source cannot be forked (like Zarr on GCP!).
+        underlying data source cannot be forked (like Zarr).
 
-        Data sources which can be forked safely should call open()
-        from __init__().
+        Data sources which can be forked safely should call open() from __init__().
         """
         pass
 
+    def create_batches(
+        self,
+        spatial_and_temporal_locations_of_each_example: pd.DataFrame,
+        idx_of_first_batch: int,
+        batch_size: int,
+        dst_path: Path,
+        temp_path: Path,
+        upload_every_n_batches: int,
+    ) -> None:
+        """Create multiple batches and save them to disk.
+
+        Args:
+            spatial_and_temporal_locations_of_each_example: A DataFrame where each row specifies
+                the spatial and temporal location of an example. The number of rows must be
+                an exact multiple of `batch_size`.
+                Columns are: t0_datetime_UTC, x_center_OSGB, y_center_OSGB.
+            idx_of_first_batch: The batch number of the first batch to create.
+            batch_size: The number of examples per batch.
+            dst_path: The final destination path for the batches. Must exist.
+            temp_path: The local temporary path. This is only required when dst_path is a
+                cloud storage bucket, so files must first be created on the VM's local disk in temp_path
+                and then uploaded to dst_path every upload_every_n_batches. Must exist. Will be emptied.
+            upload_every_n_batches: Upload the contents of temp_path to dst_path after this number
+                of batches have been created. If 0 then will write directly to dst_path.
+        """
+        # Sanity checks:
+        assert idx_of_first_batch >= 0
+        assert batch_size > 0
+        assert len(spatial_and_temporal_locations_of_each_example) % batch_size == 0
+        assert upload_every_n_batches >= 0
+
+        # Figure out where to write batches to:
+        save_batches_locally_and_upload = upload_every_n_batches > 0
+        if save_batches_locally_and_upload:
+            nd_fs_utils.delete_all_files_in_temp_path(temp_path)
+        path_to_write_to = temp_path if save_batches_locally_and_upload else dst_path
+
+        # Loop round each batch:
+        examples_for_batch = spatial_and_temporal_locations_of_each_example.iloc[:batch_size]
+        n_batches_processed = 0
+        while not examples_for_batch.empty:
+            # Generate batch.
+            batch = self.get_batch(
+                t0_datetimes=examples_for_batch.t0_datetime_UTC,
+                x_locations=examples_for_batch.x_center_OSGB,
+                y_locations=examples_for_batch.y_center_OSGB,
+            )
+
+            # Save batch to disk.
+            batch_idx = idx_of_first_batch + n_batches_processed
+            netcdf_filename = path_to_write_to / nd_utils.get_netcdf_filename(batch_idx)
+            batch.to_netcdf(netcdf_filename)
+
+            # Upload if necessary.
+            if (
+                save_batches_locally_and_upload
+                and n_batches_processed > 0
+                and n_batches_processed % upload_every_n_batches == 0
+            ):
+                nd_fs_utils.upload_and_delete_local_files(dst_path, path_to_write_to)
+
+            n_batches_processed += 1
+
+        # Upload last few batches, if necessary:
+        if save_batches_locally_and_upload:
+            nd_fs_utils.upload_and_delete_local_files(dst_path, path_to_write_to)
+
     def get_batch(
         self,
         t0_datetimes: pd.DatetimeIndex,
@@ -141,14 +211,9 @@ def get_batch(
         zipped = zip(t0_datetimes, x_locations, y_locations)
         for t0_datetime, x_location, y_location in zipped:
             output: xr.Dataset = self.get_example(t0_datetime, x_location, y_location)
-
             examples.append(output)
 
-        # could add option here, to save each data source using
-        # 1. # DataSourceOutput.to_xr_dataset() to make it a dataset
-        # 2. DataSourceOutput.save_netcdf(), save to netcdf
-
-        # get the name of the cls, this could be one of the data sources like Sun
+        # Get the DataSource class, this could be one of the data sources like Sun
         cls = examples[0].__class__
 
         # join the examples together, and cast them to the cls, so that validation can occur
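
For context, here is a minimal sketch of how the new create_batches method might be driven, assuming an already-constructed concrete DataSource subclass; the column names and keyword arguments come from the docstring in the diff above, while the instance, paths and values are purely illustrative:

    from pathlib import Path

    import pandas as pd

    # Two example locations = one batch when batch_size=2. Column names are those
    # required by create_batches: t0_datetime_UTC, x_center_OSGB, y_center_OSGB.
    locations = pd.DataFrame(
        {
            "t0_datetime_UTC": pd.to_datetime(["2021-01-01 12:00", "2021-01-01 12:30"]),
            "x_center_OSGB": [123_456.0, 234_567.0],
            "y_center_OSGB": [654_321.0, 765_432.0],
        }
    )

    data_source = ...  # an instance of a concrete DataSource subclass (construction omitted)
    data_source.create_batches(
        spatial_and_temporal_locations_of_each_example=locations,
        idx_of_first_batch=0,
        batch_size=2,  # len(locations) must be an exact multiple of this
        dst_path=Path("prepared_batches/satellite"),  # must already exist
        temp_path=Path("temp/satellite"),  # must already exist; will be emptied
        upload_every_n_batches=0,  # 0 = write straight to dst_path, no staging/upload step
    )

With upload_every_n_batches=0 the batches are written directly to dst_path; a non-zero value would stage them in temp_path and periodically call nd_fs_utils.upload_and_delete_local_files, as in the loop added above.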

nowcasting_dataset/manager.py

Lines changed: 1 addition & 1 deletion
@@ -343,7 +343,7 @@ def create_batches(self, overwrite_batches: bool) -> None:
             for worker_id, (data_source_name, data_source) in enumerate(
                 self.data_sources.items()
             ):
-                # Get indexes of first batch and example; and subset locations_for_split.
+                # Get indexes of first batch and example. And subset locations_for_split.
                 idx_of_first_batch = first_batches_to_create[split_name][data_source_name]
                 idx_of_first_example = idx_of_first_batch * self.config.process.batch_size
                 locations = locations_for_split.loc[idx_of_first_example:]

nowcasting_dataset/utils.py

Lines changed: 9 additions & 20 deletions
@@ -1,11 +1,9 @@
 """ utils functions """
-import hashlib
 import logging
 import os
 import re
 import tempfile
 from functools import wraps
-from pathlib import Path
 from typing import Optional
 
 import fsspec.asyn
@@ -35,6 +33,7 @@ def set_fsspec_for_multiprocess() -> None:
     fsspec.asyn.loop[0] = None
 
 
+# TODO: Issue #170. Is this this function still used?
 def is_monotonically_increasing(a: Array) -> bool:
     """ Check the array is monotonically increasing """
     # TODO: Can probably replace with pd.Index.is_monotonic_increasing()
@@ -46,12 +45,14 @@ def is_monotonically_increasing(a: Array) -> bool:
     return np.all(np.diff(a) > 0)
 
 
+# TODO: Issue #170. Is this this function still used?
 def is_unique(a: Array) -> bool:
     """ Check array has unique values """
     # TODO: Can probably replace with pd.Index.is_unique()
     return len(a) == len(np.unique(a))
 
 
+# TODO: Issue #170. Is this this function still used?
 def scale_to_0_to_1(a: Array) -> Array:
     """Scale to the range [0, 1]."""
     a = a - a.min()
@@ -61,6 +62,7 @@ def scale_to_0_to_1(a: Array) -> Array:
     return a
 
 
+# TODO: Issue #170. Is this this function still used?
 def sin_and_cos(df: pd.DataFrame) -> pd.DataFrame:
     """
     For every column in df, creates cols for sin and cos of that col.
@@ -94,26 +96,13 @@ def sin_and_cos(df: pd.DataFrame) -> pd.DataFrame:
     return output_df
 
 
-def get_netcdf_filename(batch_idx: int, add_hash: bool = False) -> Path:
-    """Generate full filename, excluding path.
-
-    Filename includes the first 6 digits of the MD5 hash of the filename,
-    as recommended by Google Cloud in order to distribute data across
-    multiple back-end servers.
-
-    Add option to turn on and off hashing
-
-    """
-    filename = f"{batch_idx}.nc"
-    # In the future we could hash the configuration file, and use this to
-    # make sure we are saving and loading the same thing.
-    if add_hash:
-        hash_of_filename = hashlib.md5(filename.encode()).hexdigest()
-        filename = f"{hash_of_filename[0:6]}_{filename}"
-
-    return filename
+def get_netcdf_filename(batch_idx: int) -> str:
+    """Generate full filename, excluding path."""
+    assert 0 <= batch_idx < 1e6
+    return f"{batch_idx:06d}.nc"
 
 
+# TODO: Issue #170. Is this this function still used?
 def to_numpy(value):
     """ Change generic data to numpy"""
     if isinstance(value, xr.DataArray):
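
For reference, the rewritten get_netcdf_filename zero-pads the batch index instead of hashing the filename; the calls below are illustrative and not part of the commit:

    from nowcasting_dataset.utils import get_netcdf_filename

    get_netcdf_filename(1)          # "000001.nc"
    get_netcdf_filename(54321)      # "054321.nc"
    get_netcdf_filename(1_000_000)  # fails the `assert 0 <= batch_idx < 1e6` sanity check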
