This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit 0fbe834

normalise NWP params. Closes #3
1 parent 39ca983 commit 0fbe834

File tree

6 files changed (+1088 additions, -374 deletions)

notebooks/benchmark_loading_speed.ipynb

Lines changed: 89 additions & 205 deletions
Large diffs are not rendered by default.

notebooks/testing_NWPDataSource.ipynb

Lines changed: 928 additions & 152 deletions
Large diffs are not rendered by default.

nowcasting_dataset/data_sources/nwp_data_source.py

Lines changed: 39 additions & 8 deletions
@@ -13,6 +13,31 @@
 _LOG = logging.getLogger('nowcasting_dataset')
 
 
+# nwp_ds.data - xr.DataArray(data=std.values, dims=('variable', ), coords=dict(variable=std['variable'].values))
+
+NWP_VARIABLE_NAMES = (
+    't', 'dswrf', 'prate', 'r', 'sde', 'si10', 'vis', 'lcc', 'mcc', 'hcc')
+
+# Means computed with
+# nwp_ds = NWPDataSource(...)
+# nwp_ds.open()
+# mean = nwp_ds.data.isel(init_time=slice(0, 10)).mean(dim=['step', 'x', 'init_time', 'y']).compute()
+NWP_MEAN = xr.DataArray(
+    data=(
+        2.8041010e+02, 1.6854691e+01, 6.7529683e-05, 8.1832832e+01,
+        7.1233767e-03, 8.8566933e+00, 4.3474598e+04, 4.9820110e+01,
+        4.8095409e+01, 4.2833260e+01),
+    dims=('variable', ),
+    coords={'variable': NWP_VARIABLE_NAMES})
+
+NWP_STD = xr.DataArray(
+    data=(
+        2.5812180e+00, 4.1278820e+01, 2.7507244e-04, 9.0967312e+00,
+        1.4110464e-01, 4.3616886e+00, 2.3853148e+04, 3.8900299e+01,
+        4.2830105e+01, 4.2778091e+01),
+    dims=('variable', ),
+    coords={'variable': NWP_VARIABLE_NAMES})
+
 @dataclass
 class NWPDataSource(ZarrDataSource):
     """
@@ -38,9 +63,7 @@ class NWPDataSource(ZarrDataSource):
         mcc : Medium-level cloud cover in %.
         hcc : High-level cloud cover in %.
     """
-    channels: Optional[Iterable[str]] = (
-        't', 'dswrf', 'prate', 'r', 'sde', 'si10', 'vis', 'lcc', 'mcc', 'hcc')
-    max_step: int = 3  #: Max forecast timesteps to load from NWPs.
+    channels: Optional[Iterable[str]] = NWP_VARIABLE_NAMES
     image_size_pixels: InitVar[int] = 2
     meters_per_pixel: InitVar[int] = 2_000
 
@@ -58,8 +81,8 @@ def open(self) -> None:
         # call open() _after_ creating separate processes.
         data = self._open_data()
         data = data[list(self.channels)].to_array()
-        #self._data = data.sel(
-        #    step=slice(pd.Timedelta(0), pd.Timedelta(hours=self.max_step + 1)))
+        data -= NWP_MEAN
+        data /= NWP_STD
         self._data = data
 
     def _open_data(self) -> xr.DataArray:
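The two added lines in open() lean on xarray's coordinate-aligned broadcasting: NWP_MEAN and NWP_STD are 1-D over 'variable', so the subtraction and division normalise each channel across all remaining dimensions. A self-contained sketch of that behaviour with made-up values and a reduced set of variables:

import numpy as np
import xarray as xr

variables = ['t', 'dswrf', 'prate']
data = xr.DataArray(
    np.array([[300.0, 280.0], [10.0, 20.0], [0.0, 1e-4]]),
    dims=('variable', 'step'),
    coords={'variable': variables})
mean = xr.DataArray([290.0, 15.0, 5e-5], dims=('variable',), coords={'variable': variables})
std = xr.DataArray([10.0, 5.0, 1e-4], dims=('variable',), coords={'variable': variables})

data -= mean          # aligned on the shared 'variable' coordinate
data /= std           # then broadcast over the remaining 'step' dimension
print(data.sel(variable='t').values)  # [ 1. -1.]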
@@ -100,8 +123,16 @@ def _get_time_slice(self, t0_dt: pd.Timestamp) -> xr.DataArray:
 
         # Get the most recent NWP initialisation time for each
         # target_time_hourly.
-        init_times = self.data.sel(
-            init_time=target_times_hourly, method='ffill').init_time.values
+        try:
+            init_times = self.data.sel(
+                init_time=target_times_hourly, method='ffill').init_time.values
+        except Exception as e:
+            is_increasing = utils.is_monotonically_increasing(self.data.init_time.astype(int))
+            is_unique = utils.is_unique(self.data.init_time)
+            _LOG.exception(
+                f'Exception! start_hourly={start_hourly}, t0_hourly={t0_hourly}, end_hourly={end_hourly}, '
+                f'target_times_hourly={target_times_hourly}, {e}, is_increasing={is_increasing}, is_unique={is_unique}')
+            raise
 
         # Find the NWP init time for just the 'future' portion of the example.
         init_time_future = init_times[target_times_hourly == t0_hourly]
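The new try/except is purely diagnostic: .sel(..., method='ffill') picks, for each hourly target time, the most recent init_time at or before it, and that lookup needs a monotonically increasing, unique init_time index — exactly what the logged is_increasing / is_unique flags probe. A small sketch of both the selection and the failure mode, using toy timestamps:

import pandas as pd
import xarray as xr

init_times = pd.date_range('2021-01-01 00:00', periods=4, freq='3h')
da = xr.DataArray(range(4), dims=('init_time',), coords={'init_time': init_times})

target_times = pd.DatetimeIndex(['2021-01-01 01:00', '2021-01-01 07:30'])
picked = da.sel(init_time=target_times, method='ffill').init_time.values
print(picked)  # the 00:00 and 06:00 runs, i.e. the most recent init before each target

shuffled = da.isel(init_time=[2, 0, 3, 1])  # deliberately non-monotonic index
try:
    shuffled.sel(init_time=target_times, method='ffill')
except Exception as exc:
    print(type(exc).__name__, exc)  # pandas refuses 'ffill' lookups on a non-monotonic index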
@@ -150,7 +181,7 @@ def datetime_index(self) -> pd.DatetimeIndex:
             nwp = self._open_data()
         else:
             nwp = self._data
-        target_times = nwp['init_time'] + nwp['step'][:self.max_step]
+        target_times = nwp['init_time'] + nwp['step'][:3]
         target_times = target_times.values.flatten()
         target_times = np.unique(target_times)
         target_times = np.sort(target_times)
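The final hunk hard-codes what used to be the max_step default: datetime_index() adds the first three step offsets to every init_time, then flattens, de-duplicates and sorts the result. A sketch of that outer sum with toy run times and hourly steps:

import numpy as np
import pandas as pd
import xarray as xr

init_time = xr.DataArray(
    pd.date_range('2021-01-01 00:00', periods=2, freq='3h'), dims=('init_time',))
step = xr.DataArray(pd.to_timedelta(np.arange(5), unit='h'), dims=('step',))

target_times = init_time + step[:3]      # broadcasts to an (init_time, step) grid of datetimes
target_times = target_times.values.flatten()
target_times = np.unique(target_times)   # drops any overlap between consecutive runs
target_times = np.sort(target_times)
print(pd.DatetimeIndex(target_times))    # 00:00 .. 05:00, hourly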

nowcasting_dataset/data_sources/pv_data_source.py

Lines changed: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ def get_example(
                 " (but not at the identical location to) x_meters_center and"
                 " y_meters_center.")
 
-        selected_pv_power = self._get_timestep_with_cache(t0_dt)
+        selected_pv_power = self._get_cached_time_slice(t0_dt)
         pv_system_ids = selected_pv_power.columns.intersection(pv_system_ids)
        assert len(pv_system_ids) > 0
 
nowcasting_dataset/datamodule.py

Lines changed: 18 additions & 4 deletions
@@ -47,6 +47,8 @@ def __post_init__(self):
         # Plus 1 because neither history_len nor forecast_len include t0.
         self._total_seq_len = self.history_len + self.forecast_len + 1
         self.contiguous_dataset = None
+        if self.num_workers == 0:
+            self.prefetch_factor = 2  # Set to default when not using multiprocessing.
 
     def prepare_data(self) -> None:
         # Satellite data
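The __post_init__ guard above exists because torch.utils.data.DataLoader is picky about prefetch_factor: with num_workers=0 it rejects a non-default value (and newer PyTorch releases reject any explicit value at all), so the commit resets the attribute to the old default of 2. An alternative, version-agnostic sketch is to omit the argument entirely when there are no workers; the toy dataset and kwargs below are illustrative only:

import torch.utils.data

dataset = list(range(8))  # stand-in dataset, just so DataLoader has something to wrap
num_workers = 0

dataloader_kwargs = dict(batch_size=4, num_workers=num_workers)
if num_workers > 0:
    # prefetch_factor only applies to worker processes, so only pass it when they exist.
    dataloader_kwargs['prefetch_factor'] = 2

loader = torch.utils.data.DataLoader(dataset, **dataloader_kwargs)
print(next(iter(loader)))  # tensor([0, 1, 2, 3])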
@@ -141,14 +143,24 @@ def setup(self, stage='fit'):
         self.train_dataset = dataset.NowcastingDataset(
             t0_datetimes=self.train_t0_datetimes,
             data_sources=self.data_sources,
-            n_batches_per_epoch_per_worker=1024 // self.num_workers,
+            n_batches_per_epoch_per_worker=self._n_batches_per_epoch_per_worker(1024),
             **self._common_dataset_params())
         self.val_dataset = dataset.NowcastingDataset(
             t0_datetimes=self.val_t0_datetimes,
             data_sources=self.data_sources,
-            n_batches_per_epoch_per_worker=32 // self.num_workers,
+            n_batches_per_epoch_per_worker=self._n_batches_per_epoch_per_worker(32),
             **self._common_dataset_params())
-
+
+        if self.num_workers == 0:
+            self.train_dataset.per_worker_init(worker_id=0)
+            self.val_dataset.per_worker_init(worker_id=0)
+
+    def _n_batches_per_epoch_per_worker(self, n_batches_per_epoch: int) -> int:
+        if self.num_workers > 0:
+            return n_batches_per_epoch // self.num_workers
+        else:
+            return n_batches_per_epoch
+
     def _split_data(self):
         """Sets self.train_t0_datetimes and self.val_t0_datetimes."""
         self._check_has_prepared_data()
@@ -184,8 +196,10 @@ def contiguous_dataloader(self) -> torch.utils.data.DataLoader:
         self.contiguous_dataset = dataset.ContiguousNowcastingDataset(
             t0_datetimes=self.val_t0_datetimes,
             data_sources=data_sources,
-            n_batches_per_epoch_per_worker=32 // self.num_workers,
+            n_batches_per_epoch_per_worker=self._n_batches_per_epoch_per_worker(32),
             **self._common_dataset_params())
+        if self.num_workers == 0:
+            self.contiguous_dataset.per_worker_init(worker_id=0)
         return torch.utils.data.DataLoader(
             self.contiguous_dataset, **self._common_dataloader_params())
 
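Taken together, the datamodule changes guard the single-process case twice: batch counts are no longer divided by zero, and per_worker_init() is called manually because DataLoader only runs worker-init hooks inside spawned worker processes. A compact sketch of the division helper (the function is lifted from the diff but used standalone here):

def n_batches_per_epoch_per_worker(n_batches_per_epoch: int, num_workers: int) -> int:
    # Mirrors NowcastingDataModule._n_batches_per_epoch_per_worker: split the epoch
    # across workers, but treat num_workers == 0 as "one worker in the main process".
    if num_workers > 0:
        return n_batches_per_epoch // num_workers
    return n_batches_per_epoch

assert n_batches_per_epoch_per_worker(1024, 8) == 128
assert n_batches_per_epoch_per_worker(1024, 0) == 1024  # old code: ZeroDivisionError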

nowcasting_dataset/dataset.py

Lines changed: 13 additions & 4 deletions
@@ -7,6 +7,9 @@
 from dataclasses import dataclass
 import torch
 from concurrent import futures
+import logging
+
+_LOG = logging.getLogger('nowcasting_dataset')
 
 
 @dataclass
@@ -115,10 +118,16 @@ def _get_example(
 
         example = nowcasting_dataset.example.Example(t0_dt=t0_dt)
         for data_source in self.data_sources:
-            example_from_source = data_source.get_example(
-                t0_dt=t0_dt,
-                x_meters_center=x_meters_center,
-                y_meters_center=y_meters_center)
+            try:
+                example_from_source = data_source.get_example(
+                    t0_dt=t0_dt,
+                    x_meters_center=x_meters_center,
+                    y_meters_center=y_meters_center)
+            except Exception as e:
+                _LOG.exception(
+                    f'Exception! t0_dt={t0_dt}, x_meters_center={x_meters_center}, y_meters_center={y_meters_center}, {e}')
+                raise
+
             example.update(example_from_source)
         example = nowcasting_dataset.example.to_numpy(example)
         return example
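dataset.py adopts the same pattern as nwp_data_source.py: log the offending arguments together with the traceback, then re-raise so the failure still surfaces in the DataLoader worker. A self-contained sketch of that pattern, where the data-source call is a dummy that always fails:

import logging

logging.basicConfig(level=logging.INFO)
_LOG = logging.getLogger('nowcasting_dataset')

def get_example(t0_dt, x_meters_center, y_meters_center):
    raise ValueError('no data for this window')  # stand-in for a failing data source

def get_example_logged(t0_dt, x_meters_center, y_meters_center):
    try:
        return get_example(t0_dt, x_meters_center, y_meters_center)
    except Exception as e:
        # logger.exception() records the message at ERROR level plus the full
        # traceback; re-raising keeps the original error visible to the caller.
        _LOG.exception(
            f'Exception! t0_dt={t0_dt}, x_meters_center={x_meters_center}, '
            f'y_meters_center={y_meters_center}, {e}')
        raise

try:
    get_example_logged('2021-01-01 12:00', 0, 0)
except ValueError:
    pass  # still propagates after being logged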

0 commit comments
