This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit ef65c33

Speed up loading by only using HRV channel in dataset for now. #23
1 parent c6a265f · commit ef65c33

File tree: 4 files changed (+49, -33 lines)

notebooks/benchmark_loading_speed.ipynb

Lines changed: 23 additions & 14 deletions
@@ -25,6 +25,16 @@
     "import pytorch_lightning as pl"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b0c3c31a-86ed-493b-ab00-625fa4edb302",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "FILENAME = 'gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/OSGB36/all_zarr_int16_single_timestep_just_hrv.zarr'"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -45,7 +55,8 @@
    "outputs": [],
    "source": [
     "sat_data_source = data_sources.SatelliteDataSource(\n",
-    "    #filename='gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/OSGB36/all_zarr_int16_single_timestep_quarter_geospatial.zarr',\n",
+    "    filename=FILENAME,\n",
+    "    consolidated=False,\n",
     "    image_size_pixels=128,\n",
     "    history_len=HISTORY_LEN,\n",
     "    forecast_len=FORECAST_LEN\n",
@@ -76,16 +87,6 @@
     "len(t0_datetimes)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "da2a60b4-f28f-4db2-99d4-879a321b905c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "t0_datetimes[:5_000]"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -97,7 +98,7 @@
     "    batch_size=32,\n",
     "    n_samples_per_timestep=4,\n",
     "    data_sources=[sat_data_source],\n",
-    "    t0_datetimes=t0_datetimes[:5_000])"
+    "    t0_datetimes=t0_datetimes)"
    ]
   },
   {
@@ -178,7 +179,7 @@
     "\n",
     "    \n",
     "    def forward(self, x):\n",
-    "        images = x['sat_data'][:, self.history_len:, :, :, 0]\n",
+    "        images = x['sat_data'][:, self.history_len:, :, :]  # , 0]\n",
     "        images = normalise_images_in_model(images, self.device)\n",
     "        \n",
     "        # Pass data through the network :)\n",
@@ -251,7 +252,15 @@
    "execution_count": null,
    "id": "3eb05006-b2df-426c-a775-e41402abf7b0",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 0: : 609it [00:39, 15.23it/s, loss=0.259, v_num=47]"
+     ]
+    }
+   ],
    "source": [
     "trainer.fit(model, train_dataloader=dataloader)"
    ]
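
One non-mechanical change above is dropping the trailing `, 0]` channel index in `forward()`. A minimal sketch of the shape assumption behind it (tensor sizes and the HISTORY_LEN value are illustrative, not taken from the notebook): because the rechunk script below selects the single HRV channel with a scalar `.sel(variable='HRV')`, the batched satellite data presumably arrives as (batch, time, y, x) with no channel axis, so no channel index is needed.

import torch

HISTORY_LEN = 2                                      # placeholder value, for the shape demo only
batch = {'sat_data': torch.randn(32, 6, 128, 128)}   # (batch, time, y, x): HRV only, no channel axis
images = batch['sat_data'][:, HISTORY_LEN:, :, :]    # keep only the timesteps after the history window
print(images.shape)                                  # torch.Size([32, 4, 128, 128])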

nowcasting_dataset/data_sources/satellite_data_source.py

Lines changed: 9 additions & 7 deletions
@@ -22,12 +22,13 @@ class SatelliteDataSource(DataSource):
       y is top-to-bottom.
       Access using public sat_data property.
     filename: Filename of the satellite data Zarr.
-    channels: List of satellite channels to load.
-    image_size: Instance of Square, which defines the size of each sample.
-      (Inherited from DataSource super-class).
+    consolidated: Whether or not the Zarr store is consolidated.
+    channels: List of satellite channels to load.  If None then don't filter by channels.
+    image_size_pixels: Size of the width and height of the image crop returned by get_sample().
     """
     filename: Union[str, Path] = consts.SAT_FILENAME
-    channels: Iterable[str] = ('HRV', )
+    consolidated: bool = True
+    channels: Optional[Iterable[str]] = None
     image_size_pixels: InitVar[int] = 128
     meters_per_pixel: InitVar[int] = 2_000

@@ -50,8 +51,9 @@ def open(self) -> None:
         # If we did that, then we couldn't copy SatelliteDataSource
         # instances into separate processes.  Instead,
         # call open() _after_ creating separate processes.
-        sat_data = self._open_sat_data()
-        self._sat_data = sat_data.sel(variable=list(self.channels))
+        self._sat_data = self._open_sat_data()
+        if self.channels is not None:
+            self._sat_data = self._sat_data.sel(variable=list(self.channels))

     def get_sample(
         self,
@@ -98,7 +100,7 @@ def geospatial_border(self) -> List[Tuple[Number, Number]]:
             [GEO_BORDER, -GEO_BORDER])]

     def _open_sat_data(self):
-        return open_sat_data(filename=self.filename)
+        return open_sat_data(filename=self.filename, consolidated=self.consolidated)


 def open_sat_data(
def open_sat_data(

nowcasting_dataset/utils.py

Lines changed: 3 additions & 6 deletions
@@ -1,19 +1,16 @@
 import numpy as np
 import pandas as pd
 from nowcasting_dataset.consts import Array
+import fsspec.asyn


 def set_fsspec_for_multiprocess() -> None:
     """Clear reference to the loop and thread.  This is necessary otherwise
     gcsfs hangs in the ML training loop.  Only required for fsspec >= 0.9.0
     See https://github.com/dask/gcsfs/issues/379#issuecomment-839929801
     TODO: Try deleting this two lines to make sure this is still relevant."""
-    import fsspec
-    try:
-        fsspec.asyn.iothread[0] = None
-        fsspec.asyn.loop[0] = None
-    except AttributeError:
-        pass
+    fsspec.asyn.iothread[0] = None
+    fsspec.asyn.loop[0] = None


 def is_monotonically_increasing(a: Array) -> bool:
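
This diff does not show where `set_fsspec_for_multiprocess()` is called. The sketch below is only one common pattern (the `worker_init_fn` hook is an assumption, not something this commit adds): clear fsspec's cached event loop and IO thread in each DataLoader worker so gcsfs rebuilds them per process instead of hanging on the one inherited from the parent.

from torch.utils.data import DataLoader
from nowcasting_dataset.utils import set_fsspec_for_multiprocess

def worker_init_fn(worker_id: int) -> None:
    # Each forked worker drops the fsspec loop/thread references it inherited.
    set_fsspec_for_multiprocess()

# Hypothetical wiring; `dataset` is whatever dataset feeds training:
# dataloader = DataLoader(dataset, batch_size=None, num_workers=4,
#                         worker_init_fn=worker_init_fn)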

scripts/rechunk_sat_data.py

Lines changed: 14 additions & 6 deletions
@@ -5,34 +5,38 @@
 import numcodecs
 import gcsfs
 import rechunker
-from dask.diagnostics import ProgressBar
+import zarr


 BUCKET = Path('solar-pv-nowcasting-data')
 SAT_PATH = BUCKET / 'satellite/EUMETSAT/SEVIRI_RSS/OSGB36/'
 SOURCE_SAT_FILENAME = 'gs://' + str(SAT_PATH / 'all_zarr_int16')
-TARGET_SAT_FILENAME = SAT_PATH / 'all_zarr_int16_single_timestep_quarter_geospatial.zarr'
+TARGET_SAT_FILENAME = SAT_PATH / 'all_zarr_int16_single_timestep_just_hrv.zarr'
 TEMP_STORE_FILENAME = SAT_PATH / 'temp.zarr'


 def main():
     source_sat_dataset = xr.open_zarr(SOURCE_SAT_FILENAME, consolidated=True)
-
+    #source_sat_dataset = source_sat_dataset.isel(time=slice(0, 3600))
+    source_sat_dataset = source_sat_dataset.sel(variable='HRV')
+
     gcs = gcsfs.GCSFileSystem()
     target_store = gcs.get_mapper(TARGET_SAT_FILENAME)
     temp_store = gcs.get_mapper(TEMP_STORE_FILENAME)

     target_chunks = {
         'stacked_eumetsat_data': {
             "time": 1,
-            "y": 704 // 2,
-            "x": 548 // 2,
-            "variable": 1}}
+            "y": 704,
+            "x": 548,
+            #"variable": 1
+        }}

     encoding = {
         'stacked_eumetsat_data': {
             'compressor': numcodecs.Blosc(cname="zstd", clevel=5)}}

+    print('Rechunking...')
     rechunk_plan = rechunker.rechunk(
         source=source_sat_dataset,
         target_chunks=target_chunks,
@@ -42,7 +46,11 @@ def main():
         temp_store=temp_store)

     rechunk_plan.execute()
+
+    print('Consolidating...')
+    zarr.convenience.consolidate_metadata(target_store)

+    print('Done!')

 if __name__ == '__main__':
     main()
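
The new `consolidate_metadata()` step exists so that readers can open the target store with `consolidated=True`, fetching a single consolidated metadata object instead of listing every array's metadata on GCS. A sketch of that read side (not part of this commit); the chunking should reflect the `target_chunks` above, one timestep per chunk covering the full 704 x 548 image.

import gcsfs
import xarray as xr

TARGET = ('gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/OSGB36/'
          'all_zarr_int16_single_timestep_just_hrv.zarr')

gcs = gcsfs.GCSFileSystem()
dataset = xr.open_zarr(gcs.get_mapper(TARGET), consolidated=True)
print(dataset['stacked_eumetsat_data'])   # inspect dims and chunking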
