I'm currently working on the PyTorch example implementation here, but have come across an error when generating samples from the dataloader (via DaskMultiWorkerLoader.generate_sample).
I've created an IceNet dataset which inherits from the torch.Dataset class here. When iterating through the dataset, I come across the following error:
```
Traceback (most recent call last):
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/xarray/backends/api.py", line 1026, in open_mfdataset
    combined = combine_by_coords(
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/xarray/core/combine.py", line 982, in combine_by_coords
    concatenated = _combine_single_variable_hypercube(
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/xarray/core/combine.py", line 629, in _combine_single_variable_hypercube
    combined_ids, concat_dims = _infer_concat_order_from_coords(list(datasets))
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/xarray/core/combine.py", line 149, in _infer_concat_order_from_coords
    raise ValueError(
ValueError: Could not find any dimension coordinates to use to order the datasets for concatenation

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "pytorch_example.py", line 60, in <module>
    lit_unet_module, unet_model = train_icenet_unet(
  File "/data/hpcdata/users/rychan/notebooks/icenet-notebooks/pytorch_example/train_icenet_unet.py", line 84, in train_icenet_unet
    trainer.fit(lit_module, train_dataloader, val_dataloader)
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 529, in fit
    call._call_and_handle_interrupt(
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/lightning/pytorch/trainer/call.py", line 42, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 568, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 973, in _run
    results = self._run_stage()
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/lightning/pytorch/trainer/trainer.py", line 1016, in _run_stage
    self.fit_loop.run()
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/lightning/pytorch/loops/fit_loop.py", line 201, in run
    self.advance()
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/lightning/pytorch/loops/fit_loop.py", line 354, in advance
    self.epoch_loop.run(self._data_fetcher)
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 133, in run
    self.advance(data_fetcher)
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 189, in advance
    batch = next(data_fetcher)
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/lightning/pytorch/loops/fetchers.py", line 136, in __next__
    self._fetch_next_batch(self.dataloader_iter)
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/lightning/pytorch/loops/fetchers.py", line 150, in _fetch_next_batch
    batch = next(iterator)
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/lightning/pytorch/utilities/combined_loader.py", line 284, in __next__
    out = next(self._iterator)
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/lightning/pytorch/utilities/combined_loader.py", line 65, in __next__
    out[i] = next(self.iterators[i])
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 633, in __next__
    data = self._next_data()
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 677, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/data/hpcdata/users/rychan/notebooks/icenet-notebooks/pytorch_example/icenet_pytorch_dataset.py", line 26, in __getitem__
    return self._dl.generate_sample(date=pd.Timestamp(self._dates[idx]))
  File "/data/hpcdata/users/rychan/icenet/icenet_fork/icenet/data/loaders/dask.py", line 258, in generate_sample
    var_ds = xr.open_mfdataset(
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/xarray/backends/api.py", line 1041, in open_mfdataset
    ds.close()
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/xarray/core/common.py", line 1155, in close
    self._close()
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/xarray/backends/netCDF4_.py", line 513, in close
    self._manager.close(**kwargs)
  File "/data/hpcdata/users/rychan/miniconda3/envs/icenet_pytorch/lib/python3.8/site-packages/xarray/backends/file_manager.py", line 232, in close
    file.close()
  File "src/netCDF4/_netCDF4.pyx", line 2622, in netCDF4._netCDF4.Dataset.close
  File "src/netCDF4/_netCDF4.pyx", line 2585, in netCDF4._netCDF4.Dataset._close
  File "src/netCDF4/_netCDF4.pyx", line 2029, in netCDF4._netCDF4._ensure_nc_success
RuntimeError: NetCDF: Not a valid ID
```
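For context, the relevant part of the dataset wrapper (icenet_pytorch_dataset.py) looks roughly like the sketch below. Only the __getitem__ body (and the pd.Timestamp call) is taken from the traceback; the constructor signature, class name and how _dl and _dates are populated are assumptions for illustration.

```python
# Minimal sketch of the torch.Dataset wrapper around the IceNet dataloader.
# Only __getitem__ matches the traceback; the rest is assumed structure.
import pandas as pd
from torch.utils.data import Dataset


class IceNetPyTorchDataset(Dataset):
    def __init__(self, dataloader, dates):
        # dataloader: an icenet DaskMultiWorkerLoader instance
        # dates: the forecast dates to draw samples for
        self._dl = dataloader
        self._dates = dates

    def __len__(self):
        return len(self._dates)

    def __getitem__(self, idx):
        # Each sample is generated on demand; generate_sample opens NetCDF
        # files via xr.open_mfdataset, which is where the error is raised.
        return self._dl.generate_sample(date=pd.Timestamp(self._dates[idx]))
```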
The point at which this error occurs is quite volatile: it happens during training, when we're obtaining samples at each epoch. Training fails at a different point on each run, so it's been difficult to really nail down the issue, but we suspect it stems from the multiprocessing.
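One way to test that hypothesis (a sketch, not something we've confirmed fixes it) is to run the same training with DataLoader worker processes disabled, or with a "spawn" start method so workers don't inherit forked netCDF4/HDF5 state. The batch size and worker counts below are placeholders, and train_dataset stands in for the IceNet torch dataset from the example:

```python
# Sketch: isolate whether the failure is tied to DataLoader worker processes.
import torch.multiprocessing as mp
from torch.utils.data import DataLoader

# 1) No worker processes: if the RuntimeError disappears, the NetCDF handles
#    opened by xr.open_mfdataset are likely being shared/closed across processes.
train_dataloader = DataLoader(train_dataset, batch_size=4, num_workers=0)

# 2) Keep workers but start each one in a fresh interpreter via "spawn",
#    rather than inheriting file handles from the parent process by fork.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    num_workers=4,
    multiprocessing_context=mp.get_context("spawn"),
    persistent_workers=True,
)
```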