
Commit 08c8f0e

Merge mock data preparation and dataset logic in prototype tests (#6010)
* merge mock data preparation and loading
* address comments
* fix extra file creation
* remove tmp folder
* inline images meta creation in coco mock data
1 parent d9a6950 commit 08c8f0e

File tree

test/builtin_dataset_mocks.py
test/test_prototype_builtin_datasets.py

2 files changed: +94 −84 lines changed

test/builtin_dataset_mocks.py

Lines changed: 78 additions & 45 deletions
```diff
@@ -10,19 +10,18 @@
 import pathlib
 import pickle
 import random
+import shutil
 import unittest.mock
 import warnings
 import xml.etree.ElementTree as ET
 from collections import defaultdict, Counter
 
 import numpy as np
-import PIL.Image
 import pytest
 import torch
 from datasets_utils import make_zip, make_tar, create_image_folder, create_image_file, combinations_grid
 from torch.nn.functional import one_hot
 from torch.testing import make_tensor as _make_tensor
-from torchvision._utils import sequence_to_str
 from torchvision.prototype import datasets
 
 make_tensor = functools.partial(_make_tensor, device="cpu")
```
```diff
@@ -62,27 +61,51 @@ def _parse_mock_info(self, mock_info):
 
         return mock_info
 
-    def prepare(self, config):
+    def load(self, config):
         # `datasets.home()` is patched to a temporary directory through the autouse fixture `test_home` in
         # test/test_prototype_builtin_datasets.py
         root = pathlib.Path(datasets.home()) / self.name
-        root.mkdir(exist_ok=True)
+        # We cannot place the mock data upfront in `root`. Loading a dataset calls `OnlineResource.load`. In turn,
+        # this will only download **and** preprocess if the file is not present. In other words, if we already place
+        # the file in `root` before the resource is loaded, we are effectively skipping the preprocessing.
+        # To avoid that we first place the mock data in a temporary directory and patch the download logic to move it to
+        # `root` only when it is requested.
+        tmp_mock_data_folder = root / "__mock__"
+        tmp_mock_data_folder.mkdir(parents=True)
+
+        mock_info = self._parse_mock_info(self.mock_data_fn(tmp_mock_data_folder, config))
+
+        def patched_download(resource, root, **kwargs):
+            src = tmp_mock_data_folder / resource.file_name
+            if not src.exists():
+                raise pytest.UsageError(
+                    f"Dataset '{self.name}' requires the file {resource.file_name} for {config}, "
+                    f"but it was not created by the mock data function."
+                )
 
-        mock_info = self._parse_mock_info(self.mock_data_fn(root, config))
+            dst = root / resource.file_name
+            shutil.move(str(src), str(root))
 
-        with unittest.mock.patch.object(datasets.utils.Dataset, "__init__"):
-            required_file_names = {
-                resource.file_name for resource in datasets.load(self.name, root=root, **config)._resources()
-            }
-        available_file_names = {path.name for path in root.glob("*")}
-        missing_file_names = required_file_names - available_file_names
-        if missing_file_names:
+            return dst
+
+        with unittest.mock.patch(
+            "torchvision.prototype.datasets.utils._resource.OnlineResource.download", new=patched_download
+        ):
+            dataset = datasets.load(self.name, **config)
+
+        extra_files = list(tmp_mock_data_folder.glob("**/*"))
+        if extra_files:
             raise pytest.UsageError(
-                f"Dataset '{self.name}' requires the files {sequence_to_str(sorted(missing_file_names))} "
-                f"for {config}, but they were not created by the mock data function."
+                (
+                    f"Dataset '{self.name}' created the following files for {config} in the mock data function, "
+                    f"but they were not loaded:\n\n"
+                )
+                + "\n".join(str(file.relative_to(tmp_mock_data_folder)) for file in extra_files)
             )
 
-        return mock_info
+        tmp_mock_data_folder.rmdir()
+
+        return dataset, mock_info
 
 
 def config_id(name, config):
```
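The heart of the new `load` helper is the patched download: mock files are staged in a `__mock__` side folder and only moved into `root` at the moment a resource requests them, so `OnlineResource.load` still exercises its full download-and-preprocess path. A self-contained sketch of that pattern — the `Resource` class and helper names here are illustrative stand-ins, not the torchvision API:

```python
import pathlib
import shutil
import tempfile
import unittest.mock


class Resource:
    """Illustrative stand-in for OnlineResource: download() fetches file_name into root."""

    def __init__(self, file_name):
        self.file_name = file_name

    def download(self, root):
        raise RuntimeError("tests must not hit the network")


def load_with_staged_file(resource, root, staging):
    # "Downloading" just moves the pre-staged mock file into `root`, so any
    # preprocessing that normally follows a download still runs.
    def patched_download(resource, root, **kwargs):
        src = staging / resource.file_name
        dst = pathlib.Path(root) / resource.file_name
        shutil.move(str(src), str(dst))
        return dst

    with unittest.mock.patch.object(Resource, "download", new=patched_download):
        return resource.download(root)


with tempfile.TemporaryDirectory() as tmp:
    tmp = pathlib.Path(tmp)
    staging, root = tmp / "__mock__", tmp / "root"
    staging.mkdir()
    root.mkdir()
    (staging / "data.tar").write_bytes(b"fake archive")  # the staged mock data

    print(load_with_staged_file(Resource("data.tar"), root, staging))  # .../root/data.tar
```

Patching with `new=` replaces the bound method wholesale, which is why `patched_download` receives the resource itself as its first argument.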
```diff
@@ -513,22 +536,6 @@ def imagenet(root, config):
 
 
 class CocoMockData:
-    @classmethod
-    def _make_images_archive(cls, root, name, *, num_samples):
-        image_paths = create_image_folder(
-            root, name, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_samples
-        )
-
-        images_meta = []
-        for path in image_paths:
-            with PIL.Image.open(path) as image:
-                width, height = image.size
-            images_meta.append(dict(file_name=path.name, id=int(path.stem), width=width, height=height))
-
-        make_zip(root, f"{name}.zip")
-
-        return images_meta
-
     @classmethod
     def _make_annotations_json(
         cls,
```
```diff
@@ -596,16 +603,38 @@ def generate(
         cls,
         root,
         *,
+        split,
         year,
         num_samples,
     ):
         annotations_dir = root / "annotations"
         annotations_dir.mkdir()
 
-        for split in ("train", "val"):
-            config_name = f"{split}{year}"
+        for split_ in ("train", "val"):
+            config_name = f"{split_}{year}"
+
+            images_meta = [
+                dict(
+                    file_name=f"{idx:012d}.jpg",
+                    id=idx,
+                    width=width,
+                    height=height,
+                )
+                for idx, (height, width) in enumerate(
+                    torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()
+                )
+            ]
+
+            if split_ == split:
+                create_image_folder(
+                    root,
+                    config_name,
+                    file_name_fn=lambda idx: images_meta[idx]["file_name"],
+                    num_examples=num_samples,
+                    size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]),
+                )
+                make_zip(root, f"{config_name}.zip")
 
-            images_meta = cls._make_images_archive(root, config_name, num_samples=num_samples)
             cls._make_annotations(
                 annotations_dir,
                 config_name,
```
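With `_make_images_archive` gone, the image metadata is generated up front from random sizes and only the requested split gets actual image files; the same `images_meta` then feeds both the annotation JSON and the `size=` callback of `create_image_folder`. The metadata part runs on its own, for instance:

```python
import torch

torch.manual_seed(0)  # only to make this demo reproducible
num_samples = 5

# Draw random (height, width) pairs once, then derive the COCO-style image
# metadata from them without ever opening an image file.
images_meta = [
    dict(file_name=f"{idx:012d}.jpg", id=idx, width=width, height=height)
    for idx, (height, width) in enumerate(
        torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()
    )
]
print(images_meta[0])  # e.g. {'file_name': '000000000000.jpg', 'id': 0, 'width': ..., 'height': ...}
```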
```diff
@@ -625,7 +654,7 @@ def generate(
     )
 )
 def coco(root, config):
-    return CocoMockData.generate(root, year=config["year"], num_samples=5)
+    return CocoMockData.generate(root, split=config["split"], year=config["year"], num_samples=5)
 
 
 class SBDMockData:
```
```diff
@@ -799,8 +828,11 @@ def add_bndbox(obj):
     def generate(cls, root, *, year, trainval):
         archive_folder = root
         if year == "2011":
-            archive_folder /= "TrainVal"
-        data_folder = archive_folder / "VOCdevkit" / f"VOC{year}"
+            archive_folder = root / "TrainVal"
+            data_folder = archive_folder / "VOCdevkit"
+        else:
+            archive_folder = data_folder = root / "VOCdevkit"
+        data_folder = data_folder / f"VOC{year}"
         data_folder.mkdir(parents=True, exist_ok=True)
 
         ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval)
```
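For VOC the archive is now built from `archive_folder` rather than `data_folder`, and the two only differ for the 2011 layout, which nests `VOCdevkit` under `TrainVal`. A small sketch (a plain function, not the mock's actual structure) makes the two layouts easy to compare:

```python
import pathlib


def voc_folders(root, year):
    # Mirrors the branch in the diff above.
    archive_folder = root
    if year == "2011":
        archive_folder = root / "TrainVal"
        data_folder = archive_folder / "VOCdevkit"
    else:
        archive_folder = data_folder = root / "VOCdevkit"
    return archive_folder, data_folder / f"VOC{year}"


root = pathlib.Path("root")
print(*voc_folders(root, "2011"))  # root/TrainVal  root/TrainVal/VOCdevkit/VOC2011
print(*voc_folders(root, "2012"))  # root/VOCdevkit root/VOCdevkit/VOC2012
```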
```diff
@@ -810,7 +842,7 @@ def generate(cls, root, *, year, trainval):
             (cls._make_detection_anns_folder, "Annotations", ".xml"),
         ]:
             make_folder_fn(data_folder, name, file_name_fn=lambda idx: ids[idx] + suffix, num_examples=len(ids))
-        make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], data_folder)
+        make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], archive_folder)
 
         return num_samples_map
 
```
```diff
@@ -1091,8 +1123,10 @@ def _make_ann_file(path, num_examples, class_idx):
                     }
                 )
 
+    archive_folder = root / "GTSRB"
+
     if config["split"] == "train":
-        train_folder = root / "GTSRB" / "Training"
+        train_folder = archive_folder / "Training"
         train_folder.mkdir(parents=True)
 
         for class_idx in classes:
```
```diff
@@ -1107,9 +1141,9 @@ def _make_ann_file(path, num_examples, class_idx):
                 num_examples=num_examples_per_class,
                 class_idx=int(class_idx),
             )
-        make_zip(root, "GTSRB-Training_fixed.zip", train_folder)
+        make_zip(root, "GTSRB-Training_fixed.zip", archive_folder)
     else:
-        test_folder = root / "GTSRB" / "Final_Test"
+        test_folder = archive_folder / "Final_Test"
         test_folder.mkdir(parents=True)
 
         create_image_folder(
```
```diff
@@ -1119,7 +1153,7 @@ def _make_ann_file(path, num_examples, class_idx):
             num_examples=num_examples,
         )
 
-        make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder)
+        make_zip(root, "GTSRB_Final_Test_Images.zip", archive_folder)
 
     _make_ann_file(
         path=root / "GT-final_test.csv",
```
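The GTSRB change follows the same idea: zip from the `GTSRB` parent folder instead of the split folder, so the archive members keep the `GTSRB/` prefix, which is the layout the change implies the real archives use. A stdlib sketch of the resulting member names (`make_zip` is the test helper from datasets_utils; this only illustrates the intended layout):

```python
import pathlib
import tempfile
import zipfile

with tempfile.TemporaryDirectory() as tmp:
    root = pathlib.Path(tmp)
    train_folder = root / "GTSRB" / "Training"
    train_folder.mkdir(parents=True)
    (train_folder / "00000.ppm").write_bytes(b"\x00")

    # Archiving from root/GTSRB (not from the Training subfolder) keeps the
    # GTSRB/ prefix in the member names.
    with zipfile.ZipFile(root / "GTSRB-Training_fixed.zip", "w") as zf:
        for file in (root / "GTSRB").rglob("*"):
            zf.write(file, file.relative_to(root))

    with zipfile.ZipFile(root / "GTSRB-Training_fixed.zip") as zf:
        print(zf.namelist())  # e.g. ['GTSRB/Training/', 'GTSRB/Training/00000.ppm']
```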
```diff
@@ -1484,11 +1518,10 @@ def stanford_cars(root, config):
     num_samples = {"train": 5, "test": 7}[split]
     num_categories = 3
 
-    devkit = root / "devkit"
-    devkit.mkdir(parents=True)
-
     if split == "train":
         images_folder_name = "cars_train"
+        devkit = root / "devkit"
+        devkit.mkdir()
         annotations_mat_path = devkit / "cars_train_annos.mat"
     else:
         images_folder_name = "cars_test"
```

test/test_prototype_builtin_datasets.py

Lines changed: 16 additions & 39 deletions
```diff
@@ -56,18 +56,14 @@ def test_info(self, name):
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_smoke(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         if not isinstance(dataset, datasets.utils.Dataset):
             raise AssertionError(f"Loading the dataset should return a Dataset, but got {type(dataset)} instead.")
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_sample(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         try:
             sample = next(iter(dataset))
```
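On the test side, every occurrence of the two-step prepare-then-load dance collapses into a single call that returns both the dataset and the mock info. A hypothetical extra test written against the new helper, inside the same class and relying on the fixtures already defined in this file, would look like:

```python
    @parametrize_dataset_mocks(DATASET_MOCKS)
    def test_something(self, dataset_mock, config):
        # One call now builds the mock data on disk and loads the dataset;
        # tests that don't need the mock info simply discard it.
        dataset, mock_info = dataset_mock.load(config)

        first_sample = next(iter(dataset))
        assert isinstance(first_sample, dict)
```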
```diff
@@ -84,17 +80,13 @@ def test_sample(self, dataset_mock, config):
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_num_samples(self, dataset_mock, config):
-        mock_info = dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, mock_info = dataset_mock.load(config)
 
         assert len(list(dataset)) == mock_info["num_samples"]
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_no_vanilla_tensors(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         vanilla_tensors = {key for key, value in next(iter(dataset)).items() if type(value) is torch.Tensor}
         if vanilla_tensors:
```
```diff
@@ -105,24 +97,20 @@ def test_no_vanilla_tensors(self, dataset_mock, config):
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_transformable(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         next(iter(dataset.map(transforms.Identity())))
 
     @pytest.mark.parametrize("only_datapipe", [False, True])
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_traversable(self, dataset_mock, config, only_datapipe):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         traverse(dataset, only_datapipe=only_datapipe)
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_serializable(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         pickle.dumps(dataset)
 
```
```diff
@@ -135,8 +123,7 @@ def _collate_fn(self, batch):
     @pytest.mark.parametrize("num_workers", [0, 1])
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_data_loader(self, dataset_mock, config, num_workers):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         dl = DataLoader(
             dataset,
```
```diff
@@ -153,17 +140,15 @@ def test_data_loader(self, dataset_mock, config, num_workers):
     @parametrize_dataset_mocks(DATASET_MOCKS)
     @pytest.mark.parametrize("annotation_dp_type", (Shuffler, ShardingFilter))
     def test_has_annotations(self, dataset_mock, config, annotation_dp_type):
-
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         if not any(isinstance(dp, annotation_dp_type) for dp in extract_datapipes(dataset)):
             raise AssertionError(f"The dataset doesn't contain a {annotation_dp_type.__name__}() datapipe.")
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_save_load(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
+
         sample = next(iter(dataset))
 
         with io.BytesIO() as buffer:
```
```diff
@@ -173,8 +158,7 @@ def test_save_load(self, dataset_mock, config):
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_infinite_buffer_size(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         for dp in extract_datapipes(dataset):
             if hasattr(dp, "buffer_size"):
```
```diff
@@ -184,18 +168,15 @@ def test_infinite_buffer_size(self, dataset_mock, config):
 
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_has_length(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         assert len(dataset) > 0
 
 
 @parametrize_dataset_mocks(DATASET_MOCKS["qmnist"])
 class TestQMNIST:
     def test_extra_label(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         sample = next(iter(dataset))
         for key, type in (
```
```diff
@@ -218,9 +199,7 @@ def test_label_matches_path(self, dataset_mock, config):
         if config["split"] != "train":
             return
 
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         for sample in dataset:
             label_from_path = int(Path(sample["path"]).parent.name)
```
```diff
@@ -230,9 +209,7 @@ def test_label_matches_path(self, dataset_mock, config):
 @parametrize_dataset_mocks(DATASET_MOCKS["usps"])
 class TestUSPS:
     def test_sample_content(self, dataset_mock, config):
-        dataset_mock.prepare(config)
-
-        dataset = datasets.load(dataset_mock.name, **config)
+        dataset, _ = dataset_mock.load(config)
 
         for sample in dataset:
             assert "image" in sample
```
