10
10
import pathlib
11
11
import pickle
12
12
import random
13
+ import shutil
13
14
import unittest .mock
14
15
import warnings
15
16
import xml .etree .ElementTree as ET
16
17
from collections import defaultdict , Counter
17
18
18
19
import numpy as np
19
- import PIL .Image
20
20
import pytest
21
21
import torch
22
22
from datasets_utils import make_zip , make_tar , create_image_folder , create_image_file , combinations_grid
23
23
from torch .nn .functional import one_hot
24
24
from torch .testing import make_tensor as _make_tensor
25
- from torchvision ._utils import sequence_to_str
26
25
from torchvision .prototype import datasets
27
26
28
27
make_tensor = functools .partial (_make_tensor , device = "cpu" )
@@ -62,27 +61,51 @@ def _parse_mock_info(self, mock_info):
62
61
63
62
return mock_info
64
63
65
def load(self, config):
    """Create the mock data for ``config`` and load the dataset from it.

    The mock data function writes into a staging folder ``<root>/__mock__``. While the dataset is
    loaded, ``OnlineResource.download`` is patched so that each requested file is moved from the
    staging folder to the location the resource asks for.

    Args:
        config: Dataset configuration. It is passed to the mock data function and forwarded as keyword
            arguments to ``datasets.load``.

    Returns:
        Tuple ``(dataset, mock_info)`` with the loaded dataset and the parsed return value of the mock
        data function.

    Raises:
        pytest.UsageError: If a resource requests a file that the mock data function did not create, or
            if files created by the mock data function are still in the staging folder after loading.
    """
    # `datasets.home()` is patched to a temporary directory through the autouse fixture `test_home` in
    # test/test_prototype_builtin_datasets.py
    root = pathlib.Path(datasets.home()) / self.name
    # We cannot place the mock data upfront in `root`. Loading a dataset calls `OnlineResource.load`. In turn,
    # this will only download **and** preprocess if the file is not present. In other words, if we already place
    # the file in `root` before the resource is loaded, we are effectively skipping the preprocessing.
    # To avoid that we first place the mock data in a temporary directory and patch the download logic to move it
    # to `root` only when it is requested.
    tmp_mock_data_folder = root / "__mock__"
    tmp_mock_data_folder.mkdir(parents=True)

    mock_info = self._parse_mock_info(self.mock_data_fn(tmp_mock_data_folder, config))

    def patched_download(resource, root, **kwargs):
        # NOTE: the `root` parameter intentionally shadows the outer `root`; it is the directory the
        # resource asks to be downloaded into.
        src = tmp_mock_data_folder / resource.file_name
        if not src.exists():
            raise pytest.UsageError(
                f"Dataset '{self.name}' requires the file {resource.file_name} for {config} "
                "but it was not created by the mock data function."
            )

        # Move to the exact path that is returned. Moving to the directory `root` only yields `dst` if
        # `root` already exists as a directory; otherwise the file would be renamed to `root` itself.
        dst = pathlib.Path(root) / resource.file_name
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(src), str(dst))

        return dst

    with unittest.mock.patch(
        "torchvision.prototype.datasets.utils._resource.OnlineResource.download", new=patched_download
    ):
        dataset = datasets.load(self.name, **config)

    # Everything still in the staging folder was created but never requested by a resource. Sorting keeps
    # the error message deterministic.
    extra_files = sorted(tmp_mock_data_folder.glob("**/*"))
    if extra_files:
        raise pytest.UsageError(
            (
                f"Dataset '{self.name}' created the following files for {config} in the mock data function, "
                f"but they were not loaded:\n\n"
            )
            + "\n".join(str(file.relative_to(tmp_mock_data_folder)) for file in extra_files)
        )

    # The staging folder is empty at this point, so a plain `rmdir` is sufficient.
    tmp_mock_data_folder.rmdir()

    return dataset, mock_info
86
109
87
110
88
111
def config_id (name , config ):
@@ -513,22 +536,6 @@ def imagenet(root, config):
513
536
514
537
515
538
class CocoMockData :
516
- @classmethod
517
- def _make_images_archive (cls , root , name , * , num_samples ):
518
- image_paths = create_image_folder (
519
- root , name , file_name_fn = lambda idx : f"{ idx :012d} .jpg" , num_examples = num_samples
520
- )
521
-
522
- images_meta = []
523
- for path in image_paths :
524
- with PIL .Image .open (path ) as image :
525
- width , height = image .size
526
- images_meta .append (dict (file_name = path .name , id = int (path .stem ), width = width , height = height ))
527
-
528
- make_zip (root , f"{ name } .zip" )
529
-
530
- return images_meta
531
-
532
539
@classmethod
533
540
def _make_annotations_json (
534
541
cls ,
@@ -596,16 +603,38 @@ def generate(
596
603
cls ,
597
604
root ,
598
605
* ,
606
+ split ,
599
607
year ,
600
608
num_samples ,
601
609
):
602
610
annotations_dir = root / "annotations"
603
611
annotations_dir .mkdir ()
604
612
605
- for split in ("train" , "val" ):
606
- config_name = f"{ split } { year } "
613
+ for split_ in ("train" , "val" ):
614
+ config_name = f"{ split_ } { year } "
615
+
616
+ images_meta = [
617
+ dict (
618
+ file_name = f"{ idx :012d} .jpg" ,
619
+ id = idx ,
620
+ width = width ,
621
+ height = height ,
622
+ )
623
+ for idx , (height , width ) in enumerate (
624
+ torch .randint (3 , 11 , size = (num_samples , 2 ), dtype = torch .int ).tolist ()
625
+ )
626
+ ]
627
+
628
+ if split_ == split :
629
+ create_image_folder (
630
+ root ,
631
+ config_name ,
632
+ file_name_fn = lambda idx : images_meta [idx ]["file_name" ],
633
+ num_examples = num_samples ,
634
+ size = lambda idx : (3 , images_meta [idx ]["height" ], images_meta [idx ]["width" ]),
635
+ )
636
+ make_zip (root , f"{ config_name } .zip" )
607
637
608
- images_meta = cls ._make_images_archive (root , config_name , num_samples = num_samples )
609
638
cls ._make_annotations (
610
639
annotations_dir ,
611
640
config_name ,
@@ -625,7 +654,7 @@ def generate(
625
654
)
626
655
)
627
656
def coco(root, config):
    """Generate the COCO mock data in ``root`` for the split and year selected by ``config``."""
    generate_kwargs = dict(split=config["split"], year=config["year"], num_samples=5)
    return CocoMockData.generate(root, **generate_kwargs)
629
658
630
659
631
660
class SBDMockData :
@@ -799,8 +828,11 @@ def add_bndbox(obj):
799
828
def generate (cls , root , * , year , trainval ):
800
829
archive_folder = root
801
830
if year == "2011" :
802
- archive_folder /= "TrainVal"
803
- data_folder = archive_folder / "VOCdevkit" / f"VOC{ year } "
831
+ archive_folder = root / "TrainVal"
832
+ data_folder = archive_folder / "VOCdevkit"
833
+ else :
834
+ archive_folder = data_folder = root / "VOCdevkit"
835
+ data_folder = data_folder / f"VOC{ year } "
804
836
data_folder .mkdir (parents = True , exist_ok = True )
805
837
806
838
ids , num_samples_map = cls ._make_split_files (data_folder , year = year , trainval = trainval )
@@ -810,7 +842,7 @@ def generate(cls, root, *, year, trainval):
810
842
(cls ._make_detection_anns_folder , "Annotations" , ".xml" ),
811
843
]:
812
844
make_folder_fn (data_folder , name , file_name_fn = lambda idx : ids [idx ] + suffix , num_examples = len (ids ))
813
- make_tar (root , (cls ._TRAIN_VAL_FILE_NAMES if trainval else cls ._TEST_FILE_NAMES )[year ], data_folder )
845
+ make_tar (root , (cls ._TRAIN_VAL_FILE_NAMES if trainval else cls ._TEST_FILE_NAMES )[year ], archive_folder )
814
846
815
847
return num_samples_map
816
848
@@ -1091,8 +1123,10 @@ def _make_ann_file(path, num_examples, class_idx):
1091
1123
}
1092
1124
)
1093
1125
1126
+ archive_folder = root / "GTSRB"
1127
+
1094
1128
if config ["split" ] == "train" :
1095
- train_folder = root / "GTSRB" / "Training"
1129
+ train_folder = archive_folder / "Training"
1096
1130
train_folder .mkdir (parents = True )
1097
1131
1098
1132
for class_idx in classes :
@@ -1107,9 +1141,9 @@ def _make_ann_file(path, num_examples, class_idx):
1107
1141
num_examples = num_examples_per_class ,
1108
1142
class_idx = int (class_idx ),
1109
1143
)
1110
- make_zip (root , "GTSRB-Training_fixed.zip" , train_folder )
1144
+ make_zip (root , "GTSRB-Training_fixed.zip" , archive_folder )
1111
1145
else :
1112
- test_folder = root / "GTSRB" / "Final_Test"
1146
+ test_folder = archive_folder / "Final_Test"
1113
1147
test_folder .mkdir (parents = True )
1114
1148
1115
1149
create_image_folder (
@@ -1119,7 +1153,7 @@ def _make_ann_file(path, num_examples, class_idx):
1119
1153
num_examples = num_examples ,
1120
1154
)
1121
1155
1122
- make_zip (root , "GTSRB_Final_Test_Images.zip" , test_folder )
1156
+ make_zip (root , "GTSRB_Final_Test_Images.zip" , archive_folder )
1123
1157
1124
1158
_make_ann_file (
1125
1159
path = root / "GT-final_test.csv" ,
@@ -1484,11 +1518,10 @@ def stanford_cars(root, config):
1484
1518
num_samples = {"train" : 5 , "test" : 7 }[split ]
1485
1519
num_categories = 3
1486
1520
1487
- devkit = root / "devkit"
1488
- devkit .mkdir (parents = True )
1489
-
1490
1521
if split == "train" :
1491
1522
images_folder_name = "cars_train"
1523
+ devkit = root / "devkit"
1524
+ devkit .mkdir ()
1492
1525
annotations_mat_path = devkit / "cars_train_annos.mat"
1493
1526
else :
1494
1527
images_folder_name = "cars_test"
0 commit comments