Add GTSRB dataset to prototypes #5214

Merged
merged 44 commits on Jan 24, 2022

Commits (44)
237a707
Change default of download for Food101 and DTD
NicolasHug Jan 5, 2022
bc3be4e
WIP
NicolasHug Jan 7, 2022
85ca229
Merge branch 'main' of github.com:pytorch/vision into defaultdownload
NicolasHug Jan 18, 2022
87695d4
Set download default to False and put it at the end
NicolasHug Jan 18, 2022
1e6e37d
Keep stuff private
NicolasHug Jan 18, 2022
474546f
GTSRB: train -> split. Also use pathlib
NicolasHug Jan 18, 2022
a38a18b
mypy
NicolasHug Jan 18, 2022
d58ef16
Remove split and partition for SUN397
NicolasHug Jan 18, 2022
5061141
mypy
NicolasHug Jan 18, 2022
6c02cff
mypy
NicolasHug Jan 18, 2022
d3cb34f
Merge branch 'main' of github.com:pytorch/vision into gtsrb_prototype
NicolasHug Jan 18, 2022
d288c6c
Merge branch 'defaultdownload' into gtsrb_prototype
NicolasHug Jan 18, 2022
521b75c
WIP
NicolasHug Jan 18, 2022
1c1ceb0
WIP
NicolasHug Jan 19, 2022
1b2ee27
Merge branch 'main' of github.com:pytorch/vision into gtsrb_prototype
NicolasHug Jan 19, 2022
4fdb976
WIP
NicolasHug Jan 19, 2022
a6ae4c4
Add tests
NicolasHug Jan 19, 2022
761e5d7
Add some types
NicolasHug Jan 19, 2022
1dd6efe
lmao mypy you funny lad
NicolasHug Jan 19, 2022
a32ab88
fix unpacking
NicolasHug Jan 19, 2022
862187a
Merge branch 'main' of github.com:pytorch/vision into gtsrb_prototype
NicolasHug Jan 19, 2022
e487828
Use DictWriter
NicolasHug Jan 19, 2022
8f15cc3
Hardcode categories since they are just ints in [0, 42]
NicolasHug Jan 19, 2022
9ac22d3
Split URL root
NicolasHug Jan 19, 2022
1f1fa35
Use name instead of stem
NicolasHug Jan 19, 2022
f25a83a
Add category to labels, and fix dict reading
NicolasHug Jan 19, 2022
52ec648
Use path_comparator
NicolasHug Jan 19, 2022
379876f
Use buffer_size=1
NicolasHug Jan 19, 2022
632c212
Merge branch 'main' of github.com:pytorch/vision into gtsrb_prototype
NicolasHug Jan 20, 2022
0d6b58d
Merge branch 'main' of github.com:pytorch/vision into gtsrb_prototype
NicolasHug Jan 20, 2022
e26b456
Use Zipper instead of IterKeyZipper
NicolasHug Jan 20, 2022
b958b6b
mypy
NicolasHug Jan 20, 2022
06c0904
Some more instructions
NicolasHug Jan 20, 2022
18b87e2
forgot backquotes
NicolasHug Jan 20, 2022
44bb8f1
Apply suggestions from code review
NicolasHug Jan 21, 2022
c1ec16d
gt -> ground_truth
NicolasHug Jan 21, 2022
ff78c70
e -> sample
NicolasHug Jan 21, 2022
cd38e25
Add support for bboxes
NicolasHug Jan 21, 2022
1e8aea6
Update torchvision/prototype/datasets/_builtin/gtsrb.py
NicolasHug Jan 21, 2022
8e9a617
format
NicolasHug Jan 21, 2022
6703710
Remove unused method
NicolasHug Jan 21, 2022
6b67ce7
Add test for label matching
NicolasHug Jan 21, 2022
1ef84e0
Update test/test_prototype_builtin_datasets.py
NicolasHug Jan 24, 2022
8283332
Merge branch 'main' into gtsrb_prototype
NicolasHug Jan 24, 2022
70 changes: 70 additions & 0 deletions test/builtin_dataset_mocks.py
@@ -1017,6 +1017,76 @@ def fer2013(info, root, config):
return num_samples


@DATASET_MOCKS.set_from_named_callable
def gtsrb(info, root, config):
num_examples_per_class = 5 if config.split == "train" else 3
classes = ("00000", "00042", "00012")
num_examples = num_examples_per_class * len(classes)

csv_columns = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"]

def _make_ann_file(path, num_examples, class_idx):
if class_idx == "random":
class_idx = torch.randint(1, len(classes) + 1, size=(1,)).item()

with open(path, "w") as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=csv_columns, delimiter=";")
writer.writeheader()
for image_idx in range(num_examples):
writer.writerow(
{
"Filename": f"{image_idx:05d}.ppm",
"Width": torch.randint(1, 100, size=()).item(),
"Height": torch.randint(1, 100, size=()).item(),
"Roi.X1": torch.randint(1, 100, size=()).item(),
"Roi.Y1": torch.randint(1, 100, size=()).item(),
"Roi.X2": torch.randint(1, 100, size=()).item(),
"Roi.Y2": torch.randint(1, 100, size=()).item(),
"ClassId": class_idx,
}
)

if config["split"] == "train":
train_folder = root / "GTSRB" / "Training"
train_folder.mkdir(parents=True)

for class_idx in classes:
create_image_folder(
train_folder,
name=class_idx,
file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm",
num_examples=num_examples_per_class,
)
_make_ann_file(
path=train_folder / class_idx / f"GT-{class_idx}.csv",
num_examples=num_examples_per_class,
class_idx=int(class_idx),
)
make_zip(root, "GTSRB-Training_fixed.zip", train_folder)
else:
test_folder = root / "GTSRB" / "Final_Test"
test_folder.mkdir(parents=True)

create_image_folder(
test_folder,
name="Images",
file_name_fn=lambda image_idx: f"{image_idx:05d}.ppm",
num_examples=num_examples,
)

make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder)

_make_ann_file(
path=root / "GT-final_test.csv",
num_examples=num_examples,
class_idx="random",
)

make_zip(root, "GTSRB_Final_Test_GT.zip", "GT-final_test.csv")

return num_examples


@DATASET_MOCKS.set_from_named_callable
def clevr(info, root, config):
data_folder = root / "CLEVR_v1.0"
2 changes: 1 addition & 1 deletion test/datasets_utils.py
@@ -881,7 +881,7 @@ def _make_archive(root, name, *files_or_dirs, opener, adder, remove=True):
files, dirs = _split_files_or_dirs(root, *files_or_dirs)

with opener(archive) as fh:
for file in files:
for file in sorted(files):
NicolasHug (Member Author):
@pmeier LMK what you think of this.

Down below I set buffer_size=1 in the Zipper, because in the original .zip files, both the image_dp and the gt_dp are fully aligned: they both contain images 00001, 00002, etc., in that order. So I'm assuming that buffer_size=1 is better than buffer_size=UNLIMITED?

Without this call to sorted(), the tests would fail: the .zip archive created by make_zip would contain the files in a shuffled order (because files is a set), so image_dp and gt_dp would not be aligned anymore, leading to a failure to match keys in the Zipper. (Note: this is only a problem in the tests; the code works fine in my custom script iterating over the dataset.)

I hope this won't make other tests fail. It might not be a problem we have right now, but it's something to keep in mind for the future: we might need the test archives to exactly match the file order of the "original" archives.
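
For illustration, here is a minimal, self-contained sketch (hypothetical data, not the actual datapipes) of why the file order inside the archive matters when the two streams are zipped positionally:

# Two streams that mimic image_dp and the ground-truth csv rows.
# They only pair up correctly if both are emitted in the same order.
image_names = ["00001.ppm", "00002.ppm", "00003.ppm"]
gt_rows = [("00001.ppm", 4), ("00002.ppm", 12), ("00003.ppm", 42)]

# Positional zipping (what Zipper does) assumes the two sides line up:
for name, (gt_name, label) in zip(image_names, gt_rows):
    assert name == gt_name  # holds only because both sides share the same sorted order

# If the test archive is written from an unordered set, the image side can come out
# shuffled and the pairs no longer match; hence the sorted() call in _make_archive.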

pmeier (Collaborator), Jan 19, 2022:
Good point, I didn't think of that. Do you know whether the order of the paths returned by pathlib.Path.glob() is stable? If so, we could simply replace the sets in _split_files_or_dirs with lists instead of sorting here.

NicolasHug (Member Author):
I'm not sure about Path.glob(). I know that glob.glob has no guaranteed order, but I don't think Path.glob() relies on it. Maybe the safest is not to assume any specific order.

BTW, slightly related: what was the reason for using sets instead of lists?

pmeier (Collaborator):

"what was the reason for using sets instead of lists?"

I think the reason was to avoid duplicates, but I don't remember whether there was a case where I actually hit that.

pmeier (Collaborator):

Have you tried using lists rather than sorting afterwards? If CI is not complaining for other datasets, I feel like that would be the better approach.

NicolasHug (Member Author):

I just saw this call to remove(), which might be the reason for using sets:

if root in dirs:
    dirs.remove(root)

I can still switch to lists if you'd like; I guess I would have to write something like

dirs = [dir for dir in dirs if dir != root]

LMK which one you prefer
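
For illustration, a minimal, self-contained sketch of the list-based variant discussed above (the helper name and the toy paths are made up and are not part of _split_files_or_dirs):

from pathlib import Path

def dedupe_keep_order(paths):
    # dict.fromkeys keeps the first occurrence of each path and preserves insertion order
    return list(dict.fromkeys(paths))

# Toy example: the archive root shows up twice among the collected dirs.
root = Path("archive_root")
dirs = [root, root / "GTSRB", root / "GTSRB" / "Training", root]

dirs = dedupe_keep_order(dirs)
dirs = [d for d in dirs if d != root]  # list-comprehension replacement for dirs.remove(root)
# dirs is now [archive_root/GTSRB, archive_root/GTSRB/Training], with the root dropped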

adder(fh, file, file.relative_to(root))

if remove:
19 changes: 18 additions & 1 deletion test/test_prototype_builtin_datasets.py
@@ -1,4 +1,5 @@
import io
from pathlib import Path

import pytest
import torch
@@ -123,7 +124,7 @@ def scan(graph):
if type(dp) is annotation_dp_type:
break
else:
raise AssertionError(f"The dataset doesn't comprise a {annotation_dp_type.__name__}() datapipe.")
raise AssertionError(f"The dataset doesn't contain a {annotation_dp_type.__name__}() datapipe.")


@parametrize_dataset_mocks(DATASET_MOCKS["qmnist"])
@@ -143,3 +144,19 @@ def test_extra_label(self, dataset_mock, config):
("unused", bool),
):
assert key in sample and isinstance(sample[key], type)


@parametrize_dataset_mocks(DATASET_MOCKS["gtsrb"])
class TestGTSRB:
def test_label_matches_path(self, dataset_mock, config):
# We read the labels from the csv files instead. But for the trainset, the labels are also part of the path.
# This test makes sure that they're both the same
if config.split != "train":
return

with dataset_mock.prepare(config):
dataset = datasets.load(dataset_mock.name, **config)

for sample in dataset:
label_from_path = int(Path(sample["image_path"]).parent.name)
assert sample["label"] == label_from_path