fix prototype resource loading (#5447)

pmeier · web-flow · commit c530b623a912 · 2022-02-21T11:14:56.000+01:00
* fix prototype resource loading

* revert unrelated change
diff --git a/torchvision/prototype/datasets/utils/_resource.py b/torchvision/prototype/datasets/utils/_resource.py
@@ -88,20 +88,30 @@ def load(
         root = pathlib.Path(root)
         path = root / self.file_name
         # Instead of the raw file, there might also be files with fewer suffixes after decompression or directories
-        # with no suffixes at all. Thus, we look for all paths that share the same name without suffixes as the raw
-        # file.
-        path_candidates = {file for file in path.parent.glob(path.name.replace("".join(path.suffixes), "") + "*")}
-        # If we don't find anything, we try to download the raw file.
-        if not path_candidates:
-            path_candidates = {self.download(root, skip_integrity_check=skip_integrity_check)}
+        # with no suffixes at all.
+        stem = path.name.replace("".join(path.suffixes), "")
+
+        # As a first step, we check for a folder with the same stem as the raw file. If it exists, we use it since
+        # extracted files give the best I/O performance. Note that OnlineResource._extract() makes sure that an archive
+        # is always extracted into a folder with the corresponding file name.
+        folder_candidate = path.parent / stem
+        if folder_candidate.exists() and folder_candidate.is_dir():
+            return self._loader(folder_candidate)
+
+        # If there is no folder, we look for all files that share the same stem as the raw file, but might have a
+        # different suffix.
+        file_candidates = {file for file in path.parent.glob(stem + ".*")}
+        # If we don't find anything, we download the raw file.
+        if not file_candidates:
+            file_candidates = {self.download(root, skip_integrity_check=skip_integrity_check)}
         # If the only thing we find is the raw file, we use it and optionally perform some preprocessing steps.
-        if path_candidates == {path}:
+        if file_candidates == {path}:
             if self._preprocess is not None:
                 path = self._preprocess(path)
-        # Otherwise we use the path with the fewest suffixes. This gives us the extracted > decompressed > raw priority
-        # that we want.
+        # Otherwise, we use the path with the fewest suffixes. This gives us the decompressed > raw priority that we
+        # want for the best I/O performance.
         else:
-            path = min(path_candidates, key=lambda path: len(path.suffixes))
+            path = min(file_candidates, key=lambda path: len(path.suffixes))
         return self._loader(path)
 
     @abc.abstractmethod