
Commit 39b2441

Merge branch 'main' into rendered-sst2-dataset

2 parents e6c95ad + f670152

10 files changed: +82 -26 lines

docs/source/models.rst

Lines changed: 2 additions & 0 deletions

@@ -88,6 +88,7 @@ You can construct a model with random weights by calling its constructor:
     vit_b_32 = models.vit_b_32()
     vit_l_16 = models.vit_l_16()
     vit_l_32 = models.vit_l_32()
+    vit_h_14 = models.vit_h_14()

 We provide pre-trained models, using the PyTorch :mod:`torch.utils.model_zoo`.
 These can be constructed by passing ``pretrained=True``:

@@ -460,6 +461,7 @@ VisionTransformer
     vit_b_32
     vit_l_16
     vit_l_32
+    vit_h_14

 Quantized Models
 ----------------
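As the updated docs note, the new variant is constructed exactly like the other ViT models. A minimal sketch of the documented usage (assuming a torchvision build that includes this commit):

    from torchvision import models

    # Randomly initialized weights; nothing is downloaded because
    # pretrained defaults to False and no checkpoint exists for this model.
    vit_h_14 = models.vit_h_14()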

hubconf.py

Lines changed: 1 addition & 0 deletions

@@ -63,4 +63,5 @@
     vit_b_32,
     vit_l_16,
     vit_l_32,
+    vit_h_14,
 )
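Because `vit_h_14` is now exported from `hubconf.py`, it should also be reachable through `torch.hub`. A hedged sketch; which branch or release carries this commit is an assumption, so a ref such as `pytorch/vision:main` may be needed:

    import torch

    # Loads the architecture definition via torch.hub; weights are random
    # since vit_h_14 has no pretrained checkpoint at this commit.
    model = torch.hub.load("pytorch/vision", "vit_h_14")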

test/common_utils.py

Lines changed: 3 additions & 18 deletions

@@ -4,29 +4,18 @@
 import random
 import shutil
 import tempfile
-from distutils.util import strtobool

 import numpy as np
-import pytest
 import torch
 from PIL import Image
 from torchvision import io

 import __main__  # noqa: 401


-def get_bool_env_var(name, *, exist_ok=False, default=False):
-    value = os.getenv(name)
-    if value is None:
-        return default
-    if exist_ok:
-        return True
-    return bool(strtobool(value))
-
-
-IN_CIRCLE_CI = get_bool_env_var("CIRCLECI")
-IN_RE_WORKER = get_bool_env_var("INSIDE_RE_WORKER", exist_ok=True)
-IN_FBCODE = get_bool_env_var("IN_FBCODE_TORCHVISION")
+IN_CIRCLE_CI = os.getenv("CIRCLECI", False) == "true"
+IN_RE_WORKER = os.environ.get("INSIDE_RE_WORKER") is not None
+IN_FBCODE = os.environ.get("IN_FBCODE_TORCHVISION") == "1"
 CUDA_NOT_AVAILABLE_MSG = "CUDA device not available"
 CIRCLECI_GPU_NO_CUDA_MSG = "We're in a CircleCI GPU machine, and this test doesn't need cuda."

@@ -213,7 +202,3 @@ def _test_fn_on_batch(batch_tensors, fn, scripted_fn_atol=1e-8, **fn_kwargs):
     # scriptable function test
     s_transformed_batch = scripted_fn(batch_tensors, **fn_kwargs)
     torch.testing.assert_close(transformed_batch, s_transformed_batch, rtol=1e-5, atol=scripted_fn_atol)
-
-
-def run_on_env_var(name, *, skip_reason=None, exist_ok=False, default=False):
-    return pytest.mark.skipif(not get_bool_env_var(name, exist_ok=exist_ok, default=default), reason=skip_reason)
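The three flags replace the removed `get_bool_env_var` helper with plain string checks, and their semantics differ slightly: `IN_CIRCLE_CI` matches only the exact string "true", `IN_RE_WORKER` is satisfied by the variable merely being set, and `IN_FBCODE` matches only "1". An illustrative sketch (the values below are hypothetical):

    import os

    os.environ["CIRCLECI"] = "true"            # IN_CIRCLE_CI -> True; "True" or "1" would not match
    os.environ["INSIDE_RE_WORKER"] = ""        # IN_RE_WORKER -> True; mere presence counts
    os.environ["IN_FBCODE_TORCHVISION"] = "1"  # IN_FBCODE -> True; only the exact string "1" does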
Binary file changed (939 Bytes); not shown.

test/test_prototype_models.py

Lines changed: 5 additions & 4 deletions

@@ -1,16 +1,17 @@
 import importlib
+import os

 import pytest
 import test_models as TM
 import torch
-from common_utils import cpu_and_gpu, run_on_env_var, needs_cuda
+from common_utils import cpu_and_gpu, needs_cuda
 from torchvision.prototype import models
 from torchvision.prototype.models._api import WeightsEnum, Weights
 from torchvision.prototype.models._utils import handle_legacy_interface

-run_if_test_with_prototype = run_on_env_var(
-    "PYTORCH_TEST_WITH_PROTOTYPE",
-    skip_reason="Prototype tests are disabled by default. Set PYTORCH_TEST_WITH_PROTOTYPE=1 to run them.",
+run_if_test_with_prototype = pytest.mark.skipif(
+    os.getenv("PYTORCH_TEST_WITH_PROTOTYPE") != "1",
+    reason="Prototype tests are disabled by default. Set PYTORCH_TEST_WITH_PROTOTYPE=1 to run them.",
 )
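The resulting marker is an ordinary `pytest.mark.skipif` object and is applied to tests as a decorator. A minimal sketch (the test body here is hypothetical):

    import os

    import pytest

    run_if_test_with_prototype = pytest.mark.skipif(
        os.getenv("PYTORCH_TEST_WITH_PROTOTYPE") != "1",
        reason="Prototype tests are disabled by default. Set PYTORCH_TEST_WITH_PROTOTYPE=1 to run them.",
    )

    @run_if_test_with_prototype
    def test_vit_h_14_prototype():
        ...  # runs only when PYTORCH_TEST_WITH_PROTOTYPE=1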

test/test_prototype_utils.py

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+import pytest
+from torchvision.prototype.utils._internal import sequence_to_str
+
+
+@pytest.mark.parametrize(
+    ("seq", "separate_last", "expected"),
+    [
+        ([], "", ""),
+        (["foo"], "", "'foo'"),
+        (["foo", "bar"], "", "'foo', 'bar'"),
+        (["foo", "bar"], "and ", "'foo' and 'bar'"),
+        (["foo", "bar", "baz"], "", "'foo', 'bar', 'baz'"),
+        (["foo", "bar", "baz"], "and ", "'foo', 'bar', and 'baz'"),
+    ],
+)
+def test_sequence_to_str(seq, separate_last, expected):
+    assert sequence_to_str(seq, separate_last=separate_last) == expected

torchvision/models/vision_transformer.py

Lines changed: 23 additions & 0 deletions

@@ -15,6 +15,7 @@
     "vit_b_32",
     "vit_l_16",
     "vit_l_32",
+    "vit_h_14",
 ]

 model_urls = {

@@ -260,6 +261,8 @@ def _vision_transformer(
     )

     if pretrained:
+        if arch not in model_urls:
+            raise ValueError(f"No checkpoint is available for model type '{arch}'!")
         state_dict = load_state_dict_from_url(model_urls[arch], progress=progress)
         model.load_state_dict(state_dict)

@@ -354,6 +357,26 @@ def vit_l_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) ->
     )


+def vit_h_14(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer:
+    """
+    Constructs a vit_h_14 architecture from
+    `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" <https://arxiv.org/abs/2010.11929>`_.
+
+    NOTE: Pretrained weights are not available for this model.
+    """
+    return _vision_transformer(
+        arch="vit_h_14",
+        patch_size=14,
+        num_layers=32,
+        num_heads=16,
+        hidden_dim=1280,
+        mlp_dim=5120,
+        pretrained=pretrained,
+        progress=progress,
+        **kwargs,
+    )
+
+
 def interpolate_embeddings(
     image_size: int,
     patch_size: int,
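Since `vit_h_14` has no entry in `model_urls`, the new guard in `_vision_transformer` turns `pretrained=True` into an explicit `ValueError` rather than a bare `KeyError`. A sketch of both paths:

    from torchvision import models

    model = models.vit_h_14()  # fine: randomly initialized weights

    try:
        models.vit_h_14(pretrained=True)
    except ValueError as e:
        print(e)  # No checkpoint is available for model type 'vit_h_14'!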

torchvision/prototype/models/_api.py

Lines changed: 2 additions & 3 deletions

@@ -60,9 +60,8 @@ def verify(cls, obj: Any) -> Any:

     @classmethod
     def from_str(cls, value: str) -> "WeightsEnum":
-        for k, v in cls.__members__.items():
-            if k == value:
-                return v
+        if value in cls.__members__:
+            return cls.__members__[value]
         raise ValueError(f"Invalid value {value} for enum {cls.__name__}.")

     def get_state_dict(self, progress: bool) -> OrderedDict:
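The rewritten `from_str` is a direct, case-sensitive lookup in `cls.__members__` with unchanged behavior. For example, using a member name that appears elsewhere in this diff (assuming the prototype namespace re-exports the weights enums, as the `__all__` below suggests):

    from torchvision.prototype.models import ViT_L_32_Weights

    # Returns the enum member; an unknown name still raises ValueError.
    weights = ViT_L_32_Weights.from_str("ImageNet1K_V1")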

torchvision/prototype/models/vision_transformer.py

Lines changed: 23 additions & 0 deletions

@@ -19,10 +19,12 @@
     "ViT_B_32_Weights",
     "ViT_L_16_Weights",
     "ViT_L_32_Weights",
+    "ViT_H_14_Weights",
     "vit_b_16",
     "vit_b_32",
     "vit_l_16",
     "vit_l_32",
+    "vit_h_14",
 ]


@@ -99,6 +101,11 @@ class ViT_L_32_Weights(WeightsEnum):
     default = ImageNet1K_V1


+class ViT_H_14_Weights(WeightsEnum):
+    # Weights are not available yet.
+    pass
+
+
 def _vision_transformer(
     patch_size: int,
     num_layers: int,

@@ -192,3 +199,19 @@ def vit_l_32(*, weights: Optional[ViT_L_32_Weights] = None, progress: bool = Tru
         progress=progress,
         **kwargs,
     )
+
+
+@handle_legacy_interface(weights=("pretrained", None))
+def vit_h_14(*, weights: Optional[ViT_H_14_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
+    weights = ViT_H_14_Weights.verify(weights)
+
+    return _vision_transformer(
+        patch_size=14,
+        num_layers=32,
+        num_heads=16,
+        hidden_dim=1280,
+        mlp_dim=5120,
+        weights=weights,
+        progress=progress,
+        **kwargs,
+    )
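With `ViT_H_14_Weights` still empty, `weights=None` is the only valid argument for the prototype builder, and the `@handle_legacy_interface` decorator keeps legacy `pretrained`-style calls working. A minimal sketch:

    from torchvision.prototype import models

    # weights=None until ViT_H_14_Weights gains members.
    model = models.vit_h_14(weights=None)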

torchvision/prototype/utils/_internal.py

Lines changed: 6 additions & 1 deletion

@@ -30,10 +30,15 @@ class StrEnum(enum.Enum, metaclass=StrEnumMeta):


 def sequence_to_str(seq: Sequence, separate_last: str = "") -> str:
+    if not seq:
+        return ""
     if len(seq) == 1:
         return f"'{seq[0]}'"

-    return f"""'{"', '".join([str(item) for item in seq[:-1]])}', {separate_last}'{seq[-1]}'"""
+    head = "'" + "', '".join([str(item) for item in seq[:-1]]) + "'"
+    tail = f"{'' if separate_last and len(seq) == 2 else ','} {separate_last}'{seq[-1]}'"
+
+    return head + tail


 def add_suggestion(
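The rewritten `sequence_to_str` now handles the empty sequence and drops the serial comma for two-item lists when a `separate_last` connective is given; the outputs below match the parametrization in the new test:

    from torchvision.prototype.utils._internal import sequence_to_str

    sequence_to_str([])                                           # ''
    sequence_to_str(["foo", "bar"], separate_last="and ")          # "'foo' and 'bar'"
    sequence_to_str(["foo", "bar", "baz"], separate_last="and ")   # "'foo', 'bar', and 'baz'"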
