From 50bfbe640ec1834e013295df2527c5c66d99b12f Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Mon, 1 Apr 2019 23:12:25 -0700
Subject: [PATCH 01/36] Add initial mnasnet impl

---
 torchvision/models/mnasnet.py | 167 ++++++++++++++++++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100644 torchvision/models/mnasnet.py

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
new file mode 100644
index 00000000000..9d6b7f714a7
--- /dev/null
+++ b/torchvision/models/mnasnet.py
@@ -0,0 +1,167 @@
+import math
+
+import torch
+import torch.nn as nn
+
+# Paper suggests 0.9997 momentum, for TensFlow. Equivalent PyTorch
+# momentum is 1.0 - tensorflow.
+_BN_MOMENTUM = 1 - 0.9997
+
+class _InvertedResidual(nn.Module):
+
+    def __init__(self, in_ch: int, out_ch: int, kernel_size: int, stride: int,
+                 expansion_factor: int, bn_momentum: float = 0.1) -> None:
+        super().__init__()
+        assert stride in [1, 2]
+        assert kernel_size in [3, 5]
+        mid_ch = in_ch * expansion_factor
+        self.apply_residual = (in_ch == out_ch and stride == 1)
+        self.layers = nn.Sequential(
+            # Pointwise
+            nn.Conv2d(in_ch, mid_ch, 1, bias=False),
+            nn.BatchNorm2d(mid_ch, momentum=bn_momentum),
+            nn.ReLU(inplace=True),
+            # Depthwise
+            nn.Conv2d(mid_ch, mid_ch, kernel_size, padding=kernel_size // 2,
+                      stride=stride, groups=mid_ch, bias=False),
+            nn.BatchNorm2d(mid_ch, momentum=bn_momentum),
+            nn.ReLU(inplace=True),
+            # Linear pointwise. Note that there's no activation.
+            nn.Conv2d(mid_ch, out_ch, 1, bias=False),
+            nn.BatchNorm2d(out_ch, momentum=bn_momentum))
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        if self.apply_residual:
+            return self.layers.forward(input) + input
+        else:
+            return self.layers.forward(input)
+
+
+def _stack(in_ch: int, out_ch: int, kernel_size: int, stride: int,
+          exp_factor: int, repeats: int, bn_momentum: float) -> nn.Sequential:
+    """ Creates a stack of inverted residuals as seen in e.g. MobileNetV2 or
+    MNasNet. """
+    assert repeats >= 1
+    # First one has no skip, because feature map size changes.
+    first = InvertedResidual(in_ch, out_ch, kernel_size, stride, exp_factor,
+                             bn_momentum=bn_momentum)
+    remaining = []
+    for _ in range(1, repeats):
+        remaining.append(
+            InvertedResidual(out_ch, out_ch, kernel_size, 1, exp_factor,
+                             bn_momentum=bn_momentum))
+    return nn.Sequential(first, *remaining)
+
+
+def _round_to_multiple_of(val: float, divisor: int,
+                          round_up_bias: float = 0.9) -> int:
+    """ Asymmetric rounding to make `val` divisible by `divisor`. With default
+    bias, will round up, unless the number is no more than 10% greater than the
+    smaller divisible value, i.e. (83, 8) -> 80, but (84, 8) -> 88. """
+    assert 0.0 < round_up_bias < 1.0
+    new_val = max(divisor, int(val + divisor / 2) // divisor * divisor)
+    return new_val if new_val >= round_up_bias * val else new_val + divisor
+
+
+def _scale_depths(depths: List[int], alpha: float) -> List[int]:
+    """ Scales tensor depths as in reference MobileNet code, prefers rouding up
+    rather than down. """
+    return [_round_to_multiple_of(depth * alpha, 8) for depth in depths]
+
+
+class MNasNet(torch.nn.Module):
+    """ MNasNet, as described in https://arxiv.org/pdf/1807.11626.pdf.
+    >>> model = MNasNet(1000, 1.0)
+    >>> x = torch.rand(1, 3, 224, 224)
+    >>> y = model.forward(x)
+    >>> y.dim()
+    1
+    >>> y.nelement()
+    1000
+    """
+
+    def __init__(self, num_classes: int, alpha: float, dropout:float=0.2) -> None:
+        super().__init__()
+        self.alpha = alpha
+        self.num_classes = num_classes
+        depths = _scale_depths([24, 40, 80, 96, 192, 320], alpha)
+        layers = [
+            # First layer: regular conv.
+            nn.Conv2d(3, 32, 3, padding=1, stride=2, bias=False),
+            nn.BatchNorm2d(32, momentum=_BN_MOMENTUM),
+            nn.ReLU(inplace=True),
+            # Depthwise separable, no skip.
+            nn.Conv2d(32, 32, 3, padding=1, stride=1, groups=32, bias=False),
+            nn.BatchNorm2d(32, momentum=_BN_MOMENTUM),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(32, 16, 1, padding=0, stride=1, bias=False),
+            nn.BatchNorm2d(16, momentum=_BN_MOMENTUM),
+            # MNasNet blocks: stacks of inverted residuals.
+            _stack(16, depths[0], 3, 2, 3, 3, _BN_MOMENTUM),
+            _stack(depths[0], depths[1], 5, 2, 3, 3, _BN_MOMENTUM),
+            _stack(depths[1], depths[2], 5, 2, 6, 3, _BN_MOMENTUM),
+            _stack(depths[2], depths[3], 3, 1, 6, 2, _BN_MOMENTUM),
+            _stack(depths[3], depths[4], 5, 2, 6, 4, _BN_MOMENTUM),
+            _stack(depths[4], depths[5], 3, 1, 6, 1, _BN_MOMENTUM),
+            # Final mapping to classifier input.
+            nn.Conv2d(depths[5], 1280, 1, padding=0, stride=1, bias=False),
+            nn.BatchNorm2d(1280, momentum=_BN_MOMENTUM),
+            nn.ReLU(inplace=True),
+            nn.AdaptiveAvgPool2d(1)
+        ]
+        self.layers = nn.Sequential(*layers)
+        if dropout > 0.0:
+            self.classifier = nn.Sequential(
+                nn.Dropout(inplace=True, p=0.2), nn.Linear(1280, self.num_classes))
+        else:
+            self.classifier = nn.Linear(1280, self.num_classes)
+
+        self._initialize_weights()
+
+    def features(self, x):
+        return self.layers.forward(x).squeeze()
+
+    def forward(self, x):
+        return self.classifier(self.features(x))
+
+    def _initialize_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2.0 / n))
+                if m.bias is not None:
+                    m.bias.data.zero_()
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1.0)
+                m.bias.data.zero_()
+            elif isinstance(m, nn.Linear):
+                n = m.weight.size(1)
+                m.weight.data.normal_(0, 0.01)
+                m.bias.data.zero_()
+
+class MNasNet0_5(MNasNet):
+    """ MNasNet with depth multiplier of 0.5. """
+
+    def __init__(self, num_classes: int) -> None:
+        super().__init__(num_classes, 0.5)
+
+class MNasNet0_75(MNasNet):
+    """ MNasNet with depth multiplier of 0.75. """
+
+    def __init__(self, num_classes: int) -> None:
+        super().__init__(num_classes, 0.75)
+
+class MNasNet1_0(MNasNet):
+    """ MNasNet with depth multiplier of 1.0. """
+
+    def __init__(self, num_classes: int) -> None:
+        super().__init__(num_classes, 1.0)
+
+
+class MNasNet1_3(MNasNet):
+    """ MNasNet with depth multiplier of 1.3. """
+
+    def __init__(self, num_classes: int) -> None:
+        super().__init__(num_classes, 1.3)
+
+

From e1c55063be9ba6dff47f28588cde9739915d4613 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Mon, 1 Apr 2019 23:25:47 -0700
Subject: [PATCH 02/36] Remove all type hints, comply with PyTorch overall
 style

---
 torchvision/models/mnasnet.py | 77 +++++++++++++++++++----------------
 1 file changed, 42 insertions(+), 35 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 9d6b7f714a7..421016b82b8 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -3,14 +3,20 @@
 import torch
 import torch.nn as nn
 
-# Paper suggests 0.9997 momentum, for TensFlow. Equivalent PyTorch
-# momentum is 1.0 - tensorflow.
+
+__all__ = ['MNASNet', 'MNASNet0_5', 'MNASNet0_75', 'MNASNet1_0', 'MNASNet1_3']
+
+# Paper suggests 0.9997 momentum, for TensFlow. Equivalent PyTorch momentum is
+# 1.0 - tensorflow.
 _BN_MOMENTUM = 1 - 0.9997
 
+
 class _InvertedResidual(nn.Module):
+    """ Inverted residual block from MobileNetV2 and MNASNet papers. This can
+    be used to implement MobileNet V2, if ReLU is replaced with ReLU6. """
 
-    def __init__(self, in_ch: int, out_ch: int, kernel_size: int, stride: int,
-                 expansion_factor: int, bn_momentum: float = 0.1) -> None:
+    def __init__(self, in_ch, out_ch, kernel_size, stride, expansion_factor,
+                 bn_momentum=0.1):
         super().__init__()
         assert stride in [1, 2]
         assert kernel_size in [3, 5]
@@ -30,31 +36,30 @@ def __init__(self, in_ch: int, out_ch: int, kernel_size: int, stride: int,
             nn.Conv2d(mid_ch, out_ch, 1, bias=False),
             nn.BatchNorm2d(out_ch, momentum=bn_momentum))
 
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
+    def forward(self, input):
         if self.apply_residual:
             return self.layers.forward(input) + input
         else:
             return self.layers.forward(input)
 
 
-def _stack(in_ch: int, out_ch: int, kernel_size: int, stride: int,
-          exp_factor: int, repeats: int, bn_momentum: float) -> nn.Sequential:
+def _stack(in_ch, out_ch, kernel_size, stride, exp_factor, repeats,
+           bn_momentum):
     """ Creates a stack of inverted residuals as seen in e.g. MobileNetV2 or
-    MNasNet. """
+    MNASNet. """
     assert repeats >= 1
     # First one has no skip, because feature map size changes.
-    first = InvertedResidual(in_ch, out_ch, kernel_size, stride, exp_factor,
-                             bn_momentum=bn_momentum)
+    first = _InvertedResidual(in_ch, out_ch, kernel_size, stride, exp_factor,
+                              bn_momentum=bn_momentum)
     remaining = []
     for _ in range(1, repeats):
         remaining.append(
-            InvertedResidual(out_ch, out_ch, kernel_size, 1, exp_factor,
-                             bn_momentum=bn_momentum))
+            _InvertedResidual(out_ch, out_ch, kernel_size, 1, exp_factor,
+                              bn_momentum=bn_momentum))
     return nn.Sequential(first, *remaining)
 
 
-def _round_to_multiple_of(val: float, divisor: int,
-                          round_up_bias: float = 0.9) -> int:
+def _round_to_multiple_of(val, divisor, round_up_bias=0.9):
     """ Asymmetric rounding to make `val` divisible by `divisor`. With default
     bias, will round up, unless the number is no more than 10% greater than the
     smaller divisible value, i.e. (83, 8) -> 80, but (84, 8) -> 88. """
@@ -63,15 +68,15 @@ def _round_to_multiple_of(val: float, divisor: int,
     return new_val if new_val >= round_up_bias * val else new_val + divisor
 
 
-def _scale_depths(depths: List[int], alpha: float) -> List[int]:
+def _scale_depths(depths, alpha):
     """ Scales tensor depths as in reference MobileNet code, prefers rouding up
     rather than down. """
     return [_round_to_multiple_of(depth * alpha, 8) for depth in depths]
 
 
-class MNasNet(torch.nn.Module):
-    """ MNasNet, as described in https://arxiv.org/pdf/1807.11626.pdf.
-    >>> model = MNasNet(1000, 1.0)
+class MNASNet(torch.nn.Module):
+    """ MNASNet, as described in https://arxiv.org/pdf/1807.11626.pdf.
+    >>> model = MNASNet(1000, 1.0)
     >>> x = torch.rand(1, 3, 224, 224)
     >>> y = model.forward(x)
     >>> y.dim()
@@ -80,7 +85,7 @@ class MNasNet(torch.nn.Module):
     1000
     """
 
-    def __init__(self, num_classes: int, alpha: float, dropout:float=0.2) -> None:
+    def __init__(self, num_classes, alpha, dropout=0.2):
         super().__init__()
         self.alpha = alpha
         self.num_classes = num_classes
@@ -96,7 +101,7 @@ def __init__(self, num_classes: int, alpha: float, dropout:float=0.2) -> None:
             nn.ReLU(inplace=True),
             nn.Conv2d(32, 16, 1, padding=0, stride=1, bias=False),
             nn.BatchNorm2d(16, momentum=_BN_MOMENTUM),
-            # MNasNet blocks: stacks of inverted residuals.
+            # MNASNet blocks: stacks of inverted residuals.
             _stack(16, depths[0], 3, 2, 3, 3, _BN_MOMENTUM),
             _stack(depths[0], depths[1], 5, 2, 3, 3, _BN_MOMENTUM),
             _stack(depths[1], depths[2], 5, 2, 6, 3, _BN_MOMENTUM),
@@ -112,7 +117,8 @@ def __init__(self, num_classes: int, alpha: float, dropout:float=0.2) -> None:
         self.layers = nn.Sequential(*layers)
         if dropout > 0.0:
             self.classifier = nn.Sequential(
-                nn.Dropout(inplace=True, p=0.2), nn.Linear(1280, self.num_classes))
+                nn.Dropout(inplace=True, p=0.2),
+                nn.Linear(1280, self.num_classes))
         else:
             self.classifier = nn.Linear(1280, self.num_classes)
 
@@ -139,29 +145,30 @@ def _initialize_weights(self):
                 m.weight.data.normal_(0, 0.01)
                 m.bias.data.zero_()
 
-class MNasNet0_5(MNasNet):
-    """ MNasNet with depth multiplier of 0.5. """
 
-    def __init__(self, num_classes: int) -> None:
+class MNASNet0_5(MNASNet):
+    """ MNASNet with depth multiplier of 0.5. """
+
+    def __init__(self, num_classes):
         super().__init__(num_classes, 0.5)
 
-class MNasNet0_75(MNasNet):
-    """ MNasNet with depth multiplier of 0.75. """
 
-    def __init__(self, num_classes: int) -> None:
+class MNASNet0_75(MNASNet):
+    """ MNASNet with depth multiplier of 0.75. """
+
+    def __init__(self, num_classes):
         super().__init__(num_classes, 0.75)
 
-class MNasNet1_0(MNasNet):
-    """ MNasNet with depth multiplier of 1.0. """
 
-    def __init__(self, num_classes: int) -> None:
+class MNASNet1_0(MNASNet):
+    """ MNASNet with depth multiplier of 1.0. """
+
+    def __init__(self, num_classes):
         super().__init__(num_classes, 1.0)
 
 
-class MNasNet1_3(MNasNet):
-    """ MNasNet with depth multiplier of 1.3. """
+class MNASNet1_3(MNASNet):
+    """ MNASNet with depth multiplier of 1.3. """
 
-    def __init__(self, num_classes: int) -> None:
+    def __init__(self, num_classes):
         super().__init__(num_classes, 1.3)
-
-

From 0d77accb5fcd88be9823ad2ce37ffe02ca71191f Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Tue, 2 Apr 2019 01:03:23 -0700
Subject: [PATCH 03/36] Expose models

---
 torchvision/models/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torchvision/models/__init__.py b/torchvision/models/__init__.py
index 7437c51597f..839f45301ee 100644
--- a/torchvision/models/__init__.py
+++ b/torchvision/models/__init__.py
@@ -5,3 +5,4 @@
 from .inception import *
 from .densenet import *
 from .googlenet import *
+from .mnasnet import *

From c41aaab80fec8f2ddc67e3372db8e362e0aa270d Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Tue, 2 Apr 2019 01:36:45 -0700
Subject: [PATCH 04/36] Remove avgpool from features() and add separately

---
 torchvision/models/mnasnet.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 421016b82b8..413b71c26c3 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -112,9 +112,9 @@ def __init__(self, num_classes, alpha, dropout=0.2):
             nn.Conv2d(depths[5], 1280, 1, padding=0, stride=1, bias=False),
             nn.BatchNorm2d(1280, momentum=_BN_MOMENTUM),
             nn.ReLU(inplace=True),
-            nn.AdaptiveAvgPool2d(1)
         ]
         self.layers = nn.Sequential(*layers)
+        self.avgpool = nn.AdaptiveAvgPool2d(1)
         if dropout > 0.0:
             self.classifier = nn.Sequential(
                 nn.Dropout(inplace=True, p=0.2),
@@ -125,10 +125,12 @@ def __init__(self, num_classes, alpha, dropout=0.2):
         self._initialize_weights()
 
     def features(self, x):
-        return self.layers.forward(x).squeeze()
+        return self.layers.forward(x)
 
     def forward(self, x):
-        return self.classifier(self.features(x))
+        x = self.features(x)
+        x = self.avgpool(x).squeeze()
+        return self.classifier(x)
 
     def _initialize_weights(self):
         for m in self.modules():

From 568bd5083c7552884ef608ed08ace16a4d21ef74 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Fri, 12 Apr 2019 18:31:33 -0700
Subject: [PATCH 05/36] Fix python3-only stuff, replace subclasses with
 functions

---
 torchvision/models/mnasnet.py | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 413b71c26c3..364c8d363af 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -17,7 +17,7 @@ class _InvertedResidual(nn.Module):
 
     def __init__(self, in_ch, out_ch, kernel_size, stride, expansion_factor,
                  bn_momentum=0.1):
-        super().__init__()
+        super(_InvertedResidual, self).__init__()
         assert stride in [1, 2]
         assert kernel_size in [3, 5]
         mid_ch = in_ch * expansion_factor
@@ -86,7 +86,7 @@ class MNASNet(torch.nn.Module):
     """
 
     def __init__(self, num_classes, alpha, dropout=0.2):
-        super().__init__()
+        super(MNASNet, self).__init__()
         self.alpha = alpha
         self.num_classes = num_classes
         depths = _scale_depths([24, 40, 80, 96, 192, 320], alpha)
@@ -148,29 +148,21 @@ def _initialize_weights(self):
                 m.bias.data.zero_()
 
 
-class MNASNet0_5(MNASNet):
+def mnasnet0_5(num_classes):
     """ MNASNet with depth multiplier of 0.5. """
+    return MNASNet(num_classes, alpha=0.5)
 
-    def __init__(self, num_classes):
-        super().__init__(num_classes, 0.5)
 
-
-class MNASNet0_75(MNASNet):
+def mnasnet0_75(num_classes):
     """ MNASNet with depth multiplier of 0.75. """
-
-    def __init__(self, num_classes):
-        super().__init__(num_classes, 0.75)
+    return MNASNet(num_classes, alpha=0.75)
 
 
-class MNASNet1_0(MNASNet):
+def mnasnet1_0(num_classes):
     """ MNASNet with depth multiplier of 1.0. """
+    return MNASNet(num_classes, alpha=1.0)
 
-    def __init__(self, num_classes):
-        super().__init__(num_classes, 1.0)
 
-
-class MNASNet1_3(MNASNet):
+def mnasnet1_3(num_classes):
     """ MNASNet with depth multiplier of 1.3. """
-
-    def __init__(self, num_classes):
-        super().__init__(num_classes, 1.3)
+    return MNASNet(num_classes, alpha=1.3)

From 5617b8e38fae97166d001af4d19f4b20b5e954a3 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Fri, 12 Apr 2019 18:34:22 -0700
Subject: [PATCH 06/36] fix __all__

---
 torchvision/models/mnasnet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 364c8d363af..695d1830f76 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -4,7 +4,7 @@
 import torch.nn as nn
 
 
-__all__ = ['MNASNet', 'MNASNet0_5', 'MNASNet0_75', 'MNASNet1_0', 'MNASNet1_3']
+__all__ = ['MNASNet', 'mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mnasnet1_3']
 
 # Paper suggests 0.9997 momentum, for TensFlow. Equivalent PyTorch momentum is
 # 1.0 - tensorflow.

From ba0ad4d80512bd357fccc4a7fe3aa8fd87ac4aa2 Mon Sep 17 00:00:00 2001
From: Dmitry Belenko <38598618+1e100@users.noreply.github.com>
Date: Sat, 13 Apr 2019 01:47:06 -0700
Subject: [PATCH 07/36] Fix typo

---
 torchvision/models/mnasnet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 695d1830f76..f235d12989b 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -6,7 +6,7 @@
 
 __all__ = ['MNASNet', 'mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mnasnet1_3']
 
-# Paper suggests 0.9997 momentum, for TensFlow. Equivalent PyTorch momentum is
+# Paper suggests 0.9997 momentum, for TensorFlow. Equivalent PyTorch momentum is
 # 1.0 - tensorflow.
 _BN_MOMENTUM = 1 - 0.9997
 

From bd4836b6746331115388bcb07415e7d3d9dfc684 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Sat, 13 Apr 2019 19:03:10 -0700
Subject: [PATCH 08/36] Remove conditional dropout

---
 torchvision/models/mnasnet.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 695d1830f76..cd3538032dd 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -115,12 +115,9 @@ def __init__(self, num_classes, alpha, dropout=0.2):
         ]
         self.layers = nn.Sequential(*layers)
         self.avgpool = nn.AdaptiveAvgPool2d(1)
-        if dropout > 0.0:
-            self.classifier = nn.Sequential(
-                nn.Dropout(inplace=True, p=0.2),
-                nn.Linear(1280, self.num_classes))
-        else:
-            self.classifier = nn.Linear(1280, self.num_classes)
+        self.classifier = nn.Sequential(
+            nn.Dropout(inplace=True, p=dropout),
+            nn.Linear(1280, self.num_classes))
 
         self._initialize_weights()
 

From 102ba553b7b075675bdbc983b0e5ba28e73186fe Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Sun, 14 Apr 2019 20:48:10 -0700
Subject: [PATCH 09/36] Make dropout functional

---
 torchvision/models/mnasnet.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 85c2808d768..41855582311 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -3,7 +3,6 @@
 import torch
 import torch.nn as nn
 
-
 __all__ = ['MNASNet', 'mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mnasnet1_3']
 
 # Paper suggests 0.9997 momentum, for TensorFlow. Equivalent PyTorch momentum is
@@ -89,6 +88,7 @@ def __init__(self, num_classes, alpha, dropout=0.2):
         super(MNASNet, self).__init__()
         self.alpha = alpha
         self.num_classes = num_classes
+        self.dropout = dropout
         depths = _scale_depths([24, 40, 80, 96, 192, 320], alpha)
         layers = [
             # First layer: regular conv.
@@ -115,9 +115,7 @@ def __init__(self, num_classes, alpha, dropout=0.2):
         ]
         self.layers = nn.Sequential(*layers)
         self.avgpool = nn.AdaptiveAvgPool2d(1)
-        self.classifier = nn.Sequential(
-            nn.Dropout(inplace=True, p=dropout),
-            nn.Linear(1280, self.num_classes))
+        self.classifier = nn.Linear(1280, self.num_classes)
 
         self._initialize_weights()
 
@@ -127,6 +125,9 @@ def features(self, x):
     def forward(self, x):
         x = self.features(x)
         x = self.avgpool(x).squeeze()
+        if self.dropout > 0.0:
+            x = nn.functional.dropout(x, p=self.dropout, training=self.training,
+                                      inplace=True)
         return self.classifier(x)
 
     def _initialize_weights(self):

From 9c8b827b37e7c18f4a44ba9740fd771bd6a6d122 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Mon, 15 Apr 2019 23:00:19 -0700
Subject: [PATCH 10/36] Addressing @fmassa's feedback, round 1

---
 torchvision/models/mnasnet.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 41855582311..c37c73a67e6 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -11,8 +11,6 @@
 
 
 class _InvertedResidual(nn.Module):
-    """ Inverted residual block from MobileNetV2 and MNASNet papers. This can
-    be used to implement MobileNet V2, if ReLU is replaced with ReLU6. """
 
     def __init__(self, in_ch, out_ch, kernel_size, stride, expansion_factor,
                  bn_momentum=0.1):
@@ -37,15 +35,14 @@ def __init__(self, in_ch, out_ch, kernel_size, stride, expansion_factor,
 
     def forward(self, input):
         if self.apply_residual:
-            return self.layers.forward(input) + input
+            return self.layers(input) + input
         else:
-            return self.layers.forward(input)
+            return self.layers(input)
 
 
 def _stack(in_ch, out_ch, kernel_size, stride, exp_factor, repeats,
            bn_momentum):
-    """ Creates a stack of inverted residuals as seen in e.g. MobileNetV2 or
-    MNASNet. """
+    """ Creates a stack of inverted residuals. """
     assert repeats >= 1
     # First one has no skip, because feature map size changes.
     first = _InvertedResidual(in_ch, out_ch, kernel_size, stride, exp_factor,

From 2872b1fa9ee6c846a21c4b6ec0c71649036f51e6 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Mon, 15 Apr 2019 23:07:49 -0700
Subject: [PATCH 11/36] Replaced adaptive avgpool with mean on H and W to
 prevent collapsing the batch dimension

---
 torchvision/models/mnasnet.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index c37c73a67e6..42591e77868 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -111,17 +111,17 @@ def __init__(self, num_classes, alpha, dropout=0.2):
             nn.ReLU(inplace=True),
         ]
         self.layers = nn.Sequential(*layers)
-        self.avgpool = nn.AdaptiveAvgPool2d(1)
         self.classifier = nn.Linear(1280, self.num_classes)
 
         self._initialize_weights()
 
     def features(self, x):
-        return self.layers.forward(x)
+        return self.layers(x)
 
     def forward(self, x):
         x = self.features(x)
-        x = self.avgpool(x).squeeze()
+        # Equivalent to global avgpool and removing H and W dimensions.
+        x = x.mean([2, 3])
         if self.dropout > 0.0:
             x = nn.functional.dropout(x, p=self.dropout, training=self.training,
                                       inplace=True)

From 05b387b8bda3e686485890a8a55f8bf5e3484439 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Fri, 3 May 2019 03:40:52 -0700
Subject: [PATCH 12/36] Partially address feedback

---
 torchvision/models/mnasnet.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 42591e77868..7e41a505750 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -74,7 +74,7 @@ class MNASNet(torch.nn.Module):
     """ MNASNet, as described in https://arxiv.org/pdf/1807.11626.pdf.
     >>> model = MNASNet(1000, 1.0)
     >>> x = torch.rand(1, 3, 224, 224)
-    >>> y = model.forward(x)
+    >>> y = model(x)
     >>> y.dim()
     1
     >>> y.nelement()
@@ -111,20 +111,14 @@ def __init__(self, num_classes, alpha, dropout=0.2):
             nn.ReLU(inplace=True),
         ]
         self.layers = nn.Sequential(*layers)
-        self.classifier = nn.Linear(1280, self.num_classes)
-
+        self.classifier = nn.Sequential(nn.Dropout(p=self.dropout, inplace=True),
+                                        nn.Linear(1280, self.num_classes))
         self._initialize_weights()
 
-    def features(self, x):
-        return self.layers(x)
-
     def forward(self, x):
-        x = self.features(x)
+        x = self.layers(x)
         # Equivalent to global avgpool and removing H and W dimensions.
         x = x.mean([2, 3])
-        if self.dropout > 0.0:
-            x = nn.functional.dropout(x, p=self.dropout, training=self.training,
-                                      inplace=True)
         return self.classifier(x)
 
     def _initialize_weights(self):

From 2d397976c000548dca28d8295d63b4ee12ded32c Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Fri, 3 May 2019 03:41:54 -0700
Subject: [PATCH 13/36] YAPF

---
 torchvision/models/mnasnet.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 7e41a505750..9405dd844ca 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -111,8 +111,9 @@ def __init__(self, num_classes, alpha, dropout=0.2):
             nn.ReLU(inplace=True),
         ]
         self.layers = nn.Sequential(*layers)
-        self.classifier = nn.Sequential(nn.Dropout(p=self.dropout, inplace=True),
-                                        nn.Linear(1280, self.num_classes))
+        self.classifier = nn.Sequential(
+            nn.Dropout(p=self.dropout, inplace=True),
+            nn.Linear(1280, self.num_classes))
         self._initialize_weights()
 
     def forward(self, x):

From 8b5f7b91d31ff9e5ab3200d5064387cb5a2094c4 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Fri, 3 May 2019 03:46:33 -0700
Subject: [PATCH 14/36] Removed redundant class vars

---
 torchvision/models/mnasnet.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 9405dd844ca..bdb7f000317 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -83,9 +83,6 @@ class MNASNet(torch.nn.Module):
 
     def __init__(self, num_classes, alpha, dropout=0.2):
         super(MNASNet, self).__init__()
-        self.alpha = alpha
-        self.num_classes = num_classes
-        self.dropout = dropout
         depths = _scale_depths([24, 40, 80, 96, 192, 320], alpha)
         layers = [
             # First layer: regular conv.
@@ -112,8 +109,8 @@ def __init__(self, num_classes, alpha, dropout=0.2):
         ]
         self.layers = nn.Sequential(*layers)
         self.classifier = nn.Sequential(
-            nn.Dropout(p=self.dropout, inplace=True),
-            nn.Linear(1280, self.num_classes))
+            nn.Dropout(p=dropout, inplace=True),
+            nn.Linear(1280, num_classes))
         self._initialize_weights()
 
     def forward(self, x):

From 40471ac6f090a929eb83136a95a62108bf40bad7 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Mon, 6 May 2019 02:40:09 -0700
Subject: [PATCH 15/36] Update urls to releases

---
 torchvision/models/mnasnet.py | 53 +++++++++++++++++++++++++++--------
 1 file changed, 41 insertions(+), 12 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index bdb7f000317..ee107d2518e 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -2,9 +2,19 @@
 
 import torch
 import torch.nn as nn
+from .utils import load_state_dict_from_url
 
 __all__ = ['MNASNet', 'mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mnasnet1_3']
 
+_MODEL_URLS = {
+    "mnasnet0_5":
+    "https://github.com/1e100/mnasnet_trainer/releases/download/v0.1/mnasnet0.5_top1_67.592-7c6cb539b9.pth",
+    "mnasnet0_75": None,
+    "mnasnet1_0":
+    "https://github.com/1e100/mnasnet_trainer/releases/download/v0.1/mnasnet1.0_top1_73.512-f206786ef8.pth",
+    "mnasnet1_3": None
+}
+
 # Paper suggests 0.9997 momentum, for TensorFlow. Equivalent PyTorch momentum is
 # 1.0 - tensorflow.
 _BN_MOMENTUM = 1 - 0.9997
@@ -81,7 +91,7 @@ class MNASNet(torch.nn.Module):
     1000
     """
 
-    def __init__(self, num_classes, alpha, dropout=0.2):
+    def __init__(self, alpha, num_classes=1000, dropout=0.2):
         super(MNASNet, self).__init__()
         depths = _scale_depths([24, 40, 80, 96, 192, 320], alpha)
         layers = [
@@ -108,9 +118,8 @@ def __init__(self, num_classes, alpha, dropout=0.2):
             nn.ReLU(inplace=True),
         ]
         self.layers = nn.Sequential(*layers)
-        self.classifier = nn.Sequential(
-            nn.Dropout(p=dropout, inplace=True),
-            nn.Linear(1280, num_classes))
+        self.classifier = nn.Sequential(nn.Dropout(p=dropout, inplace=True),
+                                        nn.Linear(1280, num_classes))
         self._initialize_weights()
 
     def forward(self, x):
@@ -135,21 +144,41 @@ def _initialize_weights(self):
                 m.bias.data.zero_()
 
 
-def mnasnet0_5(num_classes):
+def _load_pretrained(model_name, model):
+    if model_name not in _MODEL_URLS or _MODEL_URLS[model_name] is None:
+        raise ValueError(
+            "No checkpoint is available for model type {}".format(model_name))
+    checkpoint_url = _MODEL_URLS[model_name]
+    model.load_state_dict(torch.utils.model_zoo.load_url(checkpoint_url))
+
+
+def mnasnet0_5(pretrained=False, **kwargs):
     """ MNASNet with depth multiplier of 0.5. """
-    return MNASNet(num_classes, alpha=0.5)
+    model = MNASNet(0.5, **kwargs)
+    if pretrained:
+        _load_pretrained("mnasnet0_5", model)
+    return model
 
 
-def mnasnet0_75(num_classes):
+def mnasnet0_75(pretrained=False, **kwargs):
     """ MNASNet with depth multiplier of 0.75. """
-    return MNASNet(num_classes, alpha=0.75)
+    model = MNASNet(0.75, **kwargs)
+    if pretrained:
+        _load_pretrained("mnasnet0_75", model)
+    return model
 
 
-def mnasnet1_0(num_classes):
+def mnasnet1_0(pretrained=False, **kwargs):
     """ MNASNet with depth multiplier of 1.0. """
-    return MNASNet(num_classes, alpha=1.0)
+    model = MNASNet(1.0, **kwargs)
+    if pretrained:
+        _load_pretrained("mnasnet1_0", model)
+    return model
 
 
-def mnasnet1_3(num_classes):
+def mnasnet1_3(pretrained=False, **kwargs):
     """ MNASNet with depth multiplier of 1.3. """
-    return MNASNet(num_classes, alpha=1.3)
+    model = MNASNet(1.3, **kwargs)
+    if pretrained:
+        _load_pretrained("mnasnet1_3", model)
+    return model

From b1d54ec62b7f5c355f8316e0b2f5a0c63de5812d Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Mon, 6 May 2019 02:51:54 -0700
Subject: [PATCH 16/36] Add information to models.rst

---
 docs/source/models.rst | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/docs/source/models.rst b/docs/source/models.rst
index 66bb60e2004..216c0f9b79b 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -12,6 +12,7 @@ architectures:
 -  `Inception`_ v3
 -  `GoogLeNet`_
 -  `ShuffleNet`_ v2
+-  `MNASNet`_
 
 You can construct a model with random weights by calling its constructor:
 
@@ -26,6 +27,7 @@ You can construct a model with random weights by calling its constructor:
     inception = models.inception_v3()
     googlenet = models.googlenet()
     shufflenet = models.shufflenetv2()
+    mnasnet = models.mnasnet1_0()
 
 We provide pre-trained models, using the PyTorch :mod:`torch.utils.model_zoo`.
 These can be constructed by passing ``pretrained=True``:
@@ -41,6 +43,7 @@ These can be constructed by passing ``pretrained=True``:
     inception = models.inception_v3(pretrained=True)
     googlenet = models.googlenet(pretrained=True)
     shufflenet = models.shufflenetv2(pretrained=True)
+    mnasnet = models.mnasnet1_0(pretrained=True)
 
 Instancing a pre-trained model will download its weights to a cache directory.
 This directory can be set using the `TORCH_MODEL_ZOO` environment variable. See
@@ -92,6 +95,7 @@ Densenet-161                      22.35           6.20
 Inception v3                      22.55           6.44
 GoogleNet                         30.22           10.47
 ShuffleNet V2                     30.64           11.68
+MNASNet 1.0                       26.49           8.456
 ================================  =============   =============
 
 
@@ -103,6 +107,7 @@ ShuffleNet V2                     30.64           11.68
 .. _Inception: https://arxiv.org/abs/1512.00567
 .. _GoogLeNet: https://arxiv.org/abs/1409.4842
 .. _ShuffleNet: https://arxiv.org/abs/1807.11164
+.. _MNASNet: https://arxiv.org/abs/1807.11626
 
 .. currentmodule:: torchvision.models
 
@@ -162,3 +167,10 @@ ShuffleNet v2
 
 .. autofunction:: shufflenet
 
+MNASNet
+-------------
+
+.. autofunction:: mnasnet0_5
+.. autofunction:: mnasnet0_75
+.. autofunction:: mnasnet1_0
+.. autofunction:: mnasnet1_3

From ec717d03042fbf6e84273dfc70f1855efb19a0be Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Sat, 11 May 2019 16:57:50 -0700
Subject: [PATCH 17/36] Replace init with kaiming_normal_ in fan-out mode

---
 torchvision/models/mnasnet.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index ee107d2518e..26fc68879c8 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -131,17 +131,16 @@ def forward(self, x):
     def _initialize_weights(self):
         for m in self.modules():
             if isinstance(m, nn.Conv2d):
-                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
-                m.weight.data.normal_(0, math.sqrt(2.0 / n))
+                nn.init.kaiming_normal_(m.weight, mode="fan_out",
+                                        nonlinearity="relu")
                 if m.bias is not None:
-                    m.bias.data.zero_()
+                    nn.init.zeros_(m.bias)
             elif isinstance(m, nn.BatchNorm2d):
-                m.weight.data.fill_(1.0)
-                m.bias.data.zero_()
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
             elif isinstance(m, nn.Linear):
-                n = m.weight.size(1)
-                m.weight.data.normal_(0, 0.01)
-                m.bias.data.zero_()
+                nn.init.normal_(m.weight, 0.01)
+                nn.init.zeros_(m.bias)
 
 
 def _load_pretrained(model_name, model):

From 8b2dba9d62e25fe0d8c5db9558b348472dc0432c Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Sat, 11 May 2019 17:13:11 -0700
Subject: [PATCH 18/36] Use load_state_dict_from_url

---
 torchvision/models/mnasnet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 26fc68879c8..5deb87c2ad1 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -148,7 +148,7 @@ def _load_pretrained(model_name, model):
         raise ValueError(
             "No checkpoint is available for model type {}".format(model_name))
     checkpoint_url = _MODEL_URLS[model_name]
-    model.load_state_dict(torch.utils.model_zoo.load_url(checkpoint_url))
+    model.load_state_dict(load_state_dict_from_url(checkpoint_url))
 
 
 def mnasnet0_5(pretrained=False, **kwargs):

From 7be747842e8807badef063d7a82edb9caec90fdf Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Wed, 26 Jun 2019 23:03:08 -0700
Subject: [PATCH 19/36] Fix depth scaling on first 2 layers

---
 torchvision/models/mnasnet.py | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index ba815206be2..a52a06587aa 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -93,27 +93,28 @@ class MNASNet(torch.nn.Module):
 
     def __init__(self, alpha, num_classes=1000, dropout=0.2):
         super(MNASNet, self).__init__()
-        depths = _scale_depths([24, 40, 80, 96, 192, 320], alpha)
+        depths = _scale_depths([32, 16, 24, 40, 80, 96, 192, 320], alpha)
         layers = [
             # First layer: regular conv.
-            nn.Conv2d(3, 32, 3, padding=1, stride=2, bias=False),
-            nn.BatchNorm2d(32, momentum=_BN_MOMENTUM),
+            nn.Conv2d(3, depths[0], 3, padding=1, stride=2, bias=False),
+            nn.BatchNorm2d(depths[0], momentum=_BN_MOMENTUM),
             nn.ReLU(inplace=True),
             # Depthwise separable, no skip.
-            nn.Conv2d(32, 32, 3, padding=1, stride=1, groups=32, bias=False),
-            nn.BatchNorm2d(32, momentum=_BN_MOMENTUM),
+            nn.Conv2d(depths[0], depths[0], 3, padding=1, stride=1,
+                      groups=depths[0], bias=False),
+            nn.BatchNorm2d(depths[0], momentum=_BN_MOMENTUM),
             nn.ReLU(inplace=True),
-            nn.Conv2d(32, 16, 1, padding=0, stride=1, bias=False),
-            nn.BatchNorm2d(16, momentum=_BN_MOMENTUM),
+            nn.Conv2d(depths[0], depths[1], 1, padding=0, stride=1, bias=False),
+            nn.BatchNorm2d(depths[1], momentum=_BN_MOMENTUM),
             # MNASNet blocks: stacks of inverted residuals.
-            _stack(16, depths[0], 3, 2, 3, 3, _BN_MOMENTUM),
-            _stack(depths[0], depths[1], 5, 2, 3, 3, _BN_MOMENTUM),
-            _stack(depths[1], depths[2], 5, 2, 6, 3, _BN_MOMENTUM),
-            _stack(depths[2], depths[3], 3, 1, 6, 2, _BN_MOMENTUM),
-            _stack(depths[3], depths[4], 5, 2, 6, 4, _BN_MOMENTUM),
-            _stack(depths[4], depths[5], 3, 1, 6, 1, _BN_MOMENTUM),
+            _stack(depths[1], depths[2], 3, 2, 3, 3, _BN_MOMENTUM),
+            _stack(depths[2], depths[3], 5, 2, 3, 3, _BN_MOMENTUM),
+            _stack(depths[3], depths[4], 5, 2, 6, 3, _BN_MOMENTUM),
+            _stack(depths[4], depths[5], 3, 1, 6, 2, _BN_MOMENTUM),
+            _stack(depths[5], depths[6], 5, 2, 6, 4, _BN_MOMENTUM),
+            _stack(depths[6], depths[7], 3, 1, 6, 1, _BN_MOMENTUM),
             # Final mapping to classifier input.
-            nn.Conv2d(depths[5], 1280, 1, padding=0, stride=1, bias=False),
+            nn.Conv2d(depths[7], 1280, 1, padding=0, stride=1, bias=False),
             nn.BatchNorm2d(1280, momentum=_BN_MOMENTUM),
             nn.ReLU(inplace=True),
         ]
@@ -148,7 +149,8 @@ def _load_pretrained(model_name, model, progress):
         raise ValueError(
             "No checkpoint is available for model type {}".format(model_name))
     checkpoint_url = _MODEL_URLS[model_name]
-    model.load_state_dict(load_state_dict_from_url(checkpoint_url, progress=progress))
+    model.load_state_dict(
+        load_state_dict_from_url(checkpoint_url, progress=progress))
 
 
 def mnasnet0_5(pretrained=False, progress=True, **kwargs):

From e996c3670f6300b2a1064e791f8c311a05bbea20 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Sat, 29 Jun 2019 22:53:00 -0700
Subject: [PATCH 20/36] Restore initialization

---
 torchvision/models/mnasnet.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index a52a06587aa..13fcfb9883b 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -129,6 +129,21 @@ def forward(self, x):
         x = x.mean([2, 3])
         return self.classifier(x)
 
+    def _initialize_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+                if m.bias is not None:
+                    m.bias.data.zero_()
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+            elif isinstance(m, nn.Linear):
+                m.weight.data.normal_(0, 0.01)
+                m.bias.data.zero_()
+
+"""
     def _initialize_weights(self):
         for m in self.modules():
             if isinstance(m, nn.Conv2d):
@@ -142,6 +157,7 @@ def _initialize_weights(self):
             elif isinstance(m, nn.Linear):
                 nn.init.normal_(m.weight, 0.01)
                 nn.init.zeros_(m.bias)
+"""
 
 
 def _load_pretrained(model_name, model, progress):

From 1fc9c76b137bc43086a3589a3f5f6d0ea45fb092 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Thu, 4 Jul 2019 15:45:58 -0700
Subject: [PATCH 21/36] Match reference implementation initialization for dense
 layer

---
 torchvision/models/mnasnet.py | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 13fcfb9883b..47a138fa21c 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -129,21 +129,6 @@ def forward(self, x):
         x = x.mean([2, 3])
         return self.classifier(x)
 
-    def _initialize_weights(self):
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
-                m.weight.data.normal_(0, math.sqrt(2. / n))
-                if m.bias is not None:
-                    m.bias.data.zero_()
-            elif isinstance(m, nn.BatchNorm2d):
-                m.weight.data.fill_(1)
-                m.bias.data.zero_()
-            elif isinstance(m, nn.Linear):
-                m.weight.data.normal_(0, 0.01)
-                m.bias.data.zero_()
-
-"""
     def _initialize_weights(self):
         for m in self.modules():
             if isinstance(m, nn.Conv2d):
@@ -155,9 +140,9 @@ def _initialize_weights(self):
                 nn.init.ones_(m.weight)
                 nn.init.zeros_(m.bias)
             elif isinstance(m, nn.Linear):
-                nn.init.normal_(m.weight, 0.01)
+                nn.init.xavier_uniform_(m.weight, mode="fan_out",
+                                        nonlinearity="sigmoid")
                 nn.init.zeros_(m.bias)
-"""
 
 
 def _load_pretrained(model_name, model, progress):

From e5164e3eb51769ea8a530d241ebbce5df1fbfa76 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Thu, 4 Jul 2019 16:10:51 -0700
Subject: [PATCH 22/36] Meant to use Kaiming

---
 torchvision/models/mnasnet.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 47a138fa21c..d2e81ac8eef 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -119,8 +119,8 @@ def __init__(self, alpha, num_classes=1000, dropout=0.2):
             nn.ReLU(inplace=True),
         ]
         self.layers = nn.Sequential(*layers)
-        self.classifier = nn.Sequential(nn.Dropout(p=dropout, inplace=True),
-                                        nn.Linear(1280, num_classes))
+        self.classifier = nn.Sequential(
+            nn.Dropout(p=dropout, inplace=True), nn.Linear(1280, num_classes))
         self._initialize_weights()
 
     def forward(self, x):
@@ -140,8 +140,8 @@ def _initialize_weights(self):
                 nn.init.ones_(m.weight)
                 nn.init.zeros_(m.bias)
             elif isinstance(m, nn.Linear):
-                nn.init.xavier_uniform_(m.weight, mode="fan_out",
-                                        nonlinearity="sigmoid")
+                nn.init.kaiming_uniform_(m.weight, mode="fan_out",
+                                         nonlinearity="sigmoid")
                 nn.init.zeros_(m.bias)
 
 

From f5c9a17194c575f96274331af5fa4762a10ae21d Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Fri, 12 Jul 2019 16:28:00 -0700
Subject: [PATCH 23/36] Remove spurious relu

---
 torchvision/models/mnasnet.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index d2e81ac8eef..1b102230f55 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -98,7 +98,6 @@ def __init__(self, alpha, num_classes=1000, dropout=0.2):
             # First layer: regular conv.
             nn.Conv2d(3, depths[0], 3, padding=1, stride=2, bias=False),
             nn.BatchNorm2d(depths[0], momentum=_BN_MOMENTUM),
-            nn.ReLU(inplace=True),
             # Depthwise separable, no skip.
             nn.Conv2d(depths[0], depths[0], 3, padding=1, stride=1,
                       groups=depths[0], bias=False),

From 1b7808e7308f28762c3faeec9ce213a6a4fc25f9 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Sat, 10 Aug 2019 03:03:21 -0700
Subject: [PATCH 24/36] Point to the newest 0.5 checkpoint

---
 torchvision/models/mnasnet.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 1b102230f55..0209cdac6cc 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -7,11 +7,9 @@
 __all__ = ['MNASNet', 'mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mnasnet1_3']
 
 _MODEL_URLS = {
-    "mnasnet0_5":
-    "https://download.pytorch.org/models/mnasnet0.5_top1_67.592-7c6cb539b9.pth",
+    "mnasnet0_5": "https://github.com/1e100/mnasnet_trainer/releases/download/v0.2/mnasnet0.5_top1_67.823-653d4e038a.pth",
     "mnasnet0_75": None,
-    "mnasnet1_0":
-    "https://download.pytorch.org/models/mnasnet1.0_top1_73.512-f206786ef8.pth",
+    "mnasnet1_0": None,
     "mnasnet1_3": None
 }
 
@@ -98,6 +96,7 @@ def __init__(self, alpha, num_classes=1000, dropout=0.2):
             # First layer: regular conv.
             nn.Conv2d(3, depths[0], 3, padding=1, stride=2, bias=False),
             nn.BatchNorm2d(depths[0], momentum=_BN_MOMENTUM),
+            nn.ReLU(inplace=True),
             # Depthwise separable, no skip.
             nn.Conv2d(depths[0], depths[0], 3, padding=1, stride=1,
                       groups=depths[0], bias=False),
@@ -118,8 +117,8 @@ def __init__(self, alpha, num_classes=1000, dropout=0.2):
             nn.ReLU(inplace=True),
         ]
         self.layers = nn.Sequential(*layers)
-        self.classifier = nn.Sequential(
-            nn.Dropout(p=dropout, inplace=True), nn.Linear(1280, num_classes))
+        self.classifier = nn.Sequential(nn.Dropout(p=dropout, inplace=True),
+                                        nn.Linear(1280, num_classes))
         self._initialize_weights()
 
     def forward(self, x):

From 96eb194dd188adc2a270a91d26784911d3bdc4c1 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Sat, 10 Aug 2019 03:23:00 -0700
Subject: [PATCH 25/36] Latest pretrained checkpoint

---
 torchvision/models/mnasnet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 0209cdac6cc..0bc1c78dfd4 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -7,7 +7,7 @@
 __all__ = ['MNASNet', 'mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mnasnet1_3']
 
 _MODEL_URLS = {
-    "mnasnet0_5": "https://github.com/1e100/mnasnet_trainer/releases/download/v0.2/mnasnet0.5_top1_67.823-653d4e038a.pth",
+    "mnasnet0_5": "https://github.com/1e100/mnasnet_trainer/releases/download/0.2/mnasnet0.5_top1_67.823-b7834e59f1.pth",
     "mnasnet0_75": None,
     "mnasnet1_0": None,
     "mnasnet1_3": None

From 0626a21d23267f4ecbe97966c027280a012d37c5 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Sat, 10 Aug 2019 03:47:24 -0700
Subject: [PATCH 26/36] Restore 1.0 checkpoint

---
 torchvision/models/mnasnet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 2534abe10a3..e17a0053fc2 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -9,7 +9,7 @@
 _MODEL_URLS = {
     "mnasnet0_5": "https://github.com/1e100/mnasnet_trainer/releases/download/0.2/mnasnet0.5_top1_67.823-b7834e59f1.pth",
     "mnasnet0_75": None,
-    "mnasnet1_0": None,
+    "mnasnet1_0": "https://download.pytorch.org/models/mnasnet1.0_top1_73.512-f206786ef8.pth",
     "mnasnet1_3": None
 }
 

From af9679d4f066fe1a99074b429ba3705884a60f54 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Sat, 10 Aug 2019 04:05:06 -0700
Subject: [PATCH 27/36] YAPF

---
 torchvision/models/mnasnet.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index e17a0053fc2..541b838ff79 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -7,9 +7,11 @@
 __all__ = ['MNASNet', 'mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mnasnet1_3']
 
 _MODEL_URLS = {
-    "mnasnet0_5": "https://github.com/1e100/mnasnet_trainer/releases/download/0.2/mnasnet0.5_top1_67.823-b7834e59f1.pth",
+    "mnasnet0_5":
+    "https://github.com/1e100/mnasnet_trainer/releases/download/0.2/mnasnet0.5_top1_67.823-b7834e59f1.pth",
     "mnasnet0_75": None,
-    "mnasnet1_0": "https://download.pytorch.org/models/mnasnet1.0_top1_73.512-f206786ef8.pth",
+    "mnasnet1_0":
+    "https://download.pytorch.org/models/mnasnet1.0_top1_73.512-f206786ef8.pth",
     "mnasnet1_3": None
 }
 

From c611d0d27ec9591fe09387059ef9b2034d803d57 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Sat, 7 Sep 2019 02:02:44 -0700
Subject: [PATCH 28/36] Implement backwards compat as suggested by Soumith

---
 torchvision/models/mnasnet.py | 53 +++++++++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 3 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 541b838ff79..b0b0b52ed69 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -2,6 +2,7 @@
 
 import torch
 import torch.nn as nn
+import warnings
 from .utils import load_state_dict_from_url
 
 __all__ = ['MNASNet', 'mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mnasnet1_3']
@@ -74,14 +75,16 @@ def _round_to_multiple_of(val, divisor, round_up_bias=0.9):
     return new_val if new_val >= round_up_bias * val else new_val + divisor
 
 
-def _scale_depths(depths, alpha):
+def _get_depths(alpha):
     """ Scales tensor depths as in reference MobileNet code, prefers rouding up
     rather than down. """
+    depths = [32, 16, 24, 40, 80, 96, 192, 320]
     return [_round_to_multiple_of(depth * alpha, 8) for depth in depths]
 
 
 class MNASNet(torch.nn.Module):
-    """ MNASNet, as described in https://arxiv.org/pdf/1807.11626.pdf.
+    """ MNASNet, as described in https://arxiv.org/pdf/1807.11626.pdf. This
+    implements the B1 variant of the model.
     >>> model = MNASNet(1000, 1.0)
     >>> x = torch.rand(1, 3, 224, 224)
     >>> y = model(x)
@@ -90,10 +93,14 @@ class MNASNet(torch.nn.Module):
     >>> y.nelement()
     1000
     """
+    # Version 2 adds depth scaling in the initial stages of the network.
+    _version = 2
 
     def __init__(self, alpha, num_classes=1000, dropout=0.2):
         super(MNASNet, self).__init__()
-        depths = _scale_depths([32, 16, 24, 40, 80, 96, 192, 320], alpha)
+        assert alpha > 0.0
+        self.alpha = alpha
+        depths = _get_depths(alpha)
         layers = [
             # First layer: regular conv.
             nn.Conv2d(3, depths[0], 3, padding=1, stride=2, bias=False),
@@ -144,6 +151,46 @@ def _initialize_weights(self):
                                          nonlinearity="sigmoid")
                 nn.init.zeros_(m.bias)
 
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        version = local_metadata.get("version", None)
+
+        assert version in [1, 2]
+        if version != MNASNet._version:
+            warnings.warn(
+                "A new version of MNASNet model has been implemented. "
+                "Your checkpoint was saved using the previous version. "
+                "This checkpoint will load and work as before, but "
+                "you may want to upgrade by training a newer model or "
+                "transfer learning from an updated ImageNet checkpoint.",
+                UserWarning)
+
+        if version == 1 and not self.alpha == 1.0:
+            # In the initial version of the model (v1), stem was fixed-size.
+            # All other layer configurations were the same. After these
+            # changes, the model is identical to v1. Model with alpha 1.0 is
+            # unaffected.
+            depths = _get_depths(self.alpha)
+            v1_stem = [
+                nn.Conv2d(3, 32, 3, padding=1, stride=2, bias=False),
+                nn.BatchNorm2d(32, momentum=_BN_MOMENTUM),
+                nn.ReLU(inplace=True),
+                nn.Conv2d(32, 32, 3, padding=1, stride=1, groups=32,
+                          bias=False),
+                nn.BatchNorm2d(32, momentum=_BN_MOMENTUM),
+                nn.ReLU(inplace=True),
+                nn.Conv2d(32, 16, 1, padding=0, stride=1, bias=False),
+                nn.BatchNorm2d(16, momentum=_BN_MOMENTUM),
+                _stack(16, depths[2], 3, 2, 3, 3, _BN_MOMENTUM),
+            ]
+            for idx, layer in enumerate(v1_stem):
+                self.layers[idx] = layer
+            del MNASNet._version
+
+        super(MNASNet, self)._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys,
+            unexpected_keys, error_msgs)
+
 
 def _load_pretrained(model_name, model, progress):
     if model_name not in _MODEL_URLS or _MODEL_URLS[model_name] is None:

From ed89aac5156213ed34dc096805b4beec32946834 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Sat, 7 Sep 2019 02:34:27 -0700
Subject: [PATCH 29/36] Update checkpoint URL

---
 torchvision/models/mnasnet.py | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index b0b0b52ed69..ebc771ebd04 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -9,7 +9,7 @@
 
 _MODEL_URLS = {
     "mnasnet0_5":
-    "https://github.com/1e100/mnasnet_trainer/releases/download/0.2/mnasnet0.5_top1_67.823-b7834e59f1.pth",
+    "https://github.com/1e100/mnasnet_trainer/releases/download/0.3/mnasnet0.5_top1_67.823-3ffadce67e.pth",
     "mnasnet0_75": None,
     "mnasnet1_0":
     "https://download.pytorch.org/models/mnasnet1.0_top1_73.512-f206786ef8.pth",
@@ -154,16 +154,7 @@ def _initialize_weights(self):
     def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                               missing_keys, unexpected_keys, error_msgs):
         version = local_metadata.get("version", None)
-
         assert version in [1, 2]
-        if version != MNASNet._version:
-            warnings.warn(
-                "A new version of MNASNet model has been implemented. "
-                "Your checkpoint was saved using the previous version. "
-                "This checkpoint will load and work as before, but "
-                "you may want to upgrade by training a newer model or "
-                "transfer learning from an updated ImageNet checkpoint.",
-                UserWarning)
 
         if version == 1 and not self.alpha == 1.0:
             # In the initial version of the model (v1), stem was fixed-size.
@@ -185,7 +176,14 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
             ]
             for idx, layer in enumerate(v1_stem):
                 self.layers[idx] = layer
-            del MNASNet._version
+
+            warnings.warn(
+                "A new version of MNASNet model has been implemented. "
+                "Your checkpoint was saved using the previous version. "
+                "This checkpoint will load and work as before, but "
+                "you may want to upgrade by training a newer model or "
+                "transfer learning from an updated ImageNet checkpoint.",
+                UserWarning)
 
         super(MNASNet, self)._load_from_state_dict(
             state_dict, prefix, local_metadata, strict, missing_keys,

From 36fa9fa722ec47f8cd2bb52a62e72d9eb314d79c Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Sat, 7 Sep 2019 02:35:06 -0700
Subject: [PATCH 30/36] Move warnings up

---
 torchvision/models/mnasnet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index ebc771ebd04..d5360b1611e 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -1,8 +1,8 @@
 import math
+import warnings
 
 import torch
 import torch.nn as nn
-import warnings
 from .utils import load_state_dict_from_url
 
 __all__ = ['MNASNet', 'mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mnasnet1_3']

From 3ceed682808273f3c9d3656c534d636af290218e Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Sat, 7 Sep 2019 02:37:47 -0700
Subject: [PATCH 31/36] Record a couple more function parameters

---
 torchvision/models/mnasnet.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index d5360b1611e..f8ff453c2f0 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -100,6 +100,8 @@ def __init__(self, alpha, num_classes=1000, dropout=0.2):
         super(MNASNet, self).__init__()
         assert alpha > 0.0
         self.alpha = alpha
+        self.variant = "b1"
+        self.num_classes = num_classes
         depths = _get_depths(alpha)
         layers = [
             # First layer: regular conv.

From b9e60c2b890ffd0d86cb1b6c18cf6cec662da91d Mon Sep 17 00:00:00 2001
From: Dmitry Belenko <38598618+1e100@users.noreply.github.com>
Date: Sat, 7 Sep 2019 02:39:21 -0700
Subject: [PATCH 32/36] Update comment

---
 torchvision/models/mnasnet.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index f8ff453c2f0..bc611ff2fab 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -160,8 +160,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
 
         if version == 1 and not self.alpha == 1.0:
             # In the initial version of the model (v1), stem was fixed-size.
-            # All other layer configurations were the same. After these
-            # changes, the model is identical to v1. Model with alpha 1.0 is
+            # All other layer configurations were the same. This will patch
+            # the model so that it's identical to v1. Model with alpha 1.0 is
             # unaffected.
             depths = _get_depths(self.alpha)
             v1_stem = [

From 2c8ccbcae36555c39554231f1aaf1348afe21855 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Sat, 7 Sep 2019 02:52:18 -0700
Subject: [PATCH 33/36] Set the correct version such that if the BC-patched
 model is saved, it could be reloaded with BC patching again

---
 torchvision/models/mnasnet.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index f8ff453c2f0..e62b9666197 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -179,6 +179,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
             for idx, layer in enumerate(v1_stem):
                 self.layers[idx] = layer
 
+            # The model is now identical to v1, and must be saved as such.
+            MNASNet._version = 1
             warnings.warn(
                 "A new version of MNASNet model has been implemented. "
                 "Your checkpoint was saved using the previous version. "

From 061dadeecbfb66a317d78805108e471e02575a81 Mon Sep 17 00:00:00 2001
From: 1e100 <38598618+1e100@users.noreply.github.com>
Date: Fri, 13 Sep 2019 21:03:20 -0700
Subject: [PATCH 34/36] Set a member var, not class var

---
 torchvision/models/mnasnet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index ed3aa939e08..d4f36b86803 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -180,7 +180,7 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                 self.layers[idx] = layer
 
             # The model is now identical to v1, and must be saved as such.
-            MNASNet._version = 1
+            self._version = 1
             warnings.warn(
                 "A new version of MNASNet model has been implemented. "
                 "Your checkpoint was saved using the previous version. "

From d0a43c4a1d418297755b920c0b5a12119bc9b9aa Mon Sep 17 00:00:00 2001
From: Dmitry Belenko <38598618+1e100@users.noreply.github.com>
Date: Thu, 19 Sep 2019 17:48:36 -0700
Subject: [PATCH 35/36] Update mnasnet.py

Remove unused member var as per review.
---
 torchvision/models/mnasnet.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index d4f36b86803..47f45c358dc 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -100,7 +100,6 @@ def __init__(self, alpha, num_classes=1000, dropout=0.2):
         super(MNASNet, self).__init__()
         assert alpha > 0.0
         self.alpha = alpha
-        self.variant = "b1"
         self.num_classes = num_classes
         depths = _get_depths(alpha)
         layers = [

From 00ddb9d112133edb2f8caa71a2602d4723f07ca7 Mon Sep 17 00:00:00 2001
From: Dmitry Belenko <38598618+1e100@users.noreply.github.com>
Date: Fri, 20 Sep 2019 14:54:13 -0700
Subject: [PATCH 36/36] Update the path to weights

---
 torchvision/models/mnasnet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/models/mnasnet.py b/torchvision/models/mnasnet.py
index 47f45c358dc..59677427f1e 100644
--- a/torchvision/models/mnasnet.py
+++ b/torchvision/models/mnasnet.py
@@ -9,7 +9,7 @@
 
 _MODEL_URLS = {
     "mnasnet0_5":
-    "https://github.com/1e100/mnasnet_trainer/releases/download/0.3/mnasnet0.5_top1_67.823-3ffadce67e.pth",
+    "https://download.pytorch.org/models/mnasnet0.5_top1_67.823-3ffadce67e.pth",
     "mnasnet0_75": None,
     "mnasnet1_0":
     "https://download.pytorch.org/models/mnasnet1.0_top1_73.512-f206786ef8.pth",