Fix default momentum value of BatchNorm2d in MaxViT from 0.99 to 0.01 (#8312)

NicolasHug · web-flow · commit 6d64cb3a9adc · 2024-03-13T13:39:35.000Z
diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py
@@ -577,7 +577,7 @@ class MaxVit(nn.Module):
         stochastic_depth_prob (float): Probability of stochastic depth. Expands to a list of probabilities for each layer that scales linearly to the specified value.
         squeeze_ratio (float): Squeeze ratio in the SE Layer. Default: 0.25.
         expansion_ratio (float): Expansion ratio in the MBConv bottleneck. Default: 4.
-        norm_layer (Callable[..., nn.Module]): Normalization function. Default: None (setting to None will produce a `BatchNorm2d(eps=1e-3, momentum=0.99)`).
+        norm_layer (Callable[..., nn.Module]): Normalization function. Default: None (setting to None will produce a `BatchNorm2d(eps=1e-3, momentum=0.01)`).
         activation_layer (Callable[..., nn.Module]): Activation function Default: nn.GELU.
         head_dim (int): Dimension of the attention heads.
         mlp_ratio (int): Expansion ratio of the MLP layer. Default: 4.
@@ -623,7 +623,7 @@ def __init__(
         # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L1029-L1030
         # for the exact parameters used in batchnorm
         if norm_layer is None:
-            norm_layer = partial(nn.BatchNorm2d, eps=1e-3, momentum=0.99)
+            norm_layer = partial(nn.BatchNorm2d, eps=1e-3, momentum=0.01)
 
         # Make sure input size will be divisible by the partition size in all blocks
         # Undefined behavior if H or W are not divisible by p
@@ -788,7 +788,8 @@ class MaxVit_T_Weights(WeightsEnum):
             },
             "_ops": 5.558,
             "_file_size": 118.769,
-            "_docs": """These weights reproduce closely the results of the paper using a similar training recipe.""",
+            "_docs": """These weights reproduce closely the results of the paper using a similar training recipe.
+            They were trained with a BatchNorm2d momentum of 0.99 instead of the more correct 0.01.""",
         },
     )
     DEFAULT = IMAGENET1K_V1