"""EfficientNetV2 model definitions (``torchvision/models/efficientnetv2.py``)."""
import copy
import math
from functools import partial
from typing import Any, Callable, Optional, List, Sequence

import torch
from torch import nn, Tensor

from ._utils import _make_divisible
from .._internally_replaced_utils import load_state_dict_from_url
from ..ops import StochasticDepth
from .efficientnet import MBConv, ConvNormActivation


__all__ = [
    "EfficientNetV2",
    "efficientnet_v2_s",  # 384
    "efficientnet_v2_m",  # 480
    "efficientnet_v2_l",  # 480
]


# Checkpoint URL per architecture name. The entries are still empty
# placeholders; ``_efficientnet_v2`` treats an empty URL as "no checkpoint".
model_urls = {
    # Weights ported from https://github.com/rwightman/pytorch-image-models/
    "efficientnet_v2_s": "",
    "efficientnet_v2_m": "",
    "efficientnet_v2_l": "",
}


class MBConvConfig:
    """Configuration of one EfficientNetV2 stage.

    A stage is ``num_layers`` repetitions of the same block type. The values
    presumably mirror the per-stage architecture table of the EfficientNetV2
    paper (TODO confirm the exact table number).

    Args:
        block_type (str): ``'MB'`` selects ``MBConv``; ``EfficientNetV2``
            maps every other value to ``FusedMBConv``.
        expand_ratio (float): channel expansion factor of the block.
        kernel (int): kernel size of the spatial convolution.
        stride (int): stride of the first block of the stage.
        input_channels (int): channels entering the stage.
        out_channels (int): channels leaving the stage.
        num_layers (int): number of blocks in the stage.
    """

    def __init__(
        self,
        block_type: str,
        expand_ratio: float,
        kernel: int,
        stride: int,
        input_channels: int,
        out_channels: int,
        num_layers: int,
    ) -> None:
        self.block_type = block_type
        self.expand_ratio = expand_ratio
        self.kernel = kernel
        self.stride = stride
        self.input_channels = input_channels
        self.out_channels = out_channels
        self.num_layers = num_layers

    def __repr__(self) -> str:
        # Every field is separated by ", " (the separator after block_type
        # used to be missing, which glued the first two fields together).
        field_names = (
            "block_type",
            "expand_ratio",
            "kernel",
            "stride",
            "input_channels",
            "out_channels",
            "num_layers",
        )
        args = ", ".join(f"{name}={getattr(self, name)}" for name in field_names)
        return f"{self.__class__.__name__}({args})"

    @staticmethod
    def adjust_channels(channels: int, width_mult: float, min_value: Optional[int] = None) -> int:
        """Scale ``channels`` by ``width_mult`` via ``_make_divisible`` with divisor 8."""
        return _make_divisible(channels * width_mult, 8, min_value)

    @staticmethod
    def adjust_depth(num_layers: int, depth_mult: float) -> int:
        """Scale ``num_layers`` by ``depth_mult``, rounding up to an integer."""
        return int(math.ceil(num_layers * depth_mult))


class FusedMBConv(nn.Module):
    """Fused-MBConv block.

    When the expansion changes the channel count, the block is an expanding
    ``cnf.kernel`` convolution followed by a 1x1 projection without
    activation. Otherwise it is a single ``cnf.kernel`` convolution with
    activation. A residual connection (with stochastic depth on the block
    output) is used when the stride is 1 and input/output channels match.

    Args:
        cnf (MBConvConfig): block configuration.
        stochastic_depth_prob (float): probability passed to ``StochasticDepth``.
        norm_layer (Callable[..., nn.Module]): normalization layer factory.
        se_layer (Optional[Callable[..., nn.Module]]): optional
            squeeze-and-excitation factory; no SE layer is added when ``None``.

    Raises:
        ValueError: if ``cnf.stride`` is not 1 or 2.
    """

    def __init__(
        self,
        cnf: MBConvConfig,
        stochastic_depth_prob: float,
        norm_layer: Callable[..., nn.Module],
        se_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()

        if not (1 <= cnf.stride <= 2):
            raise ValueError("illegal stride value")

        self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels

        layers: List[nn.Module] = []
        activation_layer = nn.SiLU

        # expand
        expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
        has_expansion = expanded_channels != cnf.input_channels
        if has_expansion:
            layers.append(
                ConvNormActivation(
                    cnf.input_channels,
                    expanded_channels,
                    kernel_size=cnf.kernel,
                    stride=cnf.stride,
                    norm_layer=norm_layer,
                    activation_layer=activation_layer,
                )
            )

        if se_layer:
            # squeeze and excitation
            squeeze_channels = max(1, cnf.input_channels // 4)
            layers.append(se_layer(expanded_channels, squeeze_channels, activation=partial(nn.SiLU, inplace=True)))

        # project
        # With an expansion conv in front, this is a plain 1x1 projection with
        # no activation. Without one, this conv is the only convolution of the
        # block, so it carries the kernel size, stride and activation itself.
        layers.append(
            ConvNormActivation(
                expanded_channels,
                cnf.out_channels,
                kernel_size=1 if has_expansion else cnf.kernel,
                stride=1 if has_expansion else cnf.stride,
                norm_layer=norm_layer,
                activation_layer=None if has_expansion else activation_layer,
            )
        )

        self.block = nn.Sequential(*layers)
        self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
        self.out_channels = cnf.out_channels

    def forward(self, input: Tensor) -> Tensor:
        result = self.block(input)
        if self.use_res_connect:
            result = self.stochastic_depth(result)
            result += input
        return result


class EfficientNetV2(nn.Module):
    def __init__(
        self,
        block_setting: List[MBConvConfig],
        dropout: float,
        lastconv_output_channels: Optional[int] = 1280,
        stochastic_depth_prob: float = 0.2,
        num_classes: int = 1000,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        **kwargs: Any,
    ) -> None:
        """
        EfficientNetV2 main class

        Args:
            block_setting (List[MBConvConfig]): Network structure, one entry per stage
            dropout (float): The dropout probability
            lastconv_output_channels (Optional[int]): the output channels of the last conv layer.
                If ``None``, 4x the output channels of the last stage is used
            stochastic_depth_prob (float): The stochastic depth probability
            num_classes (int): Number of classes
            norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use.
                Defaults to ``nn.BatchNorm2d``
            **kwargs: accepted for call compatibility; currently unused

        Raises:
            ValueError: if ``block_setting`` is empty
            TypeError: if ``block_setting`` is not a sequence of ``MBConvConfig``
        """
        super().__init__()

        if not block_setting:
            raise ValueError("The block_setting should not be empty")
        elif not (
            isinstance(block_setting, Sequence)
            and all(isinstance(s, MBConvConfig) for s in block_setting)
        ):
            raise TypeError("The block_setting should be List[MBConvConfig]")

        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        layers: List[nn.Module] = []

        # building first layer
        firstconv_output_channels = block_setting[0].input_channels
        layers.append(
            ConvNormActivation(
                3, firstconv_output_channels, kernel_size=3, stride=2, norm_layer=norm_layer, activation_layer=nn.SiLU
            )
        )

        # building blocks
        total_stage_blocks = sum(cnf.num_layers for cnf in block_setting)
        stage_block_id = 0
        for cnf in block_setting:
            # NOTE: any block_type other than 'MB' selects FusedMBConv.
            block = MBConv if cnf.block_type == "MB" else FusedMBConv
            stage: List[nn.Module] = []
            for _ in range(cnf.num_layers):
                # copy to avoid modifications. shallow copy is enough
                block_cnf = copy.copy(cnf)

                # overwrite info if not the first conv in the stage
                if stage:
                    block_cnf.input_channels = block_cnf.out_channels
                    block_cnf.stride = 1

                # adjust stochastic depth probability based on the depth of the stage block
                sd_prob = stochastic_depth_prob * float(stage_block_id) / total_stage_blocks

                stage.append(block(block_cnf, sd_prob, norm_layer))
                stage_block_id += 1

            layers.append(nn.Sequential(*stage))

        # building last several layers
        lastconv_input_channels = block_setting[-1].out_channels
        if lastconv_output_channels is None:
            lastconv_output_channels = 4 * lastconv_input_channels
        layers.append(
            ConvNormActivation(
                lastconv_input_channels,
                lastconv_output_channels,
                kernel_size=1,
                norm_layer=norm_layer,
                activation_layer=nn.SiLU,
            )
        )

        self.features = nn.Sequential(*layers)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Dropout(p=dropout, inplace=True),
            nn.Linear(lastconv_output_channels, num_classes),
        )

        # weight initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out")
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                init_range = 1.0 / math.sqrt(m.out_features)
                nn.init.uniform_(m.weight, -init_range, init_range)
                nn.init.zeros_(m.bias)

    def _forward_impl(self, x: Tensor) -> Tensor:
        x = self.features(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)

        x = self.classifier(x)

        return x

    def forward(self, x: Tensor) -> Tensor:
        return self._forward_impl(x)


def _efficientnet_v2(
    arch: str,
    block_setting: List[MBConvConfig],
    dropout: float,
    lastconv_output_channels: Optional[int],
    pretrained: bool,
    progress: bool,
    **kwargs: Any,
) -> EfficientNetV2:
    """Build an ``EfficientNetV2`` and optionally load its checkpoint.

    Args:
        arch (str): key into ``model_urls``
        block_setting (List[MBConvConfig]): network structure
        dropout (float): dropout probability of the classifier
        lastconv_output_channels (Optional[int]): output channels of the last conv layer
        pretrained (bool): If True, load the checkpoint registered for ``arch``
        progress (bool): If True, displays a progress bar of the download to stderr
        **kwargs: forwarded to ``EfficientNetV2``

    Raises:
        ValueError: if ``pretrained`` is True and no checkpoint URL is registered for ``arch``
    """
    model = EfficientNetV2(block_setting, dropout, lastconv_output_channels=lastconv_output_channels, **kwargs)
    if pretrained:
        # A missing key and an empty placeholder URL both mean "no checkpoint";
        # fail here instead of handing an empty URL to the downloader.
        if not model_urls.get(arch):
            raise ValueError(f"No checkpoint is available for model type {arch}")
        state_dict = load_state_dict_from_url(model_urls[arch], progress=progress)
        model.load_state_dict(state_dict)
    return model


def efficientnet_v2_s(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNetV2:
    """
    Constructs a EfficientNetV2-S architecture from
    `"EfficientNetV2: Smaller Models and Faster Training" <https://arxiv.org/abs/2104.00298>`_.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    # block_type, expand_ratio, kernel, stride, input_channels, out_channels, num_layers
    block_setting = [
        MBConvConfig("FusedMB", 1, 3, 1, 24, 24, 2),
        MBConvConfig("FusedMB", 4, 3, 2, 24, 48, 4),
        MBConvConfig("FusedMB", 4, 3, 2, 48, 64, 4),
        MBConvConfig("MB", 4, 3, 2, 64, 128, 6),
        MBConvConfig("MB", 6, 3, 1, 128, 160, 9),
        MBConvConfig("MB", 6, 3, 2, 160, 256, 15),
    ]
    return _efficientnet_v2("efficientnet_v2_s", block_setting, 0.0, 1280, pretrained, progress, **kwargs)


def efficientnet_v2_m(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNetV2:
    """
    Constructs a EfficientNetV2-M architecture from
    `"EfficientNetV2: Smaller Models and Faster Training" <https://arxiv.org/abs/2104.00298>`_.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    # block_type, expand_ratio, kernel, stride, input_channels, out_channels, num_layers
    block_setting = [
        MBConvConfig("FusedMB", 1, 3, 1, 24, 24, 3),
        MBConvConfig("FusedMB", 4, 3, 2, 24, 48, 5),
        MBConvConfig("FusedMB", 4, 3, 2, 48, 80, 5),
        MBConvConfig("MB", 4, 3, 2, 80, 160, 7),
        MBConvConfig("MB", 6, 3, 1, 160, 176, 14),
        MBConvConfig("MB", 6, 3, 2, 176, 304, 18),
        MBConvConfig("MB", 6, 3, 1, 304, 512, 5),
    ]
    return _efficientnet_v2("efficientnet_v2_m", block_setting, 0.2, 1280, pretrained, progress, **kwargs)


def efficientnet_v2_l(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> EfficientNetV2:
    """
    Constructs a EfficientNetV2-L architecture from
    `"EfficientNetV2: Smaller Models and Faster Training" <https://arxiv.org/abs/2104.00298>`_.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    # block_type, expand_ratio, kernel, stride, input_channels, out_channels, num_layers
    block_setting = [
        MBConvConfig("FusedMB", 1, 3, 1, 32, 32, 4),
        MBConvConfig("FusedMB", 4, 3, 2, 32, 64, 7),
        MBConvConfig("FusedMB", 4, 3, 2, 64, 96, 7),
        MBConvConfig("MB", 4, 3, 2, 96, 192, 10),
        MBConvConfig("MB", 6, 3, 1, 192, 224, 19),
        MBConvConfig("MB", 6, 3, 2, 224, 384, 25),
        MBConvConfig("MB", 6, 3, 1, 384, 640, 7),
    ]
    return _efficientnet_v2("efficientnet_v2_l", block_setting, 0.5, 1280, pretrained, progress, **kwargs)