123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423 |
- from functools import partial
- from typing import Any, Callable, List, Optional, Sequence
- import torch
- from torch import nn, Tensor
- from ..ops.misc import Conv2dNormActivation, SqueezeExcitation as SElayer
- from ..transforms._presets import ImageClassification
- from ..utils import _log_api_usage_once
- from ._api import register_model, Weights, WeightsEnum
- from ._meta import _IMAGENET_CATEGORIES
- from ._utils import _make_divisible, _ovewrite_named_param, handle_legacy_interface
# Names exported by ``from <this module> import *``: the model class, the two
# pretrained-weight enums and the two builder functions defined in this file.
__all__ = [
    "MobileNetV3",
    "MobileNet_V3_Large_Weights",
    "MobileNet_V3_Small_Weights",
    "mobilenet_v3_large",
    "mobilenet_v3_small",
]
class InvertedResidualConfig:
    """Settings of one inverted residual block (Tables 1 and 2 of the MobileNetV3 paper).

    The three channel counts are scaled by ``width_mult`` through
    ``adjust_channels``. ``activation`` is reduced to the boolean ``use_hs``;
    every other argument is stored unchanged.

    Args:
        input_channels (int): Channels entering the block, before scaling.
        kernel (int): Kernel size stored for the block.
        expanded_channels (int): Channels of the expanded representation, before scaling.
        out_channels (int): Channels leaving the block, before scaling.
        use_se (bool): Whether the block uses a squeeze-and-excitation layer.
        activation (str): ``"HS"`` sets ``use_hs`` to True; any other value sets it to False.
        stride (int): Stride stored for the block.
        dilation (int): Dilation stored for the block.
        width_mult (float): Multiplier applied to the three channel counts.
    """

    def __init__(
        self,
        input_channels: int,
        kernel: int,
        expanded_channels: int,
        out_channels: int,
        use_se: bool,
        activation: str,
        stride: int,
        dilation: int,
        width_mult: float,
    ):
        # Scale the channel counts first, in the order input -> expanded -> out.
        scaled_in, scaled_expanded, scaled_out = (
            self.adjust_channels(count, width_mult)
            for count in (input_channels, expanded_channels, out_channels)
        )

        self.input_channels = scaled_in
        self.kernel = kernel
        self.expanded_channels = scaled_expanded
        self.out_channels = scaled_out
        self.use_se = use_se
        # Only the exact string "HS" enables hard-swish.
        self.use_hs = activation == "HS"
        self.stride = stride
        self.dilation = dilation

    @staticmethod
    def adjust_channels(channels: int, width_mult: float):
        """Scale ``channels`` by ``width_mult`` and pass the result to ``_make_divisible`` with divisor 8."""
        scaled = channels * width_mult
        return _make_divisible(scaled, 8)
class InvertedResidual(nn.Module):
    """Inverted residual block, implemented as described in section 5 of the MobileNetV3 paper.

    The block chains, in order: an optional 1x1 expansion convolution, a
    depthwise convolution, an optional squeeze-and-excitation layer and a 1x1
    projection convolution without activation. The input is added to the
    result when the configured stride is 1 and the input and output channel
    counts are equal.

    Args:
        cnf (InvertedResidualConfig): Settings of the block.
        norm_layer (Callable[..., nn.Module]): Normalization layer handed to every
            ``Conv2dNormActivation`` in the block.
        se_layer (Callable[..., nn.Module]): Factory called with the expanded channel
            count and the squeeze channel count when ``cnf.use_se`` is set.

    Raises:
        ValueError: If ``cnf.stride`` is smaller than 1 or larger than 2.
    """

    def __init__(
        self,
        cnf: InvertedResidualConfig,
        norm_layer: Callable[..., nn.Module],
        se_layer: Callable[..., nn.Module] = partial(SElayer, scale_activation=nn.Hardsigmoid),
    ):
        super().__init__()
        if cnf.stride < 1 or cnf.stride > 2:
            raise ValueError("illegal stride value")

        self.use_res_connect = cnf.stride == 1 and cnf.input_channels == cnf.out_channels

        act = nn.Hardswish if cnf.use_hs else nn.ReLU
        # All convolutions of the block share the same normalization layer.
        conv = partial(Conv2dNormActivation, norm_layer=norm_layer)
        stages: List[nn.Module] = []

        # expand (omitted when it would not change the channel count)
        if cnf.expanded_channels != cnf.input_channels:
            stages.append(conv(cnf.input_channels, cnf.expanded_channels, kernel_size=1, activation_layer=act))

        # depthwise (a dilated block always runs with stride 1)
        depthwise_stride = cnf.stride if cnf.dilation <= 1 else 1
        stages.append(
            conv(
                cnf.expanded_channels,
                cnf.expanded_channels,
                kernel_size=cnf.kernel,
                stride=depthwise_stride,
                dilation=cnf.dilation,
                groups=cnf.expanded_channels,
                activation_layer=act,
            )
        )

        # squeeze-and-excitation, squeezing to a quarter of the expanded width
        if cnf.use_se:
            squeeze_channels = _make_divisible(cnf.expanded_channels // 4, 8)
            stages.append(se_layer(cnf.expanded_channels, squeeze_channels))

        # project (no activation)
        stages.append(conv(cnf.expanded_channels, cnf.out_channels, kernel_size=1, activation_layer=None))

        self.block = nn.Sequential(*stages)
        self.out_channels = cnf.out_channels
        # Derived from the configured stride, not from the effective depthwise stride above.
        self._is_cn = cnf.stride > 1

    def forward(self, input: Tensor) -> Tensor:
        """Run the block and, when the residual connection is enabled, add ``input`` in place."""
        out = self.block(input)
        if not self.use_res_connect:
            return out
        out += input
        return out
class MobileNetV3(nn.Module):
    def __init__(
        self,
        inverted_residual_setting: List[InvertedResidualConfig],
        last_channel: int,
        num_classes: int = 1000,
        block: Optional[Callable[..., nn.Module]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        dropout: float = 0.2,
        **kwargs: Any,
    ) -> None:
        """
        MobileNet V3 main class

        Args:
            inverted_residual_setting (List[InvertedResidualConfig]): Network structure
            last_channel (int): The number of channels on the penultimate layer
            num_classes (int): Number of classes
            block (Optional[Callable[..., nn.Module]]): Module specifying inverted residual building block for
                mobilenet. Defaults to ``InvertedResidual``
            norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use.
                Defaults to ``nn.BatchNorm2d`` with ``eps=0.001`` and ``momentum=0.01``
            dropout (float): The dropout probability
            **kwargs: Extra keyword arguments are accepted and ignored by this constructor

        Raises:
            ValueError: If ``inverted_residual_setting`` is empty
            TypeError: If ``inverted_residual_setting`` is not a sequence made only of
                ``InvertedResidualConfig`` instances
        """
        super().__init__()
        _log_api_usage_once(self)

        if not inverted_residual_setting:
            raise ValueError("The inverted_residual_setting should not be empty")
        elif not (
            isinstance(inverted_residual_setting, Sequence)
            # Generator instead of a list: stops at the first non-config entry.
            and all(isinstance(s, InvertedResidualConfig) for s in inverted_residual_setting)
        ):
            raise TypeError("The inverted_residual_setting should be List[InvertedResidualConfig]")

        if block is None:
            block = InvertedResidual

        if norm_layer is None:
            norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.01)

        layers: List[nn.Module] = []

        # building first layer: 3 input channels, stride 2, width taken from the first block
        firstconv_output_channels = inverted_residual_setting[0].input_channels
        layers.append(
            Conv2dNormActivation(
                3,
                firstconv_output_channels,
                kernel_size=3,
                stride=2,
                norm_layer=norm_layer,
                activation_layer=nn.Hardswish,
            )
        )

        # building inverted residual blocks
        for cnf in inverted_residual_setting:
            layers.append(block(cnf, norm_layer))

        # building last several layers: the final 1x1 conv widens the last block's output six-fold
        lastconv_input_channels = inverted_residual_setting[-1].out_channels
        lastconv_output_channels = 6 * lastconv_input_channels
        layers.append(
            Conv2dNormActivation(
                lastconv_input_channels,
                lastconv_output_channels,
                kernel_size=1,
                norm_layer=norm_layer,
                activation_layer=nn.Hardswish,
            )
        )

        self.features = nn.Sequential(*layers)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Linear(lastconv_output_channels, last_channel),
            nn.Hardswish(inplace=True),
            nn.Dropout(p=dropout, inplace=True),
            nn.Linear(last_channel, num_classes),
        )

        # Re-initialise every submodule:
        #   conv   -> Kaiming normal (fan_out), zero bias
        #   norm   -> unit weight, zero bias
        #   linear -> normal(0, 0.01) weight, zero bias
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out")
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)

    def _forward_impl(self, x: Tensor) -> Tensor:
        """Apply the feature extractor, global average pooling, flattening and the classifier."""
        x = self.features(x)

        x = self.avgpool(x)
        # Flatten everything after the first dimension before the linear layers.
        x = torch.flatten(x, 1)

        x = self.classifier(x)

        return x

    def forward(self, x: Tensor) -> Tensor:
        """Return the classifier output for ``x``; delegates to ``_forward_impl``."""
        return self._forward_impl(x)
def _mobilenet_v3_conf(
    arch: str, width_mult: float = 1.0, reduced_tail: bool = False, dilated: bool = False, **kwargs: Any
):
    """Build the block settings and penultimate-layer width for a MobileNetV3 variant.

    Args:
        arch (str): Either ``"mobilenet_v3_large"`` or ``"mobilenet_v3_small"``.
        width_mult (float): Channel multiplier forwarded to every ``InvertedResidualConfig``
            and applied to the returned ``last_channel``.
        reduced_tail (bool): If True, the channel counts of the C4 stage and of
            ``last_channel`` are integer-divided by 2.
        dilated (bool): If True, the C4 stage uses dilation 2 instead of 1.
        **kwargs: Accepted and ignored by this function.

    Returns:
        A tuple ``(inverted_residual_setting, last_channel)``.

    Raises:
        ValueError: If ``arch`` is not one of the two supported names.
    """
    reduce_divider = 1
    if reduced_tail:
        reduce_divider = 2
    dilation = 1
    if dilated:
        dilation = 2

    bneck_conf = partial(InvertedResidualConfig, width_mult=width_mult)
    adjust_channels = partial(InvertedResidualConfig.adjust_channels, width_mult=width_mult)

    # Row layout: input, kernel, expanded, out, use_se, activation, stride, dilation
    if arch == "mobilenet_v3_large":
        tail_out = 160 // reduce_divider
        tail_expanded = 960 // reduce_divider
        rows = [
            (16, 3, 16, 16, False, "RE", 1, 1),
            (16, 3, 64, 24, False, "RE", 2, 1),  # C1
            (24, 3, 72, 24, False, "RE", 1, 1),
            (24, 5, 72, 40, True, "RE", 2, 1),  # C2
            (40, 5, 120, 40, True, "RE", 1, 1),
            (40, 5, 120, 40, True, "RE", 1, 1),
            (40, 3, 240, 80, False, "HS", 2, 1),  # C3
            (80, 3, 200, 80, False, "HS", 1, 1),
            (80, 3, 184, 80, False, "HS", 1, 1),
            (80, 3, 184, 80, False, "HS", 1, 1),
            (80, 3, 480, 112, True, "HS", 1, 1),
            (112, 3, 672, 112, True, "HS", 1, 1),
            (112, 5, 672, tail_out, True, "HS", 2, dilation),  # C4
            (tail_out, 5, tail_expanded, tail_out, True, "HS", 1, dilation),
            (tail_out, 5, tail_expanded, tail_out, True, "HS", 1, dilation),
        ]
        inverted_residual_setting = [bneck_conf(*row) for row in rows]
        last_channel = adjust_channels(1280 // reduce_divider)  # C5
    elif arch == "mobilenet_v3_small":
        tail_out = 96 // reduce_divider
        tail_expanded = 576 // reduce_divider
        rows = [
            (16, 3, 16, 16, True, "RE", 2, 1),  # C1
            (16, 3, 72, 24, False, "RE", 2, 1),  # C2
            (24, 3, 88, 24, False, "RE", 1, 1),
            (24, 5, 96, 40, True, "HS", 2, 1),  # C3
            (40, 5, 240, 40, True, "HS", 1, 1),
            (40, 5, 240, 40, True, "HS", 1, 1),
            (40, 5, 120, 48, True, "HS", 1, 1),
            (48, 5, 144, 48, True, "HS", 1, 1),
            (48, 5, 288, tail_out, True, "HS", 2, dilation),  # C4
            (tail_out, 5, tail_expanded, tail_out, True, "HS", 1, dilation),
            (tail_out, 5, tail_expanded, tail_out, True, "HS", 1, dilation),
        ]
        inverted_residual_setting = [bneck_conf(*row) for row in rows]
        last_channel = adjust_channels(1024 // reduce_divider)  # C5
    else:
        raise ValueError(f"Unsupported model type {arch}")

    return inverted_residual_setting, last_channel
def _mobilenet_v3(
    inverted_residual_setting: List[InvertedResidualConfig],
    last_channel: int,
    weights: Optional[WeightsEnum],
    progress: bool,
    **kwargs: Any,
) -> MobileNetV3:
    """Instantiate ``MobileNetV3`` and optionally load pretrained weights into it.

    Args:
        inverted_residual_setting (List[InvertedResidualConfig]): Network structure.
        last_channel (int): The number of channels on the penultimate layer.
        weights (Optional[WeightsEnum]): Weights to load, or None to keep the fresh initialisation.
        progress (bool): Forwarded to ``weights.get_state_dict``.
        **kwargs: Forwarded to the ``MobileNetV3`` constructor.

    Returns:
        The constructed model.
    """
    if weights is None:
        return MobileNetV3(inverted_residual_setting, last_channel, **kwargs)

    # The classifier size must match the category list stored with the weights.
    category_count = len(weights.meta["categories"])
    _ovewrite_named_param(kwargs, "num_classes", category_count)

    model = MobileNetV3(inverted_residual_setting, last_channel, **kwargs)
    state_dict = weights.get_state_dict(progress=progress, check_hash=True)
    model.load_state_dict(state_dict)
    return model
# Metadata entries spread into the ``meta`` dict of every weights entry in this file.
_COMMON_META = dict(
    min_size=(1, 1),
    categories=_IMAGENET_CATEGORIES,
)
class MobileNet_V3_Large_Weights(WeightsEnum):
    # Pretrained weight entries accepted by ``mobilenet_v3_large``. Each entry
    # bundles a checkpoint URL, the preprocessing transform and a metadata dict
    # that starts from ``_COMMON_META``.

    # First checkpoint: 224 crop with the transform's default resize size.
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 5483032,
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#mobilenetv3-large--small",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 74.042,
                    "acc@5": 91.340,
                }
            },
            # NOTE(review): units of "_ops" and "_file_size" are not stated here — confirm before relying on them.
            "_ops": 0.217,
            "_file_size": 21.114,
            "_docs": """These weights were trained from scratch by using a simple training recipe.""",
        },
    )
    # Second checkpoint: same parameter count, evaluated with resize_size=232.
    IMAGENET1K_V2 = Weights(
        url="https://download.pytorch.org/models/mobilenet_v3_large-5c1a4163.pth",
        transforms=partial(ImageClassification, crop_size=224, resize_size=232),
        meta={
            **_COMMON_META,
            "num_params": 5483032,
            "recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-reg-tuning",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 75.274,
                    "acc@5": 92.566,
                }
            },
            "_ops": 0.217,
            "_file_size": 21.107,
            "_docs": """
                These weights improve marginally upon the results of the original paper by using a modified version of
                TorchVision's `new training recipe
                <https://pytorch.org/blog/how-to-train-state-of-the-art-models-using-torchvision-latest-primitives/>`_.
            """,
        },
    )
    # DEFAULT is an alias of the V2 entry.
    DEFAULT = IMAGENET1K_V2
class MobileNet_V3_Small_Weights(WeightsEnum):
    # Pretrained weight entries accepted by ``mobilenet_v3_small``. The single
    # entry bundles a checkpoint URL, the preprocessing transform and a
    # metadata dict that starts from ``_COMMON_META``.
    IMAGENET1K_V1 = Weights(
        url="https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth",
        transforms=partial(ImageClassification, crop_size=224),
        meta={
            **_COMMON_META,
            "num_params": 2542856,
            "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#mobilenetv3-large--small",
            "_metrics": {
                "ImageNet-1K": {
                    "acc@1": 67.668,
                    "acc@5": 87.402,
                }
            },
            # NOTE(review): units of "_ops" and "_file_size" are not stated here — confirm before relying on them.
            "_ops": 0.057,
            "_file_size": 9.829,
            "_docs": """
                These weights improve upon the results of the original paper by using a simple training recipe.
            """,
        },
    )
    # DEFAULT is an alias of the only entry.
    DEFAULT = IMAGENET1K_V1
@register_model()
@handle_legacy_interface(weights=("pretrained", MobileNet_V3_Large_Weights.IMAGENET1K_V1))
def mobilenet_v3_large(
    *, weights: Optional[MobileNet_V3_Large_Weights] = None, progress: bool = True, **kwargs: Any
) -> MobileNetV3:
    """
    Constructs a large MobileNetV3 architecture from
    `Searching for MobileNetV3 <https://arxiv.org/abs/1905.02244>`__.

    Args:
        weights (:class:`~torchvision.models.MobileNet_V3_Large_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.MobileNet_V3_Large_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.mobilenet.MobileNetV3``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py>`_
            for more details about this class. The same keyword arguments are also
            passed to the configuration helper, which reads ``width_mult``,
            ``reduced_tail`` and ``dilated`` from them.

    .. autoclass:: torchvision.models.MobileNet_V3_Large_Weights
        :members:
    """
    checked_weights = MobileNet_V3_Large_Weights.verify(weights)

    block_settings, penultimate_channels = _mobilenet_v3_conf("mobilenet_v3_large", **kwargs)
    return _mobilenet_v3(block_settings, penultimate_channels, checked_weights, progress, **kwargs)
@register_model()
@handle_legacy_interface(weights=("pretrained", MobileNet_V3_Small_Weights.IMAGENET1K_V1))
def mobilenet_v3_small(
    *, weights: Optional[MobileNet_V3_Small_Weights] = None, progress: bool = True, **kwargs: Any
) -> MobileNetV3:
    """
    Constructs a small MobileNetV3 architecture from
    `Searching for MobileNetV3 <https://arxiv.org/abs/1905.02244>`__.

    Args:
        weights (:class:`~torchvision.models.MobileNet_V3_Small_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.MobileNet_V3_Small_Weights` below for
            more details, and possible values. By default, no pre-trained
            weights are used.
        progress (bool, optional): If True, displays a progress bar of the
            download to stderr. Default is True.
        **kwargs: parameters passed to the ``torchvision.models.mobilenet.MobileNetV3``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py>`_
            for more details about this class. The same keyword arguments are also
            passed to the configuration helper, which reads ``width_mult``,
            ``reduced_tail`` and ``dilated`` from them.

    .. autoclass:: torchvision.models.MobileNet_V3_Small_Weights
        :members:
    """
    checked_weights = MobileNet_V3_Small_Weights.verify(weights)

    block_settings, penultimate_channels = _mobilenet_v3_conf("mobilenet_v3_small", **kwargs)
    return _mobilenet_v3(block_settings, penultimate_channels, checked_weights, progress, **kwargs)
|