From f6ba25f6e1414af38665f8a9c486ae092ccffce9 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Thu, 19 May 2022 15:42:52 +0100
Subject: [PATCH 1/7] Adding an MLP block.

---
 docs/source/ops.rst                      |  1 +
 torchvision/models/vision_transformer.py | 52 ++++++++++++++++++------
 torchvision/ops/__init__.py              |  3 +-
 torchvision/ops/misc.py                  | 34 +++++++++++++++-
 4 files changed, 75 insertions(+), 15 deletions(-)

diff --git a/docs/source/ops.rst b/docs/source/ops.rst
index d045334ce3c..472c45fbab4 100644
--- a/docs/source/ops.rst
+++ b/docs/source/ops.rst
@@ -87,6 +87,7 @@ TorchVision provides commonly used building blocks as layers:
     DeformConv2d
     DropBlock2d
     DropBlock3d
+    MLP
     FrozenBatchNorm2d
     SqueezeExcitation
     StochasticDepth
diff --git a/torchvision/models/vision_transformer.py b/torchvision/models/vision_transformer.py
index dad2804e626..0c3ab682dd0 100644
--- a/torchvision/models/vision_transformer.py
+++ b/torchvision/models/vision_transformer.py
@@ -6,7 +6,7 @@
 import torch
 import torch.nn as nn

-from ..ops.misc import Conv2dNormActivation
+from ..ops.misc import Conv2dNormActivation, MLP
 from ..transforms._presets import ImageClassification, InterpolationMode
 from ..utils import _log_api_usage_once
 from ._api import WeightsEnum, Weights
@@ -37,21 +37,47 @@ class ConvStemConfig(NamedTuple):
     activation_layer: Callable[..., nn.Module] = nn.ReLU


-class MLPBlock(nn.Sequential):
+class MLPBlock(MLP):
     """Transformer MLP block."""

     def __init__(self, in_dim: int, mlp_dim: int, dropout: float):
-        super().__init__()
-        self.linear_1 = nn.Linear(in_dim, mlp_dim)
-        self.act = nn.GELU()
-        self.dropout_1 = nn.Dropout(dropout)
-        self.linear_2 = nn.Linear(mlp_dim, in_dim)
-        self.dropout_2 = nn.Dropout(dropout)
-
-        nn.init.xavier_uniform_(self.linear_1.weight)
-        nn.init.xavier_uniform_(self.linear_2.weight)
-        nn.init.normal_(self.linear_1.bias, std=1e-6)
-        nn.init.normal_(self.linear_2.bias, std=1e-6)
+        super().__init__(in_dim, [mlp_dim], in_dim, activation_layer=nn.GELU, inplace=None, dropout=dropout)
+
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.normal_(m.bias, std=1e-6)
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        version = local_metadata.get("version", None)
+
+        if version is None or version < 2:
+            for i in range(2):
+                for type in ["weight", "bias"]:
+                    old_key = f"{prefix}linear_{i+1}.{type}"
+                    new_key = f"{prefix}{3*i}.{type}"
+                    if old_key in state_dict:
+                        state_dict[new_key] = state_dict.pop(old_key)
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )


 class EncoderBlock(nn.Module):
diff --git a/torchvision/ops/__init__.py b/torchvision/ops/__init__.py
index d3f27ef1657..333e9246401 100644
--- a/torchvision/ops/__init__.py
+++ b/torchvision/ops/__init__.py
@@ -19,7 +19,7 @@
 from .feature_pyramid_network import FeaturePyramidNetwork
 from .focal_loss import sigmoid_focal_loss
 from .giou_loss import generalized_box_iou_loss
-from .misc import FrozenBatchNorm2d, Conv2dNormActivation, Conv3dNormActivation, SqueezeExcitation
+from .misc import FrozenBatchNorm2d, Conv2dNormActivation, Conv3dNormActivation, SqueezeExcitation, MLP
 from .poolers import MultiScaleRoIAlign
 from .ps_roi_align import ps_roi_align, PSRoIAlign
 from .ps_roi_pool import ps_roi_pool, PSRoIPool
@@ -61,6 +61,7 @@
     "Conv2dNormActivation",
     "Conv3dNormActivation",
"SqueezeExcitation", + "MLP", "generalized_box_iou_loss", "distance_box_iou_loss", "complete_box_iou_loss", diff --git a/torchvision/ops/misc.py b/torchvision/ops/misc.py index a4635099215..f4fb65c8fcc 100644 --- a/torchvision/ops/misc.py +++ b/torchvision/ops/misc.py @@ -129,7 +129,7 @@ class Conv2dNormActivation(ConvNormActivation): padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will calculated as ``padding = (kernel_size - 1) // 2 * dilation`` groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer wont be used. Default: ``torch.nn.BatchNorm2d`` - activation_layer (Callable[..., torch.nn.Module], optinal): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``torch.nn.ReLU`` + activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``torch.nn.ReLU`` dilation (int): Spacing between kernel elements. Default: 1 inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True`` bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``. @@ -253,3 +253,35 @@ def _scale(self, input: Tensor) -> Tensor: def forward(self, input: Tensor) -> Tensor: scale = self._scale(input) return scale * input + + +class MLP(torch.nn.Sequential): + # TODO: Add docs + + def __init__( + self, + in_channels: int, + hidden_channels: List[int], + out_channels: int, + norm_layer: Optional[Callable[..., torch.nn.Module]] = None, + activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU, + inplace: Optional[bool] = True, + bias: bool = True, + dropout: float = 0.0, + ): + layers = [] + in_dim = in_channels + for hidden_dim in hidden_channels: + layers.append(torch.nn.Linear(in_dim, hidden_dim, bias=bias)) + if norm_layer is not None: + layers.append(norm_layer(hidden_dim)) + params = {} if inplace is None else {"inplace": inplace} + layers.append(activation_layer(**params)) + layers.append(torch.nn.Dropout(dropout, inplace=inplace)) + in_dim = hidden_dim + + layers.append(torch.nn.Linear(in_dim, out_channels, bias=bias)) + layers.append(torch.nn.Dropout(dropout, inplace=inplace)) + + super().__init__(*layers) + _log_api_usage_once(self) From eb30cb8ec76eb5c15ab7b44514cdfcce9f1de8e9 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 19 May 2022 15:54:17 +0100 Subject: [PATCH 2/7] Adding documentation --- torchvision/models/swin_transformer.py | 2 +- torchvision/ops/misc.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/torchvision/models/swin_transformer.py b/torchvision/models/swin_transformer.py index 6e001c1d2dd..d4642e8b406 100644 --- a/torchvision/models/swin_transformer.py +++ b/torchvision/models/swin_transformer.py @@ -12,7 +12,7 @@ from ._meta import _IMAGENET_CATEGORIES from ._utils import _ovewrite_named_param from .convnext import Permute -from .vision_transformer import MLPBlock +from .vision_transformer import MLPBlock # TODO: remove this, patch the weights and use MLP instead 


 __all__ = [
diff --git a/torchvision/ops/misc.py b/torchvision/ops/misc.py
index f4fb65c8fcc..aede55f1863 100644
--- a/torchvision/ops/misc.py
+++ b/torchvision/ops/misc.py
@@ -256,7 +256,19 @@ def forward(self, input: Tensor) -> Tensor:


 class MLP(torch.nn.Sequential):
-    # TODO: Add docs
+    """
+    This block implements the multi-layer perceptron (MLP) module.
+
+    Args:
+        in_channels (int): Number of channels of the input
+        hidden_channels (List[int]): List of the hidden channel dimensions
+        out_channels (int): Number of channels of the output
+        norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer wont be used. Default: ``None``
+        activation_layer (Callable[..., torch.nn.Module], optinal): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``torch.nn.ReLU``
+        inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
+        bias (bool): Whether to use bias in the linear layer. Default ``True``
+        dropout (float): The probability for the dropout layer. Default: 0.0
+    """

     def __init__(
         self,

From 1dfc3122437219e2ceddbded448ce05f6af711e9 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Thu, 19 May 2022 16:05:16 +0100
Subject: [PATCH 3/7] Update typos.

---
 torchvision/ops/misc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torchvision/ops/misc.py b/torchvision/ops/misc.py
index aede55f1863..c0e3a96cb91 100644
--- a/torchvision/ops/misc.py
+++ b/torchvision/ops/misc.py
@@ -179,7 +179,7 @@ class Conv3dNormActivation(ConvNormActivation):
         padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will calculated as ``padding = (kernel_size - 1) // 2 * dilation``
         groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
         norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer wont be used. Default: ``torch.nn.BatchNorm3d``
-        activation_layer (Callable[..., torch.nn.Module], optinal): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``torch.nn.ReLU``
+        activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``torch.nn.ReLU``
         dilation (int): Spacing between kernel elements. Default: 1
         inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
         bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
@@ -264,7 +264,7 @@ class MLP(torch.nn.Sequential):
         hidden_channels (List[int]): List of the hidden channel dimensions
         out_channels (int): Number of channels of the output
         norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer wont be used. Default: ``None``
-        activation_layer (Callable[..., torch.nn.Module], optinal): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``torch.nn.ReLU``
+        activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``torch.nn.ReLU``
         inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
         bias (bool): Whether to use bias in the linear layer. Default ``True``
         dropout (float): The probability for the dropout layer. Default: 0.0

From 39921e816a859c8e1d6bc98ac91aa3bf1bc506d7 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Thu, 19 May 2022 16:34:23 +0100
Subject: [PATCH 4/7] Fix inplace for Dropout.

---
 torchvision/ops/misc.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/torchvision/ops/misc.py b/torchvision/ops/misc.py
index c0e3a96cb91..be4b5e05711 100644
--- a/torchvision/ops/misc.py
+++ b/torchvision/ops/misc.py
@@ -281,19 +281,20 @@ def __init__(
         bias: bool = True,
         dropout: float = 0.0,
     ):
+        params = {} if inplace is None else {"inplace": inplace}
+
         layers = []
         in_dim = in_channels
         for hidden_dim in hidden_channels:
             layers.append(torch.nn.Linear(in_dim, hidden_dim, bias=bias))
             if norm_layer is not None:
                 layers.append(norm_layer(hidden_dim))
-            params = {} if inplace is None else {"inplace": inplace}
             layers.append(activation_layer(**params))
-            layers.append(torch.nn.Dropout(dropout, inplace=inplace))
+            layers.append(torch.nn.Dropout(dropout, **params))
             in_dim = hidden_dim

         layers.append(torch.nn.Linear(in_dim, out_channels, bias=bias))
-        layers.append(torch.nn.Dropout(dropout, inplace=inplace))
+        layers.append(torch.nn.Dropout(dropout, **params))

         super().__init__(*layers)
         _log_api_usage_once(self)

From 743457da438cca673d9e43e8b8e1f9970ca1f508 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Thu, 19 May 2022 17:06:50 +0100
Subject: [PATCH 5/7] Apply recommendations from code review.

---
 torchvision/models/vision_transformer.py | 3 ++-
 torchvision/ops/misc.py                  | 9 +++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/torchvision/models/vision_transformer.py b/torchvision/models/vision_transformer.py
index 0c3ab682dd0..063d51749b4 100644
--- a/torchvision/models/vision_transformer.py
+++ b/torchvision/models/vision_transformer.py
@@ -41,7 +41,7 @@ class MLPBlock(MLP):
     """Transformer MLP block."""

     def __init__(self, in_dim: int, mlp_dim: int, dropout: float):
-        super().__init__(in_dim, [mlp_dim], in_dim, activation_layer=nn.GELU, inplace=None, dropout=dropout)
+        super().__init__(in_dim, [mlp_dim, in_dim], activation_layer=nn.GELU, inplace=None, dropout=dropout)

         for m in self.modules():
             if isinstance(m, nn.Linear):
@@ -62,6 +62,7 @@ def _load_from_state_dict(
         version = local_metadata.get("version", None)

         if version is None or version < 2:
+            # Replacing legacy MLPBlock with MLP. See https://github.com/pytorch/vision/pull/6053
             for i in range(2):
                 for type in ["weight", "bias"]:
                     old_key = f"{prefix}linear_{i+1}.{type}"
diff --git a/torchvision/ops/misc.py b/torchvision/ops/misc.py
index be4b5e05711..916e9cd38a5 100644
--- a/torchvision/ops/misc.py
+++ b/torchvision/ops/misc.py
@@ -256,13 +256,11 @@ def forward(self, input: Tensor) -> Tensor:


 class MLP(torch.nn.Sequential):
-    """
-    This block implements the multi-layer perceptron (MLP) module.
+    """This block implements the multi-layer perceptron (MLP) module.

     Args:
         in_channels (int): Number of channels of the input
         hidden_channels (List[int]): List of the hidden channel dimensions
-        out_channels (int): Number of channels of the output
         norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer wont be used. Default: ``None``
         activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``torch.nn.ReLU``
         inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
@@ -274,7 +272,6 @@ def __init__(
         self,
         in_channels: int,
         hidden_channels: List[int],
-        out_channels: int,
         norm_layer: Optional[Callable[..., torch.nn.Module]] = None,
         activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
         inplace: Optional[bool] = True,
@@ -285,7 +282,7 @@ def __init__(

         layers = []
         in_dim = in_channels
-        for hidden_dim in hidden_channels:
+        for hidden_dim in hidden_channels[:-1]:
             layers.append(torch.nn.Linear(in_dim, hidden_dim, bias=bias))
             if norm_layer is not None:
                 layers.append(norm_layer(hidden_dim))
@@ -293,7 +290,7 @@ def __init__(
             layers.append(torch.nn.Dropout(dropout, **params))
             in_dim = hidden_dim

-        layers.append(torch.nn.Linear(in_dim, out_channels, bias=bias))
+        layers.append(torch.nn.Linear(in_dim, hidden_channels[-1], bias=bias))
         layers.append(torch.nn.Dropout(dropout, **params))

         super().__init__(*layers)

From 9356107b3574571d45fa65faad04dd8fd914143e Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Thu, 19 May 2022 17:35:05 +0100
Subject: [PATCH 6/7] Making changes on pre-trained models.

---
 torchvision/models/swin_transformer.py | 18 ++++++++++++------
 torchvision/ops/misc.py                |  2 ++
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/torchvision/models/swin_transformer.py b/torchvision/models/swin_transformer.py
index d4642e8b406..dacec59c91a 100644
--- a/torchvision/models/swin_transformer.py
+++ b/torchvision/models/swin_transformer.py
@@ -6,13 +6,13 @@
 from torch import nn, Tensor

 from ..ops.stochastic_depth import StochasticDepth
+from ..ops.misc import MLP
 from ..transforms._presets import ImageClassification, InterpolationMode
 from ..utils import _log_api_usage_once
 from ._api import WeightsEnum, Weights
 from ._meta import _IMAGENET_CATEGORIES
 from ._utils import _ovewrite_named_param
-from .convnext import Permute
-from .vision_transformer import MLPBlock  # TODO: remove this, patch the weights and use MLP instead
+from .convnext import Permute  # TODO: move Permute on ops


 __all__ = [
@@ -263,7 +263,13 @@ def __init__(
         )
         self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
         self.norm2 = norm_layer(dim)
-        self.mlp = MLPBlock(dim, int(dim * mlp_ratio), dropout)
+        self.mlp = MLP(dim, [int(dim * mlp_ratio), dim], activation_layer=nn.GELU, inplace=None, dropout=dropout)
+
+        for m in self.mlp.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.normal_(m.bias, std=1e-6)

     def forward(self, x: Tensor):
         x = x + self.stochastic_depth(self.attn(self.norm1(x)))
@@ -412,7 +418,7 @@ def _swin_transformer(

 class Swin_T_Weights(WeightsEnum):
     IMAGENET1K_V1 = Weights(
-        url="https://download.pytorch.org/models/swin_t-4c37bd06.pth",
+        url="https://download.pytorch.org/models/swin_t-704ceda3.pth",
         transforms=partial(
             ImageClassification, crop_size=224, resize_size=232, interpolation=InterpolationMode.BICUBIC
         ),
@@ -435,7 +441,7 @@ class Swin_T_Weights(WeightsEnum):

 class Swin_S_Weights(WeightsEnum):
     IMAGENET1K_V1 = Weights(
-        url="https://download.pytorch.org/models/swin_s-30134662.pth",
+        url="https://download.pytorch.org/models/swin_s-5e29d889.pth",
         transforms=partial(
             ImageClassification, crop_size=224, resize_size=246, interpolation=InterpolationMode.BICUBIC
         ),
@@ -458,7 +464,7 @@ class Swin_S_Weights(WeightsEnum):

 class Swin_B_Weights(WeightsEnum):
     IMAGENET1K_V1 = Weights(
-        url="https://download.pytorch.org/models/swin_b-1f1feb5c.pth",
+        url="https://download.pytorch.org/models/swin_b-68c6b09e.pth",
         transforms=partial(
             ImageClassification, crop_size=224, resize_size=238, interpolation=InterpolationMode.BICUBIC
         ),
diff --git a/torchvision/ops/misc.py b/torchvision/ops/misc.py
index 916e9cd38a5..2e4816c9f22 100644
--- a/torchvision/ops/misc.py
+++ b/torchvision/ops/misc.py
@@ -278,6 +278,8 @@ def __init__(
         bias: bool = True,
         dropout: float = 0.0,
     ):
+        # The addition of `norm_layer` is inspired from the implementation of TorchMultimodal:
+        # https://github.com/facebookresearch/multimodal/blob/5dec8a/torchmultimodal/modules/layers/mlp.py
         params = {} if inplace is None else {"inplace": inplace}

         layers = []

From 48d178d7e7cb7065267a22a1f282d0482ecb98cc Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Thu, 19 May 2022 18:10:23 +0100
Subject: [PATCH 7/7] Fix linter

---
 torchvision/models/swin_transformer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/models/swin_transformer.py b/torchvision/models/swin_transformer.py
index dacec59c91a..148bfa1c4a2 100644
--- a/torchvision/models/swin_transformer.py
+++ b/torchvision/models/swin_transformer.py
@@ -5,8 +5,8 @@
 import torch.nn.functional as F
 from torch import nn, Tensor

-from ..ops.stochastic_depth import StochasticDepth
 from ..ops.misc import MLP
+from ..ops.stochastic_depth import StochasticDepth
 from ..transforms._presets import ImageClassification, InterpolationMode
 from ..utils import _log_api_usage_once
 from ._api import WeightsEnum, Weights
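
Usage sketch (not part of the patches above): a minimal example of the new torchvision.ops.MLP, assuming the signature as it stands after PATCH 5/7, where the last entry of ``hidden_channels`` serves as the output dimension and there is no separate ``out_channels`` argument. The tensor sizes below are illustrative.

    import torch
    from torchvision.ops import MLP

    # Generic MLP: Linear(64 -> 128) -> ReLU -> Dropout -> Linear(128 -> 32) -> Dropout.
    # The default activation is torch.nn.ReLU with inplace=True.
    mlp = MLP(64, [128, 32], dropout=0.1)
    x = torch.randn(16, 64)
    print(mlp(x).shape)  # torch.Size([16, 32])

    # Transformer-style block, roughly what MLPBlock builds after PATCH 5/7.
    # inplace=None is needed here because torch.nn.GELU does not accept an inplace
    # argument; with inplace=None no keyword is forwarded to the activation or dropout layers.
    vit_mlp = MLP(768, [3072, 768], activation_layer=torch.nn.GELU, inplace=None, dropout=0.0)
    print(vit_mlp(torch.randn(16, 197, 768)).shape)  # torch.Size([16, 197, 768])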