Refactoring base module (ConvModule) (#3783)
* Split `ConvModule` to per dimension Module

* Remove `conv_cfg`

* Remove `build_conv_layer`

* Move `DepthwiseSeparableConvModule` into `conv_module`

* Apply `pre-commit` fixes

* Remove `build_conv_layer` vestige

* Remove assertion errors

* Remove unused `efficient_conv_bn_eval`

* Fix unit test

* Remove `order`
sungchul2 authored Aug 7, 2024
1 parent f618a37 commit 7f7f299
Showing 44 changed files with 461 additions and 998 deletions.
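
The bullets above summarize the API change; in practice, the convolution dimensionality moves out of the `conv_cfg` dict and into the class name. Below is a minimal before/after sketch, assuming the keyword signatures shown in the hunks that follow (channel counts are illustrative):

import torch

from otx.algo.modules.conv_module import Conv2dModule, Conv3dModule

# Before: ConvModule(16, 32, 3, conv_cfg={"type": "Conv3d"}, ...)
# After: the dimension is encoded in the class name, `conv_cfg` is gone,
# and each module applies conv -> norm -> act (the `order` option was removed).
conv2d = Conv2dModule(
    in_channels=16,
    out_channels=32,
    kernel_size=3,
    padding=1,
    norm_cfg={"type": "BN", "requires_grad": True},
    act_cfg={"type": "ReLU", "inplace": True},
)
conv3d = Conv3dModule(
    in_channels=16,
    out_channels=32,
    kernel_size=3,
    padding=1,
    norm_cfg={"type": "BN3d", "requires_grad": True},
    act_cfg={"type": "ReLU", "inplace": True},
)

y2 = conv2d(torch.randn(1, 16, 64, 64))     # (N, C, H, W)
y3 = conv3d(torch.randn(1, 16, 8, 32, 32))  # (N, C, T, H, W)
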
37 changes: 9 additions & 28 deletions src/otx/algo/action_classification/backbones/x3d.py
@@ -3,6 +3,7 @@
# Copyright (c) OpenMMLab. All rights reserved.

"""X3D backbone implementation."""

from __future__ import annotations

import math
@@ -12,7 +13,7 @@
from torch.nn.modules.batchnorm import _BatchNorm

from otx.algo.modules.activation import Swish, build_activation_layer
from otx.algo.modules.conv_module import ConvModule
from otx.algo.modules.conv_module import Conv3dModule
from otx.algo.utils.mmengine_utils import load_checkpoint
from otx.algo.utils.weight_init import constant_init, kaiming_init

@@ -70,8 +71,6 @@ class BlockX3D(nn.Module):
unit. If set as None, it means not using SE unit. Default: None.
use_swish (bool): Whether to use swish as the activation function
before and after the 3x3x3 conv. Default: True.
conv_cfg (dict): Config dict for convolution layer.
Default: ``dict(type='Conv3d')``.
norm_cfg (dict): Config for norm layers. required keys are ``type``,
Default: ``dict(type='BN3d')``.
act_cfg (dict): Config dict for activation layer.
@@ -89,7 +88,6 @@ def __init__(
downsample: nn.Module | None = None,
se_ratio: float | None = None,
use_swish: bool = True,
conv_cfg: dict | None = None,
norm_cfg: dict | None = None,
act_cfg: dict | None = None,
with_cp: bool = False,
@@ -103,47 +101,43 @@ def __init__(
self.downsample = downsample
self.se_ratio = se_ratio
self.use_swish = use_swish
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.act_cfg = act_cfg
self.act_cfg_swish = Swish()
self.with_cp = with_cp

self.conv1 = ConvModule(
self.conv1 = Conv3dModule(
in_channels=inplanes,
out_channels=planes,
kernel_size=1,
stride=1,
padding=0,
bias=False,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg,
)
# Here we use the channel-wise conv
self.conv2 = ConvModule(
self.conv2 = Conv3dModule(
in_channels=planes,
out_channels=planes,
kernel_size=3,
stride=(1, self.spatial_stride, self.spatial_stride),
padding=1,
groups=planes,
bias=False,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=None,
)

self.swish = Swish()

self.conv3 = ConvModule(
self.conv3 = Conv3dModule(
in_channels=planes,
out_channels=outplanes,
kernel_size=1,
stride=1,
padding=0,
bias=False,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=None,
)
@@ -201,8 +195,6 @@ class X3DBackbone(nn.Module):
unit. If set as None, it means not using SE unit. Default: 1 / 16.
use_swish (bool): Whether to use swish as the activation function
before and after the 3x3x3 conv. Default: True.
conv_cfg (dict): Config for conv layers. required keys are ``type``
Default: ``dict(type='Conv3d')``.
norm_cfg (dict): Config for norm layers. required keys are ``type`` and
``requires_grad``.
Default: ``dict(type='BN3d', requires_grad=True)``.
@@ -231,7 +223,6 @@ def __init__(
se_style: str = "half",
se_ratio: float = 1 / 16,
use_swish: bool = True,
conv_cfg: dict | None = None,
norm_cfg: dict | None = None,
act_cfg: dict | None = None,
norm_eval: bool = False,
@@ -275,7 +266,6 @@ def __init__(
raise ValueError(msg)
self.use_swish = use_swish

self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.act_cfg = act_cfg
self.norm_eval = norm_eval
@@ -304,7 +294,6 @@ def __init__(
se_ratio=self.se_ratio,
use_swish=self.use_swish,
norm_cfg=self.norm_cfg,
conv_cfg=self.conv_cfg,
act_cfg=self.act_cfg,
with_cp=with_cp,
**kwargs,
@@ -315,14 +304,13 @@ def __init__(
self.res_layers.append(layer_name)

self.feat_dim = self.base_channels * 2 ** (len(self.stage_blocks) - 1)
self.conv5 = ConvModule(
self.conv5 = Conv3dModule(
self.feat_dim,
int(self.feat_dim * self.gamma_b),
kernel_size=1,
stride=1,
padding=0,
bias=False,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg,
)
@@ -363,7 +351,6 @@ def make_res_layer(
use_swish: bool = True,
norm_cfg: dict | None = None,
act_cfg: dict | None = None,
conv_cfg: dict | None = None,
with_cp: bool = False,
**kwargs,
) -> nn.Module:
@@ -388,7 +375,6 @@
Default: None.
use_swish (bool): Whether to use swish as the activation function
before and after the 3x3x3 conv. Default: True.
conv_cfg (dict | None): Config for norm layers. Default: None.
norm_cfg (dict | None): Config for norm layers. Default: None.
act_cfg (dict | None): Config for activate layers. Default: None.
with_cp (bool | None): Use checkpoint or not. Using checkpoint
@@ -400,14 +386,13 @@
"""
downsample = None
if spatial_stride != 1 or layer_inplanes != inplanes:
downsample = ConvModule(
downsample = Conv3dModule(
layer_inplanes,
inplanes,
kernel_size=1,
stride=(1, spatial_stride, spatial_stride),
padding=0,
bias=False,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=None,
)
@@ -431,7 +416,6 @@
se_ratio=se_ratio if use_se[0] else None,
use_swish=use_swish,
norm_cfg=norm_cfg,
conv_cfg=conv_cfg,
act_cfg=act_cfg,
with_cp=with_cp,
**kwargs,
@@ -448,7 +432,6 @@
se_ratio=se_ratio if use_se[i] else None,
use_swish=use_swish,
norm_cfg=norm_cfg,
conv_cfg=conv_cfg,
act_cfg=act_cfg,
with_cp=with_cp,
**kwargs,
@@ -459,26 +442,24 @@

def _make_stem_layer(self) -> None:
"""Construct the stem layers consists of a conv+norm+act module and a pooling layer."""
self.conv1_s = ConvModule(
self.conv1_s = Conv3dModule(
self.in_channels,
self.base_channels,
kernel_size=(1, 3, 3),
stride=(1, 2, 2),
padding=(0, 1, 1),
bias=False,
conv_cfg=self.conv_cfg,
norm_cfg=None,
act_cfg=None,
)
self.conv1_t = ConvModule(
self.conv1_t = Conv3dModule(
self.base_channels,
self.base_channels,
kernel_size=(5, 1, 1),
stride=(1, 1, 1),
padding=(2, 0, 0),
groups=self.base_channels,
bias=False,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg,
)
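
After this change, a residual block is built with no `conv_cfg` at all. A hypothetical sketch (channel counts are illustrative; `inplanes`, `planes`, `outplanes`, and `spatial_stride` are inferred from the block body above, and the remaining constructor arguments are assumed to keep their defaults):

import torch

from otx.algo.action_classification.backbones.x3d import BlockX3D

# stride 1 with matching in/out channels, so the identity shortcut applies
block = BlockX3D(
    inplanes=24,
    planes=54,
    outplanes=24,
    spatial_stride=1,
    norm_cfg={"type": "BN3d", "requires_grad": True},
    act_cfg={"type": "ReLU", "inplace": True},
)
out = block(torch.randn(1, 24, 4, 16, 16))  # (N, C, T, H, W)
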
2 changes: 1 addition & 1 deletion src/otx/algo/action_classification/x3d.py
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
#
"""X3D model implementation."""

from __future__ import annotations

from typing import TYPE_CHECKING
@@ -63,7 +64,6 @@ def _build_model(self, num_classes: int) -> nn.Module:
gamma_b=2.25,
gamma_d=2.2,
gamma_w=1,
conv_cfg={"type": "Conv3d"},
norm_cfg={"type": "BN3d", "requires_grad": True},
act_cfg={"type": "ReLU", "inplace": True},
),
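
On the model side, the only change is dropping the `conv_cfg` entry. A sketch of the updated backbone construction, mirroring the keywords in the hunk above and assuming the other `X3DBackbone` arguments keep their defaults:

from otx.algo.action_classification.backbones.x3d import X3DBackbone

backbone = X3DBackbone(
    gamma_b=2.25,
    gamma_d=2.2,
    gamma_w=1,
    # conv_cfg={"type": "Conv3d"} is no longer passed; every conv is a Conv3dModule.
    norm_cfg={"type": "BN3d", "requires_grad": True},
    act_cfg={"type": "ReLU", "inplace": True},
)
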
21 changes: 11 additions & 10 deletions src/otx/algo/classification/backbones/efficientnet.py
@@ -1,7 +1,8 @@
# Copyright (C) 2023 Intel Corporation
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#
"""EfficientNet Module."""

from __future__ import annotations

import math
@@ -14,7 +15,7 @@
from torch.nn import functional, init

from otx.algo.modules.activation import build_activation_layer
from otx.algo.modules.conv_module import ConvModule
from otx.algo.modules.conv_module import Conv2dModule
from otx.algo.utils.mmengine_utils import load_checkpoint_to_model

PRETRAINED_ROOT = "https://github.com/osmr/imgclsmob/releases/download/v0.0.364/"
@@ -33,9 +34,9 @@ def conv1x1_block(
use_bn: bool = True,
bn_eps: float = 1e-5,
activation: str | None = "ReLU",
) -> ConvModule:
) -> Conv2dModule:
"""Conv block."""
return ConvModule(
return Conv2dModule(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
@@ -59,9 +60,9 @@ def conv3x3_block(
use_bn: bool = True,
bn_eps: float = 1e-5,
activation: str | None = "ReLU",
) -> ConvModule:
) -> Conv2dModule:
"""Conv block."""
return ConvModule(
return Conv2dModule(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
@@ -85,9 +86,9 @@ def dwconv3x3_block(
use_bn: bool = True,
bn_eps: float = 1e-5,
activation: str | None = "ReLU",
) -> ConvModule:
) -> Conv2dModule:
"""Conv block."""
return ConvModule(
return Conv2dModule(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
@@ -111,9 +112,9 @@ def dwconv5x5_block(
use_bn: bool = True,
bn_eps: float = 1e-5,
activation: str | None = "ReLU",
) -> ConvModule:
) -> Conv2dModule:
"""Conv block."""
return ConvModule(
return Conv2dModule(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=5,
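
The block helpers keep their signatures; only the return type changes to `Conv2dModule`. A hypothetical usage sketch, using just the parameters visible in the hunks above (the hidden ones are assumed to default sensibly):

import torch

from otx.algo.classification.backbones.efficientnet import dwconv3x3_block

# Builds a depthwise 3x3 Conv2dModule with BN and ReLU by default.
block = dwconv3x3_block(in_channels=32, out_channels=32)
y = block(torch.randn(1, 32, 56, 56))
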
17 changes: 5 additions & 12 deletions src/otx/algo/common/backbones/cspnext.py
@@ -14,8 +14,7 @@
from otx.algo.common.layers import SPPBottleneck
from otx.algo.detection.layers import CSPLayer
from otx.algo.modules.base_module import BaseModule
from otx.algo.modules.conv_module import ConvModule
from otx.algo.modules.depthwise_separable_conv_module import DepthwiseSeparableConvModule
from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule
from torch import Tensor, nn
from torch.nn.modules.batchnorm import _BatchNorm

@@ -44,8 +43,6 @@ class CSPNeXt(BaseModule):
layers. Defaults to (5, 9, 13).
channel_attention (bool): Whether to add channel attention in each
stage. Defaults to True.
conv_cfg (dict, optional): Config dict for
convolution layer. Defaults to None.
norm_cfg (dict): Dictionary to construct and
config norm layer. Defaults to dict(type='BN', requires_grad=True).
act_cfg (dict): Config dict for activation layer.
@@ -86,7 +83,6 @@ def __init__(
arch_ovewrite: dict | None = None,
spp_kernel_sizes: tuple[int, int, int] = (5, 9, 13),
channel_attention: bool = True,
conv_cfg: dict | None = None,
norm_cfg: dict | None = None,
act_cfg: dict | None = None,
norm_eval: bool = False,
@@ -121,9 +117,9 @@ def __init__(
self.frozen_stages = frozen_stages
self.use_depthwise = use_depthwise
self.norm_eval = norm_eval
conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
conv = DepthwiseSeparableConvModule if use_depthwise else Conv2dModule
self.stem = nn.Sequential(
ConvModule(
Conv2dModule(
3,
int(arch_setting[0][0] * widen_factor // 2),
3,
@@ -132,7 +128,7 @@
norm_cfg=norm_cfg,
act_cfg=act_cfg,
),
ConvModule(
Conv2dModule(
int(arch_setting[0][0] * widen_factor // 2),
int(arch_setting[0][0] * widen_factor // 2),
3,
@@ -141,7 +137,7 @@
norm_cfg=norm_cfg,
act_cfg=act_cfg,
),
ConvModule(
Conv2dModule(
int(arch_setting[0][0] * widen_factor // 2),
int(arch_setting[0][0] * widen_factor),
3,
@@ -164,7 +160,6 @@ def __init__(
3,
stride=2,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
)
@@ -174,7 +169,6 @@ def __init__(
out_channels,
out_channels,
kernel_sizes=spp_kernel_sizes,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
)
@@ -188,7 +182,6 @@ def __init__(
use_cspnext_block=True,
expand_ratio=expand_ratio,
channel_attention=channel_attention,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
)
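
With `DepthwiseSeparableConvModule` folded into `conv_module`, the depthwise/plain switch needs only one import. A sketch mirroring the stem logic above (channel counts and the flag are illustrative):

from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule

use_depthwise = True  # hypothetical flag, as in CSPNeXt.__init__
conv = DepthwiseSeparableConvModule if use_depthwise else Conv2dModule
stage_conv = conv(
    64,
    128,
    3,
    stride=2,
    padding=1,
    norm_cfg={"type": "BN", "requires_grad": True},
    act_cfg={"type": "ReLU", "inplace": True},
)
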