From 7f7f299198329afede0924583b9b4c84221f138f Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Wed, 7 Aug 2024 09:20:27 +0900 Subject: [PATCH] Refactoring base module (`ConvModule`) (#3783) * Split `ConvModule` to per dimension Module * Remove `conv_cfg` * Remove `build_conv_layer` * Move `DepthwiseSeparableConvModule` into `conv_module` * precommit * Remove `build_conv_layer` vestige * Remove assertion errors * Remove unused `efficient_conv_bn_eval` * Fix unit test * Remove `order` --- .../action_classification/backbones/x3d.py | 37 +-- src/otx/algo/action_classification/x3d.py | 2 +- .../classification/backbones/efficientnet.py | 21 +- src/otx/algo/common/backbones/cspnext.py | 17 +- src/otx/algo/common/backbones/resnet.py | 16 +- src/otx/algo/common/backbones/resnext.py | 10 +- src/otx/algo/common/layers/res_layer.py | 10 +- src/otx/algo/common/layers/spp_layer.py | 16 +- .../algo/detection/backbones/csp_darknet.py | 17 +- src/otx/algo/detection/backbones/presnet.py | 26 +- src/otx/algo/detection/heads/atss_head.py | 12 +- src/otx/algo/detection/heads/rtmdet_head.py | 13 +- src/otx/algo/detection/heads/yolox_head.py | 22 +- src/otx/algo/detection/layers/csp_layer.py | 64 ++-- src/otx/algo/detection/necks/cspnext_pafpn.py | 15 +- src/otx/algo/detection/necks/fpn.py | 14 +- .../algo/detection/necks/hybrid_encoder.py | 6 +- src/otx/algo/detection/necks/yolox_pafpn.py | 16 +- .../instance_segmentation/backbones/swin.py | 1 - .../heads/convfc_bbox_head.py | 2 - .../heads/fcn_mask_head.py | 11 +- .../instance_segmentation/heads/rpn_head.py | 4 +- .../heads/rtmdet_ins_head.py | 16 +- .../layers/transformer.py | 9 +- .../algo/instance_segmentation/necks/fpn.py | 11 +- src/otx/algo/modules/__init__.py | 8 +- src/otx/algo/modules/conv.py | 54 ---- src/otx/algo/modules/conv_module.py | 288 ++++++++---------- .../depthwise_separable_conv_module.py | 110 ------- src/otx/algo/modules/transformer.py | 10 +- .../algo/segmentation/backbones/litehrnet.py | 182 +++-------- .../algo/segmentation/heads/base_segm_head.py | 4 - src/otx/algo/segmentation/heads/fcn_head.py | 12 +- src/otx/algo/segmentation/heads/ham_head.py | 12 +- .../algo/segmentation/modules/aggregators.py | 17 +- src/otx/algo/segmentation/modules/blocks.py | 25 +- .../algo/detection/heads/test_yolox_head.py | 5 +- .../algo/detection/layers/test_csp_layer.py | 5 +- .../algo/detection/necks/test_yolox_pafpn.py | 2 +- tests/unit/algo/modules/test_conv.py | 33 -- tests/unit/algo/modules/test_conv_module.py | 194 ++++++------ .../test_depthwise_separable_conv_module.py | 104 ------- tests/unit/algo/modules/test_norm.py | 2 +- .../algo/segmentation/modules/test_blokcs.py | 4 - 44 files changed, 461 insertions(+), 998 deletions(-) delete mode 100644 src/otx/algo/modules/conv.py delete mode 100644 src/otx/algo/modules/depthwise_separable_conv_module.py delete mode 100644 tests/unit/algo/modules/test_conv.py delete mode 100644 tests/unit/algo/modules/test_depthwise_separable_conv_module.py diff --git a/src/otx/algo/action_classification/backbones/x3d.py b/src/otx/algo/action_classification/backbones/x3d.py index 11805b52679..7deef62a9f6 100644 --- a/src/otx/algo/action_classification/backbones/x3d.py +++ b/src/otx/algo/action_classification/backbones/x3d.py @@ -3,6 +3,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
"""X3D backbone implementation.""" + from __future__ import annotations import math @@ -12,7 +13,7 @@ from torch.nn.modules.batchnorm import _BatchNorm from otx.algo.modules.activation import Swish, build_activation_layer -from otx.algo.modules.conv_module import ConvModule +from otx.algo.modules.conv_module import Conv3dModule from otx.algo.utils.mmengine_utils import load_checkpoint from otx.algo.utils.weight_init import constant_init, kaiming_init @@ -70,8 +71,6 @@ class BlockX3D(nn.Module): unit. If set as None, it means not using SE unit. Default: None. use_swish (bool): Whether to use swish as the activation function before and after the 3x3x3 conv. Default: True. - conv_cfg (dict): Config dict for convolution layer. - Default: ``dict(type='Conv3d')``. norm_cfg (dict): Config for norm layers. required keys are ``type``, Default: ``dict(type='BN3d')``. act_cfg (dict): Config dict for activation layer. @@ -89,7 +88,6 @@ def __init__( downsample: nn.Module | None = None, se_ratio: float | None = None, use_swish: bool = True, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = None, with_cp: bool = False, @@ -103,25 +101,23 @@ def __init__( self.downsample = downsample self.se_ratio = se_ratio self.use_swish = use_swish - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.act_cfg_swish = Swish() self.with_cp = with_cp - self.conv1 = ConvModule( + self.conv1 = Conv3dModule( in_channels=inplanes, out_channels=planes, kernel_size=1, stride=1, padding=0, bias=False, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, ) # Here we use the channel-wise conv - self.conv2 = ConvModule( + self.conv2 = Conv3dModule( in_channels=planes, out_channels=planes, kernel_size=3, @@ -129,21 +125,19 @@ def __init__( padding=1, groups=planes, bias=False, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=None, ) self.swish = Swish() - self.conv3 = ConvModule( + self.conv3 = Conv3dModule( in_channels=planes, out_channels=outplanes, kernel_size=1, stride=1, padding=0, bias=False, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=None, ) @@ -201,8 +195,6 @@ class X3DBackbone(nn.Module): unit. If set as None, it means not using SE unit. Default: 1 / 16. use_swish (bool): Whether to use swish as the activation function before and after the 3x3x3 conv. Default: True. - conv_cfg (dict): Config for conv layers. required keys are ``type`` - Default: ``dict(type='Conv3d')``. norm_cfg (dict): Config for norm layers. required keys are ``type`` and ``requires_grad``. Default: ``dict(type='BN3d', requires_grad=True)``. 
@@ -231,7 +223,6 @@ def __init__( se_style: str = "half", se_ratio: float = 1 / 16, use_swish: bool = True, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = None, norm_eval: bool = False, @@ -275,7 +266,6 @@ def __init__( raise ValueError(msg) self.use_swish = use_swish - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.norm_eval = norm_eval @@ -304,7 +294,6 @@ def __init__( se_ratio=self.se_ratio, use_swish=self.use_swish, norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg, act_cfg=self.act_cfg, with_cp=with_cp, **kwargs, @@ -315,14 +304,13 @@ def __init__( self.res_layers.append(layer_name) self.feat_dim = self.base_channels * 2 ** (len(self.stage_blocks) - 1) - self.conv5 = ConvModule( + self.conv5 = Conv3dModule( self.feat_dim, int(self.feat_dim * self.gamma_b), kernel_size=1, stride=1, padding=0, bias=False, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, ) @@ -363,7 +351,6 @@ def make_res_layer( use_swish: bool = True, norm_cfg: dict | None = None, act_cfg: dict | None = None, - conv_cfg: dict | None = None, with_cp: bool = False, **kwargs, ) -> nn.Module: @@ -388,7 +375,6 @@ def make_res_layer( Default: None. use_swish (bool): Whether to use swish as the activation function before and after the 3x3x3 conv. Default: True. - conv_cfg (dict | None): Config for norm layers. Default: None. norm_cfg (dict | None): Config for norm layers. Default: None. act_cfg (dict | None): Config for activate layers. Default: None. with_cp (bool | None): Use checkpoint or not. Using checkpoint @@ -400,14 +386,13 @@ def make_res_layer( """ downsample = None if spatial_stride != 1 or layer_inplanes != inplanes: - downsample = ConvModule( + downsample = Conv3dModule( layer_inplanes, inplanes, kernel_size=1, stride=(1, spatial_stride, spatial_stride), padding=0, bias=False, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None, ) @@ -431,7 +416,6 @@ def make_res_layer( se_ratio=se_ratio if use_se[0] else None, use_swish=use_swish, norm_cfg=norm_cfg, - conv_cfg=conv_cfg, act_cfg=act_cfg, with_cp=with_cp, **kwargs, @@ -448,7 +432,6 @@ def make_res_layer( se_ratio=se_ratio if use_se[i] else None, use_swish=use_swish, norm_cfg=norm_cfg, - conv_cfg=conv_cfg, act_cfg=act_cfg, with_cp=with_cp, **kwargs, @@ -459,18 +442,17 @@ def make_res_layer( def _make_stem_layer(self) -> None: """Construct the stem layers consists of a conv+norm+act module and a pooling layer.""" - self.conv1_s = ConvModule( + self.conv1_s = Conv3dModule( self.in_channels, self.base_channels, kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), bias=False, - conv_cfg=self.conv_cfg, norm_cfg=None, act_cfg=None, ) - self.conv1_t = ConvModule( + self.conv1_t = Conv3dModule( self.base_channels, self.base_channels, kernel_size=(5, 1, 1), @@ -478,7 +460,6 @@ def _make_stem_layer(self) -> None: padding=(2, 0, 0), groups=self.base_channels, bias=False, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, ) diff --git a/src/otx/algo/action_classification/x3d.py b/src/otx/algo/action_classification/x3d.py index dbb6cb0f490..7f503dadfd4 100644 --- a/src/otx/algo/action_classification/x3d.py +++ b/src/otx/algo/action_classification/x3d.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # """X3D model implementation.""" + from __future__ import annotations from typing import TYPE_CHECKING @@ -63,7 +64,6 @@ def _build_model(self, num_classes: int) -> nn.Module: gamma_b=2.25, gamma_d=2.2, gamma_w=1, - conv_cfg={"type": "Conv3d"}, 
norm_cfg={"type": "BN3d", "requires_grad": True}, act_cfg={"type": "ReLU", "inplace": True}, ), diff --git a/src/otx/algo/classification/backbones/efficientnet.py b/src/otx/algo/classification/backbones/efficientnet.py index 9682dda3ce4..55646d434bd 100644 --- a/src/otx/algo/classification/backbones/efficientnet.py +++ b/src/otx/algo/classification/backbones/efficientnet.py @@ -1,7 +1,8 @@ -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # """EfficientNet Module.""" + from __future__ import annotations import math @@ -14,7 +15,7 @@ from torch.nn import functional, init from otx.algo.modules.activation import build_activation_layer -from otx.algo.modules.conv_module import ConvModule +from otx.algo.modules.conv_module import Conv2dModule from otx.algo.utils.mmengine_utils import load_checkpoint_to_model PRETRAINED_ROOT = "https://github.com/osmr/imgclsmob/releases/download/v0.0.364/" @@ -33,9 +34,9 @@ def conv1x1_block( use_bn: bool = True, bn_eps: float = 1e-5, activation: str | None = "ReLU", -) -> ConvModule: +) -> Conv2dModule: """Conv block.""" - return ConvModule( + return Conv2dModule( in_channels=in_channels, out_channels=out_channels, kernel_size=1, @@ -59,9 +60,9 @@ def conv3x3_block( use_bn: bool = True, bn_eps: float = 1e-5, activation: str | None = "ReLU", -) -> ConvModule: +) -> Conv2dModule: """Conv block.""" - return ConvModule( + return Conv2dModule( in_channels=in_channels, out_channels=out_channels, kernel_size=3, @@ -85,9 +86,9 @@ def dwconv3x3_block( use_bn: bool = True, bn_eps: float = 1e-5, activation: str | None = "ReLU", -) -> ConvModule: +) -> Conv2dModule: """Conv block.""" - return ConvModule( + return Conv2dModule( in_channels=in_channels, out_channels=out_channels, kernel_size=3, @@ -111,9 +112,9 @@ def dwconv5x5_block( use_bn: bool = True, bn_eps: float = 1e-5, activation: str | None = "ReLU", -) -> ConvModule: +) -> Conv2dModule: """Conv block.""" - return ConvModule( + return Conv2dModule( in_channels=in_channels, out_channels=out_channels, kernel_size=5, diff --git a/src/otx/algo/common/backbones/cspnext.py b/src/otx/algo/common/backbones/cspnext.py index c6347e61554..2b98783c6f7 100644 --- a/src/otx/algo/common/backbones/cspnext.py +++ b/src/otx/algo/common/backbones/cspnext.py @@ -14,8 +14,7 @@ from otx.algo.common.layers import SPPBottleneck from otx.algo.detection.layers import CSPLayer from otx.algo.modules.base_module import BaseModule -from otx.algo.modules.conv_module import ConvModule -from otx.algo.modules.depthwise_separable_conv_module import DepthwiseSeparableConvModule +from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule from torch import Tensor, nn from torch.nn.modules.batchnorm import _BatchNorm @@ -44,8 +43,6 @@ class CSPNeXt(BaseModule): layers. Defaults to (5, 9, 13). channel_attention (bool): Whether to add channel attention in each stage. Defaults to True. - conv_cfg (dict, optional): Config dict for - convolution layer. Defaults to None. norm_cfg (dict): Dictionary to construct and config norm layer. Defaults to dict(type='BN', requires_grad=True). act_cfg (dict): Config dict for activation layer. 
@@ -86,7 +83,6 @@ def __init__( arch_ovewrite: dict | None = None, spp_kernel_sizes: tuple[int, int, int] = (5, 9, 13), channel_attention: bool = True, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = None, norm_eval: bool = False, @@ -121,9 +117,9 @@ def __init__( self.frozen_stages = frozen_stages self.use_depthwise = use_depthwise self.norm_eval = norm_eval - conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + conv = DepthwiseSeparableConvModule if use_depthwise else Conv2dModule self.stem = nn.Sequential( - ConvModule( + Conv2dModule( 3, int(arch_setting[0][0] * widen_factor // 2), 3, @@ -132,7 +128,7 @@ def __init__( norm_cfg=norm_cfg, act_cfg=act_cfg, ), - ConvModule( + Conv2dModule( int(arch_setting[0][0] * widen_factor // 2), int(arch_setting[0][0] * widen_factor // 2), 3, @@ -141,7 +137,7 @@ def __init__( norm_cfg=norm_cfg, act_cfg=act_cfg, ), - ConvModule( + Conv2dModule( int(arch_setting[0][0] * widen_factor // 2), int(arch_setting[0][0] * widen_factor), 3, @@ -164,7 +160,6 @@ def __init__( 3, stride=2, padding=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ) @@ -174,7 +169,6 @@ def __init__( out_channels, out_channels, kernel_sizes=spp_kernel_sizes, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ) @@ -188,7 +182,6 @@ def __init__( use_cspnext_block=True, expand_ratio=expand_ratio, channel_attention=channel_attention, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ) diff --git a/src/otx/algo/common/backbones/resnet.py b/src/otx/algo/common/backbones/resnet.py index eb6d20c18f0..6e993a71b0c 100644 --- a/src/otx/algo/common/backbones/resnet.py +++ b/src/otx/algo/common/backbones/resnet.py @@ -15,7 +15,6 @@ import torch.utils.checkpoint as cp from otx.algo.common.layers import ResLayer from otx.algo.modules.base_module import BaseModule -from otx.algo.modules.conv import build_conv_layer from otx.algo.modules.norm import build_norm_layer from torch import nn from torch.nn.modules.batchnorm import _BatchNorm @@ -35,7 +34,6 @@ def __init__( dilation: int = 1, downsample: nn.Module | None = None, with_cp: bool = False, - conv_cfg: dict | None = None, init_cfg: dict | None = None, ): """Bottleneck block for ResNet. 
@@ -50,7 +48,6 @@ def __init__( self.stride = stride self.dilation = dilation self.with_cp = with_cp - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.conv1_stride = 1 @@ -60,11 +57,10 @@ def __init__( self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) self.norm3_name, norm3 = build_norm_layer(norm_cfg, planes * self.expansion, postfix=3) - self.conv1 = build_conv_layer(conv_cfg, inplanes, planes, kernel_size=1, stride=self.conv1_stride, bias=False) + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=self.conv1_stride, bias=False) self.add_module(self.norm1_name, norm1) - self.conv2 = build_conv_layer( - conv_cfg, + self.conv2 = nn.Conv2d( planes, planes, kernel_size=3, @@ -75,7 +71,7 @@ def __init__( ) self.add_module(self.norm2_name, norm2) - self.conv3 = build_conv_layer(conv_cfg, planes, planes * self.expansion, kernel_size=1, bias=False) + self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False) self.add_module(self.norm3_name, norm3) self.relu = nn.ReLU(inplace=True) @@ -183,7 +179,6 @@ def __init__( out_indices: tuple[int, int, int, int] = (0, 1, 2, 3), avg_down: bool = False, frozen_stages: int = -1, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, norm_eval: bool = True, with_cp: bool = False, @@ -240,7 +235,6 @@ def __init__( raise ValueError(msg) self.avg_down = avg_down self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.with_cp = with_cp self.norm_eval = norm_eval @@ -264,7 +258,6 @@ def __init__( dilation=dilation, avg_down=self.avg_down, with_cp=with_cp, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, init_cfg=block_init_cfg, ) @@ -287,8 +280,7 @@ def norm1(self) -> nn.Module: return getattr(self, self.norm1_name) def _make_stem_layer(self, in_channels: int, stem_channels: int) -> None: - self.conv1 = build_conv_layer( - self.conv_cfg, + self.conv1 = nn.Conv2d( in_channels, stem_channels, kernel_size=7, diff --git a/src/otx/algo/common/backbones/resnext.py b/src/otx/algo/common/backbones/resnext.py index 339e00bed78..2ec6dfc20c5 100644 --- a/src/otx/algo/common/backbones/resnext.py +++ b/src/otx/algo/common/backbones/resnext.py @@ -12,8 +12,8 @@ from typing import ClassVar from otx.algo.common.layers import ResLayer -from otx.algo.modules.conv import build_conv_layer from otx.algo.modules.norm import build_norm_layer +from torch import nn from .resnet import Bottleneck as _Bottleneck from .resnet import ResNet @@ -46,8 +46,7 @@ def __init__( self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, width, postfix=2) self.norm3_name, norm3 = build_norm_layer(self.norm_cfg, self.planes * self.expansion, postfix=3) - self.conv1 = build_conv_layer( - self.conv_cfg, + self.conv1 = nn.Conv2d( self.inplanes, width, kernel_size=1, @@ -55,8 +54,7 @@ def __init__( bias=False, ) self.add_module(self.norm1_name, norm1) - self.conv2 = build_conv_layer( - self.conv_cfg, + self.conv2 = nn.Conv2d( width, width, kernel_size=3, @@ -67,7 +65,7 @@ def __init__( bias=False, ) self.add_module(self.norm2_name, norm2) - self.conv3 = build_conv_layer(self.conv_cfg, width, self.planes * self.expansion, kernel_size=1, bias=False) + self.conv3 = nn.Conv2d(width, self.planes * self.expansion, kernel_size=1, bias=False) self.add_module(self.norm3_name, norm3) def _del_block_plugins(self, plugin_names: list[str]) -> None: diff --git a/src/otx/algo/common/layers/res_layer.py b/src/otx/algo/common/layers/res_layer.py index f37ed1ee439..aeae090b304 100644 --- a/src/otx/algo/common/layers/res_layer.py 
+++ b/src/otx/algo/common/layers/res_layer.py @@ -9,7 +9,6 @@ from __future__ import annotations from otx.algo.modules.base_module import BaseModule, Sequential -from otx.algo.modules.conv import build_conv_layer from otx.algo.modules.norm import build_norm_layer from torch import nn @@ -25,8 +24,6 @@ class ResLayer(Sequential): stride (int): stride of the first block. Defaults to 1 avg_down (bool): Use AvgPool instead of stride conv when downsampling in the bottleneck. Defaults to False - conv_cfg (dict): dictionary to construct and config conv layer. - Defaults to None norm_cfg (dict): dictionary to construct and config norm layer. Defaults to dict(type='BN') downsample_first (bool): Downsample at the first block or last block. @@ -42,7 +39,6 @@ def __init__( norm_cfg: dict, stride: int = 1, avg_down: bool = False, - conv_cfg: dict | None = None, downsample_first: bool = True, **kwargs, ) -> None: @@ -64,8 +60,7 @@ def __init__( ) downsample.extend( [ - build_conv_layer( - conv_cfg, + nn.Conv2d( inplanes, planes * block.expansion, kernel_size=1, @@ -85,7 +80,6 @@ def __init__( planes=planes, stride=stride, downsample=downsample, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, **kwargs, ), @@ -93,7 +87,7 @@ def __init__( inplanes = planes * block.expansion layers.extend( [ - block(inplanes=inplanes, planes=planes, stride=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, **kwargs) + block(inplanes=inplanes, planes=planes, stride=1, norm_cfg=norm_cfg, **kwargs) for _ in range(1, num_blocks) ], ) diff --git a/src/otx/algo/common/layers/spp_layer.py b/src/otx/algo/common/layers/spp_layer.py index 0ceb253ec9d..d314bacea9d 100644 --- a/src/otx/algo/common/layers/spp_layer.py +++ b/src/otx/algo/common/layers/spp_layer.py @@ -10,7 +10,7 @@ import torch from otx.algo.modules.base_module import BaseModule -from otx.algo.modules.conv_module import ConvModule +from otx.algo.modules.conv_module import Conv2dModule from torch import Tensor, nn @@ -22,8 +22,6 @@ class SPPBottleneck(BaseModule): out_channels (int): The output channels of this Module. kernel_sizes (tuple[int]): Sequential of kernel sizes of pooling layers. Default: (5, 9, 13). - conv_cfg (dict): Config dict for convolution layer. Default: None, - which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN'). act_cfg (dict): Config dict for activation layer. @@ -37,7 +35,6 @@ def __init__( in_channels: int, out_channels: int, kernel_sizes: tuple[int, ...] 
= (5, 9, 13), - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = None, init_cfg: dict | list[dict] | None = None, @@ -47,18 +44,23 @@ def __init__( act_cfg = act_cfg or {"type": "Swish"} mid_channels = in_channels // 2 - self.conv1 = ConvModule( + self.conv1 = Conv2dModule( in_channels, mid_channels, 1, stride=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ) self.poolings = nn.ModuleList([nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) for ks in kernel_sizes]) conv2_channels = mid_channels * (len(kernel_sizes) + 1) - self.conv2 = ConvModule(conv2_channels, out_channels, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) + self.conv2 = Conv2dModule( + conv2_channels, + out_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + ) def forward(self, x: Tensor) -> Tensor: """Forward.""" diff --git a/src/otx/algo/detection/backbones/csp_darknet.py b/src/otx/algo/detection/backbones/csp_darknet.py index 1ca142aec08..6e92b995b06 100644 --- a/src/otx/algo/detection/backbones/csp_darknet.py +++ b/src/otx/algo/detection/backbones/csp_darknet.py @@ -18,8 +18,7 @@ from otx.algo.common.layers import SPPBottleneck from otx.algo.detection.layers import CSPLayer from otx.algo.modules.base_module import BaseModule -from otx.algo.modules.conv_module import ConvModule -from otx.algo.modules.depthwise_separable_conv_module import DepthwiseSeparableConvModule +from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule class Focus(nn.Module): @@ -30,8 +29,6 @@ class Focus(nn.Module): out_channels (int): The output channels of this Module. kernel_size (int): The kernel size of the convolution. Default: 1 stride (int): The stride of the convolution. Default: 1 - conv_cfg (dict): Config dict for convolution layer. Default: None, - which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN', momentum=0.03, eps=0.001). act_cfg (dict): Config dict for activation layer. @@ -44,20 +41,18 @@ def __init__( out_channels: int, kernel_size: int = 1, stride: int = 1, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = None, ): super().__init__() norm_cfg = norm_cfg or {"type": "BN", "momentum": 0.03, "eps": 0.001} act_cfg = act_cfg or {"type": "Swish"} - self.conv = ConvModule( + self.conv = Conv2dModule( in_channels * 4, out_channels, kernel_size, stride, padding=(kernel_size - 1) // 2, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ) @@ -113,7 +108,6 @@ class CSPDarknet(BaseModule): arch_ovewrite(list): Overwrite default arch settings. Default: None. spp_kernal_sizes: (tuple[int]): Sequential of kernel sizes of SPP layers. Default: (5, 9, 13). - conv_cfg (dict): Config dict for convolution layer. Default: None. norm_cfg (dict): Dictionary to construct and config norm layer. Default: dict(type='BN', requires_grad=True). act_cfg (dict): Config dict for activation layer. @@ -153,7 +147,6 @@ def __init__( use_depthwise: bool = False, arch_ovewrite: list | None = None, spp_kernal_sizes: tuple[int, ...] 
= (5, 9, 13), - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = None, norm_eval: bool = False, @@ -183,13 +176,12 @@ def __init__( self.frozen_stages = frozen_stages self.use_depthwise = use_depthwise self.norm_eval = norm_eval - conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + conv = DepthwiseSeparableConvModule if use_depthwise else Conv2dModule self.stem = Focus( 3, int(arch_setting[0][0] * widen_factor), kernel_size=3, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ) @@ -206,7 +198,6 @@ def __init__( 3, stride=2, padding=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ) @@ -216,7 +207,6 @@ def __init__( out_channels, out_channels, kernel_sizes=spp_kernal_sizes, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ) @@ -227,7 +217,6 @@ def __init__( num_blocks=num_blocks, add_identity=add_identity, use_depthwise=use_depthwise, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ) diff --git a/src/otx/algo/detection/backbones/presnet.py b/src/otx/algo/detection/backbones/presnet.py index e3c96d43d63..b31f7f95c3a 100644 --- a/src/otx/algo/detection/backbones/presnet.py +++ b/src/otx/algo/detection/backbones/presnet.py @@ -11,8 +11,9 @@ import torch from torch import nn -from otx.algo.modules import ConvModule, build_activation_layer +from otx.algo.modules import build_activation_layer from otx.algo.modules.base_module import BaseModule +from otx.algo.modules.conv_module import Conv2dModule __all__ = ["PResNet"] @@ -42,15 +43,15 @@ def __init__( OrderedDict( [ ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)), - ("conv", ConvModule(ch_in, ch_out, 1, 1, act_cfg=None, norm_cfg=norm_cfg)), + ("conv", Conv2dModule(ch_in, ch_out, 1, 1, act_cfg=None, norm_cfg=norm_cfg)), ], ), ) else: - self.short = ConvModule(ch_in, ch_out, 1, stride, act_cfg=None, norm_cfg=norm_cfg) + self.short = Conv2dModule(ch_in, ch_out, 1, stride, act_cfg=None, norm_cfg=norm_cfg) - self.branch2a = ConvModule(ch_in, ch_out, 3, stride, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg) - self.branch2b = ConvModule(ch_out, ch_out, 3, 1, padding=1, act_cfg=None, norm_cfg=norm_cfg) + self.branch2a = Conv2dModule(ch_in, ch_out, 3, stride, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg) + self.branch2b = Conv2dModule(ch_out, ch_out, 3, 1, padding=1, act_cfg=None, norm_cfg=norm_cfg) self.act = nn.Identity() if act_cfg is None else build_activation_layer(act_cfg) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -88,9 +89,9 @@ def __init__( width = ch_out - self.branch2a = ConvModule(ch_in, width, 1, stride1, act_cfg=act_cfg, norm_cfg=norm_cfg) - self.branch2b = ConvModule(width, width, 3, stride2, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg) - self.branch2c = ConvModule(width, ch_out * self.expansion, 1, 1, act_cfg=None, norm_cfg=norm_cfg) + self.branch2a = Conv2dModule(ch_in, width, 1, stride1, act_cfg=act_cfg, norm_cfg=norm_cfg) + self.branch2b = Conv2dModule(width, width, 3, stride2, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg) + self.branch2c = Conv2dModule(width, ch_out * self.expansion, 1, 1, act_cfg=None, norm_cfg=norm_cfg) self.shortcut = shortcut if not shortcut: @@ -99,12 +100,15 @@ def __init__( OrderedDict( [ ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)), - ("conv", ConvModule(ch_in, ch_out * self.expansion, 1, 1, act_cfg=None, norm_cfg=norm_cfg)), + ( + "conv", + Conv2dModule(ch_in, ch_out * self.expansion, 1, 1, act_cfg=None, norm_cfg=norm_cfg), + ), ], ), ) else: - self.short = ConvModule(ch_in, ch_out * self.expansion, 1, 
stride, act_cfg=None, norm_cfg=norm_cfg) + self.short = Conv2dModule(ch_in, ch_out * self.expansion, 1, stride, act_cfg=None, norm_cfg=norm_cfg) self.act = nn.Identity() if act_cfg is None else build_activation_layer(act_cfg) @@ -216,7 +220,7 @@ def __init__( self.conv1 = nn.Sequential( OrderedDict( [ - (_name, ConvModule(c_in, c_out, k, s, padding=(k - 1) // 2, act_cfg=act_cfg, norm_cfg=norm_cfg)) + (_name, Conv2dModule(c_in, c_out, k, s, padding=(k - 1) // 2, act_cfg=act_cfg, norm_cfg=norm_cfg)) for c_in, c_out, k, s, _name in conv_def ], ), diff --git a/src/otx/algo/detection/heads/atss_head.py b/src/otx/algo/detection/heads/atss_head.py index 20d8ebd7de0..9d85dbf0b77 100644 --- a/src/otx/algo/detection/heads/atss_head.py +++ b/src/otx/algo/detection/heads/atss_head.py @@ -20,7 +20,7 @@ ) from otx.algo.detection.utils.prior_generators.utils import anchor_inside_flags from otx.algo.detection.utils.utils import unmap -from otx.algo.modules.conv_module import ConvModule +from otx.algo.modules.conv_module import Conv2dModule from otx.algo.modules.scale import Scale from otx.algo.utils.mmengine_utils import InstanceData @@ -39,8 +39,6 @@ class ATSSHead(ClassIncrementalMixin, AnchorHead): in_channels (int): Number of channels in the input feature map. pred_kernel_size (int): Kernel size of ``nn.Conv2d``. Defaults to 3. stacked_convs (int): Number of stacking convs of the head. Defaults to 4. - conv_cfg (dict, optional): Config dict for convolution layer. - Defaults to None. norm_cfg (dict): Config dict for normalization layer. Defaults to ``dict(type='GN', num_groups=32, requires_grad=True)``. reg_decoded_bbox (bool): If true, the regression loss would be @@ -58,7 +56,6 @@ def __init__( in_channels: int, pred_kernel_size: int = 3, stacked_convs: int = 4, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, reg_decoded_bbox: bool = True, loss_centerness: nn.Module | None = None, @@ -70,7 +67,6 @@ def __init__( ) -> None: self.pred_kernel_size = pred_kernel_size self.stacked_convs = stacked_convs - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg or {"type": "GN", "num_groups": 32, "requires_grad": True} init_cfg = init_cfg or { "type": "Normal", @@ -111,24 +107,22 @@ def _init_layers(self) -> None: for i in range(self.stacked_convs): chn = self.in_channels if i == 0 else self.feat_channels self.cls_convs.append( - ConvModule( + Conv2dModule( chn, self.feat_channels, 3, stride=1, padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, ), ) self.reg_convs.append( - ConvModule( + Conv2dModule( chn, self.feat_channels, 3, stride=1, padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, ), ) diff --git a/src/otx/algo/detection/heads/rtmdet_head.py b/src/otx/algo/detection/heads/rtmdet_head.py index 12573eebd96..429c03cbe05 100644 --- a/src/otx/algo/detection/heads/rtmdet_head.py +++ b/src/otx/algo/detection/heads/rtmdet_head.py @@ -20,8 +20,7 @@ sigmoid_geometric_mean, unmap, ) -from otx.algo.modules.conv_module import ConvModule -from otx.algo.modules.depthwise_separable_conv_module import DepthwiseSeparableConvModule +from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule from otx.algo.modules.norm import is_norm from otx.algo.modules.scale import Scale from otx.algo.utils.mmengine_utils import InstanceData @@ -61,25 +60,23 @@ def _init_layers(self) -> None: for i in range(self.stacked_convs): chn = self.in_channels if i == 0 else self.feat_channels self.cls_convs.append( - ConvModule( + Conv2dModule( chn, self.feat_channels, 3, stride=1, 
padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, ), ) self.reg_convs.append( - ConvModule( + Conv2dModule( chn, self.feat_channels, 3, stride=1, padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, ), @@ -680,7 +677,7 @@ def __init__( def _init_layers(self) -> None: """Initialize layers of the head.""" - conv = DepthwiseSeparableConvModule if self.use_depthwise else ConvModule + conv = DepthwiseSeparableConvModule if self.use_depthwise else Conv2dModule self.cls_convs = nn.ModuleList() self.reg_convs = nn.ModuleList() @@ -700,7 +697,6 @@ def _init_layers(self) -> None: 3, stride=1, padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, ), @@ -712,7 +708,6 @@ def _init_layers(self) -> None: 3, stride=1, padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, ), diff --git a/src/otx/algo/detection/heads/yolox_head.py b/src/otx/algo/detection/heads/yolox_head.py index 1e4c6ab0015..7f8a12fbef2 100644 --- a/src/otx/algo/detection/heads/yolox_head.py +++ b/src/otx/algo/detection/heads/yolox_head.py @@ -8,6 +8,7 @@ from __future__ import annotations +import logging import math from typing import Sequence @@ -23,10 +24,11 @@ from otx.algo.common.utils.utils import multi_apply, reduce_mean from otx.algo.detection.heads.base_head import BaseDenseHead from otx.algo.detection.losses import IoULoss -from otx.algo.modules.conv_module import ConvModule -from otx.algo.modules.depthwise_separable_conv_module import DepthwiseSeparableConvModule +from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule from otx.algo.utils.mmengine_utils import InstanceData +logger = logging.getLogger() + class YOLOXHead(BaseDenseHead): """YOLOXHead head used in `YOLOX `_. @@ -47,8 +49,6 @@ class YOLOXHead(BaseDenseHead): conv_bias (bool or str): If specified as `auto`, it will be decided by the norm_cfg. Bias of conv will be set as True if `norm_cfg` is None, otherwise False. Defaults to "auto". - conv_cfg (dict, optional): Config dict for convolution layer. - Defaults to None. norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). act_cfg (dict): Config dict for activation layer. 
@@ -75,7 +75,6 @@ def __init__( use_depthwise: bool = False, dcn_on_last_conv: bool = False, conv_bias: bool | str = "auto", - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = None, loss_cls: nn.Module | None = None, @@ -118,7 +117,6 @@ def __init__( self.conv_bias = conv_bias self.use_sigmoid_cls = True - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg @@ -158,11 +156,18 @@ def _init_layers(self) -> None: def _build_stacked_convs(self) -> nn.Sequential: """Initialize conv layers of a single level head.""" - conv = DepthwiseSeparableConvModule if self.use_depthwise else ConvModule + conv = DepthwiseSeparableConvModule if self.use_depthwise else Conv2dModule stacked_convs = [] for i in range(self.stacked_convs): chn = self.in_channels if i == 0 else self.feat_channels - conv_cfg = {"type": "DCNv2"} if self.dcn_on_last_conv and i == self.stacked_convs - 1 else self.conv_cfg + # TODO (sungchul): enable deformable convolution implemented in mmcv + # conv_cfg = {"type": "DCNv2"} if self.dcn_on_last_conv and i == self.stacked_convs - 1 else self.conv_cfg + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + logger.warning( + f"stacked convs[{i}] : Deformable convolution is not supported in YOLOXHead, " + "use normal convolution instead.", + ) + stacked_convs.append( conv( chn, @@ -170,7 +175,6 @@ def _build_stacked_convs(self) -> nn.Sequential: 3, stride=1, padding=1, - conv_cfg=conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, bias=self.conv_bias, diff --git a/src/otx/algo/detection/layers/csp_layer.py b/src/otx/algo/detection/layers/csp_layer.py index 175584d6074..4cb0d10b57a 100644 --- a/src/otx/algo/detection/layers/csp_layer.py +++ b/src/otx/algo/detection/layers/csp_layer.py @@ -11,8 +11,7 @@ from otx.algo.detection.layers import ChannelAttention from otx.algo.modules import build_activation_layer from otx.algo.modules.base_module import BaseModule -from otx.algo.modules.conv_module import ConvModule -from otx.algo.modules.depthwise_separable_conv_module import DepthwiseSeparableConvModule +from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule class DarknetBottleneck(BaseModule): @@ -32,8 +31,6 @@ class DarknetBottleneck(BaseModule): Defaults to True. use_depthwise (bool): Whether to use depthwise separable convolution. Defaults to False. - conv_cfg (dict): Config dict for convolution layer. Defaults to None, - which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN'). act_cfg (dict): Config dict for activation layer. @@ -47,7 +44,6 @@ def __init__( expansion: float = 0.5, add_identity: bool = True, use_depthwise: bool = False, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = None, init_cfg: dict | list[dict] | None = None, @@ -61,15 +57,20 @@ def __init__( super().__init__(init_cfg=init_cfg) hidden_channels = int(out_channels * expansion) - conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule - self.conv1 = ConvModule(in_channels, hidden_channels, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) + conv = DepthwiseSeparableConvModule if use_depthwise else Conv2dModule + self.conv1 = Conv2dModule( + in_channels, + hidden_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + ) self.conv2 = conv( hidden_channels, out_channels, 3, stride=1, padding=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ) @@ -99,8 +100,6 @@ class CSPNeXtBlock(BaseModule): Defaults to False. 
kernel_size (int): The kernel size of the second convolution layer. Defaults to 5. - conv_cfg (dict): Config dict for convolution layer. Defaults to None, - which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). act_cfg (dict): Config dict for activation layer. @@ -117,7 +116,6 @@ def __init__( add_identity: bool = True, use_depthwise: bool = False, kernel_size: int = 5, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = None, init_cfg: dict | list[dict] | None = None, @@ -131,7 +129,7 @@ def __init__( super().__init__(init_cfg=init_cfg) hidden_channels = int(out_channels * expansion) - conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + conv = DepthwiseSeparableConvModule if use_depthwise else Conv2dModule self.conv1 = conv(in_channels, hidden_channels, 3, stride=1, padding=1, norm_cfg=norm_cfg, act_cfg=act_cfg) self.conv2 = DepthwiseSeparableConvModule( hidden_channels, @@ -139,7 +137,6 @@ def __init__( kernel_size, stride=1, padding=kernel_size // 2, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ) @@ -177,8 +174,8 @@ def __init__( super().__init__() self.ch_in = ch_in self.ch_out = ch_out - self.conv1 = ConvModule(ch_in, ch_out, 3, 1, padding=1, act_cfg=None, norm_cfg=norm_cfg) - self.conv2 = ConvModule(ch_in, ch_out, 1, 1, act_cfg=None, norm_cfg=norm_cfg) + self.conv1 = Conv2dModule(ch_in, ch_out, 3, 1, padding=1, act_cfg=None, norm_cfg=norm_cfg) + self.conv2 = Conv2dModule(ch_in, ch_out, 1, 1, act_cfg=None, norm_cfg=norm_cfg) self.act = nn.Identity() if act_cfg is None else build_activation_layer(act_cfg) def forward(self, x: Tensor) -> Tensor: @@ -199,7 +196,7 @@ def _pad_1x1_to_3x3_tensor(self, kernel1x1: Tensor | None) -> Tensor: return 0 return nn.functional.pad(kernel1x1, [1, 1, 1, 1]) - def _fuse_bn_tensor(self, branch: ConvModule) -> tuple[float, float]: + def _fuse_bn_tensor(self, branch: Conv2dModule) -> tuple[float, float]: """Fuse the BN layer to the convolution layer.""" if branch is None or branch.norm_layer is None: return 0, 0 @@ -231,8 +228,6 @@ class CSPLayer(BaseModule): blocks. Defaults to False. channel_attention (bool): Whether to add channel attention in each stage. Defaults to True. - conv_cfg (dict, optional): Config dict for convolution layer. - Defaults to None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN') act_cfg (dict): Config dict for activation layer. 
@@ -251,7 +246,6 @@ def __init__( use_depthwise: bool = False, use_cspnext_block: bool = False, channel_attention: bool = False, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = None, init_cfg: dict | list[dict] | None = None, @@ -267,20 +261,24 @@ def __init__( block = CSPNeXtBlock if use_cspnext_block else DarknetBottleneck mid_channels = int(out_channels * expand_ratio) self.channel_attention = channel_attention - self.main_conv = ConvModule(in_channels, mid_channels, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) - self.short_conv = ConvModule( + self.main_conv = Conv2dModule( in_channels, mid_channels, 1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ) - self.final_conv = ConvModule( + self.short_conv = Conv2dModule( + in_channels, + mid_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + ) + self.final_conv = Conv2dModule( 2 * mid_channels, out_channels, 1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ) @@ -293,7 +291,6 @@ def __init__( 1.0, add_identity, use_depthwise, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ) @@ -329,8 +326,7 @@ class CSPRepLayer(nn.Module): bias (bool): Whether to use bias in the convolution layer. Defaults to False. act_cfg (dict[str, str] | None): Config dict for activation layer. - Defaults to None, which means using the activation config in - conv_cfg. + Defaults to None. norm_cfg (dict[str, str] | None): Config dict for normalization layer. Defaults to None. """ @@ -348,8 +344,8 @@ def __init__( """Initialize CSPRepLayer.""" super().__init__() hidden_channels = int(out_channels * expansion) - self.conv1 = ConvModule(in_channels, hidden_channels, 1, 1, bias=bias, act_cfg=act_cfg, norm_cfg=norm_cfg) - self.conv2 = ConvModule(in_channels, hidden_channels, 1, 1, bias=bias, act_cfg=act_cfg, norm_cfg=norm_cfg) + self.conv1 = Conv2dModule(in_channels, hidden_channels, 1, 1, bias=bias, act_cfg=act_cfg, norm_cfg=norm_cfg) + self.conv2 = Conv2dModule(in_channels, hidden_channels, 1, 1, bias=bias, act_cfg=act_cfg, norm_cfg=norm_cfg) self.bottlenecks = nn.Sequential( *[ RepVggBlock(hidden_channels, hidden_channels, act_cfg=act_cfg, norm_cfg=norm_cfg) @@ -357,7 +353,15 @@ def __init__( ], ) if hidden_channels != out_channels: - self.conv3 = ConvModule(hidden_channels, out_channels, 1, 1, bias=bias, act_cfg=act_cfg, norm_cfg=norm_cfg) + self.conv3 = Conv2dModule( + hidden_channels, + out_channels, + 1, + 1, + bias=bias, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ) else: self.conv3 = nn.Identity() diff --git a/src/otx/algo/detection/necks/cspnext_pafpn.py b/src/otx/algo/detection/necks/cspnext_pafpn.py index 43d9dce233c..4b10101557d 100644 --- a/src/otx/algo/detection/necks/cspnext_pafpn.py +++ b/src/otx/algo/detection/necks/cspnext_pafpn.py @@ -17,8 +17,7 @@ from otx.algo.detection.layers import CSPLayer from otx.algo.modules.base_module import BaseModule -from otx.algo.modules.conv_module import ConvModule -from otx.algo.modules.depthwise_separable_conv_module import DepthwiseSeparableConvModule +from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule class CSPNeXtPAFPN(BaseModule): @@ -31,7 +30,6 @@ class CSPNeXtPAFPN(BaseModule): use_depthwise (bool): Whether to use depthwise separable convolution in blocks. Defaults to False. expand_ratio (float): Ratio to adjust the number of channels of the hidden layer. Default: 0.5 upsample_cfg (dict): Config dict for interpolate layer. 
Default: `dict(scale_factor=2, mode='nearest')` - conv_cfg (dict, optional): Config dict for convolution layer. Default: None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN') act_cfg (dict): Config dict for activation layer. Default: dict(type='Swish') init_cfg (dict or list[dict], optional): Initialization config dict. Default: None. @@ -45,7 +43,6 @@ def __init__( use_depthwise: bool = False, expand_ratio: float = 0.5, upsample_cfg: dict | None = None, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = None, init_cfg: dict | None = None, @@ -66,7 +63,7 @@ def __init__( self.in_channels = in_channels self.out_channels = out_channels - conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + conv = DepthwiseSeparableConvModule if use_depthwise else Conv2dModule # build top-down blocks self.upsample = nn.Upsample(**upsample_cfg) @@ -74,11 +71,10 @@ def __init__( self.top_down_blocks = nn.ModuleList() for idx in range(len(in_channels) - 1, 0, -1): self.reduce_layers.append( - ConvModule( + Conv2dModule( in_channels[idx], in_channels[idx - 1], 1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ), @@ -92,7 +88,6 @@ def __init__( use_depthwise=use_depthwise, use_cspnext_block=True, expand_ratio=expand_ratio, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ), @@ -109,7 +104,6 @@ def __init__( 3, stride=2, padding=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ), @@ -123,7 +117,6 @@ def __init__( use_depthwise=use_depthwise, use_cspnext_block=True, expand_ratio=expand_ratio, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ), @@ -132,7 +125,7 @@ def __init__( self.out_convs = nn.ModuleList() for i in range(len(in_channels)): self.out_convs.append( - conv(in_channels[i], out_channels, 3, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg), + conv(in_channels[i], out_channels, 3, padding=1, norm_cfg=norm_cfg, act_cfg=act_cfg), ) def forward(self, inputs: tuple[Tensor, ...]) -> tuple[Tensor, ...]: diff --git a/src/otx/algo/detection/necks/fpn.py b/src/otx/algo/detection/necks/fpn.py index 8ff11c939e0..1d6c32355e0 100644 --- a/src/otx/algo/detection/necks/fpn.py +++ b/src/otx/algo/detection/necks/fpn.py @@ -13,7 +13,7 @@ from torch import Tensor, nn from otx.algo.modules.base_module import BaseModule -from otx.algo.modules.conv_module import ConvModule +from otx.algo.modules.conv_module import Conv2dModule class FPN(BaseModule): @@ -43,8 +43,6 @@ class FPN(BaseModule): conv. Defaults to False. no_norm_on_lateral (bool): Whether to apply norm on lateral. Defaults to False. - conv_cfg (dict, optional): Config dict for - convolution layer. Defaults to None. norm_cfg (dict, optional): Config dict for normalization layer. Defaults to None. 
act_cfg (dict, optional): Config dict for @@ -64,7 +62,6 @@ def __init__( add_extra_convs: bool | str = False, relu_before_extra_convs: bool = False, no_norm_on_lateral: bool = False, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = None, upsample_cfg: dict | None = None, @@ -101,21 +98,19 @@ def __init__( self.fpn_convs = nn.ModuleList() for i in range(self.start_level, self.backbone_end_level): - l_conv = ConvModule( + l_conv = Conv2dModule( in_channels[i], out_channels, 1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, act_cfg=act_cfg, inplace=False, ) - fpn_conv = ConvModule( + fpn_conv = Conv2dModule( out_channels, out_channels, 3, padding=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False, @@ -132,13 +127,12 @@ def __init__( conv_in_channels = self.in_channels[self.backbone_end_level - 1] else: conv_in_channels = out_channels - extra_fpn_conv = ConvModule( + extra_fpn_conv = Conv2dModule( conv_in_channels, out_channels, 3, stride=2, padding=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False, diff --git a/src/otx/algo/detection/necks/hybrid_encoder.py b/src/otx/algo/detection/necks/hybrid_encoder.py index e219ab3898f..dc8879ada33 100644 --- a/src/otx/algo/detection/necks/hybrid_encoder.py +++ b/src/otx/algo/detection/necks/hybrid_encoder.py @@ -11,7 +11,7 @@ from torch import nn from otx.algo.detection.layers import CSPRepLayer -from otx.algo.modules import ConvModule, build_activation_layer +from otx.algo.modules import Conv2dModule, build_activation_layer from otx.algo.modules.base_module import BaseModule __all__ = ["HybridEncoder"] @@ -191,7 +191,7 @@ def __init__( self.lateral_convs = nn.ModuleList() self.fpn_blocks = nn.ModuleList() for _ in range(len(in_channels) - 1, 0, -1): - self.lateral_convs.append(ConvModule(hidden_dim, hidden_dim, 1, 1, act_cfg=act_cfg, norm_cfg=norm_cfg)) + self.lateral_convs.append(Conv2dModule(hidden_dim, hidden_dim, 1, 1, act_cfg=act_cfg, norm_cfg=norm_cfg)) self.fpn_blocks.append( CSPRepLayer( hidden_dim * 2, @@ -208,7 +208,7 @@ def __init__( self.pan_blocks = nn.ModuleList() for _ in range(len(in_channels) - 1): self.downsample_convs.append( - ConvModule(hidden_dim, hidden_dim, 3, 2, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg), + Conv2dModule(hidden_dim, hidden_dim, 3, 2, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg), ) self.pan_blocks.append( CSPRepLayer( diff --git a/src/otx/algo/detection/necks/yolox_pafpn.py b/src/otx/algo/detection/necks/yolox_pafpn.py index d8789201299..762d6c36852 100644 --- a/src/otx/algo/detection/necks/yolox_pafpn.py +++ b/src/otx/algo/detection/necks/yolox_pafpn.py @@ -16,8 +16,7 @@ from otx.algo.detection.layers import CSPLayer from otx.algo.modules.base_module import BaseModule -from otx.algo.modules.conv_module import ConvModule -from otx.algo.modules.depthwise_separable_conv_module import DepthwiseSeparableConvModule +from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule class YOLOXPAFPN(BaseModule): @@ -31,8 +30,6 @@ class YOLOXPAFPN(BaseModule): blocks. Default: False upsample_cfg (dict): Config dict for interpolate layer. Default: `dict(scale_factor=2, mode='nearest')` - conv_cfg (dict, optional): Config dict for convolution layer. - Default: None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN') act_cfg (dict): Config dict for activation layer. 
@@ -48,7 +45,6 @@ def __init__( num_csp_blocks: int = 3, use_depthwise: bool = False, upsample_cfg: dict | None = None, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = None, init_cfg: dict | list[dict] | None = None, @@ -70,7 +66,7 @@ def __init__( self.in_channels = in_channels self.out_channels = out_channels - conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + conv = DepthwiseSeparableConvModule if use_depthwise else Conv2dModule # build top-down blocks self.upsample = nn.Upsample(**upsample_cfg) @@ -78,11 +74,10 @@ def __init__( self.top_down_blocks = nn.ModuleList() for idx in range(len(in_channels) - 1, 0, -1): self.reduce_layers.append( - ConvModule( + Conv2dModule( in_channels[idx], in_channels[idx - 1], 1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ), @@ -94,7 +89,6 @@ def __init__( num_blocks=num_csp_blocks, add_identity=False, use_depthwise=use_depthwise, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ), @@ -111,7 +105,6 @@ def __init__( 3, stride=2, padding=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ), @@ -123,7 +116,6 @@ def __init__( num_blocks=num_csp_blocks, add_identity=False, use_depthwise=use_depthwise, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ), @@ -132,7 +124,7 @@ def __init__( self.out_convs = nn.ModuleList() for i in range(len(in_channels)): self.out_convs.append( - ConvModule(in_channels[i], out_channels, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg), + Conv2dModule(in_channels[i], out_channels, 1, norm_cfg=norm_cfg, act_cfg=act_cfg), ) def forward(self, inputs: tuple[Tensor]) -> tuple[Any, ...]: diff --git a/src/otx/algo/instance_segmentation/backbones/swin.py b/src/otx/algo/instance_segmentation/backbones/swin.py index 06ea9067424..4eb85af6362 100644 --- a/src/otx/algo/instance_segmentation/backbones/swin.py +++ b/src/otx/algo/instance_segmentation/backbones/swin.py @@ -612,7 +612,6 @@ def __init__( self.patch_embed = PatchEmbed( in_channels=in_channels, embed_dims=embed_dims, - conv_type="Conv2d", kernel_size=patch_size, stride=strides[0], norm_cfg=norm_cfg if patch_norm else None, diff --git a/src/otx/algo/instance_segmentation/heads/convfc_bbox_head.py b/src/otx/algo/instance_segmentation/heads/convfc_bbox_head.py index 8331f08c31d..112b49847e8 100644 --- a/src/otx/algo/instance_segmentation/heads/convfc_bbox_head.py +++ b/src/otx/algo/instance_segmentation/heads/convfc_bbox_head.py @@ -33,7 +33,6 @@ def __init__( num_reg_fcs: int = 0, conv_out_channels: int = 256, fc_out_channels: int = 1024, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, init_cfg: dict | None = None, *args, @@ -63,7 +62,6 @@ def __init__( self.num_reg_fcs = num_reg_fcs self.conv_out_channels = conv_out_channels self.fc_out_channels = fc_out_channels - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg # add shared convs and fcs diff --git a/src/otx/algo/instance_segmentation/heads/fcn_mask_head.py b/src/otx/algo/instance_segmentation/heads/fcn_mask_head.py index cda59b4c349..06ad192cd4a 100644 --- a/src/otx/algo/instance_segmentation/heads/fcn_mask_head.py +++ b/src/otx/algo/instance_segmentation/heads/fcn_mask_head.py @@ -21,8 +21,7 @@ from otx.algo.instance_segmentation.utils.structures.mask import mask_target from otx.algo.instance_segmentation.utils.utils import empty_instances from otx.algo.modules.base_module import BaseModule, ModuleList -from otx.algo.modules.conv import build_conv_layer -from otx.algo.modules.conv_module import ConvModule +from 
otx.algo.modules.conv_module import Conv2dModule BYTES_PER_FLOAT = 4 # determine it based on available resources. @@ -46,7 +45,6 @@ def __init__( conv_out_channels: int = 256, num_classes: int = 80, class_agnostic: int = False, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, init_cfg: dict | list[dict] | None = None, ) -> None: @@ -63,9 +61,7 @@ def __init__( self.conv_out_channels = conv_out_channels self.num_classes = num_classes self.class_agnostic = class_agnostic - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg - self.predictor_cfg = {"type": "Conv"} self.loss_mask = loss_mask @@ -74,12 +70,11 @@ def __init__( in_channels = self.in_channels if i == 0 else self.conv_out_channels padding = (self.conv_kernel_size - 1) // 2 self.convs.append( - ConvModule( + Conv2dModule( in_channels, self.conv_out_channels, self.conv_kernel_size, padding=padding, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, ), ) @@ -95,7 +90,7 @@ def __init__( self.upsample = nn.ConvTranspose2d(**upsample_cfg) out_channels = 1 if self.class_agnostic else self.num_classes logits_in_channel = self.conv_out_channels - self.conv_logits = build_conv_layer(self.predictor_cfg, logits_in_channel, out_channels, 1) + self.conv_logits = nn.Conv2d(logits_in_channel, out_channels, 1) self.relu = nn.ReLU(inplace=True) self.debug_imgs = None diff --git a/src/otx/algo/instance_segmentation/heads/rpn_head.py b/src/otx/algo/instance_segmentation/heads/rpn_head.py index ab21d6e02c8..a785371fa88 100644 --- a/src/otx/algo/instance_segmentation/heads/rpn_head.py +++ b/src/otx/algo/instance_segmentation/heads/rpn_head.py @@ -20,7 +20,7 @@ from otx.algo.detection.heads.anchor_head import AnchorHead from otx.algo.instance_segmentation.utils.structures.bbox import empty_box_as, get_box_wh from otx.algo.instance_segmentation.utils.utils import unpack_inst_seg_entity -from otx.algo.modules.conv_module import ConvModule +from otx.algo.modules.conv_module import Conv2dModule from otx.algo.utils.mmengine_utils import InstanceData from otx.core.data.entity.base import OTXBatchDataEntity from otx.core.data.entity.instance_segmentation import InstanceSegBatchDataEntity @@ -71,7 +71,7 @@ def _init_layers(self) -> None: # use ``inplace=False`` to avoid error: one of the variables # needed for gradient computation has been modified by an # inplace operation. 
- rpn_convs.append(ConvModule(in_channels, self.feat_channels, 3, padding=1, inplace=False)) + rpn_convs.append(Conv2dModule(in_channels, self.feat_channels, 3, padding=1, inplace=False)) self.rpn_conv = nn.Sequential(*rpn_convs) else: self.rpn_conv = nn.Conv2d(self.in_channels, self.feat_channels, 3, padding=1) diff --git a/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py b/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py index 84f65adee35..9d46627a43b 100644 --- a/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py +++ b/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py @@ -31,7 +31,7 @@ from otx.algo.instance_segmentation.utils.structures.bbox.transforms import get_box_wh, scale_boxes from otx.algo.instance_segmentation.utils.utils import unpack_inst_seg_entity from otx.algo.modules.base_module import BaseModule -from otx.algo.modules.conv_module import ConvModule +from otx.algo.modules.conv_module import Conv2dModule from otx.algo.modules.norm import is_norm from otx.algo.utils.mmengine_utils import InstanceData from otx.algo.utils.weight_init import bias_init_with_prob, constant_init, normal_init @@ -102,13 +102,12 @@ def _init_layers(self) -> None: for i in range(self.stacked_convs): chn = self.in_channels if i == 0 else self.feat_channels self.kernel_convs.append( - ConvModule( + Conv2dModule( chn, self.feat_channels, 3, stride=1, padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, ), @@ -739,7 +738,7 @@ def __init__( convs = [] for i in range(stacked_convs): in_c = in_channels if i == 0 else feat_channels - convs.append(ConvModule(in_c, feat_channels, 3, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg)) + convs.append(Conv2dModule(in_c, feat_channels, 3, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg)) self.stacked_convs = nn.Sequential(*convs) self.projection = nn.Conv2d(feat_channels, num_prototypes, kernel_size=1) @@ -843,37 +842,34 @@ def _init_layers(self) -> None: for i in range(self.stacked_convs): chn = self.in_channels if i == 0 else self.feat_channels cls_convs.append( - ConvModule( + Conv2dModule( chn, self.feat_channels, 3, stride=1, padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, ), ) reg_convs.append( - ConvModule( + Conv2dModule( chn, self.feat_channels, 3, stride=1, padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, ), ) kernel_convs.append( - ConvModule( + Conv2dModule( chn, self.feat_channels, 3, stride=1, padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, ), diff --git a/src/otx/algo/instance_segmentation/layers/transformer.py b/src/otx/algo/instance_segmentation/layers/transformer.py index b619d1055de..51e62bba734 100644 --- a/src/otx/algo/instance_segmentation/layers/transformer.py +++ b/src/otx/algo/instance_segmentation/layers/transformer.py @@ -17,7 +17,6 @@ from torch import nn from otx.algo.modules.base_module import BaseModule -from otx.algo.modules.conv import build_conv_layer from otx.algo.modules.norm import build_norm_layer @@ -91,11 +90,11 @@ class PatchEmbed(BaseModule): We use a conv layer to implement PatchEmbed. + TODO (sungchul): it is duplicated with otx.algo.modules.transformer.PatchEmbed + Args: in_channels (int): The num of input channels. Default: 3 embed_dims (int): The dimensions of embedding. Default: 768 - conv_type (str): The config dict for embedding - conv layer type selection. Default: "Conv2d. kernel_size (int): The kernel_size of embedding conv. Default: 16. 
stride (int): The slide stride of embedding conv. Default: None (Would be set as `kernel_size`). @@ -115,7 +114,6 @@ def __init__( self, in_channels: int = 3, embed_dims: int = 768, - conv_type: str = "Conv2d", kernel_size: int = 16, stride: int = 16, padding: int | tuple | str = "corner", @@ -148,8 +146,7 @@ def __init__( self.adap_padding = None padding = to_2tuple(padding) - self.projection = build_conv_layer( - {"type": conv_type}, + self.projection = nn.Conv2d( in_channels=in_channels, out_channels=embed_dims, kernel_size=kernel_size, diff --git a/src/otx/algo/instance_segmentation/necks/fpn.py b/src/otx/algo/instance_segmentation/necks/fpn.py index 0c0d2b21dcb..67286814f89 100644 --- a/src/otx/algo/instance_segmentation/necks/fpn.py +++ b/src/otx/algo/instance_segmentation/necks/fpn.py @@ -12,7 +12,7 @@ from torch import Tensor, nn from otx.algo.modules.base_module import BaseModule -from otx.algo.modules.conv_module import ConvModule +from otx.algo.modules.conv_module import Conv2dModule class FPN(BaseModule): @@ -34,8 +34,6 @@ class FPN(BaseModule): conv. Defaults to False. no_norm_on_lateral (bool): Whether to apply norm on lateral. Defaults to False. - conv_cfg (dict, optional): Config dict for - convolution layer. Defaults to None. norm_cfg (dict, optional): Config dict for normalization layer. Defaults to None. act_cfg (dict, optional): Config dict for @@ -54,7 +52,6 @@ def __init__( end_level: int = -1, relu_before_extra_convs: bool = False, no_norm_on_lateral: bool = False, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = None, upsample_cfg: dict | None = None, @@ -95,21 +92,19 @@ def __init__( self.fpn_convs = nn.ModuleList() for i in range(self.start_level, self.backbone_end_level): - l_conv = ConvModule( + l_conv = Conv2dModule( in_channels[i], out_channels, 1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, act_cfg=act_cfg, inplace=False, ) - fpn_conv = ConvModule( + fpn_conv = Conv2dModule( out_channels, out_channels, 3, padding=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False, diff --git a/src/otx/algo/modules/__init__.py b/src/otx/algo/modules/__init__.py index c7de18df440..605f47c67e0 100644 --- a/src/otx/algo/modules/__init__.py +++ b/src/otx/algo/modules/__init__.py @@ -5,18 +5,16 @@ """This module implementation is a code implementation copied or replaced from mmcv.cnn.bricks.""" from .activation import build_activation_layer -from .conv import build_conv_layer -from .conv_module import ConvModule -from .depthwise_separable_conv_module import DepthwiseSeparableConvModule +from .conv_module import Conv2dModule, Conv3dModule, DepthwiseSeparableConvModule from .norm import FrozenBatchNorm2d, build_norm_layer from .padding import build_padding_layer __all__ = [ "build_activation_layer", - "build_conv_layer", "build_padding_layer", "build_norm_layer", - "ConvModule", + "Conv2dModule", + "Conv3dModule", "DepthwiseSeparableConvModule", "FrozenBatchNorm2d", ] diff --git a/src/otx/algo/modules/conv.py b/src/otx/algo/modules/conv.py deleted file mode 100644 index a696686bb64..00000000000 --- a/src/otx/algo/modules/conv.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# Copyright (c) OpenMMLab. All rights reserved. 
- -"""This implementation replaces the functionality of mmcv.cnn.bricks.conv.build_conv_layer.""" - -from __future__ import annotations - -import inspect - -from torch import nn - -CONV_DICT = { - "Conv1d": nn.Conv1d, - "Conv2d": nn.Conv2d, - "Conv3d": nn.Conv3d, - "Conv": nn.Conv2d, -} - - -def build_conv_layer(cfg: dict | None, *args, **kwargs) -> nn.Module: - """Build convolution layer. - - Args: - cfg (None or dict): The conv layer config, which should contain: - - type (str): Layer type. - - layer args: Args needed to instantiate an conv layer. - args (argument list): Arguments passed to the `__init__` - method of the corresponding conv layer. - kwargs (keyword arguments): Keyword arguments passed to the `__init__` - method of the corresponding conv layer. - - Returns: - nn.Module: Created conv layer. - """ - if cfg is None: - cfg_ = {"type": "Conv2d"} - else: - if not isinstance(cfg, dict): - msg = "cfg must be a dict" - raise TypeError(msg) - if "type" not in cfg: - msg = 'the cfg dict must contain the key "type"' - raise KeyError(msg) - cfg_ = cfg.copy() - - layer_type = cfg_.pop("type") - if inspect.isclass(layer_type): - return layer_type(*args, **kwargs, **cfg_) - conv_layer = CONV_DICT.get(layer_type) - if conv_layer is None: - msg = f"Cannot find {conv_layer} in {CONV_DICT.keys()}" - raise KeyError(msg) - return conv_layer(*args, **kwargs, **cfg_) diff --git a/src/otx/algo/modules/conv_module.py b/src/otx/algo/modules/conv_module.py index ae8de22a545..8fa9d6764d7 100644 --- a/src/otx/algo/modules/conv_module.py +++ b/src/otx/algo/modules/conv_module.py @@ -3,23 +3,21 @@ # Copyright (c) OpenMMLab. All rights reserved. """This implementation copied ConvModule of mmcv.cnn.bricks.ConvModule.""" + # TODO(someone): Revisit mypy errors after deprecation of mmlab -# mypy: ignore-errors + from __future__ import annotations import warnings -from functools import partial from typing import TYPE_CHECKING -import torch -from torch import nn +from torch import Tensor, nn from torch.nn.modules.batchnorm import _BatchNorm as BatchNorm from torch.nn.modules.instancenorm import _InstanceNorm as InstanceNorm from otx.algo.utils.weight_init import constant_init, kaiming_init from .activation import build_activation_layer -from .conv import build_conv_layer from .norm import build_norm_layer from .padding import build_padding_layer @@ -27,49 +25,12 @@ from torch.nn.modules.conv import _ConvNd as ConvNd -def efficient_conv_bn_eval_forward(bn: BatchNorm, conv: ConvNd, x: torch.Tensor) -> torch.Tensor: - """Implementation based on https://arxiv.org/abs/2305.11624. - - "Tune-Mode ConvBN Blocks For Efficient Transfer Learning" - It leverages the associative law between convolution and affine transform, - i.e., normalize (weight conv feature) = (normalize weight) conv feature. - It works for Eval mode of ConvBN blocks during validation, and can be used - for training as well. It reduces memory and computation cost. - - Args: - bn (_BatchNorm): a BatchNorm module. - conv (nn._ConvNd): a conv module - x (torch.Tensor): Input feature map. 
- """ - # These lines of code are designed to deal with various cases - # like bn without affine transform, and conv without bias - weight_on_the_fly = conv.weight - bias_on_the_fly = conv.bias if conv.bias is not None else torch.zeros_like(bn.running_var) - - bn_weight = bn.weight if bn.weight is not None else torch.ones_like(bn.running_var) - - bn_bias = bn.bias if bn.bias is not None else torch.zeros_like(bn.running_var) - - # shape of [C_out, 1, 1, 1] in Conv2d - weight_coeff = torch.rsqrt(bn.running_var + bn.eps).reshape([-1] + [1] * (len(conv.weight.shape) - 1)) - # shape of [C_out, 1, 1, 1] in Conv2d - coefff_on_the_fly = bn_weight.view_as(weight_coeff) * weight_coeff - - # shape of [C_out, C_in, k, k] in Conv2d - weight_on_the_fly = weight_on_the_fly * coefff_on_the_fly - # shape of [C_out] in Conv2d - bias_on_the_fly = bn_bias + coefff_on_the_fly.flatten() * (bias_on_the_fly - bn.running_mean) - - return conv._conv_forward(x, weight_on_the_fly, bias_on_the_fly) # noqa: SLF001 - - class ConvModule(nn.Module): """A conv block that bundles conv/norm/activation layers. This block simplifies the usage of convolution layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). - It is based upon three build methods: `build_conv_layer()`, - `build_norm_layer()` and `build_activation_layer()`. + It is based upon two build methods: `build_norm_layer()` and `build_activation_layer()`. Besides, we add some additional features in this module. 1. Automatically set `bias` of the conv layer. @@ -95,8 +56,6 @@ class ConvModule(nn.Module): bias (bool | str): If specified as `auto`, it will be decided by the norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise False. Default: "auto". - conv_cfg (dict): Config dict for convolution layer. Default: None, - which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: None. act_cfg (dict): Config dict for activation layer. Default: dict(type='ReLU'). @@ -109,16 +68,10 @@ class ConvModule(nn.Module): instead. Currently, we support ['zeros', 'circular'] with official implementation and ['reflect'] with our own implementation. Default: 'zeros'. - order (tuple[str]): The order of conv/norm/activation layers. It is a - sequence of "conv", "norm" and "act". Common examples are - ("conv", "norm", "act") and ("act", "conv", "norm"). - Default: ('conv', 'norm', 'act'). - efficient_conv_bn_eval (bool): Whether use efficient conv when the - consecutive bn is in eval mode (either training or testing), as - proposed in https://arxiv.org/abs/2305.11624 . Default: `False`. 
""" _abbr_ = "conv_block" + _conv_nd: ConvNd def __init__( self, @@ -130,29 +83,20 @@ def __init__( dilation: int | tuple[int, int] = 1, groups: int = 1, bias: bool | str = "auto", - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = {"type": "ReLU"}, # noqa: B006 inplace: bool = True, with_spectral_norm: bool = False, padding_mode: str = "zeros", - order: tuple = ("conv", "norm", "act"), - efficient_conv_bn_eval: bool = False, ): super().__init__() - assert conv_cfg is None or isinstance(conv_cfg, dict) # noqa: S101 assert norm_cfg is None or isinstance(norm_cfg, dict) # noqa: S101 official_padding_mode = ["zeros", "circular"] - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.inplace = inplace self.with_spectral_norm = with_spectral_norm self.with_explicit_padding = padding_mode not in official_padding_mode - self.order = order - assert isinstance(self.order, tuple) # noqa: S101 - assert len(self.order) == 3 # noqa: S101 - assert set(order) == {"conv", "norm", "act"} # noqa: S101 self.with_norm = norm_cfg is not None self.with_activation = act_cfg is not None @@ -168,8 +112,7 @@ def __init__( # reset padding to 0 for conv module conv_padding = 0 if self.with_explicit_padding else padding # build convolution layer - self.conv = build_conv_layer( - conv_cfg, + self.conv = self._conv_nd( in_channels, out_channels, kernel_size, @@ -196,19 +139,17 @@ def __init__( # build normalization layers if self.with_norm: # norm layer is after conv layer - norm_channels = out_channels if order.index("norm") > order.index("conv") else in_channels - self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) + norm_channels = out_channels + self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) # type: ignore[arg-type] self.add_module(self.norm_name, norm) if self.with_bias and isinstance(norm, (BatchNorm, InstanceNorm)): warnings.warn("Unnecessary conv bias before batch/instance norm", stacklevel=1) else: - self.norm_name = None - - self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval) + self.norm_name = None # type: ignore[assignment] # build activation layer if self.with_activation: - act_cfg_ = act_cfg.copy() + act_cfg_ = act_cfg.copy() # type: ignore[union-attr] # nn.Tanh has no 'inplace' argument if act_cfg_["type"] not in [ "Tanh", @@ -248,9 +189,9 @@ def init_weights(self) -> None: # Note: For PyTorch's conv layers, they will be overwritten by our # initialization implementation using default ``kaiming_init``. if not hasattr(self.conv, "init_weights"): - if self.with_activation and self.act_cfg["type"] == "LeakyReLU": + if self.with_activation and self.act_cfg["type"] == "LeakyReLU": # type: ignore[index] nonlinearity = "leaky_relu" - a = self.act_cfg.get("negative_slope", 0.01) + a = self.act_cfg.get("negative_slope", 0.01) # type: ignore[union-attr] else: nonlinearity = "relu" a = 0 @@ -258,106 +199,135 @@ def init_weights(self) -> None: if self.with_norm: constant_init(self.norm_layer, 1, bias=0) - def forward(self, x: torch.Tensor, activate: bool = True, norm: bool = True) -> torch.Tensor: + def forward(self, x: Tensor, activate: bool = True, norm: bool = True) -> Tensor: """Forward pass of the ConvModule. Args: - x (torch.Tensor): Input tensor. + x (Tensor): Input tensor. activate (bool, optional): Whether to apply activation. Defaults to True. norm (bool, optional): Whether to apply normalization. Defaults to True. Returns: - torch.Tensor: Output tensor. + Tensor: Output tensor. 
""" - layer_index = 0 - while layer_index < len(self.order): - layer = self.order[layer_index] - if layer == "conv": - if self.with_explicit_padding: - x = self.padding_layer(x) - # if the next operation is norm and we have a norm layer in - # eval mode and we have enabled `efficient_conv_bn_eval` for - # the conv operator, then activate the optimized forward and - # skip the next norm operator since it has been fused - if ( - layer_index + 1 < len(self.order) - and self.order[layer_index + 1] == "norm" - and norm - and self.with_norm - and not self.norm_layer.training - and self.efficient_conv_bn_eval_forward is not None - ): - self.conv.forward = partial(self.efficient_conv_bn_eval_forward, self.norm_layer, self.conv) - layer_index += 1 - x = self.conv(x) - del self.conv.forward - else: - x = self.conv(x) - elif layer == "norm" and norm and self.with_norm: - x = self.norm_layer(x) - elif layer == "act" and activate and self.with_activation: - x = self.activate(x) - layer_index += 1 + if self.with_explicit_padding: + x = self.padding_layer(x) + x = self.conv(x) + if norm and self.with_norm: + x = self.norm_layer(x) # type: ignore[misc] + if activate and self.with_activation: + x = self.activate(x) return x - def turn_on_efficient_conv_bn_eval(self, efficient_conv_bn_eval: bool = True) -> None: - """Turn on the efficient convolution batch normalization evaluation. - Args: - efficient_conv_bn_eval (bool, optional): Whether to enable efficient convolution - batch normalization evaluation. Defaults to True. - """ - # efficient_conv_bn_eval works for conv + bn - # with `track_running_stats` option - if ( - efficient_conv_bn_eval - and self.norm_layer - and isinstance(self.norm_layer, BatchNorm) - and self.norm_layer.track_running_stats - ): - self.efficient_conv_bn_eval_forward = efficient_conv_bn_eval_forward - else: - self.efficient_conv_bn_eval_forward = None - - @staticmethod - def create_from_conv_bn( - conv: ConvNd, - bn: BatchNorm, - efficient_conv_bn_eval: bool = True, - ) -> ConvModule: - """Create a ConvModule from a conv and a bn module.""" - self = ConvModule.__new__(ConvModule) - super(ConvModule, self).__init__() - - self.conv_cfg = None - self.norm_cfg = None - self.act_cfg = None - self.inplace = False - self.with_spectral_norm = False - self.with_explicit_padding = False - self.order = ("conv", "norm", "act") - - self.with_norm = True - self.with_activation = False - self.with_bias = conv.bias is not None +class DepthwiseSeparableConvModule(nn.Module): + """Depthwise separable convolution module. - # build convolution layer - self.conv = conv - # export the attributes of self.conv to a higher level for convenience - self.in_channels = self.conv.in_channels - self.out_channels = self.conv.out_channels - self.kernel_size = self.conv.kernel_size - self.stride = self.conv.stride - self.padding = self.conv.padding - self.dilation = self.conv.dilation - self.transposed = self.conv.transposed - self.output_padding = self.conv.output_padding - self.groups = self.conv.groups + See https://arxiv.org/pdf/1704.04861.pdf for details. + + This module can replace a ConvModule with the conv block replaced by two + conv block: depthwise conv block and pointwise conv block. The depthwise + conv block contains depthwise-conv/norm/activation layers. The pointwise + conv block contains pointwise-conv/norm/activation layers. It should be + noted that there will be norm/activation layer in the depthwise conv block + if `norm_cfg` and `act_cfg` are specified. 
+ + Args: + in_channels (int): Number of channels in the input feature map. + Same as that in ``nn._ConvNd``. + out_channels (int): Number of channels produced by the convolution. + Same as that in ``nn._ConvNd``. + kernel_size (int | tuple[int]): Size of the convolving kernel. + Same as that in ``nn._ConvNd``. + stride (int | tuple[int]): Stride of the convolution. + Same as that in ``nn._ConvNd``. Default: 1. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. Same as that in ``nn._ConvNd``. Default: 0. + dilation (int | tuple[int]): Spacing between kernel elements. + Same as that in ``nn._ConvNd``. Default: 1. + norm_cfg (dict): Default norm config for both depthwise ConvModule and + pointwise ConvModule. Default: None. + act_cfg (dict): Default activation config for both depthwise ConvModule + and pointwise ConvModule. Default: dict(type='ReLU'). + dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is + None, it will be the same as `norm_cfg`. Default: None. + dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is + None, it will be the same as `act_cfg`. Default: None. + pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is + None, it will be the same as `norm_cfg`. Default: None. + pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is + None, it will be the same as `act_cfg`. Default: None. + kwargs (optional): Other shared arguments for depthwise and pointwise + ConvModule. See ConvModule for ref. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int | tuple[int, int], + stride: int | tuple[int, int] = 1, + padding: int | tuple[int, int] = 0, + dilation: int | tuple[int, int] = 1, + norm_cfg: dict | None = None, + act_cfg: dict | None = None, + dw_norm_cfg: dict | None = None, + dw_act_cfg: dict | None = None, + pw_norm_cfg: dict | None = None, + pw_act_cfg: dict | None = None, + **kwargs, + ): + if act_cfg is None: + act_cfg = {"type": "ReLU"} + + super().__init__() + if "groups" in kwargs: + msg = "groups should not be specified in DepthwiseSeparableConvModule." + raise ValueError(msg) + + # if norm/activation config of depthwise/pointwise Conv2dModule is not + # specified, use default config. 
+ dw_norm_cfg = dw_norm_cfg or norm_cfg + dw_act_cfg = dw_act_cfg or act_cfg + pw_norm_cfg = pw_norm_cfg or norm_cfg + pw_act_cfg = pw_act_cfg or act_cfg + + # depthwise convolution + self.depthwise_conv = Conv2dModule( + in_channels, + in_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=in_channels, + norm_cfg=dw_norm_cfg, + act_cfg=dw_act_cfg, + **kwargs, + ) + + self.pointwise_conv = Conv2dModule( + in_channels, + out_channels, + 1, + norm_cfg=pw_norm_cfg, + act_cfg=pw_act_cfg, + **kwargs, + ) + + def forward(self, x: Tensor) -> Tensor: + """Forward.""" + x = self.depthwise_conv(x) + return self.pointwise_conv(x) + + +class Conv2dModule(ConvModule): + """A conv2d block that bundles conv/norm/activation layers.""" + + _conv_nd = nn.Conv2d - # build normalization layers - self.norm_name, norm = "bn", bn - self.add_module(self.norm_name, norm) - self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval) +class Conv3dModule(ConvModule): + """A conv3d block that bundles conv/norm/activation layers.""" - return self + _conv_nd = nn.Conv3d diff --git a/src/otx/algo/modules/depthwise_separable_conv_module.py b/src/otx/algo/modules/depthwise_separable_conv_module.py deleted file mode 100644 index f04db1cdf09..00000000000 --- a/src/otx/algo/modules/depthwise_separable_conv_module.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# Copyright (c) OpenMMLab. All rights reserved. - -"""This implementation of DepthwiseSeparableConvModule copied from mmcv.cnn.bricks.depthwise_separable_conv_module.""" - -from __future__ import annotations - -from torch import Tensor, nn - -from .conv_module import ConvModule - - -class DepthwiseSeparableConvModule(nn.Module): - """Depthwise separable convolution module. - - See https://arxiv.org/pdf/1704.04861.pdf for details. - - This module can replace a ConvModule with the conv block replaced by two - conv block: depthwise conv block and pointwise conv block. The depthwise - conv block contains depthwise-conv/norm/activation layers. The pointwise - conv block contains pointwise-conv/norm/activation layers. It should be - noted that there will be norm/activation layer in the depthwise conv block - if `norm_cfg` and `act_cfg` are specified. - - Args: - in_channels (int): Number of channels in the input feature map. - Same as that in ``nn._ConvNd``. - out_channels (int): Number of channels produced by the convolution. - Same as that in ``nn._ConvNd``. - kernel_size (int | tuple[int]): Size of the convolving kernel. - Same as that in ``nn._ConvNd``. - stride (int | tuple[int]): Stride of the convolution. - Same as that in ``nn._ConvNd``. Default: 1. - padding (int | tuple[int]): Zero-padding added to both sides of - the input. Same as that in ``nn._ConvNd``. Default: 0. - dilation (int | tuple[int]): Spacing between kernel elements. - Same as that in ``nn._ConvNd``. Default: 1. - norm_cfg (dict): Default norm config for both depthwise ConvModule and - pointwise ConvModule. Default: None. - act_cfg (dict): Default activation config for both depthwise ConvModule - and pointwise ConvModule. Default: dict(type='ReLU'). - dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is - None, it will be the same as `norm_cfg`. Default: None. - dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is - None, it will be the same as `act_cfg`. Default: None. - pw_norm_cfg (dict): Norm config of pointwise ConvModule. 
If it is - None, it will be the same as `norm_cfg`. Default: None. - pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is - None, it will be the same as `act_cfg`. Default: None. - kwargs (optional): Other shared arguments for depthwise and pointwise - ConvModule. See ConvModule for ref. - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: int | tuple[int, int], - stride: int | tuple[int, int] = 1, - padding: int | tuple[int, int] = 0, - dilation: int | tuple[int, int] = 1, - norm_cfg: dict | None = None, - act_cfg: dict | None = None, - dw_norm_cfg: dict | None = None, - dw_act_cfg: dict | None = None, - pw_norm_cfg: dict | None = None, - pw_act_cfg: dict | None = None, - **kwargs, - ): - if act_cfg is None: - act_cfg = {"type": "ReLU"} - - super().__init__() - assert "groups" not in kwargs, "groups should not be specified" # noqa: S101 - - # if norm/activation config of depthwise/pointwise ConvModule is not - # specified, use default config. - dw_norm_cfg = dw_norm_cfg or norm_cfg - dw_act_cfg = dw_act_cfg or act_cfg - pw_norm_cfg = pw_norm_cfg or norm_cfg - pw_act_cfg = pw_act_cfg or act_cfg - - # depthwise convolution - self.depthwise_conv = ConvModule( - in_channels, - in_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=in_channels, - norm_cfg=dw_norm_cfg, - act_cfg=dw_act_cfg, - **kwargs, - ) - - self.pointwise_conv = ConvModule( - in_channels, - out_channels, - 1, - norm_cfg=pw_norm_cfg, - act_cfg=pw_act_cfg, - **kwargs, - ) - - def forward(self, x: Tensor) -> Tensor: - """Forward.""" - x = self.depthwise_conv(x) - return self.pointwise_conv(x) diff --git a/src/otx/algo/modules/transformer.py b/src/otx/algo/modules/transformer.py index e37e5e45c1e..46cdd96a943 100644 --- a/src/otx/algo/modules/transformer.py +++ b/src/otx/algo/modules/transformer.py @@ -3,6 +3,7 @@ # Copyright (c) OpenMMLab. All rights reserved. """This implementation replaces the functionality of mmcv.cnn.bricks.transformer.""" + from __future__ import annotations import math @@ -13,7 +14,6 @@ from otx.algo.modules.base_module import BaseModule, Sequential from .activation import build_activation_layer -from .conv import build_conv_layer from .drop import build_dropout from .norm import build_norm_layer @@ -122,11 +122,11 @@ class PatchEmbed(BaseModule): We use a conv layer to implement PatchEmbed. + TODO (sungchul): it is duplicated with otx.algo.instance_segmentation.layers.transformer.PatchEmbed + Args: in_channels (int): The num of input channels. Default: 3 embed_dims (int): The dimensions of embedding. Default: 768 - conv_type (str): The type of convolution - to generate patch embedding. Default: "Conv2d". kernel_size (int): The kernel_size of embedding conv. Default: 16. stride (int): The slide stride of embedding conv. Default: 16. 
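A rough usage sketch of the two per-dimension wrappers introduced in conv_module.py, assuming `build_norm_layer` accepts the `BN`/`BN3d` types used elsewhere in this patch: with a norm configured, `bias="auto"` resolves to False, and the forward path is now fixed to conv -> norm -> act.

import torch
from otx.algo.modules.conv_module import Conv2dModule, Conv3dModule

block2d = Conv2dModule(3, 8, 3, padding=1, norm_cfg={"type": "BN"})
assert block2d.conv.bias is None           # bias="auto" disables the conv bias when a norm follows
y = block2d(torch.rand(1, 3, 32, 32))      # conv -> norm -> act, shape (1, 8, 32, 32)

# Conv3dModule only swaps the underlying layer via `_conv_nd = nn.Conv3d`.
block3d = Conv3dModule(3, 8, 3, padding=1, norm_cfg={"type": "BN3d"})
z = block3d(torch.rand(1, 3, 4, 32, 32))   # shape (1, 8, 4, 32, 32)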
@@ -149,7 +149,6 @@ def __init__( self, in_channels: int = 3, embed_dims: int = 768, - conv_type: str = "Conv2d", kernel_size: int | tuple[int, int] = 16, stride: int | tuple[int, int] = 16, padding: str | int | tuple[int, int] = "corner", @@ -183,8 +182,7 @@ def __init__( self.adaptive_padding = None padding = padding if isinstance(padding, tuple) else (padding, padding) - self.projection = build_conv_layer( - {"type": conv_type}, + self.projection = nn.Conv2d( in_channels=in_channels, out_channels=embed_dims, kernel_size=kernel_size, diff --git a/src/otx/algo/segmentation/backbones/litehrnet.py b/src/otx/algo/segmentation/backbones/litehrnet.py index 7b8ffd450e1..48e359862bd 100644 --- a/src/otx/algo/segmentation/backbones/litehrnet.py +++ b/src/otx/algo/segmentation/backbones/litehrnet.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # """HRNet network modules for base backbone. @@ -7,7 +7,6 @@ - https://github.com/HRNet/Lite-HRNet """ - from __future__ import annotations from pathlib import Path @@ -17,7 +16,7 @@ from torch import nn from torch.nn import functional -from otx.algo.modules import ConvModule, build_conv_layer, build_norm_layer +from otx.algo.modules import Conv2dModule, build_norm_layer from otx.algo.modules.base_module import BaseModule from otx.algo.segmentation.modules import ( AsymmetricPositionAttentionModule, @@ -37,7 +36,6 @@ def __init__( kernel_size: int = 3, key_ratio: int = 8, value_ratio: int = 8, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, ) -> None: """Neighbour support module. @@ -47,7 +45,6 @@ def __init__( kernel_size (int): Kernel size for convolutional layers. Default is 3. key_ratio (int): Ratio of input channels to key channels. Default is 8. value_ratio (int): Ratio of input channels to value channels. Default is 8. - conv_cfg (dict | None): Config for convolutional layers. Default is None. norm_cfg (dict | None): Config for normalization layers. Default is None. 
""" super().__init__() @@ -58,54 +55,49 @@ def __init__( self.kernel_size = kernel_size self.key = nn.Sequential( - ConvModule( + Conv2dModule( in_channels=self.in_channels, out_channels=self.key_channels, kernel_size=1, stride=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg={"type": "ReLU"}, ), - ConvModule( + Conv2dModule( self.key_channels, self.key_channels, kernel_size=self.kernel_size, stride=1, padding=(self.kernel_size - 1) // 2, groups=self.key_channels, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None, ), - ConvModule( + Conv2dModule( in_channels=self.key_channels, out_channels=self.kernel_size * self.kernel_size, kernel_size=1, stride=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None, ), ) self.value = nn.Sequential( - ConvModule( + Conv2dModule( in_channels=self.in_channels, out_channels=self.value_channels, kernel_size=1, stride=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None, ), nn.Unfold(kernel_size=self.kernel_size, stride=1, padding=1), ) - self.out_conv = ConvModule( + self.out_conv = Conv2dModule( in_channels=self.value_channels, out_channels=self.in_channels, kernel_size=1, stride=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None, ) @@ -131,7 +123,6 @@ def __init__( self, channels: list[int], ratio: int = 16, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | tuple[dict, dict] = ({"type": "ReLU"}, {"type": "Sigmoid"}), ) -> None: @@ -140,7 +131,6 @@ def __init__( Args: channels (list[int]): Number of channels for each stage. ratio (int): Reduction ratio of the bottleneck block. - conv_cfg (dict | None): Config dict for convolution layer. Default: None norm_cfg (dict | None): Config dict for normalization layer. Default: None act_cfg (dict | tuple[dict, dict]): Config dict or a tuple of config dicts for activation layer(s). Default: ({"type": "ReLU"}, {"type": "Sigmoid"}). @@ -156,21 +146,19 @@ def __init__( self.channels = channels total_channel = sum(channels) - self.conv1 = ConvModule( + self.conv1 = Conv2dModule( in_channels=total_channel, out_channels=int(total_channel / ratio), kernel_size=1, stride=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg[0], ) - self.conv2 = ConvModule( + self.conv2 = Conv2dModule( in_channels=int(total_channel / ratio), out_channels=total_channel, kernel_size=1, stride=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg[1], ) @@ -195,7 +183,6 @@ def __init__( self, channels: int, ratio: int = 16, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | tuple[dict, dict] = ({"type": "ReLU"}, {"type": "Sigmoid"}), enable_norm: bool = False, @@ -205,8 +192,6 @@ def __init__( Args: channels (int): Number of input channels. ratio (int): Reduction ratio for the bottleneck block. Default: 16. - conv_cfg (dict | None): Configuration dict for convolutional layers. - Default: None. act_cfg (dict | tuple[dict]): Configuration dict or tuple of dicts for activation layers. If a single dict is provided, it will be used for both activation layers. Default: ({"type": "ReLU"}, {"type": "Sigmoid"}). 
@@ -224,20 +209,18 @@ def __init__( raise ValueError(msg) self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.conv1 = ConvModule( + self.conv1 = Conv2dModule( in_channels=channels, out_channels=int(channels / ratio), kernel_size=1, stride=1, - conv_cfg=conv_cfg, act_cfg=act_cfg[0], ) - self.conv2 = ConvModule( + self.conv2 = Conv2dModule( in_channels=int(channels / ratio), out_channels=channels, kernel_size=1, stride=1, - conv_cfg=conv_cfg, act_cfg=act_cfg[1], ) @@ -257,7 +240,6 @@ def __init__( self, channels: int, ratio: int = 16, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, enable_norm: bool = False, ) -> None: @@ -266,7 +248,6 @@ def __init__( Args: channels (int): Number of input channels. ratio (int): Reduction ratio of internal channels. - conv_cfg (dict | None): Config dict for convolution layer. norm_cfg (dict | None): Config dict for normalization layer. enable_norm (bool): Whether to enable normalization layers. """ @@ -276,54 +257,49 @@ def __init__( self.internal_channels = int(channels / ratio) # channel-only branch - self.v_channel = ConvModule( + self.v_channel = Conv2dModule( in_channels=self.in_channels, out_channels=self.internal_channels, kernel_size=1, stride=1, bias=False, - conv_cfg=conv_cfg, norm_cfg=norm_cfg if enable_norm else None, act_cfg=None, ) - self.q_channel = ConvModule( + self.q_channel = Conv2dModule( in_channels=self.in_channels, out_channels=1, kernel_size=1, stride=1, bias=False, - conv_cfg=conv_cfg, norm_cfg=norm_cfg if enable_norm else None, act_cfg=None, ) - self.out_channel = ConvModule( + self.out_channel = Conv2dModule( in_channels=self.internal_channels, out_channels=self.in_channels, kernel_size=1, stride=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg={"type": "Sigmoid"}, ) # spatial-only branch - self.v_spatial = ConvModule( + self.v_spatial = Conv2dModule( in_channels=self.in_channels, out_channels=self.internal_channels, kernel_size=1, stride=1, bias=False, - conv_cfg=conv_cfg, norm_cfg=norm_cfg if enable_norm else None, act_cfg=None, ) - self.q_spatial = ConvModule( + self.q_spatial = Conv2dModule( in_channels=self.in_channels, out_channels=self.internal_channels, kernel_size=1, stride=1, bias=False, - conv_cfg=conv_cfg, norm_cfg=norm_cfg if enable_norm else None, act_cfg=None, ) @@ -392,7 +368,6 @@ def __init__( in_channels: list[int], stride: int, reduce_ratio: int, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, with_cp: bool = False, dropout: float | None = None, @@ -406,7 +381,6 @@ def __init__( in_channels (list[int]): Number of input channels for each input feature map. stride (int): Stride used in the first convolutional layer. reduce_ratio (int): Reduction ratio used in the cross-resolution weighting module. - conv_cfg (dict | None): Dictionary to construct and configure the convolutional layers. norm_cfg (dict | None): Dictionary to construct and configure the normalization layers. with_cp (bool): Whether to use checkpointing to save memory. dropout (float | None): Dropout probability used in the depthwise convolutional layers. 
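The weighting blocks follow a squeeze-and-excite pattern: pool to 1x1, reduce channels with a ReLU conv, expand back with a Sigmoid conv, then re-weight the input. A self-contained sketch of that pattern; names and sizes are illustrative, not the module's actual API:

import torch
from torch import nn
from otx.algo.modules.conv_module import Conv2dModule

channels, ratio = 64, 16
pool = nn.AdaptiveAvgPool2d(1)
squeeze = Conv2dModule(channels, channels // ratio, 1, act_cfg={"type": "ReLU"})
excite = Conv2dModule(channels // ratio, channels, 1, act_cfg={"type": "Sigmoid"})

x = torch.rand(2, channels, 28, 28)
w = excite(squeeze(pool(x)))   # (2, 64, 1, 1) channel weights in [0, 1]
out = x * w                    # same shape as x, channels re-weighted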
@@ -434,19 +408,17 @@ def __init__( self.cross_resolution_weighting = CrossResolutionWeighting( branch_channels, ratio=reduce_ratio, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, ) self.depthwise_convs = nn.ModuleList( [ - ConvModule( + Conv2dModule( channel, channel, kernel_size=dw_ksize, stride=self.stride, padding=dw_ksize // 2, groups=channel, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None, ) @@ -458,7 +430,6 @@ def __init__( spatial_weighting_module( channels=channel, ratio=4, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, enable_norm=True, ) @@ -475,7 +446,6 @@ def __init__( kernel_size=3, key_ratio=8, value_ratio=4, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, ) for channel in branch_channels @@ -528,7 +498,6 @@ def __init__( stem_channels: int, out_channels: int, expand_ratio: int, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, with_cp: bool = False, strides: tuple[int, int] = (2, 2), @@ -542,7 +511,6 @@ def __init__( stem_channels (int): Number of output channels of the stem layer. out_channels (int): Number of output channels of the backbone network. expand_ratio (int): Expansion ratio of the internal channels. - conv_cfg (dict | None): Dictionary to construct and configure convolution layers. norm_cfg (dict | None): Dictionary to construct and configure normalization layers. with_cp (bool): Use checkpointing to save memory during forward pass. num_stages (int): Number of stages in the backbone network. @@ -568,7 +536,6 @@ def __init__( self.in_channels = in_channels self.out_channels = out_channels - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.with_cp = with_cp @@ -576,26 +543,24 @@ def __init__( if input_norm: self.input_norm = nn.InstanceNorm2d(in_channels) - self.conv1 = ConvModule( + self.conv1 = Conv2dModule( in_channels=in_channels, out_channels=stem_channels, kernel_size=3, stride=strides[0], padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg={"type": "ReLU"}, ) self.conv2 = None if extra_stride: - self.conv2 = ConvModule( + self.conv2 = Conv2dModule( in_channels=stem_channels, out_channels=stem_channels, kernel_size=3, stride=2, padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg={"type": "ReLU"}, ) @@ -608,57 +573,52 @@ def __init__( inc_channels = self.out_channels - stem_channels self.branch1 = nn.Sequential( - ConvModule( + Conv2dModule( branch_channels, branch_channels, kernel_size=3, stride=strides[1], padding=1, groups=branch_channels, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None, ), - ConvModule( + Conv2dModule( branch_channels, inc_channels, kernel_size=1, stride=1, padding=0, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg={"type": "ReLU"}, ), ) - self.expand_conv = ConvModule( + self.expand_conv = Conv2dModule( branch_channels, mid_channels, kernel_size=1, stride=1, padding=0, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg={"type": "ReLU"}, ) - self.depthwise_conv = ConvModule( + self.depthwise_conv = Conv2dModule( mid_channels, mid_channels, kernel_size=3, stride=strides[1], padding=1, groups=mid_channels, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None, ) - self.linear_conv = ConvModule( + self.linear_conv = Conv2dModule( mid_channels, branch_channels if stem_channels == self.out_channels else stem_channels, kernel_size=1, stride=1, padding=0, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg={"type": "ReLU"}, ) @@ -705,7 +665,6 @@ def __init__( stem_channels: int, out_channels: int, expand_ratio: int, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, with_cp: bool = False, num_stages: 
int = 1, @@ -720,7 +679,6 @@ def __init__( stem_channels (int): Number of output channels of the stem layer. out_channels (int): Number of output channels of the backbone network. expand_ratio (int): Expansion ratio of the internal channels. - conv_cfg (dict | None): Dictionary to construct and configure convolution layers. norm_cfg (dict | None): Dictionary to construct and configure normalization layers. with_cp (bool): Use checkpointing to save memory during forward pass. num_stages (int): Number of stages in the backbone network. @@ -750,7 +708,6 @@ def __init__( self.in_channels = in_channels self.out_channels = out_channels - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.with_cp = with_cp self.num_stages = num_stages @@ -759,26 +716,24 @@ def __init__( if input_norm: self.input_norm = nn.InstanceNorm2d(in_channels) - self.conv1 = ConvModule( + self.conv1 = Conv2dModule( in_channels=in_channels, out_channels=stem_channels, kernel_size=3, stride=strides[0], padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg={"type": "ReLU"}, ) self.conv2 = None if extra_stride: - self.conv2 = ConvModule( + self.conv2 = Conv2dModule( in_channels=stem_channels, out_channels=stem_channels, kernel_size=3, stride=2, padding=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg={"type": "ReLU"}, ) @@ -791,24 +746,22 @@ def __init__( for stage in range(1, num_stages + 1): self.branch1.append( nn.Sequential( - ConvModule( + Conv2dModule( internal_branch_channels, internal_branch_channels, kernel_size=3, stride=strides[stage], padding=1, groups=internal_branch_channels, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None, ), - ConvModule( + Conv2dModule( internal_branch_channels, out_branch_channels if stage == num_stages else internal_branch_channels, kernel_size=1, stride=1, padding=0, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg={"type": "ReLU"}, ), @@ -817,34 +770,31 @@ def __init__( self.branch2.append( nn.Sequential( - ConvModule( + Conv2dModule( internal_branch_channels, mid_channels, kernel_size=1, stride=1, padding=0, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg={"type": "ReLU"}, ), - ConvModule( + Conv2dModule( mid_channels, mid_channels, kernel_size=3, stride=strides[stage], padding=1, groups=mid_channels, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None, ), - ConvModule( + Conv2dModule( mid_channels, out_branch_channels if stage == num_stages else internal_branch_channels, kernel_size=1, stride=1, padding=0, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg={"type": "ReLU"}, ), @@ -893,7 +843,6 @@ def __init__( in_channels: int, out_channels: int, stride: int = 1, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, act_cfg: dict | None = None, with_cp: bool = False, @@ -904,8 +853,6 @@ def __init__( in_channels (int): The input channels of the block. out_channels (int): The output channels of the block. stride (int): Stride of the 3x3 convolution layer. Default: 1 - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN'). act_cfg (dict): Config dict for activation layer. 
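The stem's second branch chains three Conv2dModules into a ShuffleNetV2-style unit: a 1x1 expansion, a stride-2 depthwise 3x3, and a 1x1 projection back. A rough sketch with illustrative channel sizes (the real values come from the stem config):

import torch
from torch import nn
from otx.algo.modules.conv_module import Conv2dModule

branch = nn.Sequential(
    Conv2dModule(32, 64, 1, norm_cfg={"type": "BN"}, act_cfg={"type": "ReLU"}),                     # expand
    Conv2dModule(64, 64, 3, stride=2, padding=1, groups=64, norm_cfg={"type": "BN"}, act_cfg=None),  # depthwise
    Conv2dModule(64, 32, 1, norm_cfg={"type": "BN"}, act_cfg={"type": "ReLU"}),                      # project
)
y = branch(torch.rand(1, 32, 56, 56))   # (1, 32, 28, 28): spatial halved by the stride-2 depthwise conv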
@@ -935,58 +882,53 @@ def __init__( if self.stride > 1: self.branch1 = nn.Sequential( - ConvModule( + Conv2dModule( in_channels, in_channels, kernel_size=3, stride=self.stride, padding=1, groups=in_channels, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None, ), - ConvModule( + Conv2dModule( in_channels, branch_features, kernel_size=1, stride=1, padding=0, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ), ) self.branch2 = nn.Sequential( - ConvModule( + Conv2dModule( in_channels if (self.stride > 1) else branch_features, branch_features, kernel_size=1, stride=1, padding=0, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ), - ConvModule( + Conv2dModule( branch_features, branch_features, kernel_size=3, stride=self.stride, padding=1, groups=branch_features, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None, ), - ConvModule( + Conv2dModule( branch_features, branch_features, kernel_size=1, stride=1, padding=0, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, ), @@ -1019,7 +961,6 @@ def __init__( module_type: str, multiscale_output: bool = False, with_fuse: bool = True, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, with_cp: bool = False, dropout: float | None = None, @@ -1036,7 +977,6 @@ def __init__( module_type (str): Type of module to use for the network. Can be "LITE" or "NAIVE". multiscale_output (bool, optional): Whether to output features from all branches. Defaults to False. with_fuse (bool, optional): Whether to use the fuse layer. Defaults to True. - conv_cfg (dict, optional): Configuration for the convolutional layers. Defaults to None. norm_cfg (dict, optional): Configuration for the normalization layers. Defaults to None. with_cp (bool, optional): Whether to use checkpointing. Defaults to False. dropout (float, optional): Dropout rate. Defaults to None. 
@@ -1056,7 +996,6 @@ def __init__( self.multiscale_output = multiscale_output self.with_fuse = with_fuse self.norm_cfg = norm_cfg - self.conv_cfg = conv_cfg self.with_cp = with_cp self.weighting_module_version = weighting_module_version self.neighbour_weighting = neighbour_weighting @@ -1089,7 +1028,6 @@ def _make_weighting_blocks( self.in_channels, stride=stride, reduce_ratio=reduce_ratio, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, with_cp=self.with_cp, dropout=dropout, @@ -1108,7 +1046,6 @@ def _make_one_branch(self, branch_index: int, num_blocks: int, stride: int = 1) self.in_channels[branch_index], self.in_channels[branch_index], stride=stride, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg={"type": "ReLU"}, with_cp=self.with_cp, @@ -1118,7 +1055,6 @@ def _make_one_branch(self, branch_index: int, num_blocks: int, stride: int = 1) self.in_channels[branch_index], self.in_channels[branch_index], stride=1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg={"type": "ReLU"}, with_cp=self.with_cp, @@ -1149,8 +1085,7 @@ def _make_fuse_layers(self) -> nn.ModuleList: if j > i: fuse_layer.append( nn.Sequential( - build_conv_layer( - self.conv_cfg, + nn.Conv2d( in_channels[j], in_channels[i], kernel_size=1, @@ -1169,8 +1104,7 @@ def _make_fuse_layers(self) -> nn.ModuleList: if k == i - j - 1: conv_downsamples.append( nn.Sequential( - build_conv_layer( - self.conv_cfg, + nn.Conv2d( in_channels[j], in_channels[j], kernel_size=3, @@ -1180,8 +1114,7 @@ def _make_fuse_layers(self) -> nn.ModuleList: bias=False, ), build_norm_layer(self.norm_cfg, in_channels[j])[1], - build_conv_layer( - self.conv_cfg, + nn.Conv2d( in_channels[j], in_channels[i], kernel_size=1, @@ -1195,8 +1128,7 @@ def _make_fuse_layers(self) -> nn.ModuleList: else: conv_downsamples.append( nn.Sequential( - build_conv_layer( - self.conv_cfg, + nn.Conv2d( in_channels[j], in_channels[j], kernel_size=3, @@ -1206,8 +1138,7 @@ def _make_fuse_layers(self) -> nn.ModuleList: bias=False, ), build_norm_layer(self.norm_cfg, in_channels[j])[1], - build_conv_layer( - self.conv_cfg, + nn.Conv2d( in_channels[j], in_channels[j], kernel_size=1, @@ -1265,7 +1196,6 @@ class LiteHRNet(BaseModule): Args: extra (dict): detailed configuration for each stage of HRNet. in_channels (int): Number of input image channels. Default: 3. - conv_cfg (dict): dictionary to construct and config conv layer. norm_cfg (dict): dictionary to construct and config norm layer. norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). 
Note: Effect on Batch Norm @@ -1280,7 +1210,6 @@ def __init__( self, extra: dict, in_channels: int = 3, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, norm_eval: bool = False, with_cp: bool = False, @@ -1294,11 +1223,8 @@ def __init__( if norm_cfg is None: norm_cfg = {"type": "BN"} - if conv_cfg is None: - conv_cfg = {"type": "Conv2d"} self.extra = extra - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.norm_eval = norm_eval self.with_cp = with_cp @@ -1311,7 +1237,6 @@ def __init__( expand_ratio=self.extra["stem"]["expand_ratio"], strides=self.extra["stem"]["strides"], extra_stride=self.extra["stem"]["extra_stride"], - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, ) @@ -1351,13 +1276,12 @@ def __init__( if self.extra["out_modules"]["conv"]["enable"]: out_modules_channels = self.extra["out_modules"]["conv"]["channels"] out_modules.append( - ConvModule( + Conv2dModule( in_channels=in_modules_channels, out_channels=out_modules_channels, kernel_size=1, stride=1, padding=0, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg={"type": "ReLU"}, ), @@ -1370,7 +1294,6 @@ def __init__( key_channels=self.extra["out_modules"]["position_att"]["key_channels"], value_channels=self.extra["out_modules"]["position_att"]["value_channels"], psp_size=self.extra["out_modules"]["position_att"]["psp_size"], - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, ), ) @@ -1378,7 +1301,6 @@ def __init__( out_modules.append( LocalAttentionModule( num_channels=in_modules_channels, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, ), ) @@ -1390,24 +1312,22 @@ def __init__( self.add_stem_features = self.extra.get("add_stem_features", False) if self.add_stem_features: self.stem_transition = nn.Sequential( - ConvModule( + Conv2dModule( self.stem.out_channels, self.stem.out_channels, kernel_size=3, stride=1, padding=1, groups=self.stem.out_channels, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=None, ), - ConvModule( + Conv2dModule( self.stem.out_channels, num_channels_last[0], kernel_size=1, stride=1, padding=0, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg={"type": "ReLU"}, ), @@ -1420,7 +1340,6 @@ def __init__( self.aggregator = IterativeAggregator( in_channels=num_channels_last, min_channels=self.extra["out_aggregator"].get("min_channels", None), - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, ) @@ -1442,8 +1361,7 @@ def _make_transition_layer( if num_channels_cur_layer[i] != num_channels_pre_layer[i]: transition_layers.append( nn.Sequential( - build_conv_layer( - self.conv_cfg, + nn.Conv2d( num_channels_pre_layer[i], num_channels_pre_layer[i], kernel_size=3, @@ -1453,8 +1371,7 @@ def _make_transition_layer( bias=False, ), build_norm_layer(self.norm_cfg, num_channels_pre_layer[i])[1], - build_conv_layer( - self.conv_cfg, + nn.Conv2d( num_channels_pre_layer[i], num_channels_cur_layer[i], kernel_size=1, @@ -1475,8 +1392,7 @@ def _make_transition_layer( out_channels = num_channels_cur_layer[i] if j == i - num_branches_pre else in_channels conv_downsamples.append( nn.Sequential( - build_conv_layer( - self.conv_cfg, + nn.Conv2d( in_channels, in_channels, kernel_size=3, @@ -1486,8 +1402,7 @@ def _make_transition_layer( bias=False, ), build_norm_layer(self.norm_cfg, in_channels)[1], - build_conv_layer( - self.conv_cfg, + nn.Conv2d( in_channels, out_channels, kernel_size=1, @@ -1546,7 +1461,6 @@ def _make_stage( module_type, multiscale_output=reset_multiscale_output, with_fuse=with_fuse, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, with_cp=self.with_cp, dropout=dropout, diff --git 
a/src/otx/algo/segmentation/heads/base_segm_head.py b/src/otx/algo/segmentation/heads/base_segm_head.py index e642a7529d0..8547b0233dc 100644 --- a/src/otx/algo/segmentation/heads/base_segm_head.py +++ b/src/otx/algo/segmentation/heads/base_segm_head.py @@ -24,7 +24,6 @@ def __init__( channels: int, num_classes: int, dropout_ratio: float = 0.1, - conv_cfg: dict[str, str] | None = None, norm_cfg: dict[str, str] | None = None, act_cfg: dict[str, str] | None = None, in_index: int | list[int] = -1, @@ -40,8 +39,6 @@ def __init__( channels (int): Number of channels in the feature map. num_classes (int): Number of classes for segmentation. dropout_ratio (float, optional): The dropout ratio. Defaults to 0.1. - conv_cfg (Optional[ConfigType], optional): Config for convolution layer. - Defaults to None. norm_cfg (Optional[ConfigType], optional): Config for normalization layer. Defaults to None. act_cfg (Dict[str, Union[str, Dict]], optional): Activation config. @@ -59,7 +56,6 @@ def __init__( self.num_classes = num_classes self.input_transform = input_transform self.dropout_ratio = dropout_ratio - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg if self.input_transform is not None and not isinstance(in_index, list): diff --git a/src/otx/algo/segmentation/heads/fcn_head.py b/src/otx/algo/segmentation/heads/fcn_head.py index 42b8ee2ab85..c6d8316c59f 100644 --- a/src/otx/algo/segmentation/heads/fcn_head.py +++ b/src/otx/algo/segmentation/heads/fcn_head.py @@ -10,7 +10,7 @@ import torch from torch import Tensor, nn -from otx.algo.modules import ConvModule +from otx.algo.modules import Conv2dModule from otx.algo.segmentation.modules import IterativeAggregator from .base_segm_head import BaseSegmHead @@ -34,7 +34,6 @@ def __init__( in_channels: list[int] | int, in_index: list[int] | int, norm_cfg: dict[str, Any] | None = None, - conv_cfg: dict[str, Any] | None = None, input_transform: str | None = None, num_convs: int = 2, kernel_size: int = 3, @@ -73,7 +72,6 @@ def __init__( aggregator = IterativeAggregator( in_channels=in_channels, min_channels=aggregator_min_channels, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, merge_norm=aggregator_merge_norm, use_concat=aggregator_use_concat, @@ -91,7 +89,6 @@ def __init__( super().__init__( in_index=in_index, norm_cfg=norm_cfg, - conv_cfg=conv_cfg, input_transform=input_transform, in_channels=in_channels, **kwargs, @@ -105,7 +102,7 @@ def __init__( conv_padding = (kernel_size // 2) * dilation convs = [ - ConvModule( + Conv2dModule( self.in_channels, self.channels, kernel_size=kernel_size, @@ -117,7 +114,7 @@ def __init__( ] convs.extend( [ - ConvModule( + Conv2dModule( self.channels, self.channels, kernel_size=kernel_size, @@ -134,12 +131,11 @@ def __init__( else: self.convs = nn.Sequential(*convs) if self.concat_input: - self.conv_cat = ConvModule( + self.conv_cat = Conv2dModule( self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, ) diff --git a/src/otx/algo/segmentation/heads/ham_head.py b/src/otx/algo/segmentation/heads/ham_head.py index 68c7b006cba..52f789808b0 100644 --- a/src/otx/algo/segmentation/heads/ham_head.py +++ b/src/otx/algo/segmentation/heads/ham_head.py @@ -11,7 +11,7 @@ import torch.nn.functional as f from torch import nn -from otx.algo.modules import ConvModule +from otx.algo.modules import Conv2dModule from otx.algo.segmentation.modules import resize from .base_segm_head import BaseSegmHead @@ -45,11 +45,11 
@@ def __init__( """ super().__init__() - self.ham_in = ConvModule(ham_channels, ham_channels, 1, norm_cfg=None, act_cfg=None) + self.ham_in = Conv2dModule(ham_channels, ham_channels, 1, norm_cfg=None, act_cfg=None) self.ham = NMF2D(ham_channels=ham_channels, **ham_kwargs) - self.ham_out = ConvModule(ham_channels, ham_channels, 1, norm_cfg=norm_cfg, act_cfg=None) + self.ham_out = Conv2dModule(ham_channels, ham_channels, 1, norm_cfg=norm_cfg, act_cfg=None) def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward.""" @@ -97,22 +97,20 @@ def __init__( self.ham_channels: int = ham_channels self.ham_kwargs: dict[str, Any] = ham_kwargs if ham_kwargs is not None else {} - self.squeeze = ConvModule( + self.squeeze = Conv2dModule( sum(self.in_channels), self.ham_channels, 1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, ) self.hamburger = Hamburger(self.ham_channels, ham_kwargs=self.ham_kwargs, **kwargs) - self.align = ConvModule( + self.align = Conv2dModule( self.ham_channels, self.channels, 1, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, ) diff --git a/src/otx/algo/segmentation/modules/aggregators.py b/src/otx/algo/segmentation/modules/aggregators.py index b5143255c66..bff23694b50 100644 --- a/src/otx/algo/segmentation/modules/aggregators.py +++ b/src/otx/algo/segmentation/modules/aggregators.py @@ -9,7 +9,7 @@ from torch import nn from torch.nn import functional as f -from otx.algo.modules import ConvModule, DepthwiseSeparableConvModule +from otx.algo.modules import Conv2dModule, DepthwiseSeparableConvModule from .utils import normalize @@ -24,7 +24,6 @@ def __init__( self, in_channels: list[int], min_channels: int | None = None, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, merge_norm: str | None = None, use_concat: bool = False, @@ -34,7 +33,6 @@ def __init__( Args: in_channels (list[int]): List of input channels for each branch. min_channels (int | None): Minimum number of channels. Defaults to None. - conv_cfg (dict | None): Config for convolution layers. Defaults to None. norm_cfg (dict | None): Config for normalization layers. Defaults to None. merge_norm (str | None): Whether to merge normalization layers. Defaults to None. use_concat (bool): Whether to use concatenation. Defaults to False. 
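IterativeAggregator builds its projection layers from DepthwiseSeparableConvModule, which now lives next to Conv2dModule in conv_module.py. A small sketch, with illustrative sizes, of the factorization it performs: a per-channel 3x3 depthwise conv followed by a 1x1 pointwise conv, matching a plain 3x3 conv's output shape with far fewer weights.

import torch
from otx.algo.modules import DepthwiseSeparableConvModule

m = DepthwiseSeparableConvModule(64, 128, 3, padding=1, norm_cfg={"type": "BN"})
y = m(torch.rand(1, 64, 56, 56))           # (1, 128, 56, 56), same as a plain 3x3 conv
print(m.depthwise_conv.conv.weight.shape)  # torch.Size([64, 1, 3, 3])
print(m.pointwise_conv.conv.weight.shape)  # torch.Size([128, 64, 1, 1])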
@@ -44,8 +42,6 @@ def __init__( """ if norm_cfg is None: norm_cfg = {"type": "BN"} - if conv_cfg is None: - conv_cfg = {"type": "Conv2d"} super().__init__() @@ -57,8 +53,8 @@ def __init__( min_channels = min_channels if min_channels is not None else 0 projects: list[DepthwiseSeparableConvModule | None] = [] - expanders: list[ConvModule | None] = [] - fuse_layers: list[ConvModule | None] = [] + expanders: list[Conv2dModule | None] = [] + fuse_layers: list[Conv2dModule | None] = [] for i in range(num_branches): if not self.use_concat or i == 0: @@ -66,12 +62,11 @@ def __init__( else: out_channels = self.in_channels[i + 1] fuse_layers.append( - ConvModule( + Conv2dModule( in_channels=2 * out_channels, out_channels=out_channels, kernel_size=1, stride=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg={"type": "ReLU"}, ), @@ -89,7 +84,6 @@ def __init__( kernel_size=3, stride=1, padding=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg={"type": "ReLU"}, dw_act_cfg=None, @@ -99,12 +93,11 @@ def __init__( if self.in_channels[i] < min_channels: expanders.append( - ConvModule( + Conv2dModule( in_channels=self.in_channels[i], out_channels=min_channels, kernel_size=1, stride=1, - conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg={"type": "ReLU"}, ), diff --git a/src/otx/algo/segmentation/modules/blocks.py b/src/otx/algo/segmentation/modules/blocks.py index c6dddd0ad90..86d049b1da4 100644 --- a/src/otx/algo/segmentation/modules/blocks.py +++ b/src/otx/algo/segmentation/modules/blocks.py @@ -12,7 +12,7 @@ from torch import nn from torch.nn import AdaptiveAvgPool2d, AdaptiveMaxPool2d -from otx.algo.modules import ConvModule +from otx.algo.modules import Conv2dModule class PSPModule(nn.Module): @@ -54,7 +54,6 @@ def __init__( key_channels: int, value_channels: int | None = None, psp_size: tuple | None = None, - conv_cfg: dict | None = None, norm_cfg: dict | None = None, ): super().__init__() @@ -62,43 +61,39 @@ def __init__( self.in_channels = in_channels self.key_channels = key_channels self.value_channels = value_channels if value_channels is not None else in_channels - self.conv_cfg = conv_cfg if norm_cfg is None: norm_cfg = {"type": "BN"} if psp_size is None: psp_size = (1, 3, 6, 8) self.norm_cfg = norm_cfg - self.query_key = ConvModule( + self.query_key = Conv2dModule( in_channels=self.in_channels, out_channels=self.key_channels, kernel_size=1, stride=1, padding=0, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg={"type": "ReLU"}, ) self.key_psp = PSPModule(psp_size, method="max") - self.value = ConvModule( + self.value = Conv2dModule( in_channels=self.in_channels, out_channels=self.value_channels, kernel_size=1, stride=1, padding=0, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg={"type": "ReLU"}, ) self.value_psp = PSPModule(psp_size, method="max") - self.out_conv = ConvModule( + self.out_conv = Conv2dModule( in_channels=self.value_channels, out_channels=self.in_channels, kernel_size=1, stride=1, padding=0, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=None, ) @@ -160,45 +155,41 @@ class LocalAttentionModule(nn.Module): Reference: https://github.com/lxtGH/GALD-DGCNet. 
""" - def __init__(self, num_channels: int, conv_cfg: dict | None = None, norm_cfg: dict | None = None): + def __init__(self, num_channels: int, norm_cfg: dict | None = None): if norm_cfg is None: norm_cfg = {"type": "BN"} super().__init__() self.num_channels = num_channels - self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg - self.dwconv1 = ConvModule( + self.dwconv1 = Conv2dModule( in_channels=self.num_channels, out_channels=self.num_channels, kernel_size=3, stride=2, padding=1, groups=self.num_channels, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg={"type": "ReLU"}, ) - self.dwconv2 = ConvModule( + self.dwconv2 = Conv2dModule( in_channels=self.num_channels, out_channels=self.num_channels, kernel_size=3, stride=2, padding=1, groups=self.num_channels, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg={"type": "ReLU"}, ) - self.dwconv3 = ConvModule( + self.dwconv3 = Conv2dModule( in_channels=self.num_channels, out_channels=self.num_channels, kernel_size=3, stride=2, padding=1, groups=self.num_channels, - conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg={"type": "ReLU"}, ) diff --git a/tests/unit/algo/detection/heads/test_yolox_head.py b/tests/unit/algo/detection/heads/test_yolox_head.py index 62ef5a4e6e8..36d190fb74c 100644 --- a/tests/unit/algo/detection/heads/test_yolox_head.py +++ b/tests/unit/algo/detection/heads/test_yolox_head.py @@ -10,8 +10,7 @@ from omegaconf import DictConfig from otx.algo.detection.heads import YOLOXHead from otx.algo.detection.utils.assigners import SimOTAAssigner -from otx.algo.modules.conv_module import ConvModule -from otx.algo.modules.depthwise_separable_conv_module import DepthwiseSeparableConvModule +from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule from otx.algo.utils.mmengine_utils import InstanceData @@ -52,7 +51,7 @@ def test_loss_by_feat(self): } head = YOLOXHead(num_classes=4, in_channels=1, stacked_convs=1, use_depthwise=False, train_cfg=train_cfg) assert not head.use_l1 - assert isinstance(head.multi_level_cls_convs[0][0], ConvModule) + assert isinstance(head.multi_level_cls_convs[0][0], Conv2dModule) feat = [torch.rand(1, 1, s // feat_size, s // feat_size) for feat_size in [4, 8, 16]] cls_scores, bbox_preds, objectnesses = head.forward(feat) diff --git a/tests/unit/algo/detection/layers/test_csp_layer.py b/tests/unit/algo/detection/layers/test_csp_layer.py index 22e1aa1730c..5e3fe06bf0f 100644 --- a/tests/unit/algo/detection/layers/test_csp_layer.py +++ b/tests/unit/algo/detection/layers/test_csp_layer.py @@ -6,8 +6,7 @@ from otx.algo.detection.layers import ChannelAttention from otx.algo.detection.layers.csp_layer import CSPLayer, CSPNeXtBlock, DarknetBottleneck from otx.algo.modules.activation import Swish -from otx.algo.modules.conv_module import ConvModule -from otx.algo.modules.depthwise_separable_conv_module import DepthwiseSeparableConvModule +from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule from torch.nn import BatchNorm2d, Conv2d @@ -17,7 +16,7 @@ def test_init(self) -> None: csp_layer = CSPLayer(3, 5) assert isinstance(csp_layer.blocks[0], DarknetBottleneck) - assert isinstance(csp_layer.blocks[0].conv2, ConvModule) + assert isinstance(csp_layer.blocks[0].conv2, Conv2dModule) assert isinstance(csp_layer.blocks[0].conv1.conv, Conv2d) assert isinstance(csp_layer.blocks[0].conv1.bn, BatchNorm2d) assert isinstance(csp_layer.blocks[0].conv1.activate, Swish) diff --git a/tests/unit/algo/detection/necks/test_yolox_pafpn.py 
b/tests/unit/algo/detection/necks/test_yolox_pafpn.py index 29b0cb17554..fb4fd4886a9 100644 --- a/tests/unit/algo/detection/necks/test_yolox_pafpn.py +++ b/tests/unit/algo/detection/necks/test_yolox_pafpn.py @@ -8,7 +8,7 @@ import torch from otx.algo.detection.necks.yolox_pafpn import YOLOXPAFPN -from otx.algo.modules.depthwise_separable_conv_module import DepthwiseSeparableConvModule +from otx.algo.modules.conv_module import DepthwiseSeparableConvModule class TestYOLOXPAFPN: diff --git a/tests/unit/algo/modules/test_conv.py b/tests/unit/algo/modules/test_conv.py deleted file mode 100644 index 04339449a5d..00000000000 --- a/tests/unit/algo/modules/test_conv.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import pytest -from otx.algo.modules.conv import build_conv_layer -from torch import nn - - -def test_build_conv_layer(): - cfg = {"type": "Conv1d"} - conv = build_conv_layer(cfg, in_channels=1, out_channels=1, kernel_size=1) - assert isinstance(conv, nn.Conv1d) - - cfg = {"type": "Conv2d"} - conv = build_conv_layer(cfg, in_channels=1, out_channels=1, kernel_size=1) - assert isinstance(conv, nn.Conv2d) - - cfg = {"type": "Conv3d"} - conv = build_conv_layer(cfg, in_channels=1, out_channels=1, kernel_size=1) - assert isinstance(conv, nn.Conv3d) - - cfg = {"type": "Conv"} - conv = build_conv_layer(cfg, in_channels=1, out_channels=1, kernel_size=1) - assert isinstance(conv, nn.Conv2d) - - with pytest.raises(TypeError): - build_conv_layer(None) - - with pytest.raises(KeyError, match='the cfg dict must contain the key "type"'): - build_conv_layer({"cfg": 1}) - - with pytest.raises(KeyError, match="Cannot find"): - build_conv_layer({"type": "None"}) diff --git a/tests/unit/algo/modules/test_conv_module.py b/tests/unit/algo/modules/test_conv_module.py index 26dc595a914..be0f8e34463 100644 --- a/tests/unit/algo/modules/test_conv_module.py +++ b/tests/unit/algo/modules/test_conv_module.py @@ -2,32 +2,26 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) OpenMMLab. All rights reserved. 
# https://github.com/open-mmlab/mmcv/blob/main/tests/test_cnn/test_conv_module.py -from unittest.mock import patch import pytest import torch -from otx.algo.modules.conv_module import ConvModule +from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule from torch import nn def test_conv_module(): - conv_cfg = "conv" - with pytest.raises(AssertionError): - # conv_cfg must be a dict or None - ConvModule(3, 8, 2, conv_cfg=conv_cfg) - norm_cfg = "norm" with pytest.raises(AssertionError): # norm_cfg must be a dict or None - ConvModule(3, 8, 2, norm_cfg=norm_cfg) + Conv2dModule(3, 8, 2, norm_cfg=norm_cfg) act_cfg = {"type": "softmax"} with pytest.raises(KeyError): # softmax is not supported - ConvModule(3, 8, 2, act_cfg=act_cfg) + Conv2dModule(3, 8, 2, act_cfg=act_cfg) # conv + norm + act - conv = ConvModule(3, 8, 2, norm_cfg={"type": "BN"}) + conv = Conv2dModule(3, 8, 2, norm_cfg={"type": "BN"}) assert conv.with_activation assert hasattr(conv, "activate") assert conv.with_norm @@ -36,26 +30,8 @@ def test_conv_module(): output = conv(x) assert output.shape == (1, 8, 255, 255) - # conv + norm with efficient mode - efficient_conv = ConvModule(3, 8, 2, norm_cfg={"type": "BN"}, efficient_conv_bn_eval=True).eval() - plain_conv = ConvModule(3, 8, 2, norm_cfg={"type": "BN"}, efficient_conv_bn_eval=False).eval() - for efficient_param, plain_param in zip(efficient_conv.state_dict().values(), plain_conv.state_dict().values()): - plain_param.copy_(efficient_param) - - efficient_mode_output = efficient_conv(x) - plain_mode_output = plain_conv(x) - assert torch.allclose(efficient_mode_output, plain_mode_output, atol=1e-5) - - # `conv` attribute can be dynamically modified in efficient mode - efficient_conv = ConvModule(3, 8, 2, norm_cfg={"type": "BN"}, efficient_conv_bn_eval=True).eval() - new_conv = nn.Conv2d(3, 8, 2).eval() - efficient_conv.conv = new_conv - efficient_mode_output = efficient_conv(x) - plain_mode_output = efficient_conv.activate(efficient_conv.norm_layer(new_conv(x))) - assert torch.allclose(efficient_mode_output, plain_mode_output, atol=1e-5) - # conv + act - conv = ConvModule(3, 8, 2) + conv = Conv2dModule(3, 8, 2) assert conv.with_activation assert hasattr(conv, "activate") assert not conv.with_norm @@ -65,7 +41,7 @@ def test_conv_module(): assert output.shape == (1, 8, 255, 255) # conv - conv = ConvModule(3, 8, 2, act_cfg=None) + conv = Conv2dModule(3, 8, 2, act_cfg=None) assert not conv.with_norm assert conv.norm_layer is None assert not conv.with_activation @@ -74,46 +50,46 @@ def test_conv_module(): output = conv(x) assert output.shape == (1, 8, 255, 255) - conv = ConvModule(3, 8, 3, padding=1, with_spectral_norm=True) + conv = Conv2dModule(3, 8, 3, padding=1, with_spectral_norm=True) assert hasattr(conv.conv, "weight_orig") output = conv(x) assert output.shape == (1, 8, 256, 256) - conv = ConvModule(3, 8, 3, padding=1, padding_mode="reflect") + conv = Conv2dModule(3, 8, 3, padding=1, padding_mode="reflect") assert isinstance(conv.padding_layer, nn.ReflectionPad2d) output = conv(x) assert output.shape == (1, 8, 256, 256) # non-existing padding mode with pytest.raises(KeyError): - conv = ConvModule(3, 8, 3, padding=1, padding_mode="non_exists") + conv = Conv2dModule(3, 8, 3, padding=1, padding_mode="non_exists") # leaky relu - conv = ConvModule(3, 8, 3, padding=1, act_cfg={"type": "LeakyReLU"}) + conv = Conv2dModule(3, 8, 3, padding=1, act_cfg={"type": "LeakyReLU"}) assert isinstance(conv.activate, nn.LeakyReLU) output = conv(x) assert output.shape == 
(1, 8, 256, 256) # tanh - conv = ConvModule(3, 8, 3, padding=1, act_cfg={"type": "Tanh"}) + conv = Conv2dModule(3, 8, 3, padding=1, act_cfg={"type": "Tanh"}) assert isinstance(conv.activate, nn.Tanh) output = conv(x) assert output.shape == (1, 8, 256, 256) # Sigmoid - conv = ConvModule(3, 8, 3, padding=1, act_cfg={"type": "Sigmoid"}) + conv = Conv2dModule(3, 8, 3, padding=1, act_cfg={"type": "Sigmoid"}) assert isinstance(conv.activate, nn.Sigmoid) output = conv(x) assert output.shape == (1, 8, 256, 256) # PReLU - conv = ConvModule(3, 8, 3, padding=1, act_cfg={"type": "PReLU"}) + conv = Conv2dModule(3, 8, 3, padding=1, act_cfg={"type": "PReLU"}) assert isinstance(conv.activate, nn.PReLU) output = conv(x) assert output.shape == (1, 8, 256, 256) # Test norm layer with name - conv = ConvModule(3, 8, 2, norm_cfg={"type": "BN", "name": "some_norm_layer"}) + conv = Conv2dModule(3, 8, 2, norm_cfg={"type": "BN", "name": "some_norm_layer"}) assert conv.norm_layer.__class__.__name__ == "BatchNorm2d" assert conv.norm_name == "some_norm_layer" assert hasattr(conv, "norm_layer") @@ -124,78 +100,108 @@ def test_conv_module(): def test_bias(): # bias: auto, without norm - conv = ConvModule(3, 8, 2) + conv = Conv2dModule(3, 8, 2) assert conv.conv.bias is not None # bias: auto, with norm - conv = ConvModule(3, 8, 2, norm_cfg={"type": "BN"}) + conv = Conv2dModule(3, 8, 2, norm_cfg={"type": "BN"}) assert conv.conv.bias is None # bias: False, without norm - conv = ConvModule(3, 8, 2, bias=False) + conv = Conv2dModule(3, 8, 2, bias=False) assert conv.conv.bias is None # bias: True, with batch norm with pytest.warns(UserWarning) as record: - ConvModule(3, 8, 2, bias=True, norm_cfg={"type": "BN"}) + Conv2dModule(3, 8, 2, bias=True, norm_cfg={"type": "BN"}) assert len(record) == 1 assert record[0].message.args[0] == "Unnecessary conv bias before batch/instance norm" # bias: True, with instance norm with pytest.warns(UserWarning) as record: - ConvModule(3, 8, 2, bias=True, norm_cfg={"type": "IN"}) + Conv2dModule(3, 8, 2, bias=True, norm_cfg={"type": "IN"}) assert len(record) == 1 assert record[0].message.args[0] == "Unnecessary conv bias before batch/instance norm" -def conv_forward(self, x): - return x + "_conv" - - -def bn_forward(self, x): - return x + "_bn" - - -def relu_forward(self, x): - return x + "_relu" - - -@patch("torch.nn.ReLU.forward", relu_forward) -@patch("torch.nn.BatchNorm2d.forward", bn_forward) -@patch("torch.nn.Conv2d.forward", conv_forward) -def test_order(): - order = ["conv", "norm", "act"] - with pytest.raises(AssertionError): - # order must be a tuple - ConvModule(3, 8, 2, order=order) - - order = ("conv", "norm") - with pytest.raises(AssertionError): - # length of order must be 3 - ConvModule(3, 8, 2, order=order) - - order = ("conv", "norm", "norm") - with pytest.raises(AssertionError): - # order must be an order of 'conv', 'norm', 'act' - ConvModule(3, 8, 2, order=order) - - order = ("conv", "norm", "something") - with pytest.raises(AssertionError): - # order must be an order of 'conv', 'norm', 'act' - ConvModule(3, 8, 2, order=order) - - conv = ConvModule(3, 8, 2, norm_cfg={"type": "BN"}) - out = conv("input") - assert out == "input_conv_bn_relu" - - conv = ConvModule(3, 8, 2, norm_cfg={"type": "BN"}, order=("norm", "conv", "act")) - out = conv("input") - assert out == "input_bn_conv_relu" - - conv = ConvModule(3, 8, 2, norm_cfg={"type": "BN"}) - out = conv("input", activate=False) - assert out == "input_conv_bn" - - conv = ConvModule(3, 8, 2, norm_cfg={"type": "BN"}) - out = 
conv("input", norm=False) - assert out == "input_conv_relu" +class TestDepthwiseSeparableConvModule: + def test_forward_with_default_config(self) -> None: + # test default config + conv = DepthwiseSeparableConvModule(3, 8, 2) + assert conv.depthwise_conv.conv.groups == 3 + assert conv.pointwise_conv.conv.kernel_size == (1, 1) + assert not conv.depthwise_conv.with_norm + assert not conv.pointwise_conv.with_norm + assert conv.depthwise_conv.activate.__class__.__name__ == "ReLU" + assert conv.pointwise_conv.activate.__class__.__name__ == "ReLU" + x = torch.rand(1, 3, 256, 256) + output = conv(x) + assert output.shape == (1, 8, 255, 255) + + def test_forward_with_dw_norm_cfg(self) -> None: + # test dw_norm_cfg + conv = DepthwiseSeparableConvModule(3, 8, 2, dw_norm_cfg={"type": "BN"}) + assert conv.depthwise_conv.norm_name == "bn" + assert not conv.pointwise_conv.with_norm + x = torch.rand(1, 3, 256, 256) + output = conv(x) + assert output.shape == (1, 8, 255, 255) + + def test_forward_with_pw_norm_cfg(self) -> None: + # test pw_norm_cfg + conv = DepthwiseSeparableConvModule(3, 8, 2, pw_norm_cfg={"type": "BN"}) + assert not conv.depthwise_conv.with_norm + assert conv.pointwise_conv.norm_name == "bn" + x = torch.rand(1, 3, 256, 256) + output = conv(x) + assert output.shape == (1, 8, 255, 255) + + def test_forward_with_norm_cfg(self) -> None: + # test norm_cfg + conv = DepthwiseSeparableConvModule(3, 8, 2, norm_cfg={"type": "BN"}) + assert conv.depthwise_conv.norm_name == "bn" + assert conv.pointwise_conv.norm_name == "bn" + x = torch.rand(1, 3, 256, 256) + output = conv(x) + assert output.shape == (1, 8, 255, 255) + + def test_forward_with_spectral_norm_padding_mode(self) -> None: + x = torch.rand(1, 3, 256, 256) + + conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, with_spectral_norm=True) + assert hasattr(conv.depthwise_conv.conv, "weight_orig") + assert hasattr(conv.pointwise_conv.conv, "weight_orig") + output = conv(x) + assert output.shape == (1, 8, 256, 256) + + conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, padding_mode="reflect") + assert isinstance(conv.depthwise_conv.padding_layer, nn.ReflectionPad2d) + output = conv(x) + assert output.shape == (1, 8, 256, 256) + + def test_forward_with_dw_act_cfg(self) -> None: + # test dw_act_cfg + conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, dw_act_cfg={"type": "LeakyReLU"}) + x = torch.rand(1, 3, 256, 256) + assert conv.depthwise_conv.activate.__class__.__name__ == "LeakyReLU" + assert conv.pointwise_conv.activate.__class__.__name__ == "ReLU" + output = conv(x) + assert output.shape == (1, 8, 256, 256) + + def test_forward_with_pw_act_cfg(self) -> None: + # test pw_act_cfg + conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, pw_act_cfg={"type": "LeakyReLU"}) + x = torch.rand(1, 3, 256, 256) + assert conv.depthwise_conv.activate.__class__.__name__ == "ReLU" + assert conv.pointwise_conv.activate.__class__.__name__ == "LeakyReLU" + output = conv(x) + assert output.shape == (1, 8, 256, 256) + + def test_forward_with_act_cfg(self) -> None: + # test act_cfg + conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, act_cfg={"type": "LeakyReLU"}) + x = torch.rand(1, 3, 256, 256) + assert conv.depthwise_conv.activate.__class__.__name__ == "LeakyReLU" + assert conv.pointwise_conv.activate.__class__.__name__ == "LeakyReLU" + output = conv(x) + assert output.shape == (1, 8, 256, 256) diff --git a/tests/unit/algo/modules/test_depthwise_separable_conv_module.py b/tests/unit/algo/modules/test_depthwise_separable_conv_module.py 
deleted file mode 100644 index b8265dea94d..00000000000 --- a/tests/unit/algo/modules/test_depthwise_separable_conv_module.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# Copyright (c) OpenMMLab. All rights reserved. -"""Test of DepthwiseSeparableConvModule. - -Reference: https://github.com/open-mmlab/mmcv/blob/main/tests/test_cnn/test_depthwise_seperable_conv_module.py -""" - -import pytest -import torch -from otx.algo.modules.depthwise_separable_conv_module import DepthwiseSeparableConvModule -from torch import nn - - -class TestDepthwiseSeparableConvModule: - def test_init_with_non_dict_conv_cfg(self) -> None: - # conv_cfg must be a dict or None - with pytest.raises(AssertionError): - DepthwiseSeparableConvModule(4, 8, 2, groups=2) - - def test_forward_with_default_config(self) -> None: - # test default config - conv = DepthwiseSeparableConvModule(3, 8, 2) - assert conv.depthwise_conv.conv.groups == 3 - assert conv.pointwise_conv.conv.kernel_size == (1, 1) - assert not conv.depthwise_conv.with_norm - assert not conv.pointwise_conv.with_norm - assert conv.depthwise_conv.activate.__class__.__name__ == "ReLU" - assert conv.pointwise_conv.activate.__class__.__name__ == "ReLU" - x = torch.rand(1, 3, 256, 256) - output = conv(x) - assert output.shape == (1, 8, 255, 255) - - def test_forward_with_dw_norm_cfg(self) -> None: - # test dw_norm_cfg - conv = DepthwiseSeparableConvModule(3, 8, 2, dw_norm_cfg={"type": "BN"}) - assert conv.depthwise_conv.norm_name == "bn" - assert not conv.pointwise_conv.with_norm - x = torch.rand(1, 3, 256, 256) - output = conv(x) - assert output.shape == (1, 8, 255, 255) - - def test_forward_with_pw_norm_cfg(self) -> None: - # test pw_norm_cfg - conv = DepthwiseSeparableConvModule(3, 8, 2, pw_norm_cfg={"type": "BN"}) - assert not conv.depthwise_conv.with_norm - assert conv.pointwise_conv.norm_name == "bn" - x = torch.rand(1, 3, 256, 256) - output = conv(x) - assert output.shape == (1, 8, 255, 255) - - def test_forward_with_norm_cfg(self) -> None: - # test norm_cfg - conv = DepthwiseSeparableConvModule(3, 8, 2, norm_cfg={"type": "BN"}) - assert conv.depthwise_conv.norm_name == "bn" - assert conv.pointwise_conv.norm_name == "bn" - x = torch.rand(1, 3, 256, 256) - output = conv(x) - assert output.shape == (1, 8, 255, 255) - - def test_forward_for_order_with_norm_conv_act(self) -> None: - # add test for ['norm', 'conv', 'act'] - conv = DepthwiseSeparableConvModule(3, 8, 2, order=("norm", "conv", "act")) - x = torch.rand(1, 3, 256, 256) - output = conv(x) - assert output.shape == (1, 8, 255, 255) - - conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, with_spectral_norm=True) - assert hasattr(conv.depthwise_conv.conv, "weight_orig") - assert hasattr(conv.pointwise_conv.conv, "weight_orig") - output = conv(x) - assert output.shape == (1, 8, 256, 256) - - conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, padding_mode="reflect") - assert isinstance(conv.depthwise_conv.padding_layer, nn.ReflectionPad2d) - output = conv(x) - assert output.shape == (1, 8, 256, 256) - - def test_forward_with_dw_act_cfg(self) -> None: - # test dw_act_cfg - conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, dw_act_cfg={"type": "LeakyReLU"}) - x = torch.rand(1, 3, 256, 256) - assert conv.depthwise_conv.activate.__class__.__name__ == "LeakyReLU" - assert conv.pointwise_conv.activate.__class__.__name__ == "ReLU" - output = conv(x) - assert output.shape == (1, 8, 256, 256) - - def 
test_forward_with_pw_act_cfg(self) -> None: - # test pw_act_cfg - conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, pw_act_cfg={"type": "LeakyReLU"}) - x = torch.rand(1, 3, 256, 256) - assert conv.depthwise_conv.activate.__class__.__name__ == "ReLU" - assert conv.pointwise_conv.activate.__class__.__name__ == "LeakyReLU" - output = conv(x) - assert output.shape == (1, 8, 256, 256) - - def test_forward_with_act_cfg(self) -> None: - # test act_cfg - conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, act_cfg={"type": "LeakyReLU"}) - x = torch.rand(1, 3, 256, 256) - assert conv.depthwise_conv.activate.__class__.__name__ == "LeakyReLU" - assert conv.pointwise_conv.activate.__class__.__name__ == "LeakyReLU" - output = conv(x) - assert output.shape == (1, 8, 256, 256) diff --git a/tests/unit/algo/modules/test_norm.py b/tests/unit/algo/modules/test_norm.py index f95dfbd8814..8debc2d9a07 100644 --- a/tests/unit/algo/modules/test_norm.py +++ b/tests/unit/algo/modules/test_norm.py @@ -6,7 +6,7 @@ from torch import nn -def test_build_conv_layer(): +def test_build_norm_layer(): cfg = {"type": "BN"} name, norm = build_norm_layer(cfg, num_features=1) assert isinstance(norm, nn.BatchNorm2d) diff --git a/tests/unit/algo/segmentation/modules/test_blokcs.py b/tests/unit/algo/segmentation/modules/test_blokcs.py index 1c6d517b8b2..728d85169a0 100644 --- a/tests/unit/algo/segmentation/modules/test_blokcs.py +++ b/tests/unit/algo/segmentation/modules/test_blokcs.py @@ -15,7 +15,6 @@ def init_cfg(self) -> dict[str, Any]: "key_channels": 128, "value_channels": 320, "psp_size": [1, 3, 6, 8], - "conv_cfg": {"type": "Conv2d"}, "norm_cfg": {"type": "BN"}, } @@ -25,7 +24,6 @@ def test_init(self, init_cfg): assert module.in_channels == init_cfg["in_channels"] assert module.key_channels == init_cfg["key_channels"] assert module.value_channels == init_cfg["value_channels"] - assert module.conv_cfg == init_cfg["conv_cfg"] assert module.norm_cfg == init_cfg["norm_cfg"] @pytest.fixture() @@ -44,7 +42,6 @@ class TestLocalAttentionModule: def init_cfg(self) -> dict[str, Any]: return { "num_channels": 320, - "conv_cfg": {"type": "Conv2d"}, "norm_cfg": {"type": "BN"}, } @@ -52,7 +49,6 @@ def test_init(self, init_cfg): module = LocalAttentionModule(**init_cfg) assert module.num_channels == init_cfg["num_channels"] - assert module.conv_cfg == init_cfg["conv_cfg"] assert module.norm_cfg == init_cfg["norm_cfg"] @pytest.fixture()