From 1a8e10e078c8fd99cacbdc4b2e2fa9051521fb16 Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Mon, 12 Aug 2024 10:10:33 +0900 Subject: [PATCH] Refactoring `ConvModule` by removing `act_cfg` (#3809) * Remove `act_cfg` and reflect this change to all modules * Remove `build_activation_layer` * Enable to update `inplace` * pre-commit * Update CHANGELOG * Fix unit test * pre-commit * Fix unit test * Update keypoint detection part * Fix default --------- Co-authored-by: Prokofiev Kirill --- CHANGELOG.md | 2 + .../action_classification/backbones/x3d.py | 47 ++--- src/otx/algo/action_classification/x3d.py | 3 +- .../classification/backbones/efficientnet.py | 116 +++++------ .../heads/multilabel_cls_head.py | 5 +- .../heads/vision_transformer_head.py | 2 - src/otx/algo/common/backbones/cspnext.py | 21 +- .../common/backbones/pytorchcv_backbones.py | 19 +- src/otx/algo/common/layers/spp_layer.py | 14 +- .../algo/detection/backbones/csp_darknet.py | 27 ++- src/otx/algo/detection/backbones/presnet.py | 94 ++++++--- .../algo/detection/heads/rtdetr_decoder.py | 34 ++-- src/otx/algo/detection/heads/rtmdet_head.py | 27 +-- src/otx/algo/detection/heads/yolox_head.py | 18 +- src/otx/algo/detection/layers/csp_layer.py | 99 +++++---- src/otx/algo/detection/necks/cspnext_pafpn.py | 25 ++- src/otx/algo/detection/necks/fpn.py | 14 +- .../algo/detection/necks/hybrid_encoder.py | 42 ++-- src/otx/algo/detection/necks/yolox_pafpn.py | 26 ++- src/otx/algo/detection/rtmdet.py | 10 +- .../instance_segmentation/backbones/swin.py | 28 ++- .../heads/rtmdet_ins_head.py | 54 ++--- .../algo/instance_segmentation/maskrcnn.py | 3 +- .../algo/instance_segmentation/necks/fpn.py | 12 +- .../algo/instance_segmentation/rtmdet_inst.py | 9 +- src/otx/algo/keypoint_detection/rtmpose.py | 4 +- src/otx/algo/modules/__init__.py | 5 +- src/otx/algo/modules/activation.py | 52 +---- src/otx/algo/modules/conv_module.py | 133 ++++++++---- src/otx/algo/modules/transformer.py | 11 +- .../algo/segmentation/backbones/litehrnet.py | 190 +++++++++--------- src/otx/algo/segmentation/backbones/mscan.py | 162 +++++++-------- .../algo/segmentation/heads/base_segm_head.py | 43 ++-- src/otx/algo/segmentation/heads/fcn_head.py | 10 +- src/otx/algo/segmentation/heads/ham_head.py | 8 +- .../algo/segmentation/modules/aggregators.py | 10 +- src/otx/algo/segmentation/modules/blocks.py | 12 +- src/otx/algo/segmentation/segnext.py | 14 +- .../backbones/test_pytorchcv_backbones.py | 8 +- .../detection/backbones/test_csp_darknet.py | 3 +- .../algo/detection/heads/test_rtmdet_head.py | 7 +- .../algo/detection/layers/test_csp_layer.py | 2 +- tests/unit/algo/modules/test_activation.py | 20 +- tests/unit/algo/modules/test_conv_module.py | 71 ++++--- 44 files changed, 805 insertions(+), 711 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 67c87e31181..63b53d13b83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,8 @@ All notable changes to this project will be documented in this file. 
() - Enable to use polygon and bitmap mask as prompt inputs for zero-shot learning () +- Refactoring `ConvModule` by removing `conv_cfg` and `act_cfg` + (, ) ### Bug fixes diff --git a/src/otx/algo/action_classification/backbones/x3d.py b/src/otx/algo/action_classification/backbones/x3d.py index 7deef62a9f6..7660ae49569 100644 --- a/src/otx/algo/action_classification/backbones/x3d.py +++ b/src/otx/algo/action_classification/backbones/x3d.py @@ -7,12 +7,13 @@ from __future__ import annotations import math +from typing import Callable import torch.utils.checkpoint as cp from torch import Tensor, nn from torch.nn.modules.batchnorm import _BatchNorm -from otx.algo.modules.activation import Swish, build_activation_layer +from otx.algo.modules.activation import Swish from otx.algo.modules.conv_module import Conv3dModule from otx.algo.utils.mmengine_utils import load_checkpoint from otx.algo.utils.weight_init import constant_init, kaiming_init @@ -73,8 +74,8 @@ class BlockX3D(nn.Module): before and after the 3x3x3 conv. Default: True. norm_cfg (dict): Config for norm layers. required keys are ``type``, Default: ``dict(type='BN3d')``. - act_cfg (dict): Config dict for activation layer. - Default: ``dict(type='ReLU')``. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `nn.ReLU`. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. """ @@ -89,7 +90,7 @@ def __init__( se_ratio: float | None = None, use_swish: bool = True, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, with_cp: bool = False, ): super().__init__() @@ -102,8 +103,7 @@ def __init__( self.se_ratio = se_ratio self.use_swish = use_swish self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.act_cfg_swish = Swish() + self.activation_callable = activation_callable self.with_cp = with_cp self.conv1 = Conv3dModule( @@ -114,7 +114,7 @@ def __init__( padding=0, bias=False, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ) # Here we use the channel-wise conv self.conv2 = Conv3dModule( @@ -126,7 +126,7 @@ def __init__( groups=planes, bias=False, norm_cfg=self.norm_cfg, - act_cfg=None, + activation_callable=None, ) self.swish = Swish() @@ -139,13 +139,13 @@ def __init__( padding=0, bias=False, norm_cfg=self.norm_cfg, - act_cfg=None, + activation_callable=None, ) if self.se_ratio is not None: self.se_module = SEModule(planes, self.se_ratio) - self.relu = build_activation_layer(self.act_cfg) if self.act_cfg else build_activation_layer({}) + self.relu = self.activation_callable() if self.activation_callable else nn.ReLU(inplace=True) def forward(self, x: Tensor) -> Tensor: """Defines the computation performed at every call.""" @@ -198,8 +198,8 @@ class X3DBackbone(nn.Module): norm_cfg (dict): Config for norm layers. required keys are ``type`` and ``requires_grad``. Default: ``dict(type='BN3d', requires_grad=True)``. - act_cfg (dict): Config dict for activation layer. - Default: ``dict(type='ReLU', inplace=True)``. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `nn.ReLU`. norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze running stats (mean and var). Default: False. with_cp (bool): Use checkpoint or not. 
Using checkpoint will save some @@ -224,7 +224,7 @@ def __init__( se_ratio: float = 1 / 16, use_swish: bool = True, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, norm_eval: bool = False, with_cp: bool = False, zero_init_residual: bool = True, @@ -267,7 +267,7 @@ def __init__( self.use_swish = use_swish self.norm_cfg = norm_cfg - self.act_cfg = act_cfg + self.activation_callable = activation_callable self.norm_eval = norm_eval self.with_cp = with_cp self.zero_init_residual = zero_init_residual @@ -294,7 +294,7 @@ def __init__( se_ratio=self.se_ratio, use_swish=self.use_swish, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, with_cp=with_cp, **kwargs, ) @@ -312,7 +312,7 @@ def __init__( padding=0, bias=False, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ) self.feat_dim = int(self.feat_dim * self.gamma_b) @@ -350,7 +350,7 @@ def make_res_layer( se_ratio: float | None = None, use_swish: bool = True, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, with_cp: bool = False, **kwargs, ) -> nn.Module: @@ -376,7 +376,8 @@ def make_res_layer( use_swish (bool): Whether to use swish as the activation function before and after the 3x3x3 conv. Default: True. norm_cfg (dict | None): Config for norm layers. Default: None. - act_cfg (dict | None): Config for activate layers. Default: None. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `nn.ReLU`. with_cp (bool | None): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. 
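To make the new contract concrete: the old `act_cfg` dict (e.g. `{"type": "ReLU", "inplace": True}`) is replaced by any callable that returns an `nn.Module`. A minimal sketch, not part of the patch (the helper name `_build_act` is illustrative), of the instantiation pattern `BlockX3D` above now uses — call the callable if one is given, otherwise fall back — with `functools.partial` carrying kwargs such as `inplace` that previously lived in the config dict:

```python
from __future__ import annotations

from functools import partial
from typing import Callable

from torch import nn


def _build_act(activation_callable: Callable[..., nn.Module] | None) -> nn.Module:
    # Same pattern as BlockX3D above: instantiate the callable if given,
    # otherwise fall back to a default ReLU.
    return activation_callable() if activation_callable else nn.ReLU(inplace=True)


_build_act(nn.ReLU)                         # ReLU()
_build_act(partial(nn.ReLU, inplace=True))  # ReLU(inplace=True), replacing act_cfg["inplace"]
_build_act(None)                            # falls back to ReLU(inplace=True)
```
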
@@ -394,7 +395,7 @@ def make_res_layer( padding=0, bias=False, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ) use_se = [False] * blocks @@ -416,7 +417,7 @@ def make_res_layer( se_ratio=se_ratio if use_se[0] else None, use_swish=use_swish, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, with_cp=with_cp, **kwargs, ), @@ -432,7 +433,7 @@ def make_res_layer( se_ratio=se_ratio if use_se[i] else None, use_swish=use_swish, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, with_cp=with_cp, **kwargs, ), @@ -450,7 +451,7 @@ def _make_stem_layer(self) -> None: padding=(0, 1, 1), bias=False, norm_cfg=None, - act_cfg=None, + activation_callable=None, ) self.conv1_t = Conv3dModule( self.base_channels, @@ -461,7 +462,7 @@ def _make_stem_layer(self) -> None: groups=self.base_channels, bias=False, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ) def _freeze_stages(self) -> None: diff --git a/src/otx/algo/action_classification/x3d.py b/src/otx/algo/action_classification/x3d.py index 7f503dadfd4..b8931dc9e1a 100644 --- a/src/otx/algo/action_classification/x3d.py +++ b/src/otx/algo/action_classification/x3d.py @@ -5,6 +5,7 @@ from __future__ import annotations +from functools import partial from typing import TYPE_CHECKING from torch import nn @@ -65,7 +66,7 @@ def _build_model(self, num_classes: int) -> nn.Module: gamma_d=2.2, gamma_w=1, norm_cfg={"type": "BN3d", "requires_grad": True}, - act_cfg={"type": "ReLU", "inplace": True}, + activation_callable=partial(nn.ReLU, inplace=True), ), cls_head=X3DHead( num_classes=num_classes, diff --git a/src/otx/algo/classification/backbones/efficientnet.py b/src/otx/algo/classification/backbones/efficientnet.py index a7081728590..6114a28f60d 100644 --- a/src/otx/algo/classification/backbones/efficientnet.py +++ b/src/otx/algo/classification/backbones/efficientnet.py @@ -7,14 +7,14 @@ import math from pathlib import Path -from typing import Literal +from typing import Callable, Literal import torch from pytorchcv.models.model_store import download_model from torch import nn from torch.nn import functional, init -from otx.algo.modules.activation import build_activation_layer +from otx.algo.modules.activation import Swish from otx.algo.modules.conv_module import Conv2dModule from otx.algo.utils.mmengine_utils import load_checkpoint_to_model @@ -33,7 +33,7 @@ def conv1x1_block( bias: bool = False, use_bn: bool = True, bn_eps: float = 1e-5, - activation: str | None = "ReLU", + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, ) -> Conv2dModule: """Conv block.""" return Conv2dModule( @@ -45,7 +45,7 @@ def conv1x1_block( groups=groups, bias=bias, norm_cfg=({"type": "BN", "eps": bn_eps} if use_bn else None), - act_cfg=({"type": activation} if activation else None), + activation_callable=activation_callable, ) @@ -59,7 +59,7 @@ def conv3x3_block( bias: bool = False, use_bn: bool = True, bn_eps: float = 1e-5, - activation: str | None = "ReLU", + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, ) -> Conv2dModule: """Conv block.""" return Conv2dModule( @@ -72,7 +72,7 @@ def conv3x3_block( groups=groups, bias=bias, norm_cfg=({"type": "BN", "eps": bn_eps} if use_bn else None), - act_cfg=({"type": activation} if activation else None), + activation_callable=activation_callable, ) @@ -85,7 +85,7 @@ def dwconv3x3_block( bias: bool = False, use_bn: bool = True, bn_eps: float = 1e-5, - activation: str | None = "ReLU", + 
activation_callable: Callable[..., nn.Module] | None = nn.ReLU, ) -> Conv2dModule: """Conv block.""" return Conv2dModule( @@ -98,7 +98,7 @@ def dwconv3x3_block( groups=out_channels, bias=bias, norm_cfg=({"type": "BN", "eps": bn_eps} if use_bn else None), - act_cfg=({"type": activation} if activation else None), + activation_callable=activation_callable, ) @@ -111,7 +111,7 @@ def dwconv5x5_block( bias: bool = False, use_bn: bool = True, bn_eps: float = 1e-5, - activation: str | None = "ReLU", + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, ) -> Conv2dModule: """Conv block.""" return Conv2dModule( @@ -124,7 +124,7 @@ def dwconv5x5_block( groups=out_channels, bias=bias, norm_cfg=({"type": "BN", "eps": bn_eps} if use_bn else None), - act_cfg=({"type": activation} if activation else None), + activation_callable=activation_callable, ) @@ -164,13 +164,15 @@ class SEBlock(nn.Module): https://arxiv.org/abs/1709.01507. Args: - channels : int. Number of channels. - reduction : int, default 16. Squeeze reduction value. - mid_channels : int or None, default None. Number of middle channels. - round_mid : bool, default False. Whether to round middle channel number (make divisible by 8). - use_conv : bool, default True. Whether to convolutional layers instead of fully-connected ones. - activation : function, or str, or nn.Module, default 'relu'. Activation function after the first convolution. - out_activation : function, or str, or nn.Module, Activation function after the last convolution. + channels (int): Number of channels. + reduction (int): Squeeze reduction value. Default to 16. + mid_channels (int | None): Number of middle channels. Defaults to None. + round_mid (bool): Whether to round middle channel number (make divisible by 8). Defaults to False. + use_conv (bool): Whether to convolutional layers instead of fully-connected ones. Defaults to True. + mid_activation_callable (Callable[..., nn.Module]): Activation layer module after the first convolution. + Defaults to `nn.ReLU`. + out_activation_callable (Callable[..., nn.Module]): Activation layer module after the last convolution. + Defaults to `nn.Sigmoid`. """ def __init__( @@ -180,8 +182,8 @@ def __init__( mid_channels: int | None = None, round_mid: bool = False, use_conv: bool = True, - mid_activation: str | None = "ReLU", - out_activation: str | None = "Sigmoid", + mid_activation_callable: Callable[..., nn.Module] = nn.ReLU, + out_activation_callable: Callable[..., nn.Module] = nn.Sigmoid, ): super().__init__() self.use_conv = use_conv @@ -200,7 +202,7 @@ def __init__( ) else: self.fc1 = nn.Linear(in_features=channels, out_features=mid_channels) - self.activ = build_activation_layer({"type": mid_activation}) + self.activ = mid_activation_callable() if use_conv: self.conv2 = nn.Conv2d( in_channels=mid_channels, @@ -212,7 +214,7 @@ def __init__( ) else: self.fc2 = nn.Linear(in_features=mid_channels, out_features=channels) - self.sigmoid = build_activation_layer({"type": out_activation}) + self.sigmoid = out_activation_callable() def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward.""" @@ -232,12 +234,12 @@ class EffiDwsConvUnit(nn.Module): """EfficientNet specific depthwise separable conv block/unit with BatchNorms and activations at each conv. Args: - in_channels : int. Number of input channels. - out_channels : int. Number of output channels. - stride : int or tuple/list of 2 int. Strides of the second convolution layer. - bn_eps : float. Small float added to variance in Batch norm. - activation : str. 
Name of activation function. - tf_mode : bool. Whether to use TF-like mode. + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + stride (int | tuple[int, int]): Strides of the second convolution layer. + bn_eps (float): Small float added to variance in Batch norm. + activation_callable (Callable[..., nn.Module]): Activation layer module. + tf_mode (bool): Whether to use TF-like mode. """ def __init__( @@ -246,7 +248,7 @@ def __init__( out_channels: int, stride: int | tuple[int, int], bn_eps: float, - activation: str, + activation_callable: Callable[..., nn.Module], tf_mode: bool, ): super().__init__() @@ -258,14 +260,14 @@ def __init__( out_channels=in_channels, padding=(0 if tf_mode else 1), bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, ) - self.se = SEBlock(channels=in_channels, reduction=4, mid_activation=activation) + self.se = SEBlock(channels=in_channels, reduction=4, mid_activation_callable=activation_callable) self.pw_conv = conv1x1_block( in_channels=in_channels, out_channels=out_channels, bn_eps=bn_eps, - activation=None, + activation_callable=None, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -286,15 +288,15 @@ class EffiInvResUnit(nn.Module): """EfficientNet inverted residual unit. Args: - in_channels : int. Number of input channels. - out_channels : int. Number of output channels. - kernel_size : int or tuple/list of 2 int. Convolution window size. - stride : int or tuple/list of 2 int. Strides of the second convolution layer. - exp_factor : int. Factor for expansion of channels. - se_factor : int. SE reduction factor for each unit. - bn_eps : float. Small float added to variance in Batch norm. - activation : str. Name of activation function. - tf_mode : bool. Whether to use TF-like mode. + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int | tuple[int, int]): Convolution window size. + stride (int | tuple[int, int]): Strides of the second convolution layer. + exp_factor (int): Factor for expansion of channels. + se_factor (int): SE reduction factor for each unit. + bn_eps (float): Small float added to variance in Batch norm. + activation_callable (Callable[..., nn.Module]): Name of activation function. + tf_mode (bool): Whether to use TF-like mode. """ def __init__( @@ -306,7 +308,7 @@ def __init__( exp_factor: int, se_factor: int, bn_eps: float, - activation: str | None, + activation_callable: Callable[..., nn.Module], tf_mode: bool, ): super().__init__() @@ -322,7 +324,7 @@ def __init__( in_channels=in_channels, out_channels=mid_channels, bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, ) self.conv2 = dwconv_block_fn( in_channels=mid_channels, @@ -330,19 +332,19 @@ def __init__( stride=stride, padding=(0 if tf_mode else kernel_size // 2), bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, ) if self.use_se: self.se = SEBlock( channels=mid_channels, reduction=(exp_factor * se_factor), - mid_activation=activation, + mid_activation_callable=activation_callable, ) self.conv3 = conv1x1_block( in_channels=mid_channels, out_channels=out_channels, bn_eps=bn_eps, - activation=None, + activation_callable=None, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -368,11 +370,11 @@ class EffiInitBlock(nn.Module): """EfficientNet specific initial block. Args: - in_channels : int. Number of input channels. - out_channels : int. Number of output channels. 
- bn_eps : float. Small float added to variance in Batch norm. - activation : str. Name of activation function. - tf_mode : bool. Whether to use TF-like mode. + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + bn_eps (float): Small float added to variance in Batch norm. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + tf_mode (bool): Whether to use TF-like mode. """ def __init__( @@ -380,7 +382,7 @@ def __init__( in_channels: int, out_channels: int, bn_eps: float, - activation: str | None, + activation_callable: Callable[..., nn.Module] | None, tf_mode: bool, ): super().__init__() @@ -392,7 +394,7 @@ def __init__( stride=2, padding=(0 if tf_mode else 1), bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -453,7 +455,7 @@ def __init__( self.bn_frozen = bn_frozen self.pooling_type = pooling_type self.num_features = self.num_head_features = final_block_channels - activation = "Swish" + activation_callable = Swish self.features = nn.Sequential() self.features.add_module( "init_block", @@ -461,7 +463,7 @@ def __init__( in_channels=in_channels, out_channels=init_block_channels, bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, tf_mode=tf_mode, ), ) @@ -482,7 +484,7 @@ def __init__( out_channels=out_channels, stride=stride, bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, tf_mode=tf_mode, ), ) @@ -497,7 +499,7 @@ def __init__( exp_factor=expansion_factor, se_factor=4, bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, tf_mode=tf_mode, ), ) @@ -510,7 +512,7 @@ def __init__( in_channels=in_channels, out_channels=final_block_channels, bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, ), ) self._init_params() diff --git a/src/otx/algo/classification/heads/multilabel_cls_head.py b/src/otx/algo/classification/heads/multilabel_cls_head.py index 731c247ba5b..2df5523d988 100644 --- a/src/otx/algo/classification/heads/multilabel_cls_head.py +++ b/src/otx/algo/classification/heads/multilabel_cls_head.py @@ -241,7 +241,8 @@ class MultiLabelNonLinearClsHead(MultiLabelClsHead): num_classes (int): Number of categories. in_channels (int): Number of channels in the input feature map. hid_channels (int): Number of channels in the hidden feature map. - act_cfg (dict | optional): The configuration of the activation function. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to nn.ReLU. scale (float): Positive scale parameter. loss (dict): Config of classification loss. dropout (bool): Whether use the dropout or not. @@ -254,7 +255,7 @@ def __init__( in_channels: int, loss: nn.Module, hid_channels: int = 1280, - activation_callable: Callable[[], nn.Module] = nn.ReLU, + activation_callable: Callable[..., nn.Module] = nn.ReLU, scale: float = 1.0, dropout: bool = False, normalized: bool = False, diff --git a/src/otx/algo/classification/heads/vision_transformer_head.py b/src/otx/algo/classification/heads/vision_transformer_head.py index 849913a2dce..a4b9950b260 100644 --- a/src/otx/algo/classification/heads/vision_transformer_head.py +++ b/src/otx/algo/classification/heads/vision_transformer_head.py @@ -26,8 +26,6 @@ class VisionTransformerClsHead(BaseModule): in_channels (int): Number of channels in the input feature map. 
hidden_dim (int, optional): Number of the dimensions for hidden layer. Defaults to None, which means no extra hidden layer. - act_cfg (dict): The activation config. Only available during - pre-training. Defaults to ``dict(type='Tanh')``. init_cfg (dict): The extra initialization configs. Defaults to ``dict(type='Constant', layer='Linear', val=0)``. """ diff --git a/src/otx/algo/common/backbones/cspnext.py b/src/otx/algo/common/backbones/cspnext.py index 76bcc56b74b..dafe910946c 100644 --- a/src/otx/algo/common/backbones/cspnext.py +++ b/src/otx/algo/common/backbones/cspnext.py @@ -9,7 +9,7 @@ from __future__ import annotations import math -from typing import ClassVar +from typing import Callable, ClassVar from otx.algo.common.layers import SPPBottleneck from otx.algo.detection.layers import CSPLayer @@ -45,8 +45,8 @@ class CSPNeXt(BaseModule): stage. Defaults to True. norm_cfg (dict): Dictionary to construct and config norm layer. Defaults to dict(type='BN', requires_grad=True). - act_cfg (dict): Config dict for activation layer. - Defaults to dict(type='SiLU'). + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `nn.SiLU`. norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. @@ -84,7 +84,7 @@ def __init__( spp_kernel_sizes: tuple[int, int, int] = (5, 9, 13), channel_attention: bool = True, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = nn.SiLU, norm_eval: bool = False, init_cfg: dict | None = None, ) -> None: @@ -99,7 +99,6 @@ def __init__( super().__init__(init_cfg=init_cfg) norm_cfg = norm_cfg or {"type": "BN", "momentum": 0.03, "eps": 0.001} - act_cfg = act_cfg or {"type": "SiLU"} arch_setting = self.arch_settings[arch] if arch_ovewrite: @@ -126,7 +125,7 @@ def __init__( padding=1, stride=2, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), Conv2dModule( int(arch_setting[0][0] * widen_factor // 2), @@ -135,7 +134,7 @@ def __init__( padding=1, stride=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), Conv2dModule( int(arch_setting[0][0] * widen_factor // 2), @@ -144,7 +143,7 @@ def __init__( padding=1, stride=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) self.layers = ["stem"] @@ -161,7 +160,7 @@ def __init__( stride=2, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) stage.append(conv_layer) if use_spp: @@ -170,7 +169,7 @@ def __init__( out_channels, kernel_sizes=spp_kernel_sizes, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) stage.append(spp) csp_layer = CSPLayer( @@ -183,7 +182,7 @@ def __init__( expand_ratio=expand_ratio, channel_attention=channel_attention, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) stage.append(csp_layer) self.add_module(f"stage{i + 1}", nn.Sequential(*stage)) diff --git a/src/otx/algo/common/backbones/pytorchcv_backbones.py b/src/otx/algo/common/backbones/pytorchcv_backbones.py index 8878998a86d..3ed30d8e73d 100644 --- a/src/otx/algo/common/backbones/pytorchcv_backbones.py +++ b/src/otx/algo/common/backbones/pytorchcv_backbones.py @@ -6,9 +6,9 @@ from __future__ import annotations from pathlib import Path +from typing import Callable import torch -from otx.algo.modules.activation import build_activation_layer from 
otx.algo.modules.norm import build_norm_layer from otx.algo.utils.mmengine_utils import get_dist_info from pytorchcv.model_provider import _models @@ -19,16 +19,13 @@ # ruff: noqa: SLF001 -def replace_activation(model: nn.Module, activation_cfg: dict) -> nn.Module: - """Replace activate funtion.""" +def replace_activation(model: nn.Module, activation_callable: Callable[..., nn.Module]) -> nn.Module: + """Replace activation funtion.""" for name, module in model._modules.items(): if len(list(module.children())) > 0: - model._modules[name] = replace_activation(module, activation_cfg) + model._modules[name] = replace_activation(module, activation_callable) if "activ" in name: - if activation_cfg["type"] == "torch_swish": - model._modules[name] = nn.SiLU() - else: - model._modules[name] = build_activation_layer(activation_cfg) + model._modules[name] = activation_callable() return model @@ -122,7 +119,7 @@ def _build_pytorchcv_model( frozen_stages: int = 0, norm_eval: bool = False, verbose: bool = False, - activation_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = None, norm_cfg: dict | None = None, **kwargs, ) -> nn.Module: @@ -133,8 +130,8 @@ def _build_pytorchcv_model( f"Init model {type}, pretrained={is_pretrained}, models cache {models_cache_root}", ) model = _models[type](**kwargs) - if activation_cfg: - model = replace_activation(model, activation_cfg) + if activation_callable: + model = replace_activation(model, activation_callable) if norm_cfg: model = replace_norm(model, norm_cfg) model.out_indices = out_indices diff --git a/src/otx/algo/common/layers/spp_layer.py b/src/otx/algo/common/layers/spp_layer.py index d314bacea9d..29027dca6bd 100644 --- a/src/otx/algo/common/layers/spp_layer.py +++ b/src/otx/algo/common/layers/spp_layer.py @@ -8,7 +8,10 @@ from __future__ import annotations +from typing import Callable + import torch +from otx.algo.modules.activation import Swish from otx.algo.modules.base_module import BaseModule from otx.algo.modules.conv_module import Conv2dModule from torch import Tensor, nn @@ -24,8 +27,8 @@ class SPPBottleneck(BaseModule): layers. Default: (5, 9, 13). norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='Swish'). + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `Swish`. init_cfg (dict, list[dict], optional): Initialization config dict. Default: None. """ @@ -36,12 +39,11 @@ def __init__( out_channels: int, kernel_sizes: tuple[int, ...] 
= (5, 9, 13), norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = Swish, init_cfg: dict | list[dict] | None = None, ): super().__init__(init_cfg=init_cfg) norm_cfg = norm_cfg or {"type": "BN", "momentum": 0.03, "eps": 0.001} - act_cfg = act_cfg or {"type": "Swish"} mid_channels = in_channels // 2 self.conv1 = Conv2dModule( @@ -50,7 +52,7 @@ def __init__( 1, stride=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.poolings = nn.ModuleList([nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) for ks in kernel_sizes]) conv2_channels = mid_channels * (len(kernel_sizes) + 1) @@ -59,7 +61,7 @@ def __init__( out_channels, 1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) def forward(self, x: Tensor) -> Tensor: diff --git a/src/otx/algo/detection/backbones/csp_darknet.py b/src/otx/algo/detection/backbones/csp_darknet.py index 6e92b995b06..9df915c4dd2 100644 --- a/src/otx/algo/detection/backbones/csp_darknet.py +++ b/src/otx/algo/detection/backbones/csp_darknet.py @@ -9,7 +9,7 @@ from __future__ import annotations import math -from typing import Any, ClassVar, Sequence +from typing import Any, Callable, ClassVar, Sequence import torch from torch import Tensor, nn @@ -17,6 +17,7 @@ from otx.algo.common.layers import SPPBottleneck from otx.algo.detection.layers import CSPLayer +from otx.algo.modules.activation import Swish from otx.algo.modules.base_module import BaseModule from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule @@ -31,8 +32,8 @@ class Focus(nn.Module): stride (int): The stride of the convolution. Default: 1 norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN', momentum=0.03, eps=0.001). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='Swish'). + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `Swish`. """ def __init__( @@ -42,11 +43,10 @@ def __init__( kernel_size: int = 1, stride: int = 1, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = Swish, ): super().__init__() norm_cfg = norm_cfg or {"type": "BN", "momentum": 0.03, "eps": 0.001} - act_cfg = act_cfg or {"type": "Swish"} self.conv = Conv2dModule( in_channels * 4, out_channels, @@ -54,7 +54,7 @@ def __init__( stride, padding=(kernel_size - 1) // 2, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) def forward(self, x: Tensor) -> Tensor: @@ -110,8 +110,8 @@ class CSPDarknet(BaseModule): layers. Default: (5, 9, 13). norm_cfg (dict): Dictionary to construct and config norm layer. Default: dict(type='BN', requires_grad=True). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='LeakyReLU', negative_slope=0.1). + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to ``Swish``. norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. @@ -148,7 +148,7 @@ def __init__( arch_ovewrite: list | None = None, spp_kernal_sizes: tuple[int, ...] 
= (5, 9, 13), norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = Swish, norm_eval: bool = False, init_cfg: dict | list[dict] | None = None, ): @@ -162,7 +162,6 @@ def __init__( } super().__init__(init_cfg=init_cfg) norm_cfg = norm_cfg or {"type": "BN", "momentum": 0.03, "eps": 0.001} - act_cfg = act_cfg or {"type": "Swish"} arch_setting = self.arch_settings[arch] if arch_ovewrite: @@ -183,7 +182,7 @@ def __init__( int(arch_setting[0][0] * widen_factor), kernel_size=3, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.layers = ["stem"] @@ -199,7 +198,7 @@ def __init__( stride=2, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) stage.append(conv_layer) if use_spp: @@ -208,7 +207,7 @@ def __init__( out_channels, kernel_sizes=spp_kernal_sizes, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) stage.append(spp) csp_layer = CSPLayer( @@ -218,7 +217,7 @@ def __init__( add_identity=add_identity, use_depthwise=use_depthwise, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) stage.append(csp_layer) self.add_module(f"stage{i + 1}", nn.Sequential(*stage)) diff --git a/src/otx/algo/detection/backbones/presnet.py b/src/otx/algo/detection/backbones/presnet.py index b31f7f95c3a..bd557e3561d 100644 --- a/src/otx/algo/detection/backbones/presnet.py +++ b/src/otx/algo/detection/backbones/presnet.py @@ -6,12 +6,11 @@ from __future__ import annotations from collections import OrderedDict -from typing import Any, ClassVar +from typing import Any, Callable, ClassVar import torch from torch import nn -from otx.algo.modules import build_activation_layer from otx.algo.modules.base_module import BaseModule from otx.algo.modules.conv_module import Conv2dModule @@ -29,7 +28,7 @@ def __init__( ch_out: int, stride: int, shortcut: bool, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = None, variant: str = "b", norm_cfg: dict[str, str] | None = None, ) -> None: @@ -43,16 +42,24 @@ def __init__( OrderedDict( [ ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)), - ("conv", Conv2dModule(ch_in, ch_out, 1, 1, act_cfg=None, norm_cfg=norm_cfg)), + ("conv", Conv2dModule(ch_in, ch_out, 1, 1, activation_callable=None, norm_cfg=norm_cfg)), ], ), ) else: - self.short = Conv2dModule(ch_in, ch_out, 1, stride, act_cfg=None, norm_cfg=norm_cfg) - - self.branch2a = Conv2dModule(ch_in, ch_out, 3, stride, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg) - self.branch2b = Conv2dModule(ch_out, ch_out, 3, 1, padding=1, act_cfg=None, norm_cfg=norm_cfg) - self.act = nn.Identity() if act_cfg is None else build_activation_layer(act_cfg) + self.short = Conv2dModule(ch_in, ch_out, 1, stride, activation_callable=None, norm_cfg=norm_cfg) + + self.branch2a = Conv2dModule( + ch_in, + ch_out, + 3, + stride, + padding=1, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ) + self.branch2b = Conv2dModule(ch_out, ch_out, 3, 1, padding=1, activation_callable=None, norm_cfg=norm_cfg) + self.act = activation_callable() if activation_callable else nn.Identity() def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward.""" @@ -76,7 +83,7 @@ def __init__( ch_out: int, stride: int, shortcut: bool, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = None, variant: str = "b", norm_cfg: dict[str, str] | None = None, ) -> None: @@ -89,9 +96,24 @@ def 
__init__( width = ch_out - self.branch2a = Conv2dModule(ch_in, width, 1, stride1, act_cfg=act_cfg, norm_cfg=norm_cfg) - self.branch2b = Conv2dModule(width, width, 3, stride2, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg) - self.branch2c = Conv2dModule(width, ch_out * self.expansion, 1, 1, act_cfg=None, norm_cfg=norm_cfg) + self.branch2a = Conv2dModule( + ch_in, + width, + 1, + stride1, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ) + self.branch2b = Conv2dModule( + width, + width, + 3, + stride2, + padding=1, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ) + self.branch2c = Conv2dModule(width, ch_out * self.expansion, 1, 1, activation_callable=None, norm_cfg=norm_cfg) self.shortcut = shortcut if not shortcut: @@ -102,15 +124,29 @@ def __init__( ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)), ( "conv", - Conv2dModule(ch_in, ch_out * self.expansion, 1, 1, act_cfg=None, norm_cfg=norm_cfg), + Conv2dModule( + ch_in, + ch_out * self.expansion, + 1, + 1, + activation_callable=None, + norm_cfg=norm_cfg, + ), ), ], ), ) else: - self.short = Conv2dModule(ch_in, ch_out * self.expansion, 1, stride, act_cfg=None, norm_cfg=norm_cfg) + self.short = Conv2dModule( + ch_in, + ch_out * self.expansion, + 1, + stride, + activation_callable=None, + norm_cfg=norm_cfg, + ) - self.act = nn.Identity() if act_cfg is None else build_activation_layer(act_cfg) + self.act = activation_callable() if activation_callable else nn.Identity() def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward.""" @@ -132,7 +168,7 @@ def __init__( ch_out: int, count: int, stage_num: int, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = None, variant: str = "b", norm_cfg: dict[str, str] | None = None, ) -> None: @@ -147,7 +183,7 @@ def __init__( stride=2 if i == 0 and stage_num != 2 else 1, shortcut=i != 0, variant=variant, - act_cfg=act_cfg, + activation_callable=activation_callable, norm_cfg=norm_cfg, ), ) @@ -171,7 +207,8 @@ class PResNet(BaseModule): variant (str): The variant of the PResNet backbone. Defaults to "d". num_stages (int): The number of stages in the PResNet backbone. Defaults to 4. return_idx (list[int]): The indices of the stages to return as output. Defaults to [0, 1, 2, 3]. - act_cfg (dict[str, str] | None, optional): The activation configuration. Defaults to None. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to None. norm_cfg (dict[str, str] | None, optional): The normalization configuration. Defaults to None. freeze_at (int): The stage at which to freeze the parameters. Defaults to -1. pretrained (bool): Whether to load pretrained weights. Defaults to False. 
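The caller-side change is mechanical. A minimal sketch, assuming the `Conv2dModule` keyword names shown in the `presnet.py` hunks above (the channel sizes and the `norm_cfg` dict are illustrative, taken verbatim from other hunks in this patch):

```python
from torch import nn
from otx.algo.modules.conv_module import Conv2dModule

norm_cfg = {"type": "BN", "momentum": 0.03, "eps": 0.001}

# Before: act_cfg={"type": "ReLU"} selected the activation from a config registry.
# After: the activation class is passed directly; None still means "no activation",
# as in the shortcut / projection convs of BasicBlock and BottleNeck above.
branch2a = Conv2dModule(64, 64, 3, 1, padding=1, activation_callable=nn.ReLU, norm_cfg=norm_cfg)
short = Conv2dModule(64, 256, 1, 1, activation_callable=None, norm_cfg=norm_cfg)
```
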
@@ -197,7 +234,7 @@ def __init__( variant: str = "d", num_stages: int = 4, return_idx: list[int] = [0, 1, 2, 3], # noqa: B006 - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, norm_cfg: dict[str, str] | None = None, freeze_at: int = -1, pretrained: bool = False, @@ -215,12 +252,23 @@ def __init__( ] else: conv_def = [[3, ch_in, 7, 2, "conv1_1"]] - act_cfg = act_cfg if act_cfg is not None else {"type": "ReLU"} + norm_cfg = norm_cfg if norm_cfg is not None else {"type": "BN", "name": "norm"} self.conv1 = nn.Sequential( OrderedDict( [ - (_name, Conv2dModule(c_in, c_out, k, s, padding=(k - 1) // 2, act_cfg=act_cfg, norm_cfg=norm_cfg)) + ( + _name, + Conv2dModule( + c_in, + c_out, + k, + s, + padding=(k - 1) // 2, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ), + ) for c_in, c_out, k, s, _name in conv_def ], ), @@ -242,7 +290,7 @@ def __init__( ch_out_list[i], block_nums[i], stage_num, - act_cfg=act_cfg, + activation_callable=activation_callable, variant=variant, norm_cfg=norm_cfg, ), diff --git a/src/otx/algo/detection/heads/rtdetr_decoder.py b/src/otx/algo/detection/heads/rtdetr_decoder.py index 02de4a8a92a..a3b065df974 100644 --- a/src/otx/algo/detection/heads/rtdetr_decoder.py +++ b/src/otx/algo/detection/heads/rtdetr_decoder.py @@ -8,7 +8,7 @@ import copy import math from collections import OrderedDict -from typing import Any +from typing import Any, Callable import torch import torchvision @@ -18,7 +18,6 @@ from otx.algo.detection.utils.utils import ( inverse_sigmoid, ) -from otx.algo.modules import build_activation_layer from otx.algo.modules.base_module import BaseModule from otx.algo.modules.transformer import deformable_attention_core_func @@ -140,13 +139,13 @@ def __init__( hidden_dim: int, output_dim: int, num_layers: int, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = None, ) -> None: super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim, *h], [*h, output_dim])) - self.act = nn.Identity() if act_cfg is None else build_activation_layer(act_cfg) + self.act = activation_callable() if activation_callable else nn.Identity() def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward function of MLP.""" @@ -300,7 +299,8 @@ class TransformerDecoderLayer(nn.Module): n_head (int): The number of heads in the multiheadattention models. dim_feedforward (int): The dimension of the feedforward network model. dropout (float): The dropout value. - activation (dict[str, str] | None, optional): The activation function of intermediate layer, ReLU by default. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.ReLU`. n_levels (int): The number of levels in MSDeformableAttention. n_points (int): The number of points in MSDeformableAttention. 
""" @@ -311,7 +311,7 @@ def __init__( n_head: int = 8, dim_feedforward: int = 1024, dropout: float = 0.0, - activation: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.ReLU, n_levels: int = 4, n_points: int = 4, ): @@ -330,8 +330,7 @@ def __init__( # ffn self.linear1 = nn.Linear(d_model, dim_feedforward) - activation = activation if activation is not None else {"type": "ReLU"} - self.activation = build_activation_layer(activation) + self.activation = activation_callable() self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.dropout4 = nn.Dropout(dropout) @@ -468,8 +467,8 @@ class RTDETRTransformer(BaseModule): num_decoder_layers (int): Number of decoder layers. dim_feedforward (int): Dimension of the feedforward network. dropout (float): Dropout rate. - activation (dict[str, str] | None): The activation function of intermediate layer. - ReLu by default. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.ReLU`. num_denoising (int): Number of denoising samples. label_noise_ratio (float): Ratio of label noise. box_noise_scale (float): Scale of box noise. @@ -494,7 +493,7 @@ def __init__( # noqa: PLR0913 num_decoder_layers: int = 6, dim_feedforward: int = 1024, dropout: float = 0.0, - activation: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.ReLU, num_denoising: int = 100, label_noise_ratio: float = 0.5, box_noise_scale: float = 1.0, @@ -531,7 +530,7 @@ def __init__( # noqa: PLR0913 self.num_decoder_layers = num_decoder_layers self.eval_spatial_size = eval_spatial_size self.aux_loss = aux_loss - activation = activation if activation is not None else {"type": "ReLU"} + # backbone feature projection self._build_input_proj_layer(feat_channels) @@ -541,7 +540,7 @@ def __init__( # noqa: PLR0913 nhead, dim_feedforward, dropout, - activation, + activation_callable, num_levels, num_decoder_points, ) @@ -558,7 +557,7 @@ def __init__( # noqa: PLR0913 self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) - self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2, act_cfg=activation) + self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2, activation_callable=activation_callable) # encoder head self.enc_output = nn.Sequential( @@ -566,12 +565,15 @@ def __init__( # noqa: PLR0913 nn.LayerNorm(hidden_dim), ) self.enc_score_head = nn.Linear(hidden_dim, num_classes) - self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3, act_cfg=activation) + self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3, activation_callable=activation_callable) # decoder head self.dec_score_head = nn.ModuleList([nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers)]) self.dec_bbox_head = nn.ModuleList( - [MLP(hidden_dim, hidden_dim, 4, num_layers=3, act_cfg=activation) for _ in range(num_decoder_layers)], + [ + MLP(hidden_dim, hidden_dim, 4, num_layers=3, activation_callable=activation_callable) + for _ in range(num_decoder_layers) + ], ) # init encoder output anchors and valid_mask diff --git a/src/otx/algo/detection/heads/rtmdet_head.py b/src/otx/algo/detection/heads/rtmdet_head.py index 429c03cbe05..71623aad9e7 100644 --- a/src/otx/algo/detection/heads/rtmdet_head.py +++ b/src/otx/algo/detection/heads/rtmdet_head.py @@ -8,6 +8,8 @@ from __future__ import annotations +from typing import Callable + import torch from torch import Tensor, nn @@ -35,8 
+37,8 @@ class RTMDetHead(ATSSHead): in_channels (int): Number of channels in the input feature map. with_objectness (bool): Whether to add an objectness branch. Defaults to True. - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU') + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.ReLU`. """ def __init__( @@ -44,10 +46,10 @@ def __init__( num_classes: int, in_channels: int, with_objectness: bool = True, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.ReLU, **kwargs, ) -> None: - self.act_cfg = act_cfg or {"type": "ReLU"} + self.activation_callable = activation_callable self.with_objectness = with_objectness super().__init__(num_classes, in_channels, **kwargs) if self.train_cfg: @@ -67,7 +69,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) self.reg_convs.append( @@ -78,7 +80,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) pred_pad_size = self.pred_kernel_size // 2 @@ -643,8 +645,8 @@ class RTMDetSepBNHead(RTMDetHead): Defaults to False. norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). - act_cfg (dict): Config dict for activation layer. - Defaults to dict(type='SiLU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.SiLU`. pred_kernel_size (int): Kernel size of prediction layer. Defaults to 1. exp_on_reg (bool): Whether using exponential of regression features or not. Defaults to False. """ @@ -656,12 +658,11 @@ def __init__( share_conv: bool = True, use_depthwise: bool = False, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.SiLU, pred_kernel_size: int = 1, exp_on_reg: bool = False, **kwargs, ) -> None: - act_cfg = act_cfg or {"type": "SiLU"} norm_cfg = norm_cfg or {"type": "BN", "momentum": 0.03, "eps": 0.001} self.share_conv = share_conv self.exp_on_reg = exp_on_reg @@ -670,7 +671,7 @@ def __init__( num_classes, in_channels, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, pred_kernel_size=pred_kernel_size, **kwargs, ) @@ -698,7 +699,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) reg_convs.append( @@ -709,7 +710,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) self.cls_convs.append(cls_convs) diff --git a/src/otx/algo/detection/heads/yolox_head.py b/src/otx/algo/detection/heads/yolox_head.py index 7f8a12fbef2..af5dd677af0 100644 --- a/src/otx/algo/detection/heads/yolox_head.py +++ b/src/otx/algo/detection/heads/yolox_head.py @@ -10,7 +10,7 @@ import logging import math -from typing import Sequence +from typing import Callable, Sequence import torch import torch.nn.functional as F # noqa: N812 @@ -24,6 +24,7 @@ from otx.algo.common.utils.utils import multi_apply, reduce_mean from otx.algo.detection.heads.base_head import BaseDenseHead from otx.algo.detection.losses import IoULoss +from otx.algo.modules.activation import Swish from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule from otx.algo.utils.mmengine_utils import 
InstanceData @@ -41,7 +42,7 @@ class YOLOXHead(BaseDenseHead): stacked_convs (int): Number of stacking convs of the head. Defaults to (8, 16, 32). strides (Sequence[int]): Downsample factor of each feature map. - Defaults to None. + Defaults to None. use_depthwise (bool): Whether to depthwise separable convolution in blocks. Defaults to False. dcn_on_last_conv (bool): If true, use dcn in the last layer of towers. @@ -51,8 +52,8 @@ class YOLOXHead(BaseDenseHead): None, otherwise False. Defaults to "auto". norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). - act_cfg (dict): Config dict for activation layer. - Defaults to None. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `Swish`. loss_cls (nn.Module, optional): Module of classification loss. loss_bbox (nn.Module, optional): Module of localization loss. loss_obj (nn.Module, optional): Module of objectness loss. @@ -76,7 +77,7 @@ def __init__( dcn_on_last_conv: bool = False, conv_bias: bool | str = "auto", norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = Swish, loss_cls: nn.Module | None = None, loss_bbox: nn.Module | None = None, loss_obj: nn.Module | None = None, @@ -88,9 +89,6 @@ def __init__( if norm_cfg is None: norm_cfg = {"type": "BN", "momentum": 0.03, "eps": 0.001} - if act_cfg is None: - act_cfg = {"type": "Swish"} - if init_cfg is None: init_cfg = { "type": "Kaiming", @@ -118,7 +116,7 @@ def __init__( self.use_sigmoid_cls = True self.norm_cfg = norm_cfg - self.act_cfg = act_cfg + self.activation_callable = activation_callable self.loss_cls = loss_cls or CrossEntropyLoss(use_sigmoid=True, reduction="sum", loss_weight=1.0) self.loss_bbox = loss_bbox or IoULoss(mode="square", eps=1e-16, reduction="sum", loss_weight=5.0) @@ -176,7 +174,7 @@ def _build_stacked_convs(self) -> nn.Sequential: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, bias=self.conv_bias, ), ) diff --git a/src/otx/algo/detection/layers/csp_layer.py b/src/otx/algo/detection/layers/csp_layer.py index 4cb0d10b57a..b4f39ff421a 100644 --- a/src/otx/algo/detection/layers/csp_layer.py +++ b/src/otx/algo/detection/layers/csp_layer.py @@ -5,11 +5,13 @@ from __future__ import annotations +from typing import Callable + import torch from torch import Tensor, nn from otx.algo.detection.layers import ChannelAttention -from otx.algo.modules import build_activation_layer +from otx.algo.modules.activation import Swish from otx.algo.modules.base_module import BaseModule from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule @@ -33,8 +35,8 @@ class DarknetBottleneck(BaseModule): Defaults to False. norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Defaults to dict(type='Swish'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `Swish`. 
""" def __init__( @@ -45,15 +47,12 @@ def __init__( add_identity: bool = True, use_depthwise: bool = False, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = Swish, init_cfg: dict | list[dict] | None = None, ) -> None: if norm_cfg is None: norm_cfg = {"type": "BN", "momentum": 0.03, "eps": 0.001} - if act_cfg is None: - act_cfg = {"type": "Swish"} - super().__init__(init_cfg=init_cfg) hidden_channels = int(out_channels * expansion) @@ -63,7 +62,7 @@ def __init__( hidden_channels, 1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.conv2 = conv( hidden_channels, @@ -72,7 +71,7 @@ def __init__( stride=1, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.add_identity = add_identity and in_channels == out_channels @@ -102,8 +101,8 @@ class CSPNeXtBlock(BaseModule): Defaults to 5. norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). - act_cfg (dict): Config dict for activation layer. - Defaults to dict(type='SiLU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.SiLU`. init_cfg (dict or list[dict], optional): Initialization config dict. Defaults to None. """ @@ -117,20 +116,25 @@ def __init__( use_depthwise: bool = False, kernel_size: int = 5, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.SiLU, init_cfg: dict | list[dict] | None = None, ) -> None: if norm_cfg is None: norm_cfg = {"type": "BN", "momentum": 0.03, "eps": 0.001} - if act_cfg is None: - act_cfg = {"type": "SiLU"} - super().__init__(init_cfg=init_cfg) hidden_channels = int(out_channels * expansion) conv = DepthwiseSeparableConvModule if use_depthwise else Conv2dModule - self.conv1 = conv(in_channels, hidden_channels, 3, stride=1, padding=1, norm_cfg=norm_cfg, act_cfg=act_cfg) + self.conv1 = conv( + in_channels, + hidden_channels, + 3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + activation_callable=activation_callable, + ) self.conv2 = DepthwiseSeparableConvModule( hidden_channels, out_channels, @@ -138,7 +142,7 @@ def __init__( stride=1, padding=kernel_size // 2, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.add_identity = add_identity and in_channels == out_channels @@ -159,7 +163,8 @@ class RepVggBlock(nn.Module): Args: ch_in (int): The input channels of this Module. ch_out (int): The output channels of this Module. - act_cfg (dict[str, str] | None): Config dict for activation layer. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to None. norm_cfg (dict[str, str] | None): Config dict for normalization layer. 
""" @@ -167,16 +172,16 @@ def __init__( self, ch_in: int, ch_out: int, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = None, norm_cfg: dict[str, str] | None = None, ) -> None: """Initialize RepVggBlock.""" super().__init__() self.ch_in = ch_in self.ch_out = ch_out - self.conv1 = Conv2dModule(ch_in, ch_out, 3, 1, padding=1, act_cfg=None, norm_cfg=norm_cfg) - self.conv2 = Conv2dModule(ch_in, ch_out, 1, 1, act_cfg=None, norm_cfg=norm_cfg) - self.act = nn.Identity() if act_cfg is None else build_activation_layer(act_cfg) + self.conv1 = Conv2dModule(ch_in, ch_out, 3, 1, padding=1, activation_callable=None, norm_cfg=norm_cfg) + self.conv2 = Conv2dModule(ch_in, ch_out, 1, 1, activation_callable=None, norm_cfg=norm_cfg) + self.act = activation_callable() if activation_callable else nn.Identity() def forward(self, x: Tensor) -> Tensor: """Forward function.""" @@ -230,8 +235,8 @@ class CSPLayer(BaseModule): stage. Defaults to True. norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN') - act_cfg (dict): Config dict for activation layer. - Defaults to dict(type='Swish') + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `Swish`. init_cfg (dict or list[dict], optional): Initialization config dict. Defaults to None. """ @@ -247,15 +252,12 @@ def __init__( use_cspnext_block: bool = False, channel_attention: bool = False, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = Swish, init_cfg: dict | list[dict] | None = None, ) -> None: if norm_cfg is None: norm_cfg = {"type": "BN", "momentum": 0.03, "eps": 0.001} - if act_cfg is None: - act_cfg = {"type": "Swish"} - super().__init__(init_cfg=init_cfg) block = CSPNeXtBlock if use_cspnext_block else DarknetBottleneck @@ -266,21 +268,21 @@ def __init__( mid_channels, 1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.short_conv = Conv2dModule( in_channels, mid_channels, 1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.final_conv = Conv2dModule( 2 * mid_channels, out_channels, 1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.blocks = nn.Sequential( @@ -292,7 +294,7 @@ def __init__( add_identity, use_depthwise, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) for _ in range(num_blocks) ], @@ -325,7 +327,7 @@ class CSPRepLayer(nn.Module): hidden layer. Defaults to 1.0. bias (bool): Whether to use bias in the convolution layer. Defaults to False. - act_cfg (dict[str, str] | None): Config dict for activation layer. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. Defaults to None. norm_cfg (dict[str, str] | None): Config dict for normalization layer. Defaults to None. 
@@ -338,17 +340,38 @@ def __init__( num_blocks: int = 3, expansion: float = 1.0, bias: bool = False, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = None, norm_cfg: dict[str, str] | None = None, ) -> None: """Initialize CSPRepLayer.""" super().__init__() hidden_channels = int(out_channels * expansion) - self.conv1 = Conv2dModule(in_channels, hidden_channels, 1, 1, bias=bias, act_cfg=act_cfg, norm_cfg=norm_cfg) - self.conv2 = Conv2dModule(in_channels, hidden_channels, 1, 1, bias=bias, act_cfg=act_cfg, norm_cfg=norm_cfg) + self.conv1 = Conv2dModule( + in_channels, + hidden_channels, + 1, + 1, + bias=bias, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ) + self.conv2 = Conv2dModule( + in_channels, + hidden_channels, + 1, + 1, + bias=bias, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ) self.bottlenecks = nn.Sequential( *[ - RepVggBlock(hidden_channels, hidden_channels, act_cfg=act_cfg, norm_cfg=norm_cfg) + RepVggBlock( + hidden_channels, + hidden_channels, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ) for _ in range(num_blocks) ], ) @@ -359,7 +382,7 @@ def __init__( 1, 1, bias=bias, - act_cfg=act_cfg, + activation_callable=activation_callable, norm_cfg=norm_cfg, ) else: diff --git a/src/otx/algo/detection/necks/cspnext_pafpn.py b/src/otx/algo/detection/necks/cspnext_pafpn.py index 4b10101557d..82c91bb9c70 100644 --- a/src/otx/algo/detection/necks/cspnext_pafpn.py +++ b/src/otx/algo/detection/necks/cspnext_pafpn.py @@ -11,11 +11,13 @@ from __future__ import annotations import math +from typing import Callable import torch from torch import Tensor, nn from otx.algo.detection.layers import CSPLayer +from otx.algo.modules.activation import Swish from otx.algo.modules.base_module import BaseModule from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule @@ -31,7 +33,8 @@ class CSPNeXtPAFPN(BaseModule): expand_ratio (float): Ratio to adjust the number of channels of the hidden layer. Default: 0.5 upsample_cfg (dict): Config dict for interpolate layer. Default: `dict(scale_factor=2, mode='nearest')` norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN') - act_cfg (dict): Config dict for activation layer. Default: dict(type='Swish') + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `Swish`. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None. 
""" @@ -44,12 +47,11 @@ def __init__( expand_ratio: float = 0.5, upsample_cfg: dict | None = None, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = Swish, init_cfg: dict | None = None, ) -> None: upsample_cfg = upsample_cfg or {"scale_factor": 2, "mode": "nearest"} norm_cfg = norm_cfg or {"type": "BN", "momentum": 0.03, "eps": 0.001} - act_cfg = act_cfg or {"type": "Swish"} init_cfg = init_cfg or { "type": "Kaiming", "layer": "Conv2d", @@ -76,7 +78,7 @@ def __init__( in_channels[idx - 1], 1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) self.top_down_blocks.append( @@ -89,7 +91,7 @@ def __init__( use_cspnext_block=True, expand_ratio=expand_ratio, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) @@ -105,7 +107,7 @@ def __init__( stride=2, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) self.bottom_up_blocks.append( @@ -118,14 +120,21 @@ def __init__( use_cspnext_block=True, expand_ratio=expand_ratio, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) self.out_convs = nn.ModuleList() for i in range(len(in_channels)): self.out_convs.append( - conv(in_channels[i], out_channels, 3, padding=1, norm_cfg=norm_cfg, act_cfg=act_cfg), + conv( + in_channels[i], + out_channels, + 3, + padding=1, + norm_cfg=norm_cfg, + activation_callable=activation_callable, + ), ) def forward(self, inputs: tuple[Tensor, ...]) -> tuple[Tensor, ...]: diff --git a/src/otx/algo/detection/necks/fpn.py b/src/otx/algo/detection/necks/fpn.py index 1d6c32355e0..4239be3944b 100644 --- a/src/otx/algo/detection/necks/fpn.py +++ b/src/otx/algo/detection/necks/fpn.py @@ -10,6 +10,8 @@ from __future__ import annotations +from typing import Callable + from torch import Tensor, nn from otx.algo.modules.base_module import BaseModule @@ -45,8 +47,8 @@ class FPN(BaseModule): Defaults to False. norm_cfg (dict, optional): Config dict for normalization layer. Defaults to None. - act_cfg (dict, optional): Config dict for - activation layer in ConvModule. Defaults to None. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to None. upsample_cfg (dict, optional): Config dict for interpolate layer. Defaults to dict(mode='nearest'). init_cfg (dict, list[dict], optional): Initialization config dict. 
@@ -63,7 +65,7 @@ def __init__( relu_before_extra_convs: bool = False, no_norm_on_lateral: bool = False, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = None, upsample_cfg: dict | None = None, init_cfg: dict | list[dict] | None = None, ) -> None: @@ -103,7 +105,7 @@ def __init__( out_channels, 1, norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, - act_cfg=act_cfg, + activation_callable=activation_callable, inplace=False, ) fpn_conv = Conv2dModule( @@ -112,7 +114,7 @@ def __init__( 3, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, inplace=False, ) @@ -134,7 +136,7 @@ def __init__( stride=2, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, inplace=False, ) self.fpn_convs.append(extra_fpn_conv) diff --git a/src/otx/algo/detection/necks/hybrid_encoder.py b/src/otx/algo/detection/necks/hybrid_encoder.py index dc8879ada33..c93d7fb842c 100644 --- a/src/otx/algo/detection/necks/hybrid_encoder.py +++ b/src/otx/algo/detection/necks/hybrid_encoder.py @@ -6,12 +6,13 @@ from __future__ import annotations import copy +from typing import Callable import torch from torch import nn from otx.algo.detection.layers import CSPRepLayer -from otx.algo.modules import Conv2dModule, build_activation_layer +from otx.algo.modules import Conv2dModule from otx.algo.modules.base_module import BaseModule __all__ = ["HybridEncoder"] @@ -25,7 +26,7 @@ def __init__( nhead: int, dim_feedforward: int = 2048, dropout: float = 0.1, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, normalize_before: bool = False, ) -> None: super().__init__() @@ -41,8 +42,7 @@ def __init__( self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) - act_cfg = act_cfg if act_cfg is not None else {"type": "GELU"} - self.activation = build_activation_layer(act_cfg) + self.activation = activation_callable() @staticmethod def with_pos_embed(tensor: torch.Tensor, pos_embed: torch.Tensor | None) -> torch.Tensor: @@ -111,8 +111,8 @@ class HybridEncoder(BaseModule): dim_feedforward (int, optional): Dimension of the feedforward network in the transformer encoder. Defaults to 1024. dropout (float, optional): Dropout rate. Defaults to 0.0. - enc_act_cfg (dict[str, str] | None, optional): Activation configuration - for the encoder. Defaults to None. + enc_activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. norm_cfg (dict[str, str] | None, optional): Normalization configuration. Defaults to None. use_encoder_idx (list[int], optional): List of indices of the encoder to use. @@ -125,8 +125,8 @@ class HybridEncoder(BaseModule): Defaults to 1.0. depth_mult (float, optional): Depth multiplier for the CSPRepLayer. Defaults to 1.0. - act_cfg (dict[str, str] | None, optional): Activation configuration - for the CSPRepLayer. Defaults to None. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.SiLU`. eval_spatial_size (tuple[int, int] | None, optional): Spatial size for evaluation. Defaults to None. 
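A hedged sketch of the encoder-side knob, assuming the transformer layer defined at the top of hybrid_encoder.py keeps its `TransformerEncoderLayer` name (the class header is not visible in this hunk) and using an illustrative 256-dim model:

    from torch import nn

    # Class name and sizes below are assumptions for illustration.
    from otx.algo.detection.necks.hybrid_encoder import TransformerEncoderLayer

    # nn.GELU is now the baked-in default; overriding it is just passing
    # another module class, with no {"type": "GELU"} lookup involved.
    enc_layer = TransformerEncoderLayer(d_model=256, nhead=8, activation_callable=nn.ReLU)
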
""" @@ -139,14 +139,14 @@ def __init__( nhead: int = 8, dim_feedforward: int = 1024, dropout: float = 0.0, - enc_act_cfg: dict[str, str] | None = None, + enc_activation_callable: Callable[..., nn.Module] = nn.GELU, norm_cfg: dict[str, str] | None = None, use_encoder_idx: list[int] = [2], # noqa: B006 num_encoder_layers: int = 1, pe_temperature: float = 10000, expansion: float = 1.0, depth_mult: float = 1.0, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.SiLU, eval_spatial_size: tuple[int, int] | None = None, ) -> None: """Initialize the HybridEncoder module.""" @@ -161,8 +161,6 @@ def __init__( self.out_channels = [hidden_dim for _ in range(len(in_channels))] self.out_strides = feat_strides - enc_act_cfg = enc_act_cfg if enc_act_cfg is not None else {"type": "GELU"} - act_cfg = act_cfg if act_cfg is not None else {"type": "SiLU"} norm_cfg = norm_cfg if norm_cfg is not None else {"type": "BN", "name": "norm"} # channel projection self.input_proj = nn.ModuleList() @@ -180,7 +178,7 @@ def __init__( nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, - act_cfg=enc_act_cfg, + activation_callable=enc_activation_callable, ) self.encoder = nn.ModuleList( @@ -191,13 +189,15 @@ def __init__( self.lateral_convs = nn.ModuleList() self.fpn_blocks = nn.ModuleList() for _ in range(len(in_channels) - 1, 0, -1): - self.lateral_convs.append(Conv2dModule(hidden_dim, hidden_dim, 1, 1, act_cfg=act_cfg, norm_cfg=norm_cfg)) + self.lateral_convs.append( + Conv2dModule(hidden_dim, hidden_dim, 1, 1, activation_callable=activation_callable, norm_cfg=norm_cfg), + ) self.fpn_blocks.append( CSPRepLayer( hidden_dim * 2, hidden_dim, round(3 * depth_mult), - act_cfg=act_cfg, + activation_callable=activation_callable, expansion=expansion, norm_cfg=norm_cfg, ), @@ -208,14 +208,22 @@ def __init__( self.pan_blocks = nn.ModuleList() for _ in range(len(in_channels) - 1): self.downsample_convs.append( - Conv2dModule(hidden_dim, hidden_dim, 3, 2, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg), + Conv2dModule( + hidden_dim, + hidden_dim, + 3, + 2, + padding=1, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ), ) self.pan_blocks.append( CSPRepLayer( hidden_dim * 2, hidden_dim, round(3 * depth_mult), - act_cfg=act_cfg, + activation_callable=activation_callable, expansion=expansion, norm_cfg=norm_cfg, ), diff --git a/src/otx/algo/detection/necks/yolox_pafpn.py b/src/otx/algo/detection/necks/yolox_pafpn.py index 762d6c36852..68b99e27caa 100644 --- a/src/otx/algo/detection/necks/yolox_pafpn.py +++ b/src/otx/algo/detection/necks/yolox_pafpn.py @@ -9,12 +9,13 @@ from __future__ import annotations import math -from typing import Any +from typing import Any, Callable import torch from torch import Tensor, nn from otx.algo.detection.layers import CSPLayer +from otx.algo.modules.activation import Swish from otx.algo.modules.base_module import BaseModule from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule @@ -32,8 +33,8 @@ class YOLOXPAFPN(BaseModule): Default: `dict(scale_factor=2, mode='nearest')` norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN') - act_cfg (dict): Config dict for activation layer. - Default: dict(type='Swish') + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.Swish`. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None. 
""" @@ -46,12 +47,11 @@ def __init__( use_depthwise: bool = False, upsample_cfg: dict | None = None, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = Swish, init_cfg: dict | list[dict] | None = None, ): upsample_cfg = upsample_cfg or {"scale_factor": 2, "mode": "nearest"} norm_cfg = norm_cfg or {"type": "BN", "momentum": 0.03, "eps": 0.001} - act_cfg = act_cfg or {"type": "Swish"} init_cfg = init_cfg or { "type": "Kaiming", "layer": "Conv2d", @@ -79,7 +79,7 @@ def __init__( in_channels[idx - 1], 1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) self.top_down_blocks.append( @@ -90,7 +90,7 @@ def __init__( add_identity=False, use_depthwise=use_depthwise, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) @@ -106,7 +106,7 @@ def __init__( stride=2, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) self.bottom_up_blocks.append( @@ -117,14 +117,20 @@ def __init__( add_identity=False, use_depthwise=use_depthwise, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) self.out_convs = nn.ModuleList() for i in range(len(in_channels)): self.out_convs.append( - Conv2dModule(in_channels[i], out_channels, 1, norm_cfg=norm_cfg, act_cfg=act_cfg), + Conv2dModule( + in_channels[i], + out_channels, + 1, + norm_cfg=norm_cfg, + activation_callable=activation_callable, + ), ) def forward(self, inputs: tuple[Tensor]) -> tuple[Any, ...]: diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py index 051df93020c..254ad2aff4e 100644 --- a/src/otx/algo/detection/rtmdet.py +++ b/src/otx/algo/detection/rtmdet.py @@ -5,6 +5,10 @@ from __future__ import annotations +from functools import partial + +from torch import nn + from otx.algo.common.backbones import CSPNeXt from otx.algo.common.losses import GIoULoss, QualityFocalLoss from otx.algo.common.losses.cross_entropy_loss import CrossEntropyLoss @@ -90,7 +94,7 @@ def _build_model(self, num_classes: int) -> RTMDet: deepen_factor=0.167, widen_factor=0.375, norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), ) neck = CSPNeXtPAFPN( @@ -98,7 +102,7 @@ def _build_model(self, num_classes: int) -> RTMDet: out_channels=96, num_csp_blocks=1, norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), ) bbox_head = RTMDetSepBNHead( @@ -113,7 +117,7 @@ def _build_model(self, num_classes: int) -> RTMDet: loss_bbox=GIoULoss(loss_weight=2.0), loss_centerness=CrossEntropyLoss(use_sigmoid=True, loss_weight=1.0), norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), train_cfg=train_cfg, test_cfg=test_cfg, ) diff --git a/src/otx/algo/instance_segmentation/backbones/swin.py b/src/otx/algo/instance_segmentation/backbones/swin.py index 4eb85af6362..b249dfdb75c 100644 --- a/src/otx/algo/instance_segmentation/backbones/swin.py +++ b/src/otx/algo/instance_segmentation/backbones/swin.py @@ -12,6 +12,7 @@ from collections import OrderedDict from copy import deepcopy from pathlib import Path +from typing import Callable import torch import torch.nn.functional @@ -317,8 +318,8 @@ class SwinBlock(BaseModule): drop_rate (float, optional): Dropout rate. Default: 0. attn_drop_rate (float, optional): Attention dropout rate. Default: 0. 
drop_path_rate (float, optional): Stochastic depth rate. Default: 0. - act_cfg (dict, optional): The config dict of activation function. - Default: dict(type='GELU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. norm_cfg (dict, optional): The config dict of normalization. Default: dict(type='LN'). with_cp (bool, optional): Use checkpoint or not. Using checkpoint @@ -340,7 +341,7 @@ def __init__( drop_rate: float = 0.0, attn_drop_rate: float = 0.0, drop_path_rate: float = 0.0, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, norm_cfg: dict | None = None, with_cp: bool = False, init_cfg: None = None, @@ -350,7 +351,6 @@ def __init__( self.init_cfg = init_cfg self.with_cp = with_cp - act_cfg = act_cfg if act_cfg is not None else {"type": "GELU"} norm_cfg = norm_cfg if norm_cfg is not None else {"type": "LN"} self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] @@ -374,7 +374,7 @@ def __init__( num_fcs=2, ffn_drop=drop_rate, dropout_layer={"type": "DropPath", "drop_prob": drop_path_rate}, - act_cfg=act_cfg, + activation_callable=activation_callable, add_identity=True, init_cfg=None, ) @@ -415,8 +415,8 @@ class SwinBlockSequence(BaseModule): rate. Default: 0. downsample (BaseModule | None, optional): The downsample operation module. Default: None. - act_cfg (dict, optional): The config dict of activation function. - Default: dict(type='GELU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. norm_cfg (dict, optional): The config dict of normalization. Default: dict(type='LN'). with_cp (bool, optional): Use checkpoint or not. Using checkpoint @@ -439,14 +439,13 @@ def __init__( attn_drop_rate: float = 0.0, drop_path_rate: list[float] | float = 0.0, downsample: BaseModule | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, norm_cfg: dict | None = None, with_cp: bool = False, init_cfg: None = None, ): super().__init__(init_cfg=init_cfg) - act_cfg = act_cfg if act_cfg is not None else {"type": "GELU"} norm_cfg = norm_cfg if norm_cfg is not None else {"type": "LN"} if isinstance(drop_path_rate, list): @@ -470,7 +469,7 @@ def __init__( drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=drop_path_rates[i], - act_cfg=act_cfg, + activation_callable=activation_callable, norm_cfg=norm_cfg, with_cp=with_cp, init_cfg=None, @@ -528,8 +527,8 @@ class SwinTransformer(BaseModule): drop_rate (float): Dropout rate. Defaults: 0. attn_drop_rate (float): Attention dropout rate. Default: 0. drop_path_rate (float): Stochastic depth rate. Defaults: 0.1. - act_cfg (dict): Config dict for activation layer. - Default: dict(type='GELU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. norm_cfg (dict): Config dict for normalization layer at output of backone. Defaults: dict(type='LN'). with_cp (bool, optional): Use checkpoint or not. 
Using checkpoint @@ -564,7 +563,7 @@ def __init__( drop_rate: float = 0.0, attn_drop_rate: float = 0.0, drop_path_rate: float = 0.1, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, norm_cfg: dict | None = None, with_cp: bool = False, pretrained: str | None = None, @@ -572,7 +571,6 @@ def __init__( frozen_stages: int = -1, init_cfg: dict | None = None, ): - act_cfg = act_cfg if act_cfg is not None else {"type": "GELU"} norm_cfg = norm_cfg if norm_cfg is not None else {"type": "LN"} self.convert_weights = convert_weights self.frozen_stages = frozen_stages @@ -650,7 +648,7 @@ def __init__( attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[sum(depths[:i]) : sum(depths[: i + 1])], downsample=downsample, - act_cfg=act_cfg, + activation_callable=activation_callable, norm_cfg=norm_cfg, with_cp=with_cp, init_cfg=None, diff --git a/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py b/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py index 9d46627a43b..87c5c3dbc99 100644 --- a/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py +++ b/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py @@ -10,6 +10,8 @@ import copy import math +from functools import partial +from typing import Callable import numpy as np import torch @@ -109,7 +111,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) pred_pad_size = self.pred_kernel_size // 2 @@ -125,7 +127,7 @@ def _init_layers(self) -> None: stacked_convs=4, num_levels=len(self.prior_generator.strides), num_prototypes=self.num_prototypes, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, norm_cfg=self.norm_cfg, ) @@ -702,16 +704,16 @@ class MaskFeatModule(BaseModule): Args: in_channels (int): Number of channels in the input feature map. feat_channels (int): Number of hidden channels of the mask feature - map branch. + map branch. num_levels (int): The starting feature map level from RPN that - will be used to predict the mask feature map. + will be used to predict the mask feature map. num_prototypes (int): Number of output channel of the mask feature - map branch. This is the channel count of the mask - feature map that to be dynamically convolved with the predicted - kernel. + map branch. This is the channel count of the mask + feature map that to be dynamically convolved with the predicted + kernel. stacked_convs (int): Number of convs in mask feature branch. - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU', inplace=True) + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `partial(nn.ReLU, inplace=True)`. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN'). 
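A usage sketch of the new defaults in this head (the channel counts below are invented; parameter names follow the docstring above): the old dict default `{"type": "ReLU", "inplace": True}` is now expressed directly as `partial(nn.ReLU, inplace=True)`.

    from functools import partial

    from torch import nn
    from otx.algo.instance_segmentation.heads.rtmdet_ins_head import MaskFeatModule

    # Uses the class default partial(nn.ReLU, inplace=True).
    mask_feat = MaskFeatModule(in_channels=96, feat_channels=96)

    # Overriding with a different activation and its keyword arguments.
    mask_feat_silu = MaskFeatModule(
        in_channels=96,
        feat_channels=96,
        activation_callable=partial(nn.SiLU, inplace=True),
    )
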
""" @@ -722,14 +724,11 @@ def __init__( stacked_convs: int = 4, num_levels: int = 3, num_prototypes: int = 8, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = partial(nn.ReLU, inplace=True), norm_cfg: dict | None = None, ) -> None: super().__init__(init_cfg=None) - if act_cfg is None: - act_cfg = {"type": "ReLU", "inplace": True} - if norm_cfg is None: norm_cfg = {"type": "BN"} @@ -738,7 +737,16 @@ def __init__( convs = [] for i in range(stacked_convs): in_c = in_channels if i == 0 else feat_channels - convs.append(Conv2dModule(in_c, feat_channels, 3, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg)) + convs.append( + Conv2dModule( + in_c, + feat_channels, + 3, + padding=1, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ), + ) self.stacked_convs = nn.Sequential(*convs) self.projection = nn.Conv2d(feat_channels, num_prototypes, kernel_size=1) @@ -768,8 +776,8 @@ class RTMDetInsSepBNHead(RTMDetInsHead): Defaults to True. norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Defaults to dict(type='SiLU', inplace=True). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `partial(nn.SiLU, inplace=True)`. pred_kernel_size (int): Kernel size of prediction layer. Defaults to 1. """ @@ -780,21 +788,19 @@ def __init__( share_conv: bool = True, with_objectness: bool = False, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = partial(nn.SiLU, inplace=True), pred_kernel_size: int = 1, **kwargs, ) -> None: if norm_cfg is None: norm_cfg = {"type": "BN", "requires_grad": True} - if act_cfg is None: - act_cfg = {"type": "SiLU", "inplace": True} self.share_conv = share_conv super().__init__( num_classes, in_channels, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, pred_kernel_size=pred_kernel_size, with_objectness=with_objectness, **kwargs, @@ -849,7 +855,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) reg_convs.append( @@ -860,7 +866,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) kernel_convs.append( @@ -871,7 +877,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) self.cls_convs.append(cls_convs) @@ -907,7 +913,7 @@ def _init_layers(self) -> None: stacked_convs=4, num_levels=len(self.prior_generator.strides), num_prototypes=self.num_prototypes, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, norm_cfg=self.norm_cfg, ) diff --git a/src/otx/algo/instance_segmentation/maskrcnn.py b/src/otx/algo/instance_segmentation/maskrcnn.py index 6a9f95c9974..0f6796aa791 100644 --- a/src/otx/algo/instance_segmentation/maskrcnn.py +++ b/src/otx/algo/instance_segmentation/maskrcnn.py @@ -7,6 +7,7 @@ from typing import Any +from torch import nn from torchvision.ops import RoIAlign from otx.algo.common.backbones import ResNet, build_model_including_pytorchcv @@ -330,7 +331,7 @@ def _build_model(self, num_classes: int) -> TwoStageDetector: "out_indices": [2, 3, 4, 5], "frozen_stages": -1, "pretrained": True, - "activation_cfg": {"type": "torch_swish"}, + "activation_callable": nn.SiLU, "norm_cfg": {"type": "BN", 
"requires_grad": True}, }, ) diff --git a/src/otx/algo/instance_segmentation/necks/fpn.py b/src/otx/algo/instance_segmentation/necks/fpn.py index 67286814f89..005b80cfe91 100644 --- a/src/otx/algo/instance_segmentation/necks/fpn.py +++ b/src/otx/algo/instance_segmentation/necks/fpn.py @@ -8,6 +8,8 @@ from __future__ import annotations +from typing import Callable + import torch.nn.functional from torch import Tensor, nn @@ -36,8 +38,8 @@ class FPN(BaseModule): Defaults to False. norm_cfg (dict, optional): Config dict for normalization layer. Defaults to None. - act_cfg (dict, optional): Config dict for - activation layer in ConvModule. Defaults to None. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to None. upsample_cfg (dict, optional): Config dict for interpolate layer. Defaults to dict(mode='nearest'). init_cfg (dict or list[dict]): Initialization config dict. @@ -53,7 +55,7 @@ def __init__( relu_before_extra_convs: bool = False, no_norm_on_lateral: bool = False, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = None, upsample_cfg: dict | None = None, init_cfg: dict | list[dict] | None = None, ) -> None: @@ -97,7 +99,7 @@ def __init__( out_channels, 1, norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, - act_cfg=act_cfg, + activation_callable=activation_callable, inplace=False, ) fpn_conv = Conv2dModule( @@ -106,7 +108,7 @@ def __init__( 3, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, inplace=False, ) diff --git a/src/otx/algo/instance_segmentation/rtmdet_inst.py b/src/otx/algo/instance_segmentation/rtmdet_inst.py index 53f028e0f9e..5f0a97215d0 100644 --- a/src/otx/algo/instance_segmentation/rtmdet_inst.py +++ b/src/otx/algo/instance_segmentation/rtmdet_inst.py @@ -5,8 +5,11 @@ from __future__ import annotations +from functools import partial from typing import TYPE_CHECKING +from torch import nn + from otx.algo.common.backbones import CSPNeXt from otx.algo.common.losses import CrossEntropyLoss, GIoULoss, QualityFocalLoss from otx.algo.common.utils.assigners import DynamicSoftLabelAssigner @@ -112,7 +115,7 @@ def _build_model(self, num_classes: int) -> SingleStageDetector: widen_factor=0.375, channel_attention=True, norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), ) neck = CSPNeXtPAFPN( @@ -121,7 +124,7 @@ def _build_model(self, num_classes: int) -> SingleStageDetector: num_csp_blocks=1, expand_ratio=0.5, norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), ) bbox_head = RTMDetInsSepBNHead( @@ -131,7 +134,7 @@ def _build_model(self, num_classes: int) -> SingleStageDetector: share_conv=True, pred_kernel_size=1, feat_channels=96, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), norm_cfg={"type": "BN", "requires_grad": True}, anchor_generator=MlvlPointGenerator( offset=0, diff --git a/src/otx/algo/keypoint_detection/rtmpose.py b/src/otx/algo/keypoint_detection/rtmpose.py index c552580b557..0086acd80dd 100644 --- a/src/otx/algo/keypoint_detection/rtmpose.py +++ b/src/otx/algo/keypoint_detection/rtmpose.py @@ -5,6 +5,7 @@ from __future__ import annotations +from functools import partial from typing import TYPE_CHECKING from otx.algo.common.backbones import CSPNeXt @@ -13,6 +14,7 @@ from otx.algo.keypoint_detection.topdown import 
TopdownPoseEstimator from otx.core.exporter.native import OTXNativeModelExporter from otx.core.model.keypoint_detection import OTXKeypointDetectionModel +from torch import nn if TYPE_CHECKING: from otx.core.exporter.base import OTXModelExporter @@ -77,7 +79,7 @@ def _build_model(self, num_classes: int) -> RTMPose: out_indices=(4,), channel_attention=True, norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), ) head = RTMCCHead( out_channels=num_classes, diff --git a/src/otx/algo/modules/__init__.py b/src/otx/algo/modules/__init__.py index 605f47c67e0..ddf0be601e4 100644 --- a/src/otx/algo/modules/__init__.py +++ b/src/otx/algo/modules/__init__.py @@ -1,16 +1,13 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -# Copyright (c) OpenMMLab. All rights reserved. -"""This module implementation is a code implementation copied or replaced from mmcv.cnn.bricks.""" +"""Common module implementations.""" -from .activation import build_activation_layer from .conv_module import Conv2dModule, Conv3dModule, DepthwiseSeparableConvModule from .norm import FrozenBatchNorm2d, build_norm_layer from .padding import build_padding_layer __all__ = [ - "build_activation_layer", "build_padding_layer", "build_norm_layer", "Conv2dModule", diff --git a/src/otx/algo/modules/activation.py b/src/otx/algo/modules/activation.py index cc3a1e95080..81249243dd1 100644 --- a/src/otx/algo/modules/activation.py +++ b/src/otx/algo/modules/activation.py @@ -2,13 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) OpenMMLab. All rights reserved. -"""This implementation replaces the functionality of mmcv.cnn.bricks.activation.build_activation_layer.""" -from __future__ import annotations +"""Custom activation implementation copied from mmcv.cnn.bricks.swish.py.""" -import copy +from __future__ import annotations import torch -from torch import nn +from torch import Tensor, nn class Swish(nn.Module): @@ -23,52 +22,13 @@ class Swish(nn.Module): Tensor: The output tensor. """ - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: Tensor) -> Tensor: """Forward function. Args: - x (torch.Tensor): The input tensor. + x (Tensor): The input tensor. Returns: - torch.Tensor: The output tensor. + Tensor: The output tensor. """ return x * torch.sigmoid(x) - - -ACTIVATION_DICT = { - "ReLU": nn.ReLU, - "LeakyReLU": nn.LeakyReLU, - "PReLU": nn.PReLU, - "RReLU": nn.RReLU, - "ReLU6": nn.ReLU6, - "ELU": nn.ELU, - "Sigmoid": nn.Sigmoid, - "Tanh": nn.Tanh, - "SiLU": nn.SiLU, - "GELU": nn.GELU, - "Swish": Swish, -} - - -def build_activation_layer(cfg: dict) -> nn.Module: - """Build activation layer. - - Args: - cfg (dict): The activation layer config, which should contain: - - - type (str): Layer type. - - layer args: Args needed to instantiate an activation layer. - - Returns: - nn.Module: Created activation layer. 
- """ - _cfg = copy.deepcopy(cfg) - activation_type = _cfg.pop("type", None) - if activation_type is None: - msg = "The cfg dict must contain the key 'type'" - raise KeyError(msg) - if activation_type not in ACTIVATION_DICT: - msg = f"Cannot find {activation_type} in {ACTIVATION_DICT.keys()}" - raise KeyError(msg) - - return ACTIVATION_DICT[activation_type](**_cfg) diff --git a/src/otx/algo/modules/conv_module.py b/src/otx/algo/modules/conv_module.py index 8fa9d6764d7..aa5e51879ba 100644 --- a/src/otx/algo/modules/conv_module.py +++ b/src/otx/algo/modules/conv_module.py @@ -2,14 +2,15 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) OpenMMLab. All rights reserved. -"""This implementation copied ConvModule of mmcv.cnn.bricks.ConvModule.""" +"""This implementation modified ConvModule of mmcv.cnn.bricks.ConvModule.""" # TODO(someone): Revisit mypy errors after deprecation of mmlab from __future__ import annotations import warnings -from typing import TYPE_CHECKING +from functools import partial +from typing import TYPE_CHECKING, Callable from torch import Tensor, nn from torch.nn.modules.batchnorm import _BatchNorm as BatchNorm @@ -17,7 +18,6 @@ from otx.algo.utils.weight_init import constant_init, kaiming_init -from .activation import build_activation_layer from .norm import build_norm_layer from .padding import build_padding_layer @@ -25,12 +25,37 @@ from torch.nn.modules.conv import _ConvNd as ConvNd +AVAILABLE_ACTIVATION_LIST: list[str] = [ + "ReLU", + "LeakyReLU", + "PReLU", + "RReLU", + "ReLU6", + "ELU", + "Sigmoid", + "Tanh", + "SiLU", + "GELU", + "Swish", +] + +ACTIVATION_LIST_NOT_SUPPORTING_INPLACE: list[str] = [ + "Tanh", + "PReLU", + "Sigmoid", + "HSigmoid", + "Swish", + "GELU", + "SiLU", +] + + class ConvModule(nn.Module): """A conv block that bundles conv/norm/activation layers. This block simplifies the usage of convolution layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). - It is based upon two build methods: `build_norm_layer()` and `build_activation_layer()`. + It is based upon a build method: `build_norm_layer()`. Besides, we add some additional features in this module. 1. Automatically set `bias` of the conv layer. @@ -57,8 +82,8 @@ class ConvModule(nn.Module): norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise False. Default: "auto". norm_cfg (dict): Config dict for normalization layer. Default: None. - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.ReLU`. inplace (bool): Whether to use inplace mode for activation. Default: True. with_spectral_norm (bool): Whether use spectral norm in conv module. 
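A small sketch of how the refactored module behaves (layer sizes are made up; `Conv2dModule` is the 2D variant exported from this package):

    from torch import nn

    from otx.algo.modules import Conv2dModule
    from otx.algo.modules.activation import Swish

    # Conv + BN + SiLU, with the activation passed as a class.
    conv = Conv2dModule(3, 16, 3, padding=1, norm_cfg={"type": "BN"}, activation_callable=nn.SiLU)

    # The local Swish module is handed over the same way.
    conv_swish = Conv2dModule(16, 16, 3, padding=1, activation_callable=Swish)

    # Activations outside AVAILABLE_ACTIVATION_LIST are rejected up front:
    # Conv2dModule(3, 16, 3, activation_callable=nn.Hardswish)  # -> ValueError

    # For activations that accept it (e.g. nn.ReLU), the module's `inplace`
    # flag is pushed onto the built activation instead of an act_cfg entry.
    relu_conv = Conv2dModule(16, 16, 3, padding=1, activation_callable=nn.ReLU, inplace=False)
    print(relu_conv.activation.inplace)  # False
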
@@ -84,7 +109,7 @@ def __init__( groups: int = 1, bias: bool | str = "auto", norm_cfg: dict | None = None, - act_cfg: dict | None = {"type": "ReLU"}, # noqa: B006 + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, inplace: bool = True, with_spectral_norm: bool = False, padding_mode: str = "zeros", @@ -93,13 +118,11 @@ def __init__( assert norm_cfg is None or isinstance(norm_cfg, dict) # noqa: S101 official_padding_mode = ["zeros", "circular"] self.norm_cfg = norm_cfg - self.act_cfg = act_cfg self.inplace = inplace self.with_spectral_norm = with_spectral_norm self.with_explicit_padding = padding_mode not in official_padding_mode self.with_norm = norm_cfg is not None - self.with_activation = act_cfg is not None # if the conv layer is before a norm layer, bias is unnecessary. if bias == "auto": bias = not self.with_norm @@ -148,24 +171,48 @@ def __init__( self.norm_name = None # type: ignore[assignment] # build activation layer - if self.with_activation: - act_cfg_ = act_cfg.copy() # type: ignore[union-attr] - # nn.Tanh has no 'inplace' argument - if act_cfg_["type"] not in [ - "Tanh", - "PReLU", - "Sigmoid", - "HSigmoid", - "Swish", - "GELU", - "SiLU", - ]: - act_cfg_.setdefault("inplace", inplace) - self.activate = build_activation_layer(act_cfg_) + self.activation: nn.Module | None = None + self._with_activation: bool | None = None + if activation_callable is not None: + if ( + isinstance(activation_callable, partial) + and activation_callable.func.__name__ not in AVAILABLE_ACTIVATION_LIST + ): + msg = f"Unsupported activation: {activation_callable.func.__name__}." + raise ValueError(msg) + + if ( + not isinstance(activation_callable, partial) + and activation_callable.__name__ not in AVAILABLE_ACTIVATION_LIST + ): + msg = f"Unsupported activation: {activation_callable.__name__}." + raise ValueError(msg) + + self.activation = activation_callable() + + # update inplace + if self.activation.__class__.__name__ not in ACTIVATION_LIST_NOT_SUPPORTING_INPLACE: + self.activation.inplace = inplace # Use msra init by default self.init_weights() + @property + def with_activation(self) -> bool: + """Whether the conv module has activation.""" + if self._with_activation is not None: + # src/otx/algo/segmentation/heads/fcn_head.py L144 + return self._with_activation + return self.activation is not None + + @with_activation.setter + def with_activation(self, value: bool) -> None: + """Setter for with_activation. + + For src/otx/algo/segmentation/heads/fcn_head.py L144. + """ + self._with_activation = value + @property def norm_layer(self) -> nn.Module | None: """Get the normalization layer. @@ -189,9 +236,9 @@ def init_weights(self) -> None: # Note: For PyTorch's conv layers, they will be overwritten by our # initialization implementation using default ``kaiming_init``. 
if not hasattr(self.conv, "init_weights"): - if self.with_activation and self.act_cfg["type"] == "LeakyReLU": # type: ignore[index] + if self.with_activation and isinstance(self.activation, nn.LeakyReLU): nonlinearity = "leaky_relu" - a = self.act_cfg.get("negative_slope", 0.01) # type: ignore[union-attr] + a = getattr(self.activation, "negative_slop", 0.01) else: nonlinearity = "relu" a = 0 @@ -216,7 +263,7 @@ def forward(self, x: Tensor, activate: bool = True, norm: bool = True) -> Tensor if norm and self.with_norm: x = self.norm_layer(x) # type: ignore[misc] if activate and self.with_activation: - x = self.activate(x) + x = self.activation(x) # type: ignore[misc] return x @@ -230,7 +277,7 @@ class DepthwiseSeparableConvModule(nn.Module): conv block contains depthwise-conv/norm/activation layers. The pointwise conv block contains pointwise-conv/norm/activation layers. It should be noted that there will be norm/activation layer in the depthwise conv block - if `norm_cfg` and `act_cfg` are specified. + if `norm_cfg` and `activation_callable` are specified. Args: in_channels (int): Number of channels in the input feature map. @@ -247,16 +294,19 @@ class DepthwiseSeparableConvModule(nn.Module): Same as that in ``nn._ConvNd``. Default: 1. norm_cfg (dict): Default norm config for both depthwise ConvModule and pointwise ConvModule. Default: None. - act_cfg (dict): Default activation config for both depthwise ConvModule - and pointwise ConvModule. Default: dict(type='ReLU'). + activation_callable (Callable[..., nn.Module]): Activation layer module + for both depthwise ConvModule and pointwise ConvModule. + Defaults to `nn.ReLU`. dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is None, it will be the same as `norm_cfg`. Default: None. - dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is - None, it will be the same as `act_cfg`. Default: None. + dw_activation_callable (Callable[..., nn.Module] | None): Activation layer module of depthwise ConvModule. + If it is None, it will be the same as `activation_callable`. + Defaults to None. pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is None, it will be the same as `norm_cfg`. Default: None. - pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is - None, it will be the same as `act_cfg`. Default: None. + pw_activation_callable (Callable[..., nn.Module] | None): Activation layer module of pointwise ConvModule. + If it is None, it will be the same as `activation_callable`. + Defaults to None. kwargs (optional): Other shared arguments for depthwise and pointwise ConvModule. See ConvModule for ref. """ @@ -270,16 +320,13 @@ def __init__( padding: int | tuple[int, int] = 0, dilation: int | tuple[int, int] = 1, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.ReLU, dw_norm_cfg: dict | None = None, - dw_act_cfg: dict | None = None, + dw_activation_callable: Callable[..., nn.Module] | None = None, pw_norm_cfg: dict | None = None, - pw_act_cfg: dict | None = None, + pw_activation_callable: Callable[..., nn.Module] | None = None, **kwargs, ): - if act_cfg is None: - act_cfg = {"type": "ReLU"} - super().__init__() if "groups" in kwargs: msg = "groups should not be specified in DepthwiseSeparableConvModule." @@ -288,9 +335,9 @@ def __init__( # if norm/activation config of depthwise/pointwise Conv2dModule is not # specified, use default config. 
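A sketch of the depthwise/pointwise fallback described above (channel counts are invented; the `depthwise_conv`/`pointwise_conv` attribute names follow the surrounding implementation): `dw_activation_callable`/`pw_activation_callable` left as None fall back to `activation_callable`, mirroring the old `dw_act_cfg`/`pw_act_cfg` -> `act_cfg` fallback.

    from torch import nn

    from otx.algo.modules import DepthwiseSeparableConvModule

    dsconv = DepthwiseSeparableConvModule(
        32,
        64,
        3,
        padding=1,
        norm_cfg={"type": "BN"},
        activation_callable=nn.ReLU,
        pw_activation_callable=nn.SiLU,  # only the pointwise conv uses SiLU
    )
    print(type(dsconv.depthwise_conv.activation).__name__)  # ReLU
    print(type(dsconv.pointwise_conv.activation).__name__)  # SiLU
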
dw_norm_cfg = dw_norm_cfg or norm_cfg - dw_act_cfg = dw_act_cfg or act_cfg + dw_activation_callable = dw_activation_callable or activation_callable pw_norm_cfg = pw_norm_cfg or norm_cfg - pw_act_cfg = pw_act_cfg or act_cfg + pw_activation_callable = pw_activation_callable or activation_callable # depthwise convolution self.depthwise_conv = Conv2dModule( @@ -302,7 +349,7 @@ def __init__( dilation=dilation, groups=in_channels, norm_cfg=dw_norm_cfg, - act_cfg=dw_act_cfg, + activation_callable=dw_activation_callable, **kwargs, ) @@ -311,7 +358,7 @@ def __init__( out_channels, 1, norm_cfg=pw_norm_cfg, - act_cfg=pw_act_cfg, + activation_callable=pw_activation_callable, **kwargs, ) diff --git a/src/otx/algo/modules/transformer.py b/src/otx/algo/modules/transformer.py index 46cdd96a943..28bbfe76728 100644 --- a/src/otx/algo/modules/transformer.py +++ b/src/otx/algo/modules/transformer.py @@ -7,13 +7,14 @@ from __future__ import annotations import math +from functools import partial +from typing import Callable import torch from torch import nn from otx.algo.modules.base_module import BaseModule, Sequential -from .activation import build_activation_layer from .drop import build_dropout from .norm import build_norm_layer @@ -252,8 +253,8 @@ class FFN(BaseModule): Defaults: 1024. num_fcs (int, optional): The number of fully-connected layers in FFNs. Default: 2. - act_cfg (dict, optional): The activation config for FFNs. - Default: dict(type='ReLU') + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `partial(nn.ReLU, inplace=True)`. ffn_drop (float, optional): Probability of an element to be zeroed in FFN. Default 0.0. add_identity (bool, optional): Whether to add the @@ -269,7 +270,7 @@ def __init__( embed_dims: int = 256, feedforward_channels: int = 1024, num_fcs: int = 2, - act_cfg: dict = {"type": "ReLU", "inplace": True}, # noqa: B006 + activation_callable: Callable[..., nn.Module] = partial(nn.ReLU, inplace=True), ffn_drop: float = 0.0, dropout_layer: dict | None = None, add_identity: bool = True, @@ -289,7 +290,7 @@ def __init__( layers.append( Sequential( nn.Linear(in_channels, feedforward_channels), - build_activation_layer(act_cfg), + activation_callable(), nn.Dropout(ffn_drop), ), ) diff --git a/src/otx/algo/segmentation/backbones/litehrnet.py b/src/otx/algo/segmentation/backbones/litehrnet.py index 48e359862bd..a47a4571bf1 100644 --- a/src/otx/algo/segmentation/backbones/litehrnet.py +++ b/src/otx/algo/segmentation/backbones/litehrnet.py @@ -10,6 +10,7 @@ from __future__ import annotations from pathlib import Path +from typing import Callable import torch import torch.utils.checkpoint as cp @@ -61,7 +62,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), Conv2dModule( self.key_channels, @@ -71,7 +72,7 @@ def __init__( padding=(self.kernel_size - 1) // 2, groups=self.key_channels, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), Conv2dModule( in_channels=self.key_channels, @@ -79,7 +80,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), ) self.value = nn.Sequential( @@ -89,7 +90,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), nn.Unfold(kernel_size=self.kernel_size, stride=1, padding=1), ) @@ -99,7 +100,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ) def forward(self, x: torch.Tensor) -> 
torch.Tensor: @@ -117,30 +118,34 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class CrossResolutionWeighting(nn.Module): - """Cross resolution weighting.""" + """Cross resolution weighting. + + Args: + channels (list[int]): Number of channels for each stage. + ratio (int): Reduction ratio of the bottleneck block. + norm_cfg (dict | None): Config dict for normalization layer. Default: None + activation_callable (Callable[..., nn.Module] | tuple[Callable[..., nn.Module], Callable[..., nn.Module]]): \ + Activation layer module or a tuple of activation layer modules. + Defaults to (`nn.ReLU`, `nn.Sigmoid`). + """ def __init__( self, channels: list[int], ratio: int = 16, norm_cfg: dict | None = None, - act_cfg: dict | tuple[dict, dict] = ({"type": "ReLU"}, {"type": "Sigmoid"}), + activation_callable: Callable[..., nn.Module] | tuple[Callable[..., nn.Module], Callable[..., nn.Module]] = ( + nn.ReLU, + nn.Sigmoid, + ), ) -> None: - """Cross resolution weighting. - - Args: - channels (list[int]): Number of channels for each stage. - ratio (int): Reduction ratio of the bottleneck block. - norm_cfg (dict | None): Config dict for normalization layer. Default: None - act_cfg (dict | tuple[dict, dict]): Config dict or a tuple of config dicts for activation layer(s). - Default: ({"type": "ReLU"}, {"type": "Sigmoid"}). - """ super().__init__() - if isinstance(act_cfg, dict): - act_cfg = (act_cfg, act_cfg) - if len(act_cfg) != 2: - msg = "act_cfg must be a dict or a tuple of dicts of length 2." + if callable(activation_callable): + activation_callable = (activation_callable, activation_callable) + + if len(activation_callable) != 2: + msg = "activation_callable must be a callable or a tuple of callables of length 2." raise ValueError(msg) self.channels = channels @@ -152,7 +157,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg=act_cfg[0], + activation_callable=activation_callable[0], ) self.conv2 = Conv2dModule( in_channels=int(total_channel / ratio), @@ -160,7 +165,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg=act_cfg[1], + activation_callable=activation_callable[1], ) def forward(self, x: torch.Tensor) -> list[torch.Tensor]: @@ -177,35 +182,38 @@ def forward(self, x: torch.Tensor) -> list[torch.Tensor]: class SpatialWeighting(nn.Module): - """Spatial weighting.""" + """Spatial weighting. + + Args: + channels (int): Number of input channels. + ratio (int): Reduction ratio for the bottleneck block. Default: 16. + activation_callable (Callable[..., nn.Module] | tuple[Callable[..., nn.Module], Callable[..., nn.Module]]): \ + Activation layer module or a tuple of activation layer modules. + If a single module is provided, it will be used for both activation layers. + Defaults to (`nn.ReLU`, `nn.Sigmoid`). + + Raises: + ValueError: activation_callable must be a callable or a tuple of callables of length 2. + TypeError: If activation_callable is not a callable or a tuple of callables. + """ def __init__( self, channels: int, ratio: int = 16, - norm_cfg: dict | None = None, - act_cfg: dict | tuple[dict, dict] = ({"type": "ReLU"}, {"type": "Sigmoid"}), - enable_norm: bool = False, + activation_callable: Callable[..., nn.Module] | tuple[Callable[..., nn.Module], Callable[..., nn.Module]] = ( + nn.ReLU, + nn.Sigmoid, + ), + **kwargs, ) -> None: - """Spatial weighting. - - Args: - channels (int): Number of input channels. - ratio (int): Reduction ratio for the bottleneck block. Default: 16. 
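The weighting blocks now accept either one callable (reused for both 1x1 convs) or a 2-tuple (squeeze activation, then gating activation). A hedged sketch with an invented channel count:

    from torch import nn

    from otx.algo.segmentation.backbones.litehrnet import SpatialWeighting

    # Explicit form of the default: ReLU after the squeeze conv, Sigmoid as the gate.
    sw = SpatialWeighting(channels=64, ratio=16, activation_callable=(nn.ReLU, nn.Sigmoid))

    # A single callable is duplicated for both convs.
    sw_relu_only = SpatialWeighting(channels=64, activation_callable=nn.ReLU)
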
- act_cfg (dict | tuple[dict]): Configuration dict or tuple of dicts for - activation layers. If a single dict is provided, it will be used for - both activation layers. Default: ({"type": "ReLU"}, {"type": "Sigmoid"}). - - Raises: - ValueError: act_cfg must be a dict or a tuple of dicts of length 2. - TypeError: If act_cfg is not a dict or a tuple of dicts. - """ super().__init__() - if isinstance(act_cfg, dict): - act_cfg = (act_cfg, act_cfg) - if len(act_cfg) != 2: - msg = "act_cfg must be a dict or a tuple of dicts of length 2." + if callable(activation_callable): + activation_callable = (activation_callable, activation_callable) + + if len(activation_callable) != 2: + msg = "activation_callable must be a callable or a tuple of callables of length 2." raise ValueError(msg) self.global_avgpool = nn.AdaptiveAvgPool2d(1) @@ -214,14 +222,14 @@ def __init__( out_channels=int(channels / ratio), kernel_size=1, stride=1, - act_cfg=act_cfg[0], + activation_callable=activation_callable[0], ) self.conv2 = Conv2dModule( in_channels=int(channels / ratio), out_channels=channels, kernel_size=1, stride=1, - act_cfg=act_cfg[1], + activation_callable=activation_callable[1], ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -264,7 +272,7 @@ def __init__( stride=1, bias=False, norm_cfg=norm_cfg if enable_norm else None, - act_cfg=None, + activation_callable=None, ) self.q_channel = Conv2dModule( in_channels=self.in_channels, @@ -273,7 +281,7 @@ def __init__( stride=1, bias=False, norm_cfg=norm_cfg if enable_norm else None, - act_cfg=None, + activation_callable=None, ) self.out_channel = Conv2dModule( in_channels=self.internal_channels, @@ -281,7 +289,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg={"type": "Sigmoid"}, + activation_callable=nn.Sigmoid, ) # spatial-only branch @@ -292,7 +300,7 @@ def __init__( stride=1, bias=False, norm_cfg=norm_cfg if enable_norm else None, - act_cfg=None, + activation_callable=None, ) self.q_spatial = Conv2dModule( in_channels=self.in_channels, @@ -301,7 +309,7 @@ def __init__( stride=1, bias=False, norm_cfg=norm_cfg if enable_norm else None, - act_cfg=None, + activation_callable=None, ) self.global_avgpool = nn.AdaptiveAvgPool2d(1) @@ -420,7 +428,7 @@ def __init__( padding=dw_ksize // 2, groups=channel, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ) for channel in branch_channels ], @@ -550,7 +558,7 @@ def __init__( stride=strides[0], padding=1, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.conv2 = None @@ -562,7 +570,7 @@ def __init__( stride=2, padding=1, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) mid_channels = int(round(stem_channels * expand_ratio)) @@ -581,7 +589,7 @@ def __init__( padding=1, groups=branch_channels, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), Conv2dModule( branch_channels, @@ -590,7 +598,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), ) @@ -601,7 +609,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.depthwise_conv = Conv2dModule( mid_channels, @@ -611,7 +619,7 @@ def __init__( padding=1, groups=mid_channels, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ) self.linear_conv = Conv2dModule( mid_channels, @@ -620,7 +628,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + 
activation_callable=nn.ReLU, ) def _inner_forward(self, x: torch.Tensor) -> torch.Tensor: @@ -723,7 +731,7 @@ def __init__( stride=strides[0], padding=1, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.conv2 = None @@ -735,7 +743,7 @@ def __init__( stride=2, padding=1, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) mid_channels = int(round(stem_channels * expand_ratio)) @@ -754,7 +762,7 @@ def __init__( padding=1, groups=internal_branch_channels, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), Conv2dModule( internal_branch_channels, @@ -763,7 +771,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), ), ) @@ -777,7 +785,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), Conv2dModule( mid_channels, @@ -787,7 +795,7 @@ def __init__( padding=1, groups=mid_channels, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), Conv2dModule( mid_channels, @@ -796,7 +804,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), ), ) @@ -836,7 +844,19 @@ def forward(self, x: torch.Tensor) -> list[torch.Tensor]: class ShuffleUnit(nn.Module): - """InvertedResidual block for ShuffleNetV2 backbone.""" + """InvertedResidual block for ShuffleNetV2 backbone. + + Args: + in_channels (int): The input channels of the block. + out_channels (int): The output channels of the block. + stride (int): Stride of the 3x3 convolution layer. Default: 1 + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.ReLU`. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ def __init__( self, @@ -844,29 +864,13 @@ def __init__( out_channels: int, stride: int = 1, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.ReLU, with_cp: bool = False, ) -> None: - """InvertedResidual block for ShuffleNetV2 backbone. - - Args: - in_channels (int): The input channels of the block. - out_channels (int): The output channels of the block. - stride (int): Stride of the 3x3 convolution layer. Default: 1 - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
- - """ super().__init__() if norm_cfg is None: norm_cfg = {"type": "BN"} - if act_cfg is None: - act_cfg = {"type": "ReLU"} self.stride = stride self.with_cp = with_cp @@ -890,7 +894,7 @@ def __init__( padding=1, groups=in_channels, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), Conv2dModule( in_channels, @@ -899,7 +903,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) @@ -911,7 +915,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), Conv2dModule( branch_features, @@ -921,7 +925,7 @@ def __init__( padding=1, groups=branch_features, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), Conv2dModule( branch_features, @@ -930,7 +934,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) @@ -1047,7 +1051,7 @@ def _make_one_branch(self, branch_index: int, num_blocks: int, stride: int = 1) self.in_channels[branch_index], stride=stride, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, with_cp=self.with_cp, ), ] + [ @@ -1056,7 +1060,7 @@ def _make_one_branch(self, branch_index: int, num_blocks: int, stride: int = 1) self.in_channels[branch_index], stride=1, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, with_cp=self.with_cp, ) for _ in range(1, num_blocks) @@ -1283,7 +1287,7 @@ def __init__( stride=1, padding=0, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), ) in_modules_channels = out_modules_channels @@ -1320,7 +1324,7 @@ def __init__( padding=1, groups=self.stem.out_channels, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), Conv2dModule( self.stem.out_channels, @@ -1329,7 +1333,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), ) diff --git a/src/otx/algo/segmentation/backbones/mscan.py b/src/otx/algo/segmentation/backbones/mscan.py index fd10ea7b432..415655bf8ca 100644 --- a/src/otx/algo/segmentation/backbones/mscan.py +++ b/src/otx/algo/segmentation/backbones/mscan.py @@ -6,12 +6,12 @@ from __future__ import annotations from pathlib import Path -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Callable import torch from torch import nn -from otx.algo.modules import build_activation_layer, build_norm_layer +from otx.algo.modules import build_norm_layer from otx.algo.modules.base_module import BaseModule from otx.algo.utils.mmengine_utils import load_checkpoint_to_model, load_from_http @@ -62,8 +62,8 @@ class Mlp(BaseModule): Defaults: None. out_features (int): The dimension of output features. Defaults: None. - act_cfg (dict): Config dict for activation layer in block. - Default: dict(type='GELU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. drop (float): The number of dropout rate in MLP block. Defaults: 0.0. """ @@ -73,30 +73,16 @@ def __init__( in_features: int, hidden_features: int | None = None, out_features: int | None = None, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, drop: float = 0.0, ) -> None: - """Initializes the MLP module. - - Args: - in_features (int): The dimension of the input features. - hidden_features (Optional[int]): The dimension of the hidden features. - Defaults to None. 
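A brief usage sketch of the simplified MSCAN blocks (feature sizes below are illustrative): with `build_activation_layer` gone, the MLP simply instantiates whatever callable it is given, defaulting to `nn.GELU`.

    from torch import nn

    from otx.algo.segmentation.backbones.mscan import Mlp

    mlp = Mlp(in_features=64, hidden_features=256)                               # GELU default
    mlp_relu = Mlp(in_features=64, hidden_features=256, activation_callable=nn.ReLU)
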
- out_features (Optional[int]): The dimension of the output features. - Defaults to None. - act_cfg (Dict[str, str] | None): Config dict for the activation layer in the block. - Defaults to {"type": "GELU"} if None. - drop (float): The dropout rate in the MLP block. - Defaults to 0.0. - """ + """Initializes the MLP module.""" super().__init__() - if act_cfg is None: - act_cfg = {"type": "GELU"} out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Conv2d(in_features, hidden_features, 1) self.dwconv = nn.Conv2d(hidden_features, hidden_features, 3, 1, 1, bias=True, groups=hidden_features) - self.act = build_activation_layer(act_cfg) + self.act = activation_callable() self.fc2 = nn.Conv2d(hidden_features, out_features, 1) self.drop = nn.Dropout(drop) @@ -118,8 +104,8 @@ class StemConv(BaseModule): Args: in_channels (int): The dimension of input channels. out_channels (int): The dimension of output channels. - act_cfg (dict): Config dict for activation layer in block. - Default: dict(type='GELU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. norm_cfg (dict): Config dict for normalization layer. Defaults: dict(type='SyncBN', requires_grad=True). """ @@ -128,29 +114,17 @@ def __init__( self, in_channels: int, out_channels: int, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, norm_cfg: dict[str, str | bool] | None = None, ) -> None: - """Stem Block at the beginning of Semantic Branch. - - Args: - in_channels (int): The dimension of input channels. - out_channels (int): The dimension of output channels. - act_cfg (Dict[str, str] | None): Config dict for activation layer in block. - Default: dict(type='GELU') if None. - norm_cfg (Dict[str, Union[str, bool]] | None): Config dict for normalization layer. - Defaults: dict(type='SyncBN', requires_grad=True) if None. - """ super().__init__() - if act_cfg is None: - act_cfg = {"type": "GELU"} if norm_cfg is None: norm_cfg = {"type": "SyncBN", "requires_grad": True} self.proj = nn.Sequential( nn.Conv2d(in_channels, out_channels // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), build_norm_layer(norm_cfg, out_channels // 2)[1], - build_activation_layer(act_cfg), + activation_callable(), nn.Conv2d(out_channels // 2, out_channels, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), build_norm_layer(norm_cfg, out_channels)[1], ) @@ -218,28 +192,27 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class MSCASpatialAttention(BaseModule): - """Spatial Attention Module in Multi-Scale Convolutional Attention Module (MSCA).""" + """Spatial Attention Module in Multi-Scale Convolutional Attention Module (MSCA). + + Args: + in_channels (int): The number of input channels. + attention_kernel_sizes (List[Union[int, List[int]]]): The size of attention kernels. + attention_kernel_paddings (List[Union[int, List[int]]]): The paddings of attention kernels. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. + """ def __init__( self, in_channels: int, attention_kernel_sizes: list[int | list[int]] = [5, [1, 7], [1, 11], [1, 21]], # noqa: B006 attention_kernel_paddings: list[int | list[int]] = [2, [0, 3], [0, 5], [0, 10]], # noqa: B006 - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, ) -> None: - """Init the MSCASpatialAttention module. - - Args: - in_channels (int): The number of input channels. 
- attention_kernel_sizes (List[Union[int, List[int]]]): The size of attention kernels. - attention_kernel_paddings (List[Union[int, List[int]]]): The paddings of attention kernels. - act_cfg (Dict[str, str] | None): The config of activation layer. - """ + """Init the MSCASpatialAttention module.""" super().__init__() - if act_cfg is None: - act_cfg = {"type": "GELU"} self.proj_1 = nn.Conv2d(in_channels, in_channels, 1) # type: nn.Conv2d - self.activation = build_activation_layer(act_cfg) # type: nn.Module + self.activation = activation_callable() # type: nn.Module self.spatial_gating_unit = MSCAAttention(in_channels, attention_kernel_sizes, attention_kernel_paddings) # type: MSCAAttention self.proj_2 = nn.Conv2d(in_channels, in_channels, 1) # type: nn.Conv2d @@ -260,6 +233,17 @@ class MSCABlock(BaseModule): attention. In each branch, it uses two depth-wise strip convolutions to approximate standard depth-wise convolutions with large kernels. The kernel size for each branch is set to 7, 11, and 21, respectively. + + Args: + channels (int): The number of input channels. + attention_kernel_sizes (List[Union[int, List[int]]]): The size of attention kernels. + attention_kernel_paddings (List[Union[int, List[int]]]): The paddings of attention kernels. + mlp_ratio (float): The ratio of the number of hidden units in the MLP to the number of input channels. + drop (float): The dropout rate. + drop_path (float): The dropout rate for the path. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. + norm_cfg (Dict[str, Union[str, bool]] | None): The config of normalization layer. """ def __init__( @@ -270,32 +254,29 @@ def __init__( mlp_ratio: float = 4.0, drop: float = 0.0, drop_path: float = 0.0, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, norm_cfg: dict[str, str | bool] | None = None, ) -> None: - """Initialize a MSCABlock. - - Args: - channels (int): The number of input channels. - attention_kernel_sizes (List[Union[int, List[int]]]): The size of attention kernels. - attention_kernel_paddings (List[Union[int, List[int]]]): The paddings of attention kernels. - mlp_ratio (float): The ratio of the number of hidden units in the MLP to the number of input channels. - drop (float): The dropout rate. - drop_path (float): The dropout rate for the path. - act_cfg (Dict[str, str] | None): The config of activation layer. - norm_cfg (Dict[str, Union[str, bool]] | None): The config of normalization layer. 
- """ + """Initialize a MSCABlock.""" super().__init__() - if act_cfg is None: - act_cfg = {"type": "GELU"} if norm_cfg is None: norm_cfg = {"type": "SyncBN", "requires_grad": True} self.norm1 = build_norm_layer(norm_cfg, channels)[1] # type: nn.Module - self.attn = MSCASpatialAttention(channels, attention_kernel_sizes, attention_kernel_paddings, act_cfg) # type: MSCAAttention + self.attn = MSCASpatialAttention( + channels, + attention_kernel_sizes, + attention_kernel_paddings, + activation_callable, + ) # type: MSCAAttention self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() # type: nn.Module self.norm2 = build_norm_layer(norm_cfg, channels)[1] # type: nn.Module mlp_hidden_channels = int(channels * mlp_ratio) # type: int - self.mlp = Mlp(in_features=channels, hidden_features=mlp_hidden_channels, act_cfg=act_cfg, drop=drop) # type: Mlp + self.mlp = Mlp( + in_features=channels, + hidden_features=mlp_hidden_channels, + activation_callable=activation_callable, + drop=drop, + ) # type: Mlp layer_scale_init_value = 1e-2 # type: float self.layer_scale_1 = nn.Parameter(layer_scale_init_value * torch.ones(channels), requires_grad=True) # type: nn.Parameter self.layer_scale_2 = nn.Parameter(layer_scale_init_value * torch.ones(channels), requires_grad=True) # type: nn.Parameter @@ -355,6 +336,25 @@ class MSCAN(BaseModule): Convolutional Attention Design for Semantic Segmentation `_. Inspiration from https://github.com/visual-attention-network/segnext. + + Args: + in_channels (int): The number of input channels. Defaults to 3. + embed_dims (List[int]): Embedding dimension. Defaults to [64, 128, 256, 512]. + mlp_ratios (List[int]): Ratio of mlp hidden dim to embedding dim. Defaults to [4, 4, 4, 4]. + drop_rate (float): Dropout rate. Defaults to 0.0. + drop_path_rate (float): Stochastic depth rate. Defaults to 0.0. + depths (List[int]): Depths of each Swin Transformer stage. Defaults to [3, 4, 6, 3]. + num_stages (int): MSCAN stages. Defaults to 4. + attention_kernel_sizes (List[Union[int, List[int]]]): Size of attention kernel in + Attention Module (Figure 2(b) of original paper). Defaults to [5, [1, 7], [1, 11], [1, 21]]. + attention_kernel_paddings (List[Union[int, List[int]]]): Size of attention paddings + in Attention Module (Figure 2(b) of original paper). Defaults to [2, [0, 3], [0, 5], [0, 10]]. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. + norm_cfg (Dict[str, Union[str, bool]] | None): Config dict for normalization layer. + Defaults to dict(type='SyncBN', requires_grad=True) if None. + init_cfg (Optional[Union[Dict[str, str], List[Dict[str, str]]]]): Initialization config dict. + Defaults to None. """ def __init__( @@ -368,35 +368,13 @@ def __init__( num_stages: int = 4, attention_kernel_sizes: list[int | list[int]] = [5, [1, 7], [1, 11], [1, 21]], # noqa: B006 attention_kernel_paddings: list[int | list[int]] = [2, [0, 3], [0, 5], [0, 10]], # noqa: B006 - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, norm_cfg: dict[str, str | bool] | None = None, init_cfg: dict[str, str] | list[dict[str, str]] | None = None, pretrained_weights: str | None = None, ) -> None: - """Initialize a MSCAN backbone. - - Args: - in_channels (int): The number of input channels. Defaults to 3. - embed_dims (List[int]): Embedding dimension. Defaults to [64, 128, 256, 512]. - mlp_ratios (List[int]): Ratio of mlp hidden dim to embedding dim. Defaults to [4, 4, 4, 4]. 
- drop_rate (float): Dropout rate. Defaults to 0.0. - drop_path_rate (float): Stochastic depth rate. Defaults to 0.0. - depths (List[int]): Depths of each Swin Transformer stage. Defaults to [3, 4, 6, 3]. - num_stages (int): MSCAN stages. Defaults to 4. - attention_kernel_sizes (List[Union[int, List[int]]]): Size of attention kernel in - Attention Module (Figure 2(b) of original paper). Defaults to [5, [1, 7], [1, 11], [1, 21]]. - attention_kernel_paddings (List[Union[int, List[int]]]): Size of attention paddings - in Attention Module (Figure 2(b) of original paper). Defaults to [2, [0, 3], [0, 5], [0, 10]]. - act_cfg (Dict[str, str] | None): Config dict for activation layer in block. - Defaults to dict(type='GELU') if None. - norm_cfg (Dict[str, Union[str, bool]] | None): Config dict for normalization layer. - Defaults to dict(type='SyncBN', requires_grad=True) if None. - init_cfg (Optional[Union[Dict[str, str], List[Dict[str, str]]]]): Initialization config dict. - Defaults to None. - """ + """Initialize a MSCAN backbone.""" super().__init__(init_cfg=init_cfg) - if act_cfg is None: - act_cfg = {"type": "GELU"} if norm_cfg is None: norm_cfg = {"type": "SyncBN", "requires_grad": True} @@ -426,7 +404,7 @@ def __init__( mlp_ratio=mlp_ratios[i], drop=drop_rate, drop_path=dpr[cur + j], - act_cfg=act_cfg, + activation_callable=activation_callable, norm_cfg=norm_cfg, ) for j in range(depths[i]) diff --git a/src/otx/algo/segmentation/heads/base_segm_head.py b/src/otx/algo/segmentation/heads/base_segm_head.py index 8547b0233dc..419fea64071 100644 --- a/src/otx/algo/segmentation/heads/base_segm_head.py +++ b/src/otx/algo/segmentation/heads/base_segm_head.py @@ -7,6 +7,7 @@ from abc import ABCMeta, abstractmethod from pathlib import Path +from typing import Callable import torch from torch import nn @@ -16,7 +17,23 @@ class BaseSegmHead(nn.Module, metaclass=ABCMeta): - """Base class for segmentation heads.""" + """Base class for segmentation heads. + + Args: + in_channels (int | list[int]): Number of input channels. + channels (int): Number of channels in the feature map. + num_classes (int): Number of classes for segmentation. + dropout_ratio (float, optional): The dropout ratio. Defaults to 0.1. + norm_cfg (Optional[ConfigType], optional): Config for normalization layer. + Defaults to None. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `nn.ReLU`. + in_index (int, list[int], optional): Input index. Defaults to -1. + input_transform (Optional[str], optional): Input transform type. + Defaults to None. + ignore_index (int, optional): The index to be ignored. Defaults to 255. + align_corners (bool, optional): Whether to align corners. Defaults to False. + """ def __init__( self, @@ -25,39 +42,21 @@ def __init__( num_classes: int, dropout_ratio: float = 0.1, norm_cfg: dict[str, str] | None = None, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, in_index: int | list[int] = -1, input_transform: str | None = None, ignore_index: int = 255, align_corners: bool = False, pretrained_weights: str | None = None, ) -> None: - """Initialize the BaseSegmHead. - - Args: - in_channels (int | list[int]): Number of input channels. - channels (int): Number of channels in the feature map. - num_classes (int): Number of classes for segmentation. - dropout_ratio (float, optional): The dropout ratio. Defaults to 0.1. - norm_cfg (Optional[ConfigType], optional): Config for normalization layer. - Defaults to None. 
- act_cfg (Dict[str, Union[str, Dict]], optional): Activation config. - Defaults to dict(type='ReLU'). - in_index (int, list[int], optional): Input index. Defaults to -1. - input_transform (Optional[str], optional): Input transform type. - Defaults to None. - ignore_index (int, optional): The index to be ignored. Defaults to 255. - align_corners (bool, optional): Whether to align corners. Defaults to False. - """ + """Initialize the BaseSegmHead.""" super().__init__() - if act_cfg is None: - act_cfg = {"type": "ReLU"} self.channels = channels self.num_classes = num_classes self.input_transform = input_transform self.dropout_ratio = dropout_ratio self.norm_cfg = norm_cfg - self.act_cfg = act_cfg + self.activation_callable = activation_callable if self.input_transform is not None and not isinstance(in_index, list): msg = f'"in_index" expects a list, but got {type(in_index)}' raise TypeError(msg) diff --git a/src/otx/algo/segmentation/heads/fcn_head.py b/src/otx/algo/segmentation/heads/fcn_head.py index c6d8316c59f..da79e2db239 100644 --- a/src/otx/algo/segmentation/heads/fcn_head.py +++ b/src/otx/algo/segmentation/heads/fcn_head.py @@ -109,7 +109,7 @@ def __init__( padding=conv_padding, dilation=dilation, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ] convs.extend( @@ -121,7 +121,7 @@ def __init__( padding=conv_padding, dilation=dilation, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ) for _ in range(num_convs - 1) ], @@ -137,12 +137,12 @@ def __init__( kernel_size=kernel_size, padding=kernel_size // 2, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ) - if self.act_cfg: + if self.activation_callable: self.convs[-1].with_activation = False - delattr(self.convs[-1], "activate") # why we delete last activation? + delattr(self.convs[-1], "activation") # why we delete last activation? def _forward_feature(self, inputs: Tensor) -> Tensor: """Forward function for feature maps. 
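
The segmentation-head hunks above all apply the same mechanical substitution, so the call pattern is summarized below as a minimal sketch. It relies only on the `Conv2dModule` signature exercised by `tests/unit/algo/modules/test_conv_module.py` later in this patch; the channel counts and kernel sizes are arbitrary placeholders, not values taken from the modules being refactored.

from functools import partial

from torch import nn

from otx.algo.modules.conv_module import Conv2dModule

# Previously the activation was configured with a dict:
#     Conv2dModule(16, 16, 3, padding=1, norm_cfg={"type": "BN"}, act_cfg={"type": "ReLU"})
# Now the activation is passed as a callable that builds the layer:
conv = Conv2dModule(16, 16, 3, padding=1, norm_cfg={"type": "BN"}, activation_callable=nn.ReLU)

# Keyword arguments such as `inplace` are bound with functools.partial,
# the same pattern the RTMDet head tests later in this patch use for nn.SiLU:
conv_leaky = Conv2dModule(16, 16, 3, padding=1, activation_callable=partial(nn.LeakyReLU, inplace=True))

# `None` disables the activation entirely, mirroring the old `act_cfg=None`:
conv_plain = Conv2dModule(16, 16, 3, padding=1, activation_callable=None)
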
diff --git a/src/otx/algo/segmentation/heads/ham_head.py b/src/otx/algo/segmentation/heads/ham_head.py index 52f789808b0..cd079752a15 100644 --- a/src/otx/algo/segmentation/heads/ham_head.py +++ b/src/otx/algo/segmentation/heads/ham_head.py @@ -45,11 +45,11 @@ def __init__( """ super().__init__() - self.ham_in = Conv2dModule(ham_channels, ham_channels, 1, norm_cfg=None, act_cfg=None) + self.ham_in = Conv2dModule(ham_channels, ham_channels, 1, norm_cfg=None, activation_callable=None) self.ham = NMF2D(ham_channels=ham_channels, **ham_kwargs) - self.ham_out = Conv2dModule(ham_channels, ham_channels, 1, norm_cfg=norm_cfg, act_cfg=None) + self.ham_out = Conv2dModule(ham_channels, ham_channels, 1, norm_cfg=norm_cfg, activation_callable=None) def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward.""" @@ -102,7 +102,7 @@ def __init__( self.ham_channels, 1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ) self.hamburger = Hamburger(self.ham_channels, ham_kwargs=self.ham_kwargs, **kwargs) @@ -112,7 +112,7 @@ def __init__( self.channels, 1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ) def forward(self, inputs: list[torch.Tensor]) -> torch.Tensor: diff --git a/src/otx/algo/segmentation/modules/aggregators.py b/src/otx/algo/segmentation/modules/aggregators.py index bff23694b50..0b5e78debc9 100644 --- a/src/otx/algo/segmentation/modules/aggregators.py +++ b/src/otx/algo/segmentation/modules/aggregators.py @@ -68,7 +68,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), ) @@ -85,9 +85,9 @@ def __init__( stride=1, padding=1, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, - dw_act_cfg=None, - pw_act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, + dw_activation_callable=None, + pw_activation_callable=nn.ReLU, ), ) @@ -99,7 +99,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), ) else: diff --git a/src/otx/algo/segmentation/modules/blocks.py b/src/otx/algo/segmentation/modules/blocks.py index 86d049b1da4..240924ab476 100644 --- a/src/otx/algo/segmentation/modules/blocks.py +++ b/src/otx/algo/segmentation/modules/blocks.py @@ -73,7 +73,7 @@ def __init__( stride=1, padding=0, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.key_psp = PSPModule(psp_size, method="max") @@ -84,7 +84,7 @@ def __init__( stride=1, padding=0, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.value_psp = PSPModule(psp_size, method="max") @@ -95,7 +95,7 @@ def __init__( stride=1, padding=0, norm_cfg=self.norm_cfg, - act_cfg=None, + activation_callable=None, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -171,7 +171,7 @@ def __init__(self, num_channels: int, norm_cfg: dict | None = None): padding=1, groups=self.num_channels, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.dwconv2 = Conv2dModule( in_channels=self.num_channels, @@ -181,7 +181,7 @@ def __init__(self, num_channels: int, norm_cfg: dict | None = None): padding=1, groups=self.num_channels, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.dwconv3 = Conv2dModule( in_channels=self.num_channels, @@ -191,7 +191,7 @@ def __init__(self, num_channels: int, norm_cfg: dict | None = None): padding=1, groups=self.num_channels, norm_cfg=self.norm_cfg, - 
act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.sigmoid_spatial = nn.Sigmoid() diff --git a/src/otx/algo/segmentation/segnext.py b/src/otx/algo/segmentation/segnext.py index c3d2ca86fb3..0c2eaff739b 100644 --- a/src/otx/algo/segmentation/segnext.py +++ b/src/otx/algo/segmentation/segnext.py @@ -2,9 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 # """SegNext model implementations.""" + from __future__ import annotations -from typing import TYPE_CHECKING, Any, ClassVar +from typing import Any, ClassVar + +from torch import nn from otx.algo.segmentation.backbones import MSCAN from otx.algo.segmentation.heads import LightHamHead @@ -13,15 +16,12 @@ from .base_model import BaseSegmModel -if TYPE_CHECKING: - from torch import nn - class SegNextB(BaseSegmModel): """SegNextB Model.""" default_backbone_configuration: ClassVar[dict[str, Any]] = { - "act_cfg": {"type": "GELU"}, + "activation_callable": nn.GELU, "attention_kernel_paddings": [2, [0, 3], [0, 5], [0, 10]], "attention_kernel_sizes": [5, [1, 7], [1, 11], [1, 21]], "depths": [3, 3, 12, 3], @@ -48,7 +48,7 @@ class SegNextS(BaseSegmModel): """SegNextS Model.""" default_backbone_configuration: ClassVar[dict[str, Any]] = { - "act_cfg": {"type": "GELU"}, + "activation_callable": nn.GELU, "attention_kernel_paddings": [2, [0, 3], [0, 5], [0, 10]], "attention_kernel_sizes": [5, [1, 7], [1, 11], [1, 21]], "depths": [2, 2, 4, 2], @@ -75,7 +75,7 @@ class SegNextT(BaseSegmModel): """SegNextT Model.""" default_backbone_configuration: ClassVar[dict[str, Any]] = { - "act_cfg": {"type": "GELU"}, + "activation_callable": nn.GELU, "attention_kernel_paddings": [2, [0, 3], [0, 5], [0, 10]], "attention_kernel_sizes": [5, [1, 7], [1, 11], [1, 21]], "depths": [3, 3, 5, 2], diff --git a/tests/unit/algo/common/backbones/test_pytorchcv_backbones.py b/tests/unit/algo/common/backbones/test_pytorchcv_backbones.py index 8e61ae65a25..81176677d6e 100644 --- a/tests/unit/algo/common/backbones/test_pytorchcv_backbones.py +++ b/tests/unit/algo/common/backbones/test_pytorchcv_backbones.py @@ -30,14 +30,14 @@ def __init__(self): def test_replace_activation() -> None: - activation_cfg = {"type": "GELU"} + activation_callable = nn.GELU model = MockModule() - model = replace_activation(model, activation_cfg) + model = replace_activation(model, activation_callable) assert isinstance(model._modules["activ1"], nn.GELU) assert isinstance(model._modules["activ2"], nn.GELU) - activation_cfg = {"type": "torch_swish"} - model = replace_activation(model, activation_cfg) + activation_callable = nn.SiLU + model = replace_activation(model, activation_callable) assert isinstance(model._modules["activ1"], nn.SiLU) assert isinstance(model._modules["activ2"], nn.SiLU) diff --git a/tests/unit/algo/detection/backbones/test_csp_darknet.py b/tests/unit/algo/detection/backbones/test_csp_darknet.py index 3c24d83cd57..42650c2bc49 100644 --- a/tests/unit/algo/detection/backbones/test_csp_darknet.py +++ b/tests/unit/algo/detection/backbones/test_csp_darknet.py @@ -9,6 +9,7 @@ import pytest import torch from otx.algo.detection.backbones.csp_darknet import CSPDarknet, Focus +from torch import nn from torch.nn.modules import GroupNorm from torch.nn.modules.batchnorm import _BatchNorm @@ -108,7 +109,7 @@ def test_forward(self) -> None: assert feat[5].shape == torch.Size((1, 256, 2, 2)) # Test CSPDarknet forward with dict(type='ReLU') - model = CSPDarknet(widen_factor=0.125, act_cfg={"type": "ReLU"}, out_indices=range(5)) + model = CSPDarknet(widen_factor=0.125, activation_callable=nn.ReLU, 
out_indices=range(5)) model.train() imgs = torch.randn(1, 3, 64, 64) diff --git a/tests/unit/algo/detection/heads/test_rtmdet_head.py b/tests/unit/algo/detection/heads/test_rtmdet_head.py index 61e48622371..4d1d594c5be 100644 --- a/tests/unit/algo/detection/heads/test_rtmdet_head.py +++ b/tests/unit/algo/detection/heads/test_rtmdet_head.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 """Test of RTMDetHead.""" +from functools import partial + import pytest import torch from omegaconf import DictConfig @@ -11,6 +13,7 @@ from otx.algo.common.utils.prior_generators import MlvlPointGenerator from otx.algo.common.utils.samplers import PseudoSampler from otx.algo.detection.heads.rtmdet_head import RTMDetHead, RTMDetSepBNHead +from torch import nn @pytest.fixture() @@ -54,7 +57,7 @@ def rtmdet_head(self) -> RTMDetHead: with_objectness=False, pred_kernel_size=1, norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), train_cfg=train_cfg, test_cfg=test_cfg, ) @@ -166,7 +169,7 @@ def rtmdet_sep_bn_head(self) -> RTMDetSepBNHead: share_conv=True, pred_kernel_size=1, norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), train_cfg=train_cfg, test_cfg=test_cfg, ) diff --git a/tests/unit/algo/detection/layers/test_csp_layer.py b/tests/unit/algo/detection/layers/test_csp_layer.py index 5e3fe06bf0f..ef8a774b6a1 100644 --- a/tests/unit/algo/detection/layers/test_csp_layer.py +++ b/tests/unit/algo/detection/layers/test_csp_layer.py @@ -19,7 +19,7 @@ def test_init(self) -> None: assert isinstance(csp_layer.blocks[0].conv2, Conv2dModule) assert isinstance(csp_layer.blocks[0].conv1.conv, Conv2d) assert isinstance(csp_layer.blocks[0].conv1.bn, BatchNorm2d) - assert isinstance(csp_layer.blocks[0].conv1.activate, Swish) + assert isinstance(csp_layer.blocks[0].conv1.activation, Swish) assert not hasattr(csp_layer, "attention") # use DepthwiseSeparableConvModule diff --git a/tests/unit/algo/modules/test_activation.py b/tests/unit/algo/modules/test_activation.py index 45e0804bc02..848cd09ec02 100644 --- a/tests/unit/algo/modules/test_activation.py +++ b/tests/unit/algo/modules/test_activation.py @@ -4,7 +4,7 @@ # https://github.com/open-mmlab/mmcv/blob/main/tests/test_cnn/test_swish.py import torch -from otx.algo.modules.activation import Swish, build_activation_layer +from otx.algo.modules.activation import Swish from torch.nn import functional @@ -17,21 +17,3 @@ def test_swish(): assert output.shape == expected_output.shape # test output value assert torch.equal(output, expected_output) - - -def test_build_activation_layer(): - cfg = {"type": "PReLU"} - activation_layer = build_activation_layer(cfg=cfg) - assert isinstance(activation_layer, torch.nn.PReLU) - - cfg = {"type": "ReLU"} - activation_layer = build_activation_layer(cfg=cfg) - assert isinstance(activation_layer, torch.nn.ReLU) - - cfg = {"type": "LeakyReLU"} - activation_layer = build_activation_layer(cfg=cfg) - assert isinstance(activation_layer, torch.nn.LeakyReLU) - - cfg = {"type": "Swish"} - activation_layer = build_activation_layer(cfg=cfg) - assert isinstance(activation_layer, Swish) diff --git a/tests/unit/algo/modules/test_conv_module.py b/tests/unit/algo/modules/test_conv_module.py index be0f8e34463..8fadd8ace41 100644 --- a/tests/unit/algo/modules/test_conv_module.py +++ b/tests/unit/algo/modules/test_conv_module.py @@ -3,6 +3,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
# https://github.com/open-mmlab/mmcv/blob/main/tests/test_cnn/test_conv_module.py +from functools import partial + import pytest import torch from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule @@ -15,15 +17,20 @@ def test_conv_module(): # norm_cfg must be a dict or None Conv2dModule(3, 8, 2, norm_cfg=norm_cfg) - act_cfg = {"type": "softmax"} - with pytest.raises(KeyError): + activation_callable = nn.Softmax + with pytest.raises(ValueError, match="Unsupported activation"): + # softmax is not supported + Conv2dModule(3, 8, 2, activation_callable=activation_callable) + + activation_callable = partial(nn.Softmax) + with pytest.raises(ValueError, match="Unsupported activation"): # softmax is not supported - Conv2dModule(3, 8, 2, act_cfg=act_cfg) + Conv2dModule(3, 8, 2, activation_callable=activation_callable) # conv + norm + act conv = Conv2dModule(3, 8, 2, norm_cfg={"type": "BN"}) assert conv.with_activation - assert hasattr(conv, "activate") + assert isinstance(conv.activation, nn.Module) assert conv.with_norm assert hasattr(conv, "norm_layer") x = torch.rand(1, 3, 256, 256) @@ -33,7 +40,7 @@ def test_conv_module(): # conv + act conv = Conv2dModule(3, 8, 2) assert conv.with_activation - assert hasattr(conv, "activate") + assert isinstance(conv.activation, nn.Module) assert not conv.with_norm assert conv.norm_layer is None x = torch.rand(1, 3, 256, 256) @@ -41,11 +48,11 @@ def test_conv_module(): assert output.shape == (1, 8, 255, 255) # conv - conv = Conv2dModule(3, 8, 2, act_cfg=None) + conv = Conv2dModule(3, 8, 2, activation_callable=None) assert not conv.with_norm assert conv.norm_layer is None assert not conv.with_activation - assert not hasattr(conv, "activate") + assert conv.activation is None x = torch.rand(1, 3, 256, 256) output = conv(x) assert output.shape == (1, 8, 255, 255) @@ -65,26 +72,26 @@ def test_conv_module(): conv = Conv2dModule(3, 8, 3, padding=1, padding_mode="non_exists") # leaky relu - conv = Conv2dModule(3, 8, 3, padding=1, act_cfg={"type": "LeakyReLU"}) - assert isinstance(conv.activate, nn.LeakyReLU) + conv = Conv2dModule(3, 8, 3, padding=1, activation_callable=nn.LeakyReLU) + assert isinstance(conv.activation, nn.LeakyReLU) output = conv(x) assert output.shape == (1, 8, 256, 256) # tanh - conv = Conv2dModule(3, 8, 3, padding=1, act_cfg={"type": "Tanh"}) - assert isinstance(conv.activate, nn.Tanh) + conv = Conv2dModule(3, 8, 3, padding=1, activation_callable=nn.Tanh) + assert isinstance(conv.activation, nn.Tanh) output = conv(x) assert output.shape == (1, 8, 256, 256) # Sigmoid - conv = Conv2dModule(3, 8, 3, padding=1, act_cfg={"type": "Sigmoid"}) - assert isinstance(conv.activate, nn.Sigmoid) + conv = Conv2dModule(3, 8, 3, padding=1, activation_callable=nn.Sigmoid) + assert isinstance(conv.activation, nn.Sigmoid) output = conv(x) assert output.shape == (1, 8, 256, 256) # PReLU - conv = Conv2dModule(3, 8, 3, padding=1, act_cfg={"type": "PReLU"}) - assert isinstance(conv.activate, nn.PReLU) + conv = Conv2dModule(3, 8, 3, padding=1, activation_callable=nn.PReLU) + assert isinstance(conv.activation, nn.PReLU) output = conv(x) assert output.shape == (1, 8, 256, 256) @@ -132,8 +139,8 @@ def test_forward_with_default_config(self) -> None: assert conv.pointwise_conv.conv.kernel_size == (1, 1) assert not conv.depthwise_conv.with_norm assert not conv.pointwise_conv.with_norm - assert conv.depthwise_conv.activate.__class__.__name__ == "ReLU" - assert conv.pointwise_conv.activate.__class__.__name__ == "ReLU" + assert 
conv.depthwise_conv.activation.__class__.__name__ == "ReLU" + assert conv.pointwise_conv.activation.__class__.__name__ == "ReLU" x = torch.rand(1, 3, 256, 256) output = conv(x) assert output.shape == (1, 8, 255, 255) @@ -179,29 +186,29 @@ def test_forward_with_spectral_norm_padding_mode(self) -> None: output = conv(x) assert output.shape == (1, 8, 256, 256) - def test_forward_with_dw_act_cfg(self) -> None: - # test dw_act_cfg - conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, dw_act_cfg={"type": "LeakyReLU"}) + def test_forward_with_dw_activation_callable(self) -> None: + # test dw_activation_callable + conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, dw_activation_callable=nn.LeakyReLU) x = torch.rand(1, 3, 256, 256) - assert conv.depthwise_conv.activate.__class__.__name__ == "LeakyReLU" - assert conv.pointwise_conv.activate.__class__.__name__ == "ReLU" + assert conv.depthwise_conv.activation.__class__.__name__ == "LeakyReLU" + assert conv.pointwise_conv.activation.__class__.__name__ == "ReLU" output = conv(x) assert output.shape == (1, 8, 256, 256) - def test_forward_with_pw_act_cfg(self) -> None: - # test pw_act_cfg - conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, pw_act_cfg={"type": "LeakyReLU"}) + def test_forward_with_pw_activation_callable(self) -> None: + # test pw_activation_callable + conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, pw_activation_callable=nn.LeakyReLU) x = torch.rand(1, 3, 256, 256) - assert conv.depthwise_conv.activate.__class__.__name__ == "ReLU" - assert conv.pointwise_conv.activate.__class__.__name__ == "LeakyReLU" + assert conv.depthwise_conv.activation.__class__.__name__ == "ReLU" + assert conv.pointwise_conv.activation.__class__.__name__ == "LeakyReLU" output = conv(x) assert output.shape == (1, 8, 256, 256) - def test_forward_with_act_cfg(self) -> None: - # test act_cfg - conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, act_cfg={"type": "LeakyReLU"}) + def test_forward_with_activation_callable(self) -> None: + # test activation_callable + conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, activation_callable=nn.LeakyReLU) x = torch.rand(1, 3, 256, 256) - assert conv.depthwise_conv.activate.__class__.__name__ == "LeakyReLU" - assert conv.pointwise_conv.activate.__class__.__name__ == "LeakyReLU" + assert conv.depthwise_conv.activation.__class__.__name__ == "LeakyReLU" + assert conv.pointwise_conv.activation.__class__.__name__ == "LeakyReLU" output = conv(x) assert output.shape == (1, 8, 256, 256)
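
For the depthwise-separable variant, the tests above already cover the per-branch activation arguments; the sketch below only restates those call patterns in one place (the input shape and channel counts are arbitrary, and the expected output shape follows from the padded 3x3 depthwise plus 1x1 pointwise convolutions).

import torch
from torch import nn

from otx.algo.modules.conv_module import DepthwiseSeparableConvModule

# Separate activations for the depthwise and pointwise convolutions, as in
# test_forward_with_dw_activation_callable / test_forward_with_pw_activation_callable.
conv = DepthwiseSeparableConvModule(
    3,
    8,
    3,
    padding=1,
    dw_activation_callable=nn.LeakyReLU,
    pw_activation_callable=nn.ReLU,
)

x = torch.rand(1, 3, 64, 64)
out = conv(x)
assert out.shape == (1, 8, 64, 64)

# A single `activation_callable` still applies to both convolutions, and
# unsupported activations are rejected with a ValueError, as the
# "Unsupported activation" checks at the top of test_conv_module verify.
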