From 1a8e10e078c8fd99cacbdc4b2e2fa9051521fb16 Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Mon, 12 Aug 2024 10:10:33 +0900 Subject: [PATCH] Refactoring `ConvModule` by removing `act_cfg` (#3809) * Remove `act_cfg` and reflect this change to all modules * Remove `build_activation_layer` * Enable to update `inplace` * pre-commit * Update CHANGELOG * Fix unit test * pre-commit * Fix unit test * Update keypoint detection part * Fix default --------- Co-authored-by: Prokofiev Kirill --- CHANGELOG.md | 2 + .../action_classification/backbones/x3d.py | 47 ++--- src/otx/algo/action_classification/x3d.py | 3 +- .../classification/backbones/efficientnet.py | 116 +++++------ .../heads/multilabel_cls_head.py | 5 +- .../heads/vision_transformer_head.py | 2 - src/otx/algo/common/backbones/cspnext.py | 21 +- .../common/backbones/pytorchcv_backbones.py | 19 +- src/otx/algo/common/layers/spp_layer.py | 14 +- .../algo/detection/backbones/csp_darknet.py | 27 ++- src/otx/algo/detection/backbones/presnet.py | 94 ++++++--- .../algo/detection/heads/rtdetr_decoder.py | 34 ++-- src/otx/algo/detection/heads/rtmdet_head.py | 27 +-- src/otx/algo/detection/heads/yolox_head.py | 18 +- src/otx/algo/detection/layers/csp_layer.py | 99 +++++---- src/otx/algo/detection/necks/cspnext_pafpn.py | 25 ++- src/otx/algo/detection/necks/fpn.py | 14 +- .../algo/detection/necks/hybrid_encoder.py | 42 ++-- src/otx/algo/detection/necks/yolox_pafpn.py | 26 ++- src/otx/algo/detection/rtmdet.py | 10 +- .../instance_segmentation/backbones/swin.py | 28 ++- .../heads/rtmdet_ins_head.py | 54 ++--- .../algo/instance_segmentation/maskrcnn.py | 3 +- .../algo/instance_segmentation/necks/fpn.py | 12 +- .../algo/instance_segmentation/rtmdet_inst.py | 9 +- src/otx/algo/keypoint_detection/rtmpose.py | 4 +- src/otx/algo/modules/__init__.py | 5 +- src/otx/algo/modules/activation.py | 52 +---- src/otx/algo/modules/conv_module.py | 133 ++++++++---- src/otx/algo/modules/transformer.py | 11 +- .../algo/segmentation/backbones/litehrnet.py | 190 +++++++++--------- src/otx/algo/segmentation/backbones/mscan.py | 162 +++++++-------- .../algo/segmentation/heads/base_segm_head.py | 43 ++-- src/otx/algo/segmentation/heads/fcn_head.py | 10 +- src/otx/algo/segmentation/heads/ham_head.py | 8 +- .../algo/segmentation/modules/aggregators.py | 10 +- src/otx/algo/segmentation/modules/blocks.py | 12 +- src/otx/algo/segmentation/segnext.py | 14 +- .../backbones/test_pytorchcv_backbones.py | 8 +- .../detection/backbones/test_csp_darknet.py | 3 +- .../algo/detection/heads/test_rtmdet_head.py | 7 +- .../algo/detection/layers/test_csp_layer.py | 2 +- tests/unit/algo/modules/test_activation.py | 20 +- tests/unit/algo/modules/test_conv_module.py | 71 ++++--- 44 files changed, 805 insertions(+), 711 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 67c87e31181..63b53d13b83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,8 @@ All notable changes to this project will be documented in this file. 
() - Enable to use polygon and bitmap mask as prompt inputs for zero-shot learning () +- Refactoring `ConvModule` by removing `conv_cfg` and `act_cfg` + (, ) ### Bug fixes diff --git a/src/otx/algo/action_classification/backbones/x3d.py b/src/otx/algo/action_classification/backbones/x3d.py index 7deef62a9f6..7660ae49569 100644 --- a/src/otx/algo/action_classification/backbones/x3d.py +++ b/src/otx/algo/action_classification/backbones/x3d.py @@ -7,12 +7,13 @@ from __future__ import annotations import math +from typing import Callable import torch.utils.checkpoint as cp from torch import Tensor, nn from torch.nn.modules.batchnorm import _BatchNorm -from otx.algo.modules.activation import Swish, build_activation_layer +from otx.algo.modules.activation import Swish from otx.algo.modules.conv_module import Conv3dModule from otx.algo.utils.mmengine_utils import load_checkpoint from otx.algo.utils.weight_init import constant_init, kaiming_init @@ -73,8 +74,8 @@ class BlockX3D(nn.Module): before and after the 3x3x3 conv. Default: True. norm_cfg (dict): Config for norm layers. required keys are ``type``, Default: ``dict(type='BN3d')``. - act_cfg (dict): Config dict for activation layer. - Default: ``dict(type='ReLU')``. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `nn.ReLU`. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. """ @@ -89,7 +90,7 @@ def __init__( se_ratio: float | None = None, use_swish: bool = True, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, with_cp: bool = False, ): super().__init__() @@ -102,8 +103,7 @@ def __init__( self.se_ratio = se_ratio self.use_swish = use_swish self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.act_cfg_swish = Swish() + self.activation_callable = activation_callable self.with_cp = with_cp self.conv1 = Conv3dModule( @@ -114,7 +114,7 @@ def __init__( padding=0, bias=False, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ) # Here we use the channel-wise conv self.conv2 = Conv3dModule( @@ -126,7 +126,7 @@ def __init__( groups=planes, bias=False, norm_cfg=self.norm_cfg, - act_cfg=None, + activation_callable=None, ) self.swish = Swish() @@ -139,13 +139,13 @@ def __init__( padding=0, bias=False, norm_cfg=self.norm_cfg, - act_cfg=None, + activation_callable=None, ) if self.se_ratio is not None: self.se_module = SEModule(planes, self.se_ratio) - self.relu = build_activation_layer(self.act_cfg) if self.act_cfg else build_activation_layer({}) + self.relu = self.activation_callable() if self.activation_callable else nn.ReLU(inplace=True) def forward(self, x: Tensor) -> Tensor: """Defines the computation performed at every call.""" @@ -198,8 +198,8 @@ class X3DBackbone(nn.Module): norm_cfg (dict): Config for norm layers. required keys are ``type`` and ``requires_grad``. Default: ``dict(type='BN3d', requires_grad=True)``. - act_cfg (dict): Config dict for activation layer. - Default: ``dict(type='ReLU', inplace=True)``. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `nn.ReLU`. norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze running stats (mean and var). Default: False. with_cp (bool): Use checkpoint or not. 
Using checkpoint will save some @@ -224,7 +224,7 @@ def __init__( se_ratio: float = 1 / 16, use_swish: bool = True, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, norm_eval: bool = False, with_cp: bool = False, zero_init_residual: bool = True, @@ -267,7 +267,7 @@ def __init__( self.use_swish = use_swish self.norm_cfg = norm_cfg - self.act_cfg = act_cfg + self.activation_callable = activation_callable self.norm_eval = norm_eval self.with_cp = with_cp self.zero_init_residual = zero_init_residual @@ -294,7 +294,7 @@ def __init__( se_ratio=self.se_ratio, use_swish=self.use_swish, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, with_cp=with_cp, **kwargs, ) @@ -312,7 +312,7 @@ def __init__( padding=0, bias=False, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ) self.feat_dim = int(self.feat_dim * self.gamma_b) @@ -350,7 +350,7 @@ def make_res_layer( se_ratio: float | None = None, use_swish: bool = True, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, with_cp: bool = False, **kwargs, ) -> nn.Module: @@ -376,7 +376,8 @@ def make_res_layer( use_swish (bool): Whether to use swish as the activation function before and after the 3x3x3 conv. Default: True. norm_cfg (dict | None): Config for norm layers. Default: None. - act_cfg (dict | None): Config for activate layers. Default: None. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `nn.ReLU`. with_cp (bool | None): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. 
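To make the new contract concrete: the old `act_cfg` dict (e.g. `{"type": "ReLU", "inplace": True}`) is replaced by any callable that returns an `nn.Module`. A minimal sketch, not part of the patch (the helper name `_build_act` is illustrative), of the instantiation pattern `BlockX3D` above now uses — call the callable if one is given, otherwise fall back — with `functools.partial` carrying kwargs such as `inplace` that previously lived in the config dict:

```python
from __future__ import annotations

from functools import partial
from typing import Callable

from torch import nn


def _build_act(activation_callable: Callable[..., nn.Module] | None) -> nn.Module:
    # Same pattern as BlockX3D above: instantiate the callable if given,
    # otherwise fall back to a default ReLU.
    return activation_callable() if activation_callable else nn.ReLU(inplace=True)


_build_act(nn.ReLU)                         # ReLU()
_build_act(partial(nn.ReLU, inplace=True))  # ReLU(inplace=True), replacing act_cfg["inplace"]
_build_act(None)                            # falls back to ReLU(inplace=True)
```
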
@@ -394,7 +395,7 @@ def make_res_layer( padding=0, bias=False, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ) use_se = [False] * blocks @@ -416,7 +417,7 @@ def make_res_layer( se_ratio=se_ratio if use_se[0] else None, use_swish=use_swish, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, with_cp=with_cp, **kwargs, ), @@ -432,7 +433,7 @@ def make_res_layer( se_ratio=se_ratio if use_se[i] else None, use_swish=use_swish, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, with_cp=with_cp, **kwargs, ), @@ -450,7 +451,7 @@ def _make_stem_layer(self) -> None: padding=(0, 1, 1), bias=False, norm_cfg=None, - act_cfg=None, + activation_callable=None, ) self.conv1_t = Conv3dModule( self.base_channels, @@ -461,7 +462,7 @@ def _make_stem_layer(self) -> None: groups=self.base_channels, bias=False, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ) def _freeze_stages(self) -> None: diff --git a/src/otx/algo/action_classification/x3d.py b/src/otx/algo/action_classification/x3d.py index 7f503dadfd4..b8931dc9e1a 100644 --- a/src/otx/algo/action_classification/x3d.py +++ b/src/otx/algo/action_classification/x3d.py @@ -5,6 +5,7 @@ from __future__ import annotations +from functools import partial from typing import TYPE_CHECKING from torch import nn @@ -65,7 +66,7 @@ def _build_model(self, num_classes: int) -> nn.Module: gamma_d=2.2, gamma_w=1, norm_cfg={"type": "BN3d", "requires_grad": True}, - act_cfg={"type": "ReLU", "inplace": True}, + activation_callable=partial(nn.ReLU, inplace=True), ), cls_head=X3DHead( num_classes=num_classes, diff --git a/src/otx/algo/classification/backbones/efficientnet.py b/src/otx/algo/classification/backbones/efficientnet.py index a7081728590..6114a28f60d 100644 --- a/src/otx/algo/classification/backbones/efficientnet.py +++ b/src/otx/algo/classification/backbones/efficientnet.py @@ -7,14 +7,14 @@ import math from pathlib import Path -from typing import Literal +from typing import Callable, Literal import torch from pytorchcv.models.model_store import download_model from torch import nn from torch.nn import functional, init -from otx.algo.modules.activation import build_activation_layer +from otx.algo.modules.activation import Swish from otx.algo.modules.conv_module import Conv2dModule from otx.algo.utils.mmengine_utils import load_checkpoint_to_model @@ -33,7 +33,7 @@ def conv1x1_block( bias: bool = False, use_bn: bool = True, bn_eps: float = 1e-5, - activation: str | None = "ReLU", + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, ) -> Conv2dModule: """Conv block.""" return Conv2dModule( @@ -45,7 +45,7 @@ def conv1x1_block( groups=groups, bias=bias, norm_cfg=({"type": "BN", "eps": bn_eps} if use_bn else None), - act_cfg=({"type": activation} if activation else None), + activation_callable=activation_callable, ) @@ -59,7 +59,7 @@ def conv3x3_block( bias: bool = False, use_bn: bool = True, bn_eps: float = 1e-5, - activation: str | None = "ReLU", + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, ) -> Conv2dModule: """Conv block.""" return Conv2dModule( @@ -72,7 +72,7 @@ def conv3x3_block( groups=groups, bias=bias, norm_cfg=({"type": "BN", "eps": bn_eps} if use_bn else None), - act_cfg=({"type": activation} if activation else None), + activation_callable=activation_callable, ) @@ -85,7 +85,7 @@ def dwconv3x3_block( bias: bool = False, use_bn: bool = True, bn_eps: float = 1e-5, - activation: str | None = "ReLU", + 
activation_callable: Callable[..., nn.Module] | None = nn.ReLU, ) -> Conv2dModule: """Conv block.""" return Conv2dModule( @@ -98,7 +98,7 @@ def dwconv3x3_block( groups=out_channels, bias=bias, norm_cfg=({"type": "BN", "eps": bn_eps} if use_bn else None), - act_cfg=({"type": activation} if activation else None), + activation_callable=activation_callable, ) @@ -111,7 +111,7 @@ def dwconv5x5_block( bias: bool = False, use_bn: bool = True, bn_eps: float = 1e-5, - activation: str | None = "ReLU", + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, ) -> Conv2dModule: """Conv block.""" return Conv2dModule( @@ -124,7 +124,7 @@ def dwconv5x5_block( groups=out_channels, bias=bias, norm_cfg=({"type": "BN", "eps": bn_eps} if use_bn else None), - act_cfg=({"type": activation} if activation else None), + activation_callable=activation_callable, ) @@ -164,13 +164,15 @@ class SEBlock(nn.Module): https://arxiv.org/abs/1709.01507. Args: - channels : int. Number of channels. - reduction : int, default 16. Squeeze reduction value. - mid_channels : int or None, default None. Number of middle channels. - round_mid : bool, default False. Whether to round middle channel number (make divisible by 8). - use_conv : bool, default True. Whether to convolutional layers instead of fully-connected ones. - activation : function, or str, or nn.Module, default 'relu'. Activation function after the first convolution. - out_activation : function, or str, or nn.Module, Activation function after the last convolution. + channels (int): Number of channels. + reduction (int): Squeeze reduction value. Default to 16. + mid_channels (int | None): Number of middle channels. Defaults to None. + round_mid (bool): Whether to round middle channel number (make divisible by 8). Defaults to False. + use_conv (bool): Whether to convolutional layers instead of fully-connected ones. Defaults to True. + mid_activation_callable (Callable[..., nn.Module]): Activation layer module after the first convolution. + Defaults to `nn.ReLU`. + out_activation_callable (Callable[..., nn.Module]): Activation layer module after the last convolution. + Defaults to `nn.Sigmoid`. """ def __init__( @@ -180,8 +182,8 @@ def __init__( mid_channels: int | None = None, round_mid: bool = False, use_conv: bool = True, - mid_activation: str | None = "ReLU", - out_activation: str | None = "Sigmoid", + mid_activation_callable: Callable[..., nn.Module] = nn.ReLU, + out_activation_callable: Callable[..., nn.Module] = nn.Sigmoid, ): super().__init__() self.use_conv = use_conv @@ -200,7 +202,7 @@ def __init__( ) else: self.fc1 = nn.Linear(in_features=channels, out_features=mid_channels) - self.activ = build_activation_layer({"type": mid_activation}) + self.activ = mid_activation_callable() if use_conv: self.conv2 = nn.Conv2d( in_channels=mid_channels, @@ -212,7 +214,7 @@ def __init__( ) else: self.fc2 = nn.Linear(in_features=mid_channels, out_features=channels) - self.sigmoid = build_activation_layer({"type": out_activation}) + self.sigmoid = out_activation_callable() def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward.""" @@ -232,12 +234,12 @@ class EffiDwsConvUnit(nn.Module): """EfficientNet specific depthwise separable conv block/unit with BatchNorms and activations at each conv. Args: - in_channels : int. Number of input channels. - out_channels : int. Number of output channels. - stride : int or tuple/list of 2 int. Strides of the second convolution layer. - bn_eps : float. Small float added to variance in Batch norm. - activation : str. 
Name of activation function. - tf_mode : bool. Whether to use TF-like mode. + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + stride (int | tuple[int, int]): Strides of the second convolution layer. + bn_eps (float): Small float added to variance in Batch norm. + activation_callable (Callable[..., nn.Module]): Activation layer module. + tf_mode (bool): Whether to use TF-like mode. """ def __init__( @@ -246,7 +248,7 @@ def __init__( out_channels: int, stride: int | tuple[int, int], bn_eps: float, - activation: str, + activation_callable: Callable[..., nn.Module], tf_mode: bool, ): super().__init__() @@ -258,14 +260,14 @@ def __init__( out_channels=in_channels, padding=(0 if tf_mode else 1), bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, ) - self.se = SEBlock(channels=in_channels, reduction=4, mid_activation=activation) + self.se = SEBlock(channels=in_channels, reduction=4, mid_activation_callable=activation_callable) self.pw_conv = conv1x1_block( in_channels=in_channels, out_channels=out_channels, bn_eps=bn_eps, - activation=None, + activation_callable=None, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -286,15 +288,15 @@ class EffiInvResUnit(nn.Module): """EfficientNet inverted residual unit. Args: - in_channels : int. Number of input channels. - out_channels : int. Number of output channels. - kernel_size : int or tuple/list of 2 int. Convolution window size. - stride : int or tuple/list of 2 int. Strides of the second convolution layer. - exp_factor : int. Factor for expansion of channels. - se_factor : int. SE reduction factor for each unit. - bn_eps : float. Small float added to variance in Batch norm. - activation : str. Name of activation function. - tf_mode : bool. Whether to use TF-like mode. + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int | tuple[int, int]): Convolution window size. + stride (int | tuple[int, int]): Strides of the second convolution layer. + exp_factor (int): Factor for expansion of channels. + se_factor (int): SE reduction factor for each unit. + bn_eps (float): Small float added to variance in Batch norm. + activation_callable (Callable[..., nn.Module]): Name of activation function. + tf_mode (bool): Whether to use TF-like mode. """ def __init__( @@ -306,7 +308,7 @@ def __init__( exp_factor: int, se_factor: int, bn_eps: float, - activation: str | None, + activation_callable: Callable[..., nn.Module], tf_mode: bool, ): super().__init__() @@ -322,7 +324,7 @@ def __init__( in_channels=in_channels, out_channels=mid_channels, bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, ) self.conv2 = dwconv_block_fn( in_channels=mid_channels, @@ -330,19 +332,19 @@ def __init__( stride=stride, padding=(0 if tf_mode else kernel_size // 2), bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, ) if self.use_se: self.se = SEBlock( channels=mid_channels, reduction=(exp_factor * se_factor), - mid_activation=activation, + mid_activation_callable=activation_callable, ) self.conv3 = conv1x1_block( in_channels=mid_channels, out_channels=out_channels, bn_eps=bn_eps, - activation=None, + activation_callable=None, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -368,11 +370,11 @@ class EffiInitBlock(nn.Module): """EfficientNet specific initial block. Args: - in_channels : int. Number of input channels. - out_channels : int. Number of output channels. 
- bn_eps : float. Small float added to variance in Batch norm. - activation : str. Name of activation function. - tf_mode : bool. Whether to use TF-like mode. + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + bn_eps (float): Small float added to variance in Batch norm. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + tf_mode (bool): Whether to use TF-like mode. """ def __init__( @@ -380,7 +382,7 @@ def __init__( in_channels: int, out_channels: int, bn_eps: float, - activation: str | None, + activation_callable: Callable[..., nn.Module] | None, tf_mode: bool, ): super().__init__() @@ -392,7 +394,7 @@ def __init__( stride=2, padding=(0 if tf_mode else 1), bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -453,7 +455,7 @@ def __init__( self.bn_frozen = bn_frozen self.pooling_type = pooling_type self.num_features = self.num_head_features = final_block_channels - activation = "Swish" + activation_callable = Swish self.features = nn.Sequential() self.features.add_module( "init_block", @@ -461,7 +463,7 @@ def __init__( in_channels=in_channels, out_channels=init_block_channels, bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, tf_mode=tf_mode, ), ) @@ -482,7 +484,7 @@ def __init__( out_channels=out_channels, stride=stride, bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, tf_mode=tf_mode, ), ) @@ -497,7 +499,7 @@ def __init__( exp_factor=expansion_factor, se_factor=4, bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, tf_mode=tf_mode, ), ) @@ -510,7 +512,7 @@ def __init__( in_channels=in_channels, out_channels=final_block_channels, bn_eps=bn_eps, - activation=activation, + activation_callable=activation_callable, ), ) self._init_params() diff --git a/src/otx/algo/classification/heads/multilabel_cls_head.py b/src/otx/algo/classification/heads/multilabel_cls_head.py index 731c247ba5b..2df5523d988 100644 --- a/src/otx/algo/classification/heads/multilabel_cls_head.py +++ b/src/otx/algo/classification/heads/multilabel_cls_head.py @@ -241,7 +241,8 @@ class MultiLabelNonLinearClsHead(MultiLabelClsHead): num_classes (int): Number of categories. in_channels (int): Number of channels in the input feature map. hid_channels (int): Number of channels in the hidden feature map. - act_cfg (dict | optional): The configuration of the activation function. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to nn.ReLU. scale (float): Positive scale parameter. loss (dict): Config of classification loss. dropout (bool): Whether use the dropout or not. @@ -254,7 +255,7 @@ def __init__( in_channels: int, loss: nn.Module, hid_channels: int = 1280, - activation_callable: Callable[[], nn.Module] = nn.ReLU, + activation_callable: Callable[..., nn.Module] = nn.ReLU, scale: float = 1.0, dropout: bool = False, normalized: bool = False, diff --git a/src/otx/algo/classification/heads/vision_transformer_head.py b/src/otx/algo/classification/heads/vision_transformer_head.py index 849913a2dce..a4b9950b260 100644 --- a/src/otx/algo/classification/heads/vision_transformer_head.py +++ b/src/otx/algo/classification/heads/vision_transformer_head.py @@ -26,8 +26,6 @@ class VisionTransformerClsHead(BaseModule): in_channels (int): Number of channels in the input feature map. 
hidden_dim (int, optional): Number of the dimensions for hidden layer. Defaults to None, which means no extra hidden layer. - act_cfg (dict): The activation config. Only available during - pre-training. Defaults to ``dict(type='Tanh')``. init_cfg (dict): The extra initialization configs. Defaults to ``dict(type='Constant', layer='Linear', val=0)``. """ diff --git a/src/otx/algo/common/backbones/cspnext.py b/src/otx/algo/common/backbones/cspnext.py index 76bcc56b74b..dafe910946c 100644 --- a/src/otx/algo/common/backbones/cspnext.py +++ b/src/otx/algo/common/backbones/cspnext.py @@ -9,7 +9,7 @@ from __future__ import annotations import math -from typing import ClassVar +from typing import Callable, ClassVar from otx.algo.common.layers import SPPBottleneck from otx.algo.detection.layers import CSPLayer @@ -45,8 +45,8 @@ class CSPNeXt(BaseModule): stage. Defaults to True. norm_cfg (dict): Dictionary to construct and config norm layer. Defaults to dict(type='BN', requires_grad=True). - act_cfg (dict): Config dict for activation layer. - Defaults to dict(type='SiLU'). + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `nn.SiLU`. norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. @@ -84,7 +84,7 @@ def __init__( spp_kernel_sizes: tuple[int, int, int] = (5, 9, 13), channel_attention: bool = True, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = nn.SiLU, norm_eval: bool = False, init_cfg: dict | None = None, ) -> None: @@ -99,7 +99,6 @@ def __init__( super().__init__(init_cfg=init_cfg) norm_cfg = norm_cfg or {"type": "BN", "momentum": 0.03, "eps": 0.001} - act_cfg = act_cfg or {"type": "SiLU"} arch_setting = self.arch_settings[arch] if arch_ovewrite: @@ -126,7 +125,7 @@ def __init__( padding=1, stride=2, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), Conv2dModule( int(arch_setting[0][0] * widen_factor // 2), @@ -135,7 +134,7 @@ def __init__( padding=1, stride=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), Conv2dModule( int(arch_setting[0][0] * widen_factor // 2), @@ -144,7 +143,7 @@ def __init__( padding=1, stride=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) self.layers = ["stem"] @@ -161,7 +160,7 @@ def __init__( stride=2, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) stage.append(conv_layer) if use_spp: @@ -170,7 +169,7 @@ def __init__( out_channels, kernel_sizes=spp_kernel_sizes, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) stage.append(spp) csp_layer = CSPLayer( @@ -183,7 +182,7 @@ def __init__( expand_ratio=expand_ratio, channel_attention=channel_attention, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) stage.append(csp_layer) self.add_module(f"stage{i + 1}", nn.Sequential(*stage)) diff --git a/src/otx/algo/common/backbones/pytorchcv_backbones.py b/src/otx/algo/common/backbones/pytorchcv_backbones.py index 8878998a86d..3ed30d8e73d 100644 --- a/src/otx/algo/common/backbones/pytorchcv_backbones.py +++ b/src/otx/algo/common/backbones/pytorchcv_backbones.py @@ -6,9 +6,9 @@ from __future__ import annotations from pathlib import Path +from typing import Callable import torch -from otx.algo.modules.activation import build_activation_layer from 
otx.algo.modules.norm import build_norm_layer from otx.algo.utils.mmengine_utils import get_dist_info from pytorchcv.model_provider import _models @@ -19,16 +19,13 @@ # ruff: noqa: SLF001 -def replace_activation(model: nn.Module, activation_cfg: dict) -> nn.Module: - """Replace activate funtion.""" +def replace_activation(model: nn.Module, activation_callable: Callable[..., nn.Module]) -> nn.Module: + """Replace activation funtion.""" for name, module in model._modules.items(): if len(list(module.children())) > 0: - model._modules[name] = replace_activation(module, activation_cfg) + model._modules[name] = replace_activation(module, activation_callable) if "activ" in name: - if activation_cfg["type"] == "torch_swish": - model._modules[name] = nn.SiLU() - else: - model._modules[name] = build_activation_layer(activation_cfg) + model._modules[name] = activation_callable() return model @@ -122,7 +119,7 @@ def _build_pytorchcv_model( frozen_stages: int = 0, norm_eval: bool = False, verbose: bool = False, - activation_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = None, norm_cfg: dict | None = None, **kwargs, ) -> nn.Module: @@ -133,8 +130,8 @@ def _build_pytorchcv_model( f"Init model {type}, pretrained={is_pretrained}, models cache {models_cache_root}", ) model = _models[type](**kwargs) - if activation_cfg: - model = replace_activation(model, activation_cfg) + if activation_callable: + model = replace_activation(model, activation_callable) if norm_cfg: model = replace_norm(model, norm_cfg) model.out_indices = out_indices diff --git a/src/otx/algo/common/layers/spp_layer.py b/src/otx/algo/common/layers/spp_layer.py index d314bacea9d..29027dca6bd 100644 --- a/src/otx/algo/common/layers/spp_layer.py +++ b/src/otx/algo/common/layers/spp_layer.py @@ -8,7 +8,10 @@ from __future__ import annotations +from typing import Callable + import torch +from otx.algo.modules.activation import Swish from otx.algo.modules.base_module import BaseModule from otx.algo.modules.conv_module import Conv2dModule from torch import Tensor, nn @@ -24,8 +27,8 @@ class SPPBottleneck(BaseModule): layers. Default: (5, 9, 13). norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='Swish'). + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `Swish`. init_cfg (dict, list[dict], optional): Initialization config dict. Default: None. """ @@ -36,12 +39,11 @@ def __init__( out_channels: int, kernel_sizes: tuple[int, ...] 
= (5, 9, 13), norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = Swish, init_cfg: dict | list[dict] | None = None, ): super().__init__(init_cfg=init_cfg) norm_cfg = norm_cfg or {"type": "BN", "momentum": 0.03, "eps": 0.001} - act_cfg = act_cfg or {"type": "Swish"} mid_channels = in_channels // 2 self.conv1 = Conv2dModule( @@ -50,7 +52,7 @@ def __init__( 1, stride=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.poolings = nn.ModuleList([nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) for ks in kernel_sizes]) conv2_channels = mid_channels * (len(kernel_sizes) + 1) @@ -59,7 +61,7 @@ def __init__( out_channels, 1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) def forward(self, x: Tensor) -> Tensor: diff --git a/src/otx/algo/detection/backbones/csp_darknet.py b/src/otx/algo/detection/backbones/csp_darknet.py index 6e92b995b06..9df915c4dd2 100644 --- a/src/otx/algo/detection/backbones/csp_darknet.py +++ b/src/otx/algo/detection/backbones/csp_darknet.py @@ -9,7 +9,7 @@ from __future__ import annotations import math -from typing import Any, ClassVar, Sequence +from typing import Any, Callable, ClassVar, Sequence import torch from torch import Tensor, nn @@ -17,6 +17,7 @@ from otx.algo.common.layers import SPPBottleneck from otx.algo.detection.layers import CSPLayer +from otx.algo.modules.activation import Swish from otx.algo.modules.base_module import BaseModule from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule @@ -31,8 +32,8 @@ class Focus(nn.Module): stride (int): The stride of the convolution. Default: 1 norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN', momentum=0.03, eps=0.001). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='Swish'). + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `Swish`. """ def __init__( @@ -42,11 +43,10 @@ def __init__( kernel_size: int = 1, stride: int = 1, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = Swish, ): super().__init__() norm_cfg = norm_cfg or {"type": "BN", "momentum": 0.03, "eps": 0.001} - act_cfg = act_cfg or {"type": "Swish"} self.conv = Conv2dModule( in_channels * 4, out_channels, @@ -54,7 +54,7 @@ def __init__( stride, padding=(kernel_size - 1) // 2, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) def forward(self, x: Tensor) -> Tensor: @@ -110,8 +110,8 @@ class CSPDarknet(BaseModule): layers. Default: (5, 9, 13). norm_cfg (dict): Dictionary to construct and config norm layer. Default: dict(type='BN', requires_grad=True). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='LeakyReLU', negative_slope=0.1). + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to ``Swish``. norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. @@ -148,7 +148,7 @@ def __init__( arch_ovewrite: list | None = None, spp_kernal_sizes: tuple[int, ...] 
= (5, 9, 13), norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = Swish, norm_eval: bool = False, init_cfg: dict | list[dict] | None = None, ): @@ -162,7 +162,6 @@ def __init__( } super().__init__(init_cfg=init_cfg) norm_cfg = norm_cfg or {"type": "BN", "momentum": 0.03, "eps": 0.001} - act_cfg = act_cfg or {"type": "Swish"} arch_setting = self.arch_settings[arch] if arch_ovewrite: @@ -183,7 +182,7 @@ def __init__( int(arch_setting[0][0] * widen_factor), kernel_size=3, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.layers = ["stem"] @@ -199,7 +198,7 @@ def __init__( stride=2, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) stage.append(conv_layer) if use_spp: @@ -208,7 +207,7 @@ def __init__( out_channels, kernel_sizes=spp_kernal_sizes, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) stage.append(spp) csp_layer = CSPLayer( @@ -218,7 +217,7 @@ def __init__( add_identity=add_identity, use_depthwise=use_depthwise, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) stage.append(csp_layer) self.add_module(f"stage{i + 1}", nn.Sequential(*stage)) diff --git a/src/otx/algo/detection/backbones/presnet.py b/src/otx/algo/detection/backbones/presnet.py index b31f7f95c3a..bd557e3561d 100644 --- a/src/otx/algo/detection/backbones/presnet.py +++ b/src/otx/algo/detection/backbones/presnet.py @@ -6,12 +6,11 @@ from __future__ import annotations from collections import OrderedDict -from typing import Any, ClassVar +from typing import Any, Callable, ClassVar import torch from torch import nn -from otx.algo.modules import build_activation_layer from otx.algo.modules.base_module import BaseModule from otx.algo.modules.conv_module import Conv2dModule @@ -29,7 +28,7 @@ def __init__( ch_out: int, stride: int, shortcut: bool, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = None, variant: str = "b", norm_cfg: dict[str, str] | None = None, ) -> None: @@ -43,16 +42,24 @@ def __init__( OrderedDict( [ ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)), - ("conv", Conv2dModule(ch_in, ch_out, 1, 1, act_cfg=None, norm_cfg=norm_cfg)), + ("conv", Conv2dModule(ch_in, ch_out, 1, 1, activation_callable=None, norm_cfg=norm_cfg)), ], ), ) else: - self.short = Conv2dModule(ch_in, ch_out, 1, stride, act_cfg=None, norm_cfg=norm_cfg) - - self.branch2a = Conv2dModule(ch_in, ch_out, 3, stride, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg) - self.branch2b = Conv2dModule(ch_out, ch_out, 3, 1, padding=1, act_cfg=None, norm_cfg=norm_cfg) - self.act = nn.Identity() if act_cfg is None else build_activation_layer(act_cfg) + self.short = Conv2dModule(ch_in, ch_out, 1, stride, activation_callable=None, norm_cfg=norm_cfg) + + self.branch2a = Conv2dModule( + ch_in, + ch_out, + 3, + stride, + padding=1, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ) + self.branch2b = Conv2dModule(ch_out, ch_out, 3, 1, padding=1, activation_callable=None, norm_cfg=norm_cfg) + self.act = activation_callable() if activation_callable else nn.Identity() def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward.""" @@ -76,7 +83,7 @@ def __init__( ch_out: int, stride: int, shortcut: bool, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = None, variant: str = "b", norm_cfg: dict[str, str] | None = None, ) -> None: @@ -89,9 +96,24 @@ def 
__init__( width = ch_out - self.branch2a = Conv2dModule(ch_in, width, 1, stride1, act_cfg=act_cfg, norm_cfg=norm_cfg) - self.branch2b = Conv2dModule(width, width, 3, stride2, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg) - self.branch2c = Conv2dModule(width, ch_out * self.expansion, 1, 1, act_cfg=None, norm_cfg=norm_cfg) + self.branch2a = Conv2dModule( + ch_in, + width, + 1, + stride1, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ) + self.branch2b = Conv2dModule( + width, + width, + 3, + stride2, + padding=1, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ) + self.branch2c = Conv2dModule(width, ch_out * self.expansion, 1, 1, activation_callable=None, norm_cfg=norm_cfg) self.shortcut = shortcut if not shortcut: @@ -102,15 +124,29 @@ def __init__( ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)), ( "conv", - Conv2dModule(ch_in, ch_out * self.expansion, 1, 1, act_cfg=None, norm_cfg=norm_cfg), + Conv2dModule( + ch_in, + ch_out * self.expansion, + 1, + 1, + activation_callable=None, + norm_cfg=norm_cfg, + ), ), ], ), ) else: - self.short = Conv2dModule(ch_in, ch_out * self.expansion, 1, stride, act_cfg=None, norm_cfg=norm_cfg) + self.short = Conv2dModule( + ch_in, + ch_out * self.expansion, + 1, + stride, + activation_callable=None, + norm_cfg=norm_cfg, + ) - self.act = nn.Identity() if act_cfg is None else build_activation_layer(act_cfg) + self.act = activation_callable() if activation_callable else nn.Identity() def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward.""" @@ -132,7 +168,7 @@ def __init__( ch_out: int, count: int, stage_num: int, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = None, variant: str = "b", norm_cfg: dict[str, str] | None = None, ) -> None: @@ -147,7 +183,7 @@ def __init__( stride=2 if i == 0 and stage_num != 2 else 1, shortcut=i != 0, variant=variant, - act_cfg=act_cfg, + activation_callable=activation_callable, norm_cfg=norm_cfg, ), ) @@ -171,7 +207,8 @@ class PResNet(BaseModule): variant (str): The variant of the PResNet backbone. Defaults to "d". num_stages (int): The number of stages in the PResNet backbone. Defaults to 4. return_idx (list[int]): The indices of the stages to return as output. Defaults to [0, 1, 2, 3]. - act_cfg (dict[str, str] | None, optional): The activation configuration. Defaults to None. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to None. norm_cfg (dict[str, str] | None, optional): The normalization configuration. Defaults to None. freeze_at (int): The stage at which to freeze the parameters. Defaults to -1. pretrained (bool): Whether to load pretrained weights. Defaults to False. 
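The caller-side change is mechanical. A minimal sketch, assuming the `Conv2dModule` keyword names shown in the `presnet.py` hunks above (the channel sizes and the `norm_cfg` dict are illustrative, taken verbatim from other hunks in this patch):

```python
from torch import nn
from otx.algo.modules.conv_module import Conv2dModule

norm_cfg = {"type": "BN", "momentum": 0.03, "eps": 0.001}

# Before: act_cfg={"type": "ReLU"} selected the activation from a config registry.
# After: the activation class is passed directly; None still means "no activation",
# as in the shortcut / projection convs of BasicBlock and BottleNeck above.
branch2a = Conv2dModule(64, 64, 3, 1, padding=1, activation_callable=nn.ReLU, norm_cfg=norm_cfg)
short = Conv2dModule(64, 256, 1, 1, activation_callable=None, norm_cfg=norm_cfg)
```
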
@@ -197,7 +234,7 @@ def __init__( variant: str = "d", num_stages: int = 4, return_idx: list[int] = [0, 1, 2, 3], # noqa: B006 - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, norm_cfg: dict[str, str] | None = None, freeze_at: int = -1, pretrained: bool = False, @@ -215,12 +252,23 @@ def __init__( ] else: conv_def = [[3, ch_in, 7, 2, "conv1_1"]] - act_cfg = act_cfg if act_cfg is not None else {"type": "ReLU"} + norm_cfg = norm_cfg if norm_cfg is not None else {"type": "BN", "name": "norm"} self.conv1 = nn.Sequential( OrderedDict( [ - (_name, Conv2dModule(c_in, c_out, k, s, padding=(k - 1) // 2, act_cfg=act_cfg, norm_cfg=norm_cfg)) + ( + _name, + Conv2dModule( + c_in, + c_out, + k, + s, + padding=(k - 1) // 2, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ), + ) for c_in, c_out, k, s, _name in conv_def ], ), @@ -242,7 +290,7 @@ def __init__( ch_out_list[i], block_nums[i], stage_num, - act_cfg=act_cfg, + activation_callable=activation_callable, variant=variant, norm_cfg=norm_cfg, ), diff --git a/src/otx/algo/detection/heads/rtdetr_decoder.py b/src/otx/algo/detection/heads/rtdetr_decoder.py index 02de4a8a92a..a3b065df974 100644 --- a/src/otx/algo/detection/heads/rtdetr_decoder.py +++ b/src/otx/algo/detection/heads/rtdetr_decoder.py @@ -8,7 +8,7 @@ import copy import math from collections import OrderedDict -from typing import Any +from typing import Any, Callable import torch import torchvision @@ -18,7 +18,6 @@ from otx.algo.detection.utils.utils import ( inverse_sigmoid, ) -from otx.algo.modules import build_activation_layer from otx.algo.modules.base_module import BaseModule from otx.algo.modules.transformer import deformable_attention_core_func @@ -140,13 +139,13 @@ def __init__( hidden_dim: int, output_dim: int, num_layers: int, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = None, ) -> None: super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim, *h], [*h, output_dim])) - self.act = nn.Identity() if act_cfg is None else build_activation_layer(act_cfg) + self.act = activation_callable() if activation_callable else nn.Identity() def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward function of MLP.""" @@ -300,7 +299,8 @@ class TransformerDecoderLayer(nn.Module): n_head (int): The number of heads in the multiheadattention models. dim_feedforward (int): The dimension of the feedforward network model. dropout (float): The dropout value. - activation (dict[str, str] | None, optional): The activation function of intermediate layer, ReLU by default. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.ReLU`. n_levels (int): The number of levels in MSDeformableAttention. n_points (int): The number of points in MSDeformableAttention. 
""" @@ -311,7 +311,7 @@ def __init__( n_head: int = 8, dim_feedforward: int = 1024, dropout: float = 0.0, - activation: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.ReLU, n_levels: int = 4, n_points: int = 4, ): @@ -330,8 +330,7 @@ def __init__( # ffn self.linear1 = nn.Linear(d_model, dim_feedforward) - activation = activation if activation is not None else {"type": "ReLU"} - self.activation = build_activation_layer(activation) + self.activation = activation_callable() self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) self.dropout4 = nn.Dropout(dropout) @@ -468,8 +467,8 @@ class RTDETRTransformer(BaseModule): num_decoder_layers (int): Number of decoder layers. dim_feedforward (int): Dimension of the feedforward network. dropout (float): Dropout rate. - activation (dict[str, str] | None): The activation function of intermediate layer. - ReLu by default. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.ReLU`. num_denoising (int): Number of denoising samples. label_noise_ratio (float): Ratio of label noise. box_noise_scale (float): Scale of box noise. @@ -494,7 +493,7 @@ def __init__( # noqa: PLR0913 num_decoder_layers: int = 6, dim_feedforward: int = 1024, dropout: float = 0.0, - activation: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.ReLU, num_denoising: int = 100, label_noise_ratio: float = 0.5, box_noise_scale: float = 1.0, @@ -531,7 +530,7 @@ def __init__( # noqa: PLR0913 self.num_decoder_layers = num_decoder_layers self.eval_spatial_size = eval_spatial_size self.aux_loss = aux_loss - activation = activation if activation is not None else {"type": "ReLU"} + # backbone feature projection self._build_input_proj_layer(feat_channels) @@ -541,7 +540,7 @@ def __init__( # noqa: PLR0913 nhead, dim_feedforward, dropout, - activation, + activation_callable, num_levels, num_decoder_points, ) @@ -558,7 +557,7 @@ def __init__( # noqa: PLR0913 self.learnt_init_query = learnt_init_query if learnt_init_query: self.tgt_embed = nn.Embedding(num_queries, hidden_dim) - self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2, act_cfg=activation) + self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2, activation_callable=activation_callable) # encoder head self.enc_output = nn.Sequential( @@ -566,12 +565,15 @@ def __init__( # noqa: PLR0913 nn.LayerNorm(hidden_dim), ) self.enc_score_head = nn.Linear(hidden_dim, num_classes) - self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3, act_cfg=activation) + self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3, activation_callable=activation_callable) # decoder head self.dec_score_head = nn.ModuleList([nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers)]) self.dec_bbox_head = nn.ModuleList( - [MLP(hidden_dim, hidden_dim, 4, num_layers=3, act_cfg=activation) for _ in range(num_decoder_layers)], + [ + MLP(hidden_dim, hidden_dim, 4, num_layers=3, activation_callable=activation_callable) + for _ in range(num_decoder_layers) + ], ) # init encoder output anchors and valid_mask diff --git a/src/otx/algo/detection/heads/rtmdet_head.py b/src/otx/algo/detection/heads/rtmdet_head.py index 429c03cbe05..71623aad9e7 100644 --- a/src/otx/algo/detection/heads/rtmdet_head.py +++ b/src/otx/algo/detection/heads/rtmdet_head.py @@ -8,6 +8,8 @@ from __future__ import annotations +from typing import Callable + import torch from torch import Tensor, nn @@ -35,8 
+37,8 @@ class RTMDetHead(ATSSHead): in_channels (int): Number of channels in the input feature map. with_objectness (bool): Whether to add an objectness branch. Defaults to True. - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU') + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.ReLU`. """ def __init__( @@ -44,10 +46,10 @@ def __init__( num_classes: int, in_channels: int, with_objectness: bool = True, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.ReLU, **kwargs, ) -> None: - self.act_cfg = act_cfg or {"type": "ReLU"} + self.activation_callable = activation_callable self.with_objectness = with_objectness super().__init__(num_classes, in_channels, **kwargs) if self.train_cfg: @@ -67,7 +69,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) self.reg_convs.append( @@ -78,7 +80,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) pred_pad_size = self.pred_kernel_size // 2 @@ -643,8 +645,8 @@ class RTMDetSepBNHead(RTMDetHead): Defaults to False. norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). - act_cfg (dict): Config dict for activation layer. - Defaults to dict(type='SiLU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.SiLU`. pred_kernel_size (int): Kernel size of prediction layer. Defaults to 1. exp_on_reg (bool): Whether using exponential of regression features or not. Defaults to False. """ @@ -656,12 +658,11 @@ def __init__( share_conv: bool = True, use_depthwise: bool = False, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.SiLU, pred_kernel_size: int = 1, exp_on_reg: bool = False, **kwargs, ) -> None: - act_cfg = act_cfg or {"type": "SiLU"} norm_cfg = norm_cfg or {"type": "BN", "momentum": 0.03, "eps": 0.001} self.share_conv = share_conv self.exp_on_reg = exp_on_reg @@ -670,7 +671,7 @@ def __init__( num_classes, in_channels, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, pred_kernel_size=pred_kernel_size, **kwargs, ) @@ -698,7 +699,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) reg_convs.append( @@ -709,7 +710,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) self.cls_convs.append(cls_convs) diff --git a/src/otx/algo/detection/heads/yolox_head.py b/src/otx/algo/detection/heads/yolox_head.py index 7f8a12fbef2..af5dd677af0 100644 --- a/src/otx/algo/detection/heads/yolox_head.py +++ b/src/otx/algo/detection/heads/yolox_head.py @@ -10,7 +10,7 @@ import logging import math -from typing import Sequence +from typing import Callable, Sequence import torch import torch.nn.functional as F # noqa: N812 @@ -24,6 +24,7 @@ from otx.algo.common.utils.utils import multi_apply, reduce_mean from otx.algo.detection.heads.base_head import BaseDenseHead from otx.algo.detection.losses import IoULoss +from otx.algo.modules.activation import Swish from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule from otx.algo.utils.mmengine_utils import 
InstanceData @@ -41,7 +42,7 @@ class YOLOXHead(BaseDenseHead): stacked_convs (int): Number of stacking convs of the head. Defaults to (8, 16, 32). strides (Sequence[int]): Downsample factor of each feature map. - Defaults to None. + Defaults to None. use_depthwise (bool): Whether to depthwise separable convolution in blocks. Defaults to False. dcn_on_last_conv (bool): If true, use dcn in the last layer of towers. @@ -51,8 +52,8 @@ class YOLOXHead(BaseDenseHead): None, otherwise False. Defaults to "auto". norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). - act_cfg (dict): Config dict for activation layer. - Defaults to None. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `Swish`. loss_cls (nn.Module, optional): Module of classification loss. loss_bbox (nn.Module, optional): Module of localization loss. loss_obj (nn.Module, optional): Module of objectness loss. @@ -76,7 +77,7 @@ def __init__( dcn_on_last_conv: bool = False, conv_bias: bool | str = "auto", norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = Swish, loss_cls: nn.Module | None = None, loss_bbox: nn.Module | None = None, loss_obj: nn.Module | None = None, @@ -88,9 +89,6 @@ def __init__( if norm_cfg is None: norm_cfg = {"type": "BN", "momentum": 0.03, "eps": 0.001} - if act_cfg is None: - act_cfg = {"type": "Swish"} - if init_cfg is None: init_cfg = { "type": "Kaiming", @@ -118,7 +116,7 @@ def __init__( self.use_sigmoid_cls = True self.norm_cfg = norm_cfg - self.act_cfg = act_cfg + self.activation_callable = activation_callable self.loss_cls = loss_cls or CrossEntropyLoss(use_sigmoid=True, reduction="sum", loss_weight=1.0) self.loss_bbox = loss_bbox or IoULoss(mode="square", eps=1e-16, reduction="sum", loss_weight=5.0) @@ -176,7 +174,7 @@ def _build_stacked_convs(self) -> nn.Sequential: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, bias=self.conv_bias, ), ) diff --git a/src/otx/algo/detection/layers/csp_layer.py b/src/otx/algo/detection/layers/csp_layer.py index 4cb0d10b57a..b4f39ff421a 100644 --- a/src/otx/algo/detection/layers/csp_layer.py +++ b/src/otx/algo/detection/layers/csp_layer.py @@ -5,11 +5,13 @@ from __future__ import annotations +from typing import Callable + import torch from torch import Tensor, nn from otx.algo.detection.layers import ChannelAttention -from otx.algo.modules import build_activation_layer +from otx.algo.modules.activation import Swish from otx.algo.modules.base_module import BaseModule from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule @@ -33,8 +35,8 @@ class DarknetBottleneck(BaseModule): Defaults to False. norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Defaults to dict(type='Swish'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `Swish`. 
""" def __init__( @@ -45,15 +47,12 @@ def __init__( add_identity: bool = True, use_depthwise: bool = False, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = Swish, init_cfg: dict | list[dict] | None = None, ) -> None: if norm_cfg is None: norm_cfg = {"type": "BN", "momentum": 0.03, "eps": 0.001} - if act_cfg is None: - act_cfg = {"type": "Swish"} - super().__init__(init_cfg=init_cfg) hidden_channels = int(out_channels * expansion) @@ -63,7 +62,7 @@ def __init__( hidden_channels, 1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.conv2 = conv( hidden_channels, @@ -72,7 +71,7 @@ def __init__( stride=1, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.add_identity = add_identity and in_channels == out_channels @@ -102,8 +101,8 @@ class CSPNeXtBlock(BaseModule): Defaults to 5. norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). - act_cfg (dict): Config dict for activation layer. - Defaults to dict(type='SiLU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.SiLU`. init_cfg (dict or list[dict], optional): Initialization config dict. Defaults to None. """ @@ -117,20 +116,25 @@ def __init__( use_depthwise: bool = False, kernel_size: int = 5, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.SiLU, init_cfg: dict | list[dict] | None = None, ) -> None: if norm_cfg is None: norm_cfg = {"type": "BN", "momentum": 0.03, "eps": 0.001} - if act_cfg is None: - act_cfg = {"type": "SiLU"} - super().__init__(init_cfg=init_cfg) hidden_channels = int(out_channels * expansion) conv = DepthwiseSeparableConvModule if use_depthwise else Conv2dModule - self.conv1 = conv(in_channels, hidden_channels, 3, stride=1, padding=1, norm_cfg=norm_cfg, act_cfg=act_cfg) + self.conv1 = conv( + in_channels, + hidden_channels, + 3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + activation_callable=activation_callable, + ) self.conv2 = DepthwiseSeparableConvModule( hidden_channels, out_channels, @@ -138,7 +142,7 @@ def __init__( stride=1, padding=kernel_size // 2, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.add_identity = add_identity and in_channels == out_channels @@ -159,7 +163,8 @@ class RepVggBlock(nn.Module): Args: ch_in (int): The input channels of this Module. ch_out (int): The output channels of this Module. - act_cfg (dict[str, str] | None): Config dict for activation layer. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to None. norm_cfg (dict[str, str] | None): Config dict for normalization layer. 
""" @@ -167,16 +172,16 @@ def __init__( self, ch_in: int, ch_out: int, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = None, norm_cfg: dict[str, str] | None = None, ) -> None: """Initialize RepVggBlock.""" super().__init__() self.ch_in = ch_in self.ch_out = ch_out - self.conv1 = Conv2dModule(ch_in, ch_out, 3, 1, padding=1, act_cfg=None, norm_cfg=norm_cfg) - self.conv2 = Conv2dModule(ch_in, ch_out, 1, 1, act_cfg=None, norm_cfg=norm_cfg) - self.act = nn.Identity() if act_cfg is None else build_activation_layer(act_cfg) + self.conv1 = Conv2dModule(ch_in, ch_out, 3, 1, padding=1, activation_callable=None, norm_cfg=norm_cfg) + self.conv2 = Conv2dModule(ch_in, ch_out, 1, 1, activation_callable=None, norm_cfg=norm_cfg) + self.act = activation_callable() if activation_callable else nn.Identity() def forward(self, x: Tensor) -> Tensor: """Forward function.""" @@ -230,8 +235,8 @@ class CSPLayer(BaseModule): stage. Defaults to True. norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN') - act_cfg (dict): Config dict for activation layer. - Defaults to dict(type='Swish') + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `Swish`. init_cfg (dict or list[dict], optional): Initialization config dict. Defaults to None. """ @@ -247,15 +252,12 @@ def __init__( use_cspnext_block: bool = False, channel_attention: bool = False, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = Swish, init_cfg: dict | list[dict] | None = None, ) -> None: if norm_cfg is None: norm_cfg = {"type": "BN", "momentum": 0.03, "eps": 0.001} - if act_cfg is None: - act_cfg = {"type": "Swish"} - super().__init__(init_cfg=init_cfg) block = CSPNeXtBlock if use_cspnext_block else DarknetBottleneck @@ -266,21 +268,21 @@ def __init__( mid_channels, 1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.short_conv = Conv2dModule( in_channels, mid_channels, 1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.final_conv = Conv2dModule( 2 * mid_channels, out_channels, 1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) self.blocks = nn.Sequential( @@ -292,7 +294,7 @@ def __init__( add_identity, use_depthwise, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ) for _ in range(num_blocks) ], @@ -325,7 +327,7 @@ class CSPRepLayer(nn.Module): hidden layer. Defaults to 1.0. bias (bool): Whether to use bias in the convolution layer. Defaults to False. - act_cfg (dict[str, str] | None): Config dict for activation layer. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. Defaults to None. norm_cfg (dict[str, str] | None): Config dict for normalization layer. Defaults to None. 
@@ -338,17 +340,38 @@ def __init__( num_blocks: int = 3, expansion: float = 1.0, bias: bool = False, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = None, norm_cfg: dict[str, str] | None = None, ) -> None: """Initialize CSPRepLayer.""" super().__init__() hidden_channels = int(out_channels * expansion) - self.conv1 = Conv2dModule(in_channels, hidden_channels, 1, 1, bias=bias, act_cfg=act_cfg, norm_cfg=norm_cfg) - self.conv2 = Conv2dModule(in_channels, hidden_channels, 1, 1, bias=bias, act_cfg=act_cfg, norm_cfg=norm_cfg) + self.conv1 = Conv2dModule( + in_channels, + hidden_channels, + 1, + 1, + bias=bias, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ) + self.conv2 = Conv2dModule( + in_channels, + hidden_channels, + 1, + 1, + bias=bias, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ) self.bottlenecks = nn.Sequential( *[ - RepVggBlock(hidden_channels, hidden_channels, act_cfg=act_cfg, norm_cfg=norm_cfg) + RepVggBlock( + hidden_channels, + hidden_channels, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ) for _ in range(num_blocks) ], ) @@ -359,7 +382,7 @@ def __init__( 1, 1, bias=bias, - act_cfg=act_cfg, + activation_callable=activation_callable, norm_cfg=norm_cfg, ) else: diff --git a/src/otx/algo/detection/necks/cspnext_pafpn.py b/src/otx/algo/detection/necks/cspnext_pafpn.py index 4b10101557d..82c91bb9c70 100644 --- a/src/otx/algo/detection/necks/cspnext_pafpn.py +++ b/src/otx/algo/detection/necks/cspnext_pafpn.py @@ -11,11 +11,13 @@ from __future__ import annotations import math +from typing import Callable import torch from torch import Tensor, nn from otx.algo.detection.layers import CSPLayer +from otx.algo.modules.activation import Swish from otx.algo.modules.base_module import BaseModule from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule @@ -31,7 +33,8 @@ class CSPNeXtPAFPN(BaseModule): expand_ratio (float): Ratio to adjust the number of channels of the hidden layer. Default: 0.5 upsample_cfg (dict): Config dict for interpolate layer. Default: `dict(scale_factor=2, mode='nearest')` norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN') - act_cfg (dict): Config dict for activation layer. Default: dict(type='Swish') + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `Swish`. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None. 
""" @@ -44,12 +47,11 @@ def __init__( expand_ratio: float = 0.5, upsample_cfg: dict | None = None, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = Swish, init_cfg: dict | None = None, ) -> None: upsample_cfg = upsample_cfg or {"scale_factor": 2, "mode": "nearest"} norm_cfg = norm_cfg or {"type": "BN", "momentum": 0.03, "eps": 0.001} - act_cfg = act_cfg or {"type": "Swish"} init_cfg = init_cfg or { "type": "Kaiming", "layer": "Conv2d", @@ -76,7 +78,7 @@ def __init__( in_channels[idx - 1], 1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) self.top_down_blocks.append( @@ -89,7 +91,7 @@ def __init__( use_cspnext_block=True, expand_ratio=expand_ratio, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) @@ -105,7 +107,7 @@ def __init__( stride=2, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) self.bottom_up_blocks.append( @@ -118,14 +120,21 @@ def __init__( use_cspnext_block=True, expand_ratio=expand_ratio, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) self.out_convs = nn.ModuleList() for i in range(len(in_channels)): self.out_convs.append( - conv(in_channels[i], out_channels, 3, padding=1, norm_cfg=norm_cfg, act_cfg=act_cfg), + conv( + in_channels[i], + out_channels, + 3, + padding=1, + norm_cfg=norm_cfg, + activation_callable=activation_callable, + ), ) def forward(self, inputs: tuple[Tensor, ...]) -> tuple[Tensor, ...]: diff --git a/src/otx/algo/detection/necks/fpn.py b/src/otx/algo/detection/necks/fpn.py index 1d6c32355e0..4239be3944b 100644 --- a/src/otx/algo/detection/necks/fpn.py +++ b/src/otx/algo/detection/necks/fpn.py @@ -10,6 +10,8 @@ from __future__ import annotations +from typing import Callable + from torch import Tensor, nn from otx.algo.modules.base_module import BaseModule @@ -45,8 +47,8 @@ class FPN(BaseModule): Defaults to False. norm_cfg (dict, optional): Config dict for normalization layer. Defaults to None. - act_cfg (dict, optional): Config dict for - activation layer in ConvModule. Defaults to None. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to None. upsample_cfg (dict, optional): Config dict for interpolate layer. Defaults to dict(mode='nearest'). init_cfg (dict, list[dict], optional): Initialization config dict. 
@@ -63,7 +65,7 @@ def __init__( relu_before_extra_convs: bool = False, no_norm_on_lateral: bool = False, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = None, upsample_cfg: dict | None = None, init_cfg: dict | list[dict] | None = None, ) -> None: @@ -103,7 +105,7 @@ def __init__( out_channels, 1, norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, - act_cfg=act_cfg, + activation_callable=activation_callable, inplace=False, ) fpn_conv = Conv2dModule( @@ -112,7 +114,7 @@ def __init__( 3, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, inplace=False, ) @@ -134,7 +136,7 @@ def __init__( stride=2, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, inplace=False, ) self.fpn_convs.append(extra_fpn_conv) diff --git a/src/otx/algo/detection/necks/hybrid_encoder.py b/src/otx/algo/detection/necks/hybrid_encoder.py index dc8879ada33..c93d7fb842c 100644 --- a/src/otx/algo/detection/necks/hybrid_encoder.py +++ b/src/otx/algo/detection/necks/hybrid_encoder.py @@ -6,12 +6,13 @@ from __future__ import annotations import copy +from typing import Callable import torch from torch import nn from otx.algo.detection.layers import CSPRepLayer -from otx.algo.modules import Conv2dModule, build_activation_layer +from otx.algo.modules import Conv2dModule from otx.algo.modules.base_module import BaseModule __all__ = ["HybridEncoder"] @@ -25,7 +26,7 @@ def __init__( nhead: int, dim_feedforward: int = 2048, dropout: float = 0.1, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, normalize_before: bool = False, ) -> None: super().__init__() @@ -41,8 +42,7 @@ def __init__( self.norm2 = nn.LayerNorm(d_model) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(dropout) - act_cfg = act_cfg if act_cfg is not None else {"type": "GELU"} - self.activation = build_activation_layer(act_cfg) + self.activation = activation_callable() @staticmethod def with_pos_embed(tensor: torch.Tensor, pos_embed: torch.Tensor | None) -> torch.Tensor: @@ -111,8 +111,8 @@ class HybridEncoder(BaseModule): dim_feedforward (int, optional): Dimension of the feedforward network in the transformer encoder. Defaults to 1024. dropout (float, optional): Dropout rate. Defaults to 0.0. - enc_act_cfg (dict[str, str] | None, optional): Activation configuration - for the encoder. Defaults to None. + enc_activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. norm_cfg (dict[str, str] | None, optional): Normalization configuration. Defaults to None. use_encoder_idx (list[int], optional): List of indices of the encoder to use. @@ -125,8 +125,8 @@ class HybridEncoder(BaseModule): Defaults to 1.0. depth_mult (float, optional): Depth multiplier for the CSPRepLayer. Defaults to 1.0. - act_cfg (dict[str, str] | None, optional): Activation configuration - for the CSPRepLayer. Defaults to None. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.SiLU`. eval_spatial_size (tuple[int, int] | None, optional): Spatial size for evaluation. Defaults to None. 
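A hedged sketch of the encoder-side knob, assuming the transformer layer defined at the top of hybrid_encoder.py keeps its `TransformerEncoderLayer` name (the class header is not visible in this hunk) and using an illustrative 256-dim model:

    from torch import nn

    # Class name and sizes below are assumptions for illustration.
    from otx.algo.detection.necks.hybrid_encoder import TransformerEncoderLayer

    # nn.GELU is now the baked-in default; overriding it is just passing
    # another module class, with no {"type": "GELU"} lookup involved.
    enc_layer = TransformerEncoderLayer(d_model=256, nhead=8, activation_callable=nn.ReLU)
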
""" @@ -139,14 +139,14 @@ def __init__( nhead: int = 8, dim_feedforward: int = 1024, dropout: float = 0.0, - enc_act_cfg: dict[str, str] | None = None, + enc_activation_callable: Callable[..., nn.Module] = nn.GELU, norm_cfg: dict[str, str] | None = None, use_encoder_idx: list[int] = [2], # noqa: B006 num_encoder_layers: int = 1, pe_temperature: float = 10000, expansion: float = 1.0, depth_mult: float = 1.0, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.SiLU, eval_spatial_size: tuple[int, int] | None = None, ) -> None: """Initialize the HybridEncoder module.""" @@ -161,8 +161,6 @@ def __init__( self.out_channels = [hidden_dim for _ in range(len(in_channels))] self.out_strides = feat_strides - enc_act_cfg = enc_act_cfg if enc_act_cfg is not None else {"type": "GELU"} - act_cfg = act_cfg if act_cfg is not None else {"type": "SiLU"} norm_cfg = norm_cfg if norm_cfg is not None else {"type": "BN", "name": "norm"} # channel projection self.input_proj = nn.ModuleList() @@ -180,7 +178,7 @@ def __init__( nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, - act_cfg=enc_act_cfg, + activation_callable=enc_activation_callable, ) self.encoder = nn.ModuleList( @@ -191,13 +189,15 @@ def __init__( self.lateral_convs = nn.ModuleList() self.fpn_blocks = nn.ModuleList() for _ in range(len(in_channels) - 1, 0, -1): - self.lateral_convs.append(Conv2dModule(hidden_dim, hidden_dim, 1, 1, act_cfg=act_cfg, norm_cfg=norm_cfg)) + self.lateral_convs.append( + Conv2dModule(hidden_dim, hidden_dim, 1, 1, activation_callable=activation_callable, norm_cfg=norm_cfg), + ) self.fpn_blocks.append( CSPRepLayer( hidden_dim * 2, hidden_dim, round(3 * depth_mult), - act_cfg=act_cfg, + activation_callable=activation_callable, expansion=expansion, norm_cfg=norm_cfg, ), @@ -208,14 +208,22 @@ def __init__( self.pan_blocks = nn.ModuleList() for _ in range(len(in_channels) - 1): self.downsample_convs.append( - Conv2dModule(hidden_dim, hidden_dim, 3, 2, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg), + Conv2dModule( + hidden_dim, + hidden_dim, + 3, + 2, + padding=1, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ), ) self.pan_blocks.append( CSPRepLayer( hidden_dim * 2, hidden_dim, round(3 * depth_mult), - act_cfg=act_cfg, + activation_callable=activation_callable, expansion=expansion, norm_cfg=norm_cfg, ), diff --git a/src/otx/algo/detection/necks/yolox_pafpn.py b/src/otx/algo/detection/necks/yolox_pafpn.py index 762d6c36852..68b99e27caa 100644 --- a/src/otx/algo/detection/necks/yolox_pafpn.py +++ b/src/otx/algo/detection/necks/yolox_pafpn.py @@ -9,12 +9,13 @@ from __future__ import annotations import math -from typing import Any +from typing import Any, Callable import torch from torch import Tensor, nn from otx.algo.detection.layers import CSPLayer +from otx.algo.modules.activation import Swish from otx.algo.modules.base_module import BaseModule from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule @@ -32,8 +33,8 @@ class YOLOXPAFPN(BaseModule): Default: `dict(scale_factor=2, mode='nearest')` norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN') - act_cfg (dict): Config dict for activation layer. - Default: dict(type='Swish') + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.Swish`. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None. 
""" @@ -46,12 +47,11 @@ def __init__( use_depthwise: bool = False, upsample_cfg: dict | None = None, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = Swish, init_cfg: dict | list[dict] | None = None, ): upsample_cfg = upsample_cfg or {"scale_factor": 2, "mode": "nearest"} norm_cfg = norm_cfg or {"type": "BN", "momentum": 0.03, "eps": 0.001} - act_cfg = act_cfg or {"type": "Swish"} init_cfg = init_cfg or { "type": "Kaiming", "layer": "Conv2d", @@ -79,7 +79,7 @@ def __init__( in_channels[idx - 1], 1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) self.top_down_blocks.append( @@ -90,7 +90,7 @@ def __init__( add_identity=False, use_depthwise=use_depthwise, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) @@ -106,7 +106,7 @@ def __init__( stride=2, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) self.bottom_up_blocks.append( @@ -117,14 +117,20 @@ def __init__( add_identity=False, use_depthwise=use_depthwise, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) self.out_convs = nn.ModuleList() for i in range(len(in_channels)): self.out_convs.append( - Conv2dModule(in_channels[i], out_channels, 1, norm_cfg=norm_cfg, act_cfg=act_cfg), + Conv2dModule( + in_channels[i], + out_channels, + 1, + norm_cfg=norm_cfg, + activation_callable=activation_callable, + ), ) def forward(self, inputs: tuple[Tensor]) -> tuple[Any, ...]: diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py index 051df93020c..254ad2aff4e 100644 --- a/src/otx/algo/detection/rtmdet.py +++ b/src/otx/algo/detection/rtmdet.py @@ -5,6 +5,10 @@ from __future__ import annotations +from functools import partial + +from torch import nn + from otx.algo.common.backbones import CSPNeXt from otx.algo.common.losses import GIoULoss, QualityFocalLoss from otx.algo.common.losses.cross_entropy_loss import CrossEntropyLoss @@ -90,7 +94,7 @@ def _build_model(self, num_classes: int) -> RTMDet: deepen_factor=0.167, widen_factor=0.375, norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), ) neck = CSPNeXtPAFPN( @@ -98,7 +102,7 @@ def _build_model(self, num_classes: int) -> RTMDet: out_channels=96, num_csp_blocks=1, norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), ) bbox_head = RTMDetSepBNHead( @@ -113,7 +117,7 @@ def _build_model(self, num_classes: int) -> RTMDet: loss_bbox=GIoULoss(loss_weight=2.0), loss_centerness=CrossEntropyLoss(use_sigmoid=True, loss_weight=1.0), norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), train_cfg=train_cfg, test_cfg=test_cfg, ) diff --git a/src/otx/algo/instance_segmentation/backbones/swin.py b/src/otx/algo/instance_segmentation/backbones/swin.py index 4eb85af6362..b249dfdb75c 100644 --- a/src/otx/algo/instance_segmentation/backbones/swin.py +++ b/src/otx/algo/instance_segmentation/backbones/swin.py @@ -12,6 +12,7 @@ from collections import OrderedDict from copy import deepcopy from pathlib import Path +from typing import Callable import torch import torch.nn.functional @@ -317,8 +318,8 @@ class SwinBlock(BaseModule): drop_rate (float, optional): Dropout rate. Default: 0. attn_drop_rate (float, optional): Attention dropout rate. Default: 0. 
drop_path_rate (float, optional): Stochastic depth rate. Default: 0. - act_cfg (dict, optional): The config dict of activation function. - Default: dict(type='GELU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. norm_cfg (dict, optional): The config dict of normalization. Default: dict(type='LN'). with_cp (bool, optional): Use checkpoint or not. Using checkpoint @@ -340,7 +341,7 @@ def __init__( drop_rate: float = 0.0, attn_drop_rate: float = 0.0, drop_path_rate: float = 0.0, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, norm_cfg: dict | None = None, with_cp: bool = False, init_cfg: None = None, @@ -350,7 +351,6 @@ def __init__( self.init_cfg = init_cfg self.with_cp = with_cp - act_cfg = act_cfg if act_cfg is not None else {"type": "GELU"} norm_cfg = norm_cfg if norm_cfg is not None else {"type": "LN"} self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] @@ -374,7 +374,7 @@ def __init__( num_fcs=2, ffn_drop=drop_rate, dropout_layer={"type": "DropPath", "drop_prob": drop_path_rate}, - act_cfg=act_cfg, + activation_callable=activation_callable, add_identity=True, init_cfg=None, ) @@ -415,8 +415,8 @@ class SwinBlockSequence(BaseModule): rate. Default: 0. downsample (BaseModule | None, optional): The downsample operation module. Default: None. - act_cfg (dict, optional): The config dict of activation function. - Default: dict(type='GELU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. norm_cfg (dict, optional): The config dict of normalization. Default: dict(type='LN'). with_cp (bool, optional): Use checkpoint or not. Using checkpoint @@ -439,14 +439,13 @@ def __init__( attn_drop_rate: float = 0.0, drop_path_rate: list[float] | float = 0.0, downsample: BaseModule | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, norm_cfg: dict | None = None, with_cp: bool = False, init_cfg: None = None, ): super().__init__(init_cfg=init_cfg) - act_cfg = act_cfg if act_cfg is not None else {"type": "GELU"} norm_cfg = norm_cfg if norm_cfg is not None else {"type": "LN"} if isinstance(drop_path_rate, list): @@ -470,7 +469,7 @@ def __init__( drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=drop_path_rates[i], - act_cfg=act_cfg, + activation_callable=activation_callable, norm_cfg=norm_cfg, with_cp=with_cp, init_cfg=None, @@ -528,8 +527,8 @@ class SwinTransformer(BaseModule): drop_rate (float): Dropout rate. Defaults: 0. attn_drop_rate (float): Attention dropout rate. Default: 0. drop_path_rate (float): Stochastic depth rate. Defaults: 0.1. - act_cfg (dict): Config dict for activation layer. - Default: dict(type='GELU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. norm_cfg (dict): Config dict for normalization layer at output of backone. Defaults: dict(type='LN'). with_cp (bool, optional): Use checkpoint or not. 
Using checkpoint @@ -564,7 +563,7 @@ def __init__( drop_rate: float = 0.0, attn_drop_rate: float = 0.0, drop_path_rate: float = 0.1, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, norm_cfg: dict | None = None, with_cp: bool = False, pretrained: str | None = None, @@ -572,7 +571,6 @@ def __init__( frozen_stages: int = -1, init_cfg: dict | None = None, ): - act_cfg = act_cfg if act_cfg is not None else {"type": "GELU"} norm_cfg = norm_cfg if norm_cfg is not None else {"type": "LN"} self.convert_weights = convert_weights self.frozen_stages = frozen_stages @@ -650,7 +648,7 @@ def __init__( attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[sum(depths[:i]) : sum(depths[: i + 1])], downsample=downsample, - act_cfg=act_cfg, + activation_callable=activation_callable, norm_cfg=norm_cfg, with_cp=with_cp, init_cfg=None, diff --git a/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py b/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py index 9d46627a43b..87c5c3dbc99 100644 --- a/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py +++ b/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py @@ -10,6 +10,8 @@ import copy import math +from functools import partial +from typing import Callable import numpy as np import torch @@ -109,7 +111,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) pred_pad_size = self.pred_kernel_size // 2 @@ -125,7 +127,7 @@ def _init_layers(self) -> None: stacked_convs=4, num_levels=len(self.prior_generator.strides), num_prototypes=self.num_prototypes, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, norm_cfg=self.norm_cfg, ) @@ -702,16 +704,16 @@ class MaskFeatModule(BaseModule): Args: in_channels (int): Number of channels in the input feature map. feat_channels (int): Number of hidden channels of the mask feature - map branch. + map branch. num_levels (int): The starting feature map level from RPN that - will be used to predict the mask feature map. + will be used to predict the mask feature map. num_prototypes (int): Number of output channel of the mask feature - map branch. This is the channel count of the mask - feature map that to be dynamically convolved with the predicted - kernel. + map branch. This is the channel count of the mask + feature map that to be dynamically convolved with the predicted + kernel. stacked_convs (int): Number of convs in mask feature branch. - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU', inplace=True) + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `partial(nn.ReLU, inplace=True)`. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='BN'). 
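A usage sketch of the new defaults in this head (the channel counts below are invented; parameter names follow the docstring above): the old dict default `{"type": "ReLU", "inplace": True}` is now expressed directly as `partial(nn.ReLU, inplace=True)`.

    from functools import partial

    from torch import nn
    from otx.algo.instance_segmentation.heads.rtmdet_ins_head import MaskFeatModule

    # Uses the class default partial(nn.ReLU, inplace=True).
    mask_feat = MaskFeatModule(in_channels=96, feat_channels=96)

    # Overriding with a different activation and its keyword arguments.
    mask_feat_silu = MaskFeatModule(
        in_channels=96,
        feat_channels=96,
        activation_callable=partial(nn.SiLU, inplace=True),
    )
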
""" @@ -722,14 +724,11 @@ def __init__( stacked_convs: int = 4, num_levels: int = 3, num_prototypes: int = 8, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = partial(nn.ReLU, inplace=True), norm_cfg: dict | None = None, ) -> None: super().__init__(init_cfg=None) - if act_cfg is None: - act_cfg = {"type": "ReLU", "inplace": True} - if norm_cfg is None: norm_cfg = {"type": "BN"} @@ -738,7 +737,16 @@ def __init__( convs = [] for i in range(stacked_convs): in_c = in_channels if i == 0 else feat_channels - convs.append(Conv2dModule(in_c, feat_channels, 3, padding=1, act_cfg=act_cfg, norm_cfg=norm_cfg)) + convs.append( + Conv2dModule( + in_c, + feat_channels, + 3, + padding=1, + activation_callable=activation_callable, + norm_cfg=norm_cfg, + ), + ) self.stacked_convs = nn.Sequential(*convs) self.projection = nn.Conv2d(feat_channels, num_prototypes, kernel_size=1) @@ -768,8 +776,8 @@ class RTMDetInsSepBNHead(RTMDetInsHead): Defaults to True. norm_cfg (dict): Config dict for normalization layer. Defaults to dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Defaults to dict(type='SiLU', inplace=True). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `partial(nn.SiLU, inplace=True)`. pred_kernel_size (int): Kernel size of prediction layer. Defaults to 1. """ @@ -780,21 +788,19 @@ def __init__( share_conv: bool = True, with_objectness: bool = False, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = partial(nn.SiLU, inplace=True), pred_kernel_size: int = 1, **kwargs, ) -> None: if norm_cfg is None: norm_cfg = {"type": "BN", "requires_grad": True} - if act_cfg is None: - act_cfg = {"type": "SiLU", "inplace": True} self.share_conv = share_conv super().__init__( num_classes, in_channels, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, pred_kernel_size=pred_kernel_size, with_objectness=with_objectness, **kwargs, @@ -849,7 +855,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) reg_convs.append( @@ -860,7 +866,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) kernel_convs.append( @@ -871,7 +877,7 @@ def _init_layers(self) -> None: stride=1, padding=1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ) self.cls_convs.append(cls_convs) @@ -907,7 +913,7 @@ def _init_layers(self) -> None: stacked_convs=4, num_levels=len(self.prior_generator.strides), num_prototypes=self.num_prototypes, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, norm_cfg=self.norm_cfg, ) diff --git a/src/otx/algo/instance_segmentation/maskrcnn.py b/src/otx/algo/instance_segmentation/maskrcnn.py index 6a9f95c9974..0f6796aa791 100644 --- a/src/otx/algo/instance_segmentation/maskrcnn.py +++ b/src/otx/algo/instance_segmentation/maskrcnn.py @@ -7,6 +7,7 @@ from typing import Any +from torch import nn from torchvision.ops import RoIAlign from otx.algo.common.backbones import ResNet, build_model_including_pytorchcv @@ -330,7 +331,7 @@ def _build_model(self, num_classes: int) -> TwoStageDetector: "out_indices": [2, 3, 4, 5], "frozen_stages": -1, "pretrained": True, - "activation_cfg": {"type": "torch_swish"}, + "activation_callable": nn.SiLU, "norm_cfg": {"type": "BN", 
"requires_grad": True}, }, ) diff --git a/src/otx/algo/instance_segmentation/necks/fpn.py b/src/otx/algo/instance_segmentation/necks/fpn.py index 67286814f89..005b80cfe91 100644 --- a/src/otx/algo/instance_segmentation/necks/fpn.py +++ b/src/otx/algo/instance_segmentation/necks/fpn.py @@ -8,6 +8,8 @@ from __future__ import annotations +from typing import Callable + import torch.nn.functional from torch import Tensor, nn @@ -36,8 +38,8 @@ class FPN(BaseModule): Defaults to False. norm_cfg (dict, optional): Config dict for normalization layer. Defaults to None. - act_cfg (dict, optional): Config dict for - activation layer in ConvModule. Defaults to None. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to None. upsample_cfg (dict, optional): Config dict for interpolate layer. Defaults to dict(mode='nearest'). init_cfg (dict or list[dict]): Initialization config dict. @@ -53,7 +55,7 @@ def __init__( relu_before_extra_convs: bool = False, no_norm_on_lateral: bool = False, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] | None = None, upsample_cfg: dict | None = None, init_cfg: dict | list[dict] | None = None, ) -> None: @@ -97,7 +99,7 @@ def __init__( out_channels, 1, norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, - act_cfg=act_cfg, + activation_callable=activation_callable, inplace=False, ) fpn_conv = Conv2dModule( @@ -106,7 +108,7 @@ def __init__( 3, padding=1, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, inplace=False, ) diff --git a/src/otx/algo/instance_segmentation/rtmdet_inst.py b/src/otx/algo/instance_segmentation/rtmdet_inst.py index 53f028e0f9e..5f0a97215d0 100644 --- a/src/otx/algo/instance_segmentation/rtmdet_inst.py +++ b/src/otx/algo/instance_segmentation/rtmdet_inst.py @@ -5,8 +5,11 @@ from __future__ import annotations +from functools import partial from typing import TYPE_CHECKING +from torch import nn + from otx.algo.common.backbones import CSPNeXt from otx.algo.common.losses import CrossEntropyLoss, GIoULoss, QualityFocalLoss from otx.algo.common.utils.assigners import DynamicSoftLabelAssigner @@ -112,7 +115,7 @@ def _build_model(self, num_classes: int) -> SingleStageDetector: widen_factor=0.375, channel_attention=True, norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), ) neck = CSPNeXtPAFPN( @@ -121,7 +124,7 @@ def _build_model(self, num_classes: int) -> SingleStageDetector: num_csp_blocks=1, expand_ratio=0.5, norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), ) bbox_head = RTMDetInsSepBNHead( @@ -131,7 +134,7 @@ def _build_model(self, num_classes: int) -> SingleStageDetector: share_conv=True, pred_kernel_size=1, feat_channels=96, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), norm_cfg={"type": "BN", "requires_grad": True}, anchor_generator=MlvlPointGenerator( offset=0, diff --git a/src/otx/algo/keypoint_detection/rtmpose.py b/src/otx/algo/keypoint_detection/rtmpose.py index c552580b557..0086acd80dd 100644 --- a/src/otx/algo/keypoint_detection/rtmpose.py +++ b/src/otx/algo/keypoint_detection/rtmpose.py @@ -5,6 +5,7 @@ from __future__ import annotations +from functools import partial from typing import TYPE_CHECKING from otx.algo.common.backbones import CSPNeXt @@ -13,6 +14,7 @@ from otx.algo.keypoint_detection.topdown import 
TopdownPoseEstimator from otx.core.exporter.native import OTXNativeModelExporter from otx.core.model.keypoint_detection import OTXKeypointDetectionModel +from torch import nn if TYPE_CHECKING: from otx.core.exporter.base import OTXModelExporter @@ -77,7 +79,7 @@ def _build_model(self, num_classes: int) -> RTMPose: out_indices=(4,), channel_attention=True, norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), ) head = RTMCCHead( out_channels=num_classes, diff --git a/src/otx/algo/modules/__init__.py b/src/otx/algo/modules/__init__.py index 605f47c67e0..ddf0be601e4 100644 --- a/src/otx/algo/modules/__init__.py +++ b/src/otx/algo/modules/__init__.py @@ -1,16 +1,13 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -# Copyright (c) OpenMMLab. All rights reserved. -"""This module implementation is a code implementation copied or replaced from mmcv.cnn.bricks.""" +"""Common module implementations.""" -from .activation import build_activation_layer from .conv_module import Conv2dModule, Conv3dModule, DepthwiseSeparableConvModule from .norm import FrozenBatchNorm2d, build_norm_layer from .padding import build_padding_layer __all__ = [ - "build_activation_layer", "build_padding_layer", "build_norm_layer", "Conv2dModule", diff --git a/src/otx/algo/modules/activation.py b/src/otx/algo/modules/activation.py index cc3a1e95080..81249243dd1 100644 --- a/src/otx/algo/modules/activation.py +++ b/src/otx/algo/modules/activation.py @@ -2,13 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) OpenMMLab. All rights reserved. -"""This implementation replaces the functionality of mmcv.cnn.bricks.activation.build_activation_layer.""" -from __future__ import annotations +"""Custom activation implementation copied from mmcv.cnn.bricks.swish.py.""" -import copy +from __future__ import annotations import torch -from torch import nn +from torch import Tensor, nn class Swish(nn.Module): @@ -23,52 +22,13 @@ class Swish(nn.Module): Tensor: The output tensor. """ - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: Tensor) -> Tensor: """Forward function. Args: - x (torch.Tensor): The input tensor. + x (Tensor): The input tensor. Returns: - torch.Tensor: The output tensor. + Tensor: The output tensor. """ return x * torch.sigmoid(x) - - -ACTIVATION_DICT = { - "ReLU": nn.ReLU, - "LeakyReLU": nn.LeakyReLU, - "PReLU": nn.PReLU, - "RReLU": nn.RReLU, - "ReLU6": nn.ReLU6, - "ELU": nn.ELU, - "Sigmoid": nn.Sigmoid, - "Tanh": nn.Tanh, - "SiLU": nn.SiLU, - "GELU": nn.GELU, - "Swish": Swish, -} - - -def build_activation_layer(cfg: dict) -> nn.Module: - """Build activation layer. - - Args: - cfg (dict): The activation layer config, which should contain: - - - type (str): Layer type. - - layer args: Args needed to instantiate an activation layer. - - Returns: - nn.Module: Created activation layer. 
- """ - _cfg = copy.deepcopy(cfg) - activation_type = _cfg.pop("type", None) - if activation_type is None: - msg = "The cfg dict must contain the key 'type'" - raise KeyError(msg) - if activation_type not in ACTIVATION_DICT: - msg = f"Cannot find {activation_type} in {ACTIVATION_DICT.keys()}" - raise KeyError(msg) - - return ACTIVATION_DICT[activation_type](**_cfg) diff --git a/src/otx/algo/modules/conv_module.py b/src/otx/algo/modules/conv_module.py index 8fa9d6764d7..aa5e51879ba 100644 --- a/src/otx/algo/modules/conv_module.py +++ b/src/otx/algo/modules/conv_module.py @@ -2,14 +2,15 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) OpenMMLab. All rights reserved. -"""This implementation copied ConvModule of mmcv.cnn.bricks.ConvModule.""" +"""This implementation modified ConvModule of mmcv.cnn.bricks.ConvModule.""" # TODO(someone): Revisit mypy errors after deprecation of mmlab from __future__ import annotations import warnings -from typing import TYPE_CHECKING +from functools import partial +from typing import TYPE_CHECKING, Callable from torch import Tensor, nn from torch.nn.modules.batchnorm import _BatchNorm as BatchNorm @@ -17,7 +18,6 @@ from otx.algo.utils.weight_init import constant_init, kaiming_init -from .activation import build_activation_layer from .norm import build_norm_layer from .padding import build_padding_layer @@ -25,12 +25,37 @@ from torch.nn.modules.conv import _ConvNd as ConvNd +AVAILABLE_ACTIVATION_LIST: list[str] = [ + "ReLU", + "LeakyReLU", + "PReLU", + "RReLU", + "ReLU6", + "ELU", + "Sigmoid", + "Tanh", + "SiLU", + "GELU", + "Swish", +] + +ACTIVATION_LIST_NOT_SUPPORTING_INPLACE: list[str] = [ + "Tanh", + "PReLU", + "Sigmoid", + "HSigmoid", + "Swish", + "GELU", + "SiLU", +] + + class ConvModule(nn.Module): """A conv block that bundles conv/norm/activation layers. This block simplifies the usage of convolution layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). - It is based upon two build methods: `build_norm_layer()` and `build_activation_layer()`. + It is based upon a build method: `build_norm_layer()`. Besides, we add some additional features in this module. 1. Automatically set `bias` of the conv layer. @@ -57,8 +82,8 @@ class ConvModule(nn.Module): norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise False. Default: "auto". norm_cfg (dict): Config dict for normalization layer. Default: None. - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.ReLU`. inplace (bool): Whether to use inplace mode for activation. Default: True. with_spectral_norm (bool): Whether use spectral norm in conv module. 
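A small sketch of how the refactored module behaves (layer sizes are made up; `Conv2dModule` is the 2D variant exported from this package):

    from torch import nn

    from otx.algo.modules import Conv2dModule
    from otx.algo.modules.activation import Swish

    # Conv + BN + SiLU, with the activation passed as a class.
    conv = Conv2dModule(3, 16, 3, padding=1, norm_cfg={"type": "BN"}, activation_callable=nn.SiLU)

    # The local Swish module is handed over the same way.
    conv_swish = Conv2dModule(16, 16, 3, padding=1, activation_callable=Swish)

    # Activations outside AVAILABLE_ACTIVATION_LIST are rejected up front:
    # Conv2dModule(3, 16, 3, activation_callable=nn.Hardswish)  # -> ValueError

    # For activations that accept it (e.g. nn.ReLU), the module's `inplace`
    # flag is pushed onto the built activation instead of an act_cfg entry.
    relu_conv = Conv2dModule(16, 16, 3, padding=1, activation_callable=nn.ReLU, inplace=False)
    print(relu_conv.activation.inplace)  # False
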
@@ -84,7 +109,7 @@ def __init__( groups: int = 1, bias: bool | str = "auto", norm_cfg: dict | None = None, - act_cfg: dict | None = {"type": "ReLU"}, # noqa: B006 + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, inplace: bool = True, with_spectral_norm: bool = False, padding_mode: str = "zeros", @@ -93,13 +118,11 @@ def __init__( assert norm_cfg is None or isinstance(norm_cfg, dict) # noqa: S101 official_padding_mode = ["zeros", "circular"] self.norm_cfg = norm_cfg - self.act_cfg = act_cfg self.inplace = inplace self.with_spectral_norm = with_spectral_norm self.with_explicit_padding = padding_mode not in official_padding_mode self.with_norm = norm_cfg is not None - self.with_activation = act_cfg is not None # if the conv layer is before a norm layer, bias is unnecessary. if bias == "auto": bias = not self.with_norm @@ -148,24 +171,48 @@ def __init__( self.norm_name = None # type: ignore[assignment] # build activation layer - if self.with_activation: - act_cfg_ = act_cfg.copy() # type: ignore[union-attr] - # nn.Tanh has no 'inplace' argument - if act_cfg_["type"] not in [ - "Tanh", - "PReLU", - "Sigmoid", - "HSigmoid", - "Swish", - "GELU", - "SiLU", - ]: - act_cfg_.setdefault("inplace", inplace) - self.activate = build_activation_layer(act_cfg_) + self.activation: nn.Module | None = None + self._with_activation: bool | None = None + if activation_callable is not None: + if ( + isinstance(activation_callable, partial) + and activation_callable.func.__name__ not in AVAILABLE_ACTIVATION_LIST + ): + msg = f"Unsupported activation: {activation_callable.func.__name__}." + raise ValueError(msg) + + if ( + not isinstance(activation_callable, partial) + and activation_callable.__name__ not in AVAILABLE_ACTIVATION_LIST + ): + msg = f"Unsupported activation: {activation_callable.__name__}." + raise ValueError(msg) + + self.activation = activation_callable() + + # update inplace + if self.activation.__class__.__name__ not in ACTIVATION_LIST_NOT_SUPPORTING_INPLACE: + self.activation.inplace = inplace # Use msra init by default self.init_weights() + @property + def with_activation(self) -> bool: + """Whether the conv module has activation.""" + if self._with_activation is not None: + # src/otx/algo/segmentation/heads/fcn_head.py L144 + return self._with_activation + return self.activation is not None + + @with_activation.setter + def with_activation(self, value: bool) -> None: + """Setter for with_activation. + + For src/otx/algo/segmentation/heads/fcn_head.py L144. + """ + self._with_activation = value + @property def norm_layer(self) -> nn.Module | None: """Get the normalization layer. @@ -189,9 +236,9 @@ def init_weights(self) -> None: # Note: For PyTorch's conv layers, they will be overwritten by our # initialization implementation using default ``kaiming_init``. 
if not hasattr(self.conv, "init_weights"): - if self.with_activation and self.act_cfg["type"] == "LeakyReLU": # type: ignore[index] + if self.with_activation and isinstance(self.activation, nn.LeakyReLU): nonlinearity = "leaky_relu" - a = self.act_cfg.get("negative_slope", 0.01) # type: ignore[union-attr] + a = getattr(self.activation, "negative_slop", 0.01) else: nonlinearity = "relu" a = 0 @@ -216,7 +263,7 @@ def forward(self, x: Tensor, activate: bool = True, norm: bool = True) -> Tensor if norm and self.with_norm: x = self.norm_layer(x) # type: ignore[misc] if activate and self.with_activation: - x = self.activate(x) + x = self.activation(x) # type: ignore[misc] return x @@ -230,7 +277,7 @@ class DepthwiseSeparableConvModule(nn.Module): conv block contains depthwise-conv/norm/activation layers. The pointwise conv block contains pointwise-conv/norm/activation layers. It should be noted that there will be norm/activation layer in the depthwise conv block - if `norm_cfg` and `act_cfg` are specified. + if `norm_cfg` and `activation_callable` are specified. Args: in_channels (int): Number of channels in the input feature map. @@ -247,16 +294,19 @@ class DepthwiseSeparableConvModule(nn.Module): Same as that in ``nn._ConvNd``. Default: 1. norm_cfg (dict): Default norm config for both depthwise ConvModule and pointwise ConvModule. Default: None. - act_cfg (dict): Default activation config for both depthwise ConvModule - and pointwise ConvModule. Default: dict(type='ReLU'). + activation_callable (Callable[..., nn.Module]): Activation layer module + for both depthwise ConvModule and pointwise ConvModule. + Defaults to `nn.ReLU`. dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is None, it will be the same as `norm_cfg`. Default: None. - dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is - None, it will be the same as `act_cfg`. Default: None. + dw_activation_callable (Callable[..., nn.Module] | None): Activation layer module of depthwise ConvModule. + If it is None, it will be the same as `activation_callable`. + Defaults to None. pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is None, it will be the same as `norm_cfg`. Default: None. - pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is - None, it will be the same as `act_cfg`. Default: None. + pw_activation_callable (Callable[..., nn.Module] | None): Activation layer module of pointwise ConvModule. + If it is None, it will be the same as `activation_callable`. + Defaults to None. kwargs (optional): Other shared arguments for depthwise and pointwise ConvModule. See ConvModule for ref. """ @@ -270,16 +320,13 @@ def __init__( padding: int | tuple[int, int] = 0, dilation: int | tuple[int, int] = 1, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.ReLU, dw_norm_cfg: dict | None = None, - dw_act_cfg: dict | None = None, + dw_activation_callable: Callable[..., nn.Module] | None = None, pw_norm_cfg: dict | None = None, - pw_act_cfg: dict | None = None, + pw_activation_callable: Callable[..., nn.Module] | None = None, **kwargs, ): - if act_cfg is None: - act_cfg = {"type": "ReLU"} - super().__init__() if "groups" in kwargs: msg = "groups should not be specified in DepthwiseSeparableConvModule." @@ -288,9 +335,9 @@ def __init__( # if norm/activation config of depthwise/pointwise Conv2dModule is not # specified, use default config. 
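A sketch of the depthwise/pointwise fallback described above (channel counts are invented; the `depthwise_conv`/`pointwise_conv` attribute names follow the surrounding implementation): `dw_activation_callable`/`pw_activation_callable` left as None fall back to `activation_callable`, mirroring the old `dw_act_cfg`/`pw_act_cfg` -> `act_cfg` fallback.

    from torch import nn

    from otx.algo.modules import DepthwiseSeparableConvModule

    dsconv = DepthwiseSeparableConvModule(
        32,
        64,
        3,
        padding=1,
        norm_cfg={"type": "BN"},
        activation_callable=nn.ReLU,
        pw_activation_callable=nn.SiLU,  # only the pointwise conv uses SiLU
    )
    print(type(dsconv.depthwise_conv.activation).__name__)  # ReLU
    print(type(dsconv.pointwise_conv.activation).__name__)  # SiLU
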
dw_norm_cfg = dw_norm_cfg or norm_cfg - dw_act_cfg = dw_act_cfg or act_cfg + dw_activation_callable = dw_activation_callable or activation_callable pw_norm_cfg = pw_norm_cfg or norm_cfg - pw_act_cfg = pw_act_cfg or act_cfg + pw_activation_callable = pw_activation_callable or activation_callable # depthwise convolution self.depthwise_conv = Conv2dModule( @@ -302,7 +349,7 @@ def __init__( dilation=dilation, groups=in_channels, norm_cfg=dw_norm_cfg, - act_cfg=dw_act_cfg, + activation_callable=dw_activation_callable, **kwargs, ) @@ -311,7 +358,7 @@ def __init__( out_channels, 1, norm_cfg=pw_norm_cfg, - act_cfg=pw_act_cfg, + activation_callable=pw_activation_callable, **kwargs, ) diff --git a/src/otx/algo/modules/transformer.py b/src/otx/algo/modules/transformer.py index 46cdd96a943..28bbfe76728 100644 --- a/src/otx/algo/modules/transformer.py +++ b/src/otx/algo/modules/transformer.py @@ -7,13 +7,14 @@ from __future__ import annotations import math +from functools import partial +from typing import Callable import torch from torch import nn from otx.algo.modules.base_module import BaseModule, Sequential -from .activation import build_activation_layer from .drop import build_dropout from .norm import build_norm_layer @@ -252,8 +253,8 @@ class FFN(BaseModule): Defaults: 1024. num_fcs (int, optional): The number of fully-connected layers in FFNs. Default: 2. - act_cfg (dict, optional): The activation config for FFNs. - Default: dict(type='ReLU') + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `partial(nn.ReLU, inplace=True)`. ffn_drop (float, optional): Probability of an element to be zeroed in FFN. Default 0.0. add_identity (bool, optional): Whether to add the @@ -269,7 +270,7 @@ def __init__( embed_dims: int = 256, feedforward_channels: int = 1024, num_fcs: int = 2, - act_cfg: dict = {"type": "ReLU", "inplace": True}, # noqa: B006 + activation_callable: Callable[..., nn.Module] = partial(nn.ReLU, inplace=True), ffn_drop: float = 0.0, dropout_layer: dict | None = None, add_identity: bool = True, @@ -289,7 +290,7 @@ def __init__( layers.append( Sequential( nn.Linear(in_channels, feedforward_channels), - build_activation_layer(act_cfg), + activation_callable(), nn.Dropout(ffn_drop), ), ) diff --git a/src/otx/algo/segmentation/backbones/litehrnet.py b/src/otx/algo/segmentation/backbones/litehrnet.py index 48e359862bd..a47a4571bf1 100644 --- a/src/otx/algo/segmentation/backbones/litehrnet.py +++ b/src/otx/algo/segmentation/backbones/litehrnet.py @@ -10,6 +10,7 @@ from __future__ import annotations from pathlib import Path +from typing import Callable import torch import torch.utils.checkpoint as cp @@ -61,7 +62,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), Conv2dModule( self.key_channels, @@ -71,7 +72,7 @@ def __init__( padding=(self.kernel_size - 1) // 2, groups=self.key_channels, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), Conv2dModule( in_channels=self.key_channels, @@ -79,7 +80,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), ) self.value = nn.Sequential( @@ -89,7 +90,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), nn.Unfold(kernel_size=self.kernel_size, stride=1, padding=1), ) @@ -99,7 +100,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ) def forward(self, x: torch.Tensor) -> 
torch.Tensor: @@ -117,30 +118,34 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class CrossResolutionWeighting(nn.Module): - """Cross resolution weighting.""" + """Cross resolution weighting. + + Args: + channels (list[int]): Number of channels for each stage. + ratio (int): Reduction ratio of the bottleneck block. + norm_cfg (dict | None): Config dict for normalization layer. Default: None + activation_callable (Callable[..., nn.Module] | tuple[Callable[..., nn.Module], Callable[..., nn.Module]]): \ + Activation layer module or a tuple of activation layer modules. + Defaults to (`nn.ReLU`, `nn.Sigmoid`). + """ def __init__( self, channels: list[int], ratio: int = 16, norm_cfg: dict | None = None, - act_cfg: dict | tuple[dict, dict] = ({"type": "ReLU"}, {"type": "Sigmoid"}), + activation_callable: Callable[..., nn.Module] | tuple[Callable[..., nn.Module], Callable[..., nn.Module]] = ( + nn.ReLU, + nn.Sigmoid, + ), ) -> None: - """Cross resolution weighting. - - Args: - channels (list[int]): Number of channels for each stage. - ratio (int): Reduction ratio of the bottleneck block. - norm_cfg (dict | None): Config dict for normalization layer. Default: None - act_cfg (dict | tuple[dict, dict]): Config dict or a tuple of config dicts for activation layer(s). - Default: ({"type": "ReLU"}, {"type": "Sigmoid"}). - """ super().__init__() - if isinstance(act_cfg, dict): - act_cfg = (act_cfg, act_cfg) - if len(act_cfg) != 2: - msg = "act_cfg must be a dict or a tuple of dicts of length 2." + if callable(activation_callable): + activation_callable = (activation_callable, activation_callable) + + if len(activation_callable) != 2: + msg = "activation_callable must be a callable or a tuple of callables of length 2." raise ValueError(msg) self.channels = channels @@ -152,7 +157,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg=act_cfg[0], + activation_callable=activation_callable[0], ) self.conv2 = Conv2dModule( in_channels=int(total_channel / ratio), @@ -160,7 +165,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg=act_cfg[1], + activation_callable=activation_callable[1], ) def forward(self, x: torch.Tensor) -> list[torch.Tensor]: @@ -177,35 +182,38 @@ def forward(self, x: torch.Tensor) -> list[torch.Tensor]: class SpatialWeighting(nn.Module): - """Spatial weighting.""" + """Spatial weighting. + + Args: + channels (int): Number of input channels. + ratio (int): Reduction ratio for the bottleneck block. Default: 16. + activation_callable (Callable[..., nn.Module] | tuple[Callable[..., nn.Module], Callable[..., nn.Module]]): \ + Activation layer module or a tuple of activation layer modules. + If a single module is provided, it will be used for both activation layers. + Defaults to (`nn.ReLU`, `nn.Sigmoid`). + + Raises: + ValueError: activation_callable must be a callable or a tuple of callables of length 2. + TypeError: If activation_callable is not a callable or a tuple of callables. + """ def __init__( self, channels: int, ratio: int = 16, - norm_cfg: dict | None = None, - act_cfg: dict | tuple[dict, dict] = ({"type": "ReLU"}, {"type": "Sigmoid"}), - enable_norm: bool = False, + activation_callable: Callable[..., nn.Module] | tuple[Callable[..., nn.Module], Callable[..., nn.Module]] = ( + nn.ReLU, + nn.Sigmoid, + ), + **kwargs, ) -> None: - """Spatial weighting. - - Args: - channels (int): Number of input channels. - ratio (int): Reduction ratio for the bottleneck block. Default: 16. 
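The weighting blocks now accept either one callable (reused for both 1x1 convs) or a 2-tuple (squeeze activation, then gating activation). A hedged sketch with an invented channel count:

    from torch import nn

    from otx.algo.segmentation.backbones.litehrnet import SpatialWeighting

    # Explicit form of the default: ReLU after the squeeze conv, Sigmoid as the gate.
    sw = SpatialWeighting(channels=64, ratio=16, activation_callable=(nn.ReLU, nn.Sigmoid))

    # A single callable is duplicated for both convs.
    sw_relu_only = SpatialWeighting(channels=64, activation_callable=nn.ReLU)
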
- act_cfg (dict | tuple[dict]): Configuration dict or tuple of dicts for - activation layers. If a single dict is provided, it will be used for - both activation layers. Default: ({"type": "ReLU"}, {"type": "Sigmoid"}). - - Raises: - ValueError: act_cfg must be a dict or a tuple of dicts of length 2. - TypeError: If act_cfg is not a dict or a tuple of dicts. - """ super().__init__() - if isinstance(act_cfg, dict): - act_cfg = (act_cfg, act_cfg) - if len(act_cfg) != 2: - msg = "act_cfg must be a dict or a tuple of dicts of length 2." + if callable(activation_callable): + activation_callable = (activation_callable, activation_callable) + + if len(activation_callable) != 2: + msg = "activation_callable must be a callable or a tuple of callables of length 2." raise ValueError(msg) self.global_avgpool = nn.AdaptiveAvgPool2d(1) @@ -214,14 +222,14 @@ def __init__( out_channels=int(channels / ratio), kernel_size=1, stride=1, - act_cfg=act_cfg[0], + activation_callable=activation_callable[0], ) self.conv2 = Conv2dModule( in_channels=int(channels / ratio), out_channels=channels, kernel_size=1, stride=1, - act_cfg=act_cfg[1], + activation_callable=activation_callable[1], ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -264,7 +272,7 @@ def __init__( stride=1, bias=False, norm_cfg=norm_cfg if enable_norm else None, - act_cfg=None, + activation_callable=None, ) self.q_channel = Conv2dModule( in_channels=self.in_channels, @@ -273,7 +281,7 @@ def __init__( stride=1, bias=False, norm_cfg=norm_cfg if enable_norm else None, - act_cfg=None, + activation_callable=None, ) self.out_channel = Conv2dModule( in_channels=self.internal_channels, @@ -281,7 +289,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg={"type": "Sigmoid"}, + activation_callable=nn.Sigmoid, ) # spatial-only branch @@ -292,7 +300,7 @@ def __init__( stride=1, bias=False, norm_cfg=norm_cfg if enable_norm else None, - act_cfg=None, + activation_callable=None, ) self.q_spatial = Conv2dModule( in_channels=self.in_channels, @@ -301,7 +309,7 @@ def __init__( stride=1, bias=False, norm_cfg=norm_cfg if enable_norm else None, - act_cfg=None, + activation_callable=None, ) self.global_avgpool = nn.AdaptiveAvgPool2d(1) @@ -420,7 +428,7 @@ def __init__( padding=dw_ksize // 2, groups=channel, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ) for channel in branch_channels ], @@ -550,7 +558,7 @@ def __init__( stride=strides[0], padding=1, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.conv2 = None @@ -562,7 +570,7 @@ def __init__( stride=2, padding=1, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) mid_channels = int(round(stem_channels * expand_ratio)) @@ -581,7 +589,7 @@ def __init__( padding=1, groups=branch_channels, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), Conv2dModule( branch_channels, @@ -590,7 +598,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), ) @@ -601,7 +609,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.depthwise_conv = Conv2dModule( mid_channels, @@ -611,7 +619,7 @@ def __init__( padding=1, groups=mid_channels, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ) self.linear_conv = Conv2dModule( mid_channels, @@ -620,7 +628,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + 
activation_callable=nn.ReLU, ) def _inner_forward(self, x: torch.Tensor) -> torch.Tensor: @@ -723,7 +731,7 @@ def __init__( stride=strides[0], padding=1, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.conv2 = None @@ -735,7 +743,7 @@ def __init__( stride=2, padding=1, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) mid_channels = int(round(stem_channels * expand_ratio)) @@ -754,7 +762,7 @@ def __init__( padding=1, groups=internal_branch_channels, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), Conv2dModule( internal_branch_channels, @@ -763,7 +771,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), ), ) @@ -777,7 +785,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), Conv2dModule( mid_channels, @@ -787,7 +795,7 @@ def __init__( padding=1, groups=mid_channels, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), Conv2dModule( mid_channels, @@ -796,7 +804,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), ), ) @@ -836,7 +844,19 @@ def forward(self, x: torch.Tensor) -> list[torch.Tensor]: class ShuffleUnit(nn.Module): - """InvertedResidual block for ShuffleNetV2 backbone.""" + """InvertedResidual block for ShuffleNetV2 backbone. + + Args: + in_channels (int): The input channels of the block. + out_channels (int): The output channels of the block. + stride (int): Stride of the 3x3 convolution layer. Default: 1 + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.ReLU`. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ def __init__( self, @@ -844,29 +864,13 @@ def __init__( out_channels: int, stride: int = 1, norm_cfg: dict | None = None, - act_cfg: dict | None = None, + activation_callable: Callable[..., nn.Module] = nn.ReLU, with_cp: bool = False, ) -> None: - """InvertedResidual block for ShuffleNetV2 backbone. - - Args: - in_channels (int): The input channels of the block. - out_channels (int): The output channels of the block. - stride (int): Stride of the 3x3 convolution layer. Default: 1 - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
- - """ super().__init__() if norm_cfg is None: norm_cfg = {"type": "BN"} - if act_cfg is None: - act_cfg = {"type": "ReLU"} self.stride = stride self.with_cp = with_cp @@ -890,7 +894,7 @@ def __init__( padding=1, groups=in_channels, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), Conv2dModule( in_channels, @@ -899,7 +903,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) @@ -911,7 +915,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), Conv2dModule( branch_features, @@ -921,7 +925,7 @@ def __init__( padding=1, groups=branch_features, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), Conv2dModule( branch_features, @@ -930,7 +934,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg=act_cfg, + activation_callable=activation_callable, ), ) @@ -1047,7 +1051,7 @@ def _make_one_branch(self, branch_index: int, num_blocks: int, stride: int = 1) self.in_channels[branch_index], stride=stride, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, with_cp=self.with_cp, ), ] + [ @@ -1056,7 +1060,7 @@ def _make_one_branch(self, branch_index: int, num_blocks: int, stride: int = 1) self.in_channels[branch_index], stride=1, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, with_cp=self.with_cp, ) for _ in range(1, num_blocks) @@ -1283,7 +1287,7 @@ def __init__( stride=1, padding=0, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), ) in_modules_channels = out_modules_channels @@ -1320,7 +1324,7 @@ def __init__( padding=1, groups=self.stem.out_channels, norm_cfg=norm_cfg, - act_cfg=None, + activation_callable=None, ), Conv2dModule( self.stem.out_channels, @@ -1329,7 +1333,7 @@ def __init__( stride=1, padding=0, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), ) diff --git a/src/otx/algo/segmentation/backbones/mscan.py b/src/otx/algo/segmentation/backbones/mscan.py index fd10ea7b432..415655bf8ca 100644 --- a/src/otx/algo/segmentation/backbones/mscan.py +++ b/src/otx/algo/segmentation/backbones/mscan.py @@ -6,12 +6,12 @@ from __future__ import annotations from pathlib import Path -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Callable import torch from torch import nn -from otx.algo.modules import build_activation_layer, build_norm_layer +from otx.algo.modules import build_norm_layer from otx.algo.modules.base_module import BaseModule from otx.algo.utils.mmengine_utils import load_checkpoint_to_model, load_from_http @@ -62,8 +62,8 @@ class Mlp(BaseModule): Defaults: None. out_features (int): The dimension of output features. Defaults: None. - act_cfg (dict): Config dict for activation layer in block. - Default: dict(type='GELU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. drop (float): The number of dropout rate in MLP block. Defaults: 0.0. """ @@ -73,30 +73,16 @@ def __init__( in_features: int, hidden_features: int | None = None, out_features: int | None = None, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, drop: float = 0.0, ) -> None: - """Initializes the MLP module. - - Args: - in_features (int): The dimension of the input features. - hidden_features (Optional[int]): The dimension of the hidden features. - Defaults to None. 
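A brief usage sketch of the simplified MSCAN blocks (feature sizes below are illustrative): with `build_activation_layer` gone, the MLP simply instantiates whatever callable it is given, defaulting to `nn.GELU`.

    from torch import nn

    from otx.algo.segmentation.backbones.mscan import Mlp

    mlp = Mlp(in_features=64, hidden_features=256)                               # GELU default
    mlp_relu = Mlp(in_features=64, hidden_features=256, activation_callable=nn.ReLU)
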
- out_features (Optional[int]): The dimension of the output features. - Defaults to None. - act_cfg (Dict[str, str] | None): Config dict for the activation layer in the block. - Defaults to {"type": "GELU"} if None. - drop (float): The dropout rate in the MLP block. - Defaults to 0.0. - """ + """Initializes the MLP module.""" super().__init__() - if act_cfg is None: - act_cfg = {"type": "GELU"} out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Conv2d(in_features, hidden_features, 1) self.dwconv = nn.Conv2d(hidden_features, hidden_features, 3, 1, 1, bias=True, groups=hidden_features) - self.act = build_activation_layer(act_cfg) + self.act = activation_callable() self.fc2 = nn.Conv2d(hidden_features, out_features, 1) self.drop = nn.Dropout(drop) @@ -118,8 +104,8 @@ class StemConv(BaseModule): Args: in_channels (int): The dimension of input channels. out_channels (int): The dimension of output channels. - act_cfg (dict): Config dict for activation layer in block. - Default: dict(type='GELU'). + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. norm_cfg (dict): Config dict for normalization layer. Defaults: dict(type='SyncBN', requires_grad=True). """ @@ -128,29 +114,17 @@ def __init__( self, in_channels: int, out_channels: int, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, norm_cfg: dict[str, str | bool] | None = None, ) -> None: - """Stem Block at the beginning of Semantic Branch. - - Args: - in_channels (int): The dimension of input channels. - out_channels (int): The dimension of output channels. - act_cfg (Dict[str, str] | None): Config dict for activation layer in block. - Default: dict(type='GELU') if None. - norm_cfg (Dict[str, Union[str, bool]] | None): Config dict for normalization layer. - Defaults: dict(type='SyncBN', requires_grad=True) if None. - """ super().__init__() - if act_cfg is None: - act_cfg = {"type": "GELU"} if norm_cfg is None: norm_cfg = {"type": "SyncBN", "requires_grad": True} self.proj = nn.Sequential( nn.Conv2d(in_channels, out_channels // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), build_norm_layer(norm_cfg, out_channels // 2)[1], - build_activation_layer(act_cfg), + activation_callable(), nn.Conv2d(out_channels // 2, out_channels, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), build_norm_layer(norm_cfg, out_channels)[1], ) @@ -218,28 +192,27 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class MSCASpatialAttention(BaseModule): - """Spatial Attention Module in Multi-Scale Convolutional Attention Module (MSCA).""" + """Spatial Attention Module in Multi-Scale Convolutional Attention Module (MSCA). + + Args: + in_channels (int): The number of input channels. + attention_kernel_sizes (List[Union[int, List[int]]]): The size of attention kernels. + attention_kernel_paddings (List[Union[int, List[int]]]): The paddings of attention kernels. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. + """ def __init__( self, in_channels: int, attention_kernel_sizes: list[int | list[int]] = [5, [1, 7], [1, 11], [1, 21]], # noqa: B006 attention_kernel_paddings: list[int | list[int]] = [2, [0, 3], [0, 5], [0, 10]], # noqa: B006 - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, ) -> None: - """Init the MSCASpatialAttention module. - - Args: - in_channels (int): The number of input channels. 
- attention_kernel_sizes (List[Union[int, List[int]]]): The size of attention kernels. - attention_kernel_paddings (List[Union[int, List[int]]]): The paddings of attention kernels. - act_cfg (Dict[str, str] | None): The config of activation layer. - """ + """Init the MSCASpatialAttention module.""" super().__init__() - if act_cfg is None: - act_cfg = {"type": "GELU"} self.proj_1 = nn.Conv2d(in_channels, in_channels, 1) # type: nn.Conv2d - self.activation = build_activation_layer(act_cfg) # type: nn.Module + self.activation = activation_callable() # type: nn.Module self.spatial_gating_unit = MSCAAttention(in_channels, attention_kernel_sizes, attention_kernel_paddings) # type: MSCAAttention self.proj_2 = nn.Conv2d(in_channels, in_channels, 1) # type: nn.Conv2d @@ -260,6 +233,17 @@ class MSCABlock(BaseModule): attention. In each branch, it uses two depth-wise strip convolutions to approximate standard depth-wise convolutions with large kernels. The kernel size for each branch is set to 7, 11, and 21, respectively. + + Args: + channels (int): The number of input channels. + attention_kernel_sizes (List[Union[int, List[int]]]): The size of attention kernels. + attention_kernel_paddings (List[Union[int, List[int]]]): The paddings of attention kernels. + mlp_ratio (float): The ratio of the number of hidden units in the MLP to the number of input channels. + drop (float): The dropout rate. + drop_path (float): The dropout rate for the path. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. + norm_cfg (Dict[str, Union[str, bool]] | None): The config of normalization layer. """ def __init__( @@ -270,32 +254,29 @@ def __init__( mlp_ratio: float = 4.0, drop: float = 0.0, drop_path: float = 0.0, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, norm_cfg: dict[str, str | bool] | None = None, ) -> None: - """Initialize a MSCABlock. - - Args: - channels (int): The number of input channels. - attention_kernel_sizes (List[Union[int, List[int]]]): The size of attention kernels. - attention_kernel_paddings (List[Union[int, List[int]]]): The paddings of attention kernels. - mlp_ratio (float): The ratio of the number of hidden units in the MLP to the number of input channels. - drop (float): The dropout rate. - drop_path (float): The dropout rate for the path. - act_cfg (Dict[str, str] | None): The config of activation layer. - norm_cfg (Dict[str, Union[str, bool]] | None): The config of normalization layer. 
- """ + """Initialize a MSCABlock.""" super().__init__() - if act_cfg is None: - act_cfg = {"type": "GELU"} if norm_cfg is None: norm_cfg = {"type": "SyncBN", "requires_grad": True} self.norm1 = build_norm_layer(norm_cfg, channels)[1] # type: nn.Module - self.attn = MSCASpatialAttention(channels, attention_kernel_sizes, attention_kernel_paddings, act_cfg) # type: MSCAAttention + self.attn = MSCASpatialAttention( + channels, + attention_kernel_sizes, + attention_kernel_paddings, + activation_callable, + ) # type: MSCAAttention self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() # type: nn.Module self.norm2 = build_norm_layer(norm_cfg, channels)[1] # type: nn.Module mlp_hidden_channels = int(channels * mlp_ratio) # type: int - self.mlp = Mlp(in_features=channels, hidden_features=mlp_hidden_channels, act_cfg=act_cfg, drop=drop) # type: Mlp + self.mlp = Mlp( + in_features=channels, + hidden_features=mlp_hidden_channels, + activation_callable=activation_callable, + drop=drop, + ) # type: Mlp layer_scale_init_value = 1e-2 # type: float self.layer_scale_1 = nn.Parameter(layer_scale_init_value * torch.ones(channels), requires_grad=True) # type: nn.Parameter self.layer_scale_2 = nn.Parameter(layer_scale_init_value * torch.ones(channels), requires_grad=True) # type: nn.Parameter @@ -355,6 +336,25 @@ class MSCAN(BaseModule): Convolutional Attention Design for Semantic Segmentation `_. Inspiration from https://github.com/visual-attention-network/segnext. + + Args: + in_channels (int): The number of input channels. Defaults to 3. + embed_dims (List[int]): Embedding dimension. Defaults to [64, 128, 256, 512]. + mlp_ratios (List[int]): Ratio of mlp hidden dim to embedding dim. Defaults to [4, 4, 4, 4]. + drop_rate (float): Dropout rate. Defaults to 0.0. + drop_path_rate (float): Stochastic depth rate. Defaults to 0.0. + depths (List[int]): Depths of each Swin Transformer stage. Defaults to [3, 4, 6, 3]. + num_stages (int): MSCAN stages. Defaults to 4. + attention_kernel_sizes (List[Union[int, List[int]]]): Size of attention kernel in + Attention Module (Figure 2(b) of original paper). Defaults to [5, [1, 7], [1, 11], [1, 21]]. + attention_kernel_paddings (List[Union[int, List[int]]]): Size of attention paddings + in Attention Module (Figure 2(b) of original paper). Defaults to [2, [0, 3], [0, 5], [0, 10]]. + activation_callable (Callable[..., nn.Module]): Activation layer module. + Defaults to `nn.GELU`. + norm_cfg (Dict[str, Union[str, bool]] | None): Config dict for normalization layer. + Defaults to dict(type='SyncBN', requires_grad=True) if None. + init_cfg (Optional[Union[Dict[str, str], List[Dict[str, str]]]]): Initialization config dict. + Defaults to None. """ def __init__( @@ -368,35 +368,13 @@ def __init__( num_stages: int = 4, attention_kernel_sizes: list[int | list[int]] = [5, [1, 7], [1, 11], [1, 21]], # noqa: B006 attention_kernel_paddings: list[int | list[int]] = [2, [0, 3], [0, 5], [0, 10]], # noqa: B006 - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] = nn.GELU, norm_cfg: dict[str, str | bool] | None = None, init_cfg: dict[str, str] | list[dict[str, str]] | None = None, pretrained_weights: str | None = None, ) -> None: - """Initialize a MSCAN backbone. - - Args: - in_channels (int): The number of input channels. Defaults to 3. - embed_dims (List[int]): Embedding dimension. Defaults to [64, 128, 256, 512]. - mlp_ratios (List[int]): Ratio of mlp hidden dim to embedding dim. Defaults to [4, 4, 4, 4]. 
- drop_rate (float): Dropout rate. Defaults to 0.0. - drop_path_rate (float): Stochastic depth rate. Defaults to 0.0. - depths (List[int]): Depths of each Swin Transformer stage. Defaults to [3, 4, 6, 3]. - num_stages (int): MSCAN stages. Defaults to 4. - attention_kernel_sizes (List[Union[int, List[int]]]): Size of attention kernel in - Attention Module (Figure 2(b) of original paper). Defaults to [5, [1, 7], [1, 11], [1, 21]]. - attention_kernel_paddings (List[Union[int, List[int]]]): Size of attention paddings - in Attention Module (Figure 2(b) of original paper). Defaults to [2, [0, 3], [0, 5], [0, 10]]. - act_cfg (Dict[str, str] | None): Config dict for activation layer in block. - Defaults to dict(type='GELU') if None. - norm_cfg (Dict[str, Union[str, bool]] | None): Config dict for normalization layer. - Defaults to dict(type='SyncBN', requires_grad=True) if None. - init_cfg (Optional[Union[Dict[str, str], List[Dict[str, str]]]]): Initialization config dict. - Defaults to None. - """ + """Initialize a MSCAN backbone.""" super().__init__(init_cfg=init_cfg) - if act_cfg is None: - act_cfg = {"type": "GELU"} if norm_cfg is None: norm_cfg = {"type": "SyncBN", "requires_grad": True} @@ -426,7 +404,7 @@ def __init__( mlp_ratio=mlp_ratios[i], drop=drop_rate, drop_path=dpr[cur + j], - act_cfg=act_cfg, + activation_callable=activation_callable, norm_cfg=norm_cfg, ) for j in range(depths[i]) diff --git a/src/otx/algo/segmentation/heads/base_segm_head.py b/src/otx/algo/segmentation/heads/base_segm_head.py index 8547b0233dc..419fea64071 100644 --- a/src/otx/algo/segmentation/heads/base_segm_head.py +++ b/src/otx/algo/segmentation/heads/base_segm_head.py @@ -7,6 +7,7 @@ from abc import ABCMeta, abstractmethod from pathlib import Path +from typing import Callable import torch from torch import nn @@ -16,7 +17,23 @@ class BaseSegmHead(nn.Module, metaclass=ABCMeta): - """Base class for segmentation heads.""" + """Base class for segmentation heads. + + Args: + in_channels (int | list[int]): Number of input channels. + channels (int): Number of channels in the feature map. + num_classes (int): Number of classes for segmentation. + dropout_ratio (float, optional): The dropout ratio. Defaults to 0.1. + norm_cfg (Optional[ConfigType], optional): Config for normalization layer. + Defaults to None. + activation_callable (Callable[..., nn.Module] | None): Activation layer module. + Defaults to `nn.ReLU`. + in_index (int, list[int], optional): Input index. Defaults to -1. + input_transform (Optional[str], optional): Input transform type. + Defaults to None. + ignore_index (int, optional): The index to be ignored. Defaults to 255. + align_corners (bool, optional): Whether to align corners. Defaults to False. + """ def __init__( self, @@ -25,39 +42,21 @@ def __init__( num_classes: int, dropout_ratio: float = 0.1, norm_cfg: dict[str, str] | None = None, - act_cfg: dict[str, str] | None = None, + activation_callable: Callable[..., nn.Module] | None = nn.ReLU, in_index: int | list[int] = -1, input_transform: str | None = None, ignore_index: int = 255, align_corners: bool = False, pretrained_weights: str | None = None, ) -> None: - """Initialize the BaseSegmHead. - - Args: - in_channels (int | list[int]): Number of input channels. - channels (int): Number of channels in the feature map. - num_classes (int): Number of classes for segmentation. - dropout_ratio (float, optional): The dropout ratio. Defaults to 0.1. - norm_cfg (Optional[ConfigType], optional): Config for normalization layer. - Defaults to None. 
- act_cfg (Dict[str, Union[str, Dict]], optional): Activation config. - Defaults to dict(type='ReLU'). - in_index (int, list[int], optional): Input index. Defaults to -1. - input_transform (Optional[str], optional): Input transform type. - Defaults to None. - ignore_index (int, optional): The index to be ignored. Defaults to 255. - align_corners (bool, optional): Whether to align corners. Defaults to False. - """ + """Initialize the BaseSegmHead.""" super().__init__() - if act_cfg is None: - act_cfg = {"type": "ReLU"} self.channels = channels self.num_classes = num_classes self.input_transform = input_transform self.dropout_ratio = dropout_ratio self.norm_cfg = norm_cfg - self.act_cfg = act_cfg + self.activation_callable = activation_callable if self.input_transform is not None and not isinstance(in_index, list): msg = f'"in_index" expects a list, but got {type(in_index)}' raise TypeError(msg) diff --git a/src/otx/algo/segmentation/heads/fcn_head.py b/src/otx/algo/segmentation/heads/fcn_head.py index c6d8316c59f..da79e2db239 100644 --- a/src/otx/algo/segmentation/heads/fcn_head.py +++ b/src/otx/algo/segmentation/heads/fcn_head.py @@ -109,7 +109,7 @@ def __init__( padding=conv_padding, dilation=dilation, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ), ] convs.extend( @@ -121,7 +121,7 @@ def __init__( padding=conv_padding, dilation=dilation, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ) for _ in range(num_convs - 1) ], @@ -137,12 +137,12 @@ def __init__( kernel_size=kernel_size, padding=kernel_size // 2, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ) - if self.act_cfg: + if self.activation_callable: self.convs[-1].with_activation = False - delattr(self.convs[-1], "activate") # why we delete last activation? + delattr(self.convs[-1], "activation") # why we delete last activation? def _forward_feature(self, inputs: Tensor) -> Tensor: """Forward function for feature maps. 
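
The segmentation-head hunks above all apply the same mechanical substitution, so the call pattern is summarized below as a minimal sketch. It relies only on the `Conv2dModule` signature exercised by `tests/unit/algo/modules/test_conv_module.py` later in this patch; the channel counts and kernel sizes are arbitrary placeholders, not values taken from the modules being refactored.

from functools import partial

from torch import nn

from otx.algo.modules.conv_module import Conv2dModule

# Previously the activation was configured with a dict:
#     Conv2dModule(16, 16, 3, padding=1, norm_cfg={"type": "BN"}, act_cfg={"type": "ReLU"})
# Now the activation is passed as a callable that builds the layer:
conv = Conv2dModule(16, 16, 3, padding=1, norm_cfg={"type": "BN"}, activation_callable=nn.ReLU)

# Keyword arguments such as `inplace` are bound with functools.partial,
# the same pattern the RTMDet head tests later in this patch use for nn.SiLU:
conv_leaky = Conv2dModule(16, 16, 3, padding=1, activation_callable=partial(nn.LeakyReLU, inplace=True))

# `None` disables the activation entirely, mirroring the old `act_cfg=None`:
conv_plain = Conv2dModule(16, 16, 3, padding=1, activation_callable=None)
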
diff --git a/src/otx/algo/segmentation/heads/ham_head.py b/src/otx/algo/segmentation/heads/ham_head.py index 52f789808b0..cd079752a15 100644 --- a/src/otx/algo/segmentation/heads/ham_head.py +++ b/src/otx/algo/segmentation/heads/ham_head.py @@ -45,11 +45,11 @@ def __init__( """ super().__init__() - self.ham_in = Conv2dModule(ham_channels, ham_channels, 1, norm_cfg=None, act_cfg=None) + self.ham_in = Conv2dModule(ham_channels, ham_channels, 1, norm_cfg=None, activation_callable=None) self.ham = NMF2D(ham_channels=ham_channels, **ham_kwargs) - self.ham_out = Conv2dModule(ham_channels, ham_channels, 1, norm_cfg=norm_cfg, act_cfg=None) + self.ham_out = Conv2dModule(ham_channels, ham_channels, 1, norm_cfg=norm_cfg, activation_callable=None) def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward.""" @@ -102,7 +102,7 @@ def __init__( self.ham_channels, 1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ) self.hamburger = Hamburger(self.ham_channels, ham_kwargs=self.ham_kwargs, **kwargs) @@ -112,7 +112,7 @@ def __init__( self.channels, 1, norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, + activation_callable=self.activation_callable, ) def forward(self, inputs: list[torch.Tensor]) -> torch.Tensor: diff --git a/src/otx/algo/segmentation/modules/aggregators.py b/src/otx/algo/segmentation/modules/aggregators.py index bff23694b50..0b5e78debc9 100644 --- a/src/otx/algo/segmentation/modules/aggregators.py +++ b/src/otx/algo/segmentation/modules/aggregators.py @@ -68,7 +68,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), ) @@ -85,9 +85,9 @@ def __init__( stride=1, padding=1, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, - dw_act_cfg=None, - pw_act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, + dw_activation_callable=None, + pw_activation_callable=nn.ReLU, ), ) @@ -99,7 +99,7 @@ def __init__( kernel_size=1, stride=1, norm_cfg=norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ), ) else: diff --git a/src/otx/algo/segmentation/modules/blocks.py b/src/otx/algo/segmentation/modules/blocks.py index 86d049b1da4..240924ab476 100644 --- a/src/otx/algo/segmentation/modules/blocks.py +++ b/src/otx/algo/segmentation/modules/blocks.py @@ -73,7 +73,7 @@ def __init__( stride=1, padding=0, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.key_psp = PSPModule(psp_size, method="max") @@ -84,7 +84,7 @@ def __init__( stride=1, padding=0, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.value_psp = PSPModule(psp_size, method="max") @@ -95,7 +95,7 @@ def __init__( stride=1, padding=0, norm_cfg=self.norm_cfg, - act_cfg=None, + activation_callable=None, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -171,7 +171,7 @@ def __init__(self, num_channels: int, norm_cfg: dict | None = None): padding=1, groups=self.num_channels, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.dwconv2 = Conv2dModule( in_channels=self.num_channels, @@ -181,7 +181,7 @@ def __init__(self, num_channels: int, norm_cfg: dict | None = None): padding=1, groups=self.num_channels, norm_cfg=self.norm_cfg, - act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.dwconv3 = Conv2dModule( in_channels=self.num_channels, @@ -191,7 +191,7 @@ def __init__(self, num_channels: int, norm_cfg: dict | None = None): padding=1, groups=self.num_channels, norm_cfg=self.norm_cfg, - 
act_cfg={"type": "ReLU"}, + activation_callable=nn.ReLU, ) self.sigmoid_spatial = nn.Sigmoid() diff --git a/src/otx/algo/segmentation/segnext.py b/src/otx/algo/segmentation/segnext.py index c3d2ca86fb3..0c2eaff739b 100644 --- a/src/otx/algo/segmentation/segnext.py +++ b/src/otx/algo/segmentation/segnext.py @@ -2,9 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 # """SegNext model implementations.""" + from __future__ import annotations -from typing import TYPE_CHECKING, Any, ClassVar +from typing import Any, ClassVar + +from torch import nn from otx.algo.segmentation.backbones import MSCAN from otx.algo.segmentation.heads import LightHamHead @@ -13,15 +16,12 @@ from .base_model import BaseSegmModel -if TYPE_CHECKING: - from torch import nn - class SegNextB(BaseSegmModel): """SegNextB Model.""" default_backbone_configuration: ClassVar[dict[str, Any]] = { - "act_cfg": {"type": "GELU"}, + "activation_callable": nn.GELU, "attention_kernel_paddings": [2, [0, 3], [0, 5], [0, 10]], "attention_kernel_sizes": [5, [1, 7], [1, 11], [1, 21]], "depths": [3, 3, 12, 3], @@ -48,7 +48,7 @@ class SegNextS(BaseSegmModel): """SegNextS Model.""" default_backbone_configuration: ClassVar[dict[str, Any]] = { - "act_cfg": {"type": "GELU"}, + "activation_callable": nn.GELU, "attention_kernel_paddings": [2, [0, 3], [0, 5], [0, 10]], "attention_kernel_sizes": [5, [1, 7], [1, 11], [1, 21]], "depths": [2, 2, 4, 2], @@ -75,7 +75,7 @@ class SegNextT(BaseSegmModel): """SegNextT Model.""" default_backbone_configuration: ClassVar[dict[str, Any]] = { - "act_cfg": {"type": "GELU"}, + "activation_callable": nn.GELU, "attention_kernel_paddings": [2, [0, 3], [0, 5], [0, 10]], "attention_kernel_sizes": [5, [1, 7], [1, 11], [1, 21]], "depths": [3, 3, 5, 2], diff --git a/tests/unit/algo/common/backbones/test_pytorchcv_backbones.py b/tests/unit/algo/common/backbones/test_pytorchcv_backbones.py index 8e61ae65a25..81176677d6e 100644 --- a/tests/unit/algo/common/backbones/test_pytorchcv_backbones.py +++ b/tests/unit/algo/common/backbones/test_pytorchcv_backbones.py @@ -30,14 +30,14 @@ def __init__(self): def test_replace_activation() -> None: - activation_cfg = {"type": "GELU"} + activation_callable = nn.GELU model = MockModule() - model = replace_activation(model, activation_cfg) + model = replace_activation(model, activation_callable) assert isinstance(model._modules["activ1"], nn.GELU) assert isinstance(model._modules["activ2"], nn.GELU) - activation_cfg = {"type": "torch_swish"} - model = replace_activation(model, activation_cfg) + activation_callable = nn.SiLU + model = replace_activation(model, activation_callable) assert isinstance(model._modules["activ1"], nn.SiLU) assert isinstance(model._modules["activ2"], nn.SiLU) diff --git a/tests/unit/algo/detection/backbones/test_csp_darknet.py b/tests/unit/algo/detection/backbones/test_csp_darknet.py index 3c24d83cd57..42650c2bc49 100644 --- a/tests/unit/algo/detection/backbones/test_csp_darknet.py +++ b/tests/unit/algo/detection/backbones/test_csp_darknet.py @@ -9,6 +9,7 @@ import pytest import torch from otx.algo.detection.backbones.csp_darknet import CSPDarknet, Focus +from torch import nn from torch.nn.modules import GroupNorm from torch.nn.modules.batchnorm import _BatchNorm @@ -108,7 +109,7 @@ def test_forward(self) -> None: assert feat[5].shape == torch.Size((1, 256, 2, 2)) # Test CSPDarknet forward with dict(type='ReLU') - model = CSPDarknet(widen_factor=0.125, act_cfg={"type": "ReLU"}, out_indices=range(5)) + model = CSPDarknet(widen_factor=0.125, activation_callable=nn.ReLU, 
out_indices=range(5)) model.train() imgs = torch.randn(1, 3, 64, 64) diff --git a/tests/unit/algo/detection/heads/test_rtmdet_head.py b/tests/unit/algo/detection/heads/test_rtmdet_head.py index 61e48622371..4d1d594c5be 100644 --- a/tests/unit/algo/detection/heads/test_rtmdet_head.py +++ b/tests/unit/algo/detection/heads/test_rtmdet_head.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 """Test of RTMDetHead.""" +from functools import partial + import pytest import torch from omegaconf import DictConfig @@ -11,6 +13,7 @@ from otx.algo.common.utils.prior_generators import MlvlPointGenerator from otx.algo.common.utils.samplers import PseudoSampler from otx.algo.detection.heads.rtmdet_head import RTMDetHead, RTMDetSepBNHead +from torch import nn @pytest.fixture() @@ -54,7 +57,7 @@ def rtmdet_head(self) -> RTMDetHead: with_objectness=False, pred_kernel_size=1, norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), train_cfg=train_cfg, test_cfg=test_cfg, ) @@ -166,7 +169,7 @@ def rtmdet_sep_bn_head(self) -> RTMDetSepBNHead: share_conv=True, pred_kernel_size=1, norm_cfg={"type": "BN"}, - act_cfg={"type": "SiLU", "inplace": True}, + activation_callable=partial(nn.SiLU, inplace=True), train_cfg=train_cfg, test_cfg=test_cfg, ) diff --git a/tests/unit/algo/detection/layers/test_csp_layer.py b/tests/unit/algo/detection/layers/test_csp_layer.py index 5e3fe06bf0f..ef8a774b6a1 100644 --- a/tests/unit/algo/detection/layers/test_csp_layer.py +++ b/tests/unit/algo/detection/layers/test_csp_layer.py @@ -19,7 +19,7 @@ def test_init(self) -> None: assert isinstance(csp_layer.blocks[0].conv2, Conv2dModule) assert isinstance(csp_layer.blocks[0].conv1.conv, Conv2d) assert isinstance(csp_layer.blocks[0].conv1.bn, BatchNorm2d) - assert isinstance(csp_layer.blocks[0].conv1.activate, Swish) + assert isinstance(csp_layer.blocks[0].conv1.activation, Swish) assert not hasattr(csp_layer, "attention") # use DepthwiseSeparableConvModule diff --git a/tests/unit/algo/modules/test_activation.py b/tests/unit/algo/modules/test_activation.py index 45e0804bc02..848cd09ec02 100644 --- a/tests/unit/algo/modules/test_activation.py +++ b/tests/unit/algo/modules/test_activation.py @@ -4,7 +4,7 @@ # https://github.com/open-mmlab/mmcv/blob/main/tests/test_cnn/test_swish.py import torch -from otx.algo.modules.activation import Swish, build_activation_layer +from otx.algo.modules.activation import Swish from torch.nn import functional @@ -17,21 +17,3 @@ def test_swish(): assert output.shape == expected_output.shape # test output value assert torch.equal(output, expected_output) - - -def test_build_activation_layer(): - cfg = {"type": "PReLU"} - activation_layer = build_activation_layer(cfg=cfg) - assert isinstance(activation_layer, torch.nn.PReLU) - - cfg = {"type": "ReLU"} - activation_layer = build_activation_layer(cfg=cfg) - assert isinstance(activation_layer, torch.nn.ReLU) - - cfg = {"type": "LeakyReLU"} - activation_layer = build_activation_layer(cfg=cfg) - assert isinstance(activation_layer, torch.nn.LeakyReLU) - - cfg = {"type": "Swish"} - activation_layer = build_activation_layer(cfg=cfg) - assert isinstance(activation_layer, Swish) diff --git a/tests/unit/algo/modules/test_conv_module.py b/tests/unit/algo/modules/test_conv_module.py index be0f8e34463..8fadd8ace41 100644 --- a/tests/unit/algo/modules/test_conv_module.py +++ b/tests/unit/algo/modules/test_conv_module.py @@ -3,6 +3,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
# https://github.com/open-mmlab/mmcv/blob/main/tests/test_cnn/test_conv_module.py +from functools import partial + import pytest import torch from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule @@ -15,15 +17,20 @@ def test_conv_module(): # norm_cfg must be a dict or None Conv2dModule(3, 8, 2, norm_cfg=norm_cfg) - act_cfg = {"type": "softmax"} - with pytest.raises(KeyError): + activation_callable = nn.Softmax + with pytest.raises(ValueError, match="Unsupported activation"): + # softmax is not supported + Conv2dModule(3, 8, 2, activation_callable=activation_callable) + + activation_callable = partial(nn.Softmax) + with pytest.raises(ValueError, match="Unsupported activation"): # softmax is not supported - Conv2dModule(3, 8, 2, act_cfg=act_cfg) + Conv2dModule(3, 8, 2, activation_callable=activation_callable) # conv + norm + act conv = Conv2dModule(3, 8, 2, norm_cfg={"type": "BN"}) assert conv.with_activation - assert hasattr(conv, "activate") + assert isinstance(conv.activation, nn.Module) assert conv.with_norm assert hasattr(conv, "norm_layer") x = torch.rand(1, 3, 256, 256) @@ -33,7 +40,7 @@ def test_conv_module(): # conv + act conv = Conv2dModule(3, 8, 2) assert conv.with_activation - assert hasattr(conv, "activate") + assert isinstance(conv.activation, nn.Module) assert not conv.with_norm assert conv.norm_layer is None x = torch.rand(1, 3, 256, 256) @@ -41,11 +48,11 @@ def test_conv_module(): assert output.shape == (1, 8, 255, 255) # conv - conv = Conv2dModule(3, 8, 2, act_cfg=None) + conv = Conv2dModule(3, 8, 2, activation_callable=None) assert not conv.with_norm assert conv.norm_layer is None assert not conv.with_activation - assert not hasattr(conv, "activate") + assert conv.activation is None x = torch.rand(1, 3, 256, 256) output = conv(x) assert output.shape == (1, 8, 255, 255) @@ -65,26 +72,26 @@ def test_conv_module(): conv = Conv2dModule(3, 8, 3, padding=1, padding_mode="non_exists") # leaky relu - conv = Conv2dModule(3, 8, 3, padding=1, act_cfg={"type": "LeakyReLU"}) - assert isinstance(conv.activate, nn.LeakyReLU) + conv = Conv2dModule(3, 8, 3, padding=1, activation_callable=nn.LeakyReLU) + assert isinstance(conv.activation, nn.LeakyReLU) output = conv(x) assert output.shape == (1, 8, 256, 256) # tanh - conv = Conv2dModule(3, 8, 3, padding=1, act_cfg={"type": "Tanh"}) - assert isinstance(conv.activate, nn.Tanh) + conv = Conv2dModule(3, 8, 3, padding=1, activation_callable=nn.Tanh) + assert isinstance(conv.activation, nn.Tanh) output = conv(x) assert output.shape == (1, 8, 256, 256) # Sigmoid - conv = Conv2dModule(3, 8, 3, padding=1, act_cfg={"type": "Sigmoid"}) - assert isinstance(conv.activate, nn.Sigmoid) + conv = Conv2dModule(3, 8, 3, padding=1, activation_callable=nn.Sigmoid) + assert isinstance(conv.activation, nn.Sigmoid) output = conv(x) assert output.shape == (1, 8, 256, 256) # PReLU - conv = Conv2dModule(3, 8, 3, padding=1, act_cfg={"type": "PReLU"}) - assert isinstance(conv.activate, nn.PReLU) + conv = Conv2dModule(3, 8, 3, padding=1, activation_callable=nn.PReLU) + assert isinstance(conv.activation, nn.PReLU) output = conv(x) assert output.shape == (1, 8, 256, 256) @@ -132,8 +139,8 @@ def test_forward_with_default_config(self) -> None: assert conv.pointwise_conv.conv.kernel_size == (1, 1) assert not conv.depthwise_conv.with_norm assert not conv.pointwise_conv.with_norm - assert conv.depthwise_conv.activate.__class__.__name__ == "ReLU" - assert conv.pointwise_conv.activate.__class__.__name__ == "ReLU" + assert 
conv.depthwise_conv.activation.__class__.__name__ == "ReLU" + assert conv.pointwise_conv.activation.__class__.__name__ == "ReLU" x = torch.rand(1, 3, 256, 256) output = conv(x) assert output.shape == (1, 8, 255, 255) @@ -179,29 +186,29 @@ def test_forward_with_spectral_norm_padding_mode(self) -> None: output = conv(x) assert output.shape == (1, 8, 256, 256) - def test_forward_with_dw_act_cfg(self) -> None: - # test dw_act_cfg - conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, dw_act_cfg={"type": "LeakyReLU"}) + def test_forward_with_dw_activation_callable(self) -> None: + # test dw_activation_callable + conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, dw_activation_callable=nn.LeakyReLU) x = torch.rand(1, 3, 256, 256) - assert conv.depthwise_conv.activate.__class__.__name__ == "LeakyReLU" - assert conv.pointwise_conv.activate.__class__.__name__ == "ReLU" + assert conv.depthwise_conv.activation.__class__.__name__ == "LeakyReLU" + assert conv.pointwise_conv.activation.__class__.__name__ == "ReLU" output = conv(x) assert output.shape == (1, 8, 256, 256) - def test_forward_with_pw_act_cfg(self) -> None: - # test pw_act_cfg - conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, pw_act_cfg={"type": "LeakyReLU"}) + def test_forward_with_pw_activation_callable(self) -> None: + # test pw_activation_callable + conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, pw_activation_callable=nn.LeakyReLU) x = torch.rand(1, 3, 256, 256) - assert conv.depthwise_conv.activate.__class__.__name__ == "ReLU" - assert conv.pointwise_conv.activate.__class__.__name__ == "LeakyReLU" + assert conv.depthwise_conv.activation.__class__.__name__ == "ReLU" + assert conv.pointwise_conv.activation.__class__.__name__ == "LeakyReLU" output = conv(x) assert output.shape == (1, 8, 256, 256) - def test_forward_with_act_cfg(self) -> None: - # test act_cfg - conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, act_cfg={"type": "LeakyReLU"}) + def test_forward_with_activation_callable(self) -> None: + # test activation_callable + conv = DepthwiseSeparableConvModule(3, 8, 3, padding=1, activation_callable=nn.LeakyReLU) x = torch.rand(1, 3, 256, 256) - assert conv.depthwise_conv.activate.__class__.__name__ == "LeakyReLU" - assert conv.pointwise_conv.activate.__class__.__name__ == "LeakyReLU" + assert conv.depthwise_conv.activation.__class__.__name__ == "LeakyReLU" + assert conv.pointwise_conv.activation.__class__.__name__ == "LeakyReLU" output = conv(x) assert output.shape == (1, 8, 256, 256)
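
For the depthwise-separable variant, the tests above already cover the per-branch activation arguments; the sketch below only restates those call patterns in one place (the input shape and channel counts are arbitrary, and the expected output shape follows from the padded 3x3 depthwise plus 1x1 pointwise convolutions).

import torch
from torch import nn

from otx.algo.modules.conv_module import DepthwiseSeparableConvModule

# Separate activations for the depthwise and pointwise convolutions, as in
# test_forward_with_dw_activation_callable / test_forward_with_pw_activation_callable.
conv = DepthwiseSeparableConvModule(
    3,
    8,
    3,
    padding=1,
    dw_activation_callable=nn.LeakyReLU,
    pw_activation_callable=nn.ReLU,
)

x = torch.rand(1, 3, 64, 64)
out = conv(x)
assert out.shape == (1, 8, 64, 64)

# A single `activation_callable` still applies to both convolutions, and
# unsupported activations are rejected with a ValueError, as the
# "Unsupported activation" checks at the top of test_conv_module verify.
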