From 0a395b2b0ec2900a828c44bfad8a03e7f99d9503 Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Tue, 27 Aug 2024 09:57:09 +0900 Subject: [PATCH] Refactoring detection task (#3860) * Rename directory from `base_models` to `detectors` * Update pytorchcv version * Create criterion modules * Fix unit tests * Update comment * FIx iseg unit test * Enable factory for ATSS * Enable factory for YOLOX and update * Enable factory for RTMDet * Update recipes * Update * Enable factory for SSD * Fix unit tests * Enable factory for RTDETR * Reduce default parameters * Update huggingface, keypoint, and iseg * Revert default `input_size` argument * Fix unit test * Fix integration test * Add ABC * Fix * Change `model_version` to `model_name` * precommit * Remove `DetectionBackboneFactory` * Use `Literal` for `model_name` --- src/otx/algo/common/backbones/cspnext.py | 44 ++- src/otx/algo/common/utils/coders/__init__.py | 3 +- .../common/utils/coders/base_bbox_coder.py | 26 ++ .../utils/coders/delta_xywh_bbox_coder.py | 4 +- .../utils/coders/distance_point_bbox_coder.py | 4 +- .../common/utils/prior_generators/__init__.py | 3 +- .../prior_generators/anchor_generator.py | 6 +- .../prior_generators/base_prior_generator.py | 37 +++ .../utils/prior_generators/point_generator.py | 6 +- src/otx/algo/detection/atss.py | 235 +++++++-------- .../algo/detection/backbones/csp_darknet.py | 21 +- src/otx/algo/detection/backbones/presnet.py | 38 ++- .../{base_models => detectors}/__init__.py | 0 .../detection_transformer.py | 0 .../single_stage_detector.py | 10 +- src/otx/algo/detection/heads/anchor_head.py | 35 ++- src/otx/algo/detection/heads/atss_head.py | 254 ++++------------ src/otx/algo/detection/heads/base_head.py | 8 +- .../algo/detection/heads/rtdetr_decoder.py | 39 ++- src/otx/algo/detection/heads/rtmdet_head.py | 148 ++++----- src/otx/algo/detection/heads/ssd_head.py | 138 ++++----- src/otx/algo/detection/heads/yolox_head.py | 108 ++++--- src/otx/algo/detection/huggingface_model.py | 
2 +- src/otx/algo/detection/losses/__init__.py | 6 +- src/otx/algo/detection/losses/atss_loss.py | 280 ++++++++++++++++++ src/otx/algo/detection/losses/rtmdet_loss.py | 129 ++++++++ src/otx/algo/detection/losses/ssd_loss.py | 132 +++++++++ src/otx/algo/detection/losses/yolox_loss.py | 110 +++++++ src/otx/algo/detection/necks/cspnext_pafpn.py | 34 ++- src/otx/algo/detection/necks/fpn.py | 35 ++- .../algo/detection/necks/hybrid_encoder.py | 29 +- src/otx/algo/detection/necks/yolox_pafpn.py | 38 ++- src/otx/algo/detection/rtdetr.py | 193 +++--------- src/otx/algo/detection/rtmdet.py | 137 ++++----- src/otx/algo/detection/ssd.py | 92 ++++-- src/otx/algo/detection/yolox.py | 236 +++++---------- .../heads/rtmdet_ins_head.py | 80 ++--- .../instance_segmentation/losses/__init__.py | 3 +- .../losses/rtmdet_inst_loss.py | 89 ++++++ .../algo/instance_segmentation/rtmdet_inst.py | 36 +-- src/otx/algo/keypoint_detection/rtmpose.py | 13 +- src/otx/core/model/detection.py | 15 +- .../recipe/detection/atss_mobilenetv2.yaml | 3 +- .../detection/atss_mobilenetv2_tile.yaml | 3 +- src/otx/recipe/detection/atss_resnext101.yaml | 3 +- src/otx/recipe/detection/rtdetr_101.yaml | 3 +- src/otx/recipe/detection/rtdetr_18.yaml | 3 +- src/otx/recipe/detection/rtdetr_50.yaml | 3 +- src/otx/recipe/detection/rtmdet_tiny.yaml | 3 +- src/otx/recipe/detection/ssd_mobilenetv2.yaml | 1 + .../detection/ssd_mobilenetv2_tile.yaml | 1 + src/otx/recipe/detection/yolox_l.yaml | 3 +- src/otx/recipe/detection/yolox_l_tile.yaml | 3 +- src/otx/recipe/detection/yolox_s.yaml | 3 +- src/otx/recipe/detection/yolox_s_tile.yaml | 3 +- src/otx/recipe/detection/yolox_tiny.yaml | 3 +- src/otx/recipe/detection/yolox_tiny_tile.yaml | 3 +- src/otx/recipe/detection/yolox_x.yaml | 3 +- src/otx/recipe/detection/yolox_x_tile.yaml | 3 +- .../detection/backbones/test_csp_darknet.py | 22 +- .../algo/detection/backbones/test_presnet.py | 8 +- .../{base_models => detectors}/test_detr.py | 12 +- .../test_single_stage_detector.py | 
21 +- .../heads/test_class_incremental_mixin.py | 50 +++- .../detection/heads/test_rtdetr_decoder.py | 6 +- .../algo/detection/heads/test_rtmdet_head.py | 31 +- ...st_custom_ssd_head.py => test_ssd_head.py} | 34 ++- .../algo/detection/heads/test_yolox_head.py | 64 ++-- .../algo/detection/losses/test_yolox_loss.py | 80 +++++ .../detection/necks/test_hybrid_encoder.py | 4 +- .../algo/detection/necks/test_yolox_pafpn.py | 12 +- tests/unit/algo/detection/test_atss.py | 28 +- tests/unit/algo/detection/test_rtdetr.py | 2 +- tests/unit/algo/detection/test_rtmdet.py | 24 +- tests/unit/algo/detection/test_ssd.py | 4 +- tests/unit/algo/detection/test_yolox.py | 55 +++- tests/unit/core/data/test_tiling.py | 8 +- tests/unit/core/model/test_detection.py | 13 +- 78 files changed, 2079 insertions(+), 1302 deletions(-) create mode 100644 src/otx/algo/common/utils/coders/base_bbox_coder.py create mode 100644 src/otx/algo/common/utils/prior_generators/base_prior_generator.py rename src/otx/algo/detection/{base_models => detectors}/__init__.py (100%) rename src/otx/algo/detection/{base_models => detectors}/detection_transformer.py (100%) rename src/otx/algo/detection/{base_models => detectors}/single_stage_detector.py (97%) create mode 100644 src/otx/algo/detection/losses/atss_loss.py create mode 100644 src/otx/algo/detection/losses/rtmdet_loss.py create mode 100644 src/otx/algo/detection/losses/ssd_loss.py create mode 100644 src/otx/algo/detection/losses/yolox_loss.py create mode 100644 src/otx/algo/instance_segmentation/losses/rtmdet_inst_loss.py rename tests/unit/algo/detection/{base_models => detectors}/test_detr.py (93%) rename tests/unit/algo/detection/{base_models => detectors}/test_single_stage_detector.py (84%) rename tests/unit/algo/detection/heads/{test_custom_ssd_head.py => test_ssd_head.py} (59%) create mode 100644 tests/unit/algo/detection/losses/test_yolox_loss.py diff --git a/src/otx/algo/common/backbones/cspnext.py b/src/otx/algo/common/backbones/cspnext.py index 
38b412f9813..dfc412d8f01 100644 --- a/src/otx/algo/common/backbones/cspnext.py +++ b/src/otx/algo/common/backbones/cspnext.py @@ -10,7 +10,7 @@ import math from functools import partial -from typing import Callable, ClassVar +from typing import Any, Callable, ClassVar from otx.algo.common.layers import SPPBottleneck from otx.algo.detection.layers import CSPLayer @@ -22,7 +22,7 @@ from torch.nn.modules.batchnorm import _BatchNorm -class CSPNeXt(BaseModule): +class CSPNeXtModule(BaseModule): """CSPNeXt backbone used in RTMDet. Args: @@ -225,3 +225,43 @@ def forward(self, x: tuple[Tensor, ...]) -> tuple[Tensor, ...]: if i in self.out_indices: outs.append(x) return tuple(outs) + + +class CSPNeXt: + """CSPNeXt factory for detection.""" + + CSPNEXT_CFG: ClassVar[dict[str, Any]] = { + "rtmdet_tiny": { + "deepen_factor": 0.167, + "widen_factor": 0.375, + "normalization": nn.BatchNorm2d, + "activation": partial(nn.SiLU, inplace=True), + }, + "rtmpose_tiny": { + "arch": "P5", + "expand_ratio": 0.5, + "deepen_factor": 0.167, + "widen_factor": 0.375, + "out_indices": (4,), + "channel_attention": True, + "normalization": nn.BatchNorm2d, + "activation": partial(nn.SiLU, inplace=True), + }, + "rtmdet_inst_tiny": { + "arch": "P5", + "expand_ratio": 0.5, + "deepen_factor": 0.167, + "widen_factor": 0.375, + "channel_attention": True, + "normalization": nn.BatchNorm2d, + "activation": partial(nn.SiLU, inplace=True), + }, + } + + def __new__(cls, model_name: str) -> CSPNeXtModule: + """Constructor for CSPNeXt.""" + if model_name not in cls.CSPNEXT_CFG: + msg = f"model type '{model_name}' is not supported" + raise KeyError(msg) + + return CSPNeXtModule(**cls.CSPNEXT_CFG[model_name]) diff --git a/src/otx/algo/common/utils/coders/__init__.py b/src/otx/algo/common/utils/coders/__init__.py index 1793d536212..225143287fb 100644 --- a/src/otx/algo/common/utils/coders/__init__.py +++ b/src/otx/algo/common/utils/coders/__init__.py @@ -3,7 +3,8 @@ # """Custom coder implementations.""" +from 
.base_bbox_coder import BaseBBoxCoder from .delta_xywh_bbox_coder import DeltaXYWHBBoxCoder from .distance_point_bbox_coder import DistancePointBBoxCoder -__all__ = ["DeltaXYWHBBoxCoder", "DistancePointBBoxCoder"] +__all__ = ["BaseBBoxCoder", "DeltaXYWHBBoxCoder", "DistancePointBBoxCoder"] diff --git a/src/otx/algo/common/utils/coders/base_bbox_coder.py b/src/otx/algo/common/utils/coders/base_bbox_coder.py new file mode 100644 index 00000000000..ffdc844b6f0 --- /dev/null +++ b/src/otx/algo/common/utils/coders/base_bbox_coder.py @@ -0,0 +1,26 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Base bounding box coder.""" + +from abc import ABCMeta, abstractmethod + +from torch import Tensor + + +class BaseBBoxCoder(metaclass=ABCMeta): + """Base class for bounding box coder.""" + + encode_size: int + + @abstractmethod + def encode(self, *args, **kwargs) -> Tensor: + """Encode bounding boxes.""" + + @abstractmethod + def decode(self, *args, **kwargs) -> Tensor: + """Decode bounding boxes.""" + + @abstractmethod + def decode_export(self, *args, **kwargs) -> Tensor: + """Decode bounding boxes for export.""" diff --git a/src/otx/algo/common/utils/coders/delta_xywh_bbox_coder.py b/src/otx/algo/common/utils/coders/delta_xywh_bbox_coder.py index c84b08fb33b..6b880a601e8 100644 --- a/src/otx/algo/common/utils/coders/delta_xywh_bbox_coder.py +++ b/src/otx/algo/common/utils/coders/delta_xywh_bbox_coder.py @@ -13,8 +13,10 @@ from otx.algo.detection.utils.utils import clip_bboxes from torch import Tensor +from .base_bbox_coder import BaseBBoxCoder -class DeltaXYWHBBoxCoder: + +class DeltaXYWHBBoxCoder(BaseBBoxCoder): """Delta XYWH BBox coder. 
Following the practice in `R-CNN `_, diff --git a/src/otx/algo/common/utils/coders/distance_point_bbox_coder.py b/src/otx/algo/common/utils/coders/distance_point_bbox_coder.py index 527b811c9e4..c8188ccaee3 100644 --- a/src/otx/algo/common/utils/coders/distance_point_bbox_coder.py +++ b/src/otx/algo/common/utils/coders/distance_point_bbox_coder.py @@ -13,11 +13,13 @@ from otx.algo.common.utils.utils import bbox2distance, distance2bbox from otx.algo.detection.utils.utils import distance2bbox_export +from .base_bbox_coder import BaseBBoxCoder + if TYPE_CHECKING: from torch import Tensor -class DistancePointBBoxCoder: +class DistancePointBBoxCoder(BaseBBoxCoder): """Distance Point BBox coder. This coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left, diff --git a/src/otx/algo/common/utils/prior_generators/__init__.py b/src/otx/algo/common/utils/prior_generators/__init__.py index 0c0ad9e2a77..ffce5952bf1 100644 --- a/src/otx/algo/common/utils/prior_generators/__init__.py +++ b/src/otx/algo/common/utils/prior_generators/__init__.py @@ -4,6 +4,7 @@ """Anchor generators for detection task.""" from .anchor_generator import AnchorGenerator, SSDAnchorGeneratorClustered +from .base_prior_generator import BasePriorGenerator from .point_generator import MlvlPointGenerator -__all__ = ["AnchorGenerator", "SSDAnchorGeneratorClustered", "MlvlPointGenerator"] +__all__ = ["AnchorGenerator", "SSDAnchorGeneratorClustered", "BasePriorGenerator", "MlvlPointGenerator"] diff --git a/src/otx/algo/common/utils/prior_generators/anchor_generator.py b/src/otx/algo/common/utils/prior_generators/anchor_generator.py index eda70e0fbfd..0d46362dd61 100644 --- a/src/otx/algo/common/utils/prior_generators/anchor_generator.py +++ b/src/otx/algo/common/utils/prior_generators/anchor_generator.py @@ -15,8 +15,10 @@ from torch import Tensor from torch.nn.modules.utils import _pair +from .base_prior_generator import BasePriorGenerator -class AnchorGenerator: + +class 
AnchorGenerator(BasePriorGenerator): """Standard anchor generator for 2D anchor-based detectors. # TODO (sungchul): change strides format from (w, h) to (h, w) @@ -72,7 +74,7 @@ def __init__( raise ValueError(msg) # calculate base sizes of anchors - self.strides = [_pair(stride) for stride in strides] + self.strides: list[tuple[int, int]] = [_pair(stride) for stride in strides] self.base_sizes = [min(stride) for stride in self.strides] if base_sizes is None else base_sizes if scales is not None: diff --git a/src/otx/algo/common/utils/prior_generators/base_prior_generator.py b/src/otx/algo/common/utils/prior_generators/base_prior_generator.py new file mode 100644 index 00000000000..321343e0d75 --- /dev/null +++ b/src/otx/algo/common/utils/prior_generators/base_prior_generator.py @@ -0,0 +1,37 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Base prior generator.""" + +from __future__ import annotations + +from abc import ABCMeta, abstractmethod +from typing import TYPE_CHECKING, Callable + +if TYPE_CHECKING: + from torch import Tensor + + +class BasePriorGenerator(metaclass=ABCMeta): + """Base class for prior generator.""" + + strides: list[tuple[int, int]] + grid_anchors: Callable[..., list[Tensor]] + + @property + @abstractmethod + def num_base_priors(self) -> list[int]: + """Return the number of priors (anchors/points) at a point on the feature grid.""" + + @property + @abstractmethod + def num_levels(self) -> int: + """int: number of feature levels that the generator will be applied.""" + + @abstractmethod + def grid_priors(self, *args, **kwargs) -> list[Tensor]: + """Generate grid anchors/points of multiple feature levels.""" + + @abstractmethod + def valid_flags(self, *args, **kwargs) -> list[Tensor]: + """Generate valid flags of anchors/points of multiple feature levels.""" diff --git a/src/otx/algo/common/utils/prior_generators/point_generator.py b/src/otx/algo/common/utils/prior_generators/point_generator.py index 
628b8ea65c1..fd97e8eb93d 100644 --- a/src/otx/algo/common/utils/prior_generators/point_generator.py +++ b/src/otx/algo/common/utils/prior_generators/point_generator.py @@ -15,10 +15,12 @@ from torch import Tensor from torch.nn.modules.utils import _pair +from .base_prior_generator import BasePriorGenerator + DeviceType = Union[str, torch.device] -class MlvlPointGenerator: +class MlvlPointGenerator(BasePriorGenerator): """Standard points generator for multi-level (Mlvl) feature maps in 2D points-based detectors. # TODO (sungchul): change strides format from (w, h) to (h, w) @@ -31,7 +33,7 @@ class MlvlPointGenerator: """ def __init__(self, strides: list[int] | list[tuple[int, int]], offset: float = 0.5) -> None: - self.strides = [_pair(stride) for stride in strides] + self.strides: list[tuple[int, int]] = [_pair(stride) for stride in strides] self.offset = offset @property diff --git a/src/otx/algo/detection/atss.py b/src/otx/algo/detection/atss.py index 019c7e5cc82..1a35a4ffee2 100644 --- a/src/otx/algo/detection/atss.py +++ b/src/otx/algo/detection/atss.py @@ -5,15 +5,15 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal -from otx.algo.common.backbones import ResNeXt, build_model_including_pytorchcv from otx.algo.common.losses import CrossEntropyLoss, CrossSigmoidFocalLoss, GIoULoss from otx.algo.common.utils.coders import DeltaXYWHBBoxCoder from otx.algo.common.utils.prior_generators import AnchorGenerator from otx.algo.common.utils.samplers import PseudoSampler -from otx.algo.detection.base_models import SingleStageDetector +from otx.algo.detection.detectors import SingleStageDetector from otx.algo.detection.heads import ATSSHead +from otx.algo.detection.losses import ATSSCriterion from otx.algo.detection.necks import FPN from otx.algo.detection.utils.assigners import ATSSAssigner from otx.algo.utils.support_otx_v1 import OTXv1Helper @@ -26,6 +26,7 @@ if TYPE_CHECKING: from lightning.pytorch.cli 
import LRSchedulerCallable, OptimizerCallable + from torch import nn from typing_extensions import Self from otx.core.metrics import MetricCallable @@ -33,11 +34,30 @@ from otx.core.types.label import LabelInfoTypes +PRETRAINED_ROOT: ( + str +) = "https://storage.openvinotoolkit.org/repositories/openvino_training_extensions/models/object_detection/v2/" + +PRETRAINED_WEIGHTS: dict[str, str] = { + "atss_mobilenetv2": PRETRAINED_ROOT + "mobilenet_v2-atss.pth", + "atss_resnext101": PRETRAINED_ROOT + "resnext101_atss_070623.pth", +} + + class ATSS(ExplainableOTXDetModel): - """OTX Detection model class for ATSS.""" + """OTX Detection model class for ATSS. + + Default input size per model: + - atss_mobilenetv2 : (800, 992) + - atss_resnext101 : (800, 992) + """ + + mean: tuple[float, float, float] = (0.0, 0.0, 0.0) + std: tuple[float, float, float] = (255.0, 255.0, 255.0) def __init__( self, + model_name: Literal["atss_mobilenetv2", "atss_resnext101"], label_info: LabelInfoTypes, input_size: tuple[int, int] = (800, 992), optimizer: OptimizerCallable = DefaultOptimizerCallable, @@ -46,7 +66,9 @@ def __init__( torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), ) -> None: + self.load_from: str = PRETRAINED_WEIGHTS[model_name] super().__init__( + model_name=model_name, label_info=label_info, input_size=input_size, optimizer=optimizer, @@ -56,51 +78,8 @@ def __init__( tile_config=tile_config, ) - @property - def _exporter(self) -> OTXModelExporter: - """Creates OTXModelExporter object that can export the model.""" - if self.input_size is None: - msg = f"Input size attribute is not set for {self.__class__}" - raise ValueError(msg) - - return OTXNativeModelExporter( - task_level_export_parameters=self._export_parameters, - input_size=(1, 3, *self.input_size), - mean=self.mean, - std=self.std, - resize_mode="standard", - pad_value=0, - swap_rgb=False, - via_onnx=True, # Currently ATSS should be exported through ONNX - 
onnx_export_configuration={ - "input_names": ["image"], - "output_names": ["boxes", "labels"], - "dynamic_axes": { - "image": {0: "batch", 2: "height", 3: "width"}, - "boxes": {0: "batch", 1: "num_dets"}, - "labels": {0: "batch", 1: "num_dets"}, - }, - "autograd_inlining": False, - }, - output_names=["bboxes", "labels", "feature_vector", "saliency_map"] if self.explain_mode else None, - ) - - def load_from_otx_v1_ckpt(self, state_dict: dict, add_prefix: str = "model.") -> dict: - """Load the previous OTX ckpt according to OTX2.0.""" - return OTXv1Helper.load_det_ckpt(state_dict, add_prefix) - - -class MobileNetV2ATSS(ATSS): - """ATSS detector with MobileNetV2 backbone.""" - - load_from = ( - "https://storage.openvinotoolkit.org/repositories/" - "openvino_training_extensions/models/object_detection/v2/mobilenet_v2-atss.pth" - ) - mean = (0.0, 0.0, 0.0) - std = (255.0, 255.0, 255.0) - def _build_model(self, num_classes: int) -> SingleStageDetector: + # initialize backbones train_cfg = { "assigner": ATSSAssigner(topk=9), "sampler": PseudoSampler(), @@ -115,26 +94,11 @@ def _build_model(self, num_classes: int) -> SingleStageDetector: "max_per_img": 100, "nms_pre": 1000, } - backbone = build_model_including_pytorchcv( - cfg={ - "type": "mobilenetv2_w1", - "out_indices": [2, 3, 4, 5], - "frozen_stages": -1, - "norm_eval": False, - "pretrained": True, - }, - ) - neck = FPN( - in_channels=[24, 32, 96, 320], - out_channels=64, - num_outs=5, - start_level=1, - add_extra_convs="on_output", - relu_before_extra_convs=True, - ) + backbone = self._build_backbone(model_name=self.model_name) + neck = FPN(model_name=self.model_name) bbox_head = ATSSHead( + model_name=self.model_name, num_classes=num_classes, - in_channels=64, anchor_generator=AnchorGenerator( ratios=[1.0], octave_base_scale=8, @@ -145,67 +109,11 @@ def _build_model(self, num_classes: int) -> SingleStageDetector: target_means=(0.0, 0.0, 0.0, 0.0), target_stds=(0.1, 0.1, 0.2, 0.2), ), - loss_cls=CrossSigmoidFocalLoss( 
- use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0, - ), - loss_bbox=GIoULoss(loss_weight=2.0), - loss_centerness=CrossEntropyLoss(use_sigmoid=True, loss_weight=1.0), - feat_channels=64, - train_cfg=train_cfg, - test_cfg=test_cfg, + train_cfg=train_cfg, # TODO (sungchul, kirill): remove + test_cfg=test_cfg, # TODO (sungchul, kirill): remove ) - return SingleStageDetector(backbone, bbox_head, neck=neck, train_cfg=train_cfg, test_cfg=test_cfg) - - -class ResNeXt101ATSS(ATSS): - """ATSS with ResNeXt101 backbone.""" - - load_from = ( - "https://storage.openvinotoolkit.org/repositories/" - "openvino_training_extensions/models/object_detection/v2/resnext101_atss_070623.pth" - ) - mean = (0.0, 0.0, 0.0) - std = (255.0, 255.0, 255.0) - - def _build_model(self, num_classes: int) -> SingleStageDetector: - train_cfg = { - "assigner": ATSSAssigner(topk=9), - "sampler": PseudoSampler(), - "allowed_border": -1, - "pos_weight": -1, - "debug": False, - } - test_cfg = { - "nms": {"type": "nms", "iou_threshold": 0.6}, - "min_bbox_size": 0, - "score_thr": 0.05, - "max_per_img": 100, - "nms_pre": 1000, - } - backbone = ResNeXt( - depth=101, - groups=64, - frozen_stages=1, - init_cfg={"type": "Pretrained", "checkpoint": "open-mmlab://resnext101_64x4d"}, - ) - neck = FPN( - in_channels=[256, 512, 1024, 2048], - out_channels=256, - start_level=1, - add_extra_convs="on_output", - num_outs=5, - relu_before_extra_convs=True, - ) - bbox_head = ATSSHead( - anchor_generator=AnchorGenerator( - ratios=[1.0], - octave_base_scale=8, - scales_per_octave=1, - strides=[8, 16, 32, 64, 128], - ), + criterion = ATSSCriterion( + num_classes=num_classes, bbox_coder=DeltaXYWHBBoxCoder( target_means=(0.0, 0.0, 0.0, 0.0), target_stds=(0.1, 0.1, 0.2, 0.2), @@ -218,17 +126,80 @@ def _build_model(self, num_classes: int) -> SingleStageDetector: ), loss_bbox=GIoULoss(loss_weight=2.0), loss_centerness=CrossEntropyLoss(use_sigmoid=True, loss_weight=1.0), - num_classes=num_classes, - in_channels=256, - 
train_cfg=train_cfg, - test_cfg=test_cfg, ) - return SingleStageDetector(backbone, bbox_head, neck=neck, train_cfg=train_cfg, test_cfg=test_cfg) + return SingleStageDetector( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + criterion=criterion, + train_cfg=train_cfg, # TODO (sungchul, kirill): remove + test_cfg=test_cfg, # TODO (sungchul, kirill): remove + ) + + def _build_backbone(self, model_name: str) -> nn.Module: + if "mobilenetv2" in model_name: + from otx.algo.common.backbones import build_model_including_pytorchcv + + return build_model_including_pytorchcv( + cfg={ + "type": "mobilenetv2_w1", + "out_indices": [2, 3, 4, 5], + "frozen_stages": -1, + "norm_eval": False, + "pretrained": True, + }, + ) + + if "resnext101" in model_name: + from otx.algo.common.backbones import ResNeXt + + return ResNeXt( + depth=101, + groups=64, + frozen_stages=1, + init_cfg={"type": "Pretrained", "checkpoint": "open-mmlab://resnext101_64x4d"}, + ) + + msg = f"Unknown backbone name: {model_name}" + raise ValueError(msg) + + @property + def _exporter(self) -> OTXModelExporter: + """Creates OTXModelExporter object that can export the model.""" + if self.input_size is None: + msg = f"Input size attribute is not set for {self.__class__}" + raise ValueError(msg) + + return OTXNativeModelExporter( + task_level_export_parameters=self._export_parameters, + input_size=(1, 3, *self.input_size), + mean=self.mean, + std=self.std, + resize_mode="standard", + pad_value=0, + swap_rgb=False, + via_onnx=True, # Currently ATSS should be exported through ONNX + onnx_export_configuration={ + "input_names": ["image"], + "output_names": ["boxes", "labels"], + "dynamic_axes": { + "image": {0: "batch", 2: "height", 3: "width"}, + "boxes": {0: "batch", 1: "num_dets"}, + "labels": {0: "batch", 1: "num_dets"}, + }, + "autograd_inlining": False, + }, + output_names=["bboxes", "labels", "feature_vector", "saliency_map"] if self.explain_mode else None, + ) + + def load_from_otx_v1_ckpt(self, 
state_dict: dict, add_prefix: str = "model.") -> dict: + """Load the previous OTX ckpt according to OTX2.0.""" + return OTXv1Helper.load_det_ckpt(state_dict, add_prefix) def to(self, *args, **kwargs) -> Self: """Return a model with specified device.""" ret = super().to(*args, **kwargs) - if self.device.type == "xpu": + if self.model_name == "atss_resnext101" and self.device.type == "xpu": msg = f"{type(self).__name__} doesn't support XPU." raise RuntimeError(msg) return ret diff --git a/src/otx/algo/detection/backbones/csp_darknet.py b/src/otx/algo/detection/backbones/csp_darknet.py index f5d46233fee..30ac71c38bb 100644 --- a/src/otx/algo/detection/backbones/csp_darknet.py +++ b/src/otx/algo/detection/backbones/csp_darknet.py @@ -90,7 +90,7 @@ def export(self, x: Tensor) -> Tensor: return self.conv(x) -class CSPDarknet(BaseModule): +class CSPDarknetModule(BaseModule): """CSP-Darknet backbone used in YOLOv5 and YOLOX. Args: @@ -249,3 +249,22 @@ def forward(self, x: Tensor) -> tuple[Any, ...]: if i in self.out_indices: outs.append(x) return tuple(outs) + + +class CSPDarknet: + """CSPDarknet factory for detection.""" + + CSPDARKNET_CFG: ClassVar[dict[str, Any]] = { + "yolox_tiny": {"deepen_factor": 0.33, "widen_factor": 0.375}, + "yolox_s": {"deepen_factor": 0.33, "widen_factor": 0.5}, + "yolox_l": {}, + "yolox_x": {"deepen_factor": 1.33, "widen_factor": 1.25}, + } + + def __new__(cls, model_name: str) -> CSPDarknetModule: + """Constructor for CSPDarknet.""" + if model_name not in cls.CSPDARKNET_CFG: + msg = f"model type '{model_name}' is not supported" + raise KeyError(msg) + + return CSPDarknetModule(**cls.CSPDARKNET_CFG[model_name]) diff --git a/src/otx/algo/detection/backbones/presnet.py b/src/otx/algo/detection/backbones/presnet.py index 4ad54ef2502..1ac6cf0e5da 100644 --- a/src/otx/algo/detection/backbones/presnet.py +++ b/src/otx/algo/detection/backbones/presnet.py @@ -15,7 +15,7 @@ from otx.algo.modules.activation import build_activation_layer from 
otx.algo.modules.base_module import BaseModule from otx.algo.modules.conv_module import Conv2dModule -from otx.algo.modules.norm import build_norm_layer +from otx.algo.modules.norm import FrozenBatchNorm2d, build_norm_layer __all__ = ["PResNet"] @@ -243,7 +243,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return out -class PResNet(BaseModule): +class PResNetModule(BaseModule): """PResNet backbone. Args: @@ -369,3 +369,37 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if idx in self.return_idx: outs.append(x) return outs + + +class PResNet: + """PResNet factory for detection.""" + + PRESNET_CFG: ClassVar[dict[str, Any]] = { + "rtdetr_18": { + "depth": 18, + "pretrained": True, + "return_idx": [1, 2, 3], + }, + "rtdetr_50": { + "depth": 50, + "return_idx": [1, 2, 3], + "pretrained": True, + "freeze_at": 0, + "normalization": partial(build_norm_layer, FrozenBatchNorm2d, layer_name="norm"), + }, + "rtdetr_101": { + "depth": 101, + "return_idx": [1, 2, 3], + "normalization": partial(build_norm_layer, FrozenBatchNorm2d, layer_name="norm"), + "pretrained": True, + "freeze_at": 0, + }, + } + + def __new__(cls, model_name: str) -> PResNetModule: + """Constructor for PResNet.""" + if model_name not in cls.PRESNET_CFG: + msg = f"model type '{model_name}' is not supported" + raise KeyError(msg) + + return PResNetModule(**cls.PRESNET_CFG[model_name]) diff --git a/src/otx/algo/detection/base_models/__init__.py b/src/otx/algo/detection/detectors/__init__.py similarity index 100% rename from src/otx/algo/detection/base_models/__init__.py rename to src/otx/algo/detection/detectors/__init__.py diff --git a/src/otx/algo/detection/base_models/detection_transformer.py b/src/otx/algo/detection/detectors/detection_transformer.py similarity index 100% rename from src/otx/algo/detection/base_models/detection_transformer.py rename to src/otx/algo/detection/detectors/detection_transformer.py diff --git a/src/otx/algo/detection/base_models/single_stage_detector.py 
b/src/otx/algo/detection/detectors/single_stage_detector.py similarity index 97% rename from src/otx/algo/detection/base_models/single_stage_detector.py rename to src/otx/algo/detection/detectors/single_stage_detector.py index c83626aa29c..9ebe1ad4320 100644 --- a/src/otx/algo/detection/base_models/single_stage_detector.py +++ b/src/otx/algo/detection/detectors/single_stage_detector.py @@ -26,6 +26,7 @@ class SingleStageDetector(BaseModule): Args: backbone (nn.Module): Backbone module. bbox_head (nn.Module): Bbox head module. + criterion (nn.Module): Criterion module. neck (nn.Module | None, optional): Neck module. Defaults to None. train_cfg (dict | None, optional): Training config. Defaults to None. test_cfg (dict | None, optional): Test config. Defaults to None. @@ -36,6 +37,7 @@ def __init__( self, backbone: nn.Module, bbox_head: nn.Module, + criterion: nn.Module, neck: nn.Module | None = None, train_cfg: dict | None = None, test_cfg: dict | None = None, @@ -46,6 +48,7 @@ def __init__( self.backbone = backbone self.bbox_head = bbox_head self.neck = neck + self.criterion = criterion self.init_cfg = init_cfg self.train_cfg = train_cfg self.test_cfg = test_cfg @@ -129,7 +132,7 @@ def forward( def loss( self, entity: DetBatchDataEntity, - ) -> dict | list: + ) -> dict: """Calculate losses from a batch of inputs and data samples. Args: @@ -143,7 +146,10 @@ def loss( dict: A dictionary of loss components. 
""" x = self.extract_feat(entity.images) - return self.bbox_head.loss(x, entity) + # TODO (sungchul): compare .loss with other forwards and remove duplicated code + outputs: dict[str, Tensor] = self.bbox_head.loss(x, entity) + + return self.criterion(**outputs) def predict( self, diff --git a/src/otx/algo/detection/heads/anchor_head.py b/src/otx/algo/detection/heads/anchor_head.py index 115741c7619..86bce2d5caf 100644 --- a/src/otx/algo/detection/heads/anchor_head.py +++ b/src/otx/algo/detection/heads/anchor_head.py @@ -13,7 +13,8 @@ import torch from torch import Tensor, nn -from otx.algo.common.utils.prior_generators import AnchorGenerator +from otx.algo.common.utils.coders import BaseBBoxCoder +from otx.algo.common.utils.prior_generators import BasePriorGenerator from otx.algo.common.utils.utils import multi_apply from otx.algo.detection.heads.base_head import BaseDenseHead from otx.algo.detection.utils.prior_generators.utils import anchor_inside_flags @@ -28,10 +29,14 @@ class AnchorHead(BaseDenseHead): num_classes (int): Number of categories excluding the background category. in_channels (tuple[int, ...], int): Number of channels in the input feature map. - anchor_generator (nn.Module): Module for anchor generator - bbox_coder (nn.Module): Module of bounding box coder. - loss_cls (nn.Module): Module of classification loss. - loss_bbox (nn.Module): Module of localization loss. + anchor_generator (BasePriorGenerator): Anchor generator class. + bbox_coder (BaseBBoxCoder): Bounding box coder class. + loss_cls (nn.Module | None): Module of classification loss. + It is related to RPNHead for iseg, will be deprecated. + Defaults to None. + loss_bbox (nn.Module | None): Module of localization loss. + It is related to RPNHead for iseg, will be deprecated. + Defaults to None. train_cfg (dict): Training config of anchor head. test_cfg (dict, optional): Testing config of anchor head. feat_channels (int): Number of hidden channels. Used in child classes. 
@@ -47,11 +52,11 @@ def __init__( self, num_classes: int, in_channels: tuple[int, ...] | int, - anchor_generator: nn.Module, - bbox_coder: nn.Module, - loss_cls: nn.Module, - loss_bbox: nn.Module, + anchor_generator: BasePriorGenerator, + bbox_coder: BaseBBoxCoder, train_cfg: dict, + loss_cls: nn.Module | None = None, # TODO (kirill): deprecated + loss_bbox: nn.Module | None = None, # TODO (kirill): deprecated test_cfg: dict | None = None, feat_channels: int = 256, reg_decoded_bbox: bool = False, @@ -61,7 +66,7 @@ def __init__( self.in_channels = in_channels self.num_classes = num_classes self.feat_channels = feat_channels - self.use_sigmoid_cls = loss_cls.use_sigmoid + self.use_sigmoid_cls = loss_cls.use_sigmoid if loss_cls else True # TODO (kirill): revert or update if self.use_sigmoid_cls: self.cls_out_channels = num_classes else: @@ -103,7 +108,7 @@ def num_anchors(self) -> int: return self.prior_generator.num_base_priors[0] @property - def anchor_generator(self) -> AnchorGenerator: + def anchor_generator(self) -> BasePriorGenerator: """Anchor generator.""" warnings.warn( "DeprecationWarning: anchor_generator is deprecated, please use `prior_generator` instead", @@ -410,6 +415,8 @@ def loss_by_feat_single( ) -> tuple: """Calculate the loss of a single scale level based on the features extracted by the detection head. + TODO (kirill): it is related to RPNHead for iseg, will be deprecated + Args: cls_score (Tensor): Box scores for each scale level Has shape (N, num_anchors * num_classes, H, W). 
@@ -434,7 +441,7 @@ def loss_by_feat_single( labels = labels.reshape(-1) label_weights = label_weights.reshape(-1) cls_score = cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) - loss_cls = self.loss_cls(cls_score, labels, label_weights, avg_factor=avg_factor) + loss_cls = self.loss_cls(cls_score, labels, label_weights, avg_factor=avg_factor) # type: ignore[misc] # TODO (kirill): fix # regression loss target_dim = bbox_targets.size(-1) bbox_targets = bbox_targets.reshape(-1, target_dim) @@ -446,7 +453,7 @@ def loss_by_feat_single( # decodes the already encoded coordinates to absolute format. anchors = anchors.reshape(-1, anchors.size(-1)) bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) - loss_bbox = self.loss_bbox(bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor) + loss_bbox = self.loss_bbox(bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor) # type: ignore[misc] # TODO (kirill): fix return loss_cls, loss_bbox def loss_by_feat( @@ -459,6 +466,8 @@ def loss_by_feat( ) -> dict: """Calculate the loss based on the features extracted by the detection head. + TODO (kirill): it is related to RPNHead for iseg, will be deprecated + Args: cls_scores (list[Tensor]): Box scores for each scale level has shape (N, num_anchors * num_classes, H, W). 
diff --git a/src/otx/algo/detection/heads/atss_head.py b/src/otx/algo/detection/heads/atss_head.py index 4651172d990..8cdf8814445 100644 --- a/src/otx/algo/detection/heads/atss_head.py +++ b/src/otx/algo/detection/heads/atss_head.py @@ -9,13 +9,13 @@ from __future__ import annotations from functools import partial -from typing import Callable +from typing import Any, Callable, ClassVar import torch from torch import Tensor, nn -from otx.algo.common.losses import CrossEntropyLoss, CrossSigmoidFocalLoss -from otx.algo.common.utils.bbox_overlaps import bbox_overlaps +from otx.algo.common.utils.coders import BaseBBoxCoder +from otx.algo.common.utils.prior_generators import BasePriorGenerator from otx.algo.common.utils.utils import multi_apply, reduce_mean from otx.algo.detection.heads.anchor_head import AnchorHead from otx.algo.detection.heads.class_incremental_mixin import ( @@ -31,7 +31,7 @@ EPS = 1e-12 -class ATSSHead(ClassIncrementalMixin, AnchorHead): +class ATSSHeadModule(ClassIncrementalMixin, AnchorHead): """Detection Head of `ATSS `_. ATSS head structure is similar with FCOS, however ATSS use anchor boxes @@ -50,7 +50,6 @@ class ATSSHead(ClassIncrementalMixin, AnchorHead): the predicted boxes and regression targets to absolute coordinates format. Defaults to False. It should be `True` when using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head. - loss_centerness (nn.Module, optinoal): Module of centerness loss. Defaults to None. init_cfg (dict, list[dict], optional): Initialization config dict. 
""" @@ -67,11 +66,7 @@ def __init__( requires_grad=True, ), reg_decoded_bbox: bool = True, - loss_centerness: nn.Module | None = None, init_cfg: dict | None = None, - bg_loss_weight: float = -1.0, - use_qfl: bool = False, - qfl_cfg: dict | None = None, **kwargs, ) -> None: self.pred_kernel_size = pred_kernel_size @@ -92,21 +87,6 @@ def __init__( ) self.sampling = False - self.loss_centerness = loss_centerness or CrossEntropyLoss(use_sigmoid=True, loss_weight=1.0) - - if use_qfl: - kwargs["loss_cls"] = ( - qfl_cfg - if qfl_cfg - else { - "type": "QualityFocalLoss", - "use_sigmoid": True, - "beta": 2.0, - "loss_weight": 1.0, - } - ) - self.bg_loss_weight = bg_loss_weight - self.use_qfl = use_qfl def _init_layers(self) -> None: """Initialize layers of the head.""" @@ -230,7 +210,7 @@ def loss_by_feat( # type: ignore[override] Defaults to None. Returns: - dict[str, Tensor]: A dictionary of loss components. + dict[str, Tensor]: A dictionary of raw outputs. """ featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] if len(featmap_sizes) != self.prior_generator.num_levels: @@ -259,182 +239,17 @@ def loss_by_feat( # type: ignore[override] ) = cls_reg_targets avg_factor = reduce_mean(torch.tensor(avg_factor, dtype=torch.float, device=device)).item() - losses_cls, losses_bbox, loss_centerness, bbox_avg_factor = multi_apply( - self.loss_by_feat_single, - anchor_list, - cls_scores, - bbox_preds, - centernesses, - labels_list, - label_weights_list, - bbox_targets_list, - valid_label_mask, - avg_factor=avg_factor, - ) - - bbox_avg_factor = sum(bbox_avg_factor) - bbox_avg_factor = reduce_mean(bbox_avg_factor).clamp_(min=1).item() - losses_bbox = [loss_bbox / bbox_avg_factor for loss_bbox in losses_bbox] - return {"loss_cls": losses_cls, "loss_bbox": losses_bbox, "loss_centerness": loss_centerness} - - def loss_by_feat_single( # type: ignore[override] - self, - anchors: Tensor, - cls_score: Tensor, - bbox_pred: Tensor, - centerness: Tensor, - labels: Tensor, - 
label_weights: Tensor, - bbox_targets: Tensor, - valid_label_mask: Tensor, - avg_factor: float, - ) -> tuple: - """Compute loss of a single scale level. - - Args: - anchors (Tensor): Box reference for each scale level with shape - (N, num_total_anchors, 4). - cls_score (Tensor): Box scores for each scale level - Has shape (N, num_anchors * num_classes, H, W). - bbox_pred (Tensor): Box energies / deltas for each scale - level with shape (N, num_anchors * 4, H, W). - centerness(Tensor): Centerness scores for each scale level. - labels (Tensor): Labels of each anchors with shape - (N, num_total_anchors). - label_weights (Tensor): Label weights of each anchor with shape - (N, num_total_anchors) - bbox_targets (Tensor): BBox regression targets of each anchor with - shape (N, num_total_anchors, 4). - valid_label_mask (Tensor): Label mask for consideration of ignored - label with shape (N, num_total_anchors, 1). - avg_factor (float): Average factor that is used to average - the loss. When using sampling method, avg_factor is usually - the sum of positive and negative priors. When using - `PseudoSampler`, `avg_factor` is usually equal to the number - of positive priors. - - Returns: - tuple[Tensor]: A tuple of loss components. 
- """ - anchors = anchors.reshape(-1, 4) - cls_score = cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels).contiguous() - bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) - centerness = centerness.permute(0, 2, 3, 1).reshape(-1) - bbox_targets = bbox_targets.reshape(-1, 4) - labels = labels.reshape(-1) - label_weights = label_weights.reshape(-1) - valid_label_mask = valid_label_mask.reshape(-1, self.cls_out_channels) - - # FG cat_id: [0, num_classes -1], BG cat_id: num_classes - pos_inds = self._get_pos_inds(labels) - - if self.use_qfl: - quality = label_weights.new_zeros(labels.shape) - - if len(pos_inds) > 0: - pos_bbox_targets = bbox_targets[pos_inds] - pos_bbox_pred = bbox_pred[pos_inds] - pos_anchors = anchors[pos_inds] - pos_centerness = centerness[pos_inds] - - centerness_targets = self.centerness_target(pos_anchors, pos_bbox_targets) - if self.reg_decoded_bbox: - pos_bbox_pred = self.bbox_coder.decode(pos_anchors, pos_bbox_pred) - - if self.use_qfl: - quality[pos_inds] = bbox_overlaps(pos_bbox_pred.detach(), pos_bbox_targets, is_aligned=True).clamp( - min=1e-6, - ) - - # regression loss - loss_bbox = self._get_loss_bbox(pos_bbox_targets, pos_bbox_pred, centerness_targets) - - # centerness loss - loss_centerness = self._get_loss_centerness(avg_factor, pos_centerness, centerness_targets) - - else: - loss_bbox = bbox_pred.sum() * 0 - loss_centerness = centerness.sum() * 0 - centerness_targets = bbox_targets.new_tensor(0.0) - - # Re-weigting BG loss - if self.bg_loss_weight >= 0.0: - neg_indices = labels == self.num_classes - label_weights[neg_indices] = self.bg_loss_weight - - if self.use_qfl: - labels = (labels, quality) # For quality focal loss arg spec - - # classification loss - loss_cls = self._get_loss_cls(cls_score, labels, label_weights, valid_label_mask, avg_factor) - - return loss_cls, loss_bbox, loss_centerness, centerness_targets.sum() - - def centerness_target(self, anchors: Tensor, gts: Tensor) -> Tensor: - """Calculate the 
centerness between anchors and gts. - - Only calculate pos centerness targets, otherwise there may be nan. - - Args: - anchors (Tensor): Anchors with shape (N, 4), "xyxy" format. - gts (Tensor): Ground truth bboxes with shape (N, 4), "xyxy" format. - - Returns: - Tensor: Centerness between anchors and gts. - """ - anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2 - anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2 - l_ = anchors_cx - gts[:, 0] - t_ = anchors_cy - gts[:, 1] - r_ = gts[:, 2] - anchors_cx - b_ = gts[:, 3] - anchors_cy - - left_right = torch.stack([l_, r_], dim=1) - top_bottom = torch.stack([t_, b_], dim=1) - return torch.sqrt( - (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) - * (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]), - ) - - def _get_pos_inds(self, labels: Tensor) -> Tensor: - bg_class_ind = self.num_classes - return ((labels >= 0) & (labels < bg_class_ind)).nonzero().squeeze(1) - - def _get_loss_cls( - self, - cls_score: Tensor, - labels: Tensor, - label_weights: Tensor, - valid_label_mask: Tensor, - avg_factor: Tensor, - ) -> Tensor: - if isinstance(self.loss_cls, CrossSigmoidFocalLoss): - loss_cls = self.loss_cls( - cls_score, - labels, - label_weights, - avg_factor=avg_factor, - valid_label_mask=valid_label_mask, - ) - else: - loss_cls = self.loss_cls(cls_score, labels, label_weights, avg_factor=avg_factor) - return loss_cls - - def _get_loss_centerness( - self, - avg_factor: Tensor, - pos_centerness: Tensor, - centerness_targets: Tensor, - ) -> Tensor: - return self.loss_centerness(pos_centerness, centerness_targets, avg_factor=avg_factor) - - def _get_loss_bbox( - self, - pos_bbox_targets: Tensor, - pos_bbox_pred: Tensor, - centerness_targets: Tensor, - ) -> Tensor: - return self.loss_bbox(pos_bbox_pred, pos_bbox_targets, weight=centerness_targets, avg_factor=1.0) + return { + "anchors": anchor_list, + "cls_score": cls_scores, + "bbox_pred": bbox_preds, + "centerness": centernesses, + "labels": labels_list, + 
"label_weights": label_weights_list, + "bbox_targets": bbox_targets_list, + "valid_label_mask": valid_label_mask, + "avg_factor": avg_factor, + } def get_targets( self, @@ -576,3 +391,40 @@ def get_num_level_anchors_inside(self, num_level_anchors: list[int], inside_flag """Get the number of valid anchors in every level.""" split_inside_flags = torch.split(inside_flags, num_level_anchors) return [int(flags.sum()) for flags in split_inside_flags] + + +class ATSSHead: + """ATSSHead factory for detection.""" + + ATSSHEAD_CFG: ClassVar[dict[str, Any]] = { + "atss_mobilenetv2": { + "in_channels": 64, + "feat_channels": 64, + }, + "atss_resnext101": { + "in_channels": 256, + }, + } + + def __new__( + cls, + model_name: str, + num_classes: int, + anchor_generator: BasePriorGenerator, + bbox_coder: BaseBBoxCoder, + train_cfg: dict, + test_cfg: dict | None = None, + ) -> ATSSHeadModule: + """Constructor for ATSSHead.""" + if model_name not in cls.ATSSHEAD_CFG: + msg = f"model type '{model_name}' is not supported" + raise KeyError(msg) + + return ATSSHeadModule( + **cls.ATSSHEAD_CFG[model_name], + num_classes=num_classes, + anchor_generator=anchor_generator, + bbox_coder=bbox_coder, + train_cfg=train_cfg, # TODO (sungchul, kirill): remove + test_cfg=test_cfg, # TODO (sungchul, kirill): remove + ) diff --git a/src/otx/algo/detection/heads/base_head.py b/src/otx/algo/detection/heads/base_head.py index 3d945f00679..02bcb2aa7e7 100644 --- a/src/otx/algo/detection/heads/base_head.py +++ b/src/otx/algo/detection/heads/base_head.py @@ -275,8 +275,8 @@ def _predict_by_feat_single( # the `custom_cls_channels` parameter is derived from # CrossEntropyCustomLoss and FocalCustomLoss, and is currently used # in v3det. 
- if getattr(self.loss_cls, "custom_cls_channels", False): - scores = self.loss_cls.get_activation(cls_score) + if hasattr(self, "loss_cls") and getattr(self.loss_cls, "custom_cls_channels", False): + scores = self.loss_cls.get_activation(cls_score) # TODO (sungchul): remove `loss_cls` in head elif self.use_sigmoid_cls: scores = cls_score.sigmoid() else: @@ -502,8 +502,8 @@ def export_by_feat( ): scores = cls_score.permute(0, 2, 3, 1).reshape(batch_size, -1, self.cls_out_channels) - if getattr(self.loss_cls, "custom_cls_channels", False): - scores = self.loss_cls.get_activation(cls_score) + if hasattr(self, "loss_cls") and getattr(self.loss_cls, "custom_cls_channels", False): + scores = self.loss_cls.get_activation(cls_score) # TODO (sungchul): remove `loss_cls` in head elif self.use_sigmoid_cls: scores = scores.sigmoid() else: diff --git a/src/otx/algo/detection/heads/rtdetr_decoder.py b/src/otx/algo/detection/heads/rtdetr_decoder.py index 1b534633fcd..2d190dcaf32 100644 --- a/src/otx/algo/detection/heads/rtdetr_decoder.py +++ b/src/otx/algo/detection/heads/rtdetr_decoder.py @@ -8,7 +8,7 @@ import copy import math from collections import OrderedDict -from typing import Any, Callable +from typing import Any, Callable, ClassVar import torch import torchvision @@ -451,7 +451,7 @@ def forward( return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits) -class RTDETRTransformer(BaseModule): +class RTDETRTransformerModule(BaseModule): """RTDETRTransformer. Args: @@ -801,3 +801,38 @@ def _set_aux_loss(self, outputs_class: torch.Tensor, outputs_coord: torch.Tensor # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. 
return [{"pred_logits": a, "pred_boxes": b} for a, b in zip(outputs_class, outputs_coord)] + + +class RTDETRTransformer: + """RTDETRTransformer factory for detection.""" + + RTDETRTRANSFORMER_CFG: ClassVar[dict[str, Any]] = { + "rtdetr_18": { + "num_decoder_layers": 3, + "feat_channels": [256, 256, 256], + }, + "rtdetr_50": { + "num_decoder_layers": 6, + "feat_channels": [256, 256, 256], + }, + "rtdetr_101": { + "feat_channels": [384, 384, 384], + }, + } + + def __new__( + cls, + model_name: str, + num_classes: int, + eval_spatial_size: tuple[int, int] | None = None, + ) -> RTDETRTransformerModule: + """Constructor for RTDETRTransformer.""" + if model_name not in cls.RTDETRTRANSFORMER_CFG: + msg = f"model type '{model_name}' is not supported" + raise KeyError(msg) + + return RTDETRTransformerModule( + **cls.RTDETRTRANSFORMER_CFG[model_name], + num_classes=num_classes, + eval_spatial_size=eval_spatial_size, + ) diff --git a/src/otx/algo/detection/heads/rtmdet_head.py b/src/otx/algo/detection/heads/rtmdet_head.py index 7f1fb39756e..b1c15710c53 100644 --- a/src/otx/algo/detection/heads/rtmdet_head.py +++ b/src/otx/algo/detection/heads/rtmdet_head.py @@ -9,14 +9,16 @@ from __future__ import annotations from functools import partial -from typing import Callable +from typing import Any, Callable, ClassVar import torch from torch import Tensor, nn +from otx.algo.common.utils.coders import BaseBBoxCoder from otx.algo.common.utils.nms import multiclass_nms -from otx.algo.common.utils.utils import distance2bbox, inverse_sigmoid, multi_apply, reduce_mean -from otx.algo.detection.heads import ATSSHead +from otx.algo.common.utils.prior_generators import BasePriorGenerator +from otx.algo.common.utils.utils import distance2bbox, inverse_sigmoid, multi_apply +from otx.algo.detection.heads.atss_head import ATSSHeadModule from otx.algo.detection.utils.prior_generators.utils import anchor_inside_flags from otx.algo.detection.utils.utils import ( images_to_levels, @@ -31,7 +33,7 @@ 
from otx.algo.utils.weight_init import bias_init_with_prob, constant_init, normal_init -class RTMDetHead(ATSSHead): +class RTMDetHead(ATSSHeadModule): """Detection Head of RTMDet. Args: @@ -155,75 +157,6 @@ def forward(self, feats: tuple[Tensor, ...]) -> tuple: bbox_preds.append(reg_dist) return tuple(cls_scores), tuple(bbox_preds) - def loss_by_feat_single( # type: ignore[override] - self, - cls_score: Tensor, - bbox_pred: Tensor, - labels: Tensor, - label_weights: Tensor, - bbox_targets: Tensor, - assign_metrics: Tensor, - stride: list[int], - ) -> tuple[Tensor, ...]: - """Compute loss of a single scale level. - - Args: - cls_score (Tensor): Box scores for each scale level - Has shape (N, num_anchors * num_classes, H, W). - bbox_pred (Tensor): Decoded bboxes for each scale - level with shape (N, num_anchors * 4, H, W). - labels (Tensor): Labels of each anchors with shape - (N, num_total_anchors). - label_weights (Tensor): Label weights of each anchor with shape - (N, num_total_anchors). - bbox_targets (Tensor): BBox regression targets of each anchor with - shape (N, num_total_anchors, 4). - assign_metrics (Tensor): Assign metrics with shape - (N, num_total_anchors). - stride (list[int]): Downsample stride of the feature map. - - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - if stride[0] != stride[1]: - msg = "h stride is not equal to w stride!" 
- raise ValueError(msg) - cls_score = cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels).contiguous() - bbox_pred = bbox_pred.reshape(-1, 4) - bbox_targets = bbox_targets.reshape(-1, 4) - labels = labels.reshape(-1) - assign_metrics = assign_metrics.reshape(-1) - label_weights = label_weights.reshape(-1) - targets = (labels, assign_metrics) - - loss_cls = self.loss_cls(cls_score, targets, label_weights, avg_factor=1.0) - - # FG cat_id: [0, num_classes -1], BG cat_id: num_classes - bg_class_ind = self.num_classes - pos_inds = ((labels >= 0) & (labels < bg_class_ind)).nonzero().squeeze(1) - - if len(pos_inds) > 0: - pos_bbox_targets = bbox_targets[pos_inds] - pos_bbox_pred = bbox_pred[pos_inds] - - pos_decode_bbox_pred = pos_bbox_pred - pos_decode_bbox_targets = pos_bbox_targets - - # regression loss - pos_bbox_weight = assign_metrics[pos_inds] - - loss_bbox = self.loss_bbox( - pos_decode_bbox_pred, - pos_decode_bbox_targets, - weight=pos_bbox_weight, - avg_factor=1.0, - ) - else: - loss_bbox = bbox_pred.sum() * 0 - pos_bbox_weight = bbox_targets.new_tensor(0.0) - - return loss_cls, loss_bbox, assign_metrics.sum(), pos_bbox_weight.sum() - def loss_by_feat( # type: ignore[override] self, cls_scores: list[Tensor], @@ -251,7 +184,7 @@ def loss_by_feat( # type: ignore[override] Defaults to None. Returns: - dict[str, Tensor]: A dictionary of loss components. + dict[str, Tensor]: A dictionary of raw outputs. 
""" num_imgs = len(batch_img_metas) featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] @@ -294,23 +227,16 @@ def loss_by_feat( # type: ignore[override] batch_gt_instances_ignore=batch_gt_instances_ignore, ) - losses_cls, losses_bbox, cls_avg_factors, bbox_avg_factors = multi_apply( - self.loss_by_feat_single, - cls_scores, - decoded_bboxes, - labels_list, - label_weights_list, - bbox_targets_list, - assign_metrics_list, - self.prior_generator.strides, - ) - - cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item() - losses_cls = [x / cls_avg_factor for x in losses_cls] - - bbox_avg_factor = reduce_mean(sum(bbox_avg_factors)).clamp_(min=1).item() - losses_bbox = [x / bbox_avg_factor for x in losses_bbox] - return {"loss_cls": losses_cls, "loss_bbox": losses_bbox} + return { + "cls_score": cls_scores, + "bbox_pred": decoded_bboxes, + "labels": labels_list, + "label_weights": label_weights_list, + "bbox_targets": bbox_targets_list, + "assign_metrics": assign_metrics_list, + "stride": self.prior_generator.strides, + "sampling_results_list": sampling_results_list, + } def export_by_feat( # type: ignore[override] self, @@ -635,7 +561,7 @@ def get_anchors( return anchor_list, valid_flag_list -class RTMDetSepBNHead(RTMDetHead): +class RTMDetSepBNHeadModule(RTMDetHead): """RTMDetHead with separated BN layers and shared conv layers. 
Args: @@ -798,3 +724,41 @@ def forward(self, feats: tuple[Tensor, ...]) -> tuple: cls_scores.append(cls_score) bbox_preds.append(reg_dist) return tuple(cls_scores), tuple(bbox_preds) + + +class RTMDetSepBNHead: + """RTMDetSepBNHead factory for detection.""" + + RTMDETSEPBNHEAD_CFG: ClassVar[dict[str, Any]] = { + "rtmdet_tiny": { + "in_channels": 96, + "stacked_convs": 2, + "feat_channels": 96, + "with_objectness": False, + "normalization": nn.BatchNorm2d, + "activation": partial(nn.SiLU, inplace=True), + }, + } + + def __new__( + cls, + model_name: str, + num_classes: int, + anchor_generator: BasePriorGenerator, + bbox_coder: BaseBBoxCoder, + train_cfg: dict, + test_cfg: dict | None = None, + ) -> RTMDetSepBNHeadModule: + """Constructor for RTMDetSepBNHead.""" + if model_name not in cls.RTMDETSEPBNHEAD_CFG: + msg = f"model type '{model_name}' is not supported" + raise KeyError(msg) + + return RTMDetSepBNHeadModule( + **cls.RTMDETSEPBNHEAD_CFG[model_name], + num_classes=num_classes, + anchor_generator=anchor_generator, + bbox_coder=bbox_coder, + train_cfg=train_cfg, # TODO (sungchul, kirill): remove + test_cfg=test_cfg, # TODO (sungchul, kirill): remove + ) diff --git a/src/otx/algo/detection/heads/ssd_head.py b/src/otx/algo/detection/heads/ssd_head.py index fb6aa316153..e412a95dfeb 100644 --- a/src/otx/algo/detection/heads/ssd_head.py +++ b/src/otx/algo/detection/heads/ssd_head.py @@ -8,21 +8,21 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, ClassVar import torch from torch import Tensor, nn -from otx.algo.common.losses import CrossEntropyLoss, smooth_l1_loss +from otx.algo.common.utils.coders import BaseBBoxCoder +from otx.algo.common.utils.prior_generators import BasePriorGenerator from otx.algo.common.utils.samplers import PseudoSampler -from otx.algo.common.utils.utils import multi_apply from otx.algo.detection.heads.anchor_head import AnchorHead if TYPE_CHECKING: from 
otx.algo.utils.mmengine_utils import InstanceData -class SSDHead(AnchorHead): +class SSDHeadModule(AnchorHead): """Implementation of `SSD head `_. Args: @@ -75,8 +75,6 @@ def __init__( # heads but a list of int in SSDHead self.num_base_priors = self.prior_generator.num_base_priors - self.loss_cls = CrossEntropyLoss(use_sigmoid=False, reduction="none", loss_weight=1.0) - self._init_layers() self.bbox_coder = bbox_coder @@ -113,66 +111,6 @@ def forward(self, x: tuple[Tensor]) -> tuple[list[Tensor], list[Tensor]]: bbox_preds.append(reg_conv(feat)) return cls_scores, bbox_preds - def loss_by_feat_single( - self, - cls_score: Tensor, - bbox_pred: Tensor, - anchor: Tensor, - labels: Tensor, - label_weights: Tensor, - bbox_targets: Tensor, - bbox_weights: Tensor, - avg_factor: int, - ) -> tuple[Tensor, Tensor]: - """Compute loss of a single image. - - Args: - cls_score (Tensor): Box scores for each image has shape (num_total_anchors, num_classes). - bbox_pred (Tensor): Box energies / deltas for each image level with shape (num_total_anchors, 4). - anchors (Tensor): Box reference for each scale level with shape (num_total_anchors, 4). - labels (Tensor): Labels of each anchors with shape (num_total_anchors,). - label_weights (Tensor): Label weights of each anchor with shape (num_total_anchors,) - bbox_targets (Tensor): BBox regression targets of each anchor with shape (num_total_anchors, 4). - bbox_weights (Tensor): BBox regression loss weights of each anchor with shape (num_total_anchors, 4). - avg_factor (int): Average factor that is used to average - the loss. When using sampling method, avg_factor is usually - the sum of positive and negative priors. When using - `PseudoSampler`, `avg_factor` is usually equal to the number - of positive priors. - - Returns: - tuple[Tensor, Tensor]: A tuple of cls loss and bbox loss of one - feature map. 
- """ - loss_cls_all = nn.functional.cross_entropy(cls_score, labels, reduction="none") * label_weights - # FG cat_id: [0, num_classes -1], BG cat_id: num_classes - pos_inds = ((labels >= 0) & (labels < self.num_classes)).nonzero(as_tuple=False).reshape(-1) - neg_inds = (labels == self.num_classes).nonzero(as_tuple=False).view(-1) - - num_pos_samples = pos_inds.size(0) - num_neg_samples = self.train_cfg["neg_pos_ratio"] * num_pos_samples - if num_neg_samples > neg_inds.size(0): - num_neg_samples = neg_inds.size(0) - topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples) - loss_cls_pos = loss_cls_all[pos_inds].sum() - loss_cls_neg = topk_loss_cls_neg.sum() - loss_cls = (loss_cls_pos + loss_cls_neg) / avg_factor - - if self.reg_decoded_bbox: - # When the regression loss (e.g. `IouLoss`, `GIouLoss`) - # is applied directly on the decoded bounding boxes, it - # decodes the already encoded coordinates to absolute format. - bbox_pred = self.bbox_coder.decode(anchor, bbox_pred) - - loss_bbox = smooth_l1_loss( - bbox_pred, - bbox_targets, - bbox_weights, - beta=self.train_cfg["smoothl1_beta"], - avg_factor=avg_factor, - ) - return loss_cls[None], loss_bbox - def loss_by_feat( self, cls_scores: list[Tensor], @@ -180,7 +118,7 @@ def loss_by_feat( batch_gt_instances: list[InstanceData], batch_img_metas: list[dict], batch_gt_instances_ignore: list[InstanceData] | None = None, - ) -> dict[str, list[Tensor]]: + ) -> dict[str, Tensor]: """Compute losses of the head. Args: @@ -195,13 +133,7 @@ def loss_by_feat( Defaults to None. Returns: - dict[str, list[Tensor]]: A dictionary of loss components. the dict - has components below: - - - loss_cls (list[Tensor]): A list containing each feature map \ - classification loss. - - loss_bbox (list[Tensor]): A list containing each feature map \ - regression loss. + dict[str, Tensor]: A dictionary of raw outputs. 
""" featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] @@ -232,18 +164,16 @@ def loss_by_feat( # concat all level anchors to a single tensor all_anchors = [torch.cat(anchor) for anchor in anchor_list] - losses_cls, losses_bbox = multi_apply( - self.loss_by_feat_single, - all_cls_scores, - all_bbox_preds, - all_anchors, - all_labels, - all_label_weights, - all_bbox_targets, - all_bbox_weights, - avg_factor=avg_factor, - ) - return {"loss_cls": losses_cls, "loss_bbox": losses_bbox} + return { + "cls_score": all_cls_scores, + "bbox_pred": all_bbox_preds, + "anchor": all_anchors, + "labels": all_labels, + "label_weights": all_label_weights, + "bbox_targets": all_bbox_targets, + "bbox_weights": all_bbox_weights, + "avg_factor": avg_factor, + } def _init_layers(self) -> None: """Initialize layers of the head. @@ -283,3 +213,39 @@ def _init_layers(self) -> None: self.cls_convs.append( nn.Conv2d(in_channel, num_base_priors * self.cls_out_channels, kernel_size=3, padding=1), ) + + +class SSDHead: + """SSDHead factory for detection.""" + + SSDHEAD_CFG: ClassVar[dict[str, Any]] = { + "ssd_mobilenetv2": { + "in_channels": (96, 320), + "use_depthwise": True, + }, + } + + def __new__( + cls, + model_name: str, + num_classes: int, + anchor_generator: BasePriorGenerator, + bbox_coder: BaseBBoxCoder, + init_cfg: dict, + train_cfg: dict, + test_cfg: dict | None = None, + ) -> SSDHeadModule: + """Constructor for SSDHead.""" + if model_name not in cls.SSDHEAD_CFG: + msg = f"model type '{model_name}' is not supported" + raise KeyError(msg) + + return SSDHeadModule( + **cls.SSDHEAD_CFG[model_name], + num_classes=num_classes, + anchor_generator=anchor_generator, + bbox_coder=bbox_coder, + init_cfg=init_cfg, # TODO (sungchul, kirill): remove + train_cfg=train_cfg, # TODO (sungchul, kirill): remove + test_cfg=test_cfg, # TODO (sungchul, kirill): remove + ) diff --git a/src/otx/algo/detection/heads/yolox_head.py b/src/otx/algo/detection/heads/yolox_head.py index 
6fe3ca8f320..10050e990cc 100644 --- a/src/otx/algo/detection/heads/yolox_head.py +++ b/src/otx/algo/detection/heads/yolox_head.py @@ -11,20 +11,18 @@ import logging import math from functools import partial -from typing import Callable, Sequence +from typing import Any, Callable, ClassVar, Sequence import torch import torch.nn.functional as F # noqa: N812 from torch import Tensor, nn from torchvision.ops import box_convert -from otx.algo.common.losses import CrossEntropyLoss, L1Loss from otx.algo.common.utils.nms import batched_nms, multiclass_nms from otx.algo.common.utils.prior_generators import MlvlPointGenerator from otx.algo.common.utils.samplers import PseudoSampler from otx.algo.common.utils.utils import multi_apply, reduce_mean from otx.algo.detection.heads.base_head import BaseDenseHead -from otx.algo.detection.losses import IoULoss from otx.algo.modules.activation import Swish, build_activation_layer from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule from otx.algo.modules.norm import build_norm_layer @@ -33,7 +31,7 @@ logger = logging.getLogger() -class YOLOXHead(BaseDenseHead): +class YOLOXHeadModule(BaseDenseHead): """YOLOXHead head used in `YOLOX `_. Args: @@ -56,10 +54,6 @@ class YOLOXHead(BaseDenseHead): Defaults to ``partial(nn.BatchNorm2d, momentum=0.03, eps=0.001)``. activation (Callable[..., nn.Module]): Activation layer module. Defaults to ``Swish``. - loss_cls (nn.Module, optional): Module of classification loss. - loss_bbox (nn.Module, optional): Module of localization loss. - loss_obj (nn.Module, optional): Module of objectness loss. - loss_l1 (nn.Module, optional): Module of L1 loss. train_cfg (dict, optional): Training config of anchor head. Defaults to None. test_cfg (dict, optional): Testing config of anchor head. 
@@ -80,10 +74,6 @@ def __init__( conv_bias: bool | str = "auto", normalization: Callable[..., nn.Module] = partial(nn.BatchNorm2d, momentum=0.03, eps=0.001), activation: Callable[..., nn.Module] = Swish, - loss_cls: nn.Module | None = None, - loss_bbox: nn.Module | None = None, - loss_obj: nn.Module | None = None, - loss_l1: nn.Module | None = None, train_cfg: dict | None = None, test_cfg: dict | None = None, init_cfg: dict | list[dict] | None = None, @@ -117,12 +107,7 @@ def __init__( self.normalization = normalization self.activation = activation - self.loss_cls = loss_cls or CrossEntropyLoss(use_sigmoid=True, reduction="sum", loss_weight=1.0) - self.loss_bbox = loss_bbox or IoULoss(mode="square", eps=1e-16, reduction="sum", loss_weight=5.0) - self.loss_obj = loss_obj or CrossEntropyLoss(use_sigmoid=True, reduction="sum", loss_weight=1.0) - self.use_l1 = False # This flag will be modified by hooks. - self.loss_l1 = loss_l1 or L1Loss(reduction="sum", loss_weight=1.0) self.prior_generator = MlvlPointGenerator(strides, offset=0) # type: ignore[arg-type] @@ -394,7 +379,7 @@ def _bbox_decode(self, priors: Tensor, bbox_preds: Tensor) -> Tensor: """Decode regression results (delta_x, delta_x, w, h) to bboxes (tl_x, tl_y, br_x, br_y). Args: - priors (Tensor): Center proiors of an image, has shape (num_instances, 2). + priors (Tensor): Center priors of an image, has shape (num_instances, 2). bbox_preds (Tensor): Box energies / deltas for all instances, has shape (batch_size, num_instances, 4). Returns: @@ -425,7 +410,7 @@ def _bbox_post_process( # type: ignore[override] the nms operation. Usually `with_nms` is False is used for aug test. Args: - results (InstaceData): Detection instance results, + results (InstanceData): Detection instance results, each item has shape (num_bboxes, ). cfg (dict): Test / postprocessing configuration, if None, test_cfg would be used. @@ -490,7 +475,7 @@ def loss_by_feat( # type: ignore[override] Defaults to None. 
Returns: - dict[str, Tensor]: A dictionary of losses. + dict[str, Tensor]: A dictionary of raw outputs. """ num_imgs = len(batch_img_metas) if batch_gt_instances_ignore is None: @@ -542,34 +527,19 @@ def loss_by_feat( # type: ignore[override] if self.use_l1: l1_targets = torch.cat(l1_targets, 0) - loss_obj = self.loss_obj(flatten_objectness.view(-1, 1), obj_targets) / num_total_samples - if num_pos > 0: - loss_cls = ( - self.loss_cls(flatten_cls_preds.view(-1, self.num_classes)[pos_masks], cls_targets) / num_total_samples - ) - loss_bbox = self.loss_bbox(flatten_bboxes.view(-1, 4)[pos_masks], bbox_targets) / num_total_samples - else: - # Avoid cls and reg branch not participating in the gradient - # propagation when there is no ground-truth in the images. - # For more details, please refer to - # https://github.com/open-mmlab/mmdetection/issues/7298 - loss_cls = flatten_cls_preds.sum() * 0 - loss_bbox = flatten_bboxes.sum() * 0 - - loss_dict = {"loss_cls": loss_cls, "loss_bbox": loss_bbox, "loss_obj": loss_obj} - - if self.use_l1: - if num_pos > 0: - loss_l1 = self.loss_l1(flatten_bbox_preds.view(-1, 4)[pos_masks], l1_targets) / num_total_samples - else: - # Avoid cls and reg branch not participating in the gradient - # propagation when there is no ground-truth in the images. 
- # For more details, please refer to - # https://github.com/open-mmlab/mmdetection/issues/7298 - loss_l1 = flatten_bbox_preds.sum() * 0 - loss_dict.update(loss_l1=loss_l1) - - return loss_dict + return { + "flatten_objectness": flatten_objectness, + "flatten_cls_preds": flatten_cls_preds, + "flatten_bbox_preds": flatten_bbox_preds, + "flatten_bboxes": flatten_bboxes, + "obj_targets": obj_targets, + "cls_targets": cls_targets, + "bbox_targets": bbox_targets, + "l1_targets": l1_targets, + "num_total_samples": num_total_samples, + "num_pos": num_pos, + "pos_masks": pos_masks, + } @torch.no_grad() def _get_targets_single( @@ -659,3 +629,45 @@ def _get_l1_target(self, l1_target: Tensor, gt_bboxes: Tensor, priors: Tensor, e l1_target[:, :2] = (gt_cxcywh[:, :2] - priors[:, :2]) / priors[:, 2:] l1_target[:, 2:] = torch.log(gt_cxcywh[:, 2:] / priors[:, 2:] + eps) return l1_target + + +class YOLOXHead: + """YOLOXHead factory for detection.""" + + YOLOXHEAD_CFG: ClassVar[dict[str, Any]] = { + "yolox_tiny": { + "in_channels": 96, + "feat_channels": 96, + }, + "yolox_s": { + "in_channels": 128, + "feat_channels": 128, + }, + "yolox_l": { + "in_channels": 256, + "feat_channels": 256, + }, + "yolox_x": { + "in_channels": 320, + "feat_channels": 320, + }, + } + + def __new__( + cls, + model_name: str, + num_classes: int, + train_cfg: dict, + test_cfg: dict | None = None, + ) -> YOLOXHeadModule: + """Constructor for YOLOXHead.""" + if model_name not in cls.YOLOXHEAD_CFG: + msg = f"model type '{model_name}' is not supported" + raise KeyError(msg) + + return YOLOXHeadModule( + **cls.YOLOXHEAD_CFG[model_name], + num_classes=num_classes, + train_cfg=train_cfg, # TODO (sungchul, kirill): remove + test_cfg=test_cfg, # TODO (sungchul, kirill): remove + ) diff --git a/src/otx/algo/detection/huggingface_model.py b/src/otx/algo/detection/huggingface_model.py index eb0cd9111ca..ce5035e005e 100644 --- a/src/otx/algo/detection/huggingface_model.py +++ 
b/src/otx/algo/detection/huggingface_model.py @@ -68,10 +68,10 @@ def __init__( metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, torch_compile: bool = False, ) -> None: - self.model_name = model_name_or_path self.load_from = None super().__init__( + model_name=model_name_or_path, label_info=label_info, input_size=input_size, optimizer=optimizer, diff --git a/src/otx/algo/detection/losses/__init__.py b/src/otx/algo/detection/losses/__init__.py index 768be6e9778..91b4ad733a4 100644 --- a/src/otx/algo/detection/losses/__init__.py +++ b/src/otx/algo/detection/losses/__init__.py @@ -3,7 +3,11 @@ # """Custom OTX Losses for Object Detection.""" +from .atss_loss import ATSSCriterion from .iou_loss import IoULoss from .rtdetr_loss import DetrCriterion +from .rtmdet_loss import RTMDetCriterion +from .ssd_loss import SSDCriterion +from .yolox_loss import YOLOXCriterion -__all__ = ["IoULoss", "DetrCriterion"] +__all__ = ["ATSSCriterion", "IoULoss", "DetrCriterion", "RTMDetCriterion", "SSDCriterion", "YOLOXCriterion"] diff --git a/src/otx/algo/detection/losses/atss_loss.py b/src/otx/algo/detection/losses/atss_loss.py new file mode 100644 index 00000000000..8b6390b508f --- /dev/null +++ b/src/otx/algo/detection/losses/atss_loss.py @@ -0,0 +1,280 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) OpenMMLab. All rights reserved. +# +"""ATSS criterion.""" + +from __future__ import annotations + +import torch +from torch import Tensor, nn + +from otx.algo.common.losses import CrossEntropyLoss, CrossSigmoidFocalLoss, QualityFocalLoss +from otx.algo.common.utils.bbox_overlaps import bbox_overlaps +from otx.algo.common.utils.utils import multi_apply, reduce_mean + + +class ATSSCriterion(nn.Module): + """ATSSCriterion is a loss criterion used in the Adaptive Training Sample Selection (ATSS) algorithm. + + Args: + num_classes (int): The number of object classes. 
+ bbox_coder (nn.Module): The module used for encoding and decoding bounding box coordinates. + loss_cls (nn.Module): The module used for calculating the classification loss. + loss_bbox (nn.Module): The module used for calculating the bounding box regression loss. + loss_centerness (nn.Module | None, optional): The module used for calculating the centerness loss. + Defaults to None. + use_qfl (bool, optional): Whether to use the Quality Focal Loss (QFL). + Defaults to ``CrossEntropyLoss(use_sigmoid=True, loss_weight=1.0)``. + reg_decoded_bbox (bool, optional): Whether to use the decoded bounding box coordinates + for regression loss calculation. Defaults to True. + bg_loss_weight (float, optional): The weight for the background loss. + Defaults to -1.0. + """ + + def __init__( + self, + num_classes: int, + bbox_coder: nn.Module, + loss_cls: nn.Module, + loss_bbox: nn.Module, + loss_centerness: nn.Module | None = None, + use_qfl: bool = False, + qfl_cfg: dict | None = None, + reg_decoded_bbox: bool = True, + bg_loss_weight: float = -1.0, + ) -> None: + super().__init__() + self.num_classes = num_classes + self.bbox_coder = bbox_coder + self.use_qfl = use_qfl + self.reg_decoded_bbox = reg_decoded_bbox + self.bg_loss_weight = bg_loss_weight + + self.loss_bbox = loss_bbox + self.loss_centerness = loss_centerness or CrossEntropyLoss(use_sigmoid=True, loss_weight=1.0) + + if use_qfl: + loss_cls = qfl_cfg or QualityFocalLoss(use_sigmoid=True, beta=2.0, loss_weight=1.0) + + self.loss_cls = loss_cls + + self.use_sigmoid_cls = loss_cls.use_sigmoid + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + + if self.cls_out_channels <= 0: + msg = f"num_classes={num_classes} is too small" + raise ValueError(msg) + + def forward( + self, + anchors: Tensor, + cls_score: Tensor, + bbox_pred: Tensor, + centerness: Tensor, + labels: Tensor, + label_weights: Tensor, + bbox_targets: Tensor, + valid_label_mask: Tensor, + 
avg_factor: float, + ) -> dict[str, Tensor]: + """Compute loss of a single scale level. + + Args: + anchors (Tensor): Box reference for scale levels with shape (N, num_total_anchors, 4). + cls_score (Tensor): Box scores for scale levels have shape (N, num_anchors * num_classes, H, W). + bbox_pred (Tensor): Box energies / deltas for scale levels with shape (N, num_anchors * 4, H, W). + centerness(Tensor): Centerness scores for each scale level. + labels (Tensor): Labels of anchors with shape (N, num_total_anchors). + label_weights (Tensor): Label weights of anchors with shape (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of anchors with shape (N, num_total_anchors, 4). + valid_label_mask (Tensor): Label mask for consideration of ignored label + with shape (N, num_total_anchors, 1). + avg_factor (float): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + losses_cls, losses_bbox, loss_centerness, bbox_avg_factor = multi_apply( + self._forward, + anchors, + cls_score, + bbox_pred, + centerness, + labels, + label_weights, + bbox_targets, + valid_label_mask, + avg_factor=avg_factor, + ) + + bbox_avg_factor = sum(bbox_avg_factor) + bbox_avg_factor = reduce_mean(bbox_avg_factor).clamp_(min=1).item() + losses_bbox = [loss_bbox / bbox_avg_factor for loss_bbox in losses_bbox] + return {"loss_cls": losses_cls, "loss_bbox": losses_bbox, "loss_centerness": loss_centerness} + + def _forward( + self, + anchors: Tensor, + cls_score: Tensor, + bbox_pred: Tensor, + centerness: Tensor, + labels: Tensor, + label_weights: Tensor, + bbox_targets: Tensor, + valid_label_mask: Tensor, + avg_factor: float, + ) -> tuple: + """Compute loss of a single scale level. 
+ + Args: + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). + bbox_pred (Tensor): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + centerness(Tensor): Centerness scores for each scale level. + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (N, num_total_anchors, 4). + valid_label_mask (Tensor): Label mask for consideration of ignored + label with shape (N, num_total_anchors, 1). + avg_factor (float): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + Returns: + tuple[Tensor]: A tuple of loss components. 
+ """ + anchors = anchors.reshape(-1, 4) + cls_score = cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels).contiguous() + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + centerness = centerness.permute(0, 2, 3, 1).reshape(-1) + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + valid_label_mask = valid_label_mask.reshape(-1, self.cls_out_channels) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + pos_inds = self._get_pos_inds(labels) + + if self.use_qfl: + quality = label_weights.new_zeros(labels.shape) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + pos_centerness = centerness[pos_inds] + + centerness_targets = self.centerness_target(pos_anchors, pos_bbox_targets) + if self.reg_decoded_bbox: + pos_bbox_pred = self.bbox_coder.decode(pos_anchors, pos_bbox_pred) + + if self.use_qfl: + quality[pos_inds] = bbox_overlaps(pos_bbox_pred.detach(), pos_bbox_targets, is_aligned=True).clamp( + min=1e-6, + ) + + # regression loss + loss_bbox = self._get_loss_bbox(pos_bbox_targets, pos_bbox_pred, centerness_targets) + + # centerness loss + loss_centerness = self._get_loss_centerness(avg_factor, pos_centerness, centerness_targets) + + else: + loss_bbox = bbox_pred.sum() * 0 + loss_centerness = centerness.sum() * 0 + centerness_targets = bbox_targets.new_tensor(0.0) + + # Re-weigting BG loss + if self.bg_loss_weight >= 0.0: + neg_indices = labels == self.num_classes + label_weights[neg_indices] = self.bg_loss_weight + + if self.use_qfl: + labels = (labels, quality) # For quality focal loss arg spec + + # classification loss + loss_cls = self._get_loss_cls(cls_score, labels, label_weights, valid_label_mask, avg_factor) + + return loss_cls, loss_bbox, loss_centerness, centerness_targets.sum() + + def centerness_target(self, anchors: Tensor, gts: Tensor) -> Tensor: + """Calculate the 
centerness between anchors and gts. + + Only calculate pos centerness targets, otherwise there may be nan. + + Args: + anchors (Tensor): Anchors with shape (N, 4), "xyxy" format. + gts (Tensor): Ground truth bboxes with shape (N, 4), "xyxy" format. + + Returns: + Tensor: Centerness between anchors and gts. + """ + anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2 + anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2 + l_ = anchors_cx - gts[:, 0] + t_ = anchors_cy - gts[:, 1] + r_ = gts[:, 2] - anchors_cx + b_ = gts[:, 3] - anchors_cy + + left_right = torch.stack([l_, r_], dim=1) + top_bottom = torch.stack([t_, b_], dim=1) + return torch.sqrt( + (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) + * (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]), + ) + + def _get_pos_inds(self, labels: Tensor) -> Tensor: + bg_class_ind = self.num_classes + return ((labels >= 0) & (labels < bg_class_ind)).nonzero().squeeze(1) + + def _get_loss_bbox( + self, + pos_bbox_targets: Tensor, + pos_bbox_pred: Tensor, + centerness_targets: Tensor, + ) -> Tensor: + return self.loss_bbox(pos_bbox_pred, pos_bbox_targets, weight=centerness_targets, avg_factor=1.0) + + def _get_loss_centerness( + self, + avg_factor: Tensor, + pos_centerness: Tensor, + centerness_targets: Tensor, + ) -> Tensor: + return self.loss_centerness(pos_centerness, centerness_targets, avg_factor=avg_factor) + + def _get_loss_cls( + self, + cls_score: Tensor, + labels: Tensor, + label_weights: Tensor, + valid_label_mask: Tensor, + avg_factor: Tensor, + ) -> Tensor: + if isinstance(self.loss_cls, CrossSigmoidFocalLoss): + loss_cls = self.loss_cls( + cls_score, + labels, + label_weights, + avg_factor=avg_factor, + valid_label_mask=valid_label_mask, + ) + else: + loss_cls = self.loss_cls(cls_score, labels, label_weights, avg_factor=avg_factor) + return loss_cls diff --git a/src/otx/algo/detection/losses/rtmdet_loss.py b/src/otx/algo/detection/losses/rtmdet_loss.py new file mode 100644 index 00000000000..3b08daa4906 
--- /dev/null +++ b/src/otx/algo/detection/losses/rtmdet_loss.py @@ -0,0 +1,129 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) OpenMMLab. All rights reserved. +# +"""RTMDet criterion.""" + +from __future__ import annotations + +from torch import Tensor, nn + +from otx.algo.common.utils.utils import multi_apply, reduce_mean + + +class RTMDetCriterion(nn.Module): + """RTMDetCriterion is a criterion module for RTM-based object detection. + + Args: + num_classes (int): Number of object classes. + loss_cls (nn.Module): Classification loss module. + loss_bbox (nn.Module): Bounding box regression loss module. + """ + + def __init__(self, num_classes: int, loss_cls: nn.Module, loss_bbox: nn.Module) -> None: + super().__init__() + self.num_classes = num_classes + self.loss_cls = loss_cls + self.loss_bbox = loss_bbox + self.use_sigmoid_cls = loss_cls.use_sigmoid + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + + if self.cls_out_channels <= 0: + msg = f"num_classes={num_classes} is too small" + raise ValueError(msg) + + def forward( + self, + cls_score: Tensor, + bbox_pred: Tensor, + labels: Tensor, + label_weights: Tensor, + bbox_targets: Tensor, + assign_metrics: Tensor, + stride: list[int], + **kwargs, + ) -> dict[str, Tensor]: + """Compute loss of a single scale level. + + Args: + cls_score (Tensor): Box scores for scale levels have shape (N, num_anchors * num_classes, H, W). + bbox_pred (Tensor): Decoded bboxes for scale levels with shape (N, num_anchors * 4, H, W). + labels (Tensor): Labels of anchors with shape (N, num_total_anchors). + label_weights (Tensor): Label weights of anchors with shape (N, num_total_anchors). + bbox_targets (Tensor): BBox regression targets of anchors with shape (N, num_total_anchors, 4). + assign_metrics (Tensor): Assign metrics with shape (N, num_total_anchors). + stride (list[int]): Downsample stride of the feature map. 
+ + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + losses_cls, losses_bbox, cls_avg_factors, bbox_avg_factors = multi_apply( + self._forward, + cls_score, + bbox_pred, + labels, + label_weights, + bbox_targets, + assign_metrics, + stride, + ) + + cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item() + losses_cls = [x / cls_avg_factor for x in losses_cls] + + bbox_avg_factor = reduce_mean(sum(bbox_avg_factors)).clamp_(min=1).item() + losses_bbox = [x / bbox_avg_factor for x in losses_bbox] + return {"loss_cls": losses_cls, "loss_bbox": losses_bbox} + + def _forward( + self, + cls_score: Tensor, + bbox_pred: Tensor, + labels: Tensor, + label_weights: Tensor, + bbox_targets: Tensor, + assign_metrics: Tensor, + stride: list[int], + ) -> tuple[Tensor, ...]: + """Compute loss of a single scale level.""" + if stride[0] != stride[1]: + msg = "h stride is not equal to w stride!" + raise ValueError(msg) + cls_score = cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels).contiguous() + bbox_pred = bbox_pred.reshape(-1, 4) + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + assign_metrics = assign_metrics.reshape(-1) + label_weights = label_weights.reshape(-1) + targets = (labels, assign_metrics) + + loss_cls = self.loss_cls(cls_score, targets, label_weights, avg_factor=1.0) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) & (labels < bg_class_ind)).nonzero().squeeze(1) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + + pos_decode_bbox_pred = pos_bbox_pred + pos_decode_bbox_targets = pos_bbox_targets + + # regression loss + pos_bbox_weight = assign_metrics[pos_inds] + + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + weight=pos_bbox_weight, + avg_factor=1.0, + ) + else: + loss_bbox = bbox_pred.sum() * 0 + pos_bbox_weight = 
bbox_targets.new_tensor(0.0) + + return loss_cls, loss_bbox, assign_metrics.sum(), pos_bbox_weight.sum() diff --git a/src/otx/algo/detection/losses/ssd_loss.py b/src/otx/algo/detection/losses/ssd_loss.py new file mode 100644 index 00000000000..cb553035c69 --- /dev/null +++ b/src/otx/algo/detection/losses/ssd_loss.py @@ -0,0 +1,132 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) OpenMMLab. All rights reserved. +# +"""SSD criterion.""" + +from __future__ import annotations + +from torch import Tensor, nn + +from otx.algo.common.losses import smooth_l1_loss +from otx.algo.common.utils.utils import multi_apply + + +class SSDCriterion(nn.Module): + """SSDCriterion is a loss criterion for Single Shot MultiBox Detector (SSD). + + Args: + num_classes (int): Number of classes including the background class. + bbox_coder (nn.Module): Bounding box coder module. Defaults to None. + neg_pos_ratio (int, optional): Ratio of negative to positive samples. Defaults to 3. + reg_decoded_bbox (bool): If true, the regression loss would be + applied directly on decoded bounding boxes, converting both + the predicted boxes and regression targets to absolute + coordinates format. Defaults to False. It should be `True` when + using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head. + smoothl1_beta (float, optional): Beta parameter for the smooth L1 loss. Defaults to 1.0. 
+ """ + + def __init__( + self, + num_classes: int, + bbox_coder: nn.Module | None = None, + neg_pos_ratio: int = 3, + reg_decoded_bbox: bool = False, + smoothl1_beta: float = 1.0, + ) -> None: + super().__init__() + self.num_classes = num_classes + self.bbox_coder = bbox_coder + self.neg_pos_ratio = neg_pos_ratio + self.reg_decoded_bbox = reg_decoded_bbox + self.smoothl1_beta = smoothl1_beta + + def forward( + self, + cls_score: Tensor, + bbox_pred: Tensor, + anchor: Tensor, + labels: Tensor, + label_weights: Tensor, + bbox_targets: Tensor, + bbox_weights: Tensor, + avg_factor: int, + ) -> dict[str, Tensor]: + """Compute losses of images. + + Args: + cls_score (Tensor): Box scores for images have shape (N, num_total_anchors, num_classes). + bbox_pred (Tensor): Box energies / deltas for image levels with shape (N, num_total_anchors, 4). + anchors (Tensor): Box reference for for scale levels with shape (N, num_total_anchors, 4). + labels (Tensor): Labels of anchors with shape (N, num_total_anchors). + label_weights (Tensor): Label weights of anchors with shape (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of anchors with shape (N, num_total_anchors, 4). + bbox_weights (Tensor): BBox regression loss weights of anchors with shape (N, num_total_anchors, 4). + avg_factor (int): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + Returns: + dict[str, Tensor]: A dictionary of loss components. the dict + has components below: + + - loss_cls (list[Tensor]): A list containing each feature map \ + classification loss. + - loss_bbox (list[Tensor]): A list containing each feature map \ + regression loss. 
+ """ + losses_cls, losses_bbox = multi_apply( + self._forward, + cls_score, + bbox_pred, + anchor, + labels, + label_weights, + bbox_targets, + bbox_weights, + avg_factor=avg_factor, + ) + return {"loss_cls": losses_cls, "loss_bbox": losses_bbox} + + def _forward( + self, + cls_score: Tensor, + bbox_pred: Tensor, + anchor: Tensor, + labels: Tensor, + label_weights: Tensor, + bbox_targets: Tensor, + bbox_weights: Tensor, + avg_factor: int, + ) -> tuple[Tensor, Tensor]: + """Compute loss of a single image.""" + loss_cls_all = nn.functional.cross_entropy(cls_score, labels, reduction="none") * label_weights + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + pos_inds = ((labels >= 0) & (labels < self.num_classes)).nonzero(as_tuple=False).reshape(-1) + neg_inds = (labels == self.num_classes).nonzero(as_tuple=False).view(-1) + + num_pos_samples = pos_inds.size(0) + num_neg_samples = self.neg_pos_ratio * num_pos_samples + num_neg_samples = min(num_neg_samples, neg_inds.size(0)) + topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples) + loss_cls_pos = loss_cls_all[pos_inds].sum() + loss_cls_neg = topk_loss_cls_neg.sum() + loss_cls = (loss_cls_pos + loss_cls_neg) / avg_factor + + if self.reg_decoded_bbox and self.bbox_coder: + # When the regression loss (e.g. `IouLoss`, `GIouLoss`) + # is applied directly on the decoded bounding boxes, it + # decodes the already encoded coordinates to absolute format. 
+ bbox_pred = self.bbox_coder.decode(anchor, bbox_pred) + + loss_bbox = smooth_l1_loss( + bbox_pred, + bbox_targets, + bbox_weights, + beta=self.smoothl1_beta, + avg_factor=avg_factor, + ) + return loss_cls[None], loss_bbox diff --git a/src/otx/algo/detection/losses/yolox_loss.py b/src/otx/algo/detection/losses/yolox_loss.py new file mode 100644 index 00000000000..b26d713e45e --- /dev/null +++ b/src/otx/algo/detection/losses/yolox_loss.py @@ -0,0 +1,110 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) OpenMMLab. All rights reserved. +# +"""YOLOX criterion.""" + +from __future__ import annotations + +from torch import Tensor, nn + +from otx.algo.common.losses.cross_entropy_loss import CrossEntropyLoss +from otx.algo.common.losses.smooth_l1_loss import L1Loss +from otx.algo.detection.losses.iou_loss import IoULoss + + +class YOLOXCriterion(nn.Module): + """YOLOX criterion module. + + This module calculates the loss for YOLOX object detection model. + + Args: + num_classes (int): The number of classes. + loss_cls (nn.Module | None): The classification loss module. Defaults to None. + loss_bbox (nn.Module | None): The bounding box regression loss module. Defaults to None. + loss_obj (nn.Module | None): The objectness loss module. Defaults to None. + loss_l1 (nn.Module | None): The L1 loss module. Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary containing the calculated losses. 
+ + """ + + def __init__( + self, + num_classes: int, + loss_cls: nn.Module | None = None, + loss_bbox: nn.Module | None = None, + loss_obj: nn.Module | None = None, + loss_l1: nn.Module | None = None, + use_l1: bool = False, + ) -> None: + super().__init__() + self.num_classes = num_classes + self.loss_cls = loss_cls or CrossEntropyLoss(use_sigmoid=True, reduction="sum", loss_weight=1.0) + self.loss_bbox = loss_bbox or IoULoss(mode="square", eps=1e-16, reduction="sum", loss_weight=5.0) + self.loss_obj = loss_obj or CrossEntropyLoss(use_sigmoid=True, reduction="sum", loss_weight=1.0) + self.loss_l1 = loss_l1 or L1Loss(reduction="sum", loss_weight=1.0) + self.use_l1 = use_l1 + + def forward( + self, + flatten_objectness: Tensor, + flatten_cls_preds: Tensor, + flatten_bbox_preds: Tensor, + flatten_bboxes: Tensor, + obj_targets: Tensor, + cls_targets: Tensor, + bbox_targets: Tensor, + l1_targets: Tensor, + num_total_samples: Tensor, + num_pos: Tensor, + pos_masks: Tensor, + ) -> dict[str, Tensor]: + """Forward pass of the YOLOX criterion module. + + Args: + flatten_objectness (Tensor): Flattened objectness predictions. + flatten_cls_preds (Tensor): Flattened class predictions. + flatten_bbox_preds (Tensor): Flattened bounding box predictions. + flatten_bboxes (Tensor): Flattened ground truth bounding boxes. + obj_targets (Tensor): Objectness targets. + cls_targets (Tensor): Class targets. + bbox_targets (Tensor): Bounding box targets. + l1_targets (Tensor): L1 targets. + num_total_samples (Tensor): Total number of samples. + num_pos (Tensor): Number of positive samples. + pos_masks (Tensor): Positive masks. + + Returns: + dict[str, Tensor]: A dictionary containing the calculated losses. 
+ + """ + loss_obj = self.loss_obj(flatten_objectness.view(-1, 1), obj_targets) / num_total_samples + if num_pos > 0: + loss_cls = ( + self.loss_cls(flatten_cls_preds.view(-1, self.num_classes)[pos_masks], cls_targets) / num_total_samples + ) + loss_bbox = self.loss_bbox(flatten_bboxes.view(-1, 4)[pos_masks], bbox_targets) / num_total_samples + else: + # Avoid cls and reg branch not participating in the gradient + # propagation when there is no ground-truth in the images. + # For more details, please refer to + # https://github.com/open-mmlab/mmdetection/issues/7298 + loss_cls = flatten_cls_preds.sum() * 0 + loss_bbox = flatten_bboxes.sum() * 0 + + loss_dict = {"loss_cls": loss_cls, "loss_bbox": loss_bbox, "loss_obj": loss_obj} + + if self.use_l1: + if num_pos > 0: + loss_l1 = self.loss_l1(flatten_bbox_preds.view(-1, 4)[pos_masks], l1_targets) / num_total_samples + else: + # Avoid cls and reg branch not participating in the gradient + # propagation when there is no ground-truth in the images. + # For more details, please refer to + # https://github.com/open-mmlab/mmdetection/issues/7298 + loss_l1 = flatten_bbox_preds.sum() * 0 + loss_dict.update(loss_l1=loss_l1) + + return loss_dict diff --git a/src/otx/algo/detection/necks/cspnext_pafpn.py b/src/otx/algo/detection/necks/cspnext_pafpn.py index 4f9462d8db3..e1067d08bcd 100644 --- a/src/otx/algo/detection/necks/cspnext_pafpn.py +++ b/src/otx/algo/detection/necks/cspnext_pafpn.py @@ -12,7 +12,7 @@ import math from functools import partial -from typing import Callable +from typing import Any, Callable, ClassVar import torch from torch import Tensor, nn @@ -24,7 +24,7 @@ from otx.algo.modules.norm import build_norm_layer -class CSPNeXtPAFPN(BaseModule): +class CSPNeXtPAFPNModule(BaseModule): """Path Aggregation Network with CSPNeXt blocks. 
Args: @@ -179,3 +179,33 @@ def forward(self, inputs: tuple[Tensor, ...]) -> tuple[Tensor, ...]: outs[idx] = conv(outs[idx]) return tuple(outs) + + +class CSPNeXtPAFPN: + """CSPNeXtPAFPN factory for detection.""" + + CSPNEXTPAFPN_CFG: ClassVar[dict[str, Any]] = { + "rtmdet_tiny": { + "in_channels": (96, 192, 384), + "out_channels": 96, + "num_csp_blocks": 1, + "normalization": nn.BatchNorm2d, + "activation": partial(nn.SiLU, inplace=True), + }, + "rtmdet_inst_tiny": { + "in_channels": (96, 192, 384), + "out_channels": 96, + "num_csp_blocks": 1, + "expand_ratio": 0.5, + "normalization": nn.BatchNorm2d, + "activation": partial(nn.SiLU, inplace=True), + }, + } + + def __new__(cls, model_name: str) -> CSPNeXtPAFPNModule: + """Constructor for CSPNeXtPAFPN.""" + if model_name not in cls.CSPNEXTPAFPN_CFG: + msg = f"model type '{model_name}' is not supported" + raise KeyError(msg) + + return CSPNeXtPAFPNModule(**cls.CSPNEXTPAFPN_CFG[model_name]) diff --git a/src/otx/algo/detection/necks/fpn.py b/src/otx/algo/detection/necks/fpn.py index f9eb73f1e88..e5274598fd2 100644 --- a/src/otx/algo/detection/necks/fpn.py +++ b/src/otx/algo/detection/necks/fpn.py @@ -10,7 +10,7 @@ from __future__ import annotations -from typing import Callable +from typing import Any, Callable, ClassVar from torch import Tensor, nn @@ -20,7 +20,7 @@ from otx.algo.modules.norm import build_norm_layer -class FPN(BaseModule): +class FPNModule(BaseModule): r"""Feature Pyramid Network. This is an implementation of paper `Feature Pyramid Networks for Object Detection `_. 
@@ -200,3 +200,34 @@ def forward(self, inputs: tuple[Tensor]) -> tuple: else: outs.append(self.fpn_convs[i](outs[-1])) return tuple(outs) + + +class FPN: + """FPN factory for detection.""" + + FPN_CFG: ClassVar[dict[str, Any]] = { + "atss_mobilenetv2": { + "in_channels": [24, 32, 96, 320], + "out_channels": 64, + "num_outs": 5, + "start_level": 1, + "add_extra_convs": "on_output", + "relu_before_extra_convs": True, + }, + "atss_resnext101": { + "in_channels": [256, 512, 1024, 2048], + "out_channels": 256, + "num_outs": 5, + "start_level": 1, + "add_extra_convs": "on_output", + "relu_before_extra_convs": True, + }, + } + + def __new__(cls, model_name: str) -> FPNModule: + """Constructor for FPN.""" + if model_name not in cls.FPN_CFG: + msg = f"model type '{model_name}' is not supported" + raise KeyError(msg) + + return FPNModule(**cls.FPN_CFG[model_name]) diff --git a/src/otx/algo/detection/necks/hybrid_encoder.py b/src/otx/algo/detection/necks/hybrid_encoder.py index 9df5ca7ef74..64c3b56ed74 100644 --- a/src/otx/algo/detection/necks/hybrid_encoder.py +++ b/src/otx/algo/detection/necks/hybrid_encoder.py @@ -7,7 +7,7 @@ import copy from functools import partial -from typing import Callable +from typing import Any, Callable, ClassVar import torch from torch import nn @@ -100,7 +100,7 @@ def forward( return output -class HybridEncoder(BaseModule): +class HybridEncoderModule(BaseModule): """HybridEncoder for RTDetr. 
Args: @@ -319,3 +319,28 @@ def forward(self, feats: torch.Tensor) -> list[torch.Tensor]: outs.append(out) return outs + + +class HybridEncoder: + """HybridEncoder factory for detection.""" + + HYBRIDENCODER_CFG: ClassVar[dict[str, Any]] = { + "rtdetr_18": { + "in_channels": [128, 256, 512], + "expansion": 0.5, + }, + "rtdetr_50": {}, + "rtdetr_101": { + "hidden_dim": 384, + "dim_feedforward": 2048, + "in_channels": [512, 1024, 2048], + }, + } + + def __new__(cls, model_name: str, eval_spatial_size: tuple[int, int] | None = None) -> HybridEncoderModule: + """Constructor for HybridEncoder.""" + if model_name not in cls.HYBRIDENCODER_CFG: + msg = f"model type '{model_name}' is not supported" + raise KeyError(msg) + + return HybridEncoderModule(**cls.HYBRIDENCODER_CFG[model_name], eval_spatial_size=eval_spatial_size) diff --git a/src/otx/algo/detection/necks/yolox_pafpn.py b/src/otx/algo/detection/necks/yolox_pafpn.py index 34e7d878660..c8dee87e07f 100644 --- a/src/otx/algo/detection/necks/yolox_pafpn.py +++ b/src/otx/algo/detection/necks/yolox_pafpn.py @@ -10,7 +10,7 @@ import math from functools import partial -from typing import Any, Callable +from typing import Any, Callable, ClassVar import torch from torch import Tensor, nn @@ -22,7 +22,7 @@ from otx.algo.modules.norm import build_norm_layer -class YOLOXPAFPN(BaseModule): +class YOLOXPAFPNModule(BaseModule): """Path Aggregation Network used in YOLOX. 
Args: @@ -172,3 +172,37 @@ def forward(self, inputs: tuple[Tensor]) -> tuple[Any, ...]: outs[idx] = conv(outs[idx]) return tuple(outs) + + +class YOLOXPAFPN: + """YOLOXPAFPN factory for detection.""" + + YOLOXPAFPN_CFG: ClassVar[dict[str, Any]] = { + "yolox_tiny": { + "in_channels": [96, 192, 384], + "out_channels": 96, + "num_csp_blocks": 1, + }, + "yolox_s": { + "in_channels": [128, 256, 512], + "out_channels": 128, + "num_csp_blocks": 1, + }, + "yolox_l": { + "in_channels": [256, 512, 1024], + "out_channels": 256, + }, + "yolox_x": { + "in_channels": [320, 640, 1280], + "out_channels": 320, + "num_csp_blocks": 4, + }, + } + + def __new__(cls, model_name: str) -> YOLOXPAFPNModule: + """Constructor for YOLOXPAFPN.""" + if model_name not in cls.YOLOXPAFPN_CFG: + msg = f"model type '{model_name}' is not supported" + raise KeyError(msg) + + return YOLOXPAFPNModule(**cls.YOLOXPAFPN_CFG[model_name]) diff --git a/src/otx/algo/detection/rtdetr.py b/src/otx/algo/detection/rtdetr.py index 9f487c04be3..5c914531bbe 100644 --- a/src/otx/algo/detection/rtdetr.py +++ b/src/otx/algo/detection/rtdetr.py @@ -7,8 +7,7 @@ import copy import re -from functools import partial -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal import torch from torch import Tensor, nn @@ -16,10 +15,9 @@ from torchvision.tv_tensors import BoundingBoxFormat from otx.algo.detection.backbones import PResNet -from otx.algo.detection.base_models.detection_transformer import DETR +from otx.algo.detection.detectors import DETR from otx.algo.detection.heads import RTDETRTransformer from otx.algo.detection.necks import HybridEncoder -from otx.algo.modules.norm import FrozenBatchNorm2d, build_norm_layer from otx.core.config.data import TileConfig from otx.core.data.entity.base import OTXBatchLossEntity from otx.core.data.entity.detection import DetBatchDataEntity, DetBatchPredEntity @@ -37,16 +35,29 @@ from otx.core.types.label import LabelInfoTypes +PRETRAINED_ROOT: str = 
"https://github.com/lyuwenyu/storage/releases/download/v0.1/" + +PRETRAINED_WEIGHTS: dict[str, str] = { + "rtdetr_18": PRETRAINED_ROOT + "rtdetr_r18vd_5x_coco_objects365_from_paddle.pth", + "rtdetr_50": PRETRAINED_ROOT + "rtdetr_r50vd_2x_coco_objects365_from_paddle.pth", + "rtdetr_101": PRETRAINED_ROOT + "rtdetr_r101vd_2x_coco_objects365_from_paddle.pth", +} + + class RTDETR(ExplainableOTXDetModel): - """RTDETR model.""" + """OTX Detection model class for RTDETR. + + Default input size per model: + - ssd_mobilenetv2 : (640, 640) + """ input_size_multiplier = 32 mean: tuple[float, float, float] = (0.0, 0.0, 0.0) std: tuple[float, float, float] = (255.0, 255.0, 255.0) - load_from: str | None = None def __init__( self, + model_name: Literal["rtdetr_18", "rtdetr_50", "rtdetr_101"], label_info: LabelInfoTypes, input_size: tuple[int, int] = (640, 640), optimizer: OptimizerCallable = DefaultOptimizerCallable, @@ -55,7 +66,9 @@ def __init__( torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), ) -> None: + self.load_from: str = PRETRAINED_WEIGHTS[model_name] super().__init__( + model_name=model_name, label_info=label_info, input_size=input_size, optimizer=optimizer, @@ -65,6 +78,36 @@ def __init__( tile_config=tile_config, ) + def _build_model(self, num_classes: int) -> DETR: + backbone = PResNet(model_name=self.model_name) + encoder = HybridEncoder( + model_name=self.model_name, + eval_spatial_size=self.input_size, + ) + decoder = RTDETRTransformer( + model_name=self.model_name, + num_classes=num_classes, + eval_spatial_size=self.input_size, + ) + + optimizer_configuration = [ + # no weight decay for norm layers in backbone + {"params": "^(?=.*backbone)(?=.*norm).*$", "weight_decay": 0.0, "lr": 0.00001}, + # lr for the backbone, but not norm layers is 0.00001 + {"params": "^(?=.*backbone)(?!.*norm).*$", "lr": 0.00001}, + # no weight decay for norm layers and biases in encoder and decoder layers + {"params": 
"^(?=.*(?:encoder|decoder))(?=.*(?:norm|bias)).*$", "weight_decay": 0.0}, + ] + + return DETR( + backbone=backbone, + encoder=encoder, + decoder=decoder, + num_classes=num_classes, + optimizer_configuration=optimizer_configuration, + input_size=self.input_size[0], + ) + def _customize_inputs( self, entity: DetBatchDataEntity, @@ -226,141 +269,3 @@ def _exporter(self) -> OTXModelExporter: def _optimization_config(self) -> dict[str, Any]: """PTQ config for RT-DETR.""" return {"model_type": "transformer"} - - -class RTDETR18(RTDETR): - """RT-DETR with ResNet-18 backbone.""" - - load_from = ( - "https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r18vd_5x_coco_objects365_from_paddle.pth" - ) - - def _build_model(self, num_classes: int) -> nn.Module: - backbone = PResNet( - depth=18, - pretrained=True, - return_idx=[1, 2, 3], - ) - encoder = HybridEncoder( - in_channels=[128, 256, 512], - expansion=0.5, - eval_spatial_size=self.input_size, - ) - decoder = RTDETRTransformer( - num_classes=num_classes, - num_decoder_layers=3, - feat_channels=[256, 256, 256], - eval_spatial_size=self.input_size, - ) - - optimizer_configuration = [ - # no weight decay for norm layers in backbone - {"params": "^(?=.*backbone)(?=.*norm).*$", "weight_decay": 0.0, "lr": 0.00001}, - # lr for the backbone, but not norm layers is 0.00001 - {"params": "^(?=.*backbone)(?!.*norm).*$", "lr": 0.00001}, - # no weight decay for norm layers and biases in encoder and decoder layers - {"params": "^(?=.*(?:encoder|decoder))(?=.*(?:norm|bias)).*$", "weight_decay": 0.0}, - ] - - return DETR( - backbone=backbone, - encoder=encoder, - decoder=decoder, - num_classes=num_classes, - optimizer_configuration=optimizer_configuration, - input_size=self.input_size[0], - ) - - -class RTDETR50(RTDETR): - """RT-DETR with ResNet-50 backbone.""" - - load_from = ( - "https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r50vd_2x_coco_objects365_from_paddle.pth" - ) - - def 
_build_model(self, num_classes: int) -> nn.Module: - backbone = PResNet( - depth=50, - return_idx=[1, 2, 3], - pretrained=True, - freeze_at=0, - normalization=partial(build_norm_layer, FrozenBatchNorm2d, layer_name="norm"), - ) - encoder = HybridEncoder( - eval_spatial_size=self.input_size, - ) - decoder = RTDETRTransformer( - num_classes=num_classes, - feat_channels=[256, 256, 256], - eval_spatial_size=self.input_size, - num_decoder_layers=6, - ) - - optimizer_configuration = [ - # lr for all layers in backbone is 0.00001 - {"params": "backbone", "lr": 0.00001}, - # no weight decay for norm layers and biases in decoder - {"params": "^(?=.*decoder(?=.*bias|.*norm.*weight)).*$", "weight_decay": 0.0}, - # no weight decay for norm layers and biases in encoder - {"params": "^(?=.*encoder(?=.*bias|.*norm.*weight)).*$", "weight_decay": 0.0}, - ] - - return DETR( - backbone=backbone, - encoder=encoder, - decoder=decoder, - num_classes=num_classes, - optimizer_configuration=optimizer_configuration, - input_size=self.input_size[0], - ) - - -class RTDETR101(RTDETR): - """RT-DETR with ResNet-101 backbone.""" - - load_from = ( - "https://github.com/lyuwenyu/storage/releases/download/v0.1/rtdetr_r101vd_2x_coco_objects365_from_paddle.pth" - ) - - def _build_model(self, num_classes: int) -> nn.Module: - backbone = PResNet( - depth=101, - return_idx=[1, 2, 3], - normalization=partial(build_norm_layer, FrozenBatchNorm2d, layer_name="norm"), - pretrained=True, - freeze_at=0, - ) - - encoder = HybridEncoder( - hidden_dim=384, - dim_feedforward=2048, - in_channels=[512, 1024, 2048], - eval_spatial_size=self.input_size, - ) - - decoder = RTDETRTransformer( - num_classes=num_classes, - feat_channels=[384, 384, 384], - eval_spatial_size=self.input_size, - ) - - # no bias decay and learning rate correction for the backbone. - # Without this correction gradients explosion will take place. 
- optimizer_configuration = [ - # lr for all layers in backbone is 0.000001 - {"params": "backbone", "lr": 0.000001}, - # no weight decay for norm layers and biases in encoder - {"params": "^(?=.*encoder(?=.*bias|.*norm.*weight)).*$", "weight_decay": 0.0}, - # no weight decay for norm layers and biases in decoder - {"params": "^(?=.*decoder(?=.*bias|.*norm.*weight)).*$", "weight_decay": 0.0}, - ] - - return DETR( - backbone=backbone, - encoder=encoder, - decoder=decoder, - num_classes=num_classes, - optimizer_configuration=optimizer_configuration, - input_size=self.input_size[0], - ) diff --git a/src/otx/algo/detection/rtmdet.py b/src/otx/algo/detection/rtmdet.py index ec236f07e5b..19c39db9ccf 100644 --- a/src/otx/algo/detection/rtmdet.py +++ b/src/otx/algo/detection/rtmdet.py @@ -5,20 +5,17 @@ from __future__ import annotations -from functools import partial -from typing import TYPE_CHECKING - -from torch import nn +from typing import TYPE_CHECKING, Literal from otx.algo.common.backbones import CSPNeXt from otx.algo.common.losses import GIoULoss, QualityFocalLoss -from otx.algo.common.losses.cross_entropy_loss import CrossEntropyLoss from otx.algo.common.utils.assigners import DynamicSoftLabelAssigner from otx.algo.common.utils.coders import DistancePointBBoxCoder from otx.algo.common.utils.prior_generators import MlvlPointGenerator from otx.algo.common.utils.samplers import PseudoSampler -from otx.algo.detection.base_models import SingleStageDetector +from otx.algo.detection.detectors import SingleStageDetector from otx.algo.detection.heads import RTMDetSepBNHead +from otx.algo.detection.losses import RTMDetCriterion from otx.algo.detection.necks import CSPNeXtPAFPN from otx.core.config.data import TileConfig from otx.core.exporter.base import OTXModelExporter @@ -36,13 +33,29 @@ from otx.core.types.label import LabelInfoTypes +PRETRAINED_ROOT: ( + str +) = "https://storage.openvinotoolkit.org/repositories/openvino_training_extensions/models/object_detection/v2/" 
+ +PRETRAINED_WEIGHTS: dict[str, str] = { + "rtmdet_tiny": PRETRAINED_ROOT + "rtmdet_tiny.pth", +} + + class RTMDet(ExplainableOTXDetModel): - """OTX Detection model class for RTMDet.""" + """OTX Detection model class for RTMDet. + + Default input size per model: + - rtmdet_tiny : (640, 640) + """ input_size_multiplier = 32 + mean: tuple[float, float, float] = (103.53, 116.28, 123.675) + std: tuple[float, float, float] = (57.375, 57.12, 58.395) def __init__( self, + model_name: Literal["rtmdet_tiny"], label_info: LabelInfoTypes, input_size: tuple[int, int] = (640, 640), optimizer: OptimizerCallable = DefaultOptimizerCallable, @@ -51,7 +64,9 @@ def __init__( torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), ) -> None: + self.load_from: str = PRETRAINED_WEIGHTS[model_name] super().__init__( + model_name=model_name, label_info=label_info, input_size=input_size, optimizer=optimizer, @@ -61,6 +76,48 @@ def __init__( tile_config=tile_config, ) + def _build_model(self, num_classes: int) -> SingleStageDetector: + train_cfg = { + "assigner": DynamicSoftLabelAssigner(topk=13), + "sampler": PseudoSampler(), + "allowed_border": -1, + "pos_weight": -1, + "debug": False, + } + + test_cfg = { + "nms": {"type": "nms", "iou_threshold": 0.65}, + "score_thr": 0.001, + "mask_thr_binary": 0.5, + "max_per_img": 300, + "min_bbox_size": 0, + "nms_pre": 30000, + } + + backbone = CSPNeXt(model_name=self.model_name) + neck = CSPNeXtPAFPN(model_name=self.model_name) + bbox_head = RTMDetSepBNHead( + model_name=self.model_name, + num_classes=num_classes, + anchor_generator=MlvlPointGenerator(offset=0, strides=[8, 16, 32]), + bbox_coder=DistancePointBBoxCoder(), + train_cfg=train_cfg, # TODO (sungchul, kirill): remove + test_cfg=test_cfg, # TODO (sungchul, kirill): remove + ) + criterion = RTMDetCriterion( + num_classes=num_classes, + loss_cls=QualityFocalLoss(use_sigmoid=True, beta=2.0, loss_weight=1.0), + loss_bbox=GIoULoss(loss_weight=2.0), + ) + return 
SingleStageDetector( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + criterion=criterion, + train_cfg=train_cfg, # TODO (sungchul, kirill): remove + test_cfg=test_cfg, # TODO (sungchul, kirill): remove + ) + @property def _exporter(self) -> OTXModelExporter: """Creates OTXModelExporter object that can export the model.""" @@ -94,69 +151,3 @@ def _exporter(self) -> OTXModelExporter: def _export_parameters(self) -> TaskLevelExportParameters: """Defines parameters required to export a particular model implementation.""" return super()._export_parameters.wrap(optimization_config={"preset": "mixed"}) - - -class RTMDetTiny(RTMDet): - """RTMDet Tiny Model.""" - - load_from = "https://storage.openvinotoolkit.org/repositories/openvino_training_extensions/models/object_detection/v2/rtmdet_tiny.pth" - mean = (103.53, 116.28, 123.675) - std = (57.375, 57.12, 58.395) - - def _build_model(self, num_classes: int) -> RTMDet: - train_cfg = { - "assigner": DynamicSoftLabelAssigner(topk=13), - "sampler": PseudoSampler(), - "allowed_border": -1, - "pos_weight": -1, - "debug": False, - } - - test_cfg = { - "nms": {"type": "nms", "iou_threshold": 0.65}, - "score_thr": 0.001, - "mask_thr_binary": 0.5, - "max_per_img": 300, - "min_bbox_size": 0, - "nms_pre": 30000, - } - - backbone = CSPNeXt( - deepen_factor=0.167, - widen_factor=0.375, - normalization=nn.BatchNorm2d, - activation=partial(nn.SiLU, inplace=True), - ) - - neck = CSPNeXtPAFPN( - in_channels=(96, 192, 384), - out_channels=96, - num_csp_blocks=1, - normalization=nn.BatchNorm2d, - activation=partial(nn.SiLU, inplace=True), - ) - - bbox_head = RTMDetSepBNHead( - num_classes=num_classes, - in_channels=96, - stacked_convs=2, - feat_channels=96, - with_objectness=False, - anchor_generator=MlvlPointGenerator(offset=0, strides=[8, 16, 32]), - bbox_coder=DistancePointBBoxCoder(), - loss_cls=QualityFocalLoss(use_sigmoid=True, beta=2.0, loss_weight=1.0), - loss_bbox=GIoULoss(loss_weight=2.0), - 
loss_centerness=CrossEntropyLoss(use_sigmoid=True, loss_weight=1.0), - normalization=nn.BatchNorm2d, - activation=partial(nn.SiLU, inplace=True), - train_cfg=train_cfg, - test_cfg=test_cfg, - ) - - return SingleStageDetector( - backbone=backbone, - neck=neck, - bbox_head=bbox_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - ) diff --git a/src/otx/algo/detection/ssd.py b/src/otx/algo/detection/ssd.py index f6aa62b6cea..2c3e8b01b65 100644 --- a/src/otx/algo/detection/ssd.py +++ b/src/otx/algo/detection/ssd.py @@ -10,16 +10,16 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal import numpy as np from datumaro.components.annotation import Bbox -from otx.algo.common.backbones import build_model_including_pytorchcv from otx.algo.common.utils.assigners import MaxIoUAssigner from otx.algo.common.utils.coders import DeltaXYWHBBoxCoder -from otx.algo.detection.base_models import SingleStageDetector +from otx.algo.detection.detectors import SingleStageDetector from otx.algo.detection.heads import SSDHead +from otx.algo.detection.losses import SSDCriterion from otx.algo.detection.utils.prior_generators import SSDAnchorGeneratorClustered from otx.algo.utils.support_otx_v1 import OTXv1Helper from otx.core.config.data import TileConfig @@ -32,6 +32,7 @@ if TYPE_CHECKING: import torch from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + from torch import nn from otx.core.data.dataset.base import OTXDataset from otx.core.metrics import MetricCallable @@ -42,18 +43,30 @@ logger = logging.getLogger() +AVAILABLE_MODEL_VERSIONS: list[str] = ["ssd_mobilenetv2"] + +PRETRAINED_ROOT: ( + str +) = "https://storage.openvinotoolkit.org/repositories/openvino_training_extensions/models/object_detection/v2/" + +PRETRAINED_WEIGHTS: dict[str, str] = { + "ssd_mobilenetv2": PRETRAINED_ROOT + "mobilenet_v2-2s_ssd-992x736.pth", +} + + class SSD(ExplainableOTXDetModel): - """Detecion model 
class for SSD.""" + """OTX Detection model class for SSD. + + Default input size per model: + - ssd_mobilenetv2 : (864, 864) + """ - load_from = ( - "https://storage.openvinotoolkit.org/repositories/openvino_training_extensions" - "/models/object_detection/v2/mobilenet_v2-2s_ssd-992x736.pth" - ) - mean = (0.0, 0.0, 0.0) - std = (255.0, 255.0, 255.0) + mean: tuple[float, float, float] = (0.0, 0.0, 0.0) + std: tuple[float, float, float] = (255.0, 255.0, 255.0) def __init__( self, + model_name: Literal["ssd_mobilenetv2"], label_info: LabelInfoTypes, input_size: tuple[int, int] = (864, 864), optimizer: OptimizerCallable = DefaultOptimizerCallable, @@ -62,7 +75,9 @@ def __init__( torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), ) -> None: + self.load_from: str = PRETRAINED_WEIGHTS[model_name] super().__init__( + model_name=model_name, label_info=label_info, input_size=input_size, optimizer=optimizer, @@ -81,10 +96,8 @@ def _build_model(self, num_classes: int) -> SingleStageDetector: pos_iou_thr=0.4, neg_iou_thr=0.4, ), - "smoothl1_beta": 1.0, "allowed_border": -1, "pos_weight": -1, - "neg_pos_ratio": 3, "debug": False, "use_giou": False, "use_focal": False, @@ -95,16 +108,10 @@ def _build_model(self, num_classes: int) -> SingleStageDetector: "score_thr": 0.02, "max_per_img": 200, } - backbone = build_model_including_pytorchcv( - cfg={ - "type": "mobilenetv2_w1", - "out_indices": [4, 5], - "frozen_stages": -1, - "norm_eval": False, - "pretrained": True, - }, - ) + backbone = self._build_backbone(model_name=self.model_name) bbox_head = SSDHead( + model_name=self.model_name, + num_classes=num_classes, anchor_generator=SSDAnchorGeneratorClustered( strides=[16, 32], widths=[ @@ -120,14 +127,45 @@ def _build_model(self, num_classes: int) -> SingleStageDetector: target_means=(0.0, 0.0, 0.0, 0.0), target_stds=(0.1, 0.1, 0.2, 0.2), ), + init_cfg={ + "type": "Xavier", + "layer": "Conv2d", + "distribution": "uniform", + }, # TODO (sungchul, 
kirill): remove + train_cfg=train_cfg, # TODO (sungchul, kirill): remove + test_cfg=test_cfg, # TODO (sungchul, kirill): remove + ) + criterion = SSDCriterion( num_classes=num_classes, - in_channels=(96, 320), - use_depthwise=True, - init_cfg={"type": "Xavier", "layer": "Conv2d", "distribution": "uniform"}, - train_cfg=train_cfg, - test_cfg=test_cfg, + bbox_coder=DeltaXYWHBBoxCoder( + target_means=(0.0, 0.0, 0.0, 0.0), + target_stds=(0.1, 0.1, 0.2, 0.2), + ), + ) + return SingleStageDetector( + backbone=backbone, + bbox_head=bbox_head, + criterion=criterion, + train_cfg=train_cfg, # TODO (sungchul, kirill): remove + test_cfg=test_cfg, # TODO (sungchul, kirill): remove ) - return SingleStageDetector(backbone, bbox_head, train_cfg=train_cfg, test_cfg=test_cfg) + + def _build_backbone(self, model_name: str) -> nn.Module: + if "mobilenetv2" in model_name: + from otx.algo.common.backbones import build_model_including_pytorchcv + + return build_model_including_pytorchcv( + cfg={ + "type": "mobilenetv2_w1", + "out_indices": [4, 5], + "frozen_stages": -1, + "norm_eval": False, + "pretrained": True, + }, + ) + + msg = f"Unknown backbone name: {model_name}" + raise ValueError(msg) def setup(self, stage: str) -> None: """Callback for setup OTX SSD Model. 
diff --git a/src/otx/algo/detection/yolox.py b/src/otx/algo/detection/yolox.py index fd1a8765cad..794d2b7b436 100644 --- a/src/otx/algo/detection/yolox.py +++ b/src/otx/algo/detection/yolox.py @@ -5,13 +5,13 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal from otx.algo.common.losses import CrossEntropyLoss, L1Loss from otx.algo.detection.backbones import CSPDarknet -from otx.algo.detection.base_models import SingleStageDetector +from otx.algo.detection.detectors import SingleStageDetector from otx.algo.detection.heads import YOLOXHead -from otx.algo.detection.losses import IoULoss +from otx.algo.detection.losses import IoULoss, YOLOXCriterion from otx.algo.detection.necks import YOLOXPAFPN from otx.algo.detection.utils.assigners import SimOTAAssigner from otx.algo.utils.support_otx_v1 import OTXv1Helper @@ -35,13 +35,36 @@ from otx.core.types.label import LabelInfoTypes +PRETRAINED_ROOT: dict[str, str] = { + "openvino": "https://storage.openvinotoolkit.org/repositories/openvino_training_extensions/models/object_detection/v2/", + "mmdet": "https://download.openmmlab.com/mmdetection/v2.0/yolox/", +} + +PRETRAINED_WEIGHTS: dict[str, str] = { + "yolox_tiny": PRETRAINED_ROOT["openvino"] + "yolox_tiny_8x8.pth", + "yolox_s": PRETRAINED_ROOT["mmdet"] + "yolox_s_8x8_300e_coco/yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth", + "yolox_l": PRETRAINED_ROOT["mmdet"] + "yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth", + "yolox_x": PRETRAINED_ROOT["mmdet"] + "yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth", +} + + class YOLOX(ExplainableOTXDetModel): - """OTX Detection model class for YOLOX.""" + """OTX Detection model class for YOLOX. 
+ + Default input size per model: + - yolox_tiny : (416, 416) + - yolox_s : (640, 640) + - yolox_l : (640, 640) + - yolox_x : (640, 640) + """ input_size_multiplier = 32 + mean: tuple[float, float, float] + std: tuple[float, float, float] def __init__( self, + model_name: Literal["yolox_tiny", "yolox_s", "yolox_l", "yolox_x"], label_info: LabelInfoTypes, input_size: tuple[int, int] = (640, 640), optimizer: OptimizerCallable = DefaultOptimizerCallable, @@ -50,7 +73,9 @@ def __init__( torch_compile: bool = False, tile_config: TileConfig = TileConfig(enable_tiler=False), ) -> None: + self.load_from: str = PRETRAINED_WEIGHTS[model_name] super().__init__( + model_name=model_name, label_info=label_info, input_size=input_size, optimizer=optimizer, @@ -60,6 +85,44 @@ def __init__( tile_config=tile_config, ) + if model_name == "yolox_tiny": + self.mean = (123.675, 116.28, 103.53) + self.std = (58.395, 57.12, 57.375) + else: + self.mean = (0.0, 0.0, 0.0) + self.std = (1.0, 1.0, 1.0) + + def _build_model(self, num_classes: int) -> SingleStageDetector: + train_cfg: dict[str, Any] = {"assigner": SimOTAAssigner(center_radius=2.5)} + test_cfg = { + "nms": {"type": "nms", "iou_threshold": 0.65}, + "score_thr": 0.01, + "max_per_img": 100, + } + backbone = CSPDarknet(model_name=self.model_name) + neck = YOLOXPAFPN(model_name=self.model_name) + bbox_head = YOLOXHead( + model_name=self.model_name, + num_classes=num_classes, + train_cfg=train_cfg, # TODO (sungchul, kirill): remove + test_cfg=test_cfg, # TODO (sungchul, kirill): remove + ) + criterion = YOLOXCriterion( + num_classes=num_classes, + loss_cls=CrossEntropyLoss(use_sigmoid=True, reduction="sum", loss_weight=1.0), + loss_bbox=IoULoss(mode="square", eps=1e-16, reduction="sum", loss_weight=5.0), + loss_obj=CrossEntropyLoss(use_sigmoid=True, reduction="sum", loss_weight=1.0), + loss_l1=L1Loss(reduction="sum", loss_weight=1.0), + ) + return SingleStageDetector( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + 
criterion=criterion, + train_cfg=train_cfg, # TODO (sungchul, kirill): remove + test_cfg=test_cfg, # TODO (sungchul, kirill): remove + ) + def _customize_inputs( self, entity: DetBatchDataEntity, @@ -75,7 +138,7 @@ def _exporter(self) -> OTXModelExporter: msg = f"Input size attribute is not set for {self.__class__}" raise ValueError(msg) - swap_rgb = not isinstance(self, YOLOXTINY) # only YOLOX-TINY uses RGB + swap_rgb = self.model_name != "yolox_tiny" # only YOLOX-TINY uses RGB return OTXNativeModelExporter( task_level_export_parameters=self._export_parameters, @@ -135,166 +198,3 @@ def export( def load_from_otx_v1_ckpt(self, state_dict: dict, add_prefix: str = "model.") -> dict: """Load the previous OTX ckpt according to OTX2.0.""" return OTXv1Helper.load_det_ckpt(state_dict, add_prefix) - - -class YOLOXTINY(YOLOX): - """YOLOX-TINY detector.""" - - load_from = ( - "https://storage.openvinotoolkit.org/repositories/" - "openvino_training_extensions/models/object_detection/v2/yolox_tiny_8x8.pth" - ) - mean = (123.675, 116.28, 103.53) - std = (58.395, 57.12, 57.375) - - def __init__( - self, - label_info: LabelInfoTypes, - input_size: tuple[int, int] = (416, 416), - optimizer: OptimizerCallable = DefaultOptimizerCallable, - scheduler: LRSchedulerCallable | LRSchedulerListCallable = DefaultSchedulerCallable, - metric: MetricCallable = MeanAveragePrecisionFMeasureCallable, - torch_compile: bool = False, - tile_config: TileConfig = TileConfig(enable_tiler=False), - ) -> None: - super().__init__( - label_info=label_info, - input_size=input_size, - optimizer=optimizer, - scheduler=scheduler, - metric=metric, - torch_compile=torch_compile, - tile_config=tile_config, - ) - - def _build_model(self, num_classes: int) -> SingleStageDetector: - train_cfg: dict[str, Any] = {"assigner": SimOTAAssigner(center_radius=2.5)} - test_cfg = { - "nms": {"type": "nms", "iou_threshold": 0.65}, - "score_thr": 0.01, - "max_per_img": 100, - } - backbone = CSPDarknet(deepen_factor=0.33, 
widen_factor=0.375) - neck = YOLOXPAFPN( - in_channels=[96, 192, 384], - out_channels=96, - num_csp_blocks=1, - ) - bbox_head = YOLOXHead( - num_classes=num_classes, - in_channels=96, - feat_channels=96, - loss_cls=CrossEntropyLoss(use_sigmoid=True, reduction="sum", loss_weight=1.0), - loss_bbox=IoULoss(mode="square", eps=1e-16, reduction="sum", loss_weight=5.0), - loss_obj=CrossEntropyLoss(use_sigmoid=True, reduction="sum", loss_weight=1.0), - loss_l1=L1Loss(reduction="sum", loss_weight=1.0), - train_cfg=train_cfg, - test_cfg=test_cfg, - ) - return SingleStageDetector(backbone, bbox_head, neck=neck, train_cfg=train_cfg, test_cfg=test_cfg) - - -class YOLOXS(YOLOX): - """YOLOX-S detector.""" - - load_from = ( - "https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_s_8x8_300e_coco/" - "yolox_s_8x8_300e_coco_20211121_095711-4592a793.pth" - ) - mean = (0.0, 0.0, 0.0) - std = (1.0, 1.0, 1.0) - - def _build_model(self, num_classes: int) -> SingleStageDetector: - train_cfg: dict[str, Any] = {"assigner": SimOTAAssigner(center_radius=2.5)} - test_cfg = { - "nms": {"type": "nms", "iou_threshold": 0.65}, - "score_thr": 0.01, - "max_per_img": 100, - } - backbone = CSPDarknet(deepen_factor=0.33, widen_factor=0.5) - neck = YOLOXPAFPN( - in_channels=[128, 256, 512], - out_channels=128, - num_csp_blocks=1, - ) - bbox_head = YOLOXHead( - num_classes=num_classes, - in_channels=128, - feat_channels=128, - loss_cls=CrossEntropyLoss(use_sigmoid=True, reduction="sum", loss_weight=1.0), - loss_bbox=IoULoss(mode="square", eps=1e-16, reduction="sum", loss_weight=5.0), - loss_obj=CrossEntropyLoss(use_sigmoid=True, reduction="sum", loss_weight=1.0), - loss_l1=L1Loss(reduction="sum", loss_weight=1.0), - train_cfg=train_cfg, - test_cfg=test_cfg, - ) - return SingleStageDetector(backbone, bbox_head, neck=neck, train_cfg=train_cfg, test_cfg=test_cfg) - - -class YOLOXL(YOLOX): - """YOLOX-L detector.""" - - load_from = ( - "https://download.openmmlab.com/mmdetection/v2.0/yolox/" - 
"yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth" - ) - mean = (0.0, 0.0, 0.0) - std = (1.0, 1.0, 1.0) - - def _build_model(self, num_classes: int) -> SingleStageDetector: - train_cfg: dict[str, Any] = {"assigner": SimOTAAssigner(center_radius=2.5)} - test_cfg = { - "nms": {"type": "nms", "iou_threshold": 0.65}, - "score_thr": 0.01, - "max_per_img": 100, - } - backbone = CSPDarknet() - neck = YOLOXPAFPN(in_channels=[256, 512, 1024], out_channels=256) - bbox_head = YOLOXHead( - num_classes=num_classes, - in_channels=256, - loss_cls=CrossEntropyLoss(use_sigmoid=True, reduction="sum", loss_weight=1.0), - loss_bbox=IoULoss(mode="square", eps=1e-16, reduction="sum", loss_weight=5.0), - loss_obj=CrossEntropyLoss(use_sigmoid=True, reduction="sum", loss_weight=1.0), - loss_l1=L1Loss(reduction="sum", loss_weight=1.0), - train_cfg=train_cfg, - test_cfg=test_cfg, - ) - return SingleStageDetector(backbone, bbox_head, neck=neck, train_cfg=train_cfg, test_cfg=test_cfg) - - -class YOLOXX(YOLOX): - """YOLOX-X detector.""" - - load_from = ( - "https://download.openmmlab.com/mmdetection/v2.0/yolox/" - "yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth" - ) - mean = (0.0, 0.0, 0.0) - std = (1.0, 1.0, 1.0) - - def _build_model(self, num_classes: int) -> SingleStageDetector: - train_cfg: dict[str, Any] = {"assigner": SimOTAAssigner(center_radius=2.5)} - test_cfg = { - "nms": {"type": "nms", "iou_threshold": 0.65}, - "score_thr": 0.01, - "max_per_img": 100, - } - backbone = CSPDarknet(deepen_factor=1.33, widen_factor=1.25) - neck = YOLOXPAFPN( - in_channels=[320, 640, 1280], - out_channels=320, - num_csp_blocks=4, - ) - bbox_head = YOLOXHead( - num_classes=num_classes, - in_channels=320, - feat_channels=320, - loss_cls=CrossEntropyLoss(use_sigmoid=True, reduction="sum", loss_weight=1.0), - loss_bbox=IoULoss(mode="square", eps=1e-16, reduction="sum", loss_weight=5.0), - loss_obj=CrossEntropyLoss(use_sigmoid=True, reduction="sum", 
loss_weight=1.0), - loss_l1=L1Loss(reduction="sum", loss_weight=1.0), - train_cfg=train_cfg, - test_cfg=test_cfg, - ) - return SingleStageDetector(backbone, bbox_head, neck=neck, train_cfg=train_cfg, test_cfg=test_cfg) diff --git a/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py b/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py index 77539bf22d3..cffa11abe47 100644 --- a/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py +++ b/src/otx/algo/instance_segmentation/heads/rtmdet_ins_head.py @@ -21,10 +21,8 @@ from otx.algo.common.utils.nms import batched_nms, multiclass_nms from otx.algo.common.utils.utils import ( - distance2bbox, filter_scores_and_topk, inverse_sigmoid, - multi_apply, reduce_mean, select_single_mlvl, ) @@ -50,7 +48,6 @@ class RTMDetInsHead(RTMDetHead): """Detection Head of RTMDet-Ins. Args: - loss_mask (nn.Module): A module for mask loss. num_prototypes (int): Number of mask prototype features extracted from the mask head. Defaults to 8. dyconv_channels (int): Channel of the dynamic conv layers. @@ -64,7 +61,6 @@ class RTMDetInsHead(RTMDetHead): def __init__( self, *args, - loss_mask: nn.Module, num_prototypes: int = 8, dyconv_channels: int = 8, num_dyconvs: int = 3, @@ -76,7 +72,6 @@ def __init__( self.dyconv_channels = dyconv_channels self.mask_loss_stride = mask_loss_stride super().__init__(*args, **kwargs) - self.loss_mask = loss_mask def _init_layers(self) -> None: """Initialize layers of the head.""" @@ -541,7 +536,7 @@ def loss_mask_by_feat( flatten_kernels: Tensor, sampling_results_list: list, batch_gt_instances: list[InstanceData], - ) -> Tensor: + ) -> dict[str, Tensor]: """Compute instance segmentation loss. Args: @@ -556,7 +551,7 @@ def loss_mask_by_feat( attributes. Returns: - Tensor: The mask loss tensor. + dict[str, Tensor]: A dictionary of raw outputs. 
""" batch_pos_mask_logits = [] pos_gt_masks = [] @@ -612,7 +607,11 @@ def loss_mask_by_feat( self.mask_loss_stride // 2 :: self.mask_loss_stride, ] - return self.loss_mask(batch_pos_mask_logits, pos_gt_masks, weight=None, avg_factor=num_pos) + return { + "batch_pos_mask_logits": batch_pos_mask_logits, + "pos_gt_masks": pos_gt_masks, + "num_pos": num_pos, + } def loss_by_feat( self, @@ -626,17 +625,7 @@ def loss_by_feat( ) -> dict[str, Tensor]: """Compute losses of the head.""" num_imgs = len(batch_img_metas) - featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] - if len(featmap_sizes) != self.prior_generator.num_levels: - msg = "The number of featmap sizes should be equal to the number of levels." - raise ValueError(msg) - device = cls_scores[0].device - anchor_list, valid_flag_list = self.get_anchors(featmap_sizes, batch_img_metas, device=device) - flatten_cls_scores = torch.cat( - [cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, self.cls_out_channels) for cls_score in cls_scores], - 1, - ) flatten_kernels = torch.cat( [ kernel_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, self.num_gen_params) @@ -644,14 +633,7 @@ def loss_by_feat( ], 1, ) - decoded_bboxes = [] - for anchor, bbox_pred in zip(anchor_list[0], bbox_preds): - anchor = anchor.reshape(-1, 4) # noqa: PLW2901 - bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) # noqa: PLW2901 - bbox_pred = distance2bbox(anchor, bbox_pred) # noqa: PLW2901 - decoded_bboxes.append(bbox_pred) - - flatten_bboxes = torch.cat(decoded_bboxes, 1) + # Convert polygon masks to bitmap masks if isinstance(batch_gt_instances[0].masks[0], Polygon): for gt_instances, img_meta in zip(batch_gt_instances, batch_img_metas): @@ -660,43 +642,23 @@ def loss_by_feat( ndarray_masks = np.empty((0, *img_meta["img_shape"]), dtype=np.uint8) gt_instances.masks = torch.tensor(ndarray_masks, dtype=torch.bool, device=device) - cls_reg_targets = self.get_targets( - flatten_cls_scores, - flatten_bboxes, - anchor_list, - 
valid_flag_list, - batch_gt_instances, - batch_img_metas, + raw_outputs = super().loss_by_feat( + cls_scores=cls_scores, + bbox_preds=bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, batch_gt_instances_ignore=batch_gt_instances_ignore, ) - ( - anchor_list, - labels_list, - label_weights_list, - bbox_targets_list, - assign_metrics_list, - sampling_results_list, - ) = cls_reg_targets - - losses_cls, losses_bbox, cls_avg_factors, bbox_avg_factors = multi_apply( - self.loss_by_feat_single, - cls_scores, - decoded_bboxes, - labels_list, - label_weights_list, - bbox_targets_list, - assign_metrics_list, - self.prior_generator.strides, - ) - cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item() - losses_cls = [x / cls_avg_factor for x in losses_cls] - - bbox_avg_factor = reduce_mean(sum(bbox_avg_factors)).clamp_(min=1).item() - losses_bbox = [x / bbox_avg_factor for x in losses_bbox] + raw_iseg_outputs = self.loss_mask_by_feat( + mask_feat, + flatten_kernels, + raw_outputs["sampling_results_list"], + batch_gt_instances, + ) + raw_outputs.update(raw_iseg_outputs) - loss_mask = self.loss_mask_by_feat(mask_feat, flatten_kernels, sampling_results_list, batch_gt_instances) - return {"loss_cls": losses_cls, "loss_bbox": losses_bbox, "loss_mask": loss_mask} + return raw_outputs class MaskFeatModule(BaseModule): diff --git a/src/otx/algo/instance_segmentation/losses/__init__.py b/src/otx/algo/instance_segmentation/losses/__init__.py index 903ea4c077e..bcd3bf03a51 100644 --- a/src/otx/algo/instance_segmentation/losses/__init__.py +++ b/src/otx/algo/instance_segmentation/losses/__init__.py @@ -5,5 +5,6 @@ from .accuracy import accuracy from .dice_loss import DiceLoss +from .rtmdet_inst_loss import RTMDetInstCriterion -__all__ = ["accuracy", "DiceLoss"] +__all__ = ["accuracy", "DiceLoss", "RTMDetInstCriterion"] diff --git a/src/otx/algo/instance_segmentation/losses/rtmdet_inst_loss.py 
b/src/otx/algo/instance_segmentation/losses/rtmdet_inst_loss.py new file mode 100644 index 00000000000..3935c0990db --- /dev/null +++ b/src/otx/algo/instance_segmentation/losses/rtmdet_inst_loss.py @@ -0,0 +1,89 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) OpenMMLab. All rights reserved. +# +"""RTMDet for instance segmentation criterion.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from otx.algo.detection.losses import RTMDetCriterion + +if TYPE_CHECKING: + from torch import Tensor, nn + + +class RTMDetInstCriterion(RTMDetCriterion): + """Criterion of RTMDet for instance segmentation. + + Args: + num_classes (int): Number of object classes. + loss_cls (nn.Module): Classification loss module. + loss_bbox (nn.Module): Bounding box regression loss module. + loss_mask (nn.Module): Mask loss module. + """ + + def __init__( + self, + num_classes: int, + loss_cls: nn.Module, + loss_bbox: nn.Module, + loss_mask: nn.Module, + ) -> None: + super().__init__(num_classes, loss_cls, loss_bbox) + self.loss_mask = loss_mask + + def forward( + self, + cls_score: Tensor, + bbox_pred: Tensor, + labels: Tensor, + label_weights: Tensor, + bbox_targets: Tensor, + assign_metrics: Tensor, + stride: list[int], + **kwargs, + ) -> dict[str, Tensor]: + """Compute loss of a single scale level. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). + bbox_pred (Tensor): Decoded bboxes for each scale + level with shape (N, num_anchors * 4, H, W). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors). + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (N, num_total_anchors, 4). + assign_metrics (Tensor): Assign metrics with shape + (N, num_total_anchors). 
+ stride (list[int]): Downsample stride of the feature map. + batch_pos_mask_logits (Tensor): The prediction, has a shape (n, *). + pos_gt_masks (Tensor): The label of the prediction, + shape (n, *), same shape of pred. + num_pos (int, optional): Average factor that is used to average + the loss. Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + loss_dict = super().forward( + cls_score=cls_score, + bbox_pred=bbox_pred, + labels=labels, + label_weights=label_weights, + bbox_targets=bbox_targets, + assign_metrics=assign_metrics, + stride=stride, + ) + + batch_pos_mask_logits: Tensor = kwargs.pop("batch_pos_mask_logits") + pos_gt_masks: Tensor = kwargs.pop("pos_gt_masks") + num_pos: int = kwargs.pop("num_pos") + + loss_mask = self.loss_mask(batch_pos_mask_logits, pos_gt_masks, weight=None, avg_factor=num_pos) + loss_dict.update({"loss_mask": loss_mask}) + return loss_dict diff --git a/src/otx/algo/instance_segmentation/rtmdet_inst.py b/src/otx/algo/instance_segmentation/rtmdet_inst.py index f6493fd4aca..97bf0f07d29 100644 --- a/src/otx/algo/instance_segmentation/rtmdet_inst.py +++ b/src/otx/algo/instance_segmentation/rtmdet_inst.py @@ -11,15 +11,15 @@ from torch import nn from otx.algo.common.backbones import CSPNeXt -from otx.algo.common.losses import CrossEntropyLoss, GIoULoss, QualityFocalLoss +from otx.algo.common.losses import GIoULoss, QualityFocalLoss from otx.algo.common.utils.assigners import DynamicSoftLabelAssigner from otx.algo.common.utils.coders import DistancePointBBoxCoder from otx.algo.common.utils.prior_generators import MlvlPointGenerator from otx.algo.common.utils.samplers import PseudoSampler -from otx.algo.detection.base_models import SingleStageDetector +from otx.algo.detection.detectors import SingleStageDetector from otx.algo.detection.necks import CSPNeXtPAFPN from otx.algo.instance_segmentation.heads import RTMDetInsSepBNHead -from otx.algo.instance_segmentation.losses import DiceLoss +from 
otx.algo.instance_segmentation.losses import DiceLoss, RTMDetInstCriterion from otx.algo.modules.norm import build_norm_layer from otx.core.config.data import TileConfig from otx.core.exporter.base import OTXModelExporter @@ -135,25 +135,8 @@ def _build_model(self, num_classes: int) -> SingleStageDetector: "nms_pre": 300, } - backbone = CSPNeXt( - arch="P5", - expand_ratio=0.5, - deepen_factor=0.167, - widen_factor=0.375, - channel_attention=True, - normalization=nn.BatchNorm2d, - activation=partial(nn.SiLU, inplace=True), - ) - - neck = CSPNeXtPAFPN( - in_channels=(96, 192, 384), - out_channels=96, - num_csp_blocks=1, - expand_ratio=0.5, - normalization=nn.BatchNorm2d, - activation=partial(nn.SiLU, inplace=True), - ) - + backbone = CSPNeXt(model_name="rtmdet_inst_tiny") + neck = CSPNeXtPAFPN(model_name="rtmdet_inst_tiny") bbox_head = RTMDetInsSepBNHead( num_classes=num_classes, in_channels=96, @@ -168,7 +151,11 @@ def _build_model(self, num_classes: int) -> SingleStageDetector: strides=[8, 16, 32], ), bbox_coder=DistancePointBBoxCoder(), - loss_centerness=CrossEntropyLoss(use_sigmoid=True, loss_weight=1.0), + train_cfg=train_cfg, + test_cfg=test_cfg, + ) + criterion = RTMDetInstCriterion( + num_classes=num_classes, loss_cls=QualityFocalLoss( use_sigmoid=True, beta=2.0, @@ -180,14 +167,13 @@ def _build_model(self, num_classes: int) -> SingleStageDetector: eps=5.0e-06, reduction="mean", ), - train_cfg=train_cfg, - test_cfg=test_cfg, ) return SingleStageDetector( backbone=backbone, neck=neck, bbox_head=bbox_head, + criterion=criterion, train_cfg=train_cfg, test_cfg=test_cfg, ) diff --git a/src/otx/algo/keypoint_detection/rtmpose.py b/src/otx/algo/keypoint_detection/rtmpose.py index e0bd042d285..a31f7f8e775 100644 --- a/src/otx/algo/keypoint_detection/rtmpose.py +++ b/src/otx/algo/keypoint_detection/rtmpose.py @@ -5,7 +5,6 @@ from __future__ import annotations -from functools import partial from typing import TYPE_CHECKING from otx.algo.common.backbones import CSPNeXt 
@@ -16,7 +15,6 @@ from otx.core.metrics.pck import PCKMeasureCallable from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable from otx.core.model.keypoint_detection import OTXKeypointDetectionModel -from torch import nn if TYPE_CHECKING: from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable @@ -95,16 +93,7 @@ def _build_model(self, num_classes: int) -> RTMPose: simcc_split_ratio = 2.0 sigma = (4.9, 5.66) - backbone = CSPNeXt( - arch="P5", - expand_ratio=0.5, - deepen_factor=0.167, - widen_factor=0.375, - out_indices=(4,), - channel_attention=True, - normalization=nn.BatchNorm2d, - activation=partial(nn.SiLU, inplace=True), - ) + backbone = CSPNeXt(model_name="rtmpose_tiny") head = RTMCCHead( out_channels=num_classes, in_channels=384, diff --git a/src/otx/core/model/detection.py b/src/otx/core/model/detection.py index 76b7c2d1538..437aa6b6e96 100644 --- a/src/otx/core/model/detection.py +++ b/src/otx/core/model/detection.py @@ -35,7 +35,7 @@ from model_api.models.utils import DetectionResult from torch import nn - from otx.algo.detection.base_models import SingleStageDetector + from otx.algo.detection.detectors import SingleStageDetector class OTXDetectionModel(OTXModel[DetBatchDataEntity, DetBatchPredEntity]): @@ -43,6 +43,10 @@ class OTXDetectionModel(OTXModel[DetBatchDataEntity, DetBatchPredEntity]): input_size: tuple[int, int] + def __init__(self, model_name: str, *args, **kwargs) -> None: + self.model_name = model_name + super().__init__(*args, **kwargs) + def test_step(self, batch: DetBatchDataEntity, batch_idx: int) -> None: """Perform a single test step on a batch of data from the test set. 
@@ -386,6 +390,7 @@ class ExplainableOTXDetModel(OTXDetectionModel): def __init__( self, + model_name: str, label_info: LabelInfoTypes, input_size: tuple[int, int], optimizer: OptimizerCallable = DefaultOptimizerCallable, @@ -397,6 +402,7 @@ def __init__( from otx.algo.explain.explain_algo import feature_vector_fn super().__init__( + model_name=model_name, label_info=label_info, input_size=input_size, optimizer=optimizer, @@ -461,11 +467,14 @@ def _forward_explain_detection( def get_explain_fn(self) -> Callable: """Returns explain function.""" - from otx.algo.detection.heads import SSDHead + from otx.algo.detection.heads.ssd_head import SSDHeadModule from otx.algo.explain.explain_algo import DetClassProbabilityMap # SSD-like heads also have background class - background_class = hasattr(self.model, "bbox_head") and isinstance(self.model.bbox_head, SSDHead) + background_class = hasattr(self.model, "bbox_head") and isinstance( + self.model.bbox_head, + SSDHeadModule, + ) # TODO (sungchul): revert module's name? 
tiling_mode = self.tile_config.enable_tiler if hasattr(self, "tile_config") else False explainer = DetClassProbabilityMap( num_classes=self.num_classes + background_class, diff --git a/src/otx/recipe/detection/atss_mobilenetv2.yaml b/src/otx/recipe/detection/atss_mobilenetv2.yaml index ae8bd846e7d..adabd373f1e 100644 --- a/src/otx/recipe/detection/atss_mobilenetv2.yaml +++ b/src/otx/recipe/detection/atss_mobilenetv2.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.atss.MobileNetV2ATSS + class_path: otx.algo.detection.atss.ATSS init_args: + model_name: atss_mobilenetv2 label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/atss_mobilenetv2_tile.yaml b/src/otx/recipe/detection/atss_mobilenetv2_tile.yaml index e1f6e9725f7..8202a650571 100644 --- a/src/otx/recipe/detection/atss_mobilenetv2_tile.yaml +++ b/src/otx/recipe/detection/atss_mobilenetv2_tile.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.atss.MobileNetV2ATSS + class_path: otx.algo.detection.atss.ATSS init_args: + model_name: atss_mobilenetv2 label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/atss_resnext101.yaml b/src/otx/recipe/detection/atss_resnext101.yaml index 9bfbb05caa5..0a6bb28bbde 100644 --- a/src/otx/recipe/detection/atss_resnext101.yaml +++ b/src/otx/recipe/detection/atss_resnext101.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.atss.ResNeXt101ATSS + class_path: otx.algo.detection.atss.ATSS init_args: + model_name: atss_resnext101 label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/rtdetr_101.yaml b/src/otx/recipe/detection/rtdetr_101.yaml index 1ae36dbc26b..bf17d8e9b7f 100644 --- a/src/otx/recipe/detection/rtdetr_101.yaml +++ b/src/otx/recipe/detection/rtdetr_101.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.rtdetr.RTDETR101 + class_path: otx.algo.detection.rtdetr.RTDETR init_args: + model_name: rtdetr_101 label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/rtdetr_18.yaml 
b/src/otx/recipe/detection/rtdetr_18.yaml index 4e11fa20499..a41714352e1 100644 --- a/src/otx/recipe/detection/rtdetr_18.yaml +++ b/src/otx/recipe/detection/rtdetr_18.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.rtdetr.RTDETR18 + class_path: otx.algo.detection.rtdetr.RTDETR init_args: + model_name: rtdetr_18 label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/rtdetr_50.yaml b/src/otx/recipe/detection/rtdetr_50.yaml index 9adb14819a7..33d555be0a1 100644 --- a/src/otx/recipe/detection/rtdetr_50.yaml +++ b/src/otx/recipe/detection/rtdetr_50.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.rtdetr.RTDETR50 + class_path: otx.algo.detection.rtdetr.RTDETR init_args: + model_name: rtdetr_50 label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/rtmdet_tiny.yaml b/src/otx/recipe/detection/rtmdet_tiny.yaml index 6a74d780ab7..561dc916204 100644 --- a/src/otx/recipe/detection/rtmdet_tiny.yaml +++ b/src/otx/recipe/detection/rtmdet_tiny.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.rtmdet.RTMDetTiny + class_path: otx.algo.detection.rtmdet.RTMDet init_args: + model_name: rtmdet_tiny label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/ssd_mobilenetv2.yaml b/src/otx/recipe/detection/ssd_mobilenetv2.yaml index 60f1cb02391..0f523859a02 100644 --- a/src/otx/recipe/detection/ssd_mobilenetv2.yaml +++ b/src/otx/recipe/detection/ssd_mobilenetv2.yaml @@ -1,6 +1,7 @@ model: class_path: otx.algo.detection.ssd.SSD init_args: + model_name: ssd_mobilenetv2 label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/ssd_mobilenetv2_tile.yaml b/src/otx/recipe/detection/ssd_mobilenetv2_tile.yaml index 33d6bf4c261..c27432b8f33 100644 --- a/src/otx/recipe/detection/ssd_mobilenetv2_tile.yaml +++ b/src/otx/recipe/detection/ssd_mobilenetv2_tile.yaml @@ -1,6 +1,7 @@ model: class_path: otx.algo.detection.ssd.SSD init_args: + model_name: ssd_mobilenetv2 label_info: 80 optimizer: diff --git 
a/src/otx/recipe/detection/yolox_l.yaml b/src/otx/recipe/detection/yolox_l.yaml index 23a76f0e1d4..5b2bd067544 100644 --- a/src/otx/recipe/detection/yolox_l.yaml +++ b/src/otx/recipe/detection/yolox_l.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.yolox.YOLOXL + class_path: otx.algo.detection.yolox.YOLOX init_args: + model_name: yolox_l label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/yolox_l_tile.yaml b/src/otx/recipe/detection/yolox_l_tile.yaml index f69cd804357..64e3f087056 100644 --- a/src/otx/recipe/detection/yolox_l_tile.yaml +++ b/src/otx/recipe/detection/yolox_l_tile.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.yolox.YOLOXL + class_path: otx.algo.detection.yolox.YOLOX init_args: + model_name: yolox_l label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/yolox_s.yaml b/src/otx/recipe/detection/yolox_s.yaml index 12600d5e536..d99092992f4 100644 --- a/src/otx/recipe/detection/yolox_s.yaml +++ b/src/otx/recipe/detection/yolox_s.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.yolox.YOLOXS + class_path: otx.algo.detection.yolox.YOLOX init_args: + model_name: yolox_s label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/yolox_s_tile.yaml b/src/otx/recipe/detection/yolox_s_tile.yaml index a5758eca47c..414311666c6 100644 --- a/src/otx/recipe/detection/yolox_s_tile.yaml +++ b/src/otx/recipe/detection/yolox_s_tile.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.yolox.YOLOXS + class_path: otx.algo.detection.yolox.YOLOX init_args: + model_name: yolox_s label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/yolox_tiny.yaml b/src/otx/recipe/detection/yolox_tiny.yaml index 744dc3e72a7..f41105b2f43 100644 --- a/src/otx/recipe/detection/yolox_tiny.yaml +++ b/src/otx/recipe/detection/yolox_tiny.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.yolox.YOLOXTINY + class_path: otx.algo.detection.yolox.YOLOX init_args: + model_name: yolox_tiny label_info: 80 optimizer: 
diff --git a/src/otx/recipe/detection/yolox_tiny_tile.yaml b/src/otx/recipe/detection/yolox_tiny_tile.yaml index 61d9d59f765..27c62435f8d 100644 --- a/src/otx/recipe/detection/yolox_tiny_tile.yaml +++ b/src/otx/recipe/detection/yolox_tiny_tile.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.yolox.YOLOXTINY + class_path: otx.algo.detection.yolox.YOLOX init_args: + model_name: yolox_tiny label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/yolox_x.yaml b/src/otx/recipe/detection/yolox_x.yaml index a99f0ce9122..f1478f0cb35 100644 --- a/src/otx/recipe/detection/yolox_x.yaml +++ b/src/otx/recipe/detection/yolox_x.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.yolox.YOLOXX + class_path: otx.algo.detection.yolox.YOLOX init_args: + model_name: yolox_x label_info: 80 optimizer: diff --git a/src/otx/recipe/detection/yolox_x_tile.yaml b/src/otx/recipe/detection/yolox_x_tile.yaml index 0431814cb6e..b1a75e47fcf 100644 --- a/src/otx/recipe/detection/yolox_x_tile.yaml +++ b/src/otx/recipe/detection/yolox_x_tile.yaml @@ -1,6 +1,7 @@ model: - class_path: otx.algo.detection.yolox.YOLOXX + class_path: otx.algo.detection.yolox.YOLOX init_args: + model_name: yolox_x label_info: 80 optimizer: diff --git a/tests/unit/algo/detection/backbones/test_csp_darknet.py b/tests/unit/algo/detection/backbones/test_csp_darknet.py index 29dbf707237..2e7f0522c23 100644 --- a/tests/unit/algo/detection/backbones/test_csp_darknet.py +++ b/tests/unit/algo/detection/backbones/test_csp_darknet.py @@ -8,7 +8,7 @@ import pytest import torch -from otx.algo.detection.backbones.csp_darknet import CSPDarknet, Focus +from otx.algo.detection.backbones.csp_darknet import CSPDarknetModule, Focus from torch import nn from torch.nn.modules import GroupNorm from torch.nn.modules.batchnorm import _BatchNorm @@ -45,22 +45,22 @@ def test_export(self) -> None: assert results.shape == (1, 32, 64, 64) -class TestCSPDarknet: +class TestCSPDarknetModule: def 
test_init_with_large_frozen_stages(self) -> None: """Test __init__ with large frozen_stages.""" with pytest.raises(ValueError): # noqa: PT011 # frozen_stages must in range(-1, len(arch_setting) + 1) - CSPDarknet(frozen_stages=6) + CSPDarknetModule(frozen_stages=6) def test_init_with_large_out_indices(self) -> None: """Test __init__ with large out_indices.""" with pytest.raises(AssertionError): - CSPDarknet(out_indices=[6]) + CSPDarknetModule(out_indices=[6]) def test_freeze_stages(self) -> None: """Test _freeze_stages.""" frozen_stages = 1 - model = CSPDarknet(frozen_stages=frozen_stages) + model = CSPDarknetModule(frozen_stages=frozen_stages) model.train() for mod in model.stem.modules(): @@ -76,14 +76,14 @@ def test_freeze_stages(self) -> None: def test_train_with_norm_eval(self) -> None: """Test train with norm_eval=True.""" - model = CSPDarknet(norm_eval=True) + model = CSPDarknetModule(norm_eval=True) model.train() assert check_norm_state(model.modules(), False) def test_forward(self) -> None: # Test CSPDarknet-P5 forward with widen_factor=0.5 - model = CSPDarknet(arch="P5", widen_factor=0.25, out_indices=range(5)) + model = CSPDarknetModule(arch="P5", widen_factor=0.25, out_indices=range(5)) model.train() imgs = torch.randn(1, 3, 64, 64) @@ -96,7 +96,7 @@ def test_forward(self) -> None: assert feat[4].shape == torch.Size((1, 256, 2, 2)) # Test CSPDarknet-P6 forward with widen_factor=0.5 - model = CSPDarknet(arch="P6", widen_factor=0.25, out_indices=range(6), spp_kernal_sizes=(3, 5, 7)) + model = CSPDarknetModule(arch="P6", widen_factor=0.25, out_indices=range(6), spp_kernal_sizes=(3, 5, 7)) model.train() imgs = torch.randn(1, 3, 128, 128) @@ -109,7 +109,7 @@ def test_forward(self) -> None: assert feat[5].shape == torch.Size((1, 256, 2, 2)) # Test CSPDarknet forward with dict(type='ReLU') - model = CSPDarknet(widen_factor=0.125, activation=nn.ReLU, out_indices=range(5)) + model = CSPDarknetModule(widen_factor=0.125, activation=nn.ReLU, out_indices=range(5)) 
model.train() imgs = torch.randn(1, 3, 64, 64) @@ -122,7 +122,7 @@ def test_forward(self) -> None: assert feat[4].shape == torch.Size((1, 128, 2, 2)) # Test CSPDarknet with BatchNorm forward - model = CSPDarknet(widen_factor=0.125, out_indices=range(5)) + model = CSPDarknetModule(widen_factor=0.125, out_indices=range(5)) for m in model.modules(): if is_norm(m): assert isinstance(m, _BatchNorm) @@ -139,7 +139,7 @@ def test_forward(self) -> None: # Test CSPDarknet with custom arch forward arch_ovewrite = [[32, 56, 3, True, False], [56, 224, 2, True, False], [224, 512, 1, True, False]] - model = CSPDarknet(arch_ovewrite=arch_ovewrite, widen_factor=0.25, out_indices=(0, 1, 2, 3)) + model = CSPDarknetModule(arch_ovewrite=arch_ovewrite, widen_factor=0.25, out_indices=(0, 1, 2, 3)) model.train() imgs = torch.randn(1, 3, 32, 32) diff --git a/tests/unit/algo/detection/backbones/test_presnet.py b/tests/unit/algo/detection/backbones/test_presnet.py index 5d684a1c14a..40e9d71a3f3 100644 --- a/tests/unit/algo/detection/backbones/test_presnet.py +++ b/tests/unit/algo/detection/backbones/test_presnet.py @@ -6,14 +6,14 @@ from functools import partial import torch -from otx.algo.detection.backbones.presnet import PResNet +from otx.algo.detection.backbones.presnet import PResNetModule from otx.algo.modules import FrozenBatchNorm2d from otx.algo.modules.norm import build_norm_layer class TestPresnet: def test_presnet_forward(self): - model = PResNet(depth=50) + model = PResNetModule(depth=50) inputs = torch.randn(1, 3, 224, 224) output = model(inputs) assert len(output) == 4 @@ -23,13 +23,13 @@ def test_presnet_forward(self): assert output[3].shape == torch.Size([1, 2048, 7, 7]) def test_presnet_freeze_parameters(self): - model = PResNet(depth=50, freeze_at=2) + model = PResNetModule(depth=50, freeze_at=2) for name, param in model.named_parameters(): if name.startswith(("conv1", "res_layers.0")): assert not param.requires_grad def test_presnet_freeze_norm(self): - model = PResNet( + 
model = PResNetModule( depth=50, normalization=partial(build_norm_layer, FrozenBatchNorm2d, layer_name="norm"), ) diff --git a/tests/unit/algo/detection/base_models/test_detr.py b/tests/unit/algo/detection/detectors/test_detr.py similarity index 93% rename from tests/unit/algo/detection/base_models/test_detr.py rename to tests/unit/algo/detection/detectors/test_detr.py index 71ce30cc1fb..2e4aba19a9b 100644 --- a/tests/unit/algo/detection/base_models/test_detr.py +++ b/tests/unit/algo/detection/detectors/test_detr.py @@ -19,20 +19,14 @@ class TestDETR: @pytest.fixture() def rt_detr_model(self): num_classes = 10 - backbone = PResNet( - depth=18, - pretrained=False, - return_idx=[1, 2, 3], - ) + backbone = PResNet(model_name="rtdetr_18") encoder = HybridEncoder( - in_channels=[128, 256, 512], - dim_feedforward=1024, + model_name="rtdetr_18", eval_spatial_size=(640, 640), ) decoder = RTDETRTransformer( + model_name="rtdetr_18", num_classes=num_classes, - num_decoder_layers=1, - feat_channels=[256, 256, 256], eval_spatial_size=(640, 640), ) criterion = DetrCriterion( diff --git a/tests/unit/algo/detection/base_models/test_single_stage_detector.py b/tests/unit/algo/detection/detectors/test_single_stage_detector.py similarity index 84% rename from tests/unit/algo/detection/base_models/test_single_stage_detector.py rename to tests/unit/algo/detection/detectors/test_single_stage_detector.py index eb6a25037bc..aa361e62601 100644 --- a/tests/unit/algo/detection/base_models/test_single_stage_detector.py +++ b/tests/unit/algo/detection/detectors/test_single_stage_detector.py @@ -5,7 +5,7 @@ import pytest import torch -from otx.algo.detection.base_models.single_stage_detector import SingleStageDetector +from otx.algo.detection.detectors.single_stage_detector import SingleStageDetector from otx.core.data.entity.detection import DetBatchDataEntity from otx.core.types import LabelInfo from torch import nn @@ -28,13 +28,15 @@ def __init__(self): self.linear = nn.Linear(16, 10) 
self.relu = nn.ReLU() self.linear2 = nn.Linear(10, 4) - self.loss = lambda x, _: {"loss": torch.sum(x)} def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.linear(x) x = self.relu(x) return self.linear2(x) + def loss(self, x: torch.Tensor, *args, **kwargs) -> dict: + return {"x": x} + def predict(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: return self.forward(x) @@ -55,8 +57,19 @@ def batch(self): ) @pytest.fixture() - def detector(self, backbone, bbox_head): - return SingleStageDetector(backbone=backbone, bbox_head=bbox_head) + def criterion(self): + class Criterion(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x: torch.Tensor, *args, **kwargs) -> dict: + return {"loss": torch.sum(x)} + + return Criterion() + + @pytest.fixture() + def detector(self, backbone, bbox_head, criterion): + return SingleStageDetector(backbone=backbone, bbox_head=bbox_head, criterion=criterion) def test_forward(self, detector, batch): output = detector.forward(batch.images) diff --git a/tests/unit/algo/detection/heads/test_class_incremental_mixin.py b/tests/unit/algo/detection/heads/test_class_incremental_mixin.py index 67760595c65..3ba7ca42330 100644 --- a/tests/unit/algo/detection/heads/test_class_incremental_mixin.py +++ b/tests/unit/algo/detection/heads/test_class_incremental_mixin.py @@ -3,7 +3,10 @@ """Test of ClassIncrementalMixin.""" import torch -from otx.algo.detection.atss import MobileNetV2ATSS +from otx.algo.common.losses import CrossEntropyLoss, CrossSigmoidFocalLoss, GIoULoss +from otx.algo.common.utils.coders.delta_xywh_bbox_coder import DeltaXYWHBBoxCoder +from otx.algo.detection.atss import ATSS +from otx.algo.detection.losses import ATSSCriterion class MockGTInstance: @@ -13,7 +16,22 @@ class MockGTInstance: class TestClassIncrementalMixin: def test_ignore_label(self) -> None: - atss = MobileNetV2ATSS(3) + atss = ATSS(model_name="atss_mobilenetv2", label_info=3, input_size=(800, 992)) + criterion = ATSSCriterion( + 
num_classes=3, + bbox_coder=DeltaXYWHBBoxCoder( + target_means=(0.0, 0.0, 0.0, 0.0), + target_stds=(0.1, 0.1, 0.2, 0.2), + ), + loss_cls=CrossSigmoidFocalLoss( + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0, + ), + loss_bbox=GIoULoss(loss_weight=2.0), + loss_centerness=CrossEntropyLoss(use_sigmoid=True, loss_weight=1.0), + ) atss_head = atss.model.bbox_head cls_scores = [ @@ -49,12 +67,14 @@ def test_ignore_label(self) -> None: }, ] - loss_with_ignored_labels = atss_head.loss_by_feat( - cls_scores, - bbox_preds, - centernesses, - batch_gt_instances, - batch_img_metas, + loss_with_ignored_labels = criterion( + **atss_head.loss_by_feat( + cls_scores, + bbox_preds, + centernesses, + batch_gt_instances, + batch_img_metas, + ), ) loss_cls_with_ignored_labels = torch.sum(torch.Tensor(loss_with_ignored_labels["loss_cls"])) @@ -67,12 +87,14 @@ def test_ignore_label(self) -> None: "pad_shape": (480, 480), }, ] - loss_without_ignored_labels = atss_head.loss_by_feat( - cls_scores, - bbox_preds, - centernesses, - batch_gt_instances, - batch_img_metas, + loss_without_ignored_labels = criterion( + **atss_head.loss_by_feat( + cls_scores, + bbox_preds, + centernesses, + batch_gt_instances, + batch_img_metas, + ), ) loss_cls_without_ignored_labels = torch.sum(torch.Tensor(loss_without_ignored_labels["loss_cls"])) diff --git a/tests/unit/algo/detection/heads/test_rtdetr_decoder.py b/tests/unit/algo/detection/heads/test_rtdetr_decoder.py index 0ee54c7a937..af67b065191 100644 --- a/tests/unit/algo/detection/heads/test_rtdetr_decoder.py +++ b/tests/unit/algo/detection/heads/test_rtdetr_decoder.py @@ -5,13 +5,13 @@ import pytest import torch -from otx.algo.detection.heads.rtdetr_decoder import RTDETRTransformer +from otx.algo.detection.heads.rtdetr_decoder import RTDETRTransformerModule class TestRTDETRTransformer: @pytest.fixture() def rt_detr_transformer(self): - return RTDETRTransformer(num_classes=10, feat_channels=[128, 128, 128], num_decoder_layers=1) + return 
RTDETRTransformerModule(num_classes=10, feat_channels=[128, 128, 128], num_decoder_layers=1) @pytest.fixture() def targets(self): @@ -20,7 +20,7 @@ def targets(self): ] def test_rt_detr_transformer_init(self, rt_detr_transformer): - assert isinstance(rt_detr_transformer, RTDETRTransformer) + assert isinstance(rt_detr_transformer, RTDETRTransformerModule) assert rt_detr_transformer.num_classes == 10 assert rt_detr_transformer.aux_loss diff --git a/tests/unit/algo/detection/heads/test_rtmdet_head.py b/tests/unit/algo/detection/heads/test_rtmdet_head.py index 79d46990962..8dd1f18362c 100644 --- a/tests/unit/algo/detection/heads/test_rtmdet_head.py +++ b/tests/unit/algo/detection/heads/test_rtmdet_head.py @@ -12,7 +12,7 @@ from otx.algo.common.utils.coders import DistancePointBBoxCoder from otx.algo.common.utils.prior_generators import MlvlPointGenerator from otx.algo.common.utils.samplers import PseudoSampler -from otx.algo.detection.heads.rtmdet_head import RTMDetHead, RTMDetSepBNHead +from otx.algo.detection.heads.rtmdet_head import RTMDetHead, RTMDetSepBNHeadModule from torch import nn @@ -70,29 +70,6 @@ def test_forward(self, rtmdet_head, input_features) -> None: assert cls_score.shape[1] == rtmdet_head.num_base_priors * rtmdet_head.cls_out_channels assert bbox_pred.shape[1] == rtmdet_head.num_base_priors * 4 - def test_loss_by_feat_single(self, rtmdet_head) -> None: - # Create dummy data to simulate the inputs to the loss_by_feat_single method - cls_score = torch.rand(1, 2, 100, 80) - bbox_pred = torch.rand(1, 2, 100, 4) - labels = torch.randint(0, 80, (2, 100)) - label_weights = torch.rand(2, 100) - bbox_targets = torch.rand(2, 100, 4) - assign_metrics = torch.rand(2, 100) - stride = [8, 8] - - loss_cls, loss_bbox, _, _ = rtmdet_head.loss_by_feat_single( - cls_score, - bbox_pred, - labels, - label_weights, - bbox_targets, - assign_metrics, - stride, - ) - - assert loss_cls is not None - assert loss_bbox is not None - def test_export_by_feat(self, mocker, 
rtmdet_head) -> None: batch_size = 2 num_priors = 1 @@ -134,9 +111,9 @@ def test_get_anchors(self, rtmdet_head) -> None: assert valid_flag.dtype == torch.bool -class TestRTMDetSepBNHead: +class TestRTMDetSepBNHeadModule: @pytest.fixture() - def rtmdet_sep_bn_head(self) -> RTMDetSepBNHead: + def rtmdet_sep_bn_head(self) -> RTMDetSepBNHeadModule: train_cfg = { "assigner": DynamicSoftLabelAssigner(topk=13), "sampler": PseudoSampler(), @@ -155,7 +132,7 @@ def rtmdet_sep_bn_head(self) -> RTMDetSepBNHead: "nms_pre": 30000, }, ) - return RTMDetSepBNHead( + return RTMDetSepBNHeadModule( num_classes=80, in_channels=96, stacked_convs=2, diff --git a/tests/unit/algo/detection/heads/test_custom_ssd_head.py b/tests/unit/algo/detection/heads/test_ssd_head.py similarity index 59% rename from tests/unit/algo/detection/heads/test_custom_ssd_head.py rename to tests/unit/algo/detection/heads/test_ssd_head.py index f491301456a..a8bb1fd7c0c 100644 --- a/tests/unit/algo/detection/heads/test_custom_ssd_head.py +++ b/tests/unit/algo/detection/heads/test_ssd_head.py @@ -1,11 +1,10 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -"""Test of CustomSSDHead.""" +"""Test of SSDHead.""" from omegaconf import DictConfig -from otx.algo.common.losses import CrossEntropyLoss from otx.algo.common.utils.coders import DeltaXYWHBBoxCoder -from otx.algo.detection.heads import SSDHead +from otx.algo.detection.heads.ssd_head import SSDHeadModule from otx.algo.detection.utils.prior_generators import SSDAnchorGeneratorClustered @@ -37,7 +36,7 @@ def test_init(self, mocker) -> None: "max_per_img": 200, }, ) - self.head = SSDHead( + self.head = SSDHeadModule( anchor_generator=SSDAnchorGeneratorClustered( strides=[16, 32], widths=[ @@ -60,4 +59,29 @@ def test_init(self, mocker) -> None: train_cfg=train_cfg, test_cfg=test_cfg, ) - assert isinstance(self.head.loss_cls, CrossEntropyLoss) + + assert self.head.num_classes == 3 + assert self.head.in_channels == (96, 320) + assert 
self.head.stacked_convs == 0 + assert self.head.feat_channels == 256 + assert self.head.use_depthwise + assert self.head.cls_out_channels == 4 + assert self.head.prior_generator.strides == [(16, 16), (32, 32)] + assert self.head.prior_generator.widths == [ + [38.641007923271076, 92.49516032784699, 271.4234764938237, 141.53469410876247], + [206.04136086566515, 386.6542727907841, 716.9892752215089, 453.75609561761405, 788.4629155558277], + ] + assert self.head.prior_generator.heights == [ + [48.9243877087132, 147.73088476194903, 158.23569788707474, 324.14510379107367], + [587.6216059488938, 381.60024152086544, 323.5988913027747, 702.7486097568518, 741.4865860938451], + ] + assert self.head._init_layers() is None + assert self.head.bbox_coder.means == (0.0, 0.0, 0.0, 0.0) + assert self.head.bbox_coder.stds == (0.1, 0.1, 0.2, 0.2) + assert self.head.train_cfg == train_cfg + assert self.head.test_cfg == test_cfg + assert self.head.assigner == train_cfg["assigner"] + assert self.head.sampler is not None + assert self.head.cls_focal_loss is False + assert self.head.use_sigmoid_cls is False + assert self.head.reg_decoded_bbox is False diff --git a/tests/unit/algo/detection/heads/test_yolox_head.py b/tests/unit/algo/detection/heads/test_yolox_head.py index 36d190fb74c..65059ee15de 100644 --- a/tests/unit/algo/detection/heads/test_yolox_head.py +++ b/tests/unit/algo/detection/heads/test_yolox_head.py @@ -8,13 +8,13 @@ import torch from omegaconf import DictConfig -from otx.algo.detection.heads import YOLOXHead +from otx.algo.detection.heads.yolox_head import YOLOXHeadModule from otx.algo.detection.utils.assigners import SimOTAAssigner from otx.algo.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule from otx.algo.utils.mmengine_utils import InstanceData -class TestYOLOXHead: +class TestYOLOXHeadModule: def test_predict_by_feat(self): s = 256 img_metas = [ @@ -24,7 +24,7 @@ def test_predict_by_feat(self): }, ] test_cfg = DictConfig({"score_thr": 0.01, "nms": 
{"type": "nms", "iou_threshold": 0.65}}) - head = YOLOXHead(num_classes=4, in_channels=1, stacked_convs=1, use_depthwise=False, test_cfg=test_cfg) + head = YOLOXHeadModule(num_classes=4, in_channels=1, stacked_convs=1, use_depthwise=False, test_cfg=test_cfg) feat = [torch.rand(1, 1, s // feat_size, s // feat_size) for feat_size in [4, 8, 16]] cls_scores, bbox_preds, objectnesses = head.forward(feat) head.predict_by_feat(cls_scores, bbox_preds, objectnesses, img_metas, cfg=test_cfg, rescale=True, with_nms=True) @@ -49,7 +49,7 @@ def test_loss_by_feat(self): train_cfg = { "assigner": SimOTAAssigner(center_radius=2.5), } - head = YOLOXHead(num_classes=4, in_channels=1, stacked_convs=1, use_depthwise=False, train_cfg=train_cfg) + head = YOLOXHeadModule(num_classes=4, in_channels=1, stacked_convs=1, use_depthwise=False, train_cfg=train_cfg) assert not head.use_l1 assert isinstance(head.multi_level_cls_convs[0][0], Conv2dModule) @@ -60,47 +60,23 @@ def test_loss_by_feat(self): # background gt_instances = InstanceData(bboxes=torch.empty((0, 4)), labels=torch.LongTensor([])) - empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, [gt_instances], img_metas) - # When there is no truth, the cls loss should be nonzero but there - # should be no box loss. 
- empty_cls_loss = empty_gt_losses["loss_cls"].sum() - empty_box_loss = empty_gt_losses["loss_bbox"].sum() - empty_obj_loss = empty_gt_losses["loss_obj"].sum() - assert empty_cls_loss.item() == 0, "there should be no cls loss when there are no true boxes" - assert empty_box_loss.item() == 0, "there should be no box loss when there are no true boxes" - assert empty_obj_loss.item() > 0, "objectness loss should be non-zero" + raw_dict = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, [gt_instances], img_metas) + for key in [ + "flatten_objectness", + "flatten_cls_preds", + "flatten_bbox_preds", + "flatten_bboxes", + "obj_targets", + "cls_targets", + "bbox_targets", + "l1_targets", + "num_total_samples", + "num_pos", + "pos_masks", + ]: + assert key in raw_dict # When truth is non-empty then both cls and box loss should be nonzero # for random inputs - head = YOLOXHead(num_classes=4, in_channels=1, stacked_convs=1, use_depthwise=True, train_cfg=train_cfg) + head = YOLOXHeadModule(num_classes=4, in_channels=1, stacked_convs=1, use_depthwise=True, train_cfg=train_cfg) assert isinstance(head.multi_level_cls_convs[0][0], DepthwiseSeparableConvModule) - head.use_l1 = True - gt_instances = InstanceData( - bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), - labels=torch.LongTensor([2]), - ) - - one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, [gt_instances], img_metas) - onegt_cls_loss = one_gt_losses["loss_cls"].sum() - onegt_box_loss = one_gt_losses["loss_bbox"].sum() - onegt_obj_loss = one_gt_losses["loss_obj"].sum() - onegt_l1_loss = one_gt_losses["loss_l1"].sum() - assert onegt_cls_loss.item() > 0, "cls loss should be non-zero" - assert onegt_box_loss.item() > 0, "box loss should be non-zero" - assert onegt_obj_loss.item() > 0, "obj loss should be non-zero" - assert onegt_l1_loss.item() > 0, "l1 loss should be non-zero" - - # Test groud truth out of bound - gt_instances = InstanceData( - bboxes=torch.Tensor([[s * 4, s * 4, s * 
4 + 10, s * 4 + 10]]), - labels=torch.LongTensor([2]), - ) - empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, [gt_instances], img_metas) - # When gt_bboxes out of bound, the assign results should be empty, - # so the cls and bbox loss should be zero. - empty_cls_loss = empty_gt_losses["loss_cls"].sum() - empty_box_loss = empty_gt_losses["loss_bbox"].sum() - empty_obj_loss = empty_gt_losses["loss_obj"].sum() - assert empty_cls_loss.item() == 0, "there should be no cls loss when gt_bboxes out of bound" - assert empty_box_loss.item() == 0, "there should be no box loss when gt_bboxes out of bound" - assert empty_obj_loss.item() > 0, "objectness loss should be non-zero" diff --git a/tests/unit/algo/detection/losses/test_yolox_loss.py b/tests/unit/algo/detection/losses/test_yolox_loss.py new file mode 100644 index 00000000000..8ebafa20d5f --- /dev/null +++ b/tests/unit/algo/detection/losses/test_yolox_loss.py @@ -0,0 +1,80 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) OpenMMLab. All rights reserved. 
+ +import torch +from otx.algo.detection.heads.yolox_head import YOLOXHeadModule +from otx.algo.detection.losses import YOLOXCriterion +from otx.algo.detection.utils.assigners.sim_ota_assigner import SimOTAAssigner +from otx.algo.utils.mmengine_utils import InstanceData + + +class TestYOLOXCriterion: + def test_forward(self): + criterion = YOLOXCriterion(num_classes=4) + + s = 256 + img_metas = [ + { + "img_shape": (s, s, 3), + "scale_factor": 1, + }, + ] + train_cfg = { + "assigner": SimOTAAssigner(center_radius=2.5), + } + head = YOLOXHeadModule(num_classes=4, in_channels=1, stacked_convs=1, use_depthwise=False, train_cfg=train_cfg) + feat = [torch.rand(1, 1, s // feat_size, s // feat_size) for feat_size in [4, 8, 16]] + cls_scores, bbox_preds, objectnesses = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = InstanceData(bboxes=torch.empty((0, 4)), labels=torch.LongTensor([])) + + raw_dict = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, [gt_instances], img_metas) + empty_gt_losses = criterion(**raw_dict) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+ empty_cls_loss = empty_gt_losses["loss_cls"].sum() + empty_box_loss = empty_gt_losses["loss_bbox"].sum() + empty_obj_loss = empty_gt_losses["loss_obj"].sum() + assert empty_cls_loss.item() == 0, "there should be no cls loss when there are no true boxes" + assert empty_box_loss.item() == 0, "there should be no box loss when there are no true boxes" + assert empty_obj_loss.item() > 0, "objectness loss should be non-zero" + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = YOLOXHeadModule(num_classes=4, in_channels=1, stacked_convs=1, use_depthwise=True, train_cfg=train_cfg) + head.use_l1 = True + criterion.use_l1 = True + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([2]), + ) + + raw_dict = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, [gt_instances], img_metas) + one_gt_losses = criterion(**raw_dict) + onegt_cls_loss = one_gt_losses["loss_cls"].sum() + onegt_box_loss = one_gt_losses["loss_bbox"].sum() + onegt_obj_loss = one_gt_losses["loss_obj"].sum() + onegt_l1_loss = one_gt_losses["loss_l1"].sum() + assert onegt_cls_loss.item() > 0, "cls loss should be non-zero" + assert onegt_box_loss.item() > 0, "box loss should be non-zero" + assert onegt_obj_loss.item() > 0, "obj loss should be non-zero" + assert onegt_l1_loss.item() > 0, "l1 loss should be non-zero" + + # Test ground truth out of bound + gt_instances = InstanceData( + bboxes=torch.Tensor([[s * 4, s * 4, s * 4 + 10, s * 4 + 10]]), + labels=torch.LongTensor([2]), + ) + raw_dict = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, [gt_instances], img_metas) + empty_gt_losses = criterion(**raw_dict) + # When gt_bboxes out of bound, the assign results should be empty, + # so the cls and bbox loss should be zero. 
+ empty_cls_loss = empty_gt_losses["loss_cls"].sum() + empty_box_loss = empty_gt_losses["loss_bbox"].sum() + empty_obj_loss = empty_gt_losses["loss_obj"].sum() + assert empty_cls_loss.item() == 0, "there should be no cls loss when gt_bboxes out of bound" + assert empty_box_loss.item() == 0, "there should be no box loss when gt_bboxes out of bound" + assert empty_obj_loss.item() > 0, "objectness loss should be non-zero" diff --git a/tests/unit/algo/detection/necks/test_hybrid_encoder.py b/tests/unit/algo/detection/necks/test_hybrid_encoder.py index f67277b5a18..08d345286b8 100644 --- a/tests/unit/algo/detection/necks/test_hybrid_encoder.py +++ b/tests/unit/algo/detection/necks/test_hybrid_encoder.py @@ -4,14 +4,14 @@ """Test of HybridEncoder.""" import torch -from otx.algo.detection.necks.hybrid_encoder import HybridEncoder +from otx.algo.detection.necks.hybrid_encoder import HybridEncoderModule def test_hybrid_encoder_forward(): hidden_dim = 256 feat_strides = [8, 16, 32] in_channels = [128, 256, 512] - encoder = HybridEncoder(in_channels=in_channels, hidden_dim=hidden_dim, feat_strides=feat_strides) + encoder = HybridEncoderModule(in_channels=in_channels, hidden_dim=hidden_dim, feat_strides=feat_strides) # Create dummy input batch_size = 2 diff --git a/tests/unit/algo/detection/necks/test_yolox_pafpn.py b/tests/unit/algo/detection/necks/test_yolox_pafpn.py index fb4fd4886a9..1ee21068541 100644 --- a/tests/unit/algo/detection/necks/test_yolox_pafpn.py +++ b/tests/unit/algo/detection/necks/test_yolox_pafpn.py @@ -1,24 +1,24 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # Copyright (c) OpenMMLab. All rights reserved. -"""Test of YOLOXPAFPN. +"""Test of YOLOXPAFPNModule. 
Reference : https://github.com/open-mmlab/mmdetection/blob/v3.2.0/tests/test_models/test_necks/test_necks.py#L360-L387 """ import torch -from otx.algo.detection.necks.yolox_pafpn import YOLOXPAFPN +from otx.algo.detection.necks.yolox_pafpn import YOLOXPAFPNModule from otx.algo.modules.conv_module import DepthwiseSeparableConvModule -class TestYOLOXPAFPN: - def test_yolox_pafpn(self) -> None: +class TestYOLOXPAFPNModule: + def test_yolox_pafpn_module(self) -> None: s = 64 in_channels = [8, 16, 32, 64] feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] out_channels = 24 feats = [torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) for i in range(len(in_channels))] - neck = YOLOXPAFPN(in_channels=in_channels, out_channels=out_channels) + neck = YOLOXPAFPNModule(in_channels=in_channels, out_channels=out_channels) outs = neck(feats) assert len(outs) == len(feats) for i in range(len(feats)): @@ -26,7 +26,7 @@ def test_yolox_pafpn(self) -> None: assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) # test depth-wise - neck = YOLOXPAFPN(in_channels=in_channels, out_channels=out_channels, use_depthwise=True) + neck = YOLOXPAFPNModule(in_channels=in_channels, out_channels=out_channels, use_depthwise=True) assert isinstance(neck.downsamples[0], DepthwiseSeparableConvModule) diff --git a/tests/unit/algo/detection/test_atss.py b/tests/unit/algo/detection/test_atss.py index 4b61d22757a..74ec820e8b5 100644 --- a/tests/unit/algo/detection/test_atss.py +++ b/tests/unit/algo/detection/test_atss.py @@ -4,7 +4,7 @@ import pytest import torch -from otx.algo.detection.atss import MobileNetV2ATSS, ResNeXt101ATSS +from otx.algo.detection.atss import ATSS from otx.algo.utils.support_otx_v1 import OTXv1Helper from otx.core.data.entity.detection import DetBatchPredEntity from otx.core.exporter.native import OTXModelExporter @@ -13,7 +13,7 @@ class TestATSS: def test(self, mocker) -> None: - model = MobileNetV2ATSS(2) + model = ATSS(model_name="atss_mobilenetv2", 
label_info=2) mock_load_ckpt = mocker.patch.object(OTXv1Helper, "load_det_ckpt") model.load_from_otx_v1_ckpt({}) mock_load_ckpt.assert_called_once_with({}, "model.") @@ -21,7 +21,13 @@ def test(self, mocker) -> None: assert isinstance(model._export_parameters, TaskLevelExportParameters) assert isinstance(model._exporter, OTXModelExporter) - @pytest.mark.parametrize("model", [MobileNetV2ATSS(3), ResNeXt101ATSS(3)]) + @pytest.mark.parametrize( + "model", + [ + ATSS(model_name="atss_mobilenetv2", label_info=3), + ATSS(model_name="atss_resnext101", label_info=3), + ], + ) def test_loss(self, model, fxt_data_module): data = next(iter(fxt_data_module.train_dataloader())) data.images = [torch.randn(3, 32, 32), torch.randn(3, 48, 48)] @@ -30,7 +36,13 @@ def test_loss(self, model, fxt_data_module): assert "loss_bbox" in output assert "loss_centerness" in output - @pytest.mark.parametrize("model", [MobileNetV2ATSS(3), ResNeXt101ATSS(3)]) + @pytest.mark.parametrize( + "model", + [ + ATSS(model_name="atss_mobilenetv2", label_info=3), + ATSS(model_name="atss_resnext101", label_info=3), + ], + ) def test_predict(self, model, fxt_data_module): data = next(iter(fxt_data_module.train_dataloader())) data.images = [torch.randn(3, 32, 32), torch.randn(3, 48, 48)] @@ -38,7 +50,13 @@ def test_predict(self, model, fxt_data_module): output = model(data) assert isinstance(output, DetBatchPredEntity) - @pytest.mark.parametrize("model", [MobileNetV2ATSS(3), ResNeXt101ATSS(3)]) + @pytest.mark.parametrize( + "model", + [ + ATSS(model_name="atss_mobilenetv2", label_info=3), + ATSS(model_name="atss_resnext101", label_info=3), + ], + ) def test_export(self, model): model.eval() output = model.forward_for_tracing(torch.randn(1, 3, 32, 32)) diff --git a/tests/unit/algo/detection/test_rtdetr.py b/tests/unit/algo/detection/test_rtdetr.py index 22dc258029f..922f80b7468 100644 --- a/tests/unit/algo/detection/test_rtdetr.py +++ b/tests/unit/algo/detection/test_rtdetr.py @@ -15,7 +15,7 @@ class 
TestRTDETR: def test_customize_outputs(self, mocker): label_info = LabelInfo(["a", "b", "c"], [["a", "b", "c"]]) mocker.patch("otx.algo.detection.rtdetr.RTDETR._build_model", return_value=mocker.MagicMock()) - model = RTDETR(label_info) + model = RTDETR(model_name="rtdetr_18", label_info=label_info) model.model.load_from = None model.train() outputs = { diff --git a/tests/unit/algo/detection/test_rtmdet.py b/tests/unit/algo/detection/test_rtmdet.py index 17f4b7ecc35..b1513e6696d 100644 --- a/tests/unit/algo/detection/test_rtmdet.py +++ b/tests/unit/algo/detection/test_rtmdet.py @@ -4,29 +4,29 @@ import pytest import torch -from otx.algo.common.backbones import CSPNeXt -from otx.algo.detection.heads import RTMDetSepBNHead -from otx.algo.detection.necks import CSPNeXtPAFPN -from otx.algo.detection.rtmdet import RTMDetTiny +from otx.algo.common.backbones.cspnext import CSPNeXtModule +from otx.algo.detection.heads.rtmdet_head import RTMDetSepBNHeadModule +from otx.algo.detection.necks.cspnext_pafpn import CSPNeXtPAFPNModule +from otx.algo.detection.rtmdet import RTMDet from otx.core.data.entity.detection import DetBatchPredEntity from otx.core.exporter.native import OTXNativeModelExporter class TestRTMDet: def test_init(self) -> None: - otx_rtmdet_tiny = RTMDetTiny(label_info=3) - assert isinstance(otx_rtmdet_tiny.model.backbone, CSPNeXt) - assert isinstance(otx_rtmdet_tiny.model.neck, CSPNeXtPAFPN) - assert isinstance(otx_rtmdet_tiny.model.bbox_head, RTMDetSepBNHead) + otx_rtmdet_tiny = RTMDet(model_name="rtmdet_tiny", label_info=3) + assert isinstance(otx_rtmdet_tiny.model.backbone, CSPNeXtModule) + assert isinstance(otx_rtmdet_tiny.model.neck, CSPNeXtPAFPNModule) + assert isinstance(otx_rtmdet_tiny.model.bbox_head, RTMDetSepBNHeadModule) assert otx_rtmdet_tiny.input_size == (640, 640) def test_exporter(self) -> None: - otx_rtmdet_tiny = RTMDetTiny(label_info=3) + otx_rtmdet_tiny = RTMDet(model_name="rtmdet_tiny", label_info=3) otx_rtmdet_tiny_exporter = 
otx_rtmdet_tiny._exporter assert isinstance(otx_rtmdet_tiny_exporter, OTXNativeModelExporter) assert otx_rtmdet_tiny_exporter.swap_rgb is True - @pytest.mark.parametrize("model", [RTMDetTiny(3)]) + @pytest.mark.parametrize("model", [RTMDet(model_name="rtmdet_tiny", label_info=3)]) def test_loss(self, model, fxt_data_module): data = next(iter(fxt_data_module.train_dataloader())) data.images = [torch.randn(3, 32, 32), torch.randn(3, 48, 48)] @@ -34,7 +34,7 @@ def test_loss(self, model, fxt_data_module): assert "loss_cls" in output assert "loss_bbox" in output - @pytest.mark.parametrize("model", [RTMDetTiny(3)]) + @pytest.mark.parametrize("model", [RTMDet(model_name="rtmdet_tiny", label_info=3)]) def test_predict(self, model, fxt_data_module): data = next(iter(fxt_data_module.train_dataloader())) data.images = [torch.randn(3, 32, 32), torch.randn(3, 48, 48)] @@ -42,7 +42,7 @@ def test_predict(self, model, fxt_data_module): output = model(data) assert isinstance(output, DetBatchPredEntity) - @pytest.mark.parametrize("model", [RTMDetTiny(3)]) + @pytest.mark.parametrize("model", [RTMDet(model_name="rtmdet_tiny", label_info=3)]) def test_export(self, model): model.eval() output = model.forward_for_tracing(torch.randn(1, 3, 32, 32)) diff --git a/tests/unit/algo/detection/test_ssd.py b/tests/unit/algo/detection/test_ssd.py index e30daa163af..62b22ac5b91 100644 --- a/tests/unit/algo/detection/test_ssd.py +++ b/tests/unit/algo/detection/test_ssd.py @@ -16,7 +16,7 @@ class TestSSD: @pytest.fixture() def fxt_model(self) -> SSD: - return SSD(label_info=3) + return SSD(model_name="ssd_mobilenetv2", label_info=3) @pytest.fixture() def fxt_checkpoint(self, fxt_model, fxt_data_module, tmpdir, monkeypatch: pytest.MonkeyPatch): @@ -46,7 +46,7 @@ def test_save_and_load_anchors(self, fxt_checkpoint) -> None: assert loaded_model.model.bbox_head.anchor_generator.heights[0][0] == 50 def test_load_state_dict_pre_hook(self, fxt_model) -> None: - prev_model = SSD(2) + prev_model = 
SSD(model_name="ssd_mobilenetv2", label_info=2) state_dict = prev_model.state_dict() fxt_model.model_classes = [1, 2, 3] fxt_model.ckpt_classes = [1, 2] diff --git a/tests/unit/algo/detection/test_yolox.py b/tests/unit/algo/detection/test_yolox.py index fdb8e835ee7..82bd563fa3b 100644 --- a/tests/unit/algo/detection/test_yolox.py +++ b/tests/unit/algo/detection/test_yolox.py @@ -4,37 +4,48 @@ import pytest import torch -from otx.algo.detection.backbones import CSPDarknet -from otx.algo.detection.heads import YOLOXHead -from otx.algo.detection.necks import YOLOXPAFPN -from otx.algo.detection.yolox import YOLOXL, YOLOXS, YOLOXTINY, YOLOXX +from otx.algo.detection.backbones.csp_darknet import CSPDarknetModule +from otx.algo.detection.heads.yolox_head import YOLOXHeadModule +from otx.algo.detection.necks.yolox_pafpn import YOLOXPAFPNModule +from otx.algo.detection.yolox import YOLOX from otx.core.data.entity.detection import DetBatchPredEntity from otx.core.exporter.native import OTXNativeModelExporter class TestYOLOX: def test_init(self) -> None: - otx_yolox_l = YOLOXL(label_info=3) - assert isinstance(otx_yolox_l.model.backbone, CSPDarknet) - assert isinstance(otx_yolox_l.model.neck, YOLOXPAFPN) - assert isinstance(otx_yolox_l.model.bbox_head, YOLOXHead) + otx_yolox_l = YOLOX(model_name="yolox_l", label_info=3) + assert isinstance(otx_yolox_l.model.backbone, CSPDarknetModule) + assert isinstance(otx_yolox_l.model.neck, YOLOXPAFPNModule) + assert isinstance(otx_yolox_l.model.bbox_head, YOLOXHeadModule) assert otx_yolox_l.input_size == (640, 640) - otx_yolox_tiny = YOLOXTINY(label_info=3) + otx_yolox_tiny = YOLOX(model_name="yolox_tiny", label_info=3) + assert otx_yolox_tiny.input_size == (640, 640) + + otx_yolox_tiny = YOLOX(model_name="yolox_tiny", label_info=3, input_size=(416, 416)) assert otx_yolox_tiny.input_size == (416, 416) def test_exporter(self) -> None: - otx_yolox_l = YOLOXL(label_info=3) + otx_yolox_l = YOLOX(model_name="yolox_l", label_info=3) 
otx_yolox_l_exporter = otx_yolox_l._exporter assert isinstance(otx_yolox_l_exporter, OTXNativeModelExporter) assert otx_yolox_l_exporter.swap_rgb is True - otx_yolox_tiny = YOLOXTINY(label_info=3) + otx_yolox_tiny = YOLOX(model_name="yolox_tiny", label_info=3) otx_yolox_tiny_exporter = otx_yolox_tiny._exporter assert isinstance(otx_yolox_tiny_exporter, OTXNativeModelExporter) assert otx_yolox_tiny_exporter.swap_rgb is False - @pytest.mark.parametrize("model", [YOLOXTINY(3), YOLOXS(3), YOLOXL(3), YOLOXX(3)]) + @pytest.mark.parametrize( + "model", + [ + YOLOX(model_name="yolox_tiny", label_info=3), + YOLOX(model_name="yolox_s", label_info=3), + YOLOX(model_name="yolox_l", label_info=3), + YOLOX(model_name="yolox_x", label_info=3), + ], + ) def test_loss(self, model, fxt_data_module): data = next(iter(fxt_data_module.train_dataloader())) data.images = [torch.randn(3, 32, 32), torch.randn(3, 48, 48)] @@ -43,7 +54,15 @@ def test_loss(self, model, fxt_data_module): assert "loss_bbox" in output assert "loss_obj" in output - @pytest.mark.parametrize("model", [YOLOXTINY(3), YOLOXS(3), YOLOXL(3), YOLOXX(3)]) + @pytest.mark.parametrize( + "model", + [ + YOLOX(model_name="yolox_tiny", label_info=3), + YOLOX(model_name="yolox_s", label_info=3), + YOLOX(model_name="yolox_l", label_info=3), + YOLOX(model_name="yolox_x", label_info=3), + ], + ) def test_predict(self, model, fxt_data_module): data = next(iter(fxt_data_module.train_dataloader())) data.images = [torch.randn(3, 32, 32), torch.randn(3, 48, 48)] @@ -51,7 +70,15 @@ def test_predict(self, model, fxt_data_module): output = model(data) assert isinstance(output, DetBatchPredEntity) - @pytest.mark.parametrize("model", [YOLOXTINY(3), YOLOXS(3), YOLOXL(3), YOLOXX(3)]) + @pytest.mark.parametrize( + "model", + [ + YOLOX(model_name="yolox_tiny", label_info=3), + YOLOX(model_name="yolox_s", label_info=3), + YOLOX(model_name="yolox_l", label_info=3), + YOLOX(model_name="yolox_x", label_info=3), + ], + ) def test_export(self, model): 
model.eval() output = model.forward_for_tracing(torch.randn(1, 3, 32, 32)) diff --git a/tests/unit/core/data/test_tiling.py b/tests/unit/core/data/test_tiling.py index 0eff56af3a0..eaa9ba17834 100644 --- a/tests/unit/core/data/test_tiling.py +++ b/tests/unit/core/data/test_tiling.py @@ -14,7 +14,7 @@ from datumaro import Dataset as DmDataset from datumaro import Polygon from omegaconf import DictConfig, OmegaConf -from otx.algo.detection.atss import MobileNetV2ATSS +from otx.algo.detection.atss import ATSS from otx.algo.instance_segmentation.maskrcnn import MaskRCNNEfficientNet from otx.core.config.data import ( SubsetConfig, @@ -342,7 +342,8 @@ def test_val_dataloader(self, fxt_det_data_config) -> None: assert isinstance(batch, TileBatchDetDataEntity) def test_det_tile_merge(self, fxt_det_data_config): - model = MobileNetV2ATSS( + model = ATSS( + model_name="atss_mobilenetv2", label_info=3, ) # updated from OTXDetectionModel to avoid NotImplementedError in _build_model # Enable tile adapter @@ -360,7 +361,8 @@ def test_det_tile_merge(self, fxt_det_data_config): model.forward_tiles(batch) def test_explain_det_tile_merge(self, fxt_det_data_config): - model = MobileNetV2ATSS( + model = ATSS( + model_name="atss_mobilenetv2", label_info=3, ) # updated from OTXDetectionModel to avoid NotImplementedError in _build_model # Enable tile adapter diff --git a/tests/unit/core/model/test_detection.py b/tests/unit/core/model/test_detection.py index 3cc654e8750..4ce95dd3635 100644 --- a/tests/unit/core/model/test_detection.py +++ b/tests/unit/core/model/test_detection.py @@ -13,7 +13,7 @@ from importlib_resources import files from lightning.pytorch.cli import ReduceLROnPlateau from omegaconf import OmegaConf -from otx.algo.detection.atss import MobileNetV2ATSS +from otx.algo.detection.atss import ATSS from otx.algo.explain.explain_algo import feature_vector_fn from otx.core.metrics.fmeasure import FMeasureCallable from otx.core.types.export import TaskLevelExportParameters @@ 
-54,8 +54,8 @@ def config(self) -> DictConfig: return OmegaConf.load(cfg_path) @pytest.fixture() - def otx_model(self) -> MobileNetV2ATSS: - return MobileNetV2ATSS(label_info=1) + def otx_model(self) -> ATSS: + return ATSS(model_name="atss_mobilenetv2", label_info=1) def test_configure_metric_with_ckpt( self, @@ -63,8 +63,9 @@ def test_configure_metric_with_ckpt( mock_scheduler, mock_ckpt, ) -> None: - model = MobileNetV2ATSS( - label_info=1, + model = ATSS( + model_name="atss_mobilenetv2", + label_info=2, torch_compile=False, optimizer=mock_optimizer, scheduler=mock_scheduler, @@ -133,7 +134,7 @@ def test_export_parameters(self, otx_model): assert isinstance(parameters, TaskLevelExportParameters) assert parameters.task_type == "detection" - def test_dummy_input(self, otx_model: MobileNetV2ATSS): + def test_dummy_input(self, otx_model: ATSS): batch_size = 2 batch = otx_model.get_dummy_input(batch_size) assert batch.batch_size == batch_size