openvinotoolkit · sungmanc · Dec 11, 2023 · Dec 7, 2023 · Dec 7, 2023 · Dec 7, 2023
@@ -235,7 +235,7 @@ exclude = [
 
     # it will be cleaned up later
     "src/otx/core/engine/utils/*",
-    "src/otx/algo/classification/model/backbones/*",
+    "src/otx/algo/classification/backbones/*",
 ]
 
 # Same as Black.

@@ -1,8 +1,9 @@
 # Copyright (C) 2023 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
-"""Module for OTX classification."""
+"""Module for OTX classification models."""
 
-from . import model
+from . import backbones
+from .otx_dino_v2 import DINOv2, DINOv2RegisterClassifier
 
-__all__ = ["model"]
+__all__ = ["backbones", "DINOv2", "DINOv2RegisterClassifier"]
@@ -1,6 +1,6 @@
 # Copyright (C) 2023 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-#
+
 """EfficientNetV2 model.
 
 Original papers:

@@ -0,0 +1,132 @@
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""DINO-V2 model for the OTX classification."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import torch
+from torch import nn
+
+from otx.core.data.entity.base import OTXBatchLossEntity
+from otx.core.data.entity.classification import MulticlassClsBatchDataEntity, MulticlassClsBatchPredEntity
+from otx.core.model.entity.classification import OTXClassificationModel
+
+if TYPE_CHECKING:
+    from omegaconf import DictConfig
+
+class DINOv2(nn.Module):
+    """DINO-v2 Model."""
+    def __init__(
+        self,
+        backbone_name: str,
+        freeze_backbone: bool,
+        head_in_channels: int,
+        num_classes: int,
+        training: bool,
+    ):
+        super().__init__()
+        self.backbone = torch.hub.load(
+            repo_or_dir="facebookresearch/dinov2",
+            model=backbone_name,
+        )
+
+        if freeze_backbone:
+            self._freeze_backbone(self.backbone)
+
+        self.head = nn.Linear(
+            head_in_channels,
+            num_classes,
+        )
+
+        self.loss = nn.CrossEntropyLoss()
+        self.softmax = nn.Softmax()
+
+        self.training = training
+
+    def _freeze_backbone(self, backbone: nn.Module) -> None:
+        """Freeze the backbone."""
+        for _, v in backbone.named_parameters():
+            v.requires_grad = False
+
+    def forward(self, imgs: torch.Tensor, labels: torch.Tensor = None) -> torch.Tensor:
+        """Forward function."""
+        feats = self.backbone(imgs)
+        logits = self.head(feats)
+        if self.training:
+            return self.loss(logits, labels)
+        return self.softmax(logits)
+
+class DINOv2RegisterClassifier(OTXClassificationModel):
+    """DINO-v2 Classification Model with register."""
+    def __init__(self, config: DictConfig) -> None:
+        self.config = config
+        super().__init__() # create the model
+
+        # NOTE,
+        # We've decided to use MMpretrain's pipeline for this model
+        # It's hard to use ClsDataPreprocessor since the model is not related to MMpretrain
+        # That's the reason why I implemented the below preprocess things
+        self.data_preprocess_cfg = self.config.data_preprocess
+        self.register_buffer(
+            'mean', torch.tensor(self.data_preprocess_cfg.mean).view(-1, 1, 1), False,
+        )
+        self.register_buffer(
+            'std', torch.tensor(self.data_preprocess_cfg.std).view(-1, 1, 1), False,
+        )
+
+    def _create_model(self) -> nn.Module:
+        """Create the model."""
+        return DINOv2(
+            backbone_name=self.config.backbone.name,
+            freeze_backbone=self.config.backbone.frozen,
+            head_in_channels=self.config.head.in_channels,
+            num_classes=self.config.head.num_classes,
+            training=self.training,
+        )
+
+    def _preprocess_img(self, imgs: torch.Tensor) -> torch.Tensor:
+        """Control normalize and BGR/RGB conversion."""
+        # BGR -> RGB
+        if self.data_preprocess_cfg.to_rgb and imgs.size(1) == 3:
+            imgs = imgs.flip(1)
+        return (imgs - self.mean) / self.std
+
+
+    def _customize_inputs(self, entity: MulticlassClsBatchDataEntity) -> dict[str, Any]:
+        """Customize the inputs for the model."""
+        inputs: dict[str, Any] = {}
+        inputs["imgs"] = self._preprocess_img(torch.stack(entity.images))
+        inputs["labels"] = torch.cat(entity.labels)
+        return inputs
+
+    def _customize_outputs(
+        self,
+        outputs: Any, # noqa: ANN401
+        inputs: MulticlassClsBatchDataEntity,
+    ) -> MulticlassClsBatchPredEntity | OTXBatchLossEntity:
+        """Customize the outputs for the model."""
+        if self.training:
+            if not isinstance(outputs, torch.Tensor):
+                raise TypeError(outputs)
+
+            losses = OTXBatchLossEntity()
+            losses["loss"] = outputs
+            return losses
+
+        max_pred_elements, max_pred_idxs = torch.max(outputs, dim=1)
+        pred_scores = max_pred_elements
+        pred_labels = max_pred_idxs
+
+        scores = torch.unbind(pred_scores, dim=0)
+        labels = torch.unbind(pred_labels, dim=0)
+
+        return MulticlassClsBatchPredEntity(
+            batch_size=pred_labels.shape[0],
+            images=inputs.images,
+            imgs_info=inputs.imgs_info,
+            scores=scores,
+            labels=labels,
+        )
@@ -0,0 +1,12 @@
+optimizer:
+  _target_: torch.optim.Adam
+  _partial_: true
+  lr: 1e-3
+  weight_decay: 0.0
+
+scheduler:
+  _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
+  _partial_: true
+  mode: min
+  factor: 0.1
+  patience: 10
@@ -1,17 +1,7 @@
-_target_: otx.core.model.module.detection.OTXDetectionLitModule
-
-optimizer:
-  _target_: torch.optim.Adam
-  _partial_: true
-  lr: 1e-3
-  weight_decay: 0.0
+defaults:
+  - default
 
-scheduler:
-  _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
-  _partial_: true
-  mode: min
-  factor: 0.1
-  patience: 10
+_target_: otx.core.model.module.detection.OTXDetectionLitModule
 
 otx_model:
   _target_: otx.core.model.entity.detection.MMDetCompatibleModel

@@ -1,17 +1,7 @@
-_target_: otx.core.model.module.classification.OTXClassificationLitModule
-
-optimizer:
-  _target_: torch.optim.Adam
-  _partial_: true
-  lr: 1e-3
-  weight_decay: 0.0
+defaults:
+  - default
 
-scheduler:
-  _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
-  _partial_: true
-  mode: min
-  factor: 0.1
-  patience: 10
+_target_: otx.core.model.module.classification.OTXClassificationLitModule
 
 otx_model:
   _target_: otx.core.model.entity.classification.MMPretrainCompatibleModel

@@ -0,0 +1,11 @@
+defaults:
+  - default
+
+_target_: otx.core.model.module.classification.OTXClassificationLitModule
+
+otx_model:
+  _target_: otx.core.model.entity.classification.OTXClassificationModel
+  config: ???
+
+# Set to true to compile the model for faster training (requires PyTorch 2.0+)
+torch_compile: false
@@ -0,0 +1,63 @@
+# @package _global_
+defaults:
+  - override /base: classification
+  - override /callbacks: classification
+  - override /data: mmpretrain
+  - override /model: torch_classification
+data:
+  train_subset:
+    batch_size: 64
+    transforms:
+      - type: LoadImageFromFile
+        to_float32: true
+      - backend: cv2
+        scale: 224
+        type: RandomResizedCrop
+      - type: PackInputs
+  val_subset:
+    batch_size: 64
+    transforms:
+      - type: LoadImageFromFile
+        to_float32: true
+      - backend: cv2
+        edge: short
+        scale: 256
+        type: ResizeEdge
+      - crop_size: 224
+        type: CenterCrop
+      - type: PackInputs
+  test_subset:
+    batch_size: 64
+    transforms:
+      - type: LoadImageFromFile
+        to_float32: true
+      - backend: cv2
+        edge: short
+        scale: 256
+        type: ResizeEdge
+      - crop_size: 224
+        type: CenterCrop
+      - type: PackInputs
+model:
+  otx_model:
+    _target_: otx.algo.classification.otx_dino_v2.DINOv2RegisterClassifier
+    config:
+      backbone:
+        name: dinov2_vits14_reg
+        frozen: true
+      head:
+        in_channels: 384
+        num_classes: 1000
+      data_preprocess:
+        mean:
+          - 123.675
+          - 116.28
+          - 103.53
+        std:
+          - 58.395
+          - 57.12
+          - 57.375
+        to_rgb: True
+  optimizer:
+    _target_: torch.optim.AdamW
+    lr: 1e-5
@@ -21,7 +21,6 @@
         "data_dir": "tests/assets/classification_dataset",
         "overrides": [
             "model.otx_model.config.head.num_classes=2",
-            "model.otx_model.config.data_preprocessor.num_classes=2",
         ],
     },
     "detection": {

@@ -0,0 +1,2 @@
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,51 @@
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+from __future__ import annotations
+
+import pytest
+import torch
+from omegaconf import DictConfig
+from torchvision import tv_tensors
+
+from src.otx.core.data.entity.base import ImageInfo
+from src.otx.core.data.entity.classification import MulticlassClsBatchDataEntity
+
+
+@pytest.fixture()
+def fxt_multiclass_cls_batch_data_entity() -> MulticlassClsBatchDataEntity:
+    batch_size = 2
+    random_tensor = torch.randn((batch_size, 3, 224, 224))
+    tv_tensor = tv_tensors.Image(data=random_tensor)
+    img_infos = [ImageInfo(
+        img_idx=i,
+        img_shape=(224, 224),
+        ori_shape=(224, 224),
+        pad_shape=(0, 0),
+        scale_factor=(1.0, 1.0),
+    ) for i in range(batch_size)]
+    return MulticlassClsBatchDataEntity(
+        batch_size=2,
+        images=tv_tensor,
+        imgs_info=img_infos,
+        labels=[torch.tensor([0]), torch.tensor([1])],
+    )
+
+@pytest.fixture()
+def fxt_config_mock() -> DictConfig:
+    pseudo_model_config = {
+        "backbone": {
+            "name": "dinov2_vits14_reg",
+            "frozen": False,
+        },
+        "head":{
+            "in_channels": 384,
+            "num_classes": 2,
+        },
+        "data_preprocess":{
+            "mean": [1, 1, 1],
+            "std": [1, 1, 1],
+            "to_rgb": True,
+        },
+    }
+    return DictConfig(pseudo_model_config)
@@ -0,0 +1,32 @@
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import torch
+from otx.algo.classification import DINOv2
+
+
+class TestDINOv2:
+    def setup(self) -> None:
+        self.model = DINOv2(
+            backbone_name="dinov2_vits14_reg",
+            freeze_backbone=True,
+            head_in_channels=384,
+            num_classes=2,
+            training=True,
+        )
+
+    def test_freeze_backbone(self) -> None:
+        for _, v in self.model.backbone.named_parameters():
+            assert v.requires_grad is False
+
+    def test_forward(self) -> None:
+        rand_img = torch.randn([1, 3, 224, 224], dtype=torch.float32)
+        rand_label = torch.ones([1], dtype=torch.int64)
+        outputs = self.model(rand_img, rand_label)
+        assert isinstance(outputs, torch.Tensor)
+
+        self.model.training = False
+        outputs = self.model(rand_img, rand_label)
+        assert torch.sum(outputs) == 1.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Copyright (C) 2023 Intel Corporation
		# SPDX-License-Identifier: Apache-2.0