Skip to content

Commit

Permalink
Decoupling mmaction for action classification (MoviNet, X3D) (#3582)
Browse files Browse the repository at this point in the history
* first draft

* first draft

* temp

* temp2

* enable movinet

* decouple data pipeline

* temp

* enable x3d for action cls

* fix mypy error

* fix mypy error

* fix unit tests

* fix unit tests

* fix integ tests

* update _forward for recognizer

* correct mean/std for x3d

* remove action data sample

* update perf test action

* add more comments

* remove data preprocessor for action task

* remove mypy exception

* remove mmengine config

* remove mmengine config

* fix types

* add docstrings
  • Loading branch information
wonjuleee authored Jun 10, 2024
1 parent 1e1a9c5 commit af03ae0
Show file tree
Hide file tree
Showing 41 changed files with 2,413 additions and 232 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ dependencies = [
"importlib_resources==6.4.0",
"docstring_parser==0.16", # CLI help-formatter
"rich_argparse==1.4.0", # CLI help-formatter
"einops==0.7.0",
]

[project.optional-dependencies]
Expand Down
16 changes: 12 additions & 4 deletions src/otx/algo/action_classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,17 @@
#
"""Module for OTX action classification models."""

from .backbones import OTXMoViNet
from .heads import MoViNetHead
from .backbones import MoViNetBackbone, X3DBackbone
from .heads import MoViNetHead, X3DHead
from .openvino_model import OTXOVActionCls
from .recognizers import MoViNetRecognizer, OTXRecognizer3D
from .recognizers import BaseRecognizer, MoViNetRecognizer

__all__ = ["OTXOVActionCls", "OTXRecognizer3D", "OTXMoViNet", "MoViNetHead", "MoViNetRecognizer"]
__all__ = [
"OTXOVActionCls",
"BaseRecognizer",
"MoViNetBackbone",
"MoViNetHead",
"MoViNetRecognizer",
"X3DBackbone",
"X3DHead",
]
5 changes: 3 additions & 2 deletions src/otx/algo/action_classification/backbones/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
"""Custom backbones for action classification."""

from .movinet import OTXMoViNet
from .movinet import MoViNetBackbone
from .x3d import X3DBackbone

__all__ = ["OTXMoViNet"]
__all__ = ["MoViNetBackbone", "X3DBackbone"]
101 changes: 50 additions & 51 deletions src/otx/algo/action_classification/backbones/movinet.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
"""Code modified from: https://github.com/Atze00/MoViNet-pytorch/blob/main/movinets/models.py."""
# Copyright (c) OpenMMLab. All rights reserved.

"""Code modified from: https://github.com/Atze00/MoViNet-pytorch/blob/main/movinets/models.py."""
from __future__ import annotations

from collections import OrderedDict
Expand All @@ -10,8 +11,7 @@
import torch
import torch.nn.functional as F # noqa: N812
from einops import rearrange
from mmaction.models import MODELS
from mmengine.config import Config
from omegaconf.dictconfig import DictConfig
from torch import Tensor, nn
from torch.nn.modules.utils import _pair, _triple

Expand Down Expand Up @@ -438,7 +438,7 @@ class BasicBneck(nn.Module):
"""Basic bottleneck block of MoViNet network.
Args:
cfg (Config): Configuration object containing block's hyperparameters.
cfg (DictConfig): configuration object containing block's hyperparameters.
tf_like (bool): A boolean indicating whether to use TensorFlow like convolution
padding or not.
conv_type (str): A string indicating the type of convolutional layer to use.
Expand All @@ -460,7 +460,7 @@ class BasicBneck(nn.Module):

def __init__(
self,
cfg: Config,
cfg: DictConfig,
tf_like: bool,
conv_type: str,
norm_layer: Callable[..., nn.Module] | None = None,
Expand Down Expand Up @@ -543,11 +543,11 @@ def forward(self, x: Tensor) -> Tensor:
return residual + self.alpha * x


class MoViNet(nn.Module):
class MoViNetBackboneBase(nn.Module):
"""MoViNet class used for video classification.
Args:
cfg (Config): Configuration object containing network's hyperparameters.
cfg (DictConfig): configuration object containing network's hyperparameters.
conv_type (str, optional): A string indicating the type of convolutional layer
to use. Can be "2d" or "3d". Defaults to "3d".
tf_like (bool, optional): A boolean indicating whether to use TensorFlow like
Expand All @@ -569,7 +569,7 @@ class MoViNet(nn.Module):

def __init__(
self,
cfg: Config,
cfg: DictConfig,
conv_type: str = "3d",
tf_like: bool = False,
) -> None:
Expand Down Expand Up @@ -650,70 +650,69 @@ def init_weights(self) -> None:
self.apply(self._init_weights)


@MODELS.register_module()
class OTXMoViNet(MoViNet):
class MoViNetBackbone(MoViNetBackboneBase):
"""MoViNet wrapper class for OTX."""

def __init__(self, **kwargs):
cfg = Config()
def __init__(self, **kwargs) -> None:
cfg = DictConfig({})
cfg.name = "A0"
cfg.conv1 = Config()
OTXMoViNet.fill_conv(cfg.conv1, 3, 8, (1, 3, 3), (1, 2, 2), (0, 1, 1))
cfg.conv1 = DictConfig({})
MoViNetBackbone.fill_conv(cfg.conv1, 3, 8, (1, 3, 3), (1, 2, 2), (0, 1, 1))

cfg.blocks = [
[Config()],
[Config() for _ in range(3)],
[Config() for _ in range(3)],
[Config() for _ in range(4)],
[Config() for _ in range(4)],
[DictConfig({})],
[DictConfig({}) for _ in range(3)],
[DictConfig({}) for _ in range(3)],
[DictConfig({}) for _ in range(4)],
[DictConfig({}) for _ in range(4)],
]

# block 2
OTXMoViNet.fill_se_config(cfg.blocks[0][0], 8, 8, 24, (1, 5, 5), (1, 2, 2), (0, 2, 2), (0, 1, 1))
MoViNetBackbone.fill_se_config(cfg.blocks[0][0], 8, 8, 24, (1, 5, 5), (1, 2, 2), (0, 2, 2), (0, 1, 1))

# block 3
OTXMoViNet.fill_se_config(cfg.blocks[1][0], 8, 32, 80, (3, 3, 3), (1, 2, 2), (1, 0, 0), (0, 0, 0))
OTXMoViNet.fill_se_config(cfg.blocks[1][1], 32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))
OTXMoViNet.fill_se_config(cfg.blocks[1][2], 32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))
MoViNetBackbone.fill_se_config(cfg.blocks[1][0], 8, 32, 80, (3, 3, 3), (1, 2, 2), (1, 0, 0), (0, 0, 0))
MoViNetBackbone.fill_se_config(cfg.blocks[1][1], 32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))
MoViNetBackbone.fill_se_config(cfg.blocks[1][2], 32, 32, 80, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))

# block 4
OTXMoViNet.fill_se_config(cfg.blocks[2][0], 32, 56, 184, (5, 3, 3), (1, 2, 2), (2, 0, 0), (0, 0, 0))
OTXMoViNet.fill_se_config(cfg.blocks[2][1], 56, 56, 112, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))
OTXMoViNet.fill_se_config(cfg.blocks[2][2], 56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))
MoViNetBackbone.fill_se_config(cfg.blocks[2][0], 32, 56, 184, (5, 3, 3), (1, 2, 2), (2, 0, 0), (0, 0, 0))
MoViNetBackbone.fill_se_config(cfg.blocks[2][1], 56, 56, 112, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))
MoViNetBackbone.fill_se_config(cfg.blocks[2][2], 56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))

# block 5
OTXMoViNet.fill_se_config(cfg.blocks[3][0], 56, 56, 184, (5, 3, 3), (1, 1, 1), (2, 1, 1), (0, 1, 1))
OTXMoViNet.fill_se_config(cfg.blocks[3][1], 56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))
OTXMoViNet.fill_se_config(cfg.blocks[3][2], 56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))
OTXMoViNet.fill_se_config(cfg.blocks[3][3], 56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))
MoViNetBackbone.fill_se_config(cfg.blocks[3][0], 56, 56, 184, (5, 3, 3), (1, 1, 1), (2, 1, 1), (0, 1, 1))
MoViNetBackbone.fill_se_config(cfg.blocks[3][1], 56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))
MoViNetBackbone.fill_se_config(cfg.blocks[3][2], 56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))
MoViNetBackbone.fill_se_config(cfg.blocks[3][3], 56, 56, 184, (3, 3, 3), (1, 1, 1), (1, 1, 1), (0, 1, 1))

# block 6
OTXMoViNet.fill_se_config(cfg.blocks[4][0], 56, 104, 384, (5, 3, 3), (1, 2, 2), (2, 1, 1), (0, 1, 1))
OTXMoViNet.fill_se_config(cfg.blocks[4][1], 104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1))
OTXMoViNet.fill_se_config(cfg.blocks[4][2], 104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1))
OTXMoViNet.fill_se_config(cfg.blocks[4][3], 104, 104, 344, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1))
MoViNetBackbone.fill_se_config(cfg.blocks[4][0], 56, 104, 384, (5, 3, 3), (1, 2, 2), (2, 1, 1), (0, 1, 1))
MoViNetBackbone.fill_se_config(cfg.blocks[4][1], 104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1))
MoViNetBackbone.fill_se_config(cfg.blocks[4][2], 104, 104, 280, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1))
MoViNetBackbone.fill_se_config(cfg.blocks[4][3], 104, 104, 344, (1, 5, 5), (1, 1, 1), (0, 2, 2), (0, 1, 1))

cfg.conv7 = Config()
OTXMoViNet.fill_conv(cfg.conv7, 104, 480, (1, 1, 1), (1, 1, 1), (0, 0, 0))
cfg.conv7 = DictConfig({})
MoViNetBackbone.fill_conv(cfg.conv7, 104, 480, (1, 1, 1), (1, 1, 1), (0, 0, 0))

cfg.dense9 = Config({"hidden_dim": 2048})
cfg.dense9 = DictConfig({"hidden_dim": 2048})
super().__init__(cfg)

@staticmethod
def fill_se_config(
conf: Config,
conf: DictConfig,
input_channels: int,
out_channels: int,
expanded_channels: int,
kernel_size: tuple[int, int],
stride: tuple[int, int],
padding: tuple[int, int],
padding_avg: tuple[int, int],
kernel_size: tuple[int, int, int],
stride: tuple[int, int, int],
padding: tuple[int, int, int],
padding_avg: tuple[int, int, int],
) -> None:
"""Set the values of a given Config object to SE module.
"""Set the values of a given DictConfig object to SE module.
Args:
conf (Config): The Config object to be updated.
conf (DictConfig): The DictConfig object to be updated.
input_channels (int): The number of input channels.
out_channels (int): The number of output channels.
expanded_channels (int): The number of channels after expansion in the basic block.
Expand All @@ -727,7 +726,7 @@ def fill_se_config(
"""
conf.expanded_channels = expanded_channels
conf.padding_avg = padding_avg
OTXMoViNet.fill_conv(
MoViNetBackbone.fill_conv(
conf,
input_channels,
out_channels,
Expand All @@ -738,17 +737,17 @@ def fill_se_config(

@staticmethod
def fill_conv(
conf: Config,
conf: DictConfig,
input_channels: int,
out_channels: int,
kernel_size: tuple[int, int],
stride: tuple[int, int],
padding: tuple[int, int],
kernel_size: tuple[int, int, int],
stride: tuple[int, int, int],
padding: tuple[int, int, int],
) -> None:
"""Set the values of a given Config object to conv layer.
"""Set the values of a given DictConfig object to conv layer.
Args:
conf (Config): The Config object to be updated.
conf (DictConfig): The DictConfig object to be updated.
input_channels (int): The number of input channels.
out_channels (int): The number of output channels.
kernel_size (tuple[int]): The size of the kernel.
Expand Down
Loading

0 comments on commit af03ae0

Please sign in to comment.