Add mmcls.VisionTransformer backbone support #1908

Merged
2 changes: 1 addition & 1 deletion otx/algorithms/classification/configs/configuration.yaml
@@ -10,7 +10,7 @@ learning_parameters:
stable. A larger batch size has higher memory requirements.
editable: true
header: Batch size
max_value: 512
max_value: 2048
min_value: 1
type: INTEGER
ui_rules:
2 changes: 1 addition & 1 deletion otx/algorithms/common/configs/training_base.py
@@ -65,7 +65,7 @@ class BaseLearningParameters(ParameterGroup):
batch_size = configurable_integer(
default_value=5,
min_value=1,
max_value=512,
max_value=2048,
header="Batch size",
description="The number of training samples seen in each iteration of training. Increasing thisvalue "
"improves training time and may make the training more stable. A larger batch size has higher "
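Note: both the YAML schema and the Python parameter definition raise the batch-size ceiling from 512 to 2048. A minimal sketch of how a requested batch size could be validated against the YAML bounds; the `learning_parameters.batch_size` key layout is assumed from the hunk above, not taken from the full file:

```python
import yaml  # pyyaml

def validate_batch_size(config_path: str, requested: int) -> int:
    """Check a requested batch size against the min/max bounds in the YAML schema."""
    with open(config_path) as f:
        schema = yaml.safe_load(f)
    # Assumed nesting: learning_parameters -> batch_size -> {min_value, max_value, ...}
    bounds = schema["learning_parameters"]["batch_size"]
    lo, hi = bounds["min_value"], bounds["max_value"]  # now 1 and 2048
    if not lo <= requested <= hi:
        raise ValueError(f"batch_size {requested} is outside [{lo}, {hi}]")
    return requested

# e.g. validate_batch_size("otx/algorithms/classification/configs/configuration.yaml", 1024)
```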
8 changes: 4 additions & 4 deletions otx/cli/builder/supported_backbone/mmcls.json
@@ -11,7 +11,7 @@
"options": {
"arch": ["tiny", "small", "base"]
},
"available": []
"available": ["CLASSIFICATION"]
},
"mmcls.ConvMixer": {
"required": ["arch"],
@@ -287,7 +287,7 @@
"mmcls.T2T_ViT": {
"required": [],
"options": {},
"available": []
"available": ["CLASSIFICATION"]
},
"mmcls.TIMMBackbone": {
"required": ["model_name"],
@@ -299,7 +299,7 @@
"options": {
"arch": ["base", "small"]
},
"available": []
"available": ["CLASSIFICATION"]
},
"mmcls.PCPVT": {
"required": ["arch"],
@@ -341,7 +341,7 @@
"deit-base"
]
},
"available": []
"available": ["CLASSIFICATION"]
}
}
}
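Note: these entries flip `available` from `[]` to `["CLASSIFICATION"]`, which is what marks the transformer backbones as selectable for classification tasks. A hedged sketch of how the registry could be queried for classification-capable backbones; the top-level `backbones` key is an assumption based on the file's nesting, not confirmed by this diff:

```python
import json
from typing import List

def classification_backbones(registry_path: str) -> List[str]:
    """List backbone names whose 'available' field includes CLASSIFICATION."""
    with open(registry_path) as f:
        registry = json.load(f)
    # Fall back to the root dict if there is no "backbones" wrapper (assumption).
    backbones = registry.get("backbones", registry)
    return [
        name
        for name, meta in backbones.items()
        if "CLASSIFICATION" in meta.get("available", [])
    ]

# e.g. classification_backbones("otx/cli/builder/supported_backbone/mmcls.json")
# should now include the transformer backbones enabled in this diff.
```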
7 changes: 7 additions & 0 deletions otx/mpa/cls/stage.py
@@ -15,6 +15,8 @@

logger = get_logger()

TRANSFORMER_BACKBONES = ["VisionTransformer", "T2T_ViT", "TNT", "Conformer"]


class ClsStage(Stage):
MODEL_BUILDER = build_classifier
@@ -89,6 +91,11 @@ def configure_in_channel(cfg):
output = layer(torch.rand([1] + list(input_shape)))
if isinstance(output, (tuple, list)):
output = output[-1]

if layer.__class__.__name__ in TRANSFORMER_BACKBONES and isinstance(output, (tuple, list)):
# mmcls.VisionTransformer outputs a Tuple[List[...]]; the last element is the final logit.
_, output = output

in_channels = output.shape[1]
if cfg.model.get("neck") is not None:
if cfg.model.neck.get("in_channels") is not None:
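Note: the new branch handles transformer backbones whose probed output is a `(patch_token, cls_token)` pair rather than a plain feature map, so `in_channels` is taken from the class token. A self-contained sketch of the probing logic; the `VisionTransformer` class below is a toy stand-in (named that way only so the class-name check fires), not the real `mmcls` implementation, and the token shapes are made up:

```python
import torch
import torch.nn as nn

TRANSFORMER_BACKBONES = ["VisionTransformer", "T2T_ViT", "TNT", "Conformer"]

class VisionTransformer(nn.Module):
    """Toy stand-in for a ViT-style backbone: returns [(patch_tokens, cls_token)]."""
    def __init__(self, embed_dim: int = 192):
        super().__init__()
        self.embed_dim = embed_dim

    def forward(self, x):
        b = x.shape[0]
        patch_tokens = torch.rand(b, 196, self.embed_dim)
        cls_token = torch.rand(b, self.embed_dim)
        return [(patch_tokens, cls_token)]  # one entry per output stage

def probe_in_channels(layer: nn.Module, input_shape=(3, 224, 224)) -> int:
    """Run a dummy forward pass and infer in_channels, mirroring the patched logic."""
    output = layer(torch.rand(1, *input_shape))
    if isinstance(output, (tuple, list)):
        output = output[-1]  # keep the last stage
    if layer.__class__.__name__ in TRANSFORMER_BACKBONES and isinstance(output, (tuple, list)):
        _, output = output   # drop patch tokens, keep the cls token
    return output.shape[1]

# probe_in_channels(VisionTransformer())  # -> 192
```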