From e750fd4c72bec27bbb98f7784bb2b50a25da7a5d Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Fri, 7 Jul 2023 16:06:20 +0900 Subject: [PATCH 01/12] Initial commit --- src/otx/algorithms/visual_prompting/tasks/openvino.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/otx/algorithms/visual_prompting/tasks/openvino.py b/src/otx/algorithms/visual_prompting/tasks/openvino.py index e2d24c9d14a..363f426034c 100644 --- a/src/otx/algorithms/visual_prompting/tasks/openvino.py +++ b/src/otx/algorithms/visual_prompting/tasks/openvino.py @@ -335,4 +335,5 @@ def optimize( optimization_parameters: Optional[OptimizationParameters] = None, ): """Optimize function of OpenVINOVisualPromptingTask.""" + logger.info("Start PTQ optimization") raise NotImplementedError From bddcf67d14b4e0ffb469b4d1203e4736ba0b0ab8 Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Wed, 12 Jul 2023 14:32:23 +0900 Subject: [PATCH 02/12] Update block --- .../models/visual_prompters/segment_anything.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/models/visual_prompters/segment_anything.py b/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/models/visual_prompters/segment_anything.py index 3dbe568091f..2460df0aec3 100644 --- a/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/models/visual_prompters/segment_anything.py +++ b/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/models/visual_prompters/segment_anything.py @@ -174,9 +174,9 @@ def replace_state_dict_keys(state_dict, revise_keys): state_dict = replace_state_dict_keys(state_dict, revise_keys) self.load_state_dict(state_dict) - ################################################# - # forward for inference (export/deploy) # - ################################################# + ########################################################## + # forward for inference (export/deploy/optimize) # + ########################################################## @torch.no_grad() def forward( self, From c59398c81432d64ecad3c29d5c796680672bd0ca Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Wed, 12 Jul 2023 16:06:57 +0900 Subject: [PATCH 03/12] (WIP) otx optimize --- .../config/visual_prompting_config.py | 4 +- .../datasets/pipelines/sam_transforms.py | 9 +- .../configs/base/configuration.py | 21 ++- .../configs/sam_vit_b/configuration.yaml | 60 +------- .../visual_prompting/tasks/openvino.py | 143 +++++++++++++++++- src/otx/cli/tools/optimize.py | 8 +- 6 files changed, 171 insertions(+), 74 deletions(-) diff --git a/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/config/visual_prompting_config.py b/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/config/visual_prompting_config.py index ddd4d4dc070..e3382f25526 100644 --- a/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/config/visual_prompting_config.py +++ b/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/config/visual_prompting_config.py @@ -97,8 +97,8 @@ def update_visual_prompting_config( if groups: for group in groups: if group in ["learning_parameters", "nncf_optimization", "pot_parameters", "postprocessing"]: - if group in ["nncf_optimization", "pot_parameters"]: - # TODO (sungchul): Consider pot_parameters, nncf_optimization, and postprocessing + if group in ["nncf_optimization"]: + # TODO (sungchul): Consider nncf_optimization logger.warning(f"{group} will be implemented.") continue 
update_visual_prompting_config(visual_prompting_config, getattr(otx_config, group)) diff --git a/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py b/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py index 74e80f1b383..c3cae7d78a7 100644 --- a/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py +++ b/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py @@ -36,7 +36,8 @@ def __call__(self, item: Dict[str, Union[List, Tensor]]) -> Dict[str, Union[List Dict[str, Union[List, Tensor]]: Dictionary of batch data. """ item["images"] = torch.as_tensor( - self.apply_image(item["images"]).transpose((2, 0, 1)), dtype=torch.get_default_dtype() + self.apply_image(item["images"], self.target_length).transpose((2, 0, 1)), + dtype=torch.get_default_dtype() ) item["gt_masks"] = [torch.as_tensor(gt_mask) for gt_mask in item["gt_masks"]] item["bboxes"] = self.apply_boxes(item["bboxes"], item["original_size"]) @@ -44,16 +45,18 @@ def __call__(self, item: Dict[str, Union[List, Tensor]]) -> Dict[str, Union[List item["points"] = self.apply_coords(item["points"], item["original_size"]) return item - def apply_image(self, image: np.ndarray) -> np.ndarray: + @staticmethod + def apply_image(image: np.ndarray, target_length: int) -> np.ndarray: """Expects a numpy array with shape HxWxC in uint8 format. Args: image (np.ndarray): Image array. + target_length (int): The length of the longest side of the image. Returns: np.ndarray: Resized image. """ - target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) + target_size = ResizeLongestSide.get_preprocess_shape(image.shape[0], image.shape[1], target_length) return np.array(resize(to_pil_image(image), target_size)) def apply_coords(self, coords: np.ndarray, original_size: Union[List[Any], Tensor]) -> np.ndarray: diff --git a/src/otx/algorithms/visual_prompting/configs/base/configuration.py b/src/otx/algorithms/visual_prompting/configs/base/configuration.py index eeb174c4875..f89e7ac9896 100644 --- a/src/otx/algorithms/visual_prompting/configs/base/configuration.py +++ b/src/otx/algorithms/visual_prompting/configs/base/configuration.py @@ -15,15 +15,19 @@ # and limitations under the License. 
+from sys import maxsize + from attr import attrs -from otx.algorithms.common.configs import BaseConfig +from otx.algorithms.common.configs import BaseConfig, POTQuantizationPreset from otx.api.configuration.elements import ( ParameterGroup, add_parameter_group, + boolean_attribute, configurable_boolean, configurable_float, configurable_integer, + selectable, string_attribute, ) from otx.api.configuration.model_lifecycle import ModelLifecycle @@ -95,5 +99,20 @@ class __Postprocessing(ParameterGroup): affects_outcome_of=ModelLifecycle.INFERENCE, ) + @attrs + class __POTParameter(BaseConfig.BasePOTParameter): + header = string_attribute("POT Parameters") + description = header + visible_in_ui = boolean_attribute(False) + + preset = selectable( + default_value=POTQuantizationPreset.MIXED, + header="Preset", + description="Quantization preset that defines quantization scheme", + editable=True, + visible_in_ui=True, + ) + learning_parameters = add_parameter_group(__LearningParameters) postprocessing = add_parameter_group(__Postprocessing) + pot_parameters = add_parameter_group(__POTParameter) diff --git a/src/otx/algorithms/visual_prompting/configs/sam_vit_b/configuration.yaml b/src/otx/algorithms/visual_prompting/configs/sam_vit_b/configuration.yaml index e20429f60b2..8a867588912 100644 --- a/src/otx/algorithms/visual_prompting/configs/sam_vit_b/configuration.yaml +++ b/src/otx/algorithms/visual_prompting/configs/sam_vit_b/configuration.yaml @@ -85,62 +85,6 @@ learning_parameters: visible_in_ui: true warning: null auto_hpo_state: NOT_POSSIBLE -nncf_optimization: - description: Optimization by NNCF - enable_pruning: - affects_outcome_of: NONE - auto_hpo_state: not_possible - auto_hpo_value: null - default_value: false - description: Enable filter pruning algorithm - editable: true - header: Enable filter pruning algorithm - type: BOOLEAN - ui_rules: - action: DISABLE_EDITING - operator: AND - rules: [] - type: UI_RULES - value: false - visible_in_ui: true - warning: null - enable_quantization: - affects_outcome_of: NONE - auto_hpo_state: not_possible - auto_hpo_value: null - default_value: true - description: Enable quantization algorithm - editable: true - header: Enable quantization algorithm - type: BOOLEAN - ui_rules: - action: DISABLE_EDITING - operator: AND - rules: [] - type: UI_RULES - value: true - visible_in_ui: true - warning: null - header: Optimization by NNCF - pruning_supported: - affects_outcome_of: TRAINING - auto_hpo_state: not_possible - auto_hpo_value: null - default_value: false - description: Whether filter pruning is supported - editable: false - header: Whether filter pruning is supported - type: BOOLEAN - ui_rules: - action: DISABLE_EDITING - operator: AND - rules: [] - type: UI_RULES - value: false - visible_in_ui: false - warning: null - type: PARAMETER_GROUP - visible_in_ui: true pot_parameters: description: POT Parameters header: POT Parameters @@ -148,7 +92,7 @@ pot_parameters: affects_outcome_of: NONE auto_hpo_state: not_possible auto_hpo_value: null - default_value: Performance + default_value: Mixed description: Quantization preset that defines quantization scheme editable: true enum_name: POTQuantizationPreset @@ -162,7 +106,7 @@ pot_parameters: operator: AND rules: [] type: UI_RULES - value: Performance + value: Mixed visible_in_ui: true warning: null stat_subset_size: diff --git a/src/otx/algorithms/visual_prompting/tasks/openvino.py b/src/otx/algorithms/visual_prompting/tasks/openvino.py index 363f426034c..f0a0d9a4747 100644 --- 
a/src/otx/algorithms/visual_prompting/tasks/openvino.py +++ b/src/otx/algorithms/visual_prompting/tasks/openvino.py @@ -17,16 +17,22 @@ import io import json import os +import tempfile import time +import random from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union from zipfile import ZipFile import attr +import nncf import numpy as np +import openvino.runtime as ov +from nncf.common.quantization.structs import QuantizationPreset from openvino.model_api.adapters import create_core from openvino.model_api.models import Model +from otx.algorithms.common.utils.ir import check_if_quantized from otx.algorithms.common.utils.logger import get_logger from otx.algorithms.common.utils.utils import get_default_async_reqs_num from otx.algorithms.visual_prompting.adapters.openvino import model_wrappers @@ -37,6 +43,7 @@ OTXVisualPromptingDataset, get_transform, ) +from otx.algorithms.visual_prompting.adapters.pytorch_lightning.datasets.pipelines import ResizeLongestSide from otx.algorithms.visual_prompting.configs.base import VisualPromptingBaseConfig from otx.api.entities.annotation import Annotation from otx.api.entities.dataset_item import DatasetItemEntity @@ -46,12 +53,19 @@ default_progress_callback, ) from otx.api.entities.label_schema import LabelSchemaEntity -from otx.api.entities.model import ModelEntity +from otx.api.entities.model import ( + ModelEntity, + ModelFormat, + ModelOptimizationType, + ModelPrecision, + OptimizationMethod, +) from otx.api.entities.model_template import TaskType from otx.api.entities.optimization_parameters import OptimizationParameters from otx.api.entities.resultset import ResultSetEntity +from otx.api.entities.subset import Subset from otx.api.entities.task_environment import TaskEnvironment -from otx.api.serialization.label_mapper import LabelSchemaMapper +from otx.api.serialization.label_mapper import LabelSchemaMapper, label_schema_to_bytes from otx.api.usecases.evaluation.metrics_helper import MetricsHelper from otx.api.usecases.exportable_code import demo from otx.api.usecases.exportable_code.inference import BaseInferencer @@ -129,8 +143,7 @@ def pre_process(self, dataset_item: DatasetItemEntity) -> Dict[str, Any]: # typ images, meta = self.model["image_encoder"].preprocess(dataset_item.numpy) prompts = OTXVisualPromptingDataset.get_prompts(dataset_item, self.labels) # to be replaced prompts = self.model["decoder"].preprocess(prompts, meta) - items = {**images, **meta, "prompts": prompts} - return items + return images, meta, prompts def post_process( self, prediction: Dict[str, np.ndarray], metadata: Dict[str, Any] @@ -143,13 +156,13 @@ def post_process( def predict(self, dataset_item: DatasetItemEntity) -> List[Annotation]: # type: ignore """Perform a prediction for a given input image.""" # forward image encoder - items = self.pre_process(dataset_item) - image_embeddings = self.forward({"images": items["images"]}) + images, meta, prompts = self.pre_process(dataset_item) + image_embeddings = self.forward(images) annotations: List[Annotation] = [] hard_predictions: List[np.ndarray] = [] soft_predictions: List[np.ndarray] = [] - for prompt in items["prompts"]: + for prompt in prompts: label = prompt.pop("label") prompt.update(image_embeddings) @@ -178,6 +191,54 @@ def await_all(self) -> None: self.model["decoder"].await_all() +class OTXOpenVinoDataLoader: + """DataLoader implementation for VisualPromptingOpenVINOTask.""" + + def __init__(self, dataset: Any, inferencer: BaseInferencer, shuffle: bool = True, is_encoder: bool = 
True, output_model: Optional[ModelEntity] = None): + self.dataset = dataset + self.inferencer = inferencer + self.shuffler = None + if shuffle: + self.shuffler = list(range(len(dataset))) + random.shuffle(self.shuffler) + + self.is_encoder = is_encoder + self.target_length = self.inferencer.model["image_encoder"].orig_width + if not self.is_encoder: + core = ov.Core() + compressed_model = core.read_model( + output_model.get_data("visual_prompting_image_encoder.xml"), + output_model.get_data("visual_prompting_image_encoder.bin")) + self.compressed_model = core.compile_model( + model=compressed_model, + device_name=inferencer.model["image_encoder"].inference_adapter.device) + + def __getitem__(self, index: int): + """Get item from dataset.""" + if self.shuffler is not None: + index = self.shuffler[index] + + items = self.dataset[index] + images, _, prompts = self.inferencer.pre_process(items) + processed_image = ResizeLongestSide.apply_image(images["images"][0], self.target_length).transpose(2, 0, 1) + _, h, w = processed_image.shape + pad_width = ((0, 0), (0, self.target_length - h), (0, self.target_length - w)) + processed_image = np.pad(processed_image, pad_width, mode="constant", constant_values=0) + if self.is_encoder: + return {"images": processed_image[None]} + else: + image_embeddings = self.compressed_model(processed_image[None]) + prompt = prompts[0] # only use the first prompt + prompt.pop("label") + prompt.update({"image_embeddings": image_embeddings["image_embeddings"]}) + return prompt + # TODO (sungchul): change has_mask_input + + def __len__(self): + """Get length of dataset.""" + return len(self.dataset) + + class OpenVINOVisualPromptingTask(IInferenceTask, IEvaluationTask, IOptimizationTask, IDeploymentTask): """Task implementation for Visual Prompting using OpenVINO backend.""" @@ -336,4 +397,70 @@ def optimize( ): """Optimize function of OpenVINOVisualPromptingTask.""" logger.info("Start PTQ optimization") - raise NotImplementedError + if self.model is None: + raise RuntimeError("PTQ optimize failed, model is None") + + if optimization_type is not OptimizationType.POT: + raise ValueError("PTQ is the only supported optimization type for OpenVino models") + + dataset = dataset.get_subset(Subset.TRAINING) + + for i, (name, is_encoder) in enumerate( + zip(["image_encoder", "decoder"], [True, False]), 1 + ): + data_loader = OTXOpenVinoDataLoader(dataset, self.inferencer, is_encoder=is_encoder, output_model=output_model) + quantization_dataset = nncf.Dataset(data_loader, lambda data: data) + + with tempfile.TemporaryDirectory() as tempdir: + xml_path = os.path.join(tempdir, f"visual_prompting_{name}.xml") + bin_path = os.path.join(tempdir, f"visual_prompting_{name}.bin") + with open(xml_path, "wb") as f: + f.write(self.model.get_data(f"visual_prompting_{name}.xml")) + with open(bin_path, "wb") as f: + f.write(self.model.get_data(f"visual_prompting_{name}.bin")) + + ov_model = ov.Core().read_model(xml_path, bin_path) + if check_if_quantized(ov_model): + raise RuntimeError("Model is already optimized by PTQ") + + if optimization_parameters is not None: + optimization_parameters.update_progress(10 * i + 35 * (i - 1), None) + + stat_subset_size = self.hparams.pot_parameters.stat_subset_size + preset = QuantizationPreset(self.hparams.pot_parameters.preset.name.lower()) + from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters + advanced_parameters = AdvancedQuantizationParameters(backend_params={"use_pot": True}) + + compressed_model = nncf.quantize( + 
ov_model, quantization_dataset, subset_size=min(stat_subset_size, len(data_loader)), preset=preset, advanced_parameters=advanced_parameters + ) + + if optimization_parameters is not None: + optimization_parameters.update_progress(45 * i, None) + + with tempfile.TemporaryDirectory() as tempdir: + xml_path = os.path.join(tempdir, f"visual_prompting_{name}.xml") + bin_path = os.path.join(tempdir, f"visual_prompting_{name}.bin") + ov.serialize(compressed_model, xml_path) + with open(xml_path, "rb") as f: + output_model.set_data(f"visual_prompting_{name}.xml", f.read()) + with open(bin_path, "rb") as f: + output_model.set_data(f"visual_prompting_{name}.bin", f.read()) + + output_model.set_data( + "label_schema.json", + label_schema_to_bytes(self.task_environment.label_schema), + ) + + # set model attributes for quantized model + output_model.model_format = ModelFormat.OPENVINO + output_model.optimization_type = ModelOptimizationType.POT + output_model.optimization_methods = [OptimizationMethod.QUANTIZATION] + output_model.precision = [ModelPrecision.INT8] + + self.model = output_model + self.inferencer = self.load_inferencer() + + if optimization_parameters is not None: + optimization_parameters.update_progress(100, None) + logger.info("POT optimization completed") diff --git a/src/otx/cli/tools/optimize.py b/src/otx/cli/tools/optimize.py index eaa9cc2e7c3..df866c63009 100644 --- a/src/otx/cli/tools/optimize.py +++ b/src/otx/cli/tools/optimize.py @@ -19,6 +19,7 @@ from otx.api.entities.inference_parameters import InferenceParameters from otx.api.entities.model import ModelEntity +from otx.api.entities.model_template import TaskType from otx.api.entities.optimization_parameters import OptimizationParameters from otx.api.entities.resultset import ResultSetEntity from otx.api.entities.subset import Subset @@ -140,8 +141,11 @@ def main(): validation_dataset = dataset.get_subset(Subset.VALIDATION) predicted_validation_dataset = task.infer( - validation_dataset.with_empty_annotations(), - InferenceParameters(is_evaluation=True), + # temp (sungchul): remain annotation for visual prompting + validation_dataset + if getattr(task, "task_type", None) == TaskType.VISUAL_PROMPTING + else validation_dataset.with_empty_annotations(), + InferenceParameters(is_evaluation=False), ) resultset = ResultSetEntity( From dd8f2879afe1757508325efa3b12cae778bac101 Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Wed, 12 Jul 2023 17:37:53 +0900 Subject: [PATCH 04/12] Fix --- .../adapters/openvino/model_wrappers/openvino_models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/otx/algorithms/visual_prompting/adapters/openvino/model_wrappers/openvino_models.py b/src/otx/algorithms/visual_prompting/adapters/openvino/model_wrappers/openvino_models.py index 83f327a7eca..5d4ba5e8917 100644 --- a/src/otx/algorithms/visual_prompting/adapters/openvino/model_wrappers/openvino_models.py +++ b/src/otx/algorithms/visual_prompting/adapters/openvino/model_wrappers/openvino_models.py @@ -63,7 +63,6 @@ def __init__( preload: bool = False, ): super().__init__(model_adapter, configuration, preload) - self.output_blob_name = "low_res_masks" @classmethod def parameters(cls): # noqa: D102 @@ -71,6 +70,9 @@ def parameters(cls): # noqa: D102 parameters.update({"image_size": NumericalValue(value_type=int, default_value=1024, min=0, max=2048)}) return parameters + def _get_outputs(self): + return "low_res_masks" + def preprocess(self, inputs: Dict[str, Any], meta: Dict[str, Any]): """Preprocess prompts.""" 
processed_prompts = [] From 2aa149ae6b92c9544529d3d1f99264f689e20d1f Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Wed, 12 Jul 2023 21:55:33 +0900 Subject: [PATCH 05/12] WIP --- .../visual_prompters/segment_anything.py | 20 ++++++++++--------- .../visual_prompting/tasks/openvino.py | 15 ++++++++++---- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/models/visual_prompters/segment_anything.py b/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/models/visual_prompters/segment_anything.py index 2460df0aec3..efa3f792265 100644 --- a/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/models/visual_prompters/segment_anything.py +++ b/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/models/visual_prompters/segment_anything.py @@ -185,7 +185,7 @@ def forward( point_labels: Tensor, mask_input: Tensor, has_mask_input: Tensor, - orig_size: Tensor, + # orig_size: Tensor, ): """Forward method for SAM inference (export/deploy). @@ -227,16 +227,18 @@ def forward( if self.config.model.return_single_mask: masks, scores = self.select_masks(masks, scores, point_coords.shape[1]) - upscaled_masks = self.mask_postprocessing(masks, orig_size[0]) + return scores, masks + # TODO (sungchul): apply inner postprocessing + # upscaled_masks = self.mask_postprocessing(masks, orig_size[0]) - if self.config.model.return_extra_metrics: - stability_scores = self.calculate_stability_score( - upscaled_masks, self.config.model.mask_threshold, self.config.model.stability_score_offset - ) - areas = (upscaled_masks > self.config.model.mask_threshold).sum(-1).sum(-1) - return upscaled_masks, scores, stability_scores, areas, masks + # if self.config.model.return_extra_metrics: + # stability_scores = self.calculate_stability_score( + # upscaled_masks, self.config.model.mask_threshold, self.config.model.stability_score_offset + # ) + # areas = (upscaled_masks > self.config.model.mask_threshold).sum(-1).sum(-1) + # return upscaled_masks, scores, stability_scores, areas, masks - return upscaled_masks, scores, masks + # return upscaled_masks, scores, masks def _embed_points(self, point_coords: Tensor, point_labels: Tensor) -> Tensor: """Embed sparse input prompts. 
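With the in-graph mask_postprocessing call disabled above, the exported decoder now returns only IoU scores and low-resolution mask logits, so upscaling to the original image size has to happen on the host after inference. A minimal sketch of that step, assuming the standard SAM layout (256x256 low-res masks, a 1024-pixel padded input, and the same longest-side resize rule as ResizeLongestSide.get_preprocess_shape); the helper name postprocess_masks is illustrative and not part of this patch:

import torch
import torch.nn.functional as F

def postprocess_masks(low_res_masks: torch.Tensor, orig_size, image_size: int = 1024) -> torch.Tensor:
    # low_res_masks: (B, 1, 256, 256) logits; orig_size: (H, W) ints of the source image
    masks = F.interpolate(low_res_masks, (image_size, image_size), mode="bilinear", align_corners=False)
    # drop the zero padding that was added after the longest-side resize
    scale = image_size / max(orig_size)
    h, w = int(orig_size[0] * scale + 0.5), int(orig_size[1] * scale + 0.5)
    masks = masks[..., :h, :w]
    # resize back to the original resolution and binarize (mask_threshold assumed to be 0.0)
    masks = F.interpolate(masks, orig_size, mode="bilinear", align_corners=False)
    return masks > 0.0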
diff --git a/src/otx/algorithms/visual_prompting/tasks/openvino.py b/src/otx/algorithms/visual_prompting/tasks/openvino.py index f0a0d9a4747..4ea8349cc77 100644 --- a/src/otx/algorithms/visual_prompting/tasks/openvino.py +++ b/src/otx/algorithms/visual_prompting/tasks/openvino.py @@ -164,11 +164,12 @@ def predict(self, dataset_item: DatasetItemEntity) -> List[Annotation]: # type: soft_predictions: List[np.ndarray] = [] for prompt in prompts: label = prompt.pop("label") + orig_size = prompt.pop("orig_size") prompt.update(image_embeddings) # forward decoder to get predicted mask prediction = self.forward_decoder(prompt) - metadata = {"label": label, "original_size": prompt["orig_size"]} + metadata = {"label": label, "original_size": orig_size} # set annotation for eval annotation, hard_prediction, soft_prediction = self.post_process(prediction, metadata) @@ -230,6 +231,7 @@ def __getitem__(self, index: int): image_embeddings = self.compressed_model(processed_image[None]) prompt = prompts[0] # only use the first prompt prompt.pop("label") + prompt.pop("orig_size") prompt.update({"image_embeddings": image_embeddings["image_embeddings"]}) return prompt # TODO (sungchul): change has_mask_input @@ -408,6 +410,13 @@ def optimize( for i, (name, is_encoder) in enumerate( zip(["image_encoder", "decoder"], [True, False]), 1 ): + if name == "decoder": + # TODO (sungchul): quantize decoder, too + logger.info(f"{name} won't do PTQ.") + output_model.set_data(f"visual_prompting_{name}.xml", self.model.get_data(f"visual_prompting_{name}.xml")) + output_model.set_data(f"visual_prompting_{name}.bin", self.model.get_data(f"visual_prompting_{name}.bin")) + continue + data_loader = OTXOpenVinoDataLoader(dataset, self.inferencer, is_encoder=is_encoder, output_model=output_model) quantization_dataset = nncf.Dataset(data_loader, lambda data: data) @@ -428,11 +437,9 @@ def optimize( stat_subset_size = self.hparams.pot_parameters.stat_subset_size preset = QuantizationPreset(self.hparams.pot_parameters.preset.name.lower()) - from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters - advanced_parameters = AdvancedQuantizationParameters(backend_params={"use_pot": True}) compressed_model = nncf.quantize( - ov_model, quantization_dataset, subset_size=min(stat_subset_size, len(data_loader)), preset=preset, advanced_parameters=advanced_parameters + ov_model, quantization_dataset, subset_size=min(stat_subset_size, len(data_loader)), preset=preset ) if optimization_parameters is not None: From d94f33b673e8322700d8c58bb93dcd7e85f7fbd0 Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Thu, 13 Jul 2023 10:58:55 +0900 Subject: [PATCH 06/12] Update configs & exported outputs --- .../algorithms/visual_prompting/configs/configuration.yaml | 4 ++-- .../algorithms/visual_prompting/configs/sam_vit_b/config.yaml | 4 ++-- src/otx/algorithms/visual_prompting/tasks/inference.py | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/otx/algorithms/visual_prompting/configs/configuration.yaml b/src/otx/algorithms/visual_prompting/configs/configuration.yaml index e20429f60b2..1949d14f2a3 100644 --- a/src/otx/algorithms/visual_prompting/configs/configuration.yaml +++ b/src/otx/algorithms/visual_prompting/configs/configuration.yaml @@ -148,7 +148,7 @@ pot_parameters: affects_outcome_of: NONE auto_hpo_state: not_possible auto_hpo_value: null - default_value: Performance + default_value: Mixed description: Quantization preset that defines quantization scheme editable: true enum_name: 
POTQuantizationPreset @@ -162,7 +162,7 @@ pot_parameters: operator: AND rules: [] type: UI_RULES - value: Performance + value: Mixed visible_in_ui: true warning: null stat_subset_size: diff --git a/src/otx/algorithms/visual_prompting/configs/sam_vit_b/config.yaml b/src/otx/algorithms/visual_prompting/configs/sam_vit_b/config.yaml index 393cfa468a2..3738303c911 100644 --- a/src/otx/algorithms/visual_prompting/configs/sam_vit_b/config.yaml +++ b/src/otx/algorithms/visual_prompting/configs/sam_vit_b/config.yaml @@ -1,6 +1,6 @@ dataset: task: visual_prompting - train_batch_size: 2 + train_batch_size: 4 val_batch_size: 1 test_batch_size: 1 num_workers: 4 @@ -35,7 +35,7 @@ model: optimizer: name: Adam - lr: 0.0001 + lr: 0.000001 callback: checkpoint: # arguments for ModelCheckpoint diff --git a/src/otx/algorithms/visual_prompting/tasks/inference.py b/src/otx/algorithms/visual_prompting/tasks/inference.py index 6c93a05caa9..b84984e5fef 100644 --- a/src/otx/algorithms/visual_prompting/tasks/inference.py +++ b/src/otx/algorithms/visual_prompting/tasks/inference.py @@ -281,9 +281,8 @@ def _export_to_onnx(self, onnx_path: Dict[str, str]): "point_labels": torch.randint(low=0, high=4, size=(1, 2), dtype=torch.float), "mask_input": torch.randn(1, 1, *mask_input_size, dtype=torch.float), "has_mask_input": torch.tensor([[1]], dtype=torch.float), - "orig_size": torch.tensor([[height, width]], dtype=torch.float), } - output_names = ["masks", "iou_predictions", "low_res_masks"] + output_names = ["iou_predictions", "low_res_masks"] model_to_export = self.model with warnings.catch_warnings(): From 451f5d13f1cfab2f21019d2d2177b6027a887f93 Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Thu, 13 Jul 2023 11:41:46 +0900 Subject: [PATCH 07/12] Remove unused modules for torch --- .../datasets/pipelines/sam_transforms.py | 50 ------------------- 1 file changed, 50 deletions(-) diff --git a/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py b/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py index c3cae7d78a7..1170c4efebb 100644 --- a/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py +++ b/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py @@ -91,56 +91,6 @@ def apply_boxes(self, boxes: np.ndarray, original_size: Union[List[Any], Tensor] boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) return boxes.reshape(-1, 4) - def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: - """Expects batched images with shape BxCxHxW and float format. - - This transformation may not exactly match apply_image. - apply_image is the transformation expected by the model. - - Args: - image (torch.Tensor): Image tensor. - - Returns: - torch.Tensor: Resized image. - """ - # Expects an image in BCHW format. May not exactly match apply_image. - target_size = self.get_preprocess_shape(image.shape[2], image.shape[3], self.target_length) - return F.interpolate(image, target_size, mode="bilinear", align_corners=False, antialias=True) - - def apply_coords_torch(self, coords: torch.Tensor, original_size: Tuple[int, ...]) -> torch.Tensor: - """Expects a torch tensor with length 2 in the last dimension. - - Requires the original image size in (H, W) format. - - Args: - coords (torch.Tensor): Coordinates tensor. - original_size (Tuple[int, ...]): Original size of image. - - Returns: - torch.Tensor: Resized coordinates. 
- """ - old_h, old_w = original_size - new_h, new_w = self.get_preprocess_shape(original_size[0], original_size[1], self.target_length) - coords = deepcopy(coords).to(torch.float) - coords[..., 0] = coords[..., 0] * (new_w / old_w) - coords[..., 1] = coords[..., 1] * (new_h / old_h) - return coords - - def apply_boxes_torch(self, boxes: torch.Tensor, original_size: Tuple[int, ...]) -> torch.Tensor: - """Expects a torch tensor with shape Bx4. - - Requires the original image size in (H, W) format. - - Args: - boxes (torch.Tensor): Boxes tensor. - original_size (Tuple[int, ...]): Original size of image. - - Returns: - torch.Tensor: Resized boxes. - """ - boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) - return boxes.reshape(-1, 4) - @staticmethod def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: """Compute the output size given input size and target long side length. From a06dd5c38c4f0ebfc8636b215298a33c975d24cf Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Thu, 13 Jul 2023 13:37:29 +0900 Subject: [PATCH 08/12] Add unit tests --- .../model_wrappers/test_openvino_models.py | 7 ++ .../config/test_visual_prompting_config.py | 10 +- .../datasets/pipelines/test_sam_transforms.py | 95 ++++++++++--------- .../visual_prompting/tasks/test_openvino.py | 95 +++++++++++++++++-- 4 files changed, 151 insertions(+), 56 deletions(-) diff --git a/tests/unit/algorithms/visual_prompting/adapters/openvino/model_wrappers/test_openvino_models.py b/tests/unit/algorithms/visual_prompting/adapters/openvino/model_wrappers/test_openvino_models.py index efdf2c0b495..437e4f9d326 100644 --- a/tests/unit/algorithms/visual_prompting/adapters/openvino/model_wrappers/test_openvino_models.py +++ b/tests/unit/algorithms/visual_prompting/adapters/openvino/model_wrappers/test_openvino_models.py @@ -64,6 +64,13 @@ def test_parameters(self): assert isinstance(params.get("image_size"), NumericalValue) assert params.get("image_size").default_value == 1024 + @e2e_pytest_unit + def test_get_outputs(self): + """Test _get_outputs.""" + results = self.decoder._get_outputs() + + assert "low_res_masks" == results + @e2e_pytest_unit def test_preprocess(self): """Test preprocess""" diff --git a/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/config/test_visual_prompting_config.py b/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/config/test_visual_prompting_config.py index d5d57119ca2..e7ae231d157 100644 --- a/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/config/test_visual_prompting_config.py +++ b/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/config/test_visual_prompting_config.py @@ -1,3 +1,4 @@ + """Tests the methods in config.""" # Copyright (C) 2023 Intel Corporation @@ -61,13 +62,18 @@ def test_update_visual_prompting_config(): """Test update_visual_prompting_config.""" otx_config = OmegaConf.create( { - "groups": ["learning_parameters"], + "groups": ["learning_parameters", "pot_parameters", "postprocessing"], "learning_parameters": {"parameters": ["param1"], "param1": "updated_value1"}, + "pot_parameters": {"parameters": ["param2"], "param2": "updated_value2"}, + "postprocessing": {"parameters": ["param3"], "param3": "updated_value3"}, "parameters": [], } ) - visual_prompting_config = OmegaConf.create({"param1": "value1", "param2": "value2"}) + visual_prompting_config = OmegaConf.create({"param1": "value1", "param2": "value2", "param3": "value3", "param4": "value4"}) 
update_visual_prompting_config(visual_prompting_config, otx_config) assert visual_prompting_config["param1"] == "updated_value1" + assert visual_prompting_config["param2"] == "updated_value2" + assert visual_prompting_config["param3"] == "updated_value3" + assert visual_prompting_config["param4"] == "value4" diff --git a/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/test_sam_transforms.py b/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/test_sam_transforms.py index c79be668f22..82355933ed5 100644 --- a/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/test_sam_transforms.py +++ b/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/test_sam_transforms.py @@ -5,7 +5,8 @@ # import numpy as np -import torch +from typing import Tuple +import pytest from otx.algorithms.visual_prompting.adapters.pytorch_lightning.datasets.pipelines.sam_transforms import ( ResizeLongestSide, ) @@ -14,60 +15,62 @@ class TestResizeLongestSide: - @e2e_pytest_unit - def test_apply_boxes(self): - """Test apply_boxes.""" - resize_longest_side = ResizeLongestSide(100) - boxes = np.array([[10, 20, 30, 40], [50, 60, 70, 80]]) - original_size = (200, 200) - expected_result = np.array([[5, 10, 15, 20], [25, 30, 35, 40]]) - - result = resize_longest_side.apply_boxes(boxes, original_size) - - assert np.array_equal(result, expected_result) + @pytest.fixture(autouse=True) + def setup(self): + self.resize_longest_side = ResizeLongestSide(8) @e2e_pytest_unit - def test_apply_image_torch(self): - """Test apply_image_torch.""" - resize_longest_side = ResizeLongestSide(100) - image = torch.zeros((1, 3, 200, 300), dtype=torch.float32) - expected_result_shape = (1, 3, 67, 100) - - result = resize_longest_side.apply_image_torch(image) - - assert result.shape == expected_result_shape + def test_call(self): + """Test __call__.""" @e2e_pytest_unit - def test_apply_coords_torch(self): - """Test apply_coords_torch.""" - resize_longest_side = ResizeLongestSide(100) - coords = torch.Tensor([[50, 50], [100, 100]]) - original_size = (200, 200) - expected_result = torch.Tensor([[25, 25], [50, 50]]) - - result = resize_longest_side.apply_coords_torch(coords, original_size) - - assert torch.allclose(result, expected_result) + @pytest.mark.parametrize("image,expected", + [ + (np.zeros((2, 4, 3), dtype=np.uint8), (4, 8, 3)), + (np.zeros((12, 16, 3), dtype=np.uint8), (6, 8, 3)), + ] + ) + def test_apply_image(self, image: np.ndarray, expected: Tuple[int, int, int]): + """Test apply_image.""" + results = self.resize_longest_side.apply_image(image, self.resize_longest_side.target_length) + + assert results.shape == expected @e2e_pytest_unit - def test_apply_boxes_torch(self): - """Test apply_boxes_torch.""" - resize_longest_side = ResizeLongestSide(100) - boxes = torch.Tensor([[10, 20, 30, 40], [50, 60, 70, 80]]) - original_size = (200, 200) - expected_result = torch.Tensor([[5, 10, 15, 20], [25, 30, 35, 40]]) + @pytest.mark.parametrize("coords,original_size,expected", + [ + (np.array([[1, 1], [2, 2]]), (4, 4), np.array([[2, 2], [4, 4]])), + (np.array([[4, 4], [8, 8]]), (16, 16), np.array([[2, 2], [4, 4]])), + ] + ) + def test_apply_coords(self, coords: np.ndarray, original_size: Tuple[int, int], expected: np.ndarray): + """Test apply_coords.""" + result = self.resize_longest_side.apply_coords(coords, original_size) + + assert np.array_equal(result, expected) - result = 
resize_longest_side.apply_boxes_torch(boxes, original_size) + @e2e_pytest_unit + @pytest.mark.parametrize("boxes,original_size,expected", + [ + (np.array([[1, 1, 2, 2], [2, 2, 3, 3]]), (4, 4), np.array([[2, 2, 4, 4], [4, 4, 6, 6]])), + (np.array([[4, 4, 8, 8], [8, 8, 12, 12]]), (16, 16), np.array([[2, 2, 4, 4], [4, 4, 6, 6]])), + ] + ) + def test_apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, int], expected: np.ndarray): + """Test apply_boxes.""" + result = self.resize_longest_side.apply_boxes(boxes, original_size) - assert torch.allclose(result, expected_result) + assert np.array_equal(result, expected) @e2e_pytest_unit - def test_get_preprocess_shape(self): + @pytest.mark.parametrize("oldh,oldw,expected", + [ + (3, 4, (6, 8)), + (12, 16, (6, 8)), + ] + ) + def test_get_preprocess_shape(self, oldh: int, oldw: int, expected: Tuple[int, int]): """Test get_preprocess_shape.""" - resize_longest_side = ResizeLongestSide(100) - oldh, oldw = 200, 300 - expected_result = (67, 100) - - result = resize_longest_side.get_preprocess_shape(oldh, oldw, resize_longest_side.target_length) + result = self.resize_longest_side.get_preprocess_shape(oldh, oldw, self.resize_longest_side.target_length) - assert result == expected_result + assert result == expected diff --git a/tests/unit/algorithms/visual_prompting/tasks/test_openvino.py b/tests/unit/algorithms/visual_prompting/tasks/test_openvino.py index d7e97499649..17588177bd2 100644 --- a/tests/unit/algorithms/visual_prompting/tasks/test_openvino.py +++ b/tests/unit/algorithms/visual_prompting/tasks/test_openvino.py @@ -5,11 +5,15 @@ # from copy import deepcopy +from typing import Optional import numpy as np +import pathlib import pytest +from otx.api.usecases.tasks.interfaces.optimization_interface import OptimizationType import torch from openvino.model_api.models import Model +from otx.api.entities.subset import Subset from otx.algorithms.visual_prompting.adapters.pytorch_lightning.datasets.dataset import ( OTXVisualPromptingDataset, @@ -18,6 +22,7 @@ from otx.algorithms.visual_prompting.tasks.openvino import ( OpenVINOVisualPromptingInferencer, OpenVINOVisualPromptingTask, + OTXOpenVinoDataLoader ) from otx.api.configuration.configurable_parameters import ConfigurableParameters from otx.api.entities.annotation import Annotation @@ -86,7 +91,7 @@ def test_pre_process(self, mocker): returned_value = self.visual_prompting_ov_inferencer.pre_process(fake_input) - assert isinstance(returned_value, dict) + assert isinstance(returned_value, tuple) mocker_get_prompts.assert_called_once() @e2e_pytest_unit @@ -112,10 +117,10 @@ def test_predict(self, mocker): mocker_pre_process = mocker.patch.object( OpenVINOVisualPromptingInferencer, "pre_process", - return_value={ - "index": 0, - "images": torch.rand((1, 3, 2, 2)), - "prompts": [ + return_value=( + torch.zeros((1, 3, 2, 2)), + {}, + [ { "point_coords": [np.array([[[1, 1], [2, 2]]])], "point_labels": [1, 2], @@ -123,7 +128,7 @@ def test_predict(self, mocker): "orig_size": (4, 4), } ], - }, + ), ) mocker_forward = mocker.patch.object( OpenVINOVisualPromptingInferencer, "forward", return_value={"image_embeddings": np.empty((4, 2, 2))} @@ -165,6 +170,48 @@ def test_forward_decoder(self): assert returned_value == fake_output +class TestOTXOpenVinoDataLoader: + @pytest.fixture + def load_dataloader(self, mocker): + def _load_dataloader(is_encoder: bool = True, output_model: Optional[ModelEntity] = None): + dataset = generate_visual_prompting_dataset() + dataset = dataset.get_subset(Subset.TRAINING) + 
return OTXOpenVinoDataLoader(dataset, self.mocker_inferencer, is_encoder=is_encoder, output_model=output_model) + return _load_dataloader + + @pytest.fixture(autouse=True) + def setup(self, mocker): + self.mocker_read_model = mocker.patch("otx.algorithms.visual_prompting.tasks.openvino.ov.Core.read_model") + self.mocker_compile_model = mocker.patch("otx.algorithms.visual_prompting.tasks.openvino.ov.Core.compile_model") + self.mocker_inferencer = mocker.patch.object(OpenVINOVisualPromptingInferencer, "__init__") + + @e2e_pytest_unit + @pytest.mark.parametrize("is_encoder", [True, False]) + def test_getitem(self, mocker, load_dataloader, is_encoder: bool): + """Test __getitem__.""" + mocker_output_model = mocker.patch("otx.api.entities.model.ModelEntity") + if not is_encoder: + mocker.patch.object(mocker_output_model, "get_data") + self.mocker_read_model.reset_mock() + self.mocker_compile_model.reset_mock() + + dataloader = load_dataloader(is_encoder, mocker_output_model) + + setattr(dataloader, "target_length", 8) + mocker.patch.object(dataloader.inferencer, "pre_process", return_value=({"images": np.zeros((1, 4, 3, 3), dtype=np.uint8)}, None, [{"label": 1, "orig_size": 1}])) + + results = dataloader.__getitem__(0) + + if is_encoder: + assert results['images'].shape == (1, 3, 8, 8) + else: + self.mocker_read_model.assert_called_once() + self.mocker_compile_model.assert_called_once() + assert "label" not in results + assert "orig_size" not in results + assert "image_embeddings" in results + + class TestOpenVINOVisualPromptingTask: @pytest.fixture def otx_model(self): @@ -240,11 +287,43 @@ def test_evaluate(self, mocker): @e2e_pytest_unit def test_deploy(self): + """Test deploy.""" output_model = deepcopy(self.task_environment.model) - self.visual_prompting_ov_task.model.set_data("visual_prompting_image_encoder.bin", b"image_encoder_bin") self.visual_prompting_ov_task.model.set_data("visual_prompting_image_encoder.xml", b"image_encoder_xml") + self.visual_prompting_ov_task.model.set_data("visual_prompting_image_encoder.bin", b"image_encoder_bin") + self.visual_prompting_ov_task.model.set_data("visual_prompting_decoder.xml", b"decoder_xml") self.visual_prompting_ov_task.model.set_data("visual_prompting_decoder.bin", b"decoder_bin") - self.visual_prompting_ov_task.model.set_data("visual_prompting_decoder.xml", b"deocder_xml") + self.visual_prompting_ov_task.deploy(output_model) assert output_model.exportable_code is not None + + @e2e_pytest_unit + def test_optimize(self, mocker): + """Test optimize.""" + def patch_save_model(model, output_xml): + with open(output_xml, "wb") as f: + f.write(b"compressed_image_encoder_xml") + bin_path = pathlib.Path(output_xml).parent / pathlib.Path(str(pathlib.Path(output_xml).stem) + ".bin") + with open(bin_path, "wb") as f: + f.write(b"compressed_image_encoder_bin") + + dataset = generate_visual_prompting_dataset() + output_model = deepcopy(self.task_environment.model) + self.visual_prompting_ov_task.model.set_data("visual_prompting_image_encoder.xml", b"image_encoder_xml") + self.visual_prompting_ov_task.model.set_data("visual_prompting_image_encoder.bin", b"image_encoder_bin") + self.visual_prompting_ov_task.model.set_data("visual_prompting_decoder.xml", b"decoder_xml") + self.visual_prompting_ov_task.model.set_data("visual_prompting_decoder.bin", b"decoder_bin") + mocker.patch("otx.algorithms.visual_prompting.tasks.openvino.ov.Core.read_model", autospec=True) + mocker.patch("otx.algorithms.visual_prompting.tasks.openvino.ov.serialize", new=patch_save_model) 
+ fake_quantize = mocker.patch("otx.algorithms.visual_prompting.tasks.openvino.nncf.quantize", autospec=True) + + self.visual_prompting_ov_task.optimize(OptimizationType.POT, dataset=dataset, output_model=output_model) + + fake_quantize.assert_called_once() + # check if only image encoder was compressed + assert self.visual_prompting_ov_task.model.get_data("visual_prompting_image_encoder.xml") == b"compressed_image_encoder_xml" + assert self.visual_prompting_ov_task.model.get_data("visual_prompting_image_encoder.bin") == b"compressed_image_encoder_bin" + assert self.visual_prompting_ov_task.model.get_data("visual_prompting_decoder.xml") == b"decoder_xml" + assert self.visual_prompting_ov_task.model.get_data("visual_prompting_decoder.bin") == b"decoder_bin" + From ff80cd4b9634a40a9f962a5a2313d92bf557e51b Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Thu, 13 Jul 2023 13:53:36 +0900 Subject: [PATCH 09/12] pre-commit --- .../datasets/pipelines/sam_transforms.py | 4 +- .../configs/base/configuration.py | 2 - .../visual_prompting/tasks/openvino.py | 40 +++++++++++++------ .../config/test_visual_prompting_config.py | 5 ++- .../datasets/pipelines/test_sam_transforms.py | 20 ++++++---- .../visual_prompting/tasks/test_openvino.py | 31 +++++++++----- 6 files changed, 65 insertions(+), 37 deletions(-) diff --git a/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py b/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py index 1170c4efebb..f09275d4a21 100644 --- a/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py +++ b/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py @@ -10,7 +10,6 @@ import numpy as np import torch from torch import Tensor -from torch.nn import functional as F from torchvision.transforms.functional import resize, to_pil_image # type: ignore @@ -36,8 +35,7 @@ def __call__(self, item: Dict[str, Union[List, Tensor]]) -> Dict[str, Union[List Dict[str, Union[List, Tensor]]: Dictionary of batch data. """ item["images"] = torch.as_tensor( - self.apply_image(item["images"], self.target_length).transpose((2, 0, 1)), - dtype=torch.get_default_dtype() + self.apply_image(item["images"], self.target_length).transpose((2, 0, 1)), dtype=torch.get_default_dtype() ) item["gt_masks"] = [torch.as_tensor(gt_mask) for gt_mask in item["gt_masks"]] item["bboxes"] = self.apply_boxes(item["bboxes"], item["original_size"]) diff --git a/src/otx/algorithms/visual_prompting/configs/base/configuration.py b/src/otx/algorithms/visual_prompting/configs/base/configuration.py index f89e7ac9896..63dc1e726a2 100644 --- a/src/otx/algorithms/visual_prompting/configs/base/configuration.py +++ b/src/otx/algorithms/visual_prompting/configs/base/configuration.py @@ -15,8 +15,6 @@ # and limitations under the License. 
-from sys import maxsize - from attr import attrs from otx.algorithms.common.configs import BaseConfig, POTQuantizationPreset diff --git a/src/otx/algorithms/visual_prompting/tasks/openvino.py b/src/otx/algorithms/visual_prompting/tasks/openvino.py index 4ea8349cc77..7d5ad6fcd2e 100644 --- a/src/otx/algorithms/visual_prompting/tasks/openvino.py +++ b/src/otx/algorithms/visual_prompting/tasks/openvino.py @@ -17,9 +17,9 @@ import io import json import os +import random import tempfile import time -import random from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union from zipfile import ZipFile @@ -138,12 +138,14 @@ def __init__( self.labels = label_schema.get_labels(include_empty=False) self.transform = get_transform() # TODO (sungchul): insert args - def pre_process(self, dataset_item: DatasetItemEntity) -> Dict[str, Any]: # type: ignore + def pre_process( # type: ignore + self, dataset_item: DatasetItemEntity + ) -> Tuple[Dict[str, Any], Dict[str, Any], List[Dict[str, Any]]]: """Pre-process function of OpenVINO Visual Prompting Inferencer for image encoder.""" images, meta = self.model["image_encoder"].preprocess(dataset_item.numpy) prompts = OTXVisualPromptingDataset.get_prompts(dataset_item, self.labels) # to be replaced prompts = self.model["decoder"].preprocess(prompts, meta) - return images, meta, prompts + return images, meta, prompts # type: ignore def post_process( self, prediction: Dict[str, np.ndarray], metadata: Dict[str, Any] @@ -195,7 +197,14 @@ def await_all(self) -> None: class OTXOpenVinoDataLoader: """DataLoader implementation for VisualPromptingOpenVINOTask.""" - def __init__(self, dataset: Any, inferencer: BaseInferencer, shuffle: bool = True, is_encoder: bool = True, output_model: Optional[ModelEntity] = None): + def __init__( + self, + dataset: Any, + inferencer: OpenVINOVisualPromptingInferencer, + shuffle: bool = True, + is_encoder: bool = True, + output_model: Optional[ModelEntity] = None, + ): self.dataset = dataset self.inferencer = inferencer self.shuffler = None @@ -209,10 +218,11 @@ def __init__(self, dataset: Any, inferencer: BaseInferencer, shuffle: bool = Tru core = ov.Core() compressed_model = core.read_model( output_model.get_data("visual_prompting_image_encoder.xml"), - output_model.get_data("visual_prompting_image_encoder.bin")) + output_model.get_data("visual_prompting_image_encoder.bin"), + ) self.compressed_model = core.compile_model( - model=compressed_model, - device_name=inferencer.model["image_encoder"].inference_adapter.device) + model=compressed_model, device_name=inferencer.model["image_encoder"].inference_adapter.device + ) def __getitem__(self, index: int): """Get item from dataset.""" @@ -407,17 +417,21 @@ def optimize( dataset = dataset.get_subset(Subset.TRAINING) - for i, (name, is_encoder) in enumerate( - zip(["image_encoder", "decoder"], [True, False]), 1 - ): + for i, (name, is_encoder) in enumerate(zip(["image_encoder", "decoder"], [True, False]), 1): if name == "decoder": # TODO (sungchul): quantize decoder, too logger.info(f"{name} won't do PTQ.") - output_model.set_data(f"visual_prompting_{name}.xml", self.model.get_data(f"visual_prompting_{name}.xml")) - output_model.set_data(f"visual_prompting_{name}.bin", self.model.get_data(f"visual_prompting_{name}.bin")) + output_model.set_data( + f"visual_prompting_{name}.xml", self.model.get_data(f"visual_prompting_{name}.xml") + ) + output_model.set_data( + f"visual_prompting_{name}.bin", self.model.get_data(f"visual_prompting_{name}.bin") + ) continue - 
data_loader = OTXOpenVinoDataLoader(dataset, self.inferencer, is_encoder=is_encoder, output_model=output_model) + data_loader = OTXOpenVinoDataLoader( + dataset, self.inferencer, is_encoder=is_encoder, output_model=output_model + ) quantization_dataset = nncf.Dataset(data_loader, lambda data: data) with tempfile.TemporaryDirectory() as tempdir: diff --git a/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/config/test_visual_prompting_config.py b/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/config/test_visual_prompting_config.py index e7ae231d157..c61e6b46589 100644 --- a/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/config/test_visual_prompting_config.py +++ b/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/config/test_visual_prompting_config.py @@ -1,4 +1,3 @@ - """Tests the methods in config.""" # Copyright (C) 2023 Intel Corporation @@ -69,7 +68,9 @@ def test_update_visual_prompting_config(): "parameters": [], } ) - visual_prompting_config = OmegaConf.create({"param1": "value1", "param2": "value2", "param3": "value3", "param4": "value4"}) + visual_prompting_config = OmegaConf.create( + {"param1": "value1", "param2": "value2", "param3": "value3", "param4": "value4"} + ) update_visual_prompting_config(visual_prompting_config, otx_config) diff --git a/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/test_sam_transforms.py b/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/test_sam_transforms.py index 82355933ed5..35c00c0198b 100644 --- a/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/test_sam_transforms.py +++ b/tests/unit/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/test_sam_transforms.py @@ -24,11 +24,12 @@ def test_call(self): """Test __call__.""" @e2e_pytest_unit - @pytest.mark.parametrize("image,expected", + @pytest.mark.parametrize( + "image,expected", [ (np.zeros((2, 4, 3), dtype=np.uint8), (4, 8, 3)), (np.zeros((12, 16, 3), dtype=np.uint8), (6, 8, 3)), - ] + ], ) def test_apply_image(self, image: np.ndarray, expected: Tuple[int, int, int]): """Test apply_image.""" @@ -37,11 +38,12 @@ def test_apply_image(self, image: np.ndarray, expected: Tuple[int, int, int]): assert results.shape == expected @e2e_pytest_unit - @pytest.mark.parametrize("coords,original_size,expected", + @pytest.mark.parametrize( + "coords,original_size,expected", [ (np.array([[1, 1], [2, 2]]), (4, 4), np.array([[2, 2], [4, 4]])), (np.array([[4, 4], [8, 8]]), (16, 16), np.array([[2, 2], [4, 4]])), - ] + ], ) def test_apply_coords(self, coords: np.ndarray, original_size: Tuple[int, int], expected: np.ndarray): """Test apply_coords.""" @@ -50,11 +52,12 @@ def test_apply_coords(self, coords: np.ndarray, original_size: Tuple[int, int], assert np.array_equal(result, expected) @e2e_pytest_unit - @pytest.mark.parametrize("boxes,original_size,expected", + @pytest.mark.parametrize( + "boxes,original_size,expected", [ (np.array([[1, 1, 2, 2], [2, 2, 3, 3]]), (4, 4), np.array([[2, 2, 4, 4], [4, 4, 6, 6]])), (np.array([[4, 4, 8, 8], [8, 8, 12, 12]]), (16, 16), np.array([[2, 2, 4, 4], [4, 4, 6, 6]])), - ] + ], ) def test_apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, int], expected: np.ndarray): """Test apply_boxes.""" @@ -63,11 +66,12 @@ def test_apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, int], ex assert np.array_equal(result, expected) @e2e_pytest_unit - 
@pytest.mark.parametrize("oldh,oldw,expected", + @pytest.mark.parametrize( + "oldh,oldw,expected", [ (3, 4, (6, 8)), (12, 16, (6, 8)), - ] + ], ) def test_get_preprocess_shape(self, oldh: int, oldw: int, expected: Tuple[int, int]): """Test get_preprocess_shape.""" diff --git a/tests/unit/algorithms/visual_prompting/tasks/test_openvino.py b/tests/unit/algorithms/visual_prompting/tasks/test_openvino.py index 17588177bd2..f44e25e1b1c 100644 --- a/tests/unit/algorithms/visual_prompting/tasks/test_openvino.py +++ b/tests/unit/algorithms/visual_prompting/tasks/test_openvino.py @@ -22,7 +22,7 @@ from otx.algorithms.visual_prompting.tasks.openvino import ( OpenVINOVisualPromptingInferencer, OpenVINOVisualPromptingTask, - OTXOpenVinoDataLoader + OTXOpenVinoDataLoader, ) from otx.api.configuration.configurable_parameters import ConfigurableParameters from otx.api.entities.annotation import Annotation @@ -176,7 +176,10 @@ def load_dataloader(self, mocker): def _load_dataloader(is_encoder: bool = True, output_model: Optional[ModelEntity] = None): dataset = generate_visual_prompting_dataset() dataset = dataset.get_subset(Subset.TRAINING) - return OTXOpenVinoDataLoader(dataset, self.mocker_inferencer, is_encoder=is_encoder, output_model=output_model) + return OTXOpenVinoDataLoader( + dataset, self.mocker_inferencer, is_encoder=is_encoder, output_model=output_model + ) + return _load_dataloader @pytest.fixture(autouse=True) @@ -198,12 +201,16 @@ def test_getitem(self, mocker, load_dataloader, is_encoder: bool): dataloader = load_dataloader(is_encoder, mocker_output_model) setattr(dataloader, "target_length", 8) - mocker.patch.object(dataloader.inferencer, "pre_process", return_value=({"images": np.zeros((1, 4, 3, 3), dtype=np.uint8)}, None, [{"label": 1, "orig_size": 1}])) - + mocker.patch.object( + dataloader.inferencer, + "pre_process", + return_value=({"images": np.zeros((1, 4, 3, 3), dtype=np.uint8)}, None, [{"label": 1, "orig_size": 1}]), + ) + results = dataloader.__getitem__(0) - + if is_encoder: - assert results['images'].shape == (1, 3, 8, 8) + assert results["images"].shape == (1, 3, 8, 8) else: self.mocker_read_model.assert_called_once() self.mocker_compile_model.assert_called_once() @@ -301,6 +308,7 @@ def test_deploy(self): @e2e_pytest_unit def test_optimize(self, mocker): """Test optimize.""" + def patch_save_model(model, output_xml): with open(output_xml, "wb") as f: f.write(b"compressed_image_encoder_xml") @@ -322,8 +330,13 @@ def patch_save_model(model, output_xml): fake_quantize.assert_called_once() # check if only image encoder was compressed - assert self.visual_prompting_ov_task.model.get_data("visual_prompting_image_encoder.xml") == b"compressed_image_encoder_xml" - assert self.visual_prompting_ov_task.model.get_data("visual_prompting_image_encoder.bin") == b"compressed_image_encoder_bin" + assert ( + self.visual_prompting_ov_task.model.get_data("visual_prompting_image_encoder.xml") + == b"compressed_image_encoder_xml" + ) + assert ( + self.visual_prompting_ov_task.model.get_data("visual_prompting_image_encoder.bin") + == b"compressed_image_encoder_bin" + ) assert self.visual_prompting_ov_task.model.get_data("visual_prompting_decoder.xml") == b"decoder_xml" assert self.visual_prompting_ov_task.model.get_data("visual_prompting_decoder.bin") == b"decoder_bin" - From 40ce5976780091b61cb77732df771d865406bf18 Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Thu, 13 Jul 2023 14:16:34 +0900 Subject: [PATCH 10/12] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) 
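The image-encoder quantization exercised by the test above reduces to the standard NNCF post-training flow implemented in optimize(). A condensed, self-contained sketch for reference, where data_loader stands for the OTXOpenVinoDataLoader defined earlier (or any iterable yielding {"images": ...} input dicts) and the file paths are placeholders:

import nncf
import openvino.runtime as ov
from nncf.common.quantization.structs import QuantizationPreset

core = ov.Core()
fp_model = core.read_model("visual_prompting_image_encoder.xml")  # FP IR; weights picked up from the .bin next to it
calibration = nncf.Dataset(data_loader, lambda item: item)        # identity transform, as in the task code
quantized = nncf.quantize(
    fp_model,
    calibration,
    subset_size=min(300, len(data_loader)),  # 300 mirrors the default stat_subset_size
    preset=QuantizationPreset.MIXED,
)
ov.serialize(quantized, "visual_prompting_image_encoder_int8.xml")  # writes the quantized .xml/.bin pair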
diff --git a/CHANGELOG.md b/CHANGELOG.md index fcb7d47864d..7bbe22f40e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ All notable changes to this project will be documented in this file. - Add new visual prompting task: train/eval (https://github.com/openvinotoolkit/training_extensions/pull/2203) - Add new visual prompting task: export (https://github.com/openvinotoolkit/training_extensions/pull/2274) - Add new visual prompting task: deploy (https://github.com/openvinotoolkit/training_extensions/pull/2311) +- Add new visual prompting task: optimize (PTQ) (https://github.com/openvinotoolkit/training_extensions/pull/2318) - Add new object detector ResNeXt101-ATSS () ### Enhancements From 60ae7ebe6044ba41d040d7692e02cf0e052ef46f Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Thu, 13 Jul 2023 15:45:02 +0900 Subject: [PATCH 11/12] Update from staticmethod to classmethod --- .../pytorch_lightning/datasets/pipelines/sam_transforms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py b/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py index f09275d4a21..aeb0cc98baf 100644 --- a/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py +++ b/src/otx/algorithms/visual_prompting/adapters/pytorch_lightning/datasets/pipelines/sam_transforms.py @@ -43,8 +43,8 @@ def __call__(self, item: Dict[str, Union[List, Tensor]]) -> Dict[str, Union[List item["points"] = self.apply_coords(item["points"], item["original_size"]) return item - @staticmethod - def apply_image(image: np.ndarray, target_length: int) -> np.ndarray: + @classmethod + def apply_image(cls, image: np.ndarray, target_length: int) -> np.ndarray: """Expects a numpy array with shape HxWxC in uint8 format. Args: @@ -54,7 +54,7 @@ def apply_image(image: np.ndarray, target_length: int) -> np.ndarray: Returns: np.ndarray: Resized image. 
""" - target_size = ResizeLongestSide.get_preprocess_shape(image.shape[0], image.shape[1], target_length) + target_size = cls.get_preprocess_shape(image.shape[0], image.shape[1], target_length) return np.array(resize(to_pil_image(image), target_size)) def apply_coords(self, coords: np.ndarray, original_size: Union[List[Any], Tensor]) -> np.ndarray: From f8b307ea861caf93312dff1c1df5aa4db5225110 Mon Sep 17 00:00:00 2001 From: "Kim, Sungchul" Date: Thu, 13 Jul 2023 18:29:09 +0900 Subject: [PATCH 12/12] Move `ResizeLongestSide` into preprocess --- .../model_wrappers/openvino_models.py | 10 +++++++- .../visual_prompting/tasks/openvino.py | 23 ++++++++++--------- .../visual_prompting/tasks/test_openvino.py | 2 +- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/otx/algorithms/visual_prompting/adapters/openvino/model_wrappers/openvino_models.py b/src/otx/algorithms/visual_prompting/adapters/openvino/model_wrappers/openvino_models.py index 5d4ba5e8917..ee18acd4bd6 100644 --- a/src/otx/algorithms/visual_prompting/adapters/openvino/model_wrappers/openvino_models.py +++ b/src/otx/algorithms/visual_prompting/adapters/openvino/model_wrappers/openvino_models.py @@ -23,6 +23,7 @@ from openvino.model_api.models import ImageModel, SegmentationModel from openvino.model_api.models.types import NumericalValue, StringValue +from otx.algorithms.visual_prompting.adapters.pytorch_lightning.datasets.pipelines import ResizeLongestSide from otx.api.utils.segmentation_utils import create_hard_prediction_from_soft_prediction @@ -40,13 +41,20 @@ def parameters(cls) -> Dict[str, Any]: # noqa: D102 parameters.update( { "resize_type": StringValue(default_value="fit_to_window"), + "image_size": NumericalValue(value_type=int, default_value=1024, min=0, max=2048), } ) return parameters - def preprocess(self, inputs: np.ndarray) -> Tuple[Dict[str, np.ndarray], Dict[str, Any]]: + def preprocess( + self, inputs: np.ndarray, extra_processing: bool = False + ) -> Tuple[Dict[str, np.ndarray], Dict[str, Any]]: """Update meta for image encoder.""" dict_inputs, meta = super().preprocess(inputs) + if extra_processing: + dict_inputs["images"] = ResizeLongestSide.apply_image(dict_inputs["images"][0], self.image_size).transpose( + 2, 0, 1 + )[None] meta["resize_type"] = self.resize_type return dict_inputs, meta diff --git a/src/otx/algorithms/visual_prompting/tasks/openvino.py b/src/otx/algorithms/visual_prompting/tasks/openvino.py index 7d5ad6fcd2e..f7d045f1e6c 100644 --- a/src/otx/algorithms/visual_prompting/tasks/openvino.py +++ b/src/otx/algorithms/visual_prompting/tasks/openvino.py @@ -43,7 +43,6 @@ OTXVisualPromptingDataset, get_transform, ) -from otx.algorithms.visual_prompting.adapters.pytorch_lightning.datasets.pipelines import ResizeLongestSide from otx.algorithms.visual_prompting.configs.base import VisualPromptingBaseConfig from otx.api.entities.annotation import Annotation from otx.api.entities.dataset_item import DatasetItemEntity @@ -115,13 +114,16 @@ def __init__( self.model = {} model_parameters = {"decoder": {"input_layouts": "image_embeddings:NCHW"}} self.configuration = { + "image_encoder": { + **attr.asdict(hparams.postprocessing, filter=lambda attr, value: attr.name in ["image_size"]) + }, "decoder": { **attr.asdict( hparams.postprocessing, filter=lambda attr, value: attr.name not in ["header", "description", "type", "visible_in_ui", "class_name"], ) - } + }, } for name in ["image_encoder", "decoder"]: model_adapter = VisualPromptingOpenvinoAdapter( @@ -139,10 +141,10 @@ def __init__( 
self.transform = get_transform() # TODO (sungchul): insert args def pre_process( # type: ignore - self, dataset_item: DatasetItemEntity + self, dataset_item: DatasetItemEntity, extra_processing: bool = False ) -> Tuple[Dict[str, Any], Dict[str, Any], List[Dict[str, Any]]]: """Pre-process function of OpenVINO Visual Prompting Inferencer for image encoder.""" - images, meta = self.model["image_encoder"].preprocess(dataset_item.numpy) + images, meta = self.model["image_encoder"].preprocess(dataset_item.numpy, extra_processing) prompts = OTXVisualPromptingDataset.get_prompts(dataset_item, self.labels) # to be replaced prompts = self.model["decoder"].preprocess(prompts, meta) return images, meta, prompts # type: ignore @@ -230,15 +232,14 @@ def __getitem__(self, index: int): index = self.shuffler[index] items = self.dataset[index] - images, _, prompts = self.inferencer.pre_process(items) - processed_image = ResizeLongestSide.apply_image(images["images"][0], self.target_length).transpose(2, 0, 1) - _, h, w = processed_image.shape - pad_width = ((0, 0), (0, self.target_length - h), (0, self.target_length - w)) - processed_image = np.pad(processed_image, pad_width, mode="constant", constant_values=0) + images, _, prompts = self.inferencer.pre_process(items, extra_processing=True) + _, _, h, w = images["images"].shape + pad_width = ((0, 0), (0, 0), (0, self.target_length - h), (0, self.target_length - w)) + images["images"] = np.pad(images["images"], pad_width, mode="constant", constant_values=0) if self.is_encoder: - return {"images": processed_image[None]} + return images else: - image_embeddings = self.compressed_model(processed_image[None]) + image_embeddings = self.compressed_model(images["images"]) prompt = prompts[0] # only use the first prompt prompt.pop("label") prompt.pop("orig_size") diff --git a/tests/unit/algorithms/visual_prompting/tasks/test_openvino.py b/tests/unit/algorithms/visual_prompting/tasks/test_openvino.py index f44e25e1b1c..8a8229a9bf9 100644 --- a/tests/unit/algorithms/visual_prompting/tasks/test_openvino.py +++ b/tests/unit/algorithms/visual_prompting/tasks/test_openvino.py @@ -204,7 +204,7 @@ def test_getitem(self, mocker, load_dataloader, is_encoder: bool): mocker.patch.object( dataloader.inferencer, "pre_process", - return_value=({"images": np.zeros((1, 4, 3, 3), dtype=np.uint8)}, None, [{"label": 1, "orig_size": 1}]), + return_value=({"images": np.zeros((1, 3, 4, 4), dtype=np.uint8)}, None, [{"label": 1, "orig_size": 1}]), ) results = dataloader.__getitem__(0)
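
Editor's note on the optimization flow added by these patches: the pieces visible above follow the usual NNCF post-training quantization recipe. OTXOpenVinoDataLoader yields padded image-encoder inputs (and, on the decoder branch, prompts paired with embeddings produced by the already-compressed encoder), nncf.Dataset wraps that loader with an identity transform, and the quantized IR is serialized into a temporary directory before being written back onto the model entity. The unit tests in the optimize test above assert that only the visual_prompting_image_encoder.xml/.bin artifacts change while the decoder blobs stay untouched. The sketch below restates that flow outside the task class as a minimal, self-contained illustration; it is written under assumptions (NNCF's nncf.Dataset/nncf.quantize and OpenVINO's Core.read_model/serialize APIs behaving as used in the snippets above), it is not the PR's actual optimize() body, and quantize_image_encoder, DummyCalibrationLoader, and the file paths are hypothetical names introduced only for the example.

    # Hypothetical, minimal sketch of the PTQ flow used by the patches above.
    # Assumes nncf (nncf.Dataset, nncf.quantize) and openvino.runtime are installed.
    import nncf
    import numpy as np
    from openvino.runtime import Core, serialize


    class DummyCalibrationLoader:
        """Stand-in for OTXOpenVinoDataLoader(is_encoder=True): yields padded encoder inputs."""

        def __init__(self, length: int = 8, target_length: int = 1024) -> None:
            self.length = length
            self.target_length = target_length

        def __len__(self) -> int:
            return self.length

        def __getitem__(self, index: int) -> dict:
            # Pre-processed image with the longest side resized to target_length,
            # then bottom/right zero-padded to a square, as in __getitem__ above.
            image = np.zeros((1, 3, 684, self.target_length), dtype=np.float32)
            _, _, h, w = image.shape
            pad_width = ((0, 0), (0, 0), (0, self.target_length - h), (0, self.target_length - w))
            return {"images": np.pad(image, pad_width, mode="constant", constant_values=0)}


    def quantize_image_encoder(encoder_xml: str, output_xml: str) -> None:
        """Quantize only the image encoder IR, mirroring the behaviour checked in test_optimize."""
        model = Core().read_model(encoder_xml)

        # Each loader item is already a {"input_name": np.ndarray} dict,
        # so the transform function simply forwards it unchanged.
        calibration_dataset = nncf.Dataset(DummyCalibrationLoader(), lambda data: data)

        compressed = nncf.quantize(model, calibration_dataset, subset_size=8)
        serialize(compressed, output_xml)  # the .bin is written next to the .xml

Usage would amount to calling quantize_image_encoder("visual_prompting_image_encoder.xml", "compressed_encoder.xml") inside a temporary directory and then reading the resulting .xml/.bin back into the output model, which matches the save-then-set_data pattern the optimize test patches with patch_save_model. The decoder is left in its original precision here, consistent with the assertions that visual_prompting_decoder.xml/.bin remain unchanged.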