diff --git a/.github/workflows/perf-accuracy.yml b/.github/workflows/perf-accuracy.yml deleted file mode 100644 index dc438f6ca89..00000000000 --- a/.github/workflows/perf-accuracy.yml +++ /dev/null @@ -1,104 +0,0 @@ -name: Performance-Accuracy Benchmark Test - -on: - workflow_dispatch: # run on request (no need for PR) - inputs: - model-type: - type: choice - description: Model type to run benchmark - options: - - default # speed, balance, accuracy models only - - all # default + other models - default: default - data-size: - type: choice - description: Dataset size to run benchmark - options: - - small - - medium - - large - - all - default: all - num-repeat: - description: Overrides default per-data-size number of repeat setting - default: 0 - num-epoch: - description: Overrides default per-model number of epoch setting - default: 0 - eval-upto: - type: choice - description: The last operation to evaluate. 'optimize' means all. - options: - - train - - export - - optimize - default: optimize - artifact-prefix: - type: string - default: perf-accuracy-benchmark - workflow_call: - inputs: - model-type: - type: string - description: Model type to run benchmark [default, all] - default: default - data-size: - type: string - description: Dataset size to run benchmark [small, medium, large, all] - default: all - num-repeat: - type: number - description: Overrides default per-data-size number of repeat setting - default: 0 - num-epoch: - type: number - description: Overrides default per-model number of epoch setting - default: 0 - eval-upto: - type: string - description: The last operation to evaluate. 'optimize' means all. [train, export, optimize] - default: optimize - artifact-prefix: - type: string - default: perf-accuracy-benchmark - -# Declare default permissions as read only. 
-permissions: read-all - -jobs: - Perf-Accuracy-Benchmark: - strategy: - fail-fast: false - matrix: - include: - - toxenv_task: "iseg" - task: "instance_segmentation" - - toxenv_task: "seg" - task: "semantic_segmentation" - - toxenv_task: "det" - task: "detection" - - toxenv_task: "ano" - task: "anomaly" - - toxenv_task: "cls" - task: "classification" - name: Perf-Accuracy-Benchmark-${{ matrix.toxenv_task }}-py310 - uses: ./.github/workflows/run_tests_in_tox.yml - with: - python-version: "3.10" - toxenv-pyver: "py310" - toxenv-task: ${{ matrix.toxenv_task }} - tests-dir: > - tests/perf/test_${{ matrix.task }}.py - -k accuracy - --model-type ${{ inputs.model-type }} - --data-root /home/validation/data/new/ - --data-size ${{ inputs.data-size }} - --num-repeat ${{ inputs.num-repeat }} - --num-epoch ${{ inputs.num-epoch }} - --eval-upto ${{ inputs.eval-upto }} - --summary-csv .tox/perf-accuracy-benchmark-${{ matrix.toxenv_task }}.csv - runs-on: "['self-hosted', 'Linux', 'X64', 'dmount']" - task: ${{ matrix.task }} - timeout-minutes: 8640 - upload-artifact: true - artifact-prefix: ${{ inputs.artifact-prefix }} diff --git a/.github/workflows/perf-speed.yml b/.github/workflows/perf-speed.yml deleted file mode 100644 index 26995b0077c..00000000000 --- a/.github/workflows/perf-speed.yml +++ /dev/null @@ -1,90 +0,0 @@ -name: Performance-Speed Benchmark Test - -on: - workflow_dispatch: # run on request (no need for PR) - inputs: - model-type: - type: choice - description: Model type to run benchmark - options: - - default # speed, balance, accuracy models only - - all # default + other models - default: default - data-size: - type: choice - description: Dataset size to run benchmark - options: - - small - - medium - - large - - all - default: medium - num-repeat: - description: Overrides default per-data-size number of repeat setting - default: 1 - num-epoch: - description: Overrides default per-model number of epoch setting - default: 3 - eval-upto: - type: choice - description: The last operation to evaluate. 'optimize' means all. - options: - - train - - export - - optimize - default: optimize - artifact-prefix: - type: string - default: perf-speed-benchmark - workflow_call: - inputs: - model-type: - type: string - description: Model type to run benchmark [default, all] - default: default - data-size: - type: string - description: Dataset size to run benchmark [small, medium, large, all] - default: medium - num-repeat: - type: number - description: Overrides default per-data-size number of repeat setting - default: 1 - num-epoch: - type: number - description: Overrides default per-model number of epoch setting - default: 3 - eval-upto: - type: string - description: The last operation to evaluate. 'optimize' means all [train, export, optimize] - default: optimize - artifact-prefix: - type: string - default: perf-speed-benchmark - -# Declare default permissions as read only. 
-permissions: read-all - -jobs: - Perf-Speed-Benchmark: - name: Perf-Speed-Benchmark-all-py310 - uses: ./.github/workflows/run_tests_in_tox.yml - with: - python-version: "3.10" - toxenv-pyver: "py310" - toxenv-task: all - tests-dir: > - tests/perf/ - -k speed - --model-type ${{ inputs.model-type }} - --data-root /home/validation/data/new/ - --data-size ${{ inputs.data-size }} - --num-repeat ${{ inputs.num-repeat }} - --num-epoch ${{ inputs.num-epoch }} - --eval-upto ${{ inputs.eval-upto }} - --summary-csv .tox/perf-speed-benchmark-all.csv - runs-on: "['self-hosted', 'Linux', 'X64', 'dmount']" - task: all - timeout-minutes: 8640 - upload-artifact: true - artifact-prefix: ${{ inputs.artifact-prefix }} diff --git a/.github/workflows/perf_accuracy.yml b/.github/workflows/perf_accuracy.yml new file mode 100644 index 00000000000..77b12e9aa81 --- /dev/null +++ b/.github/workflows/perf_accuracy.yml @@ -0,0 +1,146 @@ +name: Perf-Accuracy Benchmark + +on: + workflow_dispatch: # run on request (no need for PR) + inputs: + model-category: + type: choice + description: Model category to run benchmark + options: + - default # speed, balance, accuracy models only + - all # default + other models + default: default + data-size: + type: choice + description: Dataset size to run benchmark + options: + - small + - medium + - large + - all + default: all + num-repeat: + description: Overrides default per-data-size number of repeat setting + default: 0 + num-epoch: + description: Overrides default per-model number of epoch setting + default: 0 + eval-upto: + type: choice + description: The last operation to evaluate. 'optimize' means all. + options: + - train + - export + - optimize + default: optimize + pytest-args: + type: string + description: | + Additional perf-benchmark pytest arguments. + "-k detection" -> detection task only + "--dry-run" -> print command w/o execution. + data-root: + type: string + description: Root directory containing validation data in CI server. + default: /home/validation/data/new/ + artifact-prefix: + type: string + default: perf-accuracy-benchmark + workflow_call: + inputs: + model-category: + type: string + description: Model category to run benchmark [default, all] + default: default + data-size: + type: string + description: Dataset size to run benchmark [small, medium, large, all] + default: all + num-repeat: + type: number + description: Overrides default per-data-size number of repeat setting + default: 0 + num-epoch: + type: number + description: Overrides default per-model number of epoch setting + default: 0 + eval-upto: + type: string + description: The last operation to evaluate. 'optimize' means all. [train, export, optimize] + default: optimize + pytest-args: + type: string + description: | + Additional perf-benchmark pytest arguments. + "-k detection" -> detection task only + "--dry-run" -> print command w/o execution. + data-root: + type: string + description: Root directory containing validation data in CI server. + default: /home/validation/data/new/ + artifact-prefix: + type: string + default: perf-accuracy-benchmark + +# Declare default permissions as read only.
+permissions: read-all + +jobs: + Perf-Accuracy-Benchmark: + strategy: + fail-fast: false + matrix: + include: + - task-short: "ano" + task: "anomaly" + - task-short: "cls" + task: "classification" + - task-short: "det" + task: "detection" + - task-short: "isg" + task: "instance_segmentation" + - task-short: "ssg" + task: "semantic_segmentation" + - task-short: "vsp" + task: "visual_prompting" + name: Perf-Accuracy-Benchmark-${{ matrix.task-short }} + runs-on: "['self-hosted', 'Linux', 'X64', 'dmount']" + timeout-minutes: 8640 + steps: + - name: Checkout repository + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: Set up Python + uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1 + with: + python-version: "3.10" + - name: Install dependencies + run: | + pip install --require-hashes --no-deps -r requirements/gh-actions.txt + pip-compile --generate-hashes -o /tmp/otx-dev-requirements.txt requirements/dev.txt + pip install --require-hashes --no-deps -r /tmp/otx-dev-requirements.txt + rm /tmp/otx-dev-requirements.txt + - name: Run Tests + env: + MLFLOW_TRACKING_SERVER_URI: ${{ vars.MLFLOW_TRACKING_SERVER_URI }} + BENCHMARK_RESULTS_CLEAR: ${{ vars.BENCHMARK_RESULTS_CLEAR }} + GH_CTX_REF_NAME: ${{ github.ref_name }} + GH_CTX_SHA: ${{ github.sha }} + run: > + tox -vv -e perf-benchmark -- tests/perf/test_${{ matrix.task }}.py ${{ inputs.pytest-args }} + --benchmark-type accuracy + --model-category ${{ inputs.model-category }} + --data-root ${{ inputs.data-root }} + --data-size ${{ inputs.data-size }} + --num-repeat ${{ inputs.num-repeat }} + --num-epoch ${{ inputs.num-epoch }} + --eval-upto ${{ inputs.eval-upto }} + --summary-csv .tox/perf-accuracy-benchmark-${{ matrix.task-short }}.csv + --mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }} + --user-name ${{ vars.USER_NAME }} + - name: Upload test results + uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + with: + name: ${{ inputs.artifact-prefix }}-${{ matrix.task-short }} + path: .tox/perf-*.csv + # Use always() to always run this step to publish test results when there are test failures + if: ${{ always() }} diff --git a/.github/workflows/perf_efficiency.yml b/.github/workflows/perf_efficiency.yml new file mode 100644 index 00000000000..7bf7069423f --- /dev/null +++ b/.github/workflows/perf_efficiency.yml @@ -0,0 +1,130 @@ +name: Perf-Efficiency Benchmark + +on: + workflow_dispatch: # run on request (no need for PR) + inputs: + model-category: + type: choice + description: Model category to run benchmark + options: + - default # speed, balance, accuracy models only + - all # default + other models + default: default + data-size: + type: choice + description: Dataset size to run benchmark + options: + - small + - medium + - large + - all + default: medium + num-repeat: + description: Overrides default per-data-size number of repeat setting + default: 1 + num-epoch: + description: Overrides default per-model number of epoch setting + default: 2 + eval-upto: + type: choice + description: The last operation to evaluate. 'optimize' means all. + options: + - train + - export + - optimize + default: optimize + pytest-args: + type: string + description: | + Additional perf-benchmark pytest arguments. + "-k detection" -> detection task only + "--dry-run" -> print command w/o execution. + data-root: + type: string + description: Root directory containing validation data in CI server.
+ default: /home/validation/data/new/ + artifact-prefix: + type: string + default: perf-efficiency-benchmark + workflow_call: + inputs: + model-category: + type: string + description: Model category to run benchmark [default, all] + default: default + data-size: + type: string + description: Dataset size to run benchmark [small, medium, large, all] + default: medium + num-repeat: + type: number + description: Overrides default per-data-size number of repeat setting + default: 1 + num-epoch: + type: number + description: Overrides default per-model number of epoch setting + default: 2 + eval-upto: + type: string + description: The last operation to evaluate. 'optimize' means all. [train, export, optimize] + default: optimize + pytest-args: + type: string + description: | + Additional perf-benchmark pytest arguments. + "-k detection" -> detection task only + "--dry-run" -> print command w/o execution. + data-root: + type: string + description: Root directory containing validation data in CI server. + default: /home/validation/data/new/ + artifact-prefix: + type: string + default: perf-efficiency-benchmark + +# Declare default permissions as read only. +permissions: read-all + +jobs: + Perf-Efficiency-Benchmark: + name: Perf-Efficiency-Benchmark-all + runs-on: "['self-hosted', 'Linux', 'X64', 'dmount']" + timeout-minutes: 8640 + steps: + - name: Checkout repository + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: Set up Python + uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1 + with: + python-version: "3.10" + - name: Install dependencies + run: | + pip install --require-hashes --no-deps -r requirements/gh-actions.txt + pip-compile --generate-hashes -o /tmp/otx-dev-requirements.txt requirements/dev.txt + pip install --require-hashes --no-deps -r /tmp/otx-dev-requirements.txt + rm /tmp/otx-dev-requirements.txt + - name: Run Tests + env: + MLFLOW_TRACKING_SERVER_URI: ${{ vars.MLFLOW_TRACKING_SERVER_URI }} + BENCHMARK_RESULTS_CLEAR: ${{ vars.BENCHMARK_RESULTS_CLEAR }} + GH_CTX_REF_NAME: ${{ github.ref_name }} + GH_CTX_SHA: ${{ github.sha }} + run: > + tox -vv -e perf-benchmark -- tests/perf ${{ inputs.pytest-args }} + --benchmark-type efficiency + --model-category ${{ inputs.model-category }} + --data-root ${{ inputs.data-root }} + --data-size ${{ inputs.data-size }} + --num-repeat ${{ inputs.num-repeat }} + --num-epoch ${{ inputs.num-epoch }} + --eval-upto ${{ inputs.eval-upto }} + --summary-csv .tox/perf-efficiency-benchmark-all.csv + --mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }} + --user-name ${{ vars.USER_NAME }} + - name: Upload test results + uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + with: + name: ${{ inputs.artifact-prefix }}-all + path: .tox/perf-*.csv + # Use always() to always run this step to publish test results when there are test failures + if: ${{ always() }} diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index ceb401b21f6..4ae23fbcced 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -10,19 +10,19 @@ on: permissions: read-all jobs: - Performance-Speed-Tests: - name: Performance-Speed-py310 - uses: ./.github/workflows/perf-speed.yml + Weekly-Perf-Efficiency-Benchmark: + name: Weekly-Perf-Efficiency-Benchmark + uses: ./.github/workflows/perf_efficiency.yml with: - model-type: default + model-category: default data-size: medium num-repeat: 1 - num-epoch: 3 + num-epoch: 2 eval-upto: optimize - artifact-prefix: weekly-perf-speed-benchmark
- Performance-Accuracy-Tests: - name: Performance-Accuracy-py310 - uses: ./.github/workflows/perf-accuracy.yml + artifact-prefix: weekly-perf-efficiency-benchmark + Weekly-Perf-Accuracy-Benchmark: + name: Weekly-Perf-Accuracy-Benchmark + uses: ./.github/workflows/perf_accuracy.yml with: - model-type: default + model-category: default data-size: all diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index 21c1ab84d54..123686dcdf4 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -24,7 +24,14 @@ def pytest_addoption(parser): """Add custom options for perf tests.""" parser.addoption( - "--model-type", + "--benchmark-type", + action="store", + default="accuracy", + choices=("accuracy", "efficiency", "all"), + help="Choose accuracy|efficiency|all. Defaults to accuracy.", + ) + parser.addoption( + "--model-category", action="store", default="all", choices=("default", "all"), @@ -81,6 +88,17 @@ def pytest_addoption(parser): default=False, help="Print OTX commands without execution.", ) + parser.addoption( + "--user-name", + type=str, + default="anonymous", + help='Sign-off the user name who launched the regression tests this time, e.g., `--user-name "John Doe"`.', + ) + parser.addoption( + "--mlflow-tracking-uri", # Currently set by MLFLOW_TRACKING_SERVER_URI env variable. To be fixed. + type=str, + help="URI for MLFlow Tracking server to store the regression test results.", + ) @pytest.fixture(scope="session") @@ -106,9 +124,9 @@ def fxt_working_branch() -> str: @pytest.fixture def fxt_model_id(request: pytest.FixtureRequest) -> str: """Skip by model category.""" - model_type: str = request.config.getoption("--model-type") + model_category: str = request.config.getoption("--model-category") model_template: ModelTemplate = request.param - if model_type == "default": + if model_category == "default": if model_template.model_category == ModelCategory.OTHER: pytest.skip(f"{model_template.model_category} category model") return model_template.model_template_id @@ -117,6 +135,11 @@ @pytest.fixture def fxt_benchmark(request: pytest.FixtureRequest, fxt_output_root: Path) -> OTXBenchmark: """Configure benchmark.""" + # Skip by benchmark type + benchmark_type: str = request.config.getoption("--benchmark-type") + if benchmark_type != "all" and benchmark_type not in request.node.name: + pytest.skip(f"non-{benchmark_type} benchmark") + # Skip by dataset size data_size_option: str = request.config.getoption("--data-size") data_size: str = request.param[0] @@ -129,6 +152,7 @@ def fxt_benchmark(request: pytest.FixtureRequest, fxt_output_root: Path) -> OTXB tags = cfg.get("tags", {}) tags["data_size"] = data_size + tags["user_name"] = request.config.getoption("--user-name") cfg["tags"] = tags num_epoch_override: int = int(request.config.getoption("--num-epoch")) @@ -278,6 +302,9 @@ def check_benchmark_result(result: pd.DataFrame, key: Tuple, checks: List[Dict]) print("No benchmark references loaded.
Skipping result checking.") return + if result is None: + return + def get_entry(data: pd.DataFrame, key: Tuple) -> pd.Series: if key in data.index: return data.loc[key] diff --git a/tests/perf/test_anomaly.py b/tests/perf/test_anomaly.py index ac7e62e37c6..74bad3e6d90 100644 --- a/tests/perf/test_anomaly.py +++ b/tests/perf/test_anomaly.py @@ -81,16 +81,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "train_e2e_time", @@ -171,16 +171,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "train_e2e_time", @@ -261,16 +261,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "train_e2e_time", diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py index 9397dc5413e..c1bb1819646 100644 --- a/tests/perf/test_classification.py +++ 
b/tests/perf/test_classification.py @@ -87,16 +87,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "avg_data_time", @@ -194,16 +194,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "avg_data_time", @@ -301,16 +301,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "avg_data_time", diff --git a/tests/perf/test_detection.py b/tests/perf/test_detection.py index c754549655a..c81001c1438 100644 --- a/tests/perf/test_detection.py +++ b/tests/perf/test_detection.py @@ -87,16 +87,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def 
test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "avg_data_time", diff --git a/tests/perf/test_instance_segmentation.py b/tests/perf/test_instance_segmentation.py index fc869a29a1b..bb315e4f4d6 100644 --- a/tests/perf/test_instance_segmentation.py +++ b/tests/perf/test_instance_segmentation.py @@ -88,16 +88,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "avg_data_time", @@ -202,16 +202,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "avg_data_time", diff --git a/tests/perf/test_semantic_segmentation.py b/tests/perf/test_semantic_segmentation.py index 62eaa01f6c0..5728ec4f057 100644 --- a/tests/perf/test_semantic_segmentation.py +++ b/tests/perf/test_semantic_segmentation.py @@ -90,16 +90,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, 
fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "avg_data_time", diff --git a/tests/perf/test_visual_prompting.py b/tests/perf/test_visual_prompting.py new file mode 100644 index 00000000000..5d59f7ba09c --- /dev/null +++ b/tests/perf/test_visual_prompting.py @@ -0,0 +1,4 @@ +"""OTX Visual Prompting performance tests.""" + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/tox.ini b/tox.ini index 6c7f7da2581..fe780508865 100644 --- a/tox.ini +++ b/tox.ini @@ -65,10 +65,6 @@ deps = -r{toxinidir}/requirements/dev.txt passenv = {[testenv]passenv} - MLFLOW_TRACKING_SERVER_URI - BENCHMARK_RESULTS_CLEAR - GH_CTX_REF_NAME - GH_CTX_SHA commands = python -m pytest -ra --showlocals --csv={toxworkdir}/{envname}.csv {posargs:tests/integration/{[testenv]test_dir}} @@ -86,6 +82,20 @@ commands = coverage xml -o {toxworkdir}/coverage.xml +[testenv:perf-benchmark] +deps = + {[testenv:tests-all-py310-pt1]deps} +extras = full +passenv = + {[testenv]passenv} + MLFLOW_TRACKING_SERVER_URI + BENCHMARK_RESULTS_CLEAR + GH_CTX_REF_NAME + GH_CTX_SHA +commands = + python -m pytest -ra --showlocals --csv={toxworkdir}/{envname}.csv {posargs:tests/perf} + + [testenv:fuzzing] deps = {[testenv:tests-all-py310-pt1]deps}
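Note: tests/perf/test_visual_prompting.py lands in this PR as a docstring-only stub. Below is a minimal sketch, not part of the diff, of how that module could later mirror the other task benchmark files such as tests/perf/test_anomaly.py; the MODEL_TEMPLATES, MODEL_IDS, and BENCHMARK_CONFIGS placeholders and the empty checks lists are assumptions that would need real visual prompting templates, dataset configs, and metric thresholds.

"""Hypothetical sketch of OTX Visual Prompting performance tests (illustration only)."""

from typing import Callable

import pytest

# Placeholders (assumed): the real module would enumerate visual prompting model
# templates and per-data-size benchmark configs, as the other task modules do.
MODEL_TEMPLATES: list = []
MODEL_IDS: list = []
BENCHMARK_CONFIGS: dict = {}


class TestPerfVisualPrompting:
    """Benchmark visual prompting models."""

    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
    def test_accuracy(self, fxt_model_id: str, fxt_benchmark, fxt_check_benchmark_result: Callable):
        """Benchmark accuracy metrics."""
        result = fxt_benchmark.run(model_id=fxt_model_id, tags={"benchmark": "accuracy"})
        fxt_check_benchmark_result(
            result,
            key=("accuracy", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id),
            checks=[],  # per-metric thresholds to be defined, as in the other task modules
        )

    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
    def test_efficiency(self, fxt_model_id: str, fxt_benchmark, fxt_check_benchmark_result: Callable):
        """Benchmark train time per iter / infer time per image."""
        fxt_benchmark.track_resources = True
        result = fxt_benchmark.run(model_id=fxt_model_id, tags={"benchmark": "efficiency"})
        fxt_check_benchmark_result(
            result,
            key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id),
            checks=[],  # per-metric thresholds to be defined, as in the other task modules
        )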