diff --git a/.github/workflows/perf-accuracy.yml b/.github/workflows/perf-accuracy.yml deleted file mode 100644 index dc438f6ca89..00000000000 --- a/.github/workflows/perf-accuracy.yml +++ /dev/null @@ -1,104 +0,0 @@ -name: Performance-Accuracy Benchmark Test - -on: - workflow_dispatch: # run on request (no need for PR) - inputs: - model-type: - type: choice - description: Model type to run benchmark - options: - - default # speed, balance, accuracy models only - - all # default + other models - default: default - data-size: - type: choice - description: Dataset size to run benchmark - options: - - small - - medium - - large - - all - default: all - num-repeat: - description: Overrides default per-data-size number of repeat setting - default: 0 - num-epoch: - description: Overrides default per-model number of epoch setting - default: 0 - eval-upto: - type: choice - description: The last operation to evaluate. 'optimize' means all. - options: - - train - - export - - optimize - default: optimize - artifact-prefix: - type: string - default: perf-accuracy-benchmark - workflow_call: - inputs: - model-type: - type: string - description: Model type to run benchmark [default, all] - default: default - data-size: - type: string - description: Dataset size to run benchmark [small, medium, large, all] - default: all - num-repeat: - type: number - description: Overrides default per-data-size number of repeat setting - default: 0 - num-epoch: - type: number - description: Overrides default per-model number of epoch setting - default: 0 - eval-upto: - type: string - description: The last operation to evaluate. 'optimize' means all. [train, export, optimize] - default: optimize - artifact-prefix: - type: string - default: perf-accuracy-benchmark - -# Declare default permissions as read only. 
-permissions: read-all - -jobs: - Perf-Accuracy-Benchmark: - strategy: - fail-fast: false - matrix: - include: - - toxenv_task: "iseg" - task: "instance_segmentation" - - toxenv_task: "seg" - task: "semantic_segmentation" - - toxenv_task: "det" - task: "detection" - - toxenv_task: "ano" - task: "anomaly" - - toxenv_task: "cls" - task: "classification" - name: Perf-Accuracy-Benchmark-${{ matrix.toxenv_task }}-py310 - uses: ./.github/workflows/run_tests_in_tox.yml - with: - python-version: "3.10" - toxenv-pyver: "py310" - toxenv-task: ${{ matrix.toxenv_task }} - tests-dir: > - tests/perf/test_${{ matrix.task }}.py - -k accuracy - --model-type ${{ inputs.model-type }} - --data-root /home/validation/data/new/ - --data-size ${{ inputs.data-size }} - --num-repeat ${{ inputs.num-repeat }} - --num-epoch ${{ inputs.num-epoch }} - --eval-upto ${{ inputs.eval-upto }} - --summary-csv .tox/perf-accuracy-benchmark-${{ matrix.toxenv_task }}.csv - runs-on: "['self-hosted', 'Linux', 'X64', 'dmount']" - task: ${{ matrix.task }} - timeout-minutes: 8640 - upload-artifact: true - artifact-prefix: ${{ inputs.artifact-prefix }} diff --git a/.github/workflows/perf-speed.yml b/.github/workflows/perf-speed.yml deleted file mode 100644 index 26995b0077c..00000000000 --- a/.github/workflows/perf-speed.yml +++ /dev/null @@ -1,90 +0,0 @@ -name: Performance-Speed Benchmark Test - -on: - workflow_dispatch: # run on request (no need for PR) - inputs: - model-type: - type: choice - description: Model type to run benchmark - options: - - default # speed, balance, accuracy models only - - all # default + other models - default: default - data-size: - type: choice - description: Dataset size to run benchmark - options: - - small - - medium - - large - - all - default: medium - num-repeat: - description: Overrides default per-data-size number of repeat setting - default: 1 - num-epoch: - description: Overrides default per-model number of epoch setting - default: 3 - eval-upto: - type: choice - description: The last operation to evaluate. 'optimize' means all. - options: - - train - - export - - optimize - default: optimize - artifact-prefix: - type: string - default: perf-speed-benchmark - workflow_call: - inputs: - model-type: - type: string - description: Model type to run benchmark [default, all] - default: default - data-size: - type: string - description: Dataset size to run benchmark [small, medium, large, all] - default: medium - num-repeat: - type: number - description: Overrides default per-data-size number of repeat setting - default: 1 - num-epoch: - type: number - description: Overrides default per-model number of epoch setting - default: 3 - eval-upto: - type: string - description: The last operation to evaluate. 'optimize' means all [train, export, optimize] - default: optimize - artifact-prefix: - type: string - default: perf-speed-benchmark - -# Declare default permissions as read only. 
-permissions: read-all - -jobs: - Perf-Speed-Benchmark: - name: Perf-Speed-Benchmark-all-py310 - uses: ./.github/workflows/run_tests_in_tox.yml - with: - python-version: "3.10" - toxenv-pyver: "py310" - toxenv-task: all - tests-dir: > - tests/perf/ - -k speed - --model-type ${{ inputs.model-type }} - --data-root /home/validation/data/new/ - --data-size ${{ inputs.data-size }} - --num-repeat ${{ inputs.num-repeat }} - --num-epoch ${{ inputs.num-epoch }} - --eval-upto ${{ inputs.eval-upto }} - --summary-csv .tox/perf-speed-benchmark-all.csv - runs-on: "['self-hosted', 'Linux', 'X64', 'dmount']" - task: all - timeout-minutes: 8640 - upload-artifact: true - artifact-prefix: ${{ inputs.artifact-prefix }} diff --git a/.github/workflows/perf_accuracy.yml b/.github/workflows/perf_accuracy.yml new file mode 100644 index 00000000000..77b12e9aa81 --- /dev/null +++ b/.github/workflows/perf_accuracy.yml @@ -0,0 +1,146 @@ +name: Perf-Accuracy Benchmark + +on: + workflow_dispatch: # run on request (no need for PR) + inputs: + model-category: + type: choice + description: Model category to run benchmark + options: + - default # speed, balance, accuracy models only + - all # default + other models + default: default + data-size: + type: choice + description: Dataset size to run benchmark + options: + - small + - medium + - large + - all + default: all + num-repeat: + description: Overrides default per-data-size number of repeat setting + default: 0 + num-epoch: + description: Overrides default per-model number of epoch setting + default: 0 + eval-upto: + type: choice + description: The last operation to evaluate. 'optimize' means all. + options: + - train + - export + - optimize + default: optimize + pytest-args: + type: string + description: | + Additional perf-benchmark pytest arguments. + "-k detection" -> detection task only + "--dry-run" -> print command w/o execution. + data-root: + type: string + description: Root directory containing validation data in CI server. + default: /home/validation/data/new/ + artifact-prefix: + type: string + default: perf-accuracy-benchmark + workflow_call: + inputs: + model-category: + type: string + description: Model category to run benchmark [default, all] + default: default + data-size: + type: string + description: Dataset size to run benchmark [small, medium, large, all] + default: all + num-repeat: + type: number + description: Overrides default per-data-size number of repeat setting + default: 0 + num-epoch: + type: number + description: Overrides default per-model number of epoch setting + default: 0 + eval-upto: + type: string + description: The last operation to evaluate. 'optimize' means all. [train, export, optimize] + default: optimize + pytest-args: + type: string + description: | + Additional perf-benchmark pytest arguments. + "-k detection" -> detection task only + "--dry-run" -> print command w/o execution. + data-root: + type: string + description: Root directory containing validation data in CI server. + default: /home/validation/data/new/ + artifact-prefix: + type: string + default: perf-accuracy-benchmark + +# Declare default permissions as read only.
+permissions: read-all + +jobs: + Perf-Accuracy-Benchmark: + strategy: + fail-fast: false + matrix: + include: + - task-short: "ano" + task: "anomaly" + - task-short: "cls" + task: "classification" + - task-short: "det" + task: "detection" + - task-short: "isg" + task: "instance_segmentation" + - task-short: "ssg" + task: "semantic_segmentation" + - task-short: "vsp" + task: "visual_prompting" + name: Perf-Accuracy-Benchmark-${{ matrix.task-short }} + runs-on: "['self-hosted', 'Linux', 'X64', 'dmount']" + timeout-minutes: 8640 + steps: + - name: Checkout repository + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: Set up Python + uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1 + with: + python-version: "3.10" + - name: Install dependencies + run: | + pip install --require-hashes --no-deps -r requirements/gh-actions.txt + pip-compile --generate-hashes -o /tmp/otx-dev-requirements.txt requirements/dev.txt + pip install --require-hashes --no-deps -r /tmp/otx-dev-requirements.txt + rm /tmp/otx-dev-requirements.txt + - name: Run Tests + env: + MLFLOW_TRACKING_SERVER_URI: ${{ vars.MLFLOW_TRACKING_SERVER_URI }} + BENCHMARK_RESULTS_CLEAR: ${{ vars.BENCHMARK_RESULTS_CLEAR }} + GH_CTX_REF_NAME: ${{ github.ref_name }} + GH_CTX_SHA: ${{ github.sha }} + run: > + tox -vv -e perf-benchmark -- tests/perf/test_${{ matrix.task }}.py ${{ inputs.pytest-args }} + --benchmark-type accuracy + --model-category ${{ inputs.model-category }} + --data-root ${{ inputs.data-root }} + --data-size ${{ inputs.data-size }} + --num-repeat ${{ inputs.num-repeat }} + --num-epoch ${{ inputs.num-epoch }} + --eval-upto ${{ inputs.eval-upto }} + --summary-csv .tox/perf-accuracy-benchmark-${{ matrix.task-short }}.csv + --mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }} + --user-name ${{ vars.USER_NAME }} + - name: Upload test results + uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + with: + name: ${{ inputs.artifact-prefix }}-${{ matrix.task-short }} + path: .tox/perf-*.csv + # Use always() to always run this step to publish test results when there are test failures + if: ${{ always() }} diff --git a/.github/workflows/perf_efficiency.yml b/.github/workflows/perf_efficiency.yml new file mode 100644 index 00000000000..7bf7069423f --- /dev/null +++ b/.github/workflows/perf_efficiency.yml @@ -0,0 +1,130 @@ +name: Perf-Efficiency Benchmark + +on: + workflow_dispatch: # run on request (no need for PR) + inputs: + model-category: + type: choice + description: Model category to run benchmark + options: + - default # speed, balance, accuracy models only + - all # default + other models + default: default + data-size: + type: choice + description: Dataset size to run benchmark + options: + - small + - medium + - large + - all + default: medium + num-repeat: + description: Overrides default per-data-size number of repeat setting + default: 1 + num-epoch: + description: Overrides default per-model number of epoch setting + default: 2 + eval-upto: + type: choice + description: The last operation to evaluate. 'optimize' means all. + options: + - train + - export + - optimize + default: optimize + pytest-args: + type: string + description: | + Additional perf-benchmark pytest arguments. + "-k detection" -> detection task only + "--dry-run" -> print command w/o execution. + data-root: + type: string + description: Root directory containing validation data in CI server.
+ default: /home/validation/data/new/ + artifact-prefix: + type: string + default: perf-efficiency-benchmark + workflow_call: + inputs: + model-category: + type: string + description: Model category to run benchmark [default, all] + default: default + data-size: + type: string + description: Dataset size to run benchmark [small, medium, large, all] + default: medium + num-repeat: + type: number + description: Overrides default per-data-size number of repeat setting + default: 1 + num-epoch: + type: number + description: Overrides default per-model number of epoch setting + default: 2 + eval-upto: + type: string + description: The last operation to evaluate. 'optimize' means all. [train, export, optimize] + default: optimize + pytest-args: + type: string + description: | + Additional perf-benchmark pytest arguments. + "-k detection" -> detection task only + "--dry-run" -> print command w/o execution. + data-root: + type: string + description: Root directory containing validation data in CI server. + default: /home/validation/data/new/ + artifact-prefix: + type: string + default: perf-efficiency-benchmark + +# Declare default permissions as read only. +permissions: read-all + +jobs: + Perf-Efficiency-Benchmark: + name: Perf-Efficiency-Benchmark-all + runs-on: "['self-hosted', 'Linux', 'X64', 'dmount']" + timeout-minutes: 8640 + steps: + - name: Checkout repository + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: Set up Python + uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1 + with: + python-version: "3.10" + - name: Install dependencies + run: | + pip install --require-hashes --no-deps -r requirements/gh-actions.txt + pip-compile --generate-hashes -o /tmp/otx-dev-requirements.txt requirements/dev.txt + pip install --require-hashes --no-deps -r /tmp/otx-dev-requirements.txt + rm /tmp/otx-dev-requirements.txt + - name: Run Tests + env: + MLFLOW_TRACKING_SERVER_URI: ${{ vars.MLFLOW_TRACKING_SERVER_URI }} + BENCHMARK_RESULTS_CLEAR: ${{ vars.BENCHMARK_RESULTS_CLEAR }} + GH_CTX_REF_NAME: ${{ github.ref_name }} + GH_CTX_SHA: ${{ github.sha }} + run: > + tox -vv -e perf-benchmark -- tests/perf ${{ inputs.pytest-args }} + --benchmark-type efficiency + --model-category ${{ inputs.model-category }} + --data-root ${{ inputs.data-root }} + --data-size ${{ inputs.data-size }} + --num-repeat ${{ inputs.num-repeat }} + --num-epoch ${{ inputs.num-epoch }} + --eval-upto ${{ inputs.eval-upto }} + --summary-csv .tox/perf-efficiency-benchmark-all.csv + --mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }} + --user-name ${{ vars.USER_NAME }} + - name: Upload test results + uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + with: + name: ${{ inputs.artifact-prefix }}-all + path: .tox/perf-*.csv + # Use always() to always run this step to publish test results when there are test failures + if: ${{ always() }} diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index ceb401b21f6..4ae23fbcced 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -10,19 +10,19 @@ on: permissions: read-all jobs: - Performance-Speed-Tests: - name: Performance-Speed-py310 - uses: ./.github/workflows/perf-speed.yml + Weekly-Perf-Efficiency-Benchmark: + name: Weekly-Perf-Efficiency-Benchmark + uses: ./.github/workflows/perf_efficiency.yml with: - model-type: default + model-category: default data-size: medium num-repeat: 1 - num-epoch: 3 + num-epoch: 2 eval-upto: optimize - artifact-prefix: weekly-perf-speed-benchmark
- Performance-Accuracy-Tests: - name: Performance-Accuracy-py310 - uses: ./.github/workflows/perf-accuracy.yml + artifact-prefix: weekly-perf-efficiency-benchmark + Weekly-Perf-Accuracy-Benchmark: + name: Weekly-Perf-Accuracy-Benchmark + uses: ./.github/workflows/perf_accuracy.yml with: - model-type: default + model-category: default data-size: all diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index 21c1ab84d54..123686dcdf4 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -24,7 +24,14 @@ def pytest_addoption(parser): """Add custom options for perf tests.""" parser.addoption( - "--model-type", + "--benchmark-type", + action="store", + default="accuracy", + choices=("accuracy", "efficiency", "all"), + help="Choose accuracy|efficiency|all. Defaults to accuracy.", + ) + parser.addoption( + "--model-category", action="store", default="all", choices=("default", "all"), @@ -81,6 +88,17 @@ def pytest_addoption(parser): default=False, help="Print OTX commands without execution.", ) + parser.addoption( + "--user-name", + type=str, + default="anonymous", + help='Sign-off the user name who launched the regression tests this time, e.g., `--user-name "John Doe"`.', + ) + parser.addoption( + "--mlflow-tracking-uri", # Currently set by MLFLOW_TRACKING_SERVER_URI env variable. To be fixed. + type=str, + help="URI for MLFlow Tracking server to store the regression test results.", + ) @pytest.fixture(scope="session") @@ -106,9 +124,9 @@ def fxt_working_branch() -> str: @pytest.fixture def fxt_model_id(request: pytest.FixtureRequest) -> str: """Skip by model category.""" - model_type: str = request.config.getoption("--model-type") + model_category: str = request.config.getoption("--model-category") model_template: ModelTemplate = request.param - if model_type == "default": + if model_category == "default": if model_template.model_category == ModelCategory.OTHER: pytest.skip(f"{model_template.model_category} category model") return model_template.model_template_id @@ -117,6 +135,11 @@ @pytest.fixture def fxt_benchmark(request: pytest.FixtureRequest, fxt_output_root: Path) -> OTXBenchmark: """Configure benchmark.""" + # Skip by benchmark type + benchmark_type: str = request.config.getoption("--benchmark-type") + if benchmark_type != "all" and benchmark_type not in request.node.name: + pytest.skip(f"non-{benchmark_type} benchmark") + # Skip by dataset size data_size_option: str = request.config.getoption("--data-size") data_size: str = request.param[0] @@ -129,6 +152,7 @@ def fxt_benchmark(request: pytest.FixtureRequest, fxt_output_root: Path) -> OTXB tags = cfg.get("tags", {}) tags["data_size"] = data_size + tags["user_name"] = request.config.getoption("--user-name") cfg["tags"] = tags num_epoch_override: int = int(request.config.getoption("--num-epoch")) @@ -278,6 +302,9 @@ def check_benchmark_result(result: pd.DataFrame, key: Tuple, checks: List[Dict]) print("No benchmark references loaded.
Skipping result checking.") return + if result is None: + return + def get_entry(data: pd.DataFrame, key: Tuple) -> pd.Series: if key in data.index: return data.loc[key] diff --git a/tests/perf/test_anomaly.py b/tests/perf/test_anomaly.py index ac7e62e37c6..74bad3e6d90 100644 --- a/tests/perf/test_anomaly.py +++ b/tests/perf/test_anomaly.py @@ -81,16 +81,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "train_e2e_time", @@ -171,16 +171,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "train_e2e_time", @@ -261,16 +261,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "train_e2e_time", diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py index 9397dc5413e..c1bb1819646 100644 --- a/tests/perf/test_classification.py +++ 
b/tests/perf/test_classification.py @@ -87,16 +87,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "avg_data_time", @@ -194,16 +194,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "avg_data_time", @@ -301,16 +301,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "avg_data_time", diff --git a/tests/perf/test_detection.py b/tests/perf/test_detection.py index c754549655a..c81001c1438 100644 --- a/tests/perf/test_detection.py +++ b/tests/perf/test_detection.py @@ -87,16 +87,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def 
test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "avg_data_time", diff --git a/tests/perf/test_instance_segmentation.py b/tests/perf/test_instance_segmentation.py index fc869a29a1b..bb315e4f4d6 100644 --- a/tests/perf/test_instance_segmentation.py +++ b/tests/perf/test_instance_segmentation.py @@ -88,16 +88,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "avg_data_time", @@ -202,16 +202,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "avg_data_time", diff --git a/tests/perf/test_semantic_segmentation.py b/tests/perf/test_semantic_segmentation.py index 62eaa01f6c0..5728ec4f057 100644 --- a/tests/perf/test_semantic_segmentation.py +++ b/tests/perf/test_semantic_segmentation.py @@ -90,16 +90,16 @@ def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_chec @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, 
fxt_check_benchmark_result: Callable): + def test_efficiency(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark, fxt_check_benchmark_result: Callable): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True result = fxt_benchmark.run( model_id=fxt_model_id, - tags={"benchmark": "speed"}, + tags={"benchmark": "efficiency"}, ) fxt_check_benchmark_result( result, - key=("speed", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), + key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id), checks=[ { "name": "avg_data_time", diff --git a/tests/perf/test_visual_prompting.py b/tests/perf/test_visual_prompting.py new file mode 100644 index 00000000000..5d59f7ba09c --- /dev/null +++ b/tests/perf/test_visual_prompting.py @@ -0,0 +1,4 @@ +"""OTX Visual Prompting performance tests.""" + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/tox.ini b/tox.ini index 6c7f7da2581..fe780508865 100644 --- a/tox.ini +++ b/tox.ini @@ -65,10 +65,6 @@ deps = -r{toxinidir}/requirements/dev.txt passenv = {[testenv]passenv} - MLFLOW_TRACKING_SERVER_URI - BENCHMARK_RESULTS_CLEAR - GH_CTX_REF_NAME - GH_CTX_SHA commands = python -m pytest -ra --showlocals --csv={toxworkdir}/{envname}.csv {posargs:tests/integration/{[testenv]test_dir}} @@ -86,6 +82,20 @@ commands = coverage xml -o {toxworkdir}/coverage.xml +[testenv:perf-benchmark] +deps = + {[testenv:tests-all-py310-pt1]deps} +extras = full +passenv = + {[testenv]passenv} + MLFLOW_TRACKING_SERVER_URI + BENCHMARK_RESULTS_CLEAR + GH_CTX_REF_NAME + GH_CTX_SHA +commands = + python -m pytest -ra --showlocals --csv={toxworkdir}/{envname}.csv {posargs:tests/perf} + + [testenv:fuzzing] deps = {[testenv:tests-all-py310-pt1]deps}
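Note: tests/perf/test_visual_prompting.py lands in this PR as a docstring-only stub. Below is a minimal sketch, not part of the diff, of how that module could later mirror the other task benchmark files such as tests/perf/test_anomaly.py; the MODEL_TEMPLATES, MODEL_IDS, and BENCHMARK_CONFIGS placeholders and the empty checks lists are assumptions that would need real visual prompting templates, dataset configs, and metric thresholds.

"""Hypothetical sketch of OTX Visual Prompting performance tests (illustration only)."""

from typing import Callable

import pytest

# Placeholders (assumed): the real module would enumerate visual prompting model
# templates and per-data-size benchmark configs, as the other task modules do.
MODEL_TEMPLATES: list = []
MODEL_IDS: list = []
BENCHMARK_CONFIGS: dict = {}


class TestPerfVisualPrompting:
    """Benchmark visual prompting models."""

    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
    def test_accuracy(self, fxt_model_id: str, fxt_benchmark, fxt_check_benchmark_result: Callable):
        """Benchmark accuracy metrics."""
        result = fxt_benchmark.run(model_id=fxt_model_id, tags={"benchmark": "accuracy"})
        fxt_check_benchmark_result(
            result,
            key=("accuracy", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id),
            checks=[],  # per-metric thresholds to be defined, as in the other task modules
        )

    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
    def test_efficiency(self, fxt_model_id: str, fxt_benchmark, fxt_check_benchmark_result: Callable):
        """Benchmark train time per iter / infer time per image."""
        fxt_benchmark.track_resources = True
        result = fxt_benchmark.run(model_id=fxt_model_id, tags={"benchmark": "efficiency"})
        fxt_check_benchmark_result(
            result,
            key=("efficiency", fxt_benchmark.tags["task"], fxt_benchmark.tags["data_size"], fxt_model_id),
            checks=[],  # per-metric thresholds to be defined, as in the other task modules
        )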