diff --git a/tests/regression/regression_command.py b/tests/regression/regression_command.py
index 5ffd95d3053..121f2c33efb 100644
--- a/tests/regression/regression_command.py
+++ b/tests/regression/regression_command.py
@@ -130,6 +130,8 @@ def regression_openvino_testing(
         model_criteria = criteria[template.name] * (1.0 - reg_threshold)
 
     for k in trained_performance.keys():
+        if k == "avg_time_per_image":
+            continue
         result_dict[k] = round(exported_performance[k], 3)
         if exported_performance[k] < model_criteria:
             regression_result["passed"] = False
@@ -180,6 +182,8 @@ def regression_deployment_testing(
         modified_criteria = model_criteria - (model_criteria * reg_threshold)
 
     for k in exported_performance.keys():
+        if k == "avg_time_per_image":
+            continue
         if isinstance(criteria, dict) and template.name in criteria.keys():
             result_dict[k] = round(deployed_performance[k], 3)
             if deployed_performance[k] < modified_criteria:
diff --git a/tests/test_suite/run_test_command.py b/tests/test_suite/run_test_command.py
index 0ba7e21470f..c40d092bffd 100644
--- a/tests/test_suite/run_test_command.py
+++ b/tests/test_suite/run_test_command.py
@@ -10,7 +10,7 @@
 import sys
 import torch
 from pathlib import Path
-from typing import Dict
+from typing import Dict, Union
 
 import onnx
 import onnxruntime
@@ -349,11 +349,7 @@ def otx_eval_openvino_testing(
     with open(perf_path) as read_file:
         exported_performance = json.load(read_file)
 
-    for k in trained_performance.keys():
-        assert (
-            exported_performance[k] >= trained_performance[k]
-            or abs(trained_performance[k] - exported_performance[k]) / (trained_performance[k] + 1e-10) <= threshold
-        ), f"{trained_performance[k]=}, {exported_performance[k]=}"
+    compare_model_accuracy(exported_performance, trained_performance, threshold)
 
 
 def otx_demo_testing(template, root, otx_dir, args):
@@ -494,11 +490,7 @@ def otx_eval_deployment_testing(template, root, otx_dir, args, threshold=0.0):
     with open(f"{template_work_dir}/deployed_{template.model_template_id}/performance.json") as read_file:
         deployed_performance = json.load(read_file)
 
-    for k in exported_performance.keys():
-        assert (
-            deployed_performance[k] >= exported_performance[k]
-            or abs(exported_performance[k] - deployed_performance[k]) / (exported_performance[k] + 1e-10) <= threshold
-        ), f"{exported_performance[k]=}, {deployed_performance[k]=}"
+    compare_model_accuracy(deployed_performance, exported_performance, threshold)
 
 
 def otx_demo_deployment_testing(template, root, otx_dir, args):
@@ -745,11 +737,7 @@ def nncf_eval_testing(template, root, otx_dir, args, threshold=0.01):
     with open(f"{template_work_dir}/nncf_{template.model_template_id}/performance.json") as read_file:
         evaluated_performance = json.load(read_file)
 
-    for k in trained_performance.keys():
-        assert (
-            evaluated_performance[k] >= trained_performance[k]
-            or abs(trained_performance[k] - evaluated_performance[k]) / (trained_performance[k] + 1e-10) <= threshold
-        ), f"{trained_performance[k]=}, {evaluated_performance[k]=}"
+    compare_model_accuracy(evaluated_performance, trained_performance, threshold)
 
 
 def nncf_eval_openvino_testing(template, root, otx_dir, args):
@@ -1174,3 +1162,13 @@ def test_default_for_task(self):
            assert num_default_model == 1
 
     return _TestModelTemplates
+
+
+def compare_model_accuracy(performance_to_test: Dict, target_performance: Dict, threshold: Union[float, int]):
+    for k in target_performance.keys():
+        if k == "avg_time_per_image":
+            continue
+        assert (
+            performance_to_test[k] >= target_performance[k]
+            or abs(target_performance[k] - performance_to_test[k]) / (target_performance[k] + 1e-10) <= threshold
+        ), f"{target_performance[k]=}, {performance_to_test[k]=}"
diff --git a/tools/experiment.py b/tools/experiment.py
index 70e48bad9de..311b7641c2d 100644
--- a/tools/experiment.py
+++ b/tools/experiment.py
@@ -192,11 +192,15 @@ def get_exp_result(self):
     def _calculate_avg_std_per_iter(self):
         if self._iter_time_arr:
             self._exp_result.avg_iter_time = statistics.mean(self._iter_time_arr)
-            self._exp_result.std_iter_time = statistics.stdev(self._iter_time_arr)
+            self._exp_result.std_iter_time = (
+                statistics.stdev(self._iter_time_arr) if len(self._iter_time_arr) > 1 else 0
+            )
 
         if self._data_time_arr:
             self._exp_result.avg_data_time = statistics.mean(self._data_time_arr)
-            self._exp_result.std_data_time = statistics.stdev(self._data_time_arr)
+            self._exp_result.std_data_time = (
+                statistics.stdev(self._data_time_arr) if len(self._data_time_arr) > 1 else 0
+            )
 
     def _parse_eval_output(self, file_path: Path):
         # NOTE: It is assumed that performance.json has key named either score or avg_time_per_image
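
For reference, a minimal standalone sketch (not part of the patch; the metric names and values below are made up) of the relative-tolerance rule that the new compare_model_accuracy helper applies, and of the single-sample case that motivates guarding statistics.stdev, which raises StatisticsError for fewer than two data points:

import statistics

# Hypothetical performance dicts, like those loaded from performance.json.
trained = {"f-measure": 0.800, "avg_time_per_image": 0.015}
exported = {"f-measure": 0.792, "avg_time_per_image": 0.020}
threshold = 0.02

for k, target in trained.items():
    if k == "avg_time_per_image":  # timing is skipped; only accuracy metrics are compared
        continue
    drop = abs(target - exported[k]) / (target + 1e-10)  # relative difference
    assert exported[k] >= target or drop <= threshold  # a 1% drop passes the 2% tolerance

iter_times = [0.42]  # a run that recorded only one iteration
std = statistics.stdev(iter_times) if len(iter_times) > 1 else 0  # avoids StatisticsError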