[evaluation] Add non-gpt output metrics keys (#37993)
* Remove gpt_ prefix from metric outputs

* Remove GPT_ prefix from EvaluationMetrics

* Ensure outputs are duplicated

* Update tests

* Add new logic in base_prompty_eval

* Add test for legacy key

* Add notice about new key

* Add test for QA evaluator

* Fix typos in changelog
diondrapeck authored Oct 23, 2024
1 parent 91564ca commit 605f72e
Showing 14 changed files with 154 additions and 71 deletions.
7 changes: 7 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -24,6 +24,13 @@

### Other Changes
- Improved error messages for the `evaluate` API by enhancing the validation of input parameters. This update provides more detailed and actionable error descriptions.
- To align with our support of a diverse set of models, the following evaluators will now have a new key in their result output without the `gpt_` prefix. To maintain backwards compatibility, the old key with the `gpt_` prefix will still be present in the output; however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
- `CoherenceEvaluator`
- `RelevanceEvaluator`
- `FluencyEvaluator`
- `GroundednessEvaluator`
- `SimilarityEvaluator`
- `RetrievalEvaluator`

## 1.0.0b4 (2024-10-16)

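For code that consumes these evaluator results, the change is additive: the same score is exposed under both keys. A minimal consumer-side sketch, assuming only the result-dictionary shape documented in this changelog entry (the helper name and sample values are illustrative):

```python
def get_metric(result: dict, name: str) -> float:
    """Read a metric such as "coherence", tolerating the legacy "gpt_"-prefixed key."""
    # Prefer the new un-prefixed key; older SDK versions only emit the "gpt_" key.
    if name in result:
        return result[name]
    return result[f"gpt_{name}"]


# Result shaped like the outputs documented in this commit.
score = {"coherence": 4.0, "gpt_coherence": 4.0}
print(get_metric(score, "coherence"))  # 4.0 on both old and new SDK versions
```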
@@ -7,13 +7,13 @@
class EvaluationMetrics:
"""Metrics for model evaluation."""

GPT_GROUNDEDNESS = "gpt_groundedness"
GPT_RELEVANCE = "gpt_relevance"
GPT_COHERENCE = "gpt_coherence"
GPT_FLUENCY = "gpt_fluency"
GPT_SIMILARITY = "gpt_similarity"
GROUNDEDNESS = "groundedness"
RELEVANCE = "relevance"
COHERENCE = "coherence"
FLUENCY = "fluency"
SIMILARITY = "similarity"
F1_SCORE = "f1_score"
GPT_RETRIEVAL_SCORE = "gpt_retrieval_score"
RETRIEVAL_SCORE = "retrieval_score"
HATE_FAIRNESS = "hate_fairness"
HATE_UNFAIRNESS = "hate_unfairness"
VIOLENCE = "violence"
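The constants above now mirror the un-prefixed output keys. A small stand-in sketch of looking up metrics by constant rather than by hard-coded string; the class body is copied from the diff above rather than imported, since the module path is not shown in this view:

```python
class EvaluationMetrics:
    """Stand-in mirroring the renamed constants from the diff above."""
    GROUNDEDNESS = "groundedness"
    RELEVANCE = "relevance"
    COHERENCE = "coherence"
    FLUENCY = "fluency"
    SIMILARITY = "similarity"
    F1_SCORE = "f1_score"
    RETRIEVAL_SCORE = "retrieval_score"


row = {"groundedness": 5.0, "gpt_groundedness": 5.0, "f1_score": 0.62}
print(row[EvaluationMetrics.GROUNDEDNESS])  # 5.0 -- no "gpt_" prefix needed
```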
@@ -31,12 +31,17 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
.. code-block:: python
{
"gpt_coherence": 1.0
"coherence": 1.0,
"gpt_coherence": 1.0,
}
Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
"""

_PROMPTY_FILE = "coherence.prompty"
_RESULT_KEY = "gpt_coherence"
_RESULT_KEY = "coherence"

@override
def __init__(self, model_config):
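A hedged end-to-end sketch of the dual-key output for this evaluator. The model-configuration fields follow the package's documented Azure OpenAI configuration and the values are placeholders; none of this is taken from the diff itself:

```python
from azure.ai.evaluation import CoherenceEvaluator

# Placeholder Azure OpenAI settings -- substitute real values before running.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

coherence = CoherenceEvaluator(model_config)
result = coherence(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
)
# After this commit the same score appears under both keys.
print(result["coherence"], result["gpt_coherence"])
```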
@@ -32,9 +32,9 @@ class DerivedEvalInput(TypedDict, total=False):
foo: AggregateResult[float] = {
"evaluation_per_turn": {
"gpt_coherence": [1.0, 2.0, 3.0]
"coherence": [1.0, 2.0, 3.0]
},
"gpt_coherence": 2.0
"coherence": 2.0
}
"""

@@ -44,7 +44,7 @@ class DerivedEvalInput(TypedDict, total=False):
.. code-block:: python
foo: DoEvalResult[float] = {
"gpt_coherence": 2.0
"coherence": 2.0
}
"""

@@ -73,4 +73,4 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
match = re.search(r"\d", llm_output)
if match:
score = float(match.group())
return {self._result_key: float(score)}
return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
@@ -32,12 +32,17 @@ class FluencyEvaluator(PromptyEvaluatorBase):
.. code-block:: python
{
"gpt_fluency": 4.0
"fluency": 4.0,
"gpt_fluency": 4.0,
}
Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
"""

_PROMPTY_FILE = "fluency.prompty"
_RESULT_KEY = "gpt_fluency"
_RESULT_KEY = "fluency"

@override
def __init__(self, model_config):
@@ -32,12 +32,17 @@ class GroundednessEvaluator(PromptyEvaluatorBase):
.. code-block:: python
{
"gpt_groundedness": 5
"groundedness": 5,
"gpt_groundedness": 5,
}
Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
"""

_PROMPTY_FILE = "groundedness.prompty"
_RESULT_KEY = "gpt_groundedness"
_RESULT_KEY = "groundedness"

@override
def __init__(self, model_config):
@@ -41,6 +41,11 @@ class QAEvaluator:
.. code-block:: python
{
"groundedness": 3.5,
"relevance": 4.0,
"coherence": 1.5,
"fluency": 4.0,
"similarity": 3.0,
"gpt_groundedness": 3.5,
"gpt_relevance": 4.0,
"gpt_coherence": 1.5,
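Because `QAEvaluator` composes the individual evaluators, its combined output carries both key sets as shown in the docstring above. A hedged usage sketch; the configuration values are placeholders, and note that `f1_score` never carried the prefix:

```python
from azure.ai.evaluation import QAEvaluator

# Placeholder Azure OpenAI settings -- substitute real values before running.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

qa = QAEvaluator(model_config)
result = qa(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
    context="Tokyo is Japan's capital.",
    ground_truth="Tokyo is Japan's capital.",
)
# Each GPT-backed metric now appears twice; the un-prefixed key is the one to keep using.
for name in ("groundedness", "relevance", "coherence", "fluency", "similarity"):
    assert result[name] == result[f"gpt_{name}"]
print(result["f1_score"])
```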
@@ -34,13 +34,18 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
.. code-block:: python
{
"gpt_relevance": 3.0
"relevance": 3.0,
"gpt_relevance": 3.0,
}
Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
"""

# Constants must be defined within eval's directory to be save/loadable
_PROMPTY_FILE = "relevance.prompty"
_RESULT_KEY = "gpt_relevance"
_RESULT_KEY = "relevance"

@override
def __init__(self, model_config):
@@ -87,9 +87,11 @@ async def __call__(self, *, conversation, **kwargs):
per_turn_scores.append(math.nan)

return {
"retrieval": list_mean_nan_safe(per_turn_scores),
"gpt_retrieval": list_mean_nan_safe(per_turn_scores),
"evaluation_per_turn": {
"gpt_retrieval": per_turn_scores,
"retrieval": per_turn_scores,
},
}

@@ -128,10 +130,16 @@ class RetrievalEvaluator:
{
"gpt_retrieval": 3.0,
"retrieval": 3.0,
"evaluation_per_turn": {
"gpt_retrieval": [1.0, 2.0, 3.0],
"retrieval": [1.0, 2.0, 3.0]
}
}
Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
"""

def __init__(self, model_config):
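For conversation input, both the aggregate score and the per-turn lists are duplicated. A small consumer-side sketch over a result shaped like the docstring above (the values are illustrative, not produced by a real call):

```python
# Result shaped like the RetrievalEvaluator conversation output documented above.
result = {
    "retrieval": 3.0,
    "gpt_retrieval": 3.0,
    "evaluation_per_turn": {
        "retrieval": [1.0, 2.0, 3.0],
        "gpt_retrieval": [1.0, 2.0, 3.0],
    },
}

# Prefer the new keys; the legacy "gpt_" keys mirror them turn for turn.
for turn, score in enumerate(result["evaluation_per_turn"]["retrieval"], start=1):
    print(f"turn {turn}: retrieval={score}")
print(f"aggregate: {result['retrieval']}")
```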
@@ -75,7 +75,7 @@ async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
if match:
score = float(match.group())

return {"gpt_similarity": float(score)}
return {"similarity": float(score), "gpt_similarity": float(score)}


class SimilarityEvaluator:
@@ -101,8 +101,13 @@ class SimilarityEvaluator:
.. code-block:: python
{
"gpt_similarity": 3.0
"similarity": 3.0,
"gpt_similarity": 3.0,
}
Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
"""

def __init__(self, model_config):
@@ -88,13 +88,15 @@ def test_quality_evaluator_fluency(self, model_config, simple_conversation):
response="The capital of Japan is Tokyo.",
)
assert score is not None
assert score["gpt_fluency"] > 1.0
print(f"score: {score}")
assert score["fluency"] > 1.0

# Test conversation input
score2 = eval_fn(conversation=simple_conversation)
assert score2["gpt_fluency"] > 0
assert score2["evaluation_per_turn"]["gpt_fluency"][0] > 0
assert score2["evaluation_per_turn"]["gpt_fluency"][1] > 0
print(f"score2: {score2}")
assert score2["fluency"] > 0
assert score2["evaluation_per_turn"]["fluency"][0] > 0
assert score2["evaluation_per_turn"]["fluency"][1] > 0

def test_quality_evaluator_coherence(self, model_config, simple_conversation):
eval_fn = CoherenceEvaluator(model_config)
@@ -103,13 +105,15 @@ def test_quality_evaluator_coherence(self, model_config, simple_conversation):
response="The capital of Japan is Tokyo.",
)
assert score is not None
assert score["gpt_coherence"] > 1.0
print(f"score: {score}")
assert score["coherence"] > 1.0

# Test conversation input
score2 = eval_fn(conversation=simple_conversation)
assert score2["gpt_coherence"] > 0
assert score2["evaluation_per_turn"]["gpt_coherence"][0] > 0
assert score2["evaluation_per_turn"]["gpt_coherence"][1] > 0
print(f"score2: {score2}")
assert score2["coherence"] > 0
assert score2["evaluation_per_turn"]["coherence"][0] > 0
assert score2["evaluation_per_turn"]["coherence"][1] > 0

def test_quality_evaluator_similarity(self, model_config):
eval_fn = SimilarityEvaluator(model_config)
@@ -119,7 +123,8 @@ def test_quality_evaluator_similarity(self, model_config):
ground_truth="Tokyo is Japan's capital.",
)
assert score is not None
assert score["gpt_similarity"] > 1.0
print(f"score: {score}")
assert score["similarity"] > 1.0

def test_quality_evaluator_groundedness(self, model_config, simple_conversation):
eval_fn = GroundednessEvaluator(model_config)
@@ -128,13 +133,15 @@ def test_quality_evaluator_groundedness(self, model_config, simple_conversation):
context="Tokyo is Japan's capital.",
)
assert score is not None
assert score["gpt_groundedness"] > 1.0
print(f"score: {score}")
assert score["groundedness"] > 1.0

# Test conversation input
score2 = eval_fn(conversation=simple_conversation)
assert score2["gpt_groundedness"] > 0
assert score2["evaluation_per_turn"]["gpt_groundedness"][0] > 0
assert score2["evaluation_per_turn"]["gpt_groundedness"][1] > 0
print(f"score2: {score2}")
assert score2["groundedness"] > 0
assert score2["evaluation_per_turn"]["groundedness"][0] > 0
assert score2["evaluation_per_turn"]["groundedness"][1] > 0

def test_quality_evaluator_relevance(self, model_config, simple_conversation):
eval_fn = RelevanceEvaluator(model_config)
@@ -144,13 +151,15 @@ def test_quality_evaluator_relevance(self, model_config, simple_conversation):
context="Tokyo is Japan's capital.",
)
assert score is not None
assert score["gpt_relevance"] > 1.0
print(f"score: {score}")
assert score["relevance"] > 1.0

# Test conversation input
score2 = eval_fn(conversation=simple_conversation)
assert score2["gpt_relevance"] > 0
assert score2["evaluation_per_turn"]["gpt_relevance"][0] > 0
assert score2["evaluation_per_turn"]["gpt_relevance"][1] > 0
print(f"score2: {score2}")
assert score2["relevance"] > 0
assert score2["evaluation_per_turn"]["relevance"][0] > 0
assert score2["evaluation_per_turn"]["relevance"][1] > 0

def test_quality_evaluator_f1_score(self):
eval_fn = F1ScoreEvaluator()
@@ -168,7 +177,7 @@ def test_quality_evaluator_prompt_based_with_dict_input(self, model_config):
response={"bar": 2},
)
assert score is not None
assert score["gpt_fluency"] > 0.0
assert score["fluency"] > 0.0

def test_content_safety_evaluator_violence(self, project_scope, azure_cred, simple_conversation):
eval_fn = ViolenceEvaluator(azure_cred, project_scope)
@@ -298,11 +307,11 @@ def test_composite_evaluator_qa(self, model_config, parallel):
)

assert score is not None
assert score["gpt_groundedness"] > 0.0
assert score["gpt_relevance"] > 0.0
assert score["gpt_coherence"] > 0.0
assert score["gpt_fluency"] > 0.0
assert score["gpt_similarity"] > 0.0
assert score["groundedness"] > 0.0
assert score["relevance"] > 0.0
assert score["coherence"] > 0.0
assert score["fluency"] > 0.0
assert score["similarity"] > 0.0
assert score["f1_score"] > 0.0

@pytest.mark.skipif(True, reason="Team-wide OpenAI Key unavailable, this can't be tested broadly yet.")
@@ -318,23 +327,23 @@ def test_composite_evaluator_qa_with_openai_config(self, non_azure_openai_model_config):
)

assert score is not None
assert score["gpt_groundedness"] > 0.0
assert score["gpt_relevance"] > 0.0
assert score["gpt_coherence"] > 0.0
assert score["gpt_fluency"] > 0.0
assert score["gpt_similarity"] > 0.0
assert score["groundedness"] == score["gpt_groundedness"] > 0.0
assert score["relevance"] == score["gpt_relevance"] > 0.0
assert score["coherence"] == score["gpt_coherence"] > 0.0
assert score["fluency"] == score["gpt_fluency"] > 0.0
assert score["similarity"] == score["gpt_similarity"] > 0.0
assert score["f1_score"] > 0.0

def test_composite_evaluator_qa_for_nans(self, model_config):
qa_eval = QAEvaluator(model_config)
# Test Q/A below would cause NaNs in the evaluation metrics before the fix.
score = qa_eval(query="This's the color?", response="Black", ground_truth="gray", context="gray")

assert not math.isnan(score["gpt_groundedness"])
assert not math.isnan(score["gpt_relevance"])
assert not math.isnan(score["gpt_coherence"])
assert not math.isnan(score["gpt_fluency"])
assert not math.isnan(score["gpt_similarity"])
assert not math.isnan(score["groundedness"])
assert not math.isnan(score["relevance"])
assert not math.isnan(score["coherence"])
assert not math.isnan(score["fluency"])
assert not math.isnan(score["similarity"])

def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, parallel=False)
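The commit message also mentions adding a test for the legacy key. A hedged sketch of what such a parity check could look like as a parametrized test; the names and fixture-free form here are illustrative, not the repository's actual test code:

```python
import math

import pytest

# Every renamed metric should mirror its legacy "gpt_"-prefixed twin.
LEGACY_PAIRS = [
    ("coherence", "gpt_coherence"),
    ("relevance", "gpt_relevance"),
    ("fluency", "gpt_fluency"),
    ("groundedness", "gpt_groundedness"),
    ("similarity", "gpt_similarity"),
    ("retrieval", "gpt_retrieval"),
]


@pytest.mark.parametrize("new_key,legacy_key", LEGACY_PAIRS)
def test_legacy_key_mirrors_new_key(new_key, legacy_key):
    # Stand-in result; a real test would obtain this dict from the evaluator call.
    score = {new_key: 3.0, legacy_key: 3.0}
    assert not math.isnan(score[new_key])
    assert score[new_key] == score[legacy_key]
```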