[evaluation] Add non-gpt output metrics keys (#37993)
* Remove gpt_ prefix from metric outputs

* Remove GPT_ prefix from EvaluationMetrics

* Ensure outputs are duplicated

* Update tests

* Add new logic in base_prompty_eval

* Add test for legacy key

* Add notice about new key

* Add test for QA evaluator

* Fix typos in changelog
diondrapeck authored Oct 23, 2024
1 parent 91564ca commit 605f72e
Showing 14 changed files with 154 additions and 71 deletions.
7 changes: 7 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -24,6 +24,13 @@

### Other Changes
- Improved error messages for the `evaluate` API by enhancing the validation of input parameters. This update provides more detailed and actionable error descriptions.
- To align with our support of a diverse set of models, the following evaluators will now have a new key in their result output without the `gpt_` prefix. To maintain backwards compatibility, the old key with the `gpt_` prefix will still be present in the output; however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
- `CoherenceEvaluator`
- `RelevanceEvaluator`
- `FluencyEvaluator`
- `GroundednessEvaluator`
- `SimilarityEvaluator`
- `RetrievalEvaluator`

## 1.0.0b4 (2024-10-16)

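For code that consumes these evaluator results, the change is additive: the same score is exposed under both keys. A minimal consumer-side sketch, assuming only the result-dictionary shape documented in this changelog entry (the helper name and sample values are illustrative):

```python
def get_metric(result: dict, name: str) -> float:
    """Read a metric such as "coherence", tolerating the legacy "gpt_"-prefixed key."""
    # Prefer the new un-prefixed key; older SDK versions only emit the "gpt_" key.
    if name in result:
        return result[name]
    return result[f"gpt_{name}"]


# Result shaped like the outputs documented in this commit.
score = {"coherence": 4.0, "gpt_coherence": 4.0}
print(get_metric(score, "coherence"))  # 4.0 on both old and new SDK versions
```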
@@ -7,13 +7,13 @@
class EvaluationMetrics:
"""Metrics for model evaluation."""

GPT_GROUNDEDNESS = "gpt_groundedness"
GPT_RELEVANCE = "gpt_relevance"
GPT_COHERENCE = "gpt_coherence"
GPT_FLUENCY = "gpt_fluency"
GPT_SIMILARITY = "gpt_similarity"
GROUNDEDNESS = "groundedness"
RELEVANCE = "relevance"
COHERENCE = "coherence"
FLUENCY = "fluency"
SIMILARITY = "similarity"
F1_SCORE = "f1_score"
GPT_RETRIEVAL_SCORE = "gpt_retrieval_score"
RETRIEVAL_SCORE = "retrieval_score"
HATE_FAIRNESS = "hate_fairness"
HATE_UNFAIRNESS = "hate_unfairness"
VIOLENCE = "violence"
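The constants above now mirror the un-prefixed output keys. A small stand-in sketch of looking up metrics by constant rather than by hard-coded string; the class body is copied from the diff above rather than imported, since the module path is not shown in this view:

```python
class EvaluationMetrics:
    """Stand-in mirroring the renamed constants from the diff above."""
    GROUNDEDNESS = "groundedness"
    RELEVANCE = "relevance"
    COHERENCE = "coherence"
    FLUENCY = "fluency"
    SIMILARITY = "similarity"
    F1_SCORE = "f1_score"
    RETRIEVAL_SCORE = "retrieval_score"


row = {"groundedness": 5.0, "gpt_groundedness": 5.0, "f1_score": 0.62}
print(row[EvaluationMetrics.GROUNDEDNESS])  # 5.0 -- no "gpt_" prefix needed
```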
@@ -31,12 +31,17 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
.. code-block:: python
{
"gpt_coherence": 1.0
"coherence": 1.0,
"gpt_coherence": 1.0,
}
Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
"""

_PROMPTY_FILE = "coherence.prompty"
_RESULT_KEY = "gpt_coherence"
_RESULT_KEY = "coherence"

@override
def __init__(self, model_config):
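A hedged end-to-end sketch of the dual-key output for this evaluator. The model-configuration fields follow the package's documented Azure OpenAI configuration and the values are placeholders; none of this is taken from the diff itself:

```python
from azure.ai.evaluation import CoherenceEvaluator

# Placeholder Azure OpenAI settings -- substitute real values before running.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

coherence = CoherenceEvaluator(model_config)
result = coherence(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
)
# After this commit the same score appears under both keys.
print(result["coherence"], result["gpt_coherence"])
```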
@@ -32,9 +32,9 @@ class DerivedEvalInput(TypedDict, total=False):
foo: AggregateResult[float] = {
"evaluation_per_turn": {
"gpt_coherence": [1.0, 2.0, 3.0]
"coherence": [1.0, 2.0, 3.0]
},
"gpt_coherence": 2.0
"coherence": 2.0
}
"""

@@ -44,7 +44,7 @@ class DerivedEvalInput(TypedDict, total=False):
.. code-block:: python
foo: DoEvalResult[float] = {
"gpt_coherence": 2.0
"coherence": 2.0
}
"""

@@ -73,4 +73,4 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
match = re.search(r"\d", llm_output)
if match:
score = float(match.group())
return {self._result_key: float(score)}
return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
@@ -32,12 +32,17 @@ class FluencyEvaluator(PromptyEvaluatorBase):
.. code-block:: python
{
"gpt_fluency": 4.0
"fluency": 4.0,
"gpt_fluency": 4.0,
}
Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
"""

_PROMPTY_FILE = "fluency.prompty"
_RESULT_KEY = "gpt_fluency"
_RESULT_KEY = "fluency"

@override
def __init__(self, model_config):
@@ -32,12 +32,17 @@ class GroundednessEvaluator(PromptyEvaluatorBase):
.. code-block:: python
{
"gpt_groundedness": 5
"groundedness": 5,
"gpt_groundedness": 5,
}
Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
"""

_PROMPTY_FILE = "groundedness.prompty"
_RESULT_KEY = "gpt_groundedness"
_RESULT_KEY = "groundedness"

@override
def __init__(self, model_config):
@@ -41,6 +41,11 @@ class QAEvaluator:
.. code-block:: python
{
"groundedness": 3.5,
"relevance": 4.0,
"coherence": 1.5,
"fluency": 4.0,
"similarity": 3.0,
"gpt_groundedness": 3.5,
"gpt_relevance": 4.0,
"gpt_coherence": 1.5,
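Because `QAEvaluator` composes the individual evaluators, its combined output carries both key sets as shown in the docstring above. A hedged usage sketch; the configuration values are placeholders, and note that `f1_score` never carried the prefix:

```python
from azure.ai.evaluation import QAEvaluator

# Placeholder Azure OpenAI settings -- substitute real values before running.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

qa = QAEvaluator(model_config)
result = qa(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
    context="Tokyo is Japan's capital.",
    ground_truth="Tokyo is Japan's capital.",
)
# Each GPT-backed metric now appears twice; the un-prefixed key is the one to keep using.
for name in ("groundedness", "relevance", "coherence", "fluency", "similarity"):
    assert result[name] == result[f"gpt_{name}"]
print(result["f1_score"])
```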
@@ -34,13 +34,18 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
.. code-block:: python
{
"gpt_relevance": 3.0
"relevance": 3.0,
"gpt_relevance": 3.0,
}
Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
"""

# Constants must be defined within eval's directory to be save/loadable
_PROMPTY_FILE = "relevance.prompty"
_RESULT_KEY = "gpt_relevance"
_RESULT_KEY = "relevance"

@override
def __init__(self, model_config):
@@ -87,9 +87,11 @@ async def __call__(self, *, conversation, **kwargs):
per_turn_scores.append(math.nan)

return {
"retrieval": list_mean_nan_safe(per_turn_scores),
"gpt_retrieval": list_mean_nan_safe(per_turn_scores),
"evaluation_per_turn": {
"gpt_retrieval": per_turn_scores,
"retrieval": per_turn_scores,
},
}

@@ -128,10 +130,16 @@ class RetrievalEvaluator:
{
"gpt_retrieval": 3.0,
"retrieval": 3.0,
"evaluation_per_turn": {
"gpt_retrieval": [1.0, 2.0, 3.0],
"retrieval": [1.0, 2.0, 3.0]
}
}
Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
"""

def __init__(self, model_config):
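For conversation input, both the aggregate score and the per-turn lists are duplicated. A small consumer-side sketch over a result shaped like the docstring above (the values are illustrative, not produced by a real call):

```python
# Result shaped like the RetrievalEvaluator conversation output documented above.
result = {
    "retrieval": 3.0,
    "gpt_retrieval": 3.0,
    "evaluation_per_turn": {
        "retrieval": [1.0, 2.0, 3.0],
        "gpt_retrieval": [1.0, 2.0, 3.0],
    },
}

# Prefer the new keys; the legacy "gpt_" keys mirror them turn for turn.
for turn, score in enumerate(result["evaluation_per_turn"]["retrieval"], start=1):
    print(f"turn {turn}: retrieval={score}")
print(f"aggregate: {result['retrieval']}")
```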
@@ -75,7 +75,7 @@ async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
if match:
score = float(match.group())

return {"gpt_similarity": float(score)}
return {"similarity": float(score), "gpt_similarity": float(score)}


class SimilarityEvaluator:
@@ -101,8 +101,13 @@ class SimilarityEvaluator:
.. code-block:: python
{
"gpt_similarity": 3.0
"similarity": 3.0,
"gpt_similarity": 3.0,
}
Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
"""

def __init__(self, model_config):
@@ -88,13 +88,15 @@ def test_quality_evaluator_fluency(self, model_config, simple_conversation):
response="The capital of Japan is Tokyo.",
)
assert score is not None
assert score["gpt_fluency"] > 1.0
print(f"score: {score}")
assert score["fluency"] > 1.0

# Test conversation input
score2 = eval_fn(conversation=simple_conversation)
assert score2["gpt_fluency"] > 0
assert score2["evaluation_per_turn"]["gpt_fluency"][0] > 0
assert score2["evaluation_per_turn"]["gpt_fluency"][1] > 0
print(f"score2: {score2}")
assert score2["fluency"] > 0
assert score2["evaluation_per_turn"]["fluency"][0] > 0
assert score2["evaluation_per_turn"]["fluency"][1] > 0

def test_quality_evaluator_coherence(self, model_config, simple_conversation):
eval_fn = CoherenceEvaluator(model_config)
@@ -103,13 +105,15 @@ def test_quality_evaluator_coherence(self, model_config, simple_conversation):
response="The capital of Japan is Tokyo.",
)
assert score is not None
assert score["gpt_coherence"] > 1.0
print(f"score: {score}")
assert score["coherence"] > 1.0

# Test conversation input
score2 = eval_fn(conversation=simple_conversation)
assert score2["gpt_coherence"] > 0
assert score2["evaluation_per_turn"]["gpt_coherence"][0] > 0
assert score2["evaluation_per_turn"]["gpt_coherence"][1] > 0
print(f"score2: {score2}")
assert score2["coherence"] > 0
assert score2["evaluation_per_turn"]["coherence"][0] > 0
assert score2["evaluation_per_turn"]["coherence"][1] > 0

def test_quality_evaluator_similarity(self, model_config):
eval_fn = SimilarityEvaluator(model_config)
@@ -119,7 +123,8 @@ def test_quality_evaluator_similarity(self, model_config):
ground_truth="Tokyo is Japan's capital.",
)
assert score is not None
assert score["gpt_similarity"] > 1.0
print(f"score: {score}")
assert score["similarity"] > 1.0

def test_quality_evaluator_groundedness(self, model_config, simple_conversation):
eval_fn = GroundednessEvaluator(model_config)
@@ -128,13 +133,15 @@ def test_quality_evaluator_groundedness(self, model_config, simple_conversation):
context="Tokyo is Japan's capital.",
)
assert score is not None
assert score["gpt_groundedness"] > 1.0
print(f"score: {score}")
assert score["groundedness"] > 1.0

# Test conversation input
score2 = eval_fn(conversation=simple_conversation)
assert score2["gpt_groundedness"] > 0
assert score2["evaluation_per_turn"]["gpt_groundedness"][0] > 0
assert score2["evaluation_per_turn"]["gpt_groundedness"][1] > 0
print(f"score2: {score2}")
assert score2["groundedness"] > 0
assert score2["evaluation_per_turn"]["groundedness"][0] > 0
assert score2["evaluation_per_turn"]["groundedness"][1] > 0

def test_quality_evaluator_relevance(self, model_config, simple_conversation):
eval_fn = RelevanceEvaluator(model_config)
@@ -144,13 +151,15 @@ def test_quality_evaluator_relevance(self, model_config, simple_conversation):
context="Tokyo is Japan's capital.",
)
assert score is not None
assert score["gpt_relevance"] > 1.0
print(f"score: {score}")
assert score["relevance"] > 1.0

# Test conversation input
score2 = eval_fn(conversation=simple_conversation)
assert score2["gpt_relevance"] > 0
assert score2["evaluation_per_turn"]["gpt_relevance"][0] > 0
assert score2["evaluation_per_turn"]["gpt_relevance"][1] > 0
print(f"score2: {score2}")
assert score2["relevance"] > 0
assert score2["evaluation_per_turn"]["relevance"][0] > 0
assert score2["evaluation_per_turn"]["relevance"][1] > 0

def test_quality_evaluator_f1_score(self):
eval_fn = F1ScoreEvaluator()
@@ -168,7 +177,7 @@ def test_quality_evaluator_prompt_based_with_dict_input(self, model_config):
response={"bar": 2},
)
assert score is not None
assert score["gpt_fluency"] > 0.0
assert score["fluency"] > 0.0

def test_content_safety_evaluator_violence(self, project_scope, azure_cred, simple_conversation):
eval_fn = ViolenceEvaluator(azure_cred, project_scope)
@@ -298,11 +307,11 @@ def test_composite_evaluator_qa(self, model_config, parallel):
)

assert score is not None
assert score["gpt_groundedness"] > 0.0
assert score["gpt_relevance"] > 0.0
assert score["gpt_coherence"] > 0.0
assert score["gpt_fluency"] > 0.0
assert score["gpt_similarity"] > 0.0
assert score["groundedness"] > 0.0
assert score["relevance"] > 0.0
assert score["coherence"] > 0.0
assert score["fluency"] > 0.0
assert score["similarity"] > 0.0
assert score["f1_score"] > 0.0

@pytest.mark.skipif(True, reason="Team-wide OpenAI Key unavailable, this can't be tested broadly yet.")
@@ -318,23 +327,23 @@ def test_composite_evaluator_qa_with_openai_config(self, non_azure_openai_model_config):
)

assert score is not None
assert score["gpt_groundedness"] > 0.0
assert score["gpt_relevance"] > 0.0
assert score["gpt_coherence"] > 0.0
assert score["gpt_fluency"] > 0.0
assert score["gpt_similarity"] > 0.0
assert score["groundedness"] == score["gpt_groundedness"] > 0.0
assert score["relevance"] == score["gpt_relevance"] > 0.0
assert score["coherence"] == score["gpt_coherence"] > 0.0
assert score["fluency"] == score["gpt_fluency"] > 0.0
assert score["similarity"] == score["gpt_similarity"] > 0.0
assert score["f1_score"] > 0.0

def test_composite_evaluator_qa_for_nans(self, model_config):
qa_eval = QAEvaluator(model_config)
# Test Q/A below would cause NaNs in the evaluation metrics before the fix.
score = qa_eval(query="This's the color?", response="Black", ground_truth="gray", context="gray")

assert not math.isnan(score["gpt_groundedness"])
assert not math.isnan(score["gpt_relevance"])
assert not math.isnan(score["gpt_coherence"])
assert not math.isnan(score["gpt_fluency"])
assert not math.isnan(score["gpt_similarity"])
assert not math.isnan(score["groundedness"])
assert not math.isnan(score["relevance"])
assert not math.isnan(score["coherence"])
assert not math.isnan(score["fluency"])
assert not math.isnan(score["similarity"])

def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
safety_eval = ContentSafetyEvaluator(azure_cred, project_scope, parallel=False)
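The commit message also mentions adding a test for the legacy key. A hedged sketch of what such a parity check could look like as a parametrized test; the names and fixture-free form here are illustrative, not the repository's actual test code:

```python
import math

import pytest

# Every renamed metric should mirror its legacy "gpt_"-prefixed twin.
LEGACY_PAIRS = [
    ("coherence", "gpt_coherence"),
    ("relevance", "gpt_relevance"),
    ("fluency", "gpt_fluency"),
    ("groundedness", "gpt_groundedness"),
    ("similarity", "gpt_similarity"),
    ("retrieval", "gpt_retrieval"),
]


@pytest.mark.parametrize("new_key,legacy_key", LEGACY_PAIRS)
def test_legacy_key_mirrors_new_key(new_key, legacy_key):
    # Stand-in result; a real test would obtain this dict from the evaluator call.
    score = {new_key: 3.0, legacy_key: 3.0}
    assert not math.isnan(score[new_key])
    assert score[new_key] == score[legacy_key]
```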