Jacky/twi 332 add overall metric #23

Merged · 12 commits · Aug 21, 2023
65 changes: 65 additions & 0 deletions deepeval/metrics/alert_score.py
@@ -0,0 +1,65 @@
"""Alert Score
"""
from .metric import Metric
from .entailment_metric import EntailmentScoreMetric

# from .answer_relevancy import AnswerRelevancy


class AlertScore(Metric):
def __init__(self, success_threshold: float = 0.5):
self.success_threshold = success_threshold
self.entailment_metric = EntailmentScoreMetric()
# self.answer_relevancy = AnswerRelevancy()

def __call__(self, query: str, generated_text: str, expected_output: str, context: str):
score = self.measure(query, generated_text, expected_output, context)
return score

def measure(
self, query: str, generated_text: str, expected_output: str, context: str
) -> float:

entailment_score = self.entailment_metric.measure(
context,
generated_text,
)

answer_expected_score = self.entailment_metric.measure(
generated_text,
expected_output,
)

# The answer relevancy metric is currently too strict: it effectively
# requires the answer to restate the question, so it is disabled for now.
# answer_relevancy_score = self.answer_relevancy.measure(
# query=query, answer=generated_text
# )
alert_score = min(entailment_score, answer_expected_score)
self.success = alert_score > self.success_threshold
return alert_score

def is_successful(self) -> bool:
return self.success

@property
def __name__(self):
return "Alert Score"


def assert_alert_score(
query: str,
generated_text: str,
expected_output: str,
context: str,
success_threshold: float = 0.5,
):
"""Create alert score."""
metric = AlertScore(success_threshold=success_threshold)
score = metric.measure(
query=query,
generated_text=generated_text,
expected_output=expected_output,
context=context,
)
assert metric.is_successful(), f"Found issue - Alert score: {score}"
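For reference, a minimal usage sketch of the new `assert_alert_score` helper; the query, texts, and threshold below are illustrative, and the call assumes the underlying entailment model can be loaded:

```python
from deepeval.metrics.alert_score import assert_alert_score

# Illustrative inputs; raises AssertionError if the alert score is at or below the threshold.
assert_alert_score(
    query="What is the capital of France?",
    generated_text="The capital of France is Paris.",
    expected_output="Paris is the capital of France.",
    context="Paris is the capital and most populous city of France.",
    success_threshold=0.5,
)
```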
6 changes: 6 additions & 0 deletions deepeval/metrics/answer_relevancy.py
@@ -1,5 +1,10 @@
import asyncio
from .metric import Metric
import numpy as np


def sigmoid(x):
return 1 / (1 + np.exp(-x))


class AnswerRelevancy(Metric):
@@ -23,6 +28,7 @@ def __call__(self, query: str, answer: str):

def measure(self, query, answer: str) -> float:
score = self.encoder.predict([query, answer])
score = sigmoid(score)
self.success = score > self.success_threshold
return score

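For intuition, a small sketch of what the new `sigmoid` helper does: the cross-encoder relevancy score is an unbounded logit, and the sigmoid maps it into (0, 1) so it can be compared against `success_threshold` (the example inputs below are made up):

```python
from deepeval.metrics.answer_relevancy import sigmoid

# Unbounded logits are squashed into (0, 1).
for raw_score in (-4.0, 0.0, 3.2):
    print(raw_score, "->", round(float(sigmoid(raw_score)), 3))  # 0.018, 0.5, 0.961
```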
10 changes: 1 addition & 9 deletions deepeval/metrics/bertscore_metric.py
@@ -1,14 +1,6 @@
from .metric import Metric
from typing import Optional
import numpy as np


def cosine_similarity(vector_a, vector_b):
dot_product = np.dot(vector_a, vector_b)
norm_a = np.linalg.norm(vector_a)
norm_b = np.linalg.norm(vector_b)
similarity = dot_product / (norm_a * norm_b)
return similarity
from ..utils import cosine_similarity


class BertScoreMetric(Metric):
54 changes: 54 additions & 0 deletions deepeval/metrics/bleu_metric.py
@@ -0,0 +1,54 @@
# Sample Metric for BLEU
import nltk
from nltk.util import ngrams
import numpy as np
from .metric import Metric


class BLEUMetric(Metric):
def __init__(self, success_threshold: float = 0.5):
self.success_threshold = success_threshold

def compute_bleu(
self, candidate: str, reference: str, weights=(0.25, 0.25, 0.25, 0.25)
):
"""
Compute BLEU score for a candidate sentence given a reference sentence.

:param candidate: The candidate sentence as a string.
:param reference: The reference sentence as a string.
:param weights: Weights for the n-gram precisions, default is uniform (0.25 for each).
:return: BLEU score.
"""
# Tokenize both sentences on whitespace
candidate = candidate.split()
reference = reference.split()

precisions = []

for i in range(1, 5):  # Modified n-gram precision for 1- to 4-grams
candidate_ngrams = ngrams(candidate, i)
candidate_ngram_freq = nltk.FreqDist(candidate_ngrams)

reference_ngrams = ngrams(reference, i)
reference_ngram_freq = nltk.FreqDist(reference_ngrams)

# Clip each candidate n-gram count by its count in the reference
clipped_counts = {
ngram: min(candidate_ngram_freq[ngram], reference_ngram_freq[ngram])
for ngram in candidate_ngram_freq
}
total_ngrams = sum(candidate_ngram_freq.values())
precision = (
sum(clipped_counts.values()) / total_ngrams if total_ngrams > 0 else 0.0
)
precisions.append(precision)

# Any zero precision makes the geometric mean zero; avoid log(0)
if min(precisions) == 0.0:
return 0.0

# Simplified brevity penalty: penalise candidates shorter than the reference
brevity_penalty = min(1, len(candidate) / len(reference))

# Weighted geometric mean of the n-gram precisions
bleu = brevity_penalty * np.exp(
sum(w * np.log(p) for w, p in zip(weights, precisions))
)

return bleu
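A small usage sketch for the sample BLEU metric (sentences are illustrative; assumes `nltk` and `numpy` are installed):

```python
from deepeval.metrics.bleu_metric import BLEUMetric

metric = BLEUMetric(success_threshold=0.5)
score = metric.compute_bleu(
    candidate="the cat sat on the mat",
    reference="the cat sat on the mat today",
)
print(score)  # n-gram precisions are perfect here, so the score equals the brevity penalty (6/7 ≈ 0.86)
```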
38 changes: 38 additions & 0 deletions deepeval/metrics/conceptual_similarity.py
@@ -0,0 +1,38 @@
"""Asserting conceptual similarity
"""
from typing import Optional
from .metric import Metric
from ..utils import cosine_similarity


class ConceptualSimilarityMetric(Metric):
"""basic implementation of BertScoreMetric"""

def __init__(
self,
model_name: Optional[str] = "sentence-transformers/all-mpnet-base-v2",
success_threshold: float = 0.7,
):
from sentence_transformers import SentenceTransformer

self.model_name = model_name
self.model = SentenceTransformer(self.model_name).eval()
self.success_threshold = success_threshold

def _vectorize(self, text_a: str, text_b: str):
vectors = self.model.encode([text_a, text_b])
return vectors

def measure(self, a: str, b: str):
vectors = self._vectorize(a, b)
self.score = cosine_similarity(vectors[0], vectors[1])
return self.score

def is_successful(self) -> bool:
return self.score >= self.success_threshold


def assert_conceptual_similarity(text_1: str, text_2: str, success_threshold=0.3):
metric = ConceptualSimilarityMetric(success_threshold=success_threshold)
score = metric.measure(text_1, text_2)
assert metric.is_successful(), f"Metric is not conceptually similar - got {score}"
4 changes: 4 additions & 0 deletions deepeval/metrics/entailment_metric.py
@@ -24,3 +24,7 @@ def measure(self, a: str, b: str):

def is_successful(self) -> bool:
return self.success

@property
def __name__(self):
return "Entailment"
46 changes: 46 additions & 0 deletions deepeval/metrics/overall_score.py
@@ -0,0 +1,46 @@
"""Alert Score
"""
from .metric import Metric
from .entailment_metric import EntailmentScoreMetric
from .answer_relevancy import AnswerRelevancy


class OverallScore(Metric):
def __init__(self, success_threshold: float = 0.5):
self.success_threshold = success_threshold
self.entailment_metric = EntailmentScoreMetric()
self.answer_relevancy = AnswerRelevancy()

def __call__(self, generated_output: str, expected_output: str, context: str):
score = self.measure(generated_output, expected_output, context)
return score

def measure(
self, generated_output: str, expected_output: str, context: str
) -> float:
entailment_score = self.entailment_metric.measure(
generated_output,
context,
)
answer_relevancy_score = self.answer_relevancy.measure(
generated_output, expected_output
)
overall_score = 0.5 * entailment_score + 0.5 * answer_relevancy_score
self.success = overall_score > self.success_threshold
return overall_score

def is_successful(self) -> bool:
return self.success

@property
def __name__(self):
return "Overall Score"


def assert_overall_score(
generated_output: str,
expected_output: str,
context: str,
success_threshold: float = 0.5,
):
metric = OverallScore(success_threshold=success_threshold)
score = metric.measure(
generated_output=generated_output,
expected_output=expected_output,
context=context,
)
assert metric.is_successful(), f"Overall score too low - got {score}"
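And a minimal usage sketch for the new overall score assertion (values are illustrative; assumes the `context` argument added above and that the entailment and relevancy models can be loaded):

```python
from deepeval.metrics.overall_score import assert_overall_score

assert_overall_score(
    generated_output="The 2018 FIFA World Cup was won by France.",
    expected_output="France won the 2018 FIFA World Cup.",
    context="France defeated Croatia 4-2 in the 2018 FIFA World Cup final.",
    success_threshold=0.5,
)
```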
8 changes: 8 additions & 0 deletions deepeval/utils.py
@@ -5,3 +5,11 @@
def softmax(x):
e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
return e_x / e_x.sum(axis=1, keepdims=True)


def cosine_similarity(vector_a, vector_b):
dot_product = np.dot(vector_a, vector_b)
norm_a = np.linalg.norm(vector_a)
norm_b = np.linalg.norm(vector_b)
similarity = dot_product / (norm_a * norm_b)
return similarity
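A quick sanity check of the shared `cosine_similarity` helper (the vectors are made up):

```python
import numpy as np

from deepeval.utils import cosine_similarity

print(cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 0.0])))  # 1.0, same direction
print(cosine_similarity(np.array([1.0, 0.0]), np.array([0.0, 1.0])))  # 0.0, orthogonal vectors
```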
35 changes: 35 additions & 0 deletions docs/docs/measuring_llm_performance/conceptual_similarity.md
@@ -0,0 +1,35 @@
# Conceptual Similarity

Asserting conceptual similarity allows developers to check that the expected answer and the generated answer mention the same concepts, even if the overall message varies quite a bit.

## What is it?

- Neural network embeddings are designed to represent the semantic meaning of words or concepts in a continuous vector space. These embeddings aim to capture the relationships and similarities between words or concepts based on their intrinsic properties.
- Techniques like word2vec, GloVe, and BERT embeddings are trained to learn the meaning and relationships between words or concepts from large text corpora. They excel at capturing the underlying semantics and conceptual associations between words.
- These embeddings are often used in various natural language processing (NLP) tasks like word similarity, text classification, and sentiment analysis, where understanding the meaning and similarity of words or concepts is crucial. The sketch below applies this idea to two example sentences.
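Under the hood, the metric boils down to sentence embeddings plus cosine similarity. A minimal sketch of that idea (assumes `sentence-transformers` is installed; the model name matches the metric's default, and `cosine_similarity` is the helper in `deepeval.utils`):

```python
from sentence_transformers import SentenceTransformer

from deepeval.utils import cosine_similarity

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
vectors = model.encode(["Python is a programming language.", "Python is a snake."])
print(cosine_similarity(vectors[0], vectors[1]))  # higher means conceptually closer
```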

## Assert Conceptual Similarity

```python
from deepeval.metrics.conceptual_similarity import assert_conceptual_similarity

assert_conceptual_similarity(
    text_1="python is a programming language",
    text_2="Python is a snake.",
    success_threshold=0.3,
)
```

## Conceptual Similarity As A Metric

```python
from deepeval.metrics.conceptual_similarity import ConceptualSimilarityMetric

metric = ConceptualSimilarityMetric(success_threshold=0.3)
score = metric.measure("Python is a programming language.", "Python is a snake.")
metric.is_successful()
```

### Parameters

- `success_threshold` - the minimum score required for the metric to be considered successful
1 change: 1 addition & 0 deletions docs/sidebars.js
@@ -38,6 +38,7 @@ const sidebars = {
'measuring_llm_performance/factual_consistency',
'measuring_llm_performance/answer_relevancy',
"measuring_llm_performance/ranking_similarity",
"measuring_llm_performance/conceptual_similarity"
]
},
{
15 changes: 15 additions & 0 deletions tests/test_alert_score.py
@@ -0,0 +1,15 @@
"""Test alert score
"""

from deepeval.metrics.alert_score import assert_alert_score


def test_alert_score():
assert_alert_score(
query="Who won the FIFA World Cup in 2018?",
generated_text="Winners of the FIFA world cup were the French national football team",
expected_output="French national football team",
context="The FIFA World Cup in 2018 was won by the French national football team. They defeated Croatia 4-2 in the final match to claim the championship.",
)
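A companion test for the overall score is not included in this PR, but a hypothetical version could mirror the alert-score test (it assumes the `context` argument discussed above):

```python
"""Hypothetical test sketch for the overall score (not part of this PR)."""

from deepeval.metrics.overall_score import assert_overall_score


def test_overall_score():
    assert_overall_score(
        generated_output="Winners of the FIFA world cup were the French national football team",
        expected_output="French national football team",
        context="The FIFA World Cup in 2018 was won by the French national football team.",
        success_threshold=0.5,
    )
```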