diff --git a/deepeval/metrics/alert_score.py b/deepeval/metrics/alert_score.py
new file mode 100644
index 000000000..2177c93cf
--- /dev/null
+++ b/deepeval/metrics/alert_score.py
@@ -0,0 +1,67 @@
+"""Alert Score
+"""
+from .metric import Metric
+from .entailment_metric import EntailmentScoreMetric
+
+# from .answer_relevancy import AnswerRelevancy
+
+
+class AlertScore(Metric):
+    def __init__(self, success_threshold: float = 0.5):
+        self.success_threshold = success_threshold
+        self.entailment_metric = EntailmentScoreMetric()
+        # self.answer_relevancy = AnswerRelevancy()
+
+    def __call__(
+        self, query: str, generated_text: str, expected_output: str, context: str
+    ):
+        score = self.measure(query, generated_text, expected_output, context)
+        return score
+
+    def measure(
+        self, query: str, generated_text: str, expected_output: str, context: str
+    ) -> float:
+
+        entailment_score = self.entailment_metric.measure(
+            context,
+            generated_text,
+        )
+
+        answer_expected_score = self.entailment_metric.measure(
+            generated_text,
+            expected_output,
+        )
+
+        # This metric is currently weak because it requires the answer
+        # to regurgitate the question.
+        # answer_relevancy_score = self.answer_relevancy.measure(
+        #     query=query, answer=generated_text
+        # )
+        alert_score = min(entailment_score, answer_expected_score)
+        self.success = alert_score > self.success_threshold
+        return alert_score
+
+    def is_successful(self) -> bool:
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Alert Score"
+
+
+def assert_alert_score(
+    query: str,
+    generated_text: str,
+    expected_output: str,
+    context: str,
+    success_threshold: float = 0.5,
+):
+    """Assert that the alert score clears the success threshold."""
+    metric = AlertScore(success_threshold=success_threshold)
+    score = metric.measure(
+        query=query,
+        generated_text=generated_text,
+        expected_output=expected_output,
+        context=context,
+    )
+    assert metric.is_successful(), f"Found issue - Alert score: {score}"
diff --git a/deepeval/metrics/answer_relevancy.py b/deepeval/metrics/answer_relevancy.py
index dcad67ed4..389cc60cd 100644
--- a/deepeval/metrics/answer_relevancy.py
+++ b/deepeval/metrics/answer_relevancy.py
@@ -1,5 +1,10 @@
 import asyncio
 from .metric import Metric
+import numpy as np
+
+
+def sigmoid(x):
+    return 1 / (1 + np.exp(-x))
 
 
 class AnswerRelevancy(Metric):
@@ -23,6 +28,7 @@ def __call__(self, query: str, answer: str):
 
     def measure(self, query, answer: str) -> float:
         score = self.encoder.predict([query, answer])
+        score = sigmoid(score)
         self.success = score > self.success_threshold
         return score
 
diff --git a/deepeval/metrics/bertscore_metric.py b/deepeval/metrics/bertscore_metric.py
index 2c90f32a5..b48dd212c 100644
--- a/deepeval/metrics/bertscore_metric.py
+++ b/deepeval/metrics/bertscore_metric.py
@@ -1,14 +1,6 @@
 from .metric import Metric
 from typing import Optional
-import numpy as np
-
-
-def cosine_similarity(vector_a, vector_b):
-    dot_product = np.dot(vector_a, vector_b)
-    norm_a = np.linalg.norm(vector_a)
-    norm_b = np.linalg.norm(vector_b)
-    similarity = dot_product / (norm_a * norm_b)
-    return similarity
+from ..utils import cosine_similarity
 
 
 class BertScoreMetric(Metric):
diff --git a/deepeval/metrics/bleu_metric.py b/deepeval/metrics/bleu_metric.py
new file mode 100644
index 000000000..79c5a4c5f
--- /dev/null
+++ b/deepeval/metrics/bleu_metric.py
@@ -0,0 +1,66 @@
+# Sample Metric for BLEU
+import nltk
+from nltk.util import ngrams
+import numpy as np
+from .metric import Metric
+
+
+class BLEUMetric(Metric):
+    def __init__(self, success_threshold: float = 0.5):
+        self.success_threshold = success_threshold
+
+    def compute_bleu(
+        self, candidate: str, reference: str, weights=(0.25, 0.25, 0.25, 0.25)
+    ):
+        """
+        Compute BLEU score for a candidate sentence given a reference sentence.
+
+        :param candidate: The candidate sentence as a string.
+        :param reference: The reference sentence as a string.
+        :param weights: Weights for the n-gram precisions, default is uniform (0.25 for each).
+        :return: BLEU score.
+        """
+        candidate = candidate.split()  # Tokenize the candidate string
+        reference = reference.split()  # Tokenize the reference string
+
+        precisions = []
+
+        for i in range(1, 5):  # Compute modified precision for 1- to 4-grams
+            candidate_ngrams = ngrams(candidate, i)
+            candidate_ngram_freq = nltk.FreqDist(candidate_ngrams)
+
+            reference_ngrams = ngrams(reference, i)
+            reference_ngram_freq = nltk.FreqDist(reference_ngrams)
+
+            # Clip each candidate n-gram count by its count in the reference
+            clipped_counts = {
+                ngram: min(candidate_ngram_freq[ngram], reference_ngram_freq[ngram])
+                for ngram in candidate_ngram_freq
+            }
+            precision = sum(clipped_counts.values()) / sum(
+                candidate_ngram_freq.values()
+            )
+            precisions.append(precision)
+
+        # Standard BLEU brevity penalty: 1 if the candidate is at least as long
+        # as the reference, exp(1 - r/c) otherwise.
+        brevity_penalty = (
+            1.0
+            if len(candidate) >= len(reference)
+            else np.exp(1 - len(reference) / len(candidate))
+        )
+
+        # Weighted geometric mean of the n-gram precisions (no smoothing).
+        bleu = brevity_penalty * np.exp(
+            np.sum([w * np.log(p) for w, p in zip(weights, precisions)])
+        )
+
+        return bleu
+
+    def measure(self, candidate: str, reference: str) -> float:
+        score = self.compute_bleu(candidate, reference)
+        self.success = score > self.success_threshold
+        return score
+
+    def is_successful(self) -> bool:
+        return self.success
diff --git a/deepeval/metrics/conceptual_similarity.py b/deepeval/metrics/conceptual_similarity.py
new file mode 100644
index 000000000..b107c79eb
--- /dev/null
+++ b/deepeval/metrics/conceptual_similarity.py
@@ -0,0 +1,38 @@
+"""Asserting conceptual similarity
+"""
+from typing import Optional
+from .metric import Metric
+from ..utils import cosine_similarity
+
+
+class ConceptualSimilarityMetric(Metric):
+    """Measures conceptual similarity between two texts via sentence embeddings."""
+
+    def __init__(
+        self,
+        model_name: Optional[str] = "sentence-transformers/all-mpnet-base-v2",
+        success_threshold: float = 0.7,
+    ):
+        from sentence_transformers import SentenceTransformer
+
+        self.model_name = model_name
+        self.model = SentenceTransformer(self.model_name).eval()
+        self.success_threshold = success_threshold
+
+    def _vectorize(self, text_a: str, text_b: str):
+        vectors = self.model.encode([text_a, text_b])
+        return vectors
+
+    def measure(self, a: str, b: str):
+        vectors = self._vectorize(a, b)
+        self.score = cosine_similarity(vectors[0], vectors[1])
+        return self.score
+
+    def is_successful(self) -> bool:
+        return self.score >= self.success_threshold
+
+
+def assert_conceptual_similarity(text_1: str, text_2: str, success_threshold=0.3):
+    metric = ConceptualSimilarityMetric(success_threshold=success_threshold)
+    score = metric.measure(text_1, text_2)
+    assert metric.is_successful(), f"Texts are not conceptually similar - got {score}"
diff --git a/deepeval/metrics/entailment_metric.py b/deepeval/metrics/entailment_metric.py
index 72d83dc3a..875bdb676 100644
--- a/deepeval/metrics/entailment_metric.py
+++ b/deepeval/metrics/entailment_metric.py
@@ -24,3 +24,7 @@ def measure(self, a: str, b: str):
 
     def is_successful(self) -> bool:
         return self.success
+
+    @property
+    def __name__(self):
+        return "Entailment"
diff --git a/deepeval/metrics/overall_score.py b/deepeval/metrics/overall_score.py
new file mode 100644
index 000000000..94580922a
--- /dev/null
+++ b/deepeval/metrics/overall_score.py
@@ -0,0 +1,52 @@
+"""Overall Score
+"""
+from .metric import Metric
+from .entailment_metric import EntailmentScoreMetric
+from .answer_relevancy import AnswerRelevancy
+
+
+class OverallScore(Metric):
+    def __init__(self, success_threshold: float = 0.5):
+        self.success_threshold = success_threshold
+        self.entailment_metric = EntailmentScoreMetric()
+        self.answer_relevancy = AnswerRelevancy()
+
+    def __call__(self, generated_output: str, expected_output: str, context: str):
+        score = self.measure(generated_output, expected_output, context)
+        return score
+
+    def measure(
+        self, generated_output: str, expected_output: str, context: str
+    ) -> float:
+        entailment_score = self.entailment_metric.measure(
+            generated_output,
+            context,
+        )
+        answer_relevancy_score = self.answer_relevancy.measure(
+            generated_output, expected_output
+        )
+        overall_score = 0.5 * entailment_score + 0.5 * answer_relevancy_score
+        self.success = overall_score > self.success_threshold
+        return overall_score
+
+    def is_successful(self) -> bool:
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Overall Score"
+
+
+def assert_overall_score(
+    generated_output: str,
+    expected_output: str,
+    context: str,
+    success_threshold: float = 0.5,
+):
+    metric = OverallScore(success_threshold=success_threshold)
+    score = metric.measure(
+        generated_output=generated_output,
+        expected_output=expected_output,
+        context=context,
+    )
+    assert metric.is_successful(), f"Overall score too low - got {score}"
diff --git a/deepeval/utils.py b/deepeval/utils.py
index d20b51f4b..e05f042f9 100644
--- a/deepeval/utils.py
+++ b/deepeval/utils.py
@@ -5,3 +5,11 @@
 def softmax(x):
     e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
     return e_x / e_x.sum(axis=1, keepdims=True)
+
+
+def cosine_similarity(vector_a, vector_b):
+    dot_product = np.dot(vector_a, vector_b)
+    norm_a = np.linalg.norm(vector_a)
+    norm_b = np.linalg.norm(vector_b)
+    similarity = dot_product / (norm_a * norm_b)
+    return similarity
diff --git a/docs/docs/measuring_llm_performance/conceptual_similarity.md b/docs/docs/measuring_llm_performance/conceptual_similarity.md
new file mode 100644
index 000000000..452078d59
--- /dev/null
+++ b/docs/docs/measuring_llm_performance/conceptual_similarity.md
@@ -0,0 +1,35 @@
+# Conceptual Similarity
+
+Asserting conceptual similarity lets developers check that the expected answer and the generated answer mention the same concepts, even if the overall wording differs quite a bit.
+
+## What is it?
+
+- Neural network embeddings represent the semantic meaning of words or concepts in a continuous vector space, capturing the relationships and similarities between them.
+- Techniques like word2vec, GloVe, and BERT embeddings are trained on large text corpora and excel at capturing the underlying semantics and conceptual associations between words.
+- These embeddings are widely used in natural language processing (NLP) tasks such as word similarity, text classification, and sentiment analysis, where understanding the meaning and similarity of words or concepts is crucial.
+
+## Assert Conceptual Similarity
+
+```python
+from deepeval.metrics.conceptual_similarity import assert_conceptual_similarity
+
+assert_conceptual_similarity(
+    text_1="Python is a programming language.",
+    text_2="Python is a snake.",
+    success_threshold=0.3
+)
+```
+
+## Conceptual Similarity As A Metric
+
+```python
+from deepeval.metrics.conceptual_similarity import ConceptualSimilarityMetric
+
+metric = ConceptualSimilarityMetric(success_threshold=0.3)
+score = metric.measure("Python is a programming language.", "Python is a snake.")
+metric.is_successful()
+```
+
+### Parameters
+
+- `success_threshold` - the minimum score required for the metric to be considered successful
diff --git a/docs/sidebars.js b/docs/sidebars.js
index 1c78b7b21..3c509c24c 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -38,6 +38,7 @@ const sidebars = {
       'measuring_llm_performance/factual_consistency',
       'measuring_llm_performance/answer_relevancy',
       "measuring_llm_performance/ranking_similarity",
+      "measuring_llm_performance/conceptual_similarity"
     ]
   },
   {
diff --git a/tests/test_alert_score.py b/tests/test_alert_score.py
new file mode 100644
index 000000000..f060d3228
--- /dev/null
+++ b/tests/test_alert_score.py
@@ -0,0 +1,13 @@
+"""Test alert score
+"""
+
+from deepeval.metrics.alert_score import assert_alert_score
+
+
+def test_alert_score():
+    assert_alert_score(
+        query="Who won the FIFA World Cup in 2018?",
+        generated_text="Winners of the FIFA world cup were the French national football team",
+        expected_output="French national football team",
+        context="The FIFA World Cup in 2018 was won by the French national football team. They defeated Croatia 4-2 in the final match to claim the championship.",
+    )
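The new `OverallScore` blends the entailment score (generated output vs. context) and the answer-relevancy score (generated output vs. expected output) with equal weights. A minimal usage sketch against the signatures introduced in `deepeval/metrics/overall_score.py` above, assuming the underlying entailment and answer-relevancy models load as in the existing metrics; the example strings are illustrative only and the sketch is not part of the patch:

```python
from deepeval.metrics.overall_score import OverallScore, assert_overall_score

# OverallScore averages entailment (generated output vs. context) and
# answer relevancy (generated output vs. expected output) with equal weights.
metric = OverallScore(success_threshold=0.5)
score = metric.measure(
    generated_output="The 2018 FIFA World Cup was won by France.",
    expected_output="French national football team",
    context="The FIFA World Cup in 2018 was won by the French national football team.",
)
print(score, metric.is_successful())

# Or assert directly, mirroring assert_alert_score in the test above.
assert_overall_score(
    generated_output="The 2018 FIFA World Cup was won by France.",
    expected_output="French national football team",
    context="The FIFA World Cup in 2018 was won by the French national football team.",
    success_threshold=0.5,
)
```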