Jacky/twi 332 add overall metric #23

Merged · 12 commits · Aug 21, 2023
65 changes: 65 additions & 0 deletions deepeval/metrics/alert_score.py
@@ -0,0 +1,65 @@
"""Alert Score
"""
from .metric import Metric
from .entailment_metric import EntailmentScoreMetric

# from .answer_relevancy import AnswerRelevancy


class AlertScore(Metric):
def __init__(self, success_threshold: float = 0.5):
self.success_threshold = success_threshold
self.entailment_metric = EntailmentScoreMetric()
# self.answer_relevancy = AnswerRelevancy()

def __call__(self, query: str, generated_text: str, expected_output: str, context: str):
score = self.measure(query, generated_text, expected_output, context)
return score

def measure(
self, query: str, generated_text: str, expected_output: str, context: str
) -> float:

entailment_score = self.entailment_metric.measure(
context,
generated_text,
)

answer_expected_score = self.entailment_metric.measure(
generated_text,
expected_output,
)

# The answer relevancy metric is currently too strict: it effectively
# requires the answer to restate the question, so it is disabled for now.
# answer_relevancy_score = self.answer_relevancy.measure(
# query=query, answer=generated_text
# )
alert_score = min(entailment_score, answer_expected_score)
self.success = alert_score > self.success_threshold
return alert_score

def is_successful(self) -> bool:
return self.success

@property
def __name__(self):
return "Alert Score"


def assert_alert_score(
query: str,
generated_text: str,
expected_output: str,
context: str,
success_threshold: float = 0.5,
):
"""Create alert score."""
metric = AlertScore(success_threshold=success_threshold)
score = metric.measure(
query=query,
generated_text=generated_text,
expected_output=expected_output,
context=context,
)
assert metric.is_successful(), f"Found issue - Alert score: {score}"
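For reference, a minimal usage sketch of the new `assert_alert_score` helper; the query, texts, and threshold below are illustrative, and the call assumes the underlying entailment model can be loaded:

```python
from deepeval.metrics.alert_score import assert_alert_score

# Illustrative inputs; raises AssertionError if the alert score is at or below the threshold.
assert_alert_score(
    query="What is the capital of France?",
    generated_text="The capital of France is Paris.",
    expected_output="Paris is the capital of France.",
    context="Paris is the capital and most populous city of France.",
    success_threshold=0.5,
)
```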
6 changes: 6 additions & 0 deletions deepeval/metrics/answer_relevancy.py
@@ -1,5 +1,10 @@
import asyncio
from .metric import Metric
import numpy as np


def sigmoid(x):
return 1 / (1 + np.exp(-x))


class AnswerRelevancy(Metric):
@@ -23,6 +28,7 @@ def __call__(self, query: str, answer: str):

def measure(self, query, answer: str) -> float:
score = self.encoder.predict([query, answer])
score = sigmoid(score)
self.success = score > self.success_threshold
return score

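For intuition, a small sketch of what the new `sigmoid` helper does: the cross-encoder relevancy score is an unbounded logit, and the sigmoid maps it into (0, 1) so it can be compared against `success_threshold` (the example inputs below are made up):

```python
from deepeval.metrics.answer_relevancy import sigmoid

# Unbounded logits are squashed into (0, 1).
for raw_score in (-4.0, 0.0, 3.2):
    print(raw_score, "->", round(float(sigmoid(raw_score)), 3))  # 0.018, 0.5, 0.961
```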
10 changes: 1 addition & 9 deletions deepeval/metrics/bertscore_metric.py
@@ -1,14 +1,6 @@
from .metric import Metric
from typing import Optional
import numpy as np


def cosine_similarity(vector_a, vector_b):
dot_product = np.dot(vector_a, vector_b)
norm_a = np.linalg.norm(vector_a)
norm_b = np.linalg.norm(vector_b)
similarity = dot_product / (norm_a * norm_b)
return similarity
from ..utils import cosine_similarity


class BertScoreMetric(Metric):
54 changes: 54 additions & 0 deletions deepeval/metrics/bleu_metric.py
@@ -0,0 +1,54 @@
# Sample Metric for BLEU
import nltk
from nltk.util import ngrams
import numpy as np
from .metric import Metric


class BLEUMetric(Metric):
def __init__(self, success_threshold: float = 0.5):
self.success_threshold = success_threshold

def compute_bleu(
self, candidate: str, reference: str, weights=(0.25, 0.25, 0.25, 0.25)
):
"""
Compute BLEU score for a candidate sentence given a reference sentence.

:param candidate: The candidate sentence as a string.
:param reference: The reference sentence as a string.
:param weights: Weights for the n-gram precisions, default is uniform (0.25 for each).
:return: BLEU score.
"""
# Tokenize both sentences on whitespace
candidate = candidate.split()
reference = reference.split()

precisions = []

for i in range(1, 5):  # Modified n-gram precision for 1- to 4-grams
candidate_ngrams = ngrams(candidate, i)
candidate_ngram_freq = nltk.FreqDist(candidate_ngrams)

reference_ngrams = ngrams(reference, i)
reference_ngram_freq = nltk.FreqDist(reference_ngrams)

# Clip each candidate n-gram count by its count in the reference
clipped_counts = {
ngram: min(candidate_ngram_freq[ngram], reference_ngram_freq[ngram])
for ngram in candidate_ngram_freq
}
total_ngrams = sum(candidate_ngram_freq.values())
precision = (
sum(clipped_counts.values()) / total_ngrams if total_ngrams > 0 else 0.0
)
precisions.append(precision)

# Any zero precision makes the geometric mean zero; avoid log(0)
if min(precisions) == 0.0:
return 0.0

# Simplified brevity penalty: penalise candidates shorter than the reference
brevity_penalty = min(1, len(candidate) / len(reference))

# Weighted geometric mean of the n-gram precisions
bleu = brevity_penalty * np.exp(
sum(w * np.log(p) for w, p in zip(weights, precisions))
)

return bleu
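A small usage sketch for the sample BLEU metric (sentences are illustrative; assumes `nltk` and `numpy` are installed):

```python
from deepeval.metrics.bleu_metric import BLEUMetric

metric = BLEUMetric(success_threshold=0.5)
score = metric.compute_bleu(
    candidate="the cat sat on the mat",
    reference="the cat sat on the mat today",
)
print(score)  # n-gram precisions are perfect here, so the score equals the brevity penalty (6/7 ≈ 0.86)
```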
38 changes: 38 additions & 0 deletions deepeval/metrics/conceptual_similarity.py
@@ -0,0 +1,38 @@
"""Asserting conceptual similarity
"""
from typing import Optional
from .metric import Metric
from ..utils import cosine_similarity


class ConceptualSimilarityMetric(Metric):
"""basic implementation of BertScoreMetric"""

def __init__(
self,
model_name: Optional[str] = "sentence-transformers/all-mpnet-base-v2",
success_threshold: float = 0.7,
):
from sentence_transformers import SentenceTransformer

self.model_name = model_name
self.model = SentenceTransformer(self.model_name).eval()
self.success_threshold = success_threshold

def _vectorize(self, text_a: str, text_b: str):
vectors = self.model.encode([text_a, text_b])
return vectors

def measure(self, a: str, b: str):
vectors = self._vectorize(a, b)
self.score = cosine_similarity(vectors[0], vectors[1])
return self.score

def is_successful(self) -> bool:
return self.score >= self.success_threshold


def assert_conceptual_similarity(text_1: str, text_2: str, success_threshold=0.3):
metric = ConceptualSimilarityMetric(success_threshold=success_threshold)
score = metric.measure(text_1, text_2)
assert metric.is_successful(), f"Metric is not conceptually similar - got {score}"
4 changes: 4 additions & 0 deletions deepeval/metrics/entailment_metric.py
@@ -24,3 +24,7 @@ def measure(self, a: str, b: str):

def is_successful(self) -> bool:
return self.success

@property
def __name__(self):
return "Entailment"
46 changes: 46 additions & 0 deletions deepeval/metrics/overall_score.py
@@ -0,0 +1,46 @@
"""Alert Score
"""
from .metric import Metric
from .entailment_metric import EntailmentScoreMetric
from .answer_relevancy import AnswerRelevancy


class OverallScore(Metric):
def __init__(self, success_threshold: float = 0.5):
self.success_threshold = success_threshold
self.entailment_metric = EntailmentScoreMetric()
self.answer_relevancy = AnswerRelevancy()

def __call__(self, generated_output: str, expected_output: str, context: str):
score = self.measure(generated_output, expected_output, context)
return score

def measure(
self, generated_output: str, expected_output: str, context: str
) -> float:
entailment_score = self.entailment_metric.measure(
generated_output,
context,
)
answer_relevancy_score = self.answer_relevancy.measure(
generated_output, expected_output
)
overall_score = 0.5 * entailment_score + 0.5 * answer_relevancy_score
self.success = overall_score > self.success_threshold
return overall_score

def is_successful(self) -> bool:
return self.success

@property
def __name__(self):
return "Overall Score"


def assert_overall_score(
generated_output: str,
expected_output: str,
context: str,
success_threshold: float = 0.5,
):
metric = OverallScore(success_threshold=success_threshold)
score = metric.measure(
generated_output=generated_output,
expected_output=expected_output,
context=context,
)
assert metric.is_successful(), f"Overall score too low - got {score}"
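And a minimal usage sketch for the new overall score assertion (values are illustrative; assumes the `context` argument added above and that the entailment and relevancy models can be loaded):

```python
from deepeval.metrics.overall_score import assert_overall_score

assert_overall_score(
    generated_output="The 2018 FIFA World Cup was won by France.",
    expected_output="France won the 2018 FIFA World Cup.",
    context="France defeated Croatia 4-2 in the 2018 FIFA World Cup final.",
    success_threshold=0.5,
)
```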
8 changes: 8 additions & 0 deletions deepeval/utils.py
@@ -5,3 +5,11 @@
def softmax(x):
e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
return e_x / e_x.sum(axis=1, keepdims=True)


def cosine_similarity(vector_a, vector_b):
dot_product = np.dot(vector_a, vector_b)
norm_a = np.linalg.norm(vector_a)
norm_b = np.linalg.norm(vector_b)
similarity = dot_product / (norm_a * norm_b)
return similarity
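A quick sanity check of the shared `cosine_similarity` helper (the vectors are made up):

```python
import numpy as np

from deepeval.utils import cosine_similarity

print(cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 0.0])))  # 1.0, same direction
print(cosine_similarity(np.array([1.0, 0.0]), np.array([0.0, 1.0])))  # 0.0, orthogonal vectors
```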
35 changes: 35 additions & 0 deletions docs/docs/measuring_llm_performance/conceptual_similarity.md
@@ -0,0 +1,35 @@
# Conceptual Similarity

Asserting conceptual similarity allows developers to check that the expected answer and the generated answer mention the same concepts, even if the overall message varies quite a bit.

## What is it?

- Neural network embeddings are designed to represent the semantic meaning of words or concepts in a continuous vector space. These embeddings aim to capture the relationships and similarities between words or concepts based on their intrinsic properties.
- Techniques like word2vec, GloVe, and BERT embeddings are trained to learn the meaning and relationships between words or concepts from large text corpora. They excel at capturing the underlying semantics and conceptual associations between words.
- These embeddings are often used in various natural language processing (NLP) tasks like word similarity, text classification, and sentiment analysis, where understanding the meaning and similarity of words or concepts is crucial. The sketch below applies this idea to two example sentences.
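Under the hood, the metric boils down to sentence embeddings plus cosine similarity. A minimal sketch of that idea (assumes `sentence-transformers` is installed; the model name matches the metric's default, and `cosine_similarity` is the helper in `deepeval.utils`):

```python
from sentence_transformers import SentenceTransformer

from deepeval.utils import cosine_similarity

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
vectors = model.encode(["Python is a programming language.", "Python is a snake."])
print(cosine_similarity(vectors[0], vectors[1]))  # higher means conceptually closer
```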

## Assert Conceptual Similarity

```python
from deepeval.metrics.conceptual_similarity import assert_conceptual_similarity

assert_conceptual_similarity(
    text_1="python is a programming language",
    text_2="Python is a snake.",
    success_threshold=0.3,
)
```

## Conceptual Similarity As A Metric

```python
from deepeval.metrics.conceptual_similarity import ConceptualSimilarityMetric

metric = ConceptualSimilarityMetric(success_threshold=0.3)
score = metric.measure("Python is a programming language.", "Python is a snake.")
metric.is_successful()
```

### Parameters

- `success_threshold` - the minimum score required for the metric to be considered successful
1 change: 1 addition & 0 deletions docs/sidebars.js
@@ -38,6 +38,7 @@ const sidebars = {
'measuring_llm_performance/factual_consistency',
'measuring_llm_performance/answer_relevancy',
"measuring_llm_performance/ranking_similarity",
"measuring_llm_performance/conceptual_similarity"
]
},
{
15 changes: 15 additions & 0 deletions tests/test_alert_score.py
@@ -0,0 +1,15 @@
"""Test alert score
"""

from deepeval.metrics.alert_score import assert_alert_score


def test_alert_score():
assert_alert_score(
query="Who won the FIFA World Cup in 2018?",
generated_text="Winners of the FIFA world cup were the French national football team",
expected_output="French national football team",
context="The FIFA World Cup in 2018 was won by the French national football team. They defeated Croatia 4-2 in the final match to claim the championship.",
)
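A companion test for the overall score is not included in this PR, but a hypothetical version could mirror the alert-score test (it assumes the `context` argument discussed above):

```python
"""Hypothetical test sketch for the overall score (not part of this PR)."""

from deepeval.metrics.overall_score import assert_overall_score


def test_overall_score():
    assert_overall_score(
        generated_output="Winners of the FIFA world cup were the French national football team",
        expected_output="French national football team",
        context="The FIFA World Cup in 2018 was won by the French national football team.",
        success_threshold=0.5,
    )
```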