add answer relevancy #18

Merged 5 commits on Aug 20, 2023
33 changes: 33 additions & 0 deletions deepeval/metrics/answer_relevancy.py
@@ -0,0 +1,33 @@
import asyncio
from .metric import Metric


class AnswerRelevancy(Metric):
    def __init__(self, success_threshold: float = 0.5):
        self.success_threshold = success_threshold
        # Imported lazily so sentence-transformers is only required when this metric is used
        from sentence_transformers import CrossEncoder

        self.encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2")

    def __call__(self, query: str, answer: str):
        score = self.measure(query, answer)
        if self._is_send_okay():
            asyncio.create_task(
                self._send_to_server(
                    entailment_score=score,
                    query=query,
                    output=answer,
                )
            )
        return score

    def measure(self, query: str, answer: str) -> float:
        score = self.encoder.predict([query, answer])
        self.success = score > self.success_threshold
        return score

    def is_successful(self) -> bool:
        return self.success

    def __name__(self):
        return "Answer Relevancy"
4 changes: 0 additions & 4 deletions deepeval/metrics/metric.py
@@ -12,10 +12,6 @@


class Metric:
def __call__(self, *args, **kwargs):
result = self.measure(*args, **kwargs)
return result

@abstractmethod
def measure(self, output, expected_output, query: Optional[str] = None):
pass
17 changes: 14 additions & 3 deletions deepeval/test_utils.py
@@ -5,6 +5,7 @@
from .metrics.metric import Metric
from .metrics.bertscore_metric import BertScoreMetric
from .metrics.entailment_metric import EntailmentScoreMetric
from .metrics.answer_relevancy import AnswerRelevancy


def assert_llm_output(
@@ -28,22 +29,32 @@ def assert_llm_output(
    assert metric.is_successful(), metric.__class__.__name__ + " was unsuccessful."


def assert_factual_consistency(output: str, context: str):
def assert_factual_consistency(
    output: str, context: str, success_threshold: float = 0.3
):
    """Assert that the output is factually consistent with the context."""

    class FactualConsistency(EntailmentScoreMetric):
        def __name__(self):
            return "Factual Consistency"

    metric = FactualConsistency()
    metric = FactualConsistency(minimum_score=success_threshold)
    score = metric(context, output)
    assert metric.is_successful(), metric.__class__.__name__ + " was unsuccessful."


def assert_exact_match(text_input, text_output):
def assert_exact_match(text_input: str, text_output: str):
    assert text_input == text_output, f"{text_output} != {text_input}"


def assert_answer_relevancy(query: str, answer: str, success_threshold: float = 0.5):
    metric = AnswerRelevancy(success_threshold=success_threshold)
    score = metric(query=query, answer=answer)
    assert metric.is_successful(), (
        metric.__class__.__name__ + " was unsuccessful - " + str(score)
    )


class TestEvalCase:
    pass

45 changes: 45 additions & 0 deletions docs/.gitignore
@@ -0,0 +1,45 @@
.DS_Store
.vscode/*
!.vscode/extensions.json
.idea
*.iml
*.code-workspace
.changelog
.history

node_modules
.yarn
package-lock.json

.eslintcache

yarn-error.log
build
coverage
.docusaurus
.cache-loader
types
test-website
test-website-in-workspace

packages/create-docusaurus/lib/
packages/lqip-loader/lib/
packages/docusaurus/lib/
packages/docusaurus-*/lib/*
packages/eslint-plugin/lib/
packages/stylelint-copyright/lib/

website/netlifyDeployPreview/*
website/changelog
!website/netlifyDeployPreview/index.html
!website/netlifyDeployPreview/_redirects

website/_dogfooding/_swizzle_theme_tests

CrowdinTranslations_*.zip

website/i18n/**/*
#!website/i18n/fr
#!website/i18n/fr/**/*

.netlify
35 changes: 35 additions & 0 deletions docs/docs/measuring_llm_performance/answer_relevancy.md
@@ -0,0 +1,35 @@
# Answer Relevancy

For question-answering applications, we provide a simple interface for asserting that an answer is relevant to its query.

## Assert Answer Relevancy

```python
from deepeval.test_utils import assert_answer_relevancy
query = "What is Python?"
answer = "Python is a programming language?"
assert_answer_relevancy(query, answer, success_threshold=0.5)
```

### Parameters

- `success_threshold`: the minimum score required for the answer to be considered relevant to the query

## Answer Relevancy As A Metric

If you would instead like a raw score of how relevant an answer is to a query, call the metric class directly.

```python
from deepeval.metrics.answer_relevancy import AnswerRelevancy
scorer = AnswerRelevancy(success_threshold=0.5)
scorer.measure(query=query, answer=answer)
# Returns a floating point number between 0 and 1
```

### Parameters

- `success_threshold`: the minimum score required for the answer to be considered relevant to the query

## How It Is Measured

Answer relevancy is measured using a cross-encoder model trained on the MS MARCO dataset, a large-scale passage-ranking dataset built from real search-engine queries.
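
For illustration, here is a minimal sketch of the scoring step that `AnswerRelevancy` wraps (assuming `sentence-transformers` is installed; the model name matches the one used by the metric class):

```python
# Minimal sketch of the underlying call; the metric class adds
# thresholding and result reporting on top of this.
from sentence_transformers import CrossEncoder

encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2")
score = encoder.predict(["What is Python?", "Python is a programming language."])
print(score)  # a single relevance score; higher means the answer is more relevant to the query
```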
15 changes: 14 additions & 1 deletion docs/docs/measuring_llm_performance/factual_consistency.md
@@ -2,7 +2,7 @@

Factual consistency refers to the accuracy and reliability of information presented in a piece of text, conversation, or any form of communication. It means that the information being conveyed is true, accurate, and aligns with established facts or reality. Factual consistency is crucial because it ensures that the information being shared is reliable and trustworthy. Inaccurate or inconsistent information can lead to misunderstandings, misinformation, and loss of credibility.

## How to use
## Assert Factual Consistency

DeepEval offers an opinionated method for factual consistency based on entailment score.

@@ -14,11 +14,24 @@ assert_factual_consistency(
)
```

### Parameters

Diving into the arguments for `assert_factual_consistency`:

- `output`: the LLM-generated text
- `context`: the source text that the LLM is expected to reason over and stay consistent with

## Factual Consistency As A Metric

If you would instead like a raw score of how factually consistent the output is with its context, call the entailment metric class directly; this mirrors what `assert_factual_consistency` uses under the hood.

```python
from deepeval.metrics.entailment_metric import EntailmentScoreMetric
scorer = EntailmentScoreMetric(minimum_score=0.3)
score = scorer(context, output)
# Returns an entailment score; higher means the output is more consistent with the context
```

## How It Is Measured

Factual consistency is measured with a natural language inference (NLI) model: the context and the LLM output are scored as a premise-hypothesis pair, and the score of the entailment class is used as the consistency score.
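
As a rough sketch of how entailment-based scoring works (the model name and label order below are assumptions for illustration, not necessarily what `EntailmentScoreMetric` uses internally):

```python
# Illustrative only: the model choice and the label order
# [contradiction, entailment, neutral] are assumptions, not DeepEval's internals.
import numpy as np
from sentence_transformers import CrossEncoder

nli = CrossEncoder("cross-encoder/nli-deberta-v3-base")
context = "Python was created by Guido van Rossum."
output = "Guido van Rossum is the creator of Python."

logits = nli.predict([context, output])        # scores for one (premise, hypothesis) pair
probs = np.exp(logits) / np.exp(logits).sum()  # softmax over the three NLI classes
entailment_score = probs[1]                    # probability of the (assumed) entailment class
print(entailment_score)
```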
12 changes: 12 additions & 0 deletions docs/docs/measuring_llm_performance/ranking_similarity.md
@@ -0,0 +1,12 @@
# Ranking Similarity

Top-K rankings are usually evaluated against a few common criteria:

- Top results matter more than bottom-ranked results: a drop from 2nd to 3rd place is more significant than a drop from 5th to 6th, and so on.
- A result that is missing entirely from the other list indicates a larger difference than a result that merely drops in rank, since it implies that result falls outside the top K altogether.

For ensuring top-k ranking similarity, we recommend reading the following article:

https://medium.com/ai-in-plain-english/comparing-top-k-rankings-statistically-9adfc9cfc98b
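
As a rough illustration of a top-weighted comparison, here is a generic rank-biased-overlap-style sketch (not a DeepEval API):

```python
# Generic sketch of a top-weighted similarity between two top-K lists.
# Not a DeepEval API; the geometric weighting is purely illustrative.
def top_k_similarity(list_a, list_b, p=0.9):
    """Average set overlap at each depth, weighted so the top of the list counts more."""
    k = min(len(list_a), len(list_b))
    score, weight_sum = 0.0, 0.0
    for depth in range(1, k + 1):
        overlap = len(set(list_a[:depth]) & set(list_b[:depth])) / depth
        weight = p ** (depth - 1)  # shallower depths carry more weight
        score += weight * overlap
        weight_sum += weight
    return score / weight_sum


print(top_k_similarity(["a", "b", "c", "d"], ["a", "c", "b", "e"]))  # 1.0 only for identical rankings
```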
3 changes: 2 additions & 1 deletion docs/sidebars.js
@@ -39,7 +39,8 @@ const sidebars = {
      type: 'category',
      label: 'Evaluating LLMs',
      items: [
        'measuring_llm_performance/factual_consistency'
        'measuring_llm_performance/factual_consistency',
        'measuring_llm_performance/answer_relevancy'
      ]
    },
    {
21 changes: 21 additions & 0 deletions tests/test_answer_relevancy.py
@@ -0,0 +1,21 @@
"""Tests for answer relevancy
"""
import pytest

query = "What is Python?"
answer = "Python is a programming language?"


@pytest.mark.asyncio
async def test_answer_relevancy():
    from deepeval.test_utils import assert_answer_relevancy

    assert_answer_relevancy(query, answer, success_threshold=0.5)


@pytest.mark.asyncio
async def test_query_answer_relevancy():
    from deepeval.metrics.answer_relevancy import AnswerRelevancy

    scorer = AnswerRelevancy(success_threshold=0.5)
    result = scorer.measure(query=query, answer=answer)