From 5ecaeaf7e1713b45fb752a02f1bf0213c220bd41 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Sun, 20 Aug 2023 18:29:11 +1000
Subject: [PATCH 1/5] add answer relevancy

---
 deepeval/metrics/answer_relevancy.py | 18 ++++++++++++++++++
 deepeval/test_utils.py               |  7 ++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)
 create mode 100644 deepeval/metrics/answer_relevancy.py

diff --git a/deepeval/metrics/answer_relevancy.py b/deepeval/metrics/answer_relevancy.py
new file mode 100644
index 000000000..000c2ae76
--- /dev/null
+++ b/deepeval/metrics/answer_relevancy.py
@@ -0,0 +1,18 @@
+from .metric import Metric
+
+class AnswerRelevancy(Metric):
+    def __init__(self, success_threshold: float=0.5):
+        self.success_threshold = success_threshold
+        from sentence_transformers import CrossEncoder
+        self.encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2")
+
+    def measure(self, query, answer: str) -> float:
+        score = self.encoder.predict([query, answer])
+        self.success = score > self.success_threshold
+        return score
+
+    def is_successful(self) -> bool:
+        return self.success
+
+    def __name__(self):
+        return "Answer Relevancy"
diff --git a/deepeval/test_utils.py b/deepeval/test_utils.py
index 6a7cd0667..848edbab7 100644
--- a/deepeval/test_utils.py
+++ b/deepeval/test_utils.py
@@ -5,6 +5,7 @@
 from .metrics.metric import Metric
 from .metrics.bertscore_metric import BertScoreMetric
 from .metrics.entailment_metric import EntailmentScoreMetric
+from .metrics.answer_relevancy import AnswerRelevancy
 
 
 def assert_llm_output(
@@ -40,9 +41,13 @@ def __name__(self):
     assert metric.is_successful(), metric.__class__.__name__ + " was unsuccessful."
 
 
-def assert_exact_match(text_input, text_output):
+def assert_exact_match(text_input: str, text_output: str):
     assert text_input == text_output, f"{text_output} != {text_input}"
 
+def assert_answer_relevancy(query: str, answer: str, success_threshold: float=0.5):
+    metric = AnswerRelevancy(success_threshold=success_threshold)
+    score = metric(query=query, answer=answer)
+    assert metric.is_successful(), metric.__class__.__name__ + " was unsuccessful - " + str(score)
 
 class TestEvalCase:
     pass

From 242df32135e04fdcabde7194e81941ff26db8a2f Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Sun, 20 Aug 2023 18:34:14 +1000
Subject: [PATCH 2/5] answer search relevance

---
 docs/.gitignore                | 45 +++++++++++++++++++
 .../answer_relevancy.md        | 25 +++++++++++
 .../ranking_similarity.md      | 12 +++++
 docs/sidebars.js               |  3 +-
 tests/test_answer_relevancy.py | 21 +++++++++
 5 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 docs/.gitignore
 create mode 100644 docs/docs/measuring_llm_performance/answer_relevancy.md
 create mode 100644 docs/docs/measuring_llm_performance/ranking_similarity.md
 create mode 100644 tests/test_answer_relevancy.py

diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 000000000..e58ca023a
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1,45 @@
+.DS_Store
+.vscode/*
+!.vscode/extensions.json
+.idea
+*.iml
+*.code-workspace
+.changelog
+.history
+
+node_modules
+.yarn
+package-lock.json
+
+.eslintcache
+
+yarn-error.log
+build
+coverage
+.docusaurus
+.cache-loader
+types
+test-website
+test-website-in-workspace
+
+packages/create-docusaurus/lib/
+packages/lqip-loader/lib/
+packages/docusaurus/lib/
+packages/docusaurus-*/lib/*
+packages/eslint-plugin/lib/
+packages/stylelint-copyright/lib/
+
+website/netlifyDeployPreview/*
+website/changelog
+!website/netlifyDeployPreview/index.html
+!website/netlifyDeployPreview/_redirects
+
+website/_dogfooding/_swizzle_theme_tests
+
+CrowdinTranslations_*.zip
+
+website/i18n/**/*
+#!website/i18n/fr
+#!website/i18n/fr/**/*
+
+.netlify
\ No newline at end of file
diff --git a/docs/docs/measuring_llm_performance/answer_relevancy.md b/docs/docs/measuring_llm_performance/answer_relevancy.md
new file mode 100644
index 000000000..625709448
--- /dev/null
+++ b/docs/docs/measuring_llm_performance/answer_relevancy.md
@@ -0,0 +1,25 @@
+# Answer Relevancy
+
+For question-answering applications, we provide a simple interface for ensuring question-answering relevancy.
+
+```python
+from deepeval.test_utils import assert_answer_relevancy
+query = "What is Python?"
+answer = "Python is a programming language?"
+assert_answer_relevancy(query, answer, success_threshold=0.5)
+```
+
+## Using answer relevancy as a metric
+
+If you would instead like a score of how relevant an answer is to a query, simply call the metric class.
+
+```python
+from deepeval.metrics.answer_relevancy import AnswerRelevancy
+scorer = AnswerRelevancy(success_threshold=0.5)
+scorer.measure(query=query, answer=answer)
+# Returns a floating point number between 0 and 1
+```
+
+### Parameters
+
+- `success_threshold` refers to the minimum score for this to be considered relevant
diff --git a/docs/docs/measuring_llm_performance/ranking_similarity.md b/docs/docs/measuring_llm_performance/ranking_similarity.md
new file mode 100644
index 000000000..2c8da608e
--- /dev/null
+++ b/docs/docs/measuring_llm_performance/ranking_similarity.md
@@ -0,0 +1,12 @@
+# Ranking Similarity
+
+Top-K rankings are usually evaluated against several common criteria:
+
+- Top results are more important than bottom-ranked results.
+  A drop from 2nd to 3rd matters more than a drop from 5th to 6th, and so on.
+
+- A result that is missing from the other list altogether signals a larger difference than a result that merely drops in rank, since it implies the item's true rank is beyond K.
+
+For measuring top-K ranking similarity, we recommend the following article:
+
+https://medium.com/ai-in-plain-english/comparing-top-k-rankings-statistically-9adfc9cfc98b
diff --git a/docs/sidebars.js b/docs/sidebars.js
index e599ecfe3..3e49c5550 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -39,7 +39,8 @@ const sidebars = {
       type: 'category',
       label: 'Evaluating LLMs',
       items: [
-        'measuring_llm_performance/factual_consistency'
+        'measuring_llm_performance/factual_consistency',
+        'measuring_llm_performance/answer_relevancy'
       ]
     },
     {
diff --git a/tests/test_answer_relevancy.py b/tests/test_answer_relevancy.py
new file mode 100644
index 000000000..e4ce129a8
--- /dev/null
+++ b/tests/test_answer_relevancy.py
@@ -0,0 +1,21 @@
+"""Tests for answer relevancy
+"""
+import pytest
+
+query = "What is Python?"
+answer = "Python is a programming language?"
+
+
+@pytest.mark.asyncio
+async def test_answer_relevancy():
+    from deepeval.test_utils import assert_answer_relevancy
+
+    assert_answer_relevancy(query, answer, success_threshold=0.5)
+
+
+@pytest.mark.asyncio
+async def test_query_answer_relevancy():
+    from deepeval.metrics.answer_relevancy import AnswerRelevancy
+
+    scorer = AnswerRelevancy(success_threshold=0.5)
+    result = scorer.measure(query=query, answer=answer)

From f957f08b51906492799eb4d4323cc27be3ff470e Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Sun, 20 Aug 2023 18:34:57 +1000
Subject: [PATCH 3/5] reformat

---
 deepeval/metrics/answer_relevancy.py | 6 ++++--
 deepeval/test_utils.py               | 8 ++++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/deepeval/metrics/answer_relevancy.py b/deepeval/metrics/answer_relevancy.py
index 000c2ae76..129201cd3 100644
--- a/deepeval/metrics/answer_relevancy.py
+++ b/deepeval/metrics/answer_relevancy.py
@@ -1,9 +1,11 @@
 from .metric import Metric
 
+
 class AnswerRelevancy(Metric):
-    def __init__(self, success_threshold: float=0.5):
+    def __init__(self, success_threshold: float = 0.5):
         self.success_threshold = success_threshold
         from sentence_transformers import CrossEncoder
+
         self.encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2")
 
     def measure(self, query, answer: str) -> float:
@@ -13,6 +15,6 @@ def measure(self, query, answer: str) -> float:
 
     def is_successful(self) -> bool:
         return self.success
-    
+
     def __name__(self):
         return "Answer Relevancy"
diff --git a/deepeval/test_utils.py b/deepeval/test_utils.py
index 848edbab7..4963d755c 100644
--- a/deepeval/test_utils.py
+++ b/deepeval/test_utils.py
@@ -44,10 +44,14 @@ def __name__(self):
 def assert_exact_match(text_input: str, text_output: str):
     assert text_input == text_output, f"{text_output} != {text_input}"
 
-def assert_answer_relevancy(query: str, answer: str, success_threshold: float=0.5):
+
+def assert_answer_relevancy(query: str, answer: str, success_threshold: float = 0.5):
     metric = AnswerRelevancy(success_threshold=success_threshold)
     score = metric(query=query, answer=answer)
-    assert metric.is_successful(), metric.__class__.__name__ + " was unsuccessful - " + str(score)
+    assert metric.is_successful(), (
+        metric.__class__.__name__ + " was unsuccessful - " + str(score)
+    )
+
 
 class TestEvalCase:
     pass

From f57daa4dcbf4644335325744f8bb3ec371d7001e Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Sun, 20 Aug 2023 18:50:14 +1000
Subject: [PATCH 4/5] add check for answer relevancy

---
 deepeval/metrics/answer_relevancy.py              | 13 +++++++++++++
 deepeval/metrics/metric.py                        |  4 ----
 deepeval/test_utils.py                            |  4 ++--
 .../measuring_llm_performance/answer_relevancy.md | 12 +++++++++++-
 .../factual_consistency.md                        | 15 ++++++++++++++-
 5 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/deepeval/metrics/answer_relevancy.py b/deepeval/metrics/answer_relevancy.py
index 129201cd3..dcad67ed4 100644
--- a/deepeval/metrics/answer_relevancy.py
+++ b/deepeval/metrics/answer_relevancy.py
@@ -1,3 +1,4 @@
+import asyncio
 from .metric import Metric
 
 
@@ -8,6 +9,18 @@ def __init__(self, success_threshold: float = 0.5):
 
         self.encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2")
 
+    def __call__(self, query: str, answer: str):
+        score = self.measure(query, answer)
+        if self._is_send_okay():
+            asyncio.create_task(
+                self._send_to_server(
+                    entailment_score=score,
+                    query=query,
+                    output=answer,
+                )
+            )
+        return score
+
     def measure(self, query, answer: str) -> float:
         score = self.encoder.predict([query, answer])
         self.success = score > self.success_threshold
diff --git a/deepeval/metrics/metric.py b/deepeval/metrics/metric.py
index c95804202..7f7a9a57b 100644
--- a/deepeval/metrics/metric.py
+++ b/deepeval/metrics/metric.py
@@ -12,10 +12,6 @@
 
 
 class Metric:
-    def __call__(self, *args, **kwargs):
-        result = self.measure(*args, **kwargs)
-        return result
-
     @abstractmethod
     def measure(self, output, expected_output, query: Optional[str] = None):
         pass
diff --git a/deepeval/test_utils.py b/deepeval/test_utils.py
index 4963d755c..5c2d560b3 100644
--- a/deepeval/test_utils.py
+++ b/deepeval/test_utils.py
@@ -29,14 +29,14 @@ def assert_llm_output(
     assert metric.is_successful(), metric.__class__.__name__ + " was unsuccessful."
 
 
-def assert_factual_consistency(output: str, context: str):
+def assert_factual_consistency(output: str, context: str, success_threshold: float=0.3):
     """Assert that the output is factually consistent with the context."""
 
     class FactualConsistency(EntailmentScoreMetric):
         def __name__(self):
             return "Factual Consistency"
 
-    metric = FactualConsistency()
+    metric = FactualConsistency(minimum_score=success_threshold)
     score = metric(context, output)
 
     assert metric.is_successful(), metric.__class__.__name__ + " was unsuccessful."
diff --git a/docs/docs/measuring_llm_performance/answer_relevancy.md b/docs/docs/measuring_llm_performance/answer_relevancy.md
index 625709448..9f6c1adca 100644
--- a/docs/docs/measuring_llm_performance/answer_relevancy.md
+++ b/docs/docs/measuring_llm_performance/answer_relevancy.md
@@ -2,6 +2,8 @@
 
 For question-answering applications, we provide a simple interface for ensuring question-answering relevancy.
 
+## Assert Answer Relevancy
+
 ```python
 from deepeval.test_utils import assert_answer_relevancy
 query = "What is Python?"
@@ -9,7 +11,11 @@ answer = "Python is a programming language?"
 assert_answer_relevancy(query, answer, success_threshold=0.5)
 ```
 
-## Using answer relevancy as a metric
+### Parameters
+
+- `success_threshold` refers to the minimum score for this to be considered relevant
+
+## Answer Relevancy As A Metric
 
 If you would instead like a score of how relevant an answer is to a query, simply call the metric class.
 
@@ -23,3 +29,7 @@ scorer.measure(query=query, answer=answer)
 ### Parameters
 
 - `success_threshold` refers to the minimum score for this to be considered relevant
+
+## How It Is Measured
+
+Answer relevancy is measured using deep learning cross-encoder models trained on the MS MARCO dataset (a large-scale dataset of real search-engine queries and passages).
diff --git a/docs/docs/measuring_llm_performance/factual_consistency.md b/docs/docs/measuring_llm_performance/factual_consistency.md
index b9818941c..a68f7a5c1 100644
--- a/docs/docs/measuring_llm_performance/factual_consistency.md
+++ b/docs/docs/measuring_llm_performance/factual_consistency.md
@@ -2,7 +2,7 @@
 
 Factual consistency refers to the accuracy and reliability of information presented in a piece of text, conversation, or any form of communication. It means that the information being conveyed is true, accurate, and aligns with established facts or reality. Factual consistency is crucial because it ensures that the information being shared is reliable and trustworthy. Inaccurate or inconsistent information can lead to misunderstandings, misinformation, and loss of credibility.
 
-## How to use
+## Assert Factual Consistency
 
 DeepEval offers an opinionated method for factual consistency based on entailment score.
 
@@ -14,11 +14,24 @@ assert_factual_consistency(
 )
 ```
 
+### Parameters
+
 Diving into the arguments for `assert_factual_consistency`:
 
 - `output` - the LLM generated text
 - `context` - the text from which the LLM is supposed to reason and derive conclusions from
 
+## Factual Consistency As A Metric
+
+If you would instead like a score of how factually consistent an output is with its context, simply call the metric class.
+
+```python
+from deepeval.metrics.entailment_metric import EntailmentScoreMetric
+scorer = EntailmentScoreMetric(minimum_score=0.3)
+scorer.measure(context, output)
+# Returns a floating point number between 0 and 1
+```
+
 ### How it is measured
 
 Factual consistency is measured using natural language inference models based on the output score of the entailment class that compare the ground truth and the context from which the ground truth is done.

From dc54144f6d414fac7958bf894c18b0ae9dfd8012 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Sun, 20 Aug 2023 18:51:56 +1000
Subject: [PATCH 5/5] reformat

---
 deepeval/test_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deepeval/test_utils.py b/deepeval/test_utils.py
index 5c2d560b3..877c1bca1 100644
--- a/deepeval/test_utils.py
+++ b/deepeval/test_utils.py
@@ -29,7 +29,9 @@ def assert_llm_output(
     assert metric.is_successful(), metric.__class__.__name__ + " was unsuccessful."
 
 
-def assert_factual_consistency(output: str, context: str, success_threshold: float=0.3):
+def assert_factual_consistency(
+    output: str, context: str, success_threshold: float = 0.3
+):
     """Assert that the output is factually consistent with the context."""
 
     class FactualConsistency(EntailmentScoreMetric):
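
Taken together, the series leaves `deepeval.test_utils` with two threshold-based assertions: `assert_answer_relevancy` and `assert_factual_consistency`. The snippet below is a minimal sketch, not part of the patches themselves, of how the two could be combined in a single pytest-style check; the example strings and thresholds are illustrative only and assume `deepeval` and its `sentence-transformers` dependency are installed.

```python
# Illustrative only: exercises the assertions added in this patch series.
# The query/answer/context strings and the thresholds are made-up examples.
from deepeval.test_utils import assert_answer_relevancy, assert_factual_consistency


def test_qa_output():
    query = "What is Python?"
    answer = "Python is a high-level programming language."
    context = "Python is a high-level, general-purpose programming language."

    # Cross-encoder relevancy score must exceed success_threshold.
    assert_answer_relevancy(query, answer, success_threshold=0.5)

    # Entailment score of the answer against its context must clear success_threshold.
    assert_factual_consistency(answer, context, success_threshold=0.3)
```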