Merge pull request #1 from lesar64/faithfullness_limit
Faithfullness limit
lesar64 authored Sep 25, 2024
2 parents 0938e67 + 40c3f54 commit 84f146d
Showing 11 changed files with 1,227 additions and 1,161 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -248,7 +248,7 @@ dataset.evaluate([answer_relevancy_metric])
 
 # Real-time Evaluations on Confident AI
 
-We offer a [free web platform](https://app.confident-ai.com) for you to:
+We offer a [web platform](https://app.confident-ai.com) for you to:
 
 1. Log and view all the test results / metrics data from DeepEval's test runs.
 2. Debug evaluation results via LLM traces.
2 changes: 1 addition & 1 deletion deepeval/metrics/contextual_relevancy/template.py
@@ -36,7 +36,7 @@ def generate_verdict(text, context):
 **
 IMPORTANT: Please make sure to only return in JSON format.
 Example Context: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. He won the Nobel Prize in 1968. There was a cat."
-Example Input: "When what was some of Einstein's achievements?"
+Example Input: "What were some of Einstein's achievements?"
 Example:
 {{
26 changes: 15 additions & 11 deletions deepeval/metrics/faithfulness/faithfulness.py
@@ -17,7 +17,7 @@
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.faithfulness.template import FaithfulnessTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.faithfulness.schema import *
+from deepeval.metrics.faithfulness.schema import FaithfulnessVerdict, Verdicts, Reason, Truths, Claims
 
 required_params: List[LLMTestCaseParams] = [
     LLMTestCaseParams.INPUT,
@@ -35,6 +35,7 @@ def __init__(
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        limit_count: int = 0,
     ):
         self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = initialize_model(model)
@@ -43,6 +44,7 @@ def __init__(
         self.async_mode = async_mode
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
+        self.limit_count = limit_count
 
     def measure(
         self,
@@ -61,8 +63,8 @@ def measure(
                     self.a_measure(test_case, _show_indicator=False)
                 )
             else:
-                self.truths = self._generate_truths(test_case.retrieval_context)
-                self.claims = self._generate_claims(test_case.actual_output)
+                self.truths = self._generate_truths(test_case.retrieval_context, self.limit_count)
+                self.claims = self._generate_claims(test_case.actual_output, self.limit_count)
                 self.verdicts = self._generate_verdicts()
                 self.score = self._calculate_score()
                 self.reason = self._generate_reason()
@@ -228,9 +230,10 @@ def _generate_verdicts(self) -> List[FaithfulnessVerdict]:
         ]
         return verdicts
 
-    async def _a_generate_truths(self, retrieval_context: str) -> List[str]:
+    async def _a_generate_truths(self, retrieval_context: str, limit_count: int = 0) -> List[str]:
         prompt = FaithfulnessTemplate.generate_truths(
-            text="\n\n".join(retrieval_context)
+            text="\n\n".join(retrieval_context),
+            limit_count=limit_count
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt)
@@ -246,9 +249,10 @@ async def _a_generate_truths(self, retrieval_context: str) -> List[str]:
                 data = trimAndLoadJson(res, self)
                 return data["truths"]
 
-    def _generate_truths(self, retrieval_context: str) -> List[str]:
+    def _generate_truths(self, retrieval_context: str, limit_count: int = 0) -> List[str]:
         prompt = FaithfulnessTemplate.generate_truths(
-            text="\n\n".join(retrieval_context)
+            text="\n\n".join(retrieval_context),
+            limit_count=limit_count
         )
         if self.using_native_model:
             res, cost = self.model.generate(prompt)
@@ -264,8 +268,8 @@ def _generate_truths(self, retrieval_context: str) -> List[str]:
                 data = trimAndLoadJson(res, self)
                 return data["truths"]
 
-    async def _a_generate_claims(self, actual_output: str) -> List[str]:
-        prompt = FaithfulnessTemplate.generate_claims(text=actual_output)
+    async def _a_generate_claims(self, actual_output: str, limit_count: int = 0) -> List[str]:
+        prompt = FaithfulnessTemplate.generate_claims(text=actual_output, limit_count=limit_count)
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt)
             self.evaluation_cost += cost
@@ -280,8 +284,8 @@ async def _a_generate_claims(self, actual_output: str) -> List[str]:
                 data = trimAndLoadJson(res, self)
                 return data["claims"]
 
-    def _generate_claims(self, actual_output: str) -> List[str]:
-        prompt = FaithfulnessTemplate.generate_claims(text=actual_output)
+    def _generate_claims(self, actual_output: str, limit_count: int = 0) -> List[str]:
+        prompt = FaithfulnessTemplate.generate_claims(text=actual_output, limit_count=limit_count)
         if self.using_native_model:
             res, cost = self.model.generate(prompt)
             self.evaluation_cost += cost
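
A minimal usage sketch of how the new limit_count parameter threads through FaithfulnessMetric; this is not part of the diff, and the test-case values are invented for illustration:

    from deepeval.metrics import FaithfulnessMetric
    from deepeval.test_case import LLMTestCase

    test_case = LLMTestCase(
        input="What were some of Einstein's achievements?",
        actual_output="Einstein won the Nobel Prize in 1968 for the photoelectric effect.",
        retrieval_context=[
            "Einstein won the Nobel Prize for his discovery of the photoelectric effect.",
            "He won the Nobel Prize in 1968.",
        ],
    )

    # limit_count=5 asks the evaluation LLM for the 5 most important truths/claims;
    # the default of 0 keeps the previous uncapped "most important" behaviour.
    metric = FaithfulnessMetric(threshold=0.7, limit_count=5)
    metric.measure(test_case)
    print(metric.score, metric.reason)
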
17 changes: 13 additions & 4 deletions deepeval/metrics/faithfulness/template.py
@@ -1,7 +1,10 @@
 class FaithfulnessTemplate:
     @staticmethod
-    def generate_claims(text):
-        return f"""Based on the given text, please generate a comprehensive list of FACTUAL claims that can inferred from the provided text.
+    def generate_claims(text, limit_count):
+        if limit_count>0:
+            limit = f"the {limit_count} most important"
+        else: limit = "the most important"
+        return f"""Based on the given text, please generate a comprehensive list of {limit} FACTUAL claims that can inferred from the provided text.
 
 Example:
 Example Text:
@@ -29,8 +32,14 @@ def generate_claims(text):
 """
 
     @staticmethod
-    def generate_truths(text):
-        return f"""Based on the given text, please generate a comprehensive list of FACTUAL, undisputed truths that can inferred from the provided text.
+    def generate_truths(text, limit_count):
+        if limit_count>0:
+            limit = f"the {limit_count} most important"
+            limit_addition = " per document"
+        else:
+            limit = "the most important"
+            limit_addition = ""
+        return f"""Based on the given text, please generate a comprehensive list of {limit} FACTUAL, undisputed truths{limit_addition}, that can inferred from the provided text.
 
 Example:
 Example Text:
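
A quick sketch (not from this diff) of how the limit_count branch changes the rendered prompt text; the input string is invented:

    from deepeval.metrics.faithfulness.template import FaithfulnessTemplate

    capped = FaithfulnessTemplate.generate_claims("Einstein won the Nobel Prize in 1968.", limit_count=3)
    uncapped = FaithfulnessTemplate.generate_claims("Einstein won the Nobel Prize in 1968.", limit_count=0)

    # With limit_count=3 the instruction asks for "the 3 most important" claims;
    # with 0 it falls back to the generic "the most important".
    assert "the 3 most important" in capped
    assert "the most important" in uncapped
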
23 changes: 16 additions & 7 deletions deepeval/progress_context.py
@@ -1,10 +1,12 @@
-from rich.console import Console
 from rich.progress import Progress, SpinnerColumn, TextColumn
-from contextlib import contextmanager
+from tqdm.asyncio import tqdm as async_tqdm_bar
 from typing import Optional, Generator
-import sys
+from contextlib import contextmanager
+from tqdm import tqdm as tqdm_bar
+from rich.console import Console
 import tqdm
+import sys
+
 
 from deepeval.telemetry import capture_synthesizer_run
 
@@ -32,6 +34,7 @@ def synthesizer_progress_context(
     max_generations: str = None,
     use_case: str = "QA",
     progress_bar: Optional[tqdm.std.tqdm] = None,
+    async_mode: bool = False,
 ) -> Generator[Optional[tqdm.std.tqdm], None, None]:
     with capture_synthesizer_run(max_generations, method):
         if embedder is None:
@@ -40,9 +43,15 @@
             description = f"✨ Generating up to {max_generations} goldens using DeepEval (using {evaluation_model} and {embedder}, use case={use_case}, method={method})"
         # Direct output to stderr, using TQDM progress bar for visual feedback
         if not progress_bar:
-            with tqdm_bar(
-                total=max_generations, desc=description, file=sys.stderr
-            ) as progress_bar:
-                yield progress_bar  # Pass progress bar to use in outer loop
+            if async_mode:
+                with async_tqdm_bar(
+                    total=max_generations, desc=description, file=sys.stderr
+                ) as progress_bar:
+                    yield progress_bar
+            else:
+                with tqdm_bar(
+                    total=max_generations, desc=description, file=sys.stderr
+                ) as progress_bar:
+                    yield progress_bar
         else:
             yield progress_bar
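
And a hypothetical call site (not in the commit) for the new async_mode flag. The leading keyword names (method, evaluation_model, embedder) are assumptions inferred from how the function body references them:

    from deepeval.progress_context import synthesizer_progress_context

    with synthesizer_progress_context(
        method="default",          # assumed parameter, per capture_synthesizer_run(max_generations, method)
        evaluation_model="gpt-4o", # assumed parameter, per the description f-string
        embedder=None,
        max_generations=10,
        async_mode=True,  # yields a tqdm.asyncio bar instead of the plain tqdm bar
    ) as progress_bar:
        for _ in range(10):
            progress_bar.update(1)  # same update API for either bar type
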