Merge pull request #1 from lesar64/faithfullness_limit
Faithfullness limit
lesar64 authored Sep 25, 2024
2 parents 0938e67 + 40c3f54 commit 84f146d
Showing 11 changed files with 1,227 additions and 1,161 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -248,7 +248,7 @@ dataset.evaluate([answer_relevancy_metric])
 
 # Real-time Evaluations on Confident AI
 
-We offer a [free web platform](https://app.confident-ai.com) for you to:
+We offer a [web platform](https://app.confident-ai.com) for you to:
 
 1. Log and view all the test results / metrics data from DeepEval's test runs.
 2. Debug evaluation results via LLM traces.
2 changes: 1 addition & 1 deletion deepeval/metrics/contextual_relevancy/template.py
@@ -36,7 +36,7 @@ def generate_verdict(text, context):
 **
 IMPORTANT: Please make sure to only return in JSON format.
 Example Context: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. He won the Nobel Prize in 1968. There was a cat."
-Example Input: "When what was some of Einstein's achievements?"
+Example Input: "What were some of Einstein's achievements?"
 Example:
 {{
26 changes: 15 additions & 11 deletions deepeval/metrics/faithfulness/faithfulness.py
@@ -17,7 +17,7 @@
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.faithfulness.template import FaithfulnessTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.faithfulness.schema import *
+from deepeval.metrics.faithfulness.schema import FaithfulnessVerdict, Verdicts, Reason, Truths, Claims
 
 required_params: List[LLMTestCaseParams] = [
     LLMTestCaseParams.INPUT,
@@ -35,6 +35,7 @@ def __init__(
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        limit_count: int = 0,
     ):
         self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = initialize_model(model)
@@ -43,6 +44,7 @@ def __init__(
         self.async_mode = async_mode
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
+        self.limit_count = limit_count
 
     def measure(
         self,
@@ -61,8 +63,8 @@ def measure(
                     self.a_measure(test_case, _show_indicator=False)
                 )
             else:
-                self.truths = self._generate_truths(test_case.retrieval_context)
-                self.claims = self._generate_claims(test_case.actual_output)
+                self.truths = self._generate_truths(test_case.retrieval_context, self.limit_count)
+                self.claims = self._generate_claims(test_case.actual_output, self.limit_count)
                 self.verdicts = self._generate_verdicts()
                 self.score = self._calculate_score()
                 self.reason = self._generate_reason()
@@ -228,9 +230,10 @@ def _generate_verdicts(self) -> List[FaithfulnessVerdict]:
         ]
         return verdicts
 
-    async def _a_generate_truths(self, retrieval_context: str) -> List[str]:
+    async def _a_generate_truths(self, retrieval_context: str, limit_count: int = 0) -> List[str]:
         prompt = FaithfulnessTemplate.generate_truths(
-            text="\n\n".join(retrieval_context)
+            text="\n\n".join(retrieval_context),
+            limit_count=limit_count
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt)
@@ -246,9 +249,10 @@ async def _a_generate_truths(self, retrieval_context: str) -> List[str]:
                 data = trimAndLoadJson(res, self)
                 return data["truths"]
 
-    def _generate_truths(self, retrieval_context: str) -> List[str]:
+    def _generate_truths(self, retrieval_context: str, limit_count: int = 0) -> List[str]:
         prompt = FaithfulnessTemplate.generate_truths(
-            text="\n\n".join(retrieval_context)
+            text="\n\n".join(retrieval_context),
+            limit_count=limit_count
         )
         if self.using_native_model:
             res, cost = self.model.generate(prompt)
@@ -264,8 +268,8 @@ def _generate_truths(self, retrieval_context: str) -> List[str]:
                 data = trimAndLoadJson(res, self)
                 return data["truths"]
 
-    async def _a_generate_claims(self, actual_output: str) -> List[str]:
-        prompt = FaithfulnessTemplate.generate_claims(text=actual_output)
+    async def _a_generate_claims(self, actual_output: str, limit_count: int = 0) -> List[str]:
+        prompt = FaithfulnessTemplate.generate_claims(text=actual_output, limit_count=limit_count)
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt)
             self.evaluation_cost += cost
@@ -280,8 +284,8 @@ async def _a_generate_claims(self, actual_output: str) -> List[str]:
                 data = trimAndLoadJson(res, self)
                 return data["claims"]
 
-    def _generate_claims(self, actual_output: str) -> List[str]:
-        prompt = FaithfulnessTemplate.generate_claims(text=actual_output)
+    def _generate_claims(self, actual_output: str, limit_count: int = 0) -> List[str]:
+        prompt = FaithfulnessTemplate.generate_claims(text=actual_output, limit_count=limit_count)
         if self.using_native_model:
             res, cost = self.model.generate(prompt)
             self.evaluation_cost += cost
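
A minimal usage sketch of how the new limit_count parameter threads through FaithfulnessMetric; this is not part of the diff, and the test-case values are invented for illustration:

    from deepeval.metrics import FaithfulnessMetric
    from deepeval.test_case import LLMTestCase

    test_case = LLMTestCase(
        input="What were some of Einstein's achievements?",
        actual_output="Einstein won the Nobel Prize in 1968 for the photoelectric effect.",
        retrieval_context=[
            "Einstein won the Nobel Prize for his discovery of the photoelectric effect.",
            "He won the Nobel Prize in 1968.",
        ],
    )

    # limit_count=5 asks the evaluation LLM for the 5 most important truths/claims;
    # the default of 0 keeps the previous uncapped "most important" behaviour.
    metric = FaithfulnessMetric(threshold=0.7, limit_count=5)
    metric.measure(test_case)
    print(metric.score, metric.reason)
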
17 changes: 13 additions & 4 deletions deepeval/metrics/faithfulness/template.py
@@ -1,7 +1,10 @@
 class FaithfulnessTemplate:
     @staticmethod
-    def generate_claims(text):
-        return f"""Based on the given text, please generate a comprehensive list of FACTUAL claims that can inferred from the provided text.
+    def generate_claims(text, limit_count):
+        if limit_count>0:
+            limit = f"the {limit_count} most important"
+        else: limit = "the most important"
+        return f"""Based on the given text, please generate a comprehensive list of {limit} FACTUAL claims that can inferred from the provided text.
 
 Example:
 Example Text:
@@ -29,8 +32,14 @@ def generate_claims(text):
 """
 
     @staticmethod
-    def generate_truths(text):
-        return f"""Based on the given text, please generate a comprehensive list of FACTUAL, undisputed truths that can inferred from the provided text.
+    def generate_truths(text, limit_count):
+        if limit_count>0:
+            limit = f"the {limit_count} most important"
+            limit_addition = " per document"
+        else:
+            limit = "the most important"
+            limit_addition = ""
+        return f"""Based on the given text, please generate a comprehensive list of {limit} FACTUAL, undisputed truths{limit_addition}, that can inferred from the provided text.
 
 Example:
 Example Text:
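
A quick sketch (not from this diff) of how the limit_count branch changes the rendered prompt text; the input string is invented:

    from deepeval.metrics.faithfulness.template import FaithfulnessTemplate

    capped = FaithfulnessTemplate.generate_claims("Einstein won the Nobel Prize in 1968.", limit_count=3)
    uncapped = FaithfulnessTemplate.generate_claims("Einstein won the Nobel Prize in 1968.", limit_count=0)

    # With limit_count=3 the instruction asks for "the 3 most important" claims;
    # with 0 it falls back to the generic "the most important".
    assert "the 3 most important" in capped
    assert "the most important" in uncapped
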
23 changes: 16 additions & 7 deletions deepeval/progress_context.py
@@ -1,10 +1,12 @@
-from rich.console import Console
 from rich.progress import Progress, SpinnerColumn, TextColumn
-from contextlib import contextmanager
+from tqdm.asyncio import tqdm as async_tqdm_bar
 from typing import Optional, Generator
-import sys
+from contextlib import contextmanager
+from tqdm import tqdm as tqdm_bar
+from rich.console import Console
 import tqdm
+import sys
+
 
 from deepeval.telemetry import capture_synthesizer_run
 
@@ -32,6 +34,7 @@ def synthesizer_progress_context(
     max_generations: str = None,
     use_case: str = "QA",
     progress_bar: Optional[tqdm.std.tqdm] = None,
+    async_mode: bool = False,
 ) -> Generator[Optional[tqdm.std.tqdm], None, None]:
     with capture_synthesizer_run(max_generations, method):
         if embedder is None:
@@ -40,9 +43,15 @@
             description = f"✨ Generating up to {max_generations} goldens using DeepEval (using {evaluation_model} and {embedder}, use case={use_case}, method={method})"
         # Direct output to stderr, using TQDM progress bar for visual feedback
         if not progress_bar:
-            with tqdm_bar(
-                total=max_generations, desc=description, file=sys.stderr
-            ) as progress_bar:
-                yield progress_bar  # Pass progress bar to use in outer loop
+            if async_mode:
+                with async_tqdm_bar(
+                    total=max_generations, desc=description, file=sys.stderr
+                ) as progress_bar:
+                    yield progress_bar
+            else:
+                with tqdm_bar(
+                    total=max_generations, desc=description, file=sys.stderr
+                ) as progress_bar:
+                    yield progress_bar
         else:
             yield progress_bar
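
And a hypothetical call site (not in the commit) for the new async_mode flag. The leading keyword names (method, evaluation_model, embedder) are assumptions inferred from how the function body references them:

    from deepeval.progress_context import synthesizer_progress_context

    with synthesizer_progress_context(
        method="default",          # assumed parameter, per capture_synthesizer_run(max_generations, method)
        evaluation_model="gpt-4o", # assumed parameter, per the description f-string
        embedder=None,
        max_generations=10,
        async_mode=True,  # yields a tqdm.asyncio bar instead of the plain tqdm bar
    ) as progress_bar:
        for _ in range(10):
            progress_bar.update(1)  # same update API for either bar type
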