Hotfix/fix conceptual similarity threshold #133

Merged · 5 commits · Sep 25, 2023

1 change: 1 addition & 0 deletions .github/workflows/test-no-login.yml
@@ -37,5 +37,6 @@ jobs:
        env:
          python-version: ${{ matrix.python-version }}
        run: |
+          python tests/test_without_pytest.py
           deepeval test run -x tests

5 changes: 3 additions & 2 deletions deepeval/dataset.py
@@ -12,6 +12,7 @@
 from deepeval.run_test import run_test
 from deepeval.metrics.metric import Metric
 from deepeval.test_case import LLMTestCase
+from dataclasses import asdict
 
 
 class EvaluationDataset(UserList):
@@ -230,7 +231,7 @@ def from_dict(
         return cls(test_cases)
 
     def to_dict(self):
-        return [x.dict() for x in self.data]
+        return [asdict(x) for x in self.data]
 
     def to_csv(self, csv_filename: str):
         import pandas as pd
@@ -255,7 +256,7 @@ def sample(self, n: int = 5):
         if len(self.data) <= n:
             n = len(self.data)
         result = random.sample(self.data, n)
-        return [r.dict() for r in result]
+        return [asdict(r) for r in result]
 
     def head(self, n: int = 5):
         return self.data[:n]
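For context on the `to_dict()` and `sample()` changes above: `dataclasses.asdict()` replaces the Pydantic-style `x.dict()` call, which suggests the stored test cases are plain dataclasses. A minimal sketch with a hypothetical stand-in class (not the real `LLMTestCase`, whose fields may differ):

```python
from dataclasses import dataclass, asdict
from typing import Optional


# Hypothetical stand-in for a dataclass-based test case; field names are
# illustrative only, not taken from deepeval.
@dataclass
class FakeTestCase:
    query: str
    output: str
    expected_output: Optional[str] = None


case = FakeTestCase(query="What is FastAPI?", output="A Python web framework.")

# A plain dataclass has no .dict() method, so the old `x.dict()` call would
# raise AttributeError; asdict() performs the equivalent conversion generically.
print(asdict(case))
# {'query': 'What is FastAPI?', 'output': 'A Python web framework.', 'expected_output': None}
```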
2 changes: 1 addition & 1 deletion deepeval/metrics/conceptual_similarity.py
@@ -44,7 +44,7 @@ def __name__(self):
 
 
 def assert_conceptual_similarity(
-    output: str, expected_output: str, minimum_score=0.3
+    output: str, expected_output: str, minimum_score=0.7
 ):
     metric = ConceptualSimilarityMetric(minimum_score=minimum_score)
     test_case = LLMTestCase(output=output, expected_output=expected_output)
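The only change here is the default threshold: `assert_conceptual_similarity` now requires a similarity score of at least 0.7 instead of 0.3 when no `minimum_score` is passed. A usage sketch based on the signature in the diff; the example strings are illustrative, and the scores they produce are not taken from the PR:

```python
from deepeval.metrics.conceptual_similarity import assert_conceptual_similarity

# No explicit threshold: the stricter 0.7 default now applies, so loosely
# related outputs that previously cleared 0.3 may now raise AssertionError.
assert_conceptual_similarity(
    output="FastAPI is a Python web framework for building APIs.",
    expected_output="FastAPI is a framework for building APIs in Python.",
)

# Callers that want the old, looser behaviour can still opt in explicitly,
# as the new tests/test_without_pytest.py does.
assert_conceptual_similarity(
    output="python is a programming language",
    expected_output="Python is a snake.",
    minimum_score=0.3,
)
```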
137 changes: 76 additions & 61 deletions deepeval/run_test.py
@@ -65,6 +65,42 @@ def __lt__(self, other: "TestResult") -> bool:
         return self.score < other.score
 
 
+def create_test_result(
+    test_case: Union[LLMTestCase, SearchTestCase],
+    success: bool,
+    score: float,
+    metric: float,
+) -> TestResult:
+    if isinstance(test_case, LLMTestCase):
+        return TestResult(
+            success=success,
+            score=score,
+            metric_name=metric.__name__,
+            query=test_case.query if test_case.query else "-",
+            output=test_case.output if test_case.output else "-",
+            expected_output=test_case.expected_output
+            if test_case.expected_output
+            else "-",
+            metadata=None,
+            context=test_case.context,
+        )
+    elif isinstance(test_case, SearchTestCase):
+        return TestResult(
+            success=success,
+            score=score,
+            metric_name=metric.__name__,
+            query=test_case.query if test_case.query else "-",
+            output=test_case.output_list if test_case.output_list else "-",
+            expected_output=test_case.golden_list
+            if test_case.golden_list
+            else "-",
+            metadata=None,
+            context="-",
+        )
+    else:
+        raise ValueError("TestCase not supported yet.")
+
+
 def run_test(
     test_cases: Union[TestCase, LLMTestCase, SearchTestCase, List[LLMTestCase]],
     metrics: List[Metric],
@@ -98,68 +134,47 @@ def run_test(
     if isinstance(test_cases, TestCase):
         test_cases = [test_cases]
 
-    test_results = []
-    for test_case in test_cases:
-        for metric in metrics:
-            test_start_time = time.perf_counter()
-
-            @retry(
-                max_retries=max_retries, delay=delay, min_success=min_success
-            )
-            def measure_metric():
-                score = metric.measure(test_case)
-                success = metric.is_successful()
-                if isinstance(test_case, LLMTestCase):
-                    test_result = TestResult(
-                        success=success,
-                        score=score,
-                        metric_name=metric.__name__,
-                        query=test_case.query if test_case.query else "-",
-                        output=test_case.output if test_case.output else "-",
-                        expected_output=test_case.expected_output
-                        if test_case.expected_output
-                        else "-",
-                        metadata=None,
-                        context=test_case.context,
-                    )
-                elif isinstance(test_case, SearchTestCase):
-                    test_result = TestResult(
-                        success=success,
-                        score=score,
-                        metric_name=metric.__name__,
-                        query=test_case.query if test_case.query else "-",
-                        output=test_case.output_list
-                        if test_case.output_list
-                        else "-",
-                        expected_output=test_case.golden_list
-                        if test_case.golden_list
-                        else "-",
-                        metadata=None,
-                        context="-",
-                    )
-                else:
-                    raise ValueError("TestCase not supported yet.")
-                test_results.append(test_result)
-
-                # Load the test_run and add the test_case regardless of the success of the test
-                test_end_time = time.perf_counter()
-                run_duration = test_end_time - test_start_time
-                if os.getenv(PYTEST_RUN_ENV_VAR):
-                    test_run = TestRun.load()
-                    metric.score = score
-                    test_run.add_llm_test_case(
-                        test_case=test_case,
-                        metrics=[metric],
-                        run_duration=run_duration,
-                    )
-                    test_run.save()
-
-                if raise_error:
-                    assert (
-                        metric.is_successful()
-                    ), f"{metric.__name__} failed. Score: {score}."
-
-            measure_metric()
+    test_results = []
+    for test_case in test_cases:
+        failed_metrics = []
+        for metric in metrics:
+            test_start_time = time.perf_counter()
+
+            @retry(
+                max_retries=max_retries,
+                delay=delay,
+                min_success=min_success,
+            )
+            def measure_metric():
+                score = metric.measure(test_case)
+                success = metric.is_successful()
+                test_result = create_test_result(
+                    test_case, success, score, metric
+                )
+                test_results.append(test_result)
+
+                # Load the test_run and add the test_case regardless of the success of the test
+                test_end_time = time.perf_counter()
+                run_duration = test_end_time - test_start_time
+                if os.getenv(PYTEST_RUN_ENV_VAR):
+                    test_run = TestRun.load()
+                    metric.score = score
+                    test_run.add_llm_test_case(
+                        test_case=test_case,
+                        metrics=[metric],
+                        run_duration=run_duration,
+                    )
+                    test_run.save()
+
+                if not success:
+                    failed_metrics.append((metric.__name__, score))
+
+            measure_metric()
+
+    if raise_error and failed_metrics:
+        raise AssertionError(
+            f"Metrics {', '.join([f'{name} (Score: {score})' for name, score in failed_metrics])} failed."
+        )
 
     return test_results

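Behaviourally, the refactor above means `run_test` now records every metric result, collects failures per test case into `failed_metrics`, and raises a single `AssertionError` listing all failed metrics at the end, instead of asserting inside each measurement. A usage sketch assuming `raise_error` is a keyword argument of `run_test` (the diff only shows it being checked) and with an illustrative metric and strings:

```python
from deepeval.metrics.conceptual_similarity import ConceptualSimilarityMetric
from deepeval.run_test import run_test
from deepeval.test_case import LLMTestCase

metric = ConceptualSimilarityMetric(minimum_score=0.7)
test_case = LLMTestCase(
    output="python is a programming language",
    expected_output="Python is a snake.",
)

try:
    # Failures are gathered per test case and reported together, e.g.
    # "Metrics <metric name> (Score: <score>) failed."
    run_test(test_case, metrics=[metric], raise_error=True)
except AssertionError as error:
    print(error)
```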
5 changes: 3 additions & 2 deletions docs/docs/quickstart/quickstart.md
@@ -87,11 +87,12 @@ Now we often don't want to write our own tests or at least be given a variety of
 You can automatically create tests in DeepEval in just a few lines of code:
 
 ```python
-from deepeval.dataset import create_evaluation_dataset
+from deepeval.dataset import create_evaluation_query_answer_pairs
 
 dataset = create_evaluation_query_answer_pairs(
     openai_api_key="<YOUR_OPENAI_API_KEY>",
     context="FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.",
-    n=3
+    n=3,
 )
 
 ```
12 changes: 12 additions & 0 deletions examples/create_tests.py
@@ -0,0 +1,12 @@
+import os
+from deepeval.dataset import (
+    create_evaluation_query_answer_pairs,
+    EvaluationDataset,
+)
+
+dataset: EvaluationDataset = create_evaluation_query_answer_pairs(
+    openai_api_key=os.environ["OPENAI_API_KEY"],
+    context="FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.",
+    n=3,
+)
+dataset.review()
34 changes: 34 additions & 0 deletions examples/review_datasets.py
@@ -0,0 +1,34 @@
+# Define your completion protocol
+import openai
+from deepeval.dataset import EvaluationDataset
+from deepeval.metrics.factual_consistency import FactualConsistencyMetric
+
+ds = EvaluationDataset.from_csv(
+    "review-test.csv",
+    query_column="query",
+    expected_output_column="expected_output",
+)
+print(ds.sample())
+
+
+def generate_chatgpt_output(query: str):
+    response = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "assistant",
+                "content": "The customer success phone line is 1200-231-231 and the customer success state is in Austin.",
+            },
+            {"role": "user", "content": query},
+        ],
+    )
+    expected_output = response.choices[0].message.content
+    return expected_output
+
+
+factual_consistency_metric = FactualConsistencyMetric()
+
+ds.run_evaluation(
+    completion_fn=generate_chatgpt_output, metrics=[factual_consistency_metric]
+)
9 changes: 9 additions & 0 deletions tests/test_without_pytest.py
@@ -0,0 +1,9 @@
"""Test to make sure assert statements can work
"""
from deepeval.metrics.conceptual_similarity import assert_conceptual_similarity

assert_conceptual_similarity(
output="python is a programming language",
expected_output="Python is a snake.",
minimum_score=0.3,
)