diff --git a/.github/workflows/test-no-login.yml b/.github/workflows/test-no-login.yml
index d283b5b7a..74de62b9b 100644
--- a/.github/workflows/test-no-login.yml
+++ b/.github/workflows/test-no-login.yml
@@ -37,5 +37,6 @@ jobs:
         env:
           python-version: ${{ matrix.python-version }}
         run: |
+          python tests/test_without_pytest.py
           deepeval test run -x tests
diff --git a/deepeval/dataset.py b/deepeval/dataset.py
index 994a6114f..bc635915a 100644
--- a/deepeval/dataset.py
+++ b/deepeval/dataset.py
@@ -12,6 +12,7 @@
 from deepeval.run_test import run_test
 from deepeval.metrics.metric import Metric
 from deepeval.test_case import LLMTestCase
+from dataclasses import asdict


 class EvaluationDataset(UserList):
@@ -230,7 +231,7 @@ def from_dict(
         return cls(test_cases)

     def to_dict(self):
-        return [x.dict() for x in self.data]
+        return [asdict(x) for x in self.data]

     def to_csv(self, csv_filename: str):
         import pandas as pd
@@ -255,7 +256,7 @@ def sample(self, n: int = 5):
         if len(self.data) <= n:
             n = len(self.data)
         result = random.sample(self.data, n)
-        return [r.dict() for r in result]
+        return [asdict(r) for r in result]

     def head(self, n: int = 5):
         return self.data[:n]
diff --git a/deepeval/metrics/conceptual_similarity.py b/deepeval/metrics/conceptual_similarity.py
index 477a7fd29..f7c86b2f2 100644
--- a/deepeval/metrics/conceptual_similarity.py
+++ b/deepeval/metrics/conceptual_similarity.py
@@ -44,7 +44,7 @@ def __name__(self):


 def assert_conceptual_similarity(
-    output: str, expected_output: str, minimum_score=0.3
+    output: str, expected_output: str, minimum_score=0.7
 ):
     metric = ConceptualSimilarityMetric(minimum_score=minimum_score)
     test_case = LLMTestCase(output=output, expected_output=expected_output)
diff --git a/deepeval/run_test.py b/deepeval/run_test.py
index 22329c6bc..f1ff2a3b9 100644
--- a/deepeval/run_test.py
+++ b/deepeval/run_test.py
@@ -65,6 +65,42 @@ def __lt__(self, other: "TestResult") -> bool:
         return self.score < other.score


+def create_test_result(
+    test_case: Union[LLMTestCase, SearchTestCase],
+    success: bool,
+    score: float,
+    metric: Metric,
+) -> TestResult:
+    if isinstance(test_case, LLMTestCase):
+        return TestResult(
+            success=success,
+            score=score,
+            metric_name=metric.__name__,
+            query=test_case.query if test_case.query else "-",
+            output=test_case.output if test_case.output else "-",
+            expected_output=test_case.expected_output
+            if test_case.expected_output
+            else "-",
+            metadata=None,
+            context=test_case.context,
+        )
+    elif isinstance(test_case, SearchTestCase):
+        return TestResult(
+            success=success,
+            score=score,
+            metric_name=metric.__name__,
+            query=test_case.query if test_case.query else "-",
+            output=test_case.output_list if test_case.output_list else "-",
+            expected_output=test_case.golden_list
+            if test_case.golden_list
+            else "-",
+            metadata=None,
+            context="-",
+        )
+    else:
+        raise ValueError("TestCase not supported yet.")
+
+
 def run_test(
     test_cases: Union[TestCase, LLMTestCase, SearchTestCase, List[LLMTestCase]],
     metrics: List[Metric],
@@ -98,68 +134,47 @@ def run_test(
     if isinstance(test_cases, TestCase):
         test_cases = [test_cases]

-    test_results = []
-    for test_case in test_cases:
-        for metric in metrics:
-            test_start_time = time.perf_counter()
-
-            @retry(
-                max_retries=max_retries, delay=delay, min_success=min_success
-            )
-            def measure_metric():
-                score = metric.measure(test_case)
-                success = metric.is_successful()
-                if isinstance(test_case, LLMTestCase):
-                    test_result = TestResult(
-                        success=success,
-                        score=score,
-                        metric_name=metric.__name__,
-                        query=test_case.query if test_case.query else "-",
-                        output=test_case.output if test_case.output else "-",
-                        expected_output=test_case.expected_output
-                        if test_case.expected_output
-                        else "-",
-                        metadata=None,
-                        context=test_case.context,
-                    )
-                elif isinstance(test_case, SearchTestCase):
-                    test_result = TestResult(
-                        success=success,
-                        score=score,
-                        metric_name=metric.__name__,
-                        query=test_case.query if test_case.query else "-",
-                        output=test_case.output_list
-                        if test_case.output_list
-                        else "-",
-                        expected_output=test_case.golden_list
-                        if test_case.golden_list
-                        else "-",
-                        metadata=None,
-                        context="-",
+    test_results = []
+    for test_case in test_cases:
+        failed_metrics = []
+        for metric in metrics:
+            test_start_time = time.perf_counter()
+
+            @retry(
+                max_retries=max_retries,
+                delay=delay,
+                min_success=min_success,
+            )
+            def measure_metric():
+                score = metric.measure(test_case)
+                success = metric.is_successful()
+                test_result = create_test_result(
+                    test_case, success, score, metric
                 )
-                else:
-                    raise ValueError("TestCase not supported yet.")
-                test_results.append(test_result)
-
-                # Load the test_run and add the test_case regardless of the success of the test
-                test_end_time = time.perf_counter()
-                run_duration = test_end_time - test_start_time
-                if os.getenv(PYTEST_RUN_ENV_VAR):
-                    test_run = TestRun.load()
-                    metric.score = score
-                    test_run.add_llm_test_case(
-                        test_case=test_case,
-                        metrics=[metric],
-                        run_duration=run_duration,
-                    )
-                    test_run.save()
-
-                if raise_error:
-                    assert (
-                        metric.is_successful()
-                    ), f"{metric.__name__} failed. Score: {score}."
-
-            measure_metric()
+                test_results.append(test_result)
+
+                # Load the test_run and add the test_case regardless of the success of the test
+                test_end_time = time.perf_counter()
+                run_duration = test_end_time - test_start_time
+                if os.getenv(PYTEST_RUN_ENV_VAR):
+                    test_run = TestRun.load()
+                    metric.score = score
+                    test_run.add_llm_test_case(
+                        test_case=test_case,
+                        metrics=[metric],
+                        run_duration=run_duration,
+                    )
+                    test_run.save()
+
+                if not success:
+                    failed_metrics.append((metric.__name__, score))
+
+            measure_metric()
+
+        if raise_error and failed_metrics:
+            raise AssertionError(
+                f"Metrics {', '.join([f'{name} (Score: {score})' for name, score in failed_metrics])} failed."
+            )

     return test_results
diff --git a/docs/docs/quickstart/quickstart.md b/docs/docs/quickstart/quickstart.md
index 52c760f65..5b97746db 100644
--- a/docs/docs/quickstart/quickstart.md
+++ b/docs/docs/quickstart/quickstart.md
@@ -87,11 +87,12 @@ Now we often don't want to write our own tests or at least be given a variety of

 You can automatically create tests in DeepEval in just a few lines of code:

 ```python
-from deepeval.dataset import create_evaluation_dataset
+from deepeval.dataset import create_evaluation_query_answer_pairs
+
 dataset = create_evaluation_query_answer_pairs(
     openai_api_key="",
     context="FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.",
-    n=3
+    n=3,
 )
 ```
diff --git a/examples/create_tests.py b/examples/create_tests.py
new file mode 100644
index 000000000..ace24694e
--- /dev/null
+++ b/examples/create_tests.py
@@ -0,0 +1,12 @@
+import os
+from deepeval.dataset import (
+    create_evaluation_query_answer_pairs,
+    EvaluationDataset,
+)
+
+dataset: EvaluationDataset = create_evaluation_query_answer_pairs(
+    openai_api_key=os.environ["OPENAI_API_KEY"],
+    context="FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.",
+    n=3,
+)
+dataset.review()
diff --git a/examples/review_datasets.py b/examples/review_datasets.py
new file mode 100644
index 000000000..3b6d382f6
--- /dev/null
+++ b/examples/review_datasets.py
@@ -0,0 +1,34 @@
+# Define your completion protocol
+import openai
+from deepeval.dataset import EvaluationDataset
+from deepeval.metrics.factual_consistency import FactualConsistencyMetric
+
+ds = EvaluationDataset.from_csv(
+    "review-test.csv",
+    query_column="query",
+    expected_output_column="expected_output",
+)
+print(ds.sample())
+
+
+def generate_chatgpt_output(query: str):
+    response = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "assistant",
+                "content": "The customer success phone line is 1200-231-231 and the customer success state is in Austin.",
+            },
+            {"role": "user", "content": query},
+        ],
+    )
+    expected_output = response.choices[0].message.content
+    return expected_output
+
+
+factual_consistency_metric = FactualConsistencyMetric()
+
+ds.run_evaluation(
+    completion_fn=generate_chatgpt_output, metrics=[factual_consistency_metric]
+)
diff --git a/tests/test_without_pytest.py b/tests/test_without_pytest.py
new file mode 100644
index 000000000..8538fbad8
--- /dev/null
+++ b/tests/test_without_pytest.py
@@ -0,0 +1,9 @@
+"""Test to make sure assert statements can work
+"""
+from deepeval.metrics.conceptual_similarity import assert_conceptual_similarity
+
+assert_conceptual_similarity(
+    output="python is a programming language",
+    expected_output="Python is a snake.",
+    minimum_score=0.3,
+)
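
For reference, the `to_dict()` and `sample()` changes in `deepeval/dataset.py` replace per-object `.dict()` calls with the standard-library `dataclasses.asdict`, which only works when the stored test cases are dataclass instances. Below is a minimal, self-contained sketch of that serialization pattern; `ExampleTestCase` is a hypothetical stand-in, not the real `LLMTestCase` definition.

```python
from dataclasses import dataclass, asdict
from typing import Optional


@dataclass
class ExampleTestCase:
    # Hypothetical stand-in for a dataclass-based test case.
    query: str
    output: str
    expected_output: Optional[str] = None


case = ExampleTestCase(query="What is FastAPI?", output="A Python web framework.")

# asdict() recursively converts the dataclass (including nested dataclasses)
# into plain dicts, which is what to_dict() and sample() now return per entry.
print(asdict(case))
# {'query': 'What is FastAPI?', 'output': 'A Python web framework.', 'expected_output': None}
```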