Hotfix/fix conceptual similarity threshold #133

Merged · 5 commits · Sep 25, 2023

1 change: 1 addition & 0 deletions .github/workflows/test-no-login.yml
@@ -37,5 +37,6 @@ jobs:
        env:
          python-version: ${{ matrix.python-version }}
        run: |
+          python tests/test_without_pytest.py
           deepeval test run -x tests

5 changes: 3 additions & 2 deletions deepeval/dataset.py
@@ -12,6 +12,7 @@
 from deepeval.run_test import run_test
 from deepeval.metrics.metric import Metric
 from deepeval.test_case import LLMTestCase
+from dataclasses import asdict
 
 
 class EvaluationDataset(UserList):
@@ -230,7 +231,7 @@ def from_dict(
         return cls(test_cases)
 
     def to_dict(self):
-        return [x.dict() for x in self.data]
+        return [asdict(x) for x in self.data]
 
     def to_csv(self, csv_filename: str):
         import pandas as pd
@@ -255,7 +256,7 @@ def sample(self, n: int = 5):
         if len(self.data) <= n:
             n = len(self.data)
         result = random.sample(self.data, n)
-        return [r.dict() for r in result]
+        return [asdict(r) for r in result]
 
     def head(self, n: int = 5):
         return self.data[:n]
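For context on the `to_dict()` and `sample()` changes above: `dataclasses.asdict()` replaces the Pydantic-style `x.dict()` call, which suggests the stored test cases are plain dataclasses. A minimal sketch with a hypothetical stand-in class (not the real `LLMTestCase`, whose fields may differ):

```python
from dataclasses import dataclass, asdict
from typing import Optional


# Hypothetical stand-in for a dataclass-based test case; field names are
# illustrative only, not taken from deepeval.
@dataclass
class FakeTestCase:
    query: str
    output: str
    expected_output: Optional[str] = None


case = FakeTestCase(query="What is FastAPI?", output="A Python web framework.")

# A plain dataclass has no .dict() method, so the old `x.dict()` call would
# raise AttributeError; asdict() performs the equivalent conversion generically.
print(asdict(case))
# {'query': 'What is FastAPI?', 'output': 'A Python web framework.', 'expected_output': None}
```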
2 changes: 1 addition & 1 deletion deepeval/metrics/conceptual_similarity.py
@@ -44,7 +44,7 @@ def __name__(self):
 
 
 def assert_conceptual_similarity(
-    output: str, expected_output: str, minimum_score=0.3
+    output: str, expected_output: str, minimum_score=0.7
 ):
     metric = ConceptualSimilarityMetric(minimum_score=minimum_score)
     test_case = LLMTestCase(output=output, expected_output=expected_output)
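The only change here is the default threshold: `assert_conceptual_similarity` now requires a similarity score of at least 0.7 instead of 0.3 when no `minimum_score` is passed. A usage sketch based on the signature in the diff; the example strings are illustrative, and the scores they produce are not taken from the PR:

```python
from deepeval.metrics.conceptual_similarity import assert_conceptual_similarity

# No explicit threshold: the stricter 0.7 default now applies, so loosely
# related outputs that previously cleared 0.3 may now raise AssertionError.
assert_conceptual_similarity(
    output="FastAPI is a Python web framework for building APIs.",
    expected_output="FastAPI is a framework for building APIs in Python.",
)

# Callers that want the old, looser behaviour can still opt in explicitly,
# as the new tests/test_without_pytest.py does.
assert_conceptual_similarity(
    output="python is a programming language",
    expected_output="Python is a snake.",
    minimum_score=0.3,
)
```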
137 changes: 76 additions & 61 deletions deepeval/run_test.py
@@ -65,6 +65,42 @@ def __lt__(self, other: "TestResult") -> bool:
         return self.score < other.score
 
 
+def create_test_result(
+    test_case: Union[LLMTestCase, SearchTestCase],
+    success: bool,
+    score: float,
+    metric: float,
+) -> TestResult:
+    if isinstance(test_case, LLMTestCase):
+        return TestResult(
+            success=success,
+            score=score,
+            metric_name=metric.__name__,
+            query=test_case.query if test_case.query else "-",
+            output=test_case.output if test_case.output else "-",
+            expected_output=test_case.expected_output
+            if test_case.expected_output
+            else "-",
+            metadata=None,
+            context=test_case.context,
+        )
+    elif isinstance(test_case, SearchTestCase):
+        return TestResult(
+            success=success,
+            score=score,
+            metric_name=metric.__name__,
+            query=test_case.query if test_case.query else "-",
+            output=test_case.output_list if test_case.output_list else "-",
+            expected_output=test_case.golden_list
+            if test_case.golden_list
+            else "-",
+            metadata=None,
+            context="-",
+        )
+    else:
+        raise ValueError("TestCase not supported yet.")
+
+
 def run_test(
     test_cases: Union[TestCase, LLMTestCase, SearchTestCase, List[LLMTestCase]],
     metrics: List[Metric],
@@ -98,68 +134,47 @@ def run_test(
     if isinstance(test_cases, TestCase):
         test_cases = [test_cases]
 
-    test_results = []
-    for test_case in test_cases:
-        for metric in metrics:
-            test_start_time = time.perf_counter()
-
-            @retry(
-                max_retries=max_retries, delay=delay, min_success=min_success
-            )
-            def measure_metric():
-                score = metric.measure(test_case)
-                success = metric.is_successful()
-                if isinstance(test_case, LLMTestCase):
-                    test_result = TestResult(
-                        success=success,
-                        score=score,
-                        metric_name=metric.__name__,
-                        query=test_case.query if test_case.query else "-",
-                        output=test_case.output if test_case.output else "-",
-                        expected_output=test_case.expected_output
-                        if test_case.expected_output
-                        else "-",
-                        metadata=None,
-                        context=test_case.context,
-                    )
-                elif isinstance(test_case, SearchTestCase):
-                    test_result = TestResult(
-                        success=success,
-                        score=score,
-                        metric_name=metric.__name__,
-                        query=test_case.query if test_case.query else "-",
-                        output=test_case.output_list
-                        if test_case.output_list
-                        else "-",
-                        expected_output=test_case.golden_list
-                        if test_case.golden_list
-                        else "-",
-                        metadata=None,
-                        context="-",
-                    )
-                else:
-                    raise ValueError("TestCase not supported yet.")
-                test_results.append(test_result)
-
-                # Load the test_run and add the test_case regardless of the success of the test
-                test_end_time = time.perf_counter()
-                run_duration = test_end_time - test_start_time
-                if os.getenv(PYTEST_RUN_ENV_VAR):
-                    test_run = TestRun.load()
-                    metric.score = score
-                    test_run.add_llm_test_case(
-                        test_case=test_case,
-                        metrics=[metric],
-                        run_duration=run_duration,
-                    )
-                    test_run.save()
-
-                if raise_error:
-                    assert (
-                        metric.is_successful()
-                    ), f"{metric.__name__} failed. Score: {score}."
-
-            measure_metric()
+    test_results = []
+    for test_case in test_cases:
+        failed_metrics = []
+        for metric in metrics:
+            test_start_time = time.perf_counter()
+
+            @retry(
+                max_retries=max_retries,
+                delay=delay,
+                min_success=min_success,
+            )
+            def measure_metric():
+                score = metric.measure(test_case)
+                success = metric.is_successful()
+                test_result = create_test_result(
+                    test_case, success, score, metric
+                )
+                test_results.append(test_result)
+
+                # Load the test_run and add the test_case regardless of the success of the test
+                test_end_time = time.perf_counter()
+                run_duration = test_end_time - test_start_time
+                if os.getenv(PYTEST_RUN_ENV_VAR):
+                    test_run = TestRun.load()
+                    metric.score = score
+                    test_run.add_llm_test_case(
+                        test_case=test_case,
+                        metrics=[metric],
+                        run_duration=run_duration,
+                    )
+                    test_run.save()
+
+                if not success:
+                    failed_metrics.append((metric.__name__, score))
+
+            measure_metric()
+
+    if raise_error and failed_metrics:
+        raise AssertionError(
+            f"Metrics {', '.join([f'{name} (Score: {score})' for name, score in failed_metrics])} failed."
+        )
 
     return test_results

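Behaviourally, the refactor above means `run_test` now records every metric result, collects failures per test case into `failed_metrics`, and raises a single `AssertionError` listing all failed metrics at the end, instead of asserting inside each measurement. A usage sketch assuming `raise_error` is a keyword argument of `run_test` (the diff only shows it being checked) and with an illustrative metric and strings:

```python
from deepeval.metrics.conceptual_similarity import ConceptualSimilarityMetric
from deepeval.run_test import run_test
from deepeval.test_case import LLMTestCase

metric = ConceptualSimilarityMetric(minimum_score=0.7)
test_case = LLMTestCase(
    output="python is a programming language",
    expected_output="Python is a snake.",
)

try:
    # Failures are gathered per test case and reported together, e.g.
    # "Metrics <metric name> (Score: <score>) failed."
    run_test(test_case, metrics=[metric], raise_error=True)
except AssertionError as error:
    print(error)
```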
5 changes: 3 additions & 2 deletions docs/docs/quickstart/quickstart.md
@@ -87,11 +87,12 @@ Now we often don't want to write our own tests or at least be given a variety of
 You can automatically create tests in DeepEval in just a few lines of code:
 
 ```python
-from deepeval.dataset import create_evaluation_dataset
+from deepeval.dataset import create_evaluation_query_answer_pairs
 
 dataset = create_evaluation_query_answer_pairs(
     openai_api_key="<YOUR_OPENAI_API_KEY>",
     context="FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.",
-    n=3
+    n=3,
 )
 
 ```
12 changes: 12 additions & 0 deletions examples/create_tests.py
@@ -0,0 +1,12 @@
+import os
+from deepeval.dataset import (
+    create_evaluation_query_answer_pairs,
+    EvaluationDataset,
+)
+
+dataset: EvaluationDataset = create_evaluation_query_answer_pairs(
+    openai_api_key=os.environ["OPENAI_API_KEY"],
+    context="FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.",
+    n=3,
+)
+dataset.review()
34 changes: 34 additions & 0 deletions examples/review_datasets.py
@@ -0,0 +1,34 @@
+# Define your completion protocol
+import openai
+from deepeval.dataset import EvaluationDataset
+from deepeval.metrics.factual_consistency import FactualConsistencyMetric
+
+ds = EvaluationDataset.from_csv(
+    "review-test.csv",
+    query_column="query",
+    expected_output_column="expected_output",
+)
+print(ds.sample())
+
+
+def generate_chatgpt_output(query: str):
+    response = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "assistant",
+                "content": "The customer success phone line is 1200-231-231 and the customer success state is in Austin.",
+            },
+            {"role": "user", "content": query},
+        ],
+    )
+    expected_output = response.choices[0].message.content
+    return expected_output
+
+
+factual_consistency_metric = FactualConsistencyMetric()
+
+ds.run_evaluation(
+    completion_fn=generate_chatgpt_output, metrics=[factual_consistency_metric]
+)
9 changes: 9 additions & 0 deletions tests/test_without_pytest.py
@@ -0,0 +1,9 @@
"""Test to make sure assert statements can work
"""
from deepeval.metrics.conceptual_similarity import assert_conceptual_similarity

assert_conceptual_similarity(
output="python is a programming language",
expected_output="Python is a snake.",
minimum_score=0.3,
)