Feature/add test run #115

Merged · 19 commits · Sep 22, 2023
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -46,4 +46,4 @@ jobs:
CONFIDENT_AI_API_KEY: ${{ secrets.API_KEY_ENV }}
CONFIDENT_AI_IMP_ID: ${{ secrets.CONFIDENT_AI_IMP_ID }}
run: |
python -m pytest -x tests
deepeval test run -x tests
2 changes: 1 addition & 1 deletion deepeval/_version.py
@@ -1 +1 @@
__version__: str = "0.15.2"
__version__: str = "0.16.0"
138 changes: 134 additions & 4 deletions deepeval/api.py
@@ -1,13 +1,19 @@
import os
import platform
import urllib.parse
from typing import Any, Optional

import requests
import json

from datetime import datetime
from typing import Any, Optional, Union
from pydantic import BaseModel, Field
from typing import List
from requests.adapters import HTTPAdapter, Response, Retry

from .constants import API_KEY_ENV
from .key_handler import KEY_FILE_HANDLER
from deepeval.constants import API_KEY_ENV, PYTEST_RUN_ENV_VAR
from deepeval.key_handler import KEY_FILE_HANDLER
from deepeval.metrics.metric import Metric
from deepeval.test_case import LLMTestCase

API_BASE_URL = "https://app.confident-ai.com/api"
# API_BASE_URL = "http://localhost:3000/api"
@@ -19,6 +25,123 @@
HTTP_RETRY_ALLOWED_METHODS = frozenset({"GET", "POST", "DELETE"})


class MetricsMetadata(BaseModel):
metric: str
score: float
minimum_score: float = Field(None, alias="minimumScore")


class APITestCase(BaseModel):
name: str
input: str
actual_output: str = Field(..., alias="actualOutput")
expected_output: str = Field(..., alias="expectedOutput")
success: bool
metrics_metadata: List[MetricsMetadata] = Field(
..., alias="metricsMetadata"
)
threshold: float
run_duration: int = Field(..., alias="runDuration")


class MetricScore(BaseModel):
metric: str
score: float

@classmethod
def from_metric(cls, metric: Metric):
return cls(metric=metric.__name__, score=metric.score)


class TestRun(BaseModel):
test_file: Optional[str] = Field(
# TODO: Fix test_file
"test.py",
alias="testFile",
)
test_cases: List[APITestCase] = Field(
alias="testCases", default_factory=lambda: []
)
metric_scores: List[MetricScore] = Field(
default_factory=lambda: [], alias="metricScores"
)
configurations: dict

def add_llm_test_case(self, test_case: LLMTestCase, metrics: List[Metric]):
self.metric_scores.extend([MetricScore.from_metric(m) for m in metrics])
# Check if test case with the same ID already exists
existing_test_case: APITestCase = next(
(tc for tc in self.test_cases if tc.name == test_case.__name__),
None,
)
if existing_test_case:
# If it exists, append the metrics to the existing test case
existing_test_case.metricsMetadata.extend(
[
MetricsMetadata(
metric=metric.__name__,
score=metric.score,
minimumScore=metric.minimum_score,
)
for metric in metrics
]
)
# Update the success status and threshold
existing_test_case.success = all(
[metric.is_successful() for metric in metrics]
)
existing_test_case.threshold = metrics[0].minimum_score
else:
# If it doesn't exist, create a new test case
self.test_cases.append(
APITestCase(
name=test_case.__name__,
input=test_case.query,
actualOutput=test_case.output,
expectedOutput=test_case.expected_output,
success=all([metric.is_successful() for metric in metrics]),
metricsMetadata=[
MetricsMetadata(
metric=metric.__name__,
score=metric.score,
minimumScore=metric.minimum_score,
)
for metric in metrics
],
threshold=metrics[0].minimum_score,
runDuration=0, # TODO: add duration
)
)

def save(self, file_path: Optional[str] = None):
if file_path is None:
file_path = os.getenv(PYTEST_RUN_ENV_VAR)
# If file Path is None, remove it
if not file_path:
return
elif not file_path.endswith(".json"):
file_path = f"{file_path}.json"
print({"save_filepath", file_path})

with open(file_path, "w") as f:
json.dump(self.dict(by_alias=True, exclude_none=True), f)

return file_path

@classmethod
def load(cls, file_path: Optional[str] = None):
if file_path is None:
file_path = os.getenv(PYTEST_RUN_ENV_VAR)
# If file Path is None, remove it
if not file_path:
return
elif not file_path.endswith(".json"):
file_path = f"{file_path}.json"
print({"load_filepath", file_path})
with open(file_path, "r") as f:
return cls(**json.load(f))


class Api:
"""Internal Api reference for handling http operations"""

@@ -330,3 +453,10 @@ def list_implementations(self):
Returns a list of implementations
"""
return self.get_request(endpoint="/v1/implementation")

def post_test_run(self, test_run: TestRun):
"""Post a test run"""
return self.post_request(
endpoint="/v1/test-run",
body=test_run.model_dump(by_alias=True),
)
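
For context, a minimal sketch of how the new `TestRun` plumbing fits together end to end. This is illustrative only: the metric class name `RandomMetric` is inferred from `randomscore.py`, and the timestamp value would normally be set by the `deepeval test run` CLI rather than by hand.

```python
import os

from deepeval.api import TestRun
from deepeval.constants import PYTEST_RUN_ENV_VAR
from deepeval.metrics.randomscore import RandomMetric  # class name assumed from the file name
from deepeval.test_case import LLMTestCase

# Normally set by `deepeval test run`; set by hand here for the sketch.
os.environ[PYTEST_RUN_ENV_VAR] = "20230922_120000"

test_case = LLMTestCase(
    query="What does DeepEval do?",
    output="It evaluates LLM outputs.",
    expected_output="It evaluates LLM outputs.",
)
metric = RandomMetric()
metric.measure(test_case)  # populates metric.score and metric.success

run = TestRun(testFile="test_sample.py", testCases=[], metricScores=[], configurations={})
run.add_llm_test_case(test_case=test_case, metrics=[metric])

path = run.save()              # writes 20230922_120000.json
reloaded = TestRun.load(path)  # round-trips the same payload from disk
```
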
20 changes: 16 additions & 4 deletions deepeval/cli/test.py
@@ -1,15 +1,19 @@
import pytest
import typer

import os
import datetime
from typing_extensions import Annotated
from ..metrics.overall_score import assert_overall_score
from .cli_key_handler import set_env_vars
from ..constants import PYTEST_RUN_ENV_VAR

try:
from rich import print
from rich.progress import Progress, SpinnerColumn, TextColumn
except Exception as e:
pass


app = typer.Typer(name="test")


@@ -85,26 +89,34 @@ def check_if_legit_file(test_file: str):
@app.command()
def run(
test_file_or_directory: str,
exit_on_first_failure: bool = False,
verbose: bool = False,
color: str = "yes",
durations: int = 10,
pdb: bool = False,
exit_on_first_failure: Annotated[
bool, typer.Option("--exit-on-first-failure", "-x/-X")
] = False,
):
"""Run a test"""
pytest_args = ["-k", test_file_or_directory]
if exit_on_first_failure:
pytest_args.insert(0, "-x")

# Generate environment variable based on current date and time
env_var = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
os.environ[PYTEST_RUN_ENV_VAR] = env_var

pytest_args.extend(
[
"--verbose" if verbose else "--quiet",
f"--color={color}",
f"--durations={durations}",
# f"--cov={cov}",
# f"--cov-report={cov_report}",
"--pdb" if pdb else "",
]
)
# Add the deepeval plugin file to pytest arguments
pytest_args.extend(["-p", "plugins"])

with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
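
Roughly, the updated command is equivalent to the following sketch, under the assumption that the `Progress` block (not shown in this hunk) ultimately hands the assembled arguments to `pytest.main`:

```python
import datetime
import os

import pytest

from deepeval.constants import PYTEST_RUN_ENV_VAR

# `deepeval test run -x tests` (sketch): stamp the run, then delegate to pytest,
# loading the deepeval plugin ("plugins" is the pytest11 entry point name from setup.py).
os.environ[PYTEST_RUN_ENV_VAR] = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
pytest.main(["-x", "-k", "tests", "--quiet", "--color=yes", "--durations=10", "-p", "plugins"])
```
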
3 changes: 2 additions & 1 deletion deepeval/constants.py
@@ -2,4 +2,5 @@
LOG_TO_SERVER_ENV: str = "DO_NOT_SEND_TO_CONFIDENT_AI"
IMPLEMENTATION_ID_ENV: str = "CONFIDENT_AI_IMP_ID"
IMPLEMENTATION_ID_NAME: str = "CONFIDENT_AI_IMP_NAME"
KEY_FILE = ".deepeval"
KEY_FILE: str = ".deepeval"
PYTEST_RUN_ENV_VAR: str = "CONFIDENT_AI_RUN_TIMESTAMP"
5 changes: 5 additions & 0 deletions deepeval/metrics/metric.py
@@ -9,6 +9,7 @@
class Metric(metaclass=Singleton):
# set an arbitrary minimum score that will get over-ridden later
minimum_score: float = 0
score: float = 0

# Measure function signature is subject to be different - not sure
# how applicable this is - might need a better abstraction
@@ -29,6 +30,10 @@ def _get_init_values(self):
def is_successful(self) -> bool:
raise NotImplementedError

@property
def __name__(self):
return "Metric"


class EntailmentScoreMetric(Metric):
def __init__(self, model_name: str = "cross-encoder/nli-deberta-base"):
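
The new `score` default and `__name__` property give every metric a numeric result and a printable identifier for `MetricScore.from_metric`. A concrete subclass would typically override both `measure` and `__name__`, for example (illustrative, not part of this PR):

```python
from deepeval.metrics.metric import Metric
from deepeval.test_case import LLMTestCase


class ExactMatchMetric(Metric):
    """Illustrative metric: passes only when output matches expected_output exactly."""

    def __init__(self, minimum_score: float = 1.0):
        self.minimum_score = minimum_score

    def measure(self, test_case: LLMTestCase) -> float:
        self.score = 1.0 if test_case.output == test_case.expected_output else 0.0
        self.success = self.score >= self.minimum_score
        return self.score

    def is_successful(self) -> bool:
        return self.success

    @property
    def __name__(self):
        return "ExactMatch"
```
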
6 changes: 3 additions & 3 deletions deepeval/metrics/randomscore.py
@@ -9,9 +9,9 @@ def __init__(self, minimum_score: float = 0.3):
self.minimum_score = minimum_score

def measure(self, test_case: LLMTestCase):
score = random.random()
self.success = score >= self.minimum_score
return score
self.score = random.random()
self.success = self.score >= self.minimum_score
return self.score

def is_successful(self):
return self.success
Empty file added deepeval/plugins/__init__.py
32 changes: 32 additions & 0 deletions deepeval/plugins/plugin.py
@@ -0,0 +1,32 @@
import pytest
import os
from deepeval.api import Api, TestRun

from deepeval.constants import PYTEST_RUN_ENV_VAR


def pytest_sessionstart(session):
global test_filename
test_run = TestRun(
testFile="-",
testCases=[],
metricScores=[],
configurations={},
)
test_filename = test_run.save()


@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_sessionfinish(session, exitstatus):
# Code before yield will run before the test teardown
api: Api = Api()

# yield control back to pytest for the actual teardown
yield

# Code after yield will run after the test teardown
if os.getenv(PYTEST_RUN_ENV_VAR):
test_run = TestRun.load(test_filename)
api.post_test_run(test_run)
# if os.path.exists(test_filename):
# os.remove(test_filename)
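
With this plugin loaded (via the `pytest11` entry point added in `setup.py` below), a plain pytest file is all that is needed for its results to land in the shared run file. A minimal, illustrative example follows; `RandomMetric` and `assert_test` are assumed names and may differ from the actual exports:

```python
# test_sample.py -- illustrative; run with `deepeval test run test_sample.py`.
from deepeval.metrics.randomscore import RandomMetric  # name assumed from randomscore.py
from deepeval.run_test import assert_test              # assumed wrapper around run_test(raise_error=True)
from deepeval.test_case import LLMTestCase


def test_llm_output():
    test_case = LLMTestCase(
        query="What is the capital of France?",
        output="Paris",
        expected_output="Paris",
    )
    assert_test(test_case, metrics=[RandomMetric(minimum_score=0.3)])
```
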
17 changes: 16 additions & 1 deletion deepeval/run_test.py
@@ -1,14 +1,20 @@
"""Function for running test
"""
import os
import copy
from typing import List, Optional, Union
from dataclasses import dataclass
from .retry import retry
from .client import Client
from .constants import IMPLEMENTATION_ID_ENV, LOG_TO_SERVER_ENV
from .constants import (
IMPLEMENTATION_ID_ENV,
LOG_TO_SERVER_ENV,
PYTEST_RUN_ENV_VAR,
)
from .get_api_key import _get_api_key, _get_implementation_name
from .metrics import Metric
from .test_case import LLMTestCase, TestCase, SearchTestCase
from .api import TestRun


def _is_api_key_set():
@@ -231,6 +237,15 @@ def measure_metric():
raise ValueError("TestCase not supported yet.")
test_results.append(test_result)

# Load the test_run and add the test_case regardless of the success of the test
if os.getenv(PYTEST_RUN_ENV_VAR):
test_run = TestRun.load()
test_run.add_llm_test_case(
test_case=test_case,
metrics=[metric],
)
test_run.save()

if raise_error:
assert (
metric.is_successful()
12 changes: 8 additions & 4 deletions deepeval/test_case.py
@@ -17,10 +17,14 @@ def __post_init__(self):

@dataclass
class LLMTestCase(TestCase):
query: Optional[str] = None
expected_output: Optional[str] = None
context: Optional[str] = None
output: Optional[str] = None
query: str = "-"
expected_output: str = "-"
context: str = "-"
output: str = "-"

def __post_init__(self):
super().__post_init__()
self.__name__ = f"LLMTestCase_{self.id}"

# def dict(self):
# data = {
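
The new `__post_init__` gives each case a stable `__name__`, which is the key `TestRun.add_llm_test_case` uses to merge metrics belonging to the same case. For illustration:

```python
from deepeval.test_case import LLMTestCase

tc = LLMTestCase(query="2 + 2?", output="4", expected_output="4")
print(tc.__name__)  # e.g. "LLMTestCase_<id>", where the id is assigned in TestCase.__post_init__
```
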
6 changes: 1 addition & 5 deletions docs/docs/quickstart/quickstart.md
@@ -37,11 +37,7 @@ deepeval test run -x test_sample.py
deepeval test run --pdb test_sample.py
```

Under the hood, it triggers pytest and offers support for a number of pytest command line functionalities. Similarly, you may also trigger `pytest` natively for these tests such as

```bash
pytest test_sample.py
```
Under the hood, it triggers pytest and offers support for a number of pytest command line functionalities.

Once you run the tests, you should be able to see a dashboard similar to the one below.

6 changes: 5 additions & 1 deletion setup.py
@@ -27,6 +27,7 @@
"rich",
"protobuf<=3.20.5",
"pandas",
"pydantic>=2.0.0",
],
extras_require={
"bias": [
@@ -41,6 +42,9 @@
entry_points={
"console_scripts": [
"deepeval = deepeval.cli.main:app",
]
],
"pytest11": [
"plugins = deepeval.plugins.plugin",
],
},
)
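
The `pytest11` entry-point group is pytest's standard mechanism for auto-loading installed plugins, so registering `plugins = deepeval.plugins.plugin` activates the session hooks above without any conftest changes. For illustration, without the entry point the same module could be loaded explicitly per run:

```python
import pytest

# Explicitly load the deepeval plugin module for one run (illustrative);
# the pytest11 entry point makes this unnecessary once the package is installed.
pytest.main(["tests", "-p", "deepeval.plugins.plugin"])
```
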
2 changes: 1 addition & 1 deletion tests/test_overall_score.py
@@ -28,7 +28,7 @@
metric = OverallScoreMetric()


class TestOverallScore(LLMTestCase):
class TestOverallScore:
metric = OverallScoreMetric()

def test_overall_score(self):