Feature/add test run #115

Merged · 19 commits · Sep 22, 2023
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -46,4 +46,4 @@ jobs:
CONFIDENT_AI_API_KEY: ${{ secrets.API_KEY_ENV }}
CONFIDENT_AI_IMP_ID: ${{ secrets.CONFIDENT_AI_IMP_ID }}
run: |
python -m pytest -x tests
deepeval test run -x tests
2 changes: 1 addition & 1 deletion deepeval/_version.py
@@ -1 +1 @@
__version__: str = "0.15.2"
__version__: str = "0.16.0"
138 changes: 134 additions & 4 deletions deepeval/api.py
@@ -1,13 +1,19 @@
import os
import platform
import urllib.parse
from typing import Any, Optional

import requests
import json

from datetime import datetime
from typing import Any, Optional, Union
from pydantic import BaseModel, Field
from typing import List
from requests.adapters import HTTPAdapter, Response, Retry

from .constants import API_KEY_ENV
from .key_handler import KEY_FILE_HANDLER
from deepeval.constants import API_KEY_ENV, PYTEST_RUN_ENV_VAR
from deepeval.key_handler import KEY_FILE_HANDLER
from deepeval.metrics.metric import Metric
from deepeval.test_case import LLMTestCase

API_BASE_URL = "https://app.confident-ai.com/api"
# API_BASE_URL = "http://localhost:3000/api"
@@ -19,6 +25,123 @@
HTTP_RETRY_ALLOWED_METHODS = frozenset({"GET", "POST", "DELETE"})


class MetricsMetadata(BaseModel):
metric: str
score: float
minimum_score: float = Field(None, alias="minimumScore")


class APITestCase(BaseModel):
name: str
input: str
actual_output: str = Field(..., alias="actualOutput")
expected_output: str = Field(..., alias="expectedOutput")
success: bool
metrics_metadata: List[MetricsMetadata] = Field(
..., alias="metricsMetadata"
)
threshold: float
run_duration: int = Field(..., alias="runDuration")


class MetricScore(BaseModel):
metric: str
score: float

@classmethod
def from_metric(cls, metric: Metric):
return cls(metric=metric.__name__, score=metric.score)


class TestRun(BaseModel):
test_file: Optional[str] = Field(
# TODO: Fix test_file
"test.py",
alias="testFile",
)
test_cases: List[APITestCase] = Field(
alias="testCases", default_factory=lambda: []
)
metric_scores: List[MetricScore] = Field(
default_factory=lambda: [], alias="metricScores"
)
configurations: dict

def add_llm_test_case(self, test_case: LLMTestCase, metrics: List[Metric]):
self.metric_scores.extend([MetricScore.from_metric(m) for m in metrics])
# Check if test case with the same ID already exists
existing_test_case: APITestCase = next(
(tc for tc in self.test_cases if tc.name == test_case.__name__),
None,
)
if existing_test_case:
# If it exists, append the metrics to the existing test case
existing_test_case.metricsMetadata.extend(
[
MetricsMetadata(
metric=metric.__name__,
score=metric.score,
minimumScore=metric.minimum_score,
)
for metric in metrics
]
)
# Update the success status and threshold
existing_test_case.success = all(
[metric.is_successful() for metric in metrics]
)
existing_test_case.threshold = metrics[0].minimum_score
else:
# If it doesn't exist, create a new test case
self.test_cases.append(
APITestCase(
name=test_case.__name__,
input=test_case.query,
actualOutput=test_case.output,
expectedOutput=test_case.expected_output,
success=all([metric.is_successful() for metric in metrics]),
metricsMetadata=[
MetricsMetadata(
metric=metric.__name__,
score=metric.score,
minimumScore=metric.minimum_score,
)
for metric in metrics
],
threshold=metrics[0].minimum_score,
runDuration=0, # TODO: add duration
)
)

def save(self, file_path: Optional[str] = None):
if file_path is None:
file_path = os.getenv(PYTEST_RUN_ENV_VAR)
# If file Path is None, remove it
if not file_path:
return
elif not file_path.endswith(".json"):
file_path = f"{file_path}.json"
print({"save_filepath", file_path})

with open(file_path, "w") as f:
json.dump(self.dict(by_alias=True, exclude_none=True), f)

return file_path

@classmethod
def load(cls, file_path: Optional[str] = None):
if file_path is None:
file_path = os.getenv(PYTEST_RUN_ENV_VAR)
# If file Path is None, remove it
if not file_path:
return
elif not file_path.endswith(".json"):
file_path = f"{file_path}.json"
print({"load_filepath", file_path})
with open(file_path, "r") as f:
return cls(**json.load(f))


class Api:
"""Internal Api reference for handling http operations"""

@@ -330,3 +453,10 @@ def list_implementations(self):
Returns a list of implementations
"""
return self.get_request(endpoint="/v1/implementation")

def post_test_run(self, test_run: TestRun):
"""Post a test run"""
return self.post_request(
endpoint="/v1/test-run",
body=test_run.model_dump(by_alias=True),
)
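
For context, a minimal sketch of how the new `TestRun` plumbing fits together end to end. This is illustrative only: the metric class name `RandomMetric` is inferred from `randomscore.py`, and the timestamp value would normally be set by the `deepeval test run` CLI rather than by hand.

```python
import os

from deepeval.api import TestRun
from deepeval.constants import PYTEST_RUN_ENV_VAR
from deepeval.metrics.randomscore import RandomMetric  # class name assumed from the file name
from deepeval.test_case import LLMTestCase

# Normally set by `deepeval test run`; set by hand here for the sketch.
os.environ[PYTEST_RUN_ENV_VAR] = "20230922_120000"

test_case = LLMTestCase(
    query="What does DeepEval do?",
    output="It evaluates LLM outputs.",
    expected_output="It evaluates LLM outputs.",
)
metric = RandomMetric()
metric.measure(test_case)  # populates metric.score and metric.success

run = TestRun(testFile="test_sample.py", testCases=[], metricScores=[], configurations={})
run.add_llm_test_case(test_case=test_case, metrics=[metric])

path = run.save()              # writes 20230922_120000.json
reloaded = TestRun.load(path)  # round-trips the same payload from disk
```
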
20 changes: 16 additions & 4 deletions deepeval/cli/test.py
@@ -1,15 +1,19 @@
import pytest
import typer

import os
import datetime
from typing_extensions import Annotated
from ..metrics.overall_score import assert_overall_score
from .cli_key_handler import set_env_vars
from ..constants import PYTEST_RUN_ENV_VAR

try:
from rich import print
from rich.progress import Progress, SpinnerColumn, TextColumn
except Exception as e:
pass


app = typer.Typer(name="test")


@@ -85,26 +89,34 @@ def check_if_legit_file(test_file: str):
@app.command()
def run(
test_file_or_directory: str,
exit_on_first_failure: bool = False,
verbose: bool = False,
color: str = "yes",
durations: int = 10,
pdb: bool = False,
exit_on_first_failure: Annotated[
bool, typer.Option("--exit-on-first-failure", "-x/-X")
] = False,
):
"""Run a test"""
pytest_args = ["-k", test_file_or_directory]
if exit_on_first_failure:
pytest_args.insert(0, "-x")

# Generate environment variable based on current date and time
env_var = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
os.environ[PYTEST_RUN_ENV_VAR] = env_var

pytest_args.extend(
[
"--verbose" if verbose else "--quiet",
f"--color={color}",
f"--durations={durations}",
# f"--cov={cov}",
# f"--cov-report={cov_report}",
"--pdb" if pdb else "",
]
)
# Add the deepeval plugin file to pytest arguments
pytest_args.extend(["-p", "plugins"])

with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
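
Roughly, the updated command is equivalent to the following sketch, under the assumption that the `Progress` block (not shown in this hunk) ultimately hands the assembled arguments to `pytest.main`:

```python
import datetime
import os

import pytest

from deepeval.constants import PYTEST_RUN_ENV_VAR

# `deepeval test run -x tests` (sketch): stamp the run, then delegate to pytest,
# loading the deepeval plugin ("plugins" is the pytest11 entry point name from setup.py).
os.environ[PYTEST_RUN_ENV_VAR] = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
pytest.main(["-x", "-k", "tests", "--quiet", "--color=yes", "--durations=10", "-p", "plugins"])
```
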
3 changes: 2 additions & 1 deletion deepeval/constants.py
@@ -2,4 +2,5 @@
LOG_TO_SERVER_ENV: str = "DO_NOT_SEND_TO_CONFIDENT_AI"
IMPLEMENTATION_ID_ENV: str = "CONFIDENT_AI_IMP_ID"
IMPLEMENTATION_ID_NAME: str = "CONFIDENT_AI_IMP_NAME"
KEY_FILE = ".deepeval"
KEY_FILE: str = ".deepeval"
PYTEST_RUN_ENV_VAR: str = "CONFIDENT_AI_RUN_TIMESTAMP"
5 changes: 5 additions & 0 deletions deepeval/metrics/metric.py
@@ -9,6 +9,7 @@
class Metric(metaclass=Singleton):
# set an arbitrary minimum score that will get over-ridden later
minimum_score: float = 0
score: float = 0

# Measure function signature is subject to be different - not sure
# how applicable this is - might need a better abstraction
@@ -29,6 +30,10 @@ def _get_init_values(self):
def is_successful(self) -> bool:
raise NotImplementedError

@property
def __name__(self):
return "Metric"


class EntailmentScoreMetric(Metric):
def __init__(self, model_name: str = "cross-encoder/nli-deberta-base"):
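
The new `score` default and `__name__` property give every metric a numeric result and a printable identifier for `MetricScore.from_metric`. A concrete subclass would typically override both `measure` and `__name__`, for example (illustrative, not part of this PR):

```python
from deepeval.metrics.metric import Metric
from deepeval.test_case import LLMTestCase


class ExactMatchMetric(Metric):
    """Illustrative metric: passes only when output matches expected_output exactly."""

    def __init__(self, minimum_score: float = 1.0):
        self.minimum_score = minimum_score

    def measure(self, test_case: LLMTestCase) -> float:
        self.score = 1.0 if test_case.output == test_case.expected_output else 0.0
        self.success = self.score >= self.minimum_score
        return self.score

    def is_successful(self) -> bool:
        return self.success

    @property
    def __name__(self):
        return "ExactMatch"
```
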
6 changes: 3 additions & 3 deletions deepeval/metrics/randomscore.py
@@ -9,9 +9,9 @@ def __init__(self, minimum_score: float = 0.3):
self.minimum_score = minimum_score

def measure(self, test_case: LLMTestCase):
score = random.random()
self.success = score >= self.minimum_score
return score
self.score = random.random()
self.success = self.score >= self.minimum_score
return self.score

def is_successful(self):
return self.success
Empty file added deepeval/plugins/__init__.py
32 changes: 32 additions & 0 deletions deepeval/plugins/plugin.py
@@ -0,0 +1,32 @@
import pytest
import os
from deepeval.api import Api, TestRun

from deepeval.constants import PYTEST_RUN_ENV_VAR


def pytest_sessionstart(session):
global test_filename
test_run = TestRun(
testFile="-",
testCases=[],
metricScores=[],
configurations={},
)
test_filename = test_run.save()


@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_sessionfinish(session, exitstatus):
# Code before yield will run before the test teardown
api: Api = Api()

# yield control back to pytest for the actual teardown
yield

# Code after yield will run after the test teardown
if os.getenv(PYTEST_RUN_ENV_VAR):
test_run = TestRun.load(test_filename)
api.post_test_run(test_run)
# if os.path.exists(test_filename):
# os.remove(test_filename)
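
With this plugin loaded (via the `pytest11` entry point added in `setup.py` below), a plain pytest file is all that is needed for its results to land in the shared run file. A minimal, illustrative example follows; `RandomMetric` and `assert_test` are assumed names and may differ from the actual exports:

```python
# test_sample.py -- illustrative; run with `deepeval test run test_sample.py`.
from deepeval.metrics.randomscore import RandomMetric  # name assumed from randomscore.py
from deepeval.run_test import assert_test              # assumed wrapper around run_test(raise_error=True)
from deepeval.test_case import LLMTestCase


def test_llm_output():
    test_case = LLMTestCase(
        query="What is the capital of France?",
        output="Paris",
        expected_output="Paris",
    )
    assert_test(test_case, metrics=[RandomMetric(minimum_score=0.3)])
```
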
17 changes: 16 additions & 1 deletion deepeval/run_test.py
@@ -1,14 +1,20 @@
"""Function for running test
"""
import os
import copy
from typing import List, Optional, Union
from dataclasses import dataclass
from .retry import retry
from .client import Client
from .constants import IMPLEMENTATION_ID_ENV, LOG_TO_SERVER_ENV
from .constants import (
IMPLEMENTATION_ID_ENV,
LOG_TO_SERVER_ENV,
PYTEST_RUN_ENV_VAR,
)
from .get_api_key import _get_api_key, _get_implementation_name
from .metrics import Metric
from .test_case import LLMTestCase, TestCase, SearchTestCase
from .api import TestRun


def _is_api_key_set():
@@ -231,6 +237,15 @@ def measure_metric():
raise ValueError("TestCase not supported yet.")
test_results.append(test_result)

# Load the test_run and add the test_case regardless of the success of the test
if os.getenv(PYTEST_RUN_ENV_VAR):
test_run = TestRun.load()
test_run.add_llm_test_case(
test_case=test_case,
metrics=[metric],
)
test_run.save()

if raise_error:
assert (
metric.is_successful()
12 changes: 8 additions & 4 deletions deepeval/test_case.py
@@ -17,10 +17,14 @@ def __post_init__(self):

@dataclass
class LLMTestCase(TestCase):
query: Optional[str] = None
expected_output: Optional[str] = None
context: Optional[str] = None
output: Optional[str] = None
query: str = "-"
expected_output: str = "-"
context: str = "-"
output: str = "-"

def __post_init__(self):
super().__post_init__()
self.__name__ = f"LLMTestCase_{self.id}"

# def dict(self):
# data = {
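
The new `__post_init__` gives each case a stable `__name__`, which is the key `TestRun.add_llm_test_case` uses to merge metrics belonging to the same case. For illustration:

```python
from deepeval.test_case import LLMTestCase

tc = LLMTestCase(query="2 + 2?", output="4", expected_output="4")
print(tc.__name__)  # e.g. "LLMTestCase_<id>", where the id is assigned in TestCase.__post_init__
```
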
6 changes: 1 addition & 5 deletions docs/docs/quickstart/quickstart.md
@@ -37,11 +37,7 @@ deepeval test run -x test_sample.py
deepeval test run --pdb test_sample.py
```

Under the hood, it triggers pytest and offers support for a number of pytest command line functionalities. Similarly, you may also trigger `pytest` natively for these tests such as

```bash
pytest test_sample.py
```
Under the hood, it triggers pytest and offers support for a number of pytest command line functionalities.

Once you run the tests, you should be able to see a dashboard similar to the one below.

6 changes: 5 additions & 1 deletion setup.py
@@ -27,6 +27,7 @@
"rich",
"protobuf<=3.20.5",
"pandas",
"pydantic>=2.0.0",
],
extras_require={
"bias": [
@@ -41,6 +42,9 @@
entry_points={
"console_scripts": [
"deepeval = deepeval.cli.main:app",
]
],
"pytest11": [
"plugins = deepeval.plugins.plugin",
],
},
)
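
The `pytest11` entry-point group is pytest's standard mechanism for auto-loading installed plugins, so registering `plugins = deepeval.plugins.plugin` activates the session hooks above without any conftest changes. For illustration, without the entry point the same module could be loaded explicitly per run:

```python
import pytest

# Explicitly load the deepeval plugin module for one run (illustrative);
# the pytest11 entry point makes this unnecessary once the package is installed.
pytest.main(["tests", "-p", "deepeval.plugins.plugin"])
```
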
2 changes: 1 addition & 1 deletion tests/test_overall_score.py
@@ -28,7 +28,7 @@
metric = OverallScoreMetric()


class TestOverallScore(LLMTestCase):
class TestOverallScore:
metric = OverallScoreMetric()

def test_overall_score(self):