From 88f101687ff8fbf96ba4377ba4d52ab0595915e8 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Thu, 7 Sep 2023 12:27:28 +1000
Subject: [PATCH 1/6] make quickstart easy

---
 deepeval/api.py                         |  2 +-
 docs/docs/quickstart/quickstart.md      | 63 +++++++------------------
 docs/docs/quickstart/write_test_case.md | 60 +++++++++++++++++++++++
 docs/sidebars.js                        |  1 +
 4 files changed, 78 insertions(+), 48 deletions(-)
 create mode 100644 docs/docs/quickstart/write_test_case.md

diff --git a/deepeval/api.py b/deepeval/api.py
index 65b1da6d4..c33fe7ba0 100644
--- a/deepeval/api.py
+++ b/deepeval/api.py
@@ -2,9 +2,9 @@ import urllib.parse
 import os
 import requests
-from .constants import API_KEY_ENV
 from typing import Any, Optional
 from requests.adapters import HTTPAdapter, Response, Retry
+from .constants import API_KEY_ENV
 from .key_handler import KEY_FILE_HANDLER
 
 API_BASE_URL = "https://app.confident-ai.com/api"
diff --git a/docs/docs/quickstart/quickstart.md b/docs/docs/quickstart/quickstart.md
index 25d263fd9..be0814243 100644
--- a/docs/docs/quickstart/quickstart.md
+++ b/docs/docs/quickstart/quickstart.md
@@ -1,60 +1,29 @@
-# Write a simple test case
+# QuickStart
 
-If you are interested in running a quick Colab example, you can [click here](https://colab.research.google.com/drive/1Lfq5geYsvfVoquDqv84UkWS57SdAHm30?usp=sharing).
+Once you have installed DeepEval, run the login command. During this step, you will be asked to visit https://app.confident-ai.com to grab your API key.
 
-You can write a simple test case as simply as:
+Note: this step is entirely optional if you do not wish to track your results, but we highly recommend it so you can view how your results change over time.
 
 ```bash
-deepeval test generate test_sample.py
-```
+deepeval login
 
-```python
-import os
-import openai
-from deepeval.metrics.factual_consistency import assert_factual_consistency
-
-openai.api_key = "sk-XXX"
-
-# Write a sample ChatGPT function
-def generate_chatgpt_output(query: str):
-    response = openai.ChatCompletion.create(
-        model="gpt-3.5-turbo",
-        messages=[
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "assistant", "content": "The customer success phone line is 1200-231-231 and the customer success state is in Austin."},
-            {"role": "user", "content": query}
-        ]
-    )
-    llm_output = response.choices[0].message.content
-    return llm_output
-
-def test_factual_consistency():
-    query = "What is the customer success phone line?"
-    context = "Our customer success phone line is 1200-231-231."
-    output = generate_chatgpt_output(query)
-    assert_factual_consistency(output, context)
-
-# Just run the following code in Python if required
-test_factual_consistency()
+# If you already have an API key
+deepeval login --api-key $API_KEY
 ```
 
-### Running it in Pytest
-
-To run this in Pytest, just run:
+Once you have logged in, you can generate a sample test file as shown below. This file gives you a starting point that you can quickly adapt with your own tests (more on this later).
 
-```python
-# sample.py
-
-def test_factual_consistency():
-    query = "What is the customer success phone line?"
-    context = "Our customer success phone line is 1200-231-231."
-    output = generate_chatgpt_output(query)
-    assert_factual_consistency(output, context)
-```
-
-You can then run it in CLI using:
+```bash
+deepeval test generate --output-file test_sample.py
+```
+
+Once you have generated the test file, you can then run tests as shown below.
 
 ```bash
-deepeval test run sample.py
-# If you want to stay with pytest instead
+deepeval test run test_sample.py
 ```
+
+## About the sample test file
+
+The sample test file that you have generated is highly important.
+
diff --git a/docs/docs/quickstart/write_test_case.md b/docs/docs/quickstart/write_test_case.md
new file mode 100644
index 000000000..25d263fd9
--- /dev/null
+++ b/docs/docs/quickstart/write_test_case.md
@@ -0,0 +1,60 @@
+# Write a simple test case
+
+If you are interested in running a quick Colab example, you can [click here](https://colab.research.google.com/drive/1Lfq5geYsvfVoquDqv84UkWS57SdAHm30?usp=sharing).
+
+Writing a simple test case is as simple as:
+
+```bash
+deepeval test generate test_sample.py
+```
+
+```python
+import os
+import openai
+from deepeval.metrics.factual_consistency import assert_factual_consistency
+
+openai.api_key = "sk-XXX"
+
+# Write a sample ChatGPT function
+def generate_chatgpt_output(query: str):
+    response = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "assistant", "content": "The customer success phone line is 1200-231-231 and the customer success state is in Austin."},
+            {"role": "user", "content": query}
+        ]
+    )
+    llm_output = response.choices[0].message.content
+    return llm_output
+
+def test_factual_consistency():
+    query = "What is the customer success phone line?"
+    context = "Our customer success phone line is 1200-231-231."
+    output = generate_chatgpt_output(query)
+    assert_factual_consistency(output, context)
+
+# Just run the following code in Python if required
+test_factual_consistency()
+```
+
+### Running it with Pytest
+
+To run this with Pytest, put the test in a file such as `sample.py`:
+
+```python
+# sample.py
+from deepeval.metrics.factual_consistency import assert_factual_consistency
+
+# generate_chatgpt_output is the helper function defined in the snippet above
+
+def test_factual_consistency():
+    query = "What is the customer success phone line?"
+    context = "Our customer success phone line is 1200-231-231."
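+    # query the LLM app and check its output for factual consistency against the context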
+    output = generate_chatgpt_output(query)
+    assert_factual_consistency(output, context)
+```
+
+You can then run it from the CLI using:
+
+```bash
+deepeval test run sample.py
+# If you want to stay with pytest instead
+pytest sample.py
+```
diff --git a/docs/sidebars.js b/docs/sidebars.js
index d214c4746..6ae654e91 100644
--- a/docs/sidebars.js
+++ b/docs/sidebars.js
@@ -26,6 +26,7 @@ const sidebars = {
       label: 'QuickStart',
       items: [
         'quickstart/quickstart',
+        'quickstart/write_test_case',
         'quickstart/dataset',
         'quickstart/synthetic-data-creation',
         'quickstart/dashboard-app'

From 51dea385317aa974b679ae5d269e74e51a0abf55 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Thu, 7 Sep 2023 12:50:46 +1000
Subject: [PATCH 2/6] update quickstart

---
 deepeval/_version.py               |  2 +-
 deepeval/cli/test.py               |  6 +++---
 docs/docs/quickstart/quickstart.md | 19 ++++++++++++++++---
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/deepeval/_version.py b/deepeval/_version.py
index 0048970f0..dcc942c42 100644
--- a/deepeval/_version.py
+++ b/deepeval/_version.py
@@ -1 +1 @@
-__version__: str = "0.12.0"
+__version__: str = "0.12.1"
diff --git a/deepeval/cli/test.py b/deepeval/cli/test.py
index 8c25fc01e..dad245a5e 100644
--- a/deepeval/cli/test.py
+++ b/deepeval/cli/test.py
@@ -75,7 +75,7 @@ def check_if_legit_file(test_file: str):
 
 
 @app.command()
-def run(test_file_or_directory: str, exit_on_first_failure: bool = False):
+def run(test_file_or_directory: str, exit_on_first_failure: bool = False, **kwargs):
     """Run a test"""
     if test_file_or_directory == "sample":
         sample()
@@ -84,9 +84,9 @@ def run(test_file_or_directory: str, exit_on_first_failure: bool = False):
     )
     retcode = 0
     if exit_on_first_failure:
-        retcode = pytest.main(["-x", "-k", test_file_or_directory])
+        retcode = pytest.main(["-x", "-k", test_file_or_directory], **kwargs)
     else:
-        retcode = pytest.main(["-k", test_file_or_directory])
+        retcode = pytest.main(["-k", test_file_or_directory], **kwargs)
     print("✅ Tests finished! View results on https://app.confident-ai.com/")
     return retcode
 
diff --git a/docs/docs/quickstart/quickstart.md b/docs/docs/quickstart/quickstart.md
index be0814243..9447078fc 100644
--- a/docs/docs/quickstart/quickstart.md
+++ b/docs/docs/quickstart/quickstart.md
@@ -7,7 +7,7 @@ Note: this step is entirely optional if you do not wish to track your results, bu
 ```bash
 deepeval login
 
-# If you already have an API key
+# If you already have an API key and want to pass it in through the CLI
 deepeval login --api-key $API_KEY
 ```
 
@@ -23,7 +23,20 @@ Once you have generated the test file, you can then run tests as shown below.
 ```bash
 deepeval test run test_sample.py
 ```
 
-## About the sample test file
+## Diving Into The Example
 
-The sample test file that you have generated is highly important.
+The example shows what a sample test looks like. It uses `assert_overall_score` to ensure that the overall score exceeds a certain threshold. We recommend experimenting with different tests to ensure that your LLM works as intended across areas such as Bias, Answer Relevancy and Factual Consistency.
+
+```python
+from deepeval.metrics.overall_score import assert_overall_score
+
+
+def test_0():
+    query = "How does photosynthesis work?"
+    output = "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods with the help of chlorophyll pigment."
+    expected_output = "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize food with the help of chlorophyll pigment."
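+    # context here is just a short domain label for this example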
+    context = "Biology"
+
+    assert_overall_score(query, output, expected_output, context)
+```

From 56701dbc43e953102df9d7adcec01f7a2284b830 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Thu, 7 Sep 2023 13:03:18 +1000
Subject: [PATCH 3/6] update quickstart

---
 deepeval/cli/test.py               | 31 ++++++++++++++++++++----------
 docs/docs/quickstart/quickstart.md |  6 ++++++
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/deepeval/cli/test.py b/deepeval/cli/test.py
index dad245a5e..8fab19f9b 100644
--- a/deepeval/cli/test.py
+++ b/deepeval/cli/test.py
@@ -75,18 +75,29 @@ def check_if_legit_file(test_file: str):
 
 
 @app.command()
-def run(test_file_or_directory: str, exit_on_first_failure: bool = False, **kwargs):
+def run(
+    test_file_or_directory: str,
+    exit_on_first_failure: bool = False,
+    verbose: bool = False,
+    color: str = "yes",
+    durations: int = 10,
+    pdb: bool = False,
+):
     """Run a test"""
-    if test_file_or_directory == "sample":
-        sample()
-        print(
-            "You can generate a sample test using [bold]deepeval test generate[/bold]."
-        )
-    retcode = 0
+    pytest_args = ["-k", test_file_or_directory]
     if exit_on_first_failure:
-        retcode = pytest.main(["-x", "-k", test_file_or_directory], **kwargs)
-    else:
-        retcode = pytest.main(["-k", test_file_or_directory], **kwargs)
+        pytest_args.insert(0, "-x")
+    pytest_args.extend(
+        [
+            "--verbose" if verbose else "--quiet",
+            f"--color={color}",
+            f"--durations={durations}",
+            # f"--cov={cov}",
+            # f"--cov-report={cov_report}",
+            "--pdb" if pdb else "",
+        ]
+    )
+    retcode = pytest.main(pytest_args)
     print("✅ Tests finished! View results on https://app.confident-ai.com/")
     return retcode
 
diff --git a/docs/docs/quickstart/quickstart.md b/docs/docs/quickstart/quickstart.md
index 9447078fc..53701508d 100644
--- a/docs/docs/quickstart/quickstart.md
+++ b/docs/docs/quickstart/quickstart.md
@@ -21,12 +21,18 @@ Once you have generated the test file, you can then run tests as shown below.
 ```bash
 deepeval test run test_sample.py
+# to exit on the first failing test
+deepeval test run --exit-on-first-failure test_sample.py
+# If you want to run an interactive debugger when a test fails
+deepeval test run --pdb test_sample.py
 ```
 
 ## Diving Into The Example
 
 The example shows what a sample test looks like. It uses `assert_overall_score` to ensure that the overall score exceeds a certain threshold. We recommend experimenting with different tests to ensure that your LLM works as intended across areas such as Bias, Answer Relevancy and Factual Consistency.
 
+With the overall score, if you leave out `query` or `expected_output`, DeepEval will automatically run only the relevant tests.
+
 ```python
 from deepeval.metrics.overall_score import assert_overall_score

From a9888aae66d80fe94802020a5e22879fb5b9cd02 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Thu, 7 Sep 2023 13:07:49 +1000
Subject: [PATCH 4/6] update quickstart

---
 docs/docs/quickstart/quickstart.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/docs/docs/quickstart/quickstart.md b/docs/docs/quickstart/quickstart.md
index 53701508d..396d32e55 100644
--- a/docs/docs/quickstart/quickstart.md
+++ b/docs/docs/quickstart/quickstart.md
@@ -27,12 +27,24 @@ deepeval test run --pdb test_sample.py
 ```
 
+Under the hood, this triggers pytest and offers support for a number of pytest command-line options.
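+
+For example, the options that this patch adds to the `run` command (`--verbose`, `--color`, `--durations`, `--pdb`) map onto pytest's own flags; an illustrative sketch:
+
+```bash
+deepeval test run --verbose --durations 5 test_sample.py
+```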
+
+Similarly, you may also trigger `pytest` natively for these tests:
+
+```bash
+pytest test_sample.py
+```
+
+Once you run the tests, you should be able to see a dashboard similar to the one below.
+
+![Dashboard Example](../../assets/dashboard-screenshot.png)
+
 ## Diving Into The Example
 
 The example shows what a sample test looks like. It uses `assert_overall_score` to ensure that the overall score exceeds a certain threshold. We recommend experimenting with different tests to ensure that your LLM works as intended across areas such as Bias, Answer Relevancy and Factual Consistency.
 
 With the overall score, if you leave out `query` or `expected_output`, DeepEval will automatically run only the relevant tests.
 
+For these tests, the function name needs a `test_` prefix for it to be picked up and run.
+
 ```python
 from deepeval.metrics.overall_score import assert_overall_score
 
 
 def test_0():
     query = "How does photosynthesis work?"
     output = "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods with the help of chlorophyll pigment."
     expected_output = "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize food with the help of chlorophyll pigment."
     # context here is just a short domain label for this example
     context = "Biology"
 
     assert_overall_score(query, output, expected_output, context)
 ```
+
+## What next?
+
+We recommend diving into ![creating a dataset](../quickstart/dataset.md) or ![defining custom metrics](../quickstart/custom-metrics.md) next.
+

From ed24b0476cb032439f4885071f2a41a254eeef27 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Thu, 7 Sep 2023 13:10:45 +1000
Subject: [PATCH 5/6] add quickstart

---
 docs/docs/quickstart/quickstart.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs/quickstart/quickstart.md b/docs/docs/quickstart/quickstart.md
index 396d32e55..6e7aa432c 100644
--- a/docs/docs/quickstart/quickstart.md
+++ b/docs/docs/quickstart/quickstart.md
@@ -60,5 +60,5 @@ def test_0():
 
 ## What next?
 
-We recommend diving into ![creating a dataset](../quickstart/dataset.md) or ![defining custom metrics](../quickstart/custom-metrics.md) next.
+We recommend diving into [creating a dataset](dataset) or [defining custom metrics](../quickstart/custom-metrics) next.

From 7994eeef31f267bda92e2b27fe0b7165d4650e2f Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Thu, 7 Sep 2023 13:12:32 +1000
Subject: [PATCH 6/6] update quickstart

---
 docs/docs/quickstart/quickstart.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs/quickstart/quickstart.md b/docs/docs/quickstart/quickstart.md
index 6e7aa432c..a8c5ff8cc 100644
--- a/docs/docs/quickstart/quickstart.md
+++ b/docs/docs/quickstart/quickstart.md
@@ -60,5 +60,5 @@ def test_0():
 
 ## What next?
 
-We recommend diving into [creating a dataset](dataset) or [defining custom metrics](../quickstart/custom-metrics) next.
+We recommend diving into [creating a dataset](dataset) to learn how to run tests in bulk, or [defining custom metrics](../quickstart/custom-metrics) so you can write custom tests and metrics for your own use cases.