From dbf92421ce2bac9eae695ba1589d4c387a534841 Mon Sep 17 00:00:00 2001
From: penguine-up
Date: Fri, 1 Sep 2023 02:04:17 +1000
Subject: [PATCH 1/9] gpt synthetic data

---
 deepeval/dataset.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/deepeval/dataset.py b/deepeval/dataset.py
index eb2774ae0..bd4ee1e1c 100644
--- a/deepeval/dataset.py
+++ b/deepeval/dataset.py
@@ -10,6 +10,7 @@
 from .metrics.metric import Metric
 from .query_generator import BEIRQueryGenerator
 from .retry import retry
+import openai
 
 
 class EvaluationDataset(UserList):
@@ -237,7 +238,6 @@ def create_evaluation_dataset_from_raw_text(text: str, output_fn: str = "output.
 
     # NOTE: loading this may take a while as the model used is quite big
     gen = BEIRQueryGenerator()
-    text = "Synthetic queries are useful for scenraios where there is no data."
     queries = gen.generate_queries(texts=[text], num_queries=2)
     test_cases = []
     with open(output_fn, "w") as f:
@@ -249,3 +249,29 @@ def create_evaluation_dataset_from_raw_text(text: str, output_fn: str = "output.
 
     dataset = EvaluationDataset(test_cases=test_cases)
     return dataset
+
+
+def create_evaluation_query_output_pairs(text: str, n:int = 3):
+    """Utility function to create an evaluation dataset using GPT."""
+
+    prompt = f"""
+    Please generate {n} queries that may lead to the following text as an output. Ensure diversity in the generated queries.
+
+    {text}
+    """
+
+    queries = openai.Completion.create(
+        engine="gpt-3.5-turbo",
+        prompt=prompt,
+        temperature=0,
+    )
+
+    test_cases = []
+    for query_choice in queries.choices:
+        generated_query = query_choice.message['content']
+        test_case = TestCase(query=generated_query, expected_output=text)
+        test_cases.append(test_case)
+
+    dataset = EvaluationDataset(test_cases=test_cases)
+    return dataset
+
+

From 7502bcdd24f5379a51b94c56c627244b7861b5ce Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 16:59:54 +1000
Subject: [PATCH 2/9] fix synthetic data

---
 deepeval/dataset.py | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/deepeval/dataset.py b/deepeval/dataset.py
index bd4ee1e1c..70f05d365 100644
--- a/deepeval/dataset.py
+++ b/deepeval/dataset.py
@@ -10,7 +10,6 @@
 from .metrics.metric import Metric
 from .query_generator import BEIRQueryGenerator
 from .retry import retry
-import openai
 
 
 class EvaluationDataset(UserList):
@@ -250,28 +249,34 @@ def create_evaluation_dataset_from_raw_text(text: str, output_fn: str = "output.
     dataset = EvaluationDataset(test_cases=test_cases)
     return dataset
 
-def create_evaluation_query_output_pairs(text: str, n:int = 3):
+
+def create_evaluation_query_output_pairs(
+    openai_api_key: str,
+    text: str,
+    n: int = 3,
+    temperature: float = 0.2,
+):
     """Utility function to create an evaluation dataset using GPT."""
+    import openai
 
-    prompt = f"""
-    Please generate {n} queries that may lead to the following text as an output. Ensure diversity in the generated queries.
+    openai.api_key = openai_api_key
+    prompt = f"""Please generate {n} queries that may lead to the following text as an output.
+Add diversity in the generated queries.
 
-    {text}
-    """
+{text}"""
 
     queries = openai.Completion.create(
         engine="gpt-3.5-turbo",
         prompt=prompt,
-        temperature=0,
+        # Minimize creativity to avoid hallucination
+        temperature=temperature,
     )
 
     test_cases = []
     for query_choice in queries.choices:
-        generated_query = query_choice.message['content']
+        generated_query = query_choice.message["content"]
         test_case = TestCase(query=generated_query, expected_output=text)
         test_cases.append(test_case)
-
+
     dataset = EvaluationDataset(test_cases=test_cases)
     return dataset
-
-
From 748b6c53110fe5d3f3386eee7aacf4233ae3ebdc Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 18:46:13 +1000
Subject: [PATCH 3/9] add synthetic data pair functionality

---
 deepeval/dataset.py   | 70 +++++++++++++++++++++++++++++++------------
 tests/test_dataset.py | 19 ++++++++++++
 2 files changed, 70 insertions(+), 19 deletions(-)

diff --git a/deepeval/dataset.py b/deepeval/dataset.py
index 70f05d365..edcee0056 100644
--- a/deepeval/dataset.py
+++ b/deepeval/dataset.py
@@ -2,6 +2,7 @@
 """
 import json
 import random
+import time
 from tabulate import tabulate
 from datetime import datetime
 from typing import List, Callable
@@ -250,32 +251,63 @@ def create_evaluation_dataset_from_raw_text(text: str, output_fn: str = "output.
     return dataset
 
 
-def create_evaluation_query_output_pairs(
-    openai_api_key: str,
-    text: str,
-    n: int = 3,
-    temperature: float = 0.2,
-):
-    """Utility function to create an evaluation dataset using GPT."""
+def make_chat_completion_request(prompt: str, openai_api_key: str):
     import openai
 
     openai.api_key = openai_api_key
-    prompt = f"""Please generate {n} queries that may lead to the following text as an output.
-Add diversity in the generated queries.
+    response = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt},
+        ],
+    )
+    return response.choices[0].message.content
 
-{text}"""
+
+def generate_chatgpt_output(prompt: str, openai_api_key: str) -> str:
+    max_retries = 3
+    retry_delay = 1
+    for attempt in range(max_retries):
+        try:
+            expected_output = make_chat_completion_request(
+                prompt=prompt, openai_api_key=openai_api_key
+            )
+            break
+        except Exception as e:
+            print(f"Error occurred: {e}")
+            if attempt < max_retries - 1:
+                print(f"Retrying in {retry_delay} seconds...")
+                time.sleep(retry_delay)
+                retry_delay *= 2
+            else:
+                raise
+
+    return expected_output
+
+
-    queries = openai.Completion.create(
-        engine="gpt-3.5-turbo",
-        prompt=prompt,
-        # Minimize creativity to avoid hallucination
-        temperature=temperature,
-    )
+def create_evaluation_query_output_pairs(
+    openai_api_key: str, context: str, n: int = 3, model: str = "openai/gpt-3.5-turbo"
+) -> EvaluationDataset:
+    """Utility function to create an evaluation dataset using GPT."""
+    prompt = f"""You are generating {n} sets of of query-answer pairs to create an evaluation dataset based on the below context.
+Context: {context}
+
+Respond in JSON format in 1 single line without white spaces an array of JSON with the keys `query` and `answer`.
+"""
+    for _ in range(3):
+        try:
+            responses = generate_chatgpt_output(prompt, openai_api_key=openai_api_key)
+            responses = json.loads(responses)
+            break
+        except Exception as e:
+            return EvaluationDataset(test_cases=[])
 
     test_cases = []
-    for query_choice in queries.choices:
-        generated_query = query_choice.message["content"]
+    for response in responses:
+        test_case = TestCase(
+            query=response["query"], expected_output=response["answer"]
+        )
         test_cases.append(test_case)
 
     dataset = EvaluationDataset(test_cases=test_cases)
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index c4afd923b..fd3e54359 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -1,3 +1,7 @@
+import pytest
+import os
+
+
 def test_evaluation_dataset():
     from deepeval.dataset import EvaluationDataset
 
@@ -20,3 +24,18 @@ def test_evaluation_dataset():
         expected_output_column="expected_output",
         id_column="id",
     )
+
+
+@pytest.mark.skip(reason="OpenAI costs")
+def test_create_synthetic_dataset():
+    """
+    test for creating a synthetic dataset
+    """
+    from deepeval.dataset import create_evaluation_query_output_pairs
+
+    dataset = create_evaluation_query_output_pairs(
+        openai_api_key=os.environ["OPENAI_API_KEY"],
+        context="FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.",
+        n=1,
+    )
+    assert len(dataset) == 1
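A note on the prompt introduced in the hunk above: `create_evaluation_query_output_pairs` asks the model for a single-line JSON array with `query` and `answer` keys and passes the raw reply straight to `json.loads`. A minimal sketch of the shape it expects, using invented sample values:

```python
import json

# Invented example of a reply in the shape the prompt requests: one line,
# a JSON array of objects keyed by `query` and `answer`.
sample_response = (
    '[{"query": "What is FastAPI?", '
    '"answer": "FastAPI is a web framework for building APIs with Python."}]'
)

for pair in json.loads(sample_response):
    # Each pair maps onto TestCase(query=..., expected_output=...) in the patch above.
    print(pair["query"], "->", pair["answer"])
```

A reply that does not parse as JSON raises inside `json.loads` and is caught by the `except` branch above, which returns an empty `EvaluationDataset`.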
From 45c4a6db9766c0f236c94dbbfd2b37fb4be668ea Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 19:03:29 +1000
Subject: [PATCH 4/9] add synthetic data creation

---
 docs/docs/quickstart/synthetic-data-creation.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/docs/docs/quickstart/synthetic-data-creation.md b/docs/docs/quickstart/synthetic-data-creation.md
index 5d1cd572f..daa6c3e76 100644
--- a/docs/docs/quickstart/synthetic-data-creation.md
+++ b/docs/docs/quickstart/synthetic-data-creation.md
@@ -5,17 +5,15 @@
 - When there isn't much data or any data to start with for evaluating langchain pipelines
 - When getting an eyeball check of current performance is done very quickly
 
-![Synthetic Queries](../../assets/synthetic-query-generation.png)
-
 Generating synthetic queries allows you to quickly evaluate the queries related to your prompts.
-We help developers get up and running with example queries from just raw text.
+
+We help developers get up and running with example queries from just raw text based on OpenAI's model. In this model, we generate query-answer pairs based on teh text.
 
 ```python
 # Loads the synthetic query model to generate them based on data you get.
 # These automatically create synthetic queries and adds them to our online database
-from deepeval.dataset import create_evaluation_dataset_from_raw_text
-
-dataset = create_evaluation_dataset_from_raw_text("Python is a great language for mathematical expression and machine learning.")
+from deepeval.dataset import create_evaluation_query_output_pairs
+dataset = create_evaluation_query_output_pairs("Python is a great language for mathematical expression and machine learning.")
 ```
 
 ## Running test cases.
 
 Once you have defined a number of test cases, you can easily run it in bulk if required.
 
 ```python
 # test_bulk_runner.py
+
+def generate_llm_output(query: str) -> str:
+    return "sample output"
+
 # Run an evaluation as you would any normal evaluation.
 dataset.run_evaluation(completion_fn=generate_llm_output)
 ```

From 9db27b23dc5c63557db3fa843c74d0c5a8e73963 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 19:04:54 +1000
Subject: [PATCH 5/9] update synthetic data creation

---
 deepeval/_version.py                            | 2 +-
 docs/docs/quickstart/synthetic-data-creation.md | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/deepeval/_version.py b/deepeval/_version.py
index d1872cd2c..acdeaf6f4 100644
--- a/deepeval/_version.py
+++ b/deepeval/_version.py
@@ -1 +1 @@
-__version__: str = "0.10.11"
+__version__: str = "0.11.0"
diff --git a/docs/docs/quickstart/synthetic-data-creation.md b/docs/docs/quickstart/synthetic-data-creation.md
index daa6c3e76..e01d05be9 100644
--- a/docs/docs/quickstart/synthetic-data-creation.md
+++ b/docs/docs/quickstart/synthetic-data-creation.md
@@ -16,6 +16,12 @@ from deepeval.dataset import create_evaluation_query_output_pairs
 dataset = create_evaluation_query_output_pairs("Python is a great language for mathematical expression and machine learning.")
 ```
 
+Once you have created your evaluation dataset, we recommend saving it.
+
+```python
+dataset.to_csv("sample.csv")
+```
+
 ## Running test cases.

From 1a6f1ccbe0f4c1b9bc099a85261a3e47f5c880c5 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 19:06:49 +1000
Subject: [PATCH 6/9] add a way to update dataset

---
 tests/test_dataset.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index fd3e54359..4a1777fdf 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -24,6 +24,7 @@ def test_evaluation_dataset():
         expected_output_column="expected_output",
         id_column="id",
     )
+    assert len(dataset) == 5
 
 
 @pytest.mark.skip(reason="OpenAI costs")

From 0a9ecd5179d7c900b9c1c923ee2a36eb41c9b560 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 19:23:16 +1000
Subject: [PATCH 7/9] fix data creation

---
 docs/docs/quickstart/synthetic-data-creation.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/docs/quickstart/synthetic-data-creation.md b/docs/docs/quickstart/synthetic-data-creation.md
index e01d05be9..b8be3bc4d 100644
--- a/docs/docs/quickstart/synthetic-data-creation.md
+++ b/docs/docs/quickstart/synthetic-data-creation.md
@@ -1,6 +1,6 @@
-# Create Synthetic Data
+# Auto-Evaluation
 
-## Problem synthetic data creation solves
+Auto-evaluation is useful:
 
 - When there isn't much data or any data to start with for evaluating langchain pipelines
 - When getting an eyeball check of current performance is done very quickly

From 988eb7f52597e7e06cfc2346e6fcb3960bdfbb30 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 19:24:27 +1000
Subject: [PATCH 8/9] update synthetic data notes

---
 docs/docs/quickstart/synthetic-data-creation.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/docs/quickstart/synthetic-data-creation.md b/docs/docs/quickstart/synthetic-data-creation.md
index b8be3bc4d..551aec317 100644
--- a/docs/docs/quickstart/synthetic-data-creation.md
+++ b/docs/docs/quickstart/synthetic-data-creation.md
@@ -22,9 +22,11 @@ Once you have created your evaluation dataset, we recommend saving it.
 dataset.to_csv("sample.csv")
 ```
 
-## Running test cases.
+## Running tests/evaluation
 
-Once you have defined a number of test cases, you can easily run it in bulk if required.
+Once you have defined a number of test cases, you can easily run it in bulk if required.
+
+If there are errors - this function will error.
 
 ```python
 # test_bulk_runner.py

From 729c7697ecb9251609b8a140febb343a0603a973 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 19:25:21 +1000
Subject: [PATCH 9/9] add synthetic data creation

---
 docs/docs/quickstart/synthetic-data-creation.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/docs/quickstart/synthetic-data-creation.md b/docs/docs/quickstart/synthetic-data-creation.md
index 551aec317..7f2f67788 100644
--- a/docs/docs/quickstart/synthetic-data-creation.md
+++ b/docs/docs/quickstart/synthetic-data-creation.md
@@ -1,13 +1,15 @@
 # Auto-Evaluation
 
+## Introduction
+
 Auto-evaluation is useful:
 
 - When there isn't much data or any data to start with for evaluating langchain pipelines
-- When getting an eyeball check of current performance is done very quickly
+- When getting an eyeball check of current performance is required to be done very quickly
 
 Generating synthetic queries allows you to quickly evaluate the queries related to your prompts.
 
-We help developers get up and running with example queries from just raw text based on OpenAI's model. In this model, we generate query-answer pairs based on teh text.
+We help developers get up and running with example queries from just raw text based on OpenAI's model. In this model, we generate query-answer pairs based on the text.
 
 ```python
 # Loads the synthetic query model to generate them based on data you get.
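Taken together, the series leaves the documented workflow looking roughly like the sketch below. It is assembled from the snippets added across patches 3 to 9 and is illustrative only; note that the `create_evaluation_query_output_pairs` signature added to `deepeval/dataset.py` expects the OpenAI API key and a `context` keyword argument, which the shorter docs example omits.

```python
import os

from deepeval.dataset import create_evaluation_query_output_pairs

# Generate query-answer pairs from a raw-text context (keyword arguments follow
# the signature added in patch 3; the context string is illustrative).
dataset = create_evaluation_query_output_pairs(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    context="Python is a great language for mathematical expression and machine learning.",
    n=3,
)

# Save the generated dataset, as the docs added in patch 5 recommend.
dataset.to_csv("sample.csv")


def generate_llm_output(query: str) -> str:
    # Stand-in for the LLM or langchain pipeline under test, as in the docs example.
    return "sample output"


# Run the generated test cases in bulk (docs added in patch 4).
dataset.run_evaluation(completion_fn=generate_llm_output)
```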