From dbf92421ce2bac9eae695ba1589d4c387a534841 Mon Sep 17 00:00:00 2001
From: penguine-up
Date: Fri, 1 Sep 2023 02:04:17 +1000
Subject: [PATCH 1/9] gpt synthetic data

---
 deepeval/dataset.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/deepeval/dataset.py b/deepeval/dataset.py
index eb2774ae0..bd4ee1e1c 100644
--- a/deepeval/dataset.py
+++ b/deepeval/dataset.py
@@ -10,6 +10,7 @@
 from .metrics.metric import Metric
 from .query_generator import BEIRQueryGenerator
 from .retry import retry
+import openai
 
 
 class EvaluationDataset(UserList):
@@ -237,7 +238,6 @@ def create_evaluation_dataset_from_raw_text(text: str, output_fn: str = "output.
 
     # NOTE: loading this may take a while as the model used is quite big
     gen = BEIRQueryGenerator()
-    text = "Synthetic queries are useful for scenraios where there is no data."
     queries = gen.generate_queries(texts=[text], num_queries=2)
     test_cases = []
     with open(output_fn, "w") as f:
@@ -249,3 +249,29 @@ def create_evaluation_dataset_from_raw_text(text: str, output_fn: str = "output.
 
     dataset = EvaluationDataset(test_cases=test_cases)
     return dataset
+
+
+def create_evaluation_query_output_pairs(text: str, n:int = 3):
+    """Utility function to create an evaluation dataset using GPT."""
+
+    prompt = f"""
+    Please generate {n} queries that may lead to the following text as an output. Ensure diversity in the generated queries.
+
+    {text}
+    """
+
+    queries = openai.Completion.create(
+        engine="gpt-3.5-turbo",
+        prompt=prompt,
+        temperature=0,
+    )
+
+    test_cases = []
+    for query_choice in queries.choices:
+        generated_query = query_choice.message['content']
+        test_case = TestCase(query=generated_query, expected_output=text)
+        test_cases.append(test_case)
+
+    dataset = EvaluationDataset(test_cases=test_cases)
+    return dataset
+
+

From 7502bcdd24f5379a51b94c56c627244b7861b5ce Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 16:59:54 +1000
Subject: [PATCH 2/9] fix synthetic data

---
 deepeval/dataset.py | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/deepeval/dataset.py b/deepeval/dataset.py
index bd4ee1e1c..70f05d365 100644
--- a/deepeval/dataset.py
+++ b/deepeval/dataset.py
@@ -10,7 +10,6 @@
 from .metrics.metric import Metric
 from .query_generator import BEIRQueryGenerator
 from .retry import retry
-import openai
 
 
 class EvaluationDataset(UserList):
@@ -250,28 +249,34 @@ def create_evaluation_dataset_from_raw_text(text: str, output_fn: str = "output.
     dataset = EvaluationDataset(test_cases=test_cases)
     return dataset
 
-def create_evaluation_query_output_pairs(text: str, n:int = 3):
+
+def create_evaluation_query_output_pairs(
+    openai_api_key: str,
+    text: str,
+    n: int = 3,
+    temperature: float = 0.2,
+):
     """Utility function to create an evaluation dataset using GPT."""
+    import openai
 
-    prompt = f"""
-    Please generate {n} queries that may lead to the following text as an output. Ensure diversity in the generated queries.
+    openai.api_key = openai_api_key
+    prompt = f"""Please generate {n} queries that may lead to the following text as an output.
+Add diversity in the generated queries.
 
-    {text}
-    """
+{text}"""
 
     queries = openai.Completion.create(
         engine="gpt-3.5-turbo",
         prompt=prompt,
-        temperature=0,
+        # Minimize creativity to avoid hallucination
+        temperature=temperature,
     )
 
     test_cases = []
     for query_choice in queries.choices:
-        generated_query = query_choice.message['content']
+        generated_query = query_choice.message["content"]
         test_case = TestCase(query=generated_query, expected_output=text)
         test_cases.append(test_case)
-
+
     dataset = EvaluationDataset(test_cases=test_cases)
     return dataset
-
-
From 748b6c53110fe5d3f3386eee7aacf4233ae3ebdc Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 18:46:13 +1000
Subject: [PATCH 3/9] add synthetic data pair functionality

---
 deepeval/dataset.py   | 70 +++++++++++++++++++++++++++++++------------
 tests/test_dataset.py | 19 ++++++++++++
 2 files changed, 70 insertions(+), 19 deletions(-)

diff --git a/deepeval/dataset.py b/deepeval/dataset.py
index 70f05d365..edcee0056 100644
--- a/deepeval/dataset.py
+++ b/deepeval/dataset.py
@@ -2,6 +2,7 @@
 """
 import json
 import random
+import time
 from tabulate import tabulate
 from datetime import datetime
 from typing import List, Callable
@@ -250,32 +251,63 @@ def create_evaluation_dataset_from_raw_text(text: str, output_fn: str = "output.
     return dataset
 
 
-def create_evaluation_query_output_pairs(
-    openai_api_key: str,
-    text: str,
-    n: int = 3,
-    temperature: float = 0.2,
-):
-    """Utility function to create an evaluation dataset using GPT."""
+def make_chat_completion_request(prompt: str, openai_api_key: str):
     import openai
 
     openai.api_key = openai_api_key
-    prompt = f"""Please generate {n} queries that may lead to the following text as an output.
-Add diversity in the generated queries.
+    response = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt},
+        ],
+    )
+    return response.choices[0].message.content
 
-{text}"""
+
+def generate_chatgpt_output(prompt: str, openai_api_key: str) -> str:
+    max_retries = 3
+    retry_delay = 1
+    for attempt in range(max_retries):
+        try:
+            expected_output = make_chat_completion_request(
+                prompt=prompt, openai_api_key=openai_api_key
+            )
+            break
+        except Exception as e:
+            print(f"Error occurred: {e}")
+            if attempt < max_retries - 1:
+                print(f"Retrying in {retry_delay} seconds...")
+                time.sleep(retry_delay)
+                retry_delay *= 2
+            else:
+                raise
+
+    return expected_output
+
+
-    queries = openai.Completion.create(
-        engine="gpt-3.5-turbo",
-        prompt=prompt,
-        # Minimize creativity to avoid hallucination
-        temperature=temperature,
-    )
+def create_evaluation_query_output_pairs(
+    openai_api_key: str, context: str, n: int = 3, model: str = "openai/gpt-3.5-turbo"
+) -> EvaluationDataset:
+    """Utility function to create an evaluation dataset using GPT."""
+    prompt = f"""You are generating {n} sets of of query-answer pairs to create an evaluation dataset based on the below context.
+Context: {context}
+
+Respond in JSON format in 1 single line without white spaces an array of JSON with the keys `query` and `answer`.
+"""
+    for _ in range(3):
+        try:
+            responses = generate_chatgpt_output(prompt, openai_api_key=openai_api_key)
+            responses = json.loads(responses)
+            break
+        except Exception as e:
+            return EvaluationDataset(test_cases=[])
 
     test_cases = []
-    for query_choice in queries.choices:
-        generated_query = query_choice.message["content"]
+    for response in responses:
+        test_case = TestCase(
+            query=response["query"], expected_output=response["answer"]
+        )
         test_cases.append(test_case)
 
     dataset = EvaluationDataset(test_cases=test_cases)
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index c4afd923b..fd3e54359 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -1,3 +1,7 @@
+import pytest
+import os
+
+
 def test_evaluation_dataset():
     from deepeval.dataset import EvaluationDataset
 
@@ -20,3 +24,18 @@ def test_evaluation_dataset():
         expected_output_column="expected_output",
         id_column="id",
     )
+
+
+@pytest.mark.skip(reason="OpenAI costs")
+def test_create_synthetic_dataset():
+    """
+    test for creating a synthetic dataset
+    """
+    from deepeval.dataset import create_evaluation_query_output_pairs
+
+    dataset = create_evaluation_query_output_pairs(
+        openai_api_key=os.environ["OPENAI_API_KEY"],
+        context="FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.",
+        n=1,
+    )
+    assert len(dataset) == 1
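A note on the prompt introduced in the hunk above: `create_evaluation_query_output_pairs` asks the model for a single-line JSON array with `query` and `answer` keys and passes the raw reply straight to `json.loads`. A minimal sketch of the shape it expects, using invented sample values:

```python
import json

# Invented example of a reply in the shape the prompt requests: one line,
# a JSON array of objects keyed by `query` and `answer`.
sample_response = (
    '[{"query": "What is FastAPI?", '
    '"answer": "FastAPI is a web framework for building APIs with Python."}]'
)

for pair in json.loads(sample_response):
    # Each pair maps onto TestCase(query=..., expected_output=...) in the patch above.
    print(pair["query"], "->", pair["answer"])
```

A reply that does not parse as JSON raises inside `json.loads` and is caught by the `except` branch above, which returns an empty `EvaluationDataset`.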
From 45c4a6db9766c0f236c94dbbfd2b37fb4be668ea Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 19:03:29 +1000
Subject: [PATCH 4/9] add synthetic data creation

---
 docs/docs/quickstart/synthetic-data-creation.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/docs/docs/quickstart/synthetic-data-creation.md b/docs/docs/quickstart/synthetic-data-creation.md
index 5d1cd572f..daa6c3e76 100644
--- a/docs/docs/quickstart/synthetic-data-creation.md
+++ b/docs/docs/quickstart/synthetic-data-creation.md
@@ -5,17 +5,15 @@
 - When there isn't much data or any data to start with for evaluating langchain pipelines
 - When getting an eyeball check of current performance is done very quickly
 
-![Synthetic Queries](../../assets/synthetic-query-generation.png)
-
 Generating synthetic queries allows you to quickly evaluate the queries related to your prompts.
-We help developers get up and running with example queries from just raw text.
+
+We help developers get up and running with example queries from just raw text based on OpenAI's model. In this model, we generate query-answer pairs based on teh text.
 
 ```python
 # Loads the synthetic query model to generate them based on data you get.
 # These automatically create synthetic queries and adds them to our online database
-from deepeval.dataset import create_evaluation_dataset_from_raw_text
-
-dataset = create_evaluation_dataset_from_raw_text("Python is a great language for mathematical expression and machine learning.")
+from deepeval.dataset import create_evaluation_query_output_pairs
+dataset = create_evaluation_query_output_pairs("Python is a great language for mathematical expression and machine learning.")
 ```
 
 ## Running test cases.
 
 Once you have defined a number of test cases, you can easily run it in bulk if required.
 
 ```python
 # test_bulk_runner.py
+
+def generate_llm_output(query: str) -> str:
+    return "sample output"
+
 # Run an evaluation as you would any normal evaluation.
 dataset.run_evaluation(completion_fn=generate_llm_output)
 ```

From 9db27b23dc5c63557db3fa843c74d0c5a8e73963 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 19:04:54 +1000
Subject: [PATCH 5/9] update synthetic data creation

---
 deepeval/_version.py                            | 2 +-
 docs/docs/quickstart/synthetic-data-creation.md | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/deepeval/_version.py b/deepeval/_version.py
index d1872cd2c..acdeaf6f4 100644
--- a/deepeval/_version.py
+++ b/deepeval/_version.py
@@ -1 +1 @@
-__version__: str = "0.10.11"
+__version__: str = "0.11.0"
diff --git a/docs/docs/quickstart/synthetic-data-creation.md b/docs/docs/quickstart/synthetic-data-creation.md
index daa6c3e76..e01d05be9 100644
--- a/docs/docs/quickstart/synthetic-data-creation.md
+++ b/docs/docs/quickstart/synthetic-data-creation.md
@@ -16,6 +16,12 @@ from deepeval.dataset import create_evaluation_query_output_pairs
 dataset = create_evaluation_query_output_pairs("Python is a great language for mathematical expression and machine learning.")
 ```
 
+Once you have created your evaluation dataset, we recommend saving it.
+
+```python
+dataset.to_csv("sample.csv")
+```
+
 ## Running test cases.

From 1a6f1ccbe0f4c1b9bc099a85261a3e47f5c880c5 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 19:06:49 +1000
Subject: [PATCH 6/9] add a way to update dataset

---
 tests/test_dataset.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index fd3e54359..4a1777fdf 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -24,6 +24,7 @@ def test_evaluation_dataset():
         expected_output_column="expected_output",
         id_column="id",
     )
+    assert len(dataset) == 5
 
 
 @pytest.mark.skip(reason="OpenAI costs")

From 0a9ecd5179d7c900b9c1c923ee2a36eb41c9b560 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 19:23:16 +1000
Subject: [PATCH 7/9] fix data creation

---
 docs/docs/quickstart/synthetic-data-creation.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/docs/quickstart/synthetic-data-creation.md b/docs/docs/quickstart/synthetic-data-creation.md
index e01d05be9..b8be3bc4d 100644
--- a/docs/docs/quickstart/synthetic-data-creation.md
+++ b/docs/docs/quickstart/synthetic-data-creation.md
@@ -1,6 +1,6 @@
-# Create Synthetic Data
+# Auto-Evaluation
 
-## Problem synthetic data creation solves
+Auto-evaluation is useful:
 
 - When there isn't much data or any data to start with for evaluating langchain pipelines
 - When getting an eyeball check of current performance is done very quickly

From 988eb7f52597e7e06cfc2346e6fcb3960bdfbb30 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 19:24:27 +1000
Subject: [PATCH 8/9] update synthetic data notes

---
 docs/docs/quickstart/synthetic-data-creation.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/docs/quickstart/synthetic-data-creation.md b/docs/docs/quickstart/synthetic-data-creation.md
index b8be3bc4d..551aec317 100644
--- a/docs/docs/quickstart/synthetic-data-creation.md
+++ b/docs/docs/quickstart/synthetic-data-creation.md
@@ -22,9 +22,11 @@ Once you have created your evaluation dataset, we recommend saving it.
 dataset.to_csv("sample.csv")
 ```
 
-## Running test cases.
+## Running tests/evaluation
 
-Once you have defined a number of test cases, you can easily run it in bulk if required.
+Once you have defined a number of test cases, you can easily run it in bulk if required.
+
+If there are errors - this function will error.
 
 ```python
 # test_bulk_runner.py

From 729c7697ecb9251609b8a140febb343a0603a973 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 1 Sep 2023 19:25:21 +1000
Subject: [PATCH 9/9] add synthetic data creation

---
 docs/docs/quickstart/synthetic-data-creation.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/docs/quickstart/synthetic-data-creation.md b/docs/docs/quickstart/synthetic-data-creation.md
index 551aec317..7f2f67788 100644
--- a/docs/docs/quickstart/synthetic-data-creation.md
+++ b/docs/docs/quickstart/synthetic-data-creation.md
@@ -1,13 +1,15 @@
 # Auto-Evaluation
 
+## Introduction
+
 Auto-evaluation is useful:
 
 - When there isn't much data or any data to start with for evaluating langchain pipelines
-- When getting an eyeball check of current performance is done very quickly
+- When getting an eyeball check of current performance is required to be done very quickly
 
 Generating synthetic queries allows you to quickly evaluate the queries related to your prompts.
 
-We help developers get up and running with example queries from just raw text based on OpenAI's model. In this model, we generate query-answer pairs based on teh text.
+We help developers get up and running with example queries from just raw text based on OpenAI's model. In this model, we generate query-answer pairs based on the text.
 
 ```python
 # Loads the synthetic query model to generate them based on data you get.
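Taken together, the series leaves the documented workflow looking roughly like the sketch below. It is assembled from the snippets added across patches 3 to 9 and is illustrative only; note that the `create_evaluation_query_output_pairs` signature added to `deepeval/dataset.py` expects the OpenAI API key and a `context` keyword argument, which the shorter docs example omits.

```python
import os

from deepeval.dataset import create_evaluation_query_output_pairs

# Generate query-answer pairs from a raw-text context (keyword arguments follow
# the signature added in patch 3; the context string is illustrative).
dataset = create_evaluation_query_output_pairs(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    context="Python is a great language for mathematical expression and machine learning.",
    n=3,
)

# Save the generated dataset, as the docs added in patch 5 recommend.
dataset.to_csv("sample.csv")


def generate_llm_output(query: str) -> str:
    # Stand-in for the LLM or langchain pipeline under test, as in the docs example.
    return "sample output"


# Run the generated test cases in bulk (docs added in patch 4).
dataset.run_evaluation(completion_fn=generate_llm_output)
```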