Add gpt synthetic data #74

Merged · 10 commits · Sep 1, 2023

2 changes: 1 addition & 1 deletion deepeval/_version.py
@@ -1 +1 @@
-__version__: str = "0.10.13"
+__version__: str = "0.11.0"

65 changes: 64 additions & 1 deletion deepeval/dataset.py
@@ -2,6 +2,7 @@
 """
 import json
 import random
+import time
 from tabulate import tabulate
 from datetime import datetime
 from typing import List, Callable
@@ -237,7 +238,6 @@ def create_evaluation_dataset_from_raw_text(text: str, output_fn: str = "output.
 
     # NOTE: loading this may take a while as the model used is quite big
     gen = BEIRQueryGenerator()
-    text = "Synthetic queries are useful for scenraios where there is no data."
     queries = gen.generate_queries(texts=[text], num_queries=2)
     test_cases = []
     with open(output_fn, "w") as f:
@@ -249,3 +249,66 @@ def create_evaluation_dataset_from_raw_text(text: str, output_fn: str = "output.
 
     dataset = EvaluationDataset(test_cases=test_cases)
     return dataset
+
+
+def make_chat_completion_request(prompt: str, openai_api_key: str):
+    import openai
+
+    openai.api_key = openai_api_key
+    response = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt},
+        ],
+    )
+    return response.choices[0].message.content
+
+
+def generate_chatgpt_output(prompt: str, openai_api_key: str) -> str:
+    # Retry up to 3 times with exponential backoff (1s, then 2s between attempts).
+    max_retries = 3
+    retry_delay = 1
+    for attempt in range(max_retries):
+        try:
+            expected_output = make_chat_completion_request(
+                prompt=prompt, openai_api_key=openai_api_key
+            )
+            break
+        except Exception as e:
+            print(f"Error occurred: {e}")
+            if attempt < max_retries - 1:
+                print(f"Retrying in {retry_delay} seconds...")
+                time.sleep(retry_delay)
+                retry_delay *= 2
+            else:
+                raise
+
+    return expected_output
+
+
+def create_evaluation_query_output_pairs(
+    openai_api_key: str, context: str, n: int = 3, model: str = "openai/gpt-3.5-turbo"
+) -> EvaluationDataset:
+    """Utility function to create an evaluation dataset using GPT."""
+    # NOTE: `model` is currently unused; make_chat_completion_request hardcodes gpt-3.5-turbo.
+    prompt = f"""You are generating {n} sets of query-answer pairs to create an evaluation dataset based on the context below.
+Context: {context}
+
+Respond in JSON format on a single line, without whitespace, as an array of JSON objects with the keys `query` and `answer`.
+"""
+    for _ in range(3):
+        try:
+            responses = generate_chatgpt_output(prompt, openai_api_key=openai_api_key)
+            responses = json.loads(responses)
+            break
+        except Exception:
+            # Malformed JSON or API failure: try again, up to 3 attempts.
+            continue
+    else:
+        return EvaluationDataset(test_cases=[])
+
+    test_cases = []
+    for response in responses:
+        test_case = TestCase(
+            query=response["query"], expected_output=response["answer"]
+        )
+        test_cases.append(test_case)
+
+    dataset = EvaluationDataset(test_cases=test_cases)
+    return dataset
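
Taken together, the new helpers give a one-call path from a raw context string to an `EvaluationDataset`. A minimal usage sketch, where the context string, `n=2`, and the environment-variable lookup are illustrative rather than part of this diff:

```python
# Sketch: exercising the new create_evaluation_query_output_pairs helper.
# Assumes the `openai` package is installed and OPENAI_API_KEY is set.
import os

from deepeval.dataset import create_evaluation_query_output_pairs

dataset = create_evaluation_query_output_pairs(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    context="Python is a great language for mathematical expression and machine learning.",
    n=2,
)

# On success the model replied with a single-line JSON array such as
# [{"query": "...", "answer": "..."}], and each element became a TestCase.
print(len(dataset))  # 2 on success, 0 if all three parsing attempts failed
```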
32 changes: 22 additions & 10 deletions docs/docs/quickstart/synthetic-data-creation.md
@@ -1,29 +1,41 @@
-# Create Synthetic Data
+# Auto-Evaluation
 
-## Problem synthetic data creation solves
+## Introduction
 
-- When there isn't much data or any data to start with for evaluating langchain pipelines
-- When getting an eyeball check of current performance is done very quickly
+Auto-evaluation is useful:
 
-![Synthetic Queries](../../assets/synthetic-query-generation.png)
+- When there isn't much data, or any data at all, to start with for evaluating langchain pipelines
+- When you need a quick eyeball check of current performance
 
-Generating synthetic queries allows you to quickly evaluate the queries related to your prompts.
-We help developers get up and running with example queries from just raw text.
 
+We help developers get up and running with example queries from just raw text, using an OpenAI model to generate query-answer pairs from the text.
 
 ```python
-# Loads the synthetic query model to generate them based on data you get.
-# These automatically create synthetic queries and adds them to our online database
-from deepeval.dataset import create_evaluation_dataset_from_raw_text
+from deepeval.dataset import create_evaluation_query_output_pairs
+dataset = create_evaluation_query_output_pairs("Python is a great language for mathematical expression and machine learning.")
+```
 
+Once you have created your evaluation dataset, we recommend saving it.
+
-dataset = create_evaluation_dataset_from_raw_text("Python is a great language for mathematical expression and machine learning.")
+```python
+dataset.to_csv("sample.csv")
 ```
 
-## Running test cases.
+## Running tests/evaluation
 
-Once you have defined a number of test cases, you can easily run it in bulk if required.
 
+Once you have defined a number of test cases, you can easily run them in bulk if required.
+If there are errors, this function will raise them.
 
 ```python
 # test_bulk_runner.py
 
 def generate_llm_output(query: str) -> str:
     return "sample output"
 
+# Run an evaluation as you would any normal evaluation.
 dataset.run_evaluation(completion_fn=generate_llm_output)
 ```
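
The `generate_llm_output` stub in the doc above returns a canned string. To point `run_evaluation` at a real model instead, a sketch along these lines should work; the completion function is our own example, reusing the same pre-1.0 `openai.ChatCompletion` API this PR uses in `deepeval/dataset.py`:

```python
import os

import openai

openai.api_key = os.environ["OPENAI_API_KEY"]


def generate_llm_output(query: str) -> str:
    # Any callable mapping a query string to an answer string works here.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": query}],
    )
    return response.choices[0].message.content


dataset.run_evaluation(completion_fn=generate_llm_output)
```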
20 changes: 20 additions & 0 deletions tests/test_dataset.py
@@ -1,3 +1,7 @@
+import pytest
+import os
+
+
 def test_evaluation_dataset():
     from deepeval.dataset import EvaluationDataset
 
@@ -20,3 +24,19 @@ def test_evaluation_dataset():
         expected_output_column="expected_output",
         id_column="id",
     )
+    assert len(dataset) == 5
+
+
+@pytest.mark.skip(reason="OpenAI costs")
+def test_create_synthetic_dataset():
+    """
+    test for creating a synthetic dataset
+    """
+    from deepeval.dataset import create_evaluation_query_output_pairs
+
+    dataset = create_evaluation_query_output_pairs(
+        openai_api_key=os.environ["OPENAI_API_KEY"],
+        context="FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.",
+        n=1,
+    )
+    assert len(dataset) == 1
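
Since `@pytest.mark.skip` only attaches collection metadata, the skipped test's body can still be run directly when you are willing to pay for the API call. A sketch, assuming the repository root is on `sys.path` and `OPENAI_API_KEY` is exported:

```python
# Run the OpenAI-backed test outside pytest, bypassing the skip marker.
from tests.test_dataset import test_create_synthetic_dataset

test_create_synthetic_dataset()  # hits the OpenAI API; requires OPENAI_API_KEY
```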