From 94448333e48da348bfb1490a864571abf3782be4 Mon Sep 17 00:00:00 2001
From: Adatepe <adatepe.alperen.lmu@gmail.com>
Date: Wed, 10 Jul 2024 17:20:53 +0200
Subject: [PATCH] feat: added 2nd function, visualization generating

---
 .gitignore                                    |   4 +-
 NAMESPACE                                     |   3 +-
 R/create_analysis.R                           |   4 +-
 R/generate_visualization.R                    | 150 ++++++++++++++++++
 README.md                                     |  46 ++++--
 ...Scientist_generate_scientific_analysis.Rd} |   8 +-
 ...nAIScientist_generate_visualization_rmd.Rd |  40 +++++
 7 files changed, 238 insertions(+), 17 deletions(-)
 create mode 100644 R/generate_visualization.R
 rename man/{generate_scientific_analysis.Rd => openAIScientist_generate_scientific_analysis.Rd} (78%)
 create mode 100644 man/openAIScientist_generate_visualization_rmd.Rd

diff --git a/.gitignore b/.gitignore
index 5179dbb..6167016 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,4 +18,6 @@ session-*
 
 # R build ignore file
 .Rbuildignore
-.Rhistory
\ No newline at end of file
+.Rhistory
+.RData
+TEST-CMD.R
\ No newline at end of file
diff --git a/NAMESPACE b/NAMESPACE
index 8c9616b..aab3bdc 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
-export(generate_scientific_analysis)
+export(openAIScientist_generate_scientific_analysis)
+export(openAIScientist_generate_visualization_rmd)
 importFrom(httr,POST)
 importFrom(httr,add_headers)
 importFrom(httr,content)
diff --git a/R/create_analysis.R b/R/create_analysis.R
index 81e5b5d..fa25e9d 100644
--- a/R/create_analysis.R
+++ b/R/create_analysis.R
@@ -126,12 +126,12 @@ clean_up_table <- function(md_content) {
 #'   outcome = sample(c(0, 1), 100, replace = TRUE)
 #' )
 #' api_key <- "your_openai_api_key"
-#' generate_scientific_analysis(data, api_key, "Analysis")
+#' openAIScientist_generate_scientific_analysis(data, api_key, "Analysis")
 #' }
 #' @importFrom httr POST add_headers content
 #' @importFrom utils capture.output
 #' @export
-generate_scientific_analysis <- function(data, api_key, output_name = "Analysis", additional_prompt = "") {
+openAIScientist_generate_scientific_analysis <- function(data, api_key, output_name = "Analysis", additional_prompt = "") {
   
   cat("Generating data summary...\n")
   
diff --git a/R/generate_visualization.R b/R/generate_visualization.R
new file mode 100644
index 0000000..8477a61
--- /dev/null
+++ b/R/generate_visualization.R
@@ -0,0 +1,150 @@
+#' Generate a Visualization Recommendation and ggplot Code in RMarkdown
+#'
+#' This function generates a visualization recommendation and the corresponding ggplot code
+#' for the given dataset using OpenAI's API. The generated R code with the ggplot is saved in a folder as an .Rmd file.
+#'
+#' @param data A data frame containing the dataset to analyze.
+#' @param api_key Your OpenAI API key as a string.
+#' @param output_name The name of the output file for the R code with ggplot.
+#' @param additional_prompt Additional instructions for the OpenAI API.
+#' @return The generated RMarkdown content as a string, or NULL if the API response contains no choices.
+#' @examples
+#' \dontrun{
+#' data <- data.frame(
+#'   var1 = rnorm(100),
+#'   var2 = rnorm(100),
+#'   outcome = sample(c(0, 1), 100, replace = TRUE)
+#' )
+#' api_key <- "your_openai_api_key"
+#' openAIScientist_generate_visualization_rmd(data, api_key, "Visualization")
+#' }
+#' @importFrom httr POST add_headers content
+#' @importFrom utils capture.output
+#' @export
+openAIScientist_generate_visualization_rmd <- function(data, api_key, output_name = "Visualization", additional_prompt = "") {
+  
+  cat("Generating data summary...\n")
+  
+  # Create a data summary
+  data_summary <- summary(data)
+  data_description <- paste(capture.output(data_summary), collapse = "\n")
+  
+  # Require a single non-empty string; a bare `== ""` fails with an opaque error on NULL/NA
+  if (!is.character(api_key) || length(api_key) != 1L || is.na(api_key) || !nzchar(api_key)) {
+    stop("API key not found. Please provide a valid OpenAI API key.")
+  }
+  
+  # Clean up any problematic characters in the data description
+  data_description <- gsub("[`*]", "", data_description)
+  
+  # Construct the prompt
+  prompt <- paste(
+    "You are provided with the following dataset summary: We are working in R \n\n", 
+    data_description,
+    "\n\nYour tasks are enlisted below finish all of them one after another \n",
+    "- Explain what you are doing ",
+    "- Write your R code in Codeblocks with ```r ",
+    "- The data is created in the variable `dataset`. Reference it in your code. THIS IS VERY important. DO NOT OVERWRITE THE `dataset` variable (1:1).",
+    "- The data is created in the variable `dataset`. Reference it in your code. THIS IS VERY important. DO NOT OVERWRITE THE `dataset` variable (1:1).",
+    "- Analyze the data you and write a explenation about it ",
+    "- Now create plots with ggplot2 fitting to the analysiation you did before. Describe the plot and why you chose it.\n",
+    "- Structure your Response well with # Headers \n",
+    "- Write an if statement at the beginning to check if all the needed library is already installed and if not, install it.\n",
+    "- Always use the variable names, never use abbreviations.",
+    "- Try to ALWAYS cover all variables and correlations in a plot",
+    "- For every Plot write a Description and explenation on why this plot fits to the data",
+    "- Write an analysis of the data at the beginning",
+    "- Do not use grid.arrange",
+    "- Follow best practices while using colors for data visualization:\n",
+    "  * Use Qualitative palettes for categorical data.\n",
+    "  * Use Sequential palettes for numerical data with order.\n",
+    "  * Use Diverging palettes for numerical data with a meaningful midpoint.\n",
+    "  * Leverage the meaningfulness of color.\n",
+    "  * Avoid unnecessary usage of color.\n",
+    "  * Be consistent with color across charts.\n",
+    "  * Try to not use bright neon colors\n",
+    "Think about using: scatter plots, line charts, box plots, heatmaps, bar charts, pie charts, histograms, area charts or barplots depending on the best usecase",
+    "Every attribute that can be plotted should be plotted",
+    
+    additional_prompt,
+    "The data is created in the variable `dataset`. Reference it in your code. THIS IS VERY important. DO NOT OVERWRITE THE `dataset` variable (1:1).",
+    "Reference the dataset like this:\n",
+    "data <- dataset"
+  )
+  
+  cat("Sending request to OpenAI API (this might take a while)...\n")
+  
+  response <- POST(
+    url = "https://api.openai.com/v1/chat/completions",
+    add_headers(Authorization = paste("Bearer", api_key), 'Content-Type' = 'application/json'),
+    body = list(
+      model = "gpt-4o",
+      messages = list(list(role = "user", content = prompt))
+    ),
+    encode = "json"
+  )
+  
+  content <- content(response, "parsed")
+  
+  if (!is.null(content$choices)) {
+    r_code <- content$choices[[1]]$message$content
+    
+    # Turn ```r / ```R fences into knitr chunks; \b leaves other languages (e.g. ```ruby) untouched
+    rmd_code <- gsub("```[rR]\\b", "```{r, message=FALSE}", r_code, perl = TRUE)
+    
+    # Append a newline after every chunk; (?s) lets `.` span lines so multi-line chunks match too
+    rmd_code <- gsub("(?s)(```\\{r, message=FALSE\\}.*?```)", "\\1\n", rmd_code, perl = TRUE)
+    
+    # Create a folder for the output RMarkdown file
+    time_stamp <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S")
+    folder_name <- paste0(output_name, "_", time_stamp)
+    dir.create(folder_name)
+    
+    file_path <- file.path(folder_name, paste0(output_name, ".Rmd"))
+    
+    # Write the header and dataset to the RMarkdown file
+    rmd_header <- c(
+      "---",
+      "output:",
+      "  html_document:",
+      "    code_folding: hide",
+      "---",
+      "",
+      "# Dataset",
+      "```{r, message=FALSE}",
+      "dataset <- ", deparse(data),
+      "```",
+      ""
+    )
+    
+    # Save the RMarkdown code
+    writeLines(c(rmd_header, rmd_code), file_path)
+    cat(paste("RMarkdown file for visualization saved in:", file_path, "\n"))
+    
+    # Append additional text at the end of the file
+    additional_text <- "\n\n\n---\n\n\n\n\nThis analysis was created with [openAIScientist](https://github.com/noluyorAbi/openAIScientist).\n\n Made with \u2665 by [noluyorAbi](https://github.com/noluyorAbi) for FortStaSoft @ LMU Munich"
+    cat(additional_text, file = file_path, append = TRUE)
+    
+    # Token usage and cost calculation
+    usage <- content$usage
+    if (!is.null(usage)) {
+      total_tokens <- usage$total_tokens
+      total_input_tokens <- usage$prompt_tokens
+      total_output_tokens <- usage$completion_tokens
+      
+      price_per_input_token <- 5.00 / 1e6  # $5 per 1M input tokens
+      price_per_output_token <- 15.00 / 1e6  # $15 per 1M output tokens
+      
+      total_cost <- (total_input_tokens * price_per_input_token) + (total_output_tokens * price_per_output_token)
+      cat("Initial call - Total tokens used:", total_tokens, "\n")
+      cat("Initial call - Total cost (USD):", total_cost, "\n")
+    } else {
+      cat("Initial call - Token usage information not available.\n")
+    }
+    
+    return(rmd_code)
+  } else {
+    cat("Failed to generate visualization. No content returned from OpenAI API.\n")
+    return(NULL)
+  }
+}
diff --git a/README.md b/README.md
index 807d8e2..1dcddb1 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,20 @@
   <img src="https://github.com/noluyorAbi/openAIScientist/actions/workflows/R-CMD-check.yml/badge.svg" style="margin-bottom: 10px;" />
 </div>
 
+## Table of Contents
+
+- [Installation](#installation-)
+- [Other Dependencies](#other-dependencies-)
+- [Usage](#usage-)
+  - [Function Arguments](#function-arguments-)
+  - [Example](#example-)
+- [Setting Up Your API Key](#setting-up-your-api-key-)
+  - [Step-by-Step Instructions](#step-by-step-instructions-)
+  - [Note on API Key Usage](#note-on-api-key-usage-)
+- [Documentation](#documentation-)
+- [Disclaimer](#disclaimer-)
+- [Contributing](#contributing-)
+- [License](#license-)
 
 ## Installation 💻
 
@@ -36,15 +50,24 @@ To use the `openAIScientist` package, follow these steps:
 
 1. Load the package.
 2. Load your API key from `.Renviron`.
-3. Use the `generate_scientific_analysis` function to generate the analysis.
+3. Use the `openAIScientist_generate_scientific_analysis` or `openAIScientist_generate_visualization_rmd` function to generate the analysis.
 
 ### Function Arguments 📊
 
+#### `openAIScientist_generate_scientific_analysis`
+
 - `data` (mandatory): A data frame containing the dataset to analyze.
 - `api_key` (mandatory): Your OpenAI API key as a string.
 - `output_name` (optional): The name of the output markdown file (default is "Analysis").
 - `additional_prompt` (optional): Additional instructions for the OpenAI API.
 
+#### `openAIScientist_generate_visualization_rmd`
+
+- `data` (mandatory): A data frame containing the dataset to analyze.
+- `api_key` (mandatory): Your OpenAI API key as a string.
+- `output_name` (optional): The name of the output RMarkdown file (default is "Visualization").
+- `additional_prompt` (optional): Additional instructions for the OpenAI API.
+
 ### Example 📝
 
 ```r
@@ -65,13 +88,17 @@ data <- data.frame(
 api_key <- Sys.getenv("OPENAI_API_KEY")
 
 # Generate scientific analysis
-analysis <- generate_scientific_analysis(data, api_key, "Analysis")
+analysis <- openAIScientist_generate_scientific_analysis(data, api_key, "Analysis")
+
+# Generate scientific analysis with additional prompt
+analysis <- openAIScientist_generate_scientific_analysis(data, api_key, "Analysis-ADDITIONAL-PROMPT","Write the analysis in German")
+
+# Generate visualization RMarkdown
+visualization <- openAIScientist_generate_visualization_rmd(data, api_key, "Visualization")
 
-# Additional Prompt for custom uses
-# analysis <- generate_scientific_analysis(data, api_key, "Analysis", "Write the Analysis in German")
+# Generate visualization RMarkdown with additional prompt
+visualization <- openAIScientist_generate_visualization_rmd(data, api_key, "Visualization-ADDITIONAL-PROMPT", "make the visualizations for red-green colorblind")
 
-# Print the analysis in the console if wanted
-# cat(analysis)
 ```
 
 ## Setting Up Your API Key 🔑
@@ -114,7 +141,7 @@ While you can directly paste your API key as an argument in the `generate_scient
 
 ```r
 # Directly pasting the API key as an argument (not recommended)
-analysis <- generate_scientific_analysis(data, "your_openai_api_key", "Analysis")
+analysis <- openAIScientist_generate_scientific_analysis(data, "your_openai_api_key", "Analysis")
 ```
 
 Using environment variables as demonstrated in the previous examples is the recommended approach.
@@ -124,12 +151,13 @@ Using environment variables as demonstrated in the previous examples is the reco
 For detailed documentation, please refer to the function documentation generated by Roxygen2. You can access the documentation within R:
 
 ```r
-?generate_scientific_analysis
+?openAIScientist_generate_scientific_analysis
+?openAIScientist_generate_visualization_rmd
 ```
 
 ## Disclaimer ⚠️
 
-The analysis is created with GPT-4o, a very powerful and fast AI. However, there can still be inaccuracies and formatting issues as AIs can be unpredictable sometimes. For formatting issues, try reanalyzing the dataset.
+The analysis is created with GPT-4o, a very powerful and fast AI. However, there can still be inaccuracies and formatting issues as AIs can be unpredictable sometimes. For formatting issues, try reanalyzing the dataset.
 
 ## Contributing 🤝
 
diff --git a/man/generate_scientific_analysis.Rd b/man/openAIScientist_generate_scientific_analysis.Rd
similarity index 78%
rename from man/generate_scientific_analysis.Rd
rename to man/openAIScientist_generate_scientific_analysis.Rd
index 71474e5..bf3eee9 100644
--- a/man/generate_scientific_analysis.Rd
+++ b/man/openAIScientist_generate_scientific_analysis.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/create_analysis.R
-\name{generate_scientific_analysis}
-\alias{generate_scientific_analysis}
+\name{openAIScientist_generate_scientific_analysis}
+\alias{openAIScientist_generate_scientific_analysis}
 \title{Generate a Comprehensive Scientific Analysis}
 \usage{
-generate_scientific_analysis(
+openAIScientist_generate_scientific_analysis(
   data,
   api_key,
   output_name = "Analysis",
@@ -34,6 +34,6 @@ data <- data.frame(
   outcome = sample(c(0, 1), 100, replace = TRUE)
 )
 api_key <- "your_openai_api_key"
-generate_scientific_analysis(data, api_key, "Analysis")
+openAIScientist_generate_scientific_analysis(data, api_key, "Analysis")
 }
 }
diff --git a/man/openAIScientist_generate_visualization_rmd.Rd b/man/openAIScientist_generate_visualization_rmd.Rd
new file mode 100644
index 0000000..fa78348
--- /dev/null
+++ b/man/openAIScientist_generate_visualization_rmd.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/generate_visualization.R
+\name{openAIScientist_generate_visualization_rmd}
+\alias{openAIScientist_generate_visualization_rmd}
+\title{Generate a Visualization Recommendation and ggplot Code in RMarkdown}
+\usage{
+openAIScientist_generate_visualization_rmd(
+  data,
+  api_key,
+  output_name = "Visualization",
+  additional_prompt = ""
+)
+}
+\arguments{
+\item{data}{A data frame containing the dataset to analyze.}
+
+\item{api_key}{Your OpenAI API key as a string.}
+
+\item{output_name}{The name of the output file for the R code with ggplot.}
+
+\item{additional_prompt}{Additional instructions for the OpenAI API.}
+}
+\value{
+The generated RMarkdown content as a string, or NULL if the API response contains no choices.
+}
+\description{
+This function generates a visualization recommendation and the corresponding ggplot code
+for the given dataset using OpenAI's API. The generated R code with the ggplot is saved in a folder as an .Rmd file.
+}
+\examples{
+\dontrun{
+data <- data.frame(
+  var1 = rnorm(100),
+  var2 = rnorm(100),
+  outcome = sample(c(0, 1), 100, replace = TRUE)
+)
+api_key <- "your_openai_api_key"
+openAIScientist_generate_visualization_rmd(data, api_key, "Visualization")
+}
+}