From 94448333e48da348bfb1490a864571abf3782be4 Mon Sep 17 00:00:00 2001 From: Adatepe Date: Wed, 10 Jul 2024 17:20:53 +0200 Subject: [PATCH] feat: added 2nd function, visualization generating --- .gitignore | 4 +- NAMESPACE | 3 +- R/create_analysis.R | 4 +- R/generate_visualization.R | 150 ++++++++++++++++++ README.md | 46 ++++-- ...Scientist_generate_scientific_analysis.Rd} | 8 +- ...nAIScientist_generate_visualization_rmd.Rd | 40 +++++ 7 files changed, 238 insertions(+), 17 deletions(-) create mode 100644 R/generate_visualization.R rename man/{generate_scientific_analysis.Rd => openAIScientist_generate_scientific_analysis.Rd} (78%) create mode 100644 man/openAIScientist_generate_visualization_rmd.Rd diff --git a/.gitignore b/.gitignore index 5179dbb..6167016 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,6 @@ session-* # R build ignore file .Rbuildignore -.Rhistory \ No newline at end of file +.Rhistory +.RData +TEST-CMD.R \ No newline at end of file diff --git a/NAMESPACE b/NAMESPACE index 8c9616b..aab3bdc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand -export(generate_scientific_analysis) +export(openAIScientist_generate_scientific_analysis) +export(openAIScientist_generate_visualization_rmd) importFrom(httr,POST) importFrom(httr,add_headers) importFrom(httr,content) diff --git a/R/create_analysis.R b/R/create_analysis.R index 81e5b5d..fa25e9d 100644 --- a/R/create_analysis.R +++ b/R/create_analysis.R @@ -126,12 +126,12 @@ clean_up_table <- function(md_content) { #' outcome = sample(c(0, 1), 100, replace = TRUE) #' ) #' api_key <- "your_openai_api_key" -#' generate_scientific_analysis(data, api_key, "Analysis") +#' openAIScientist_generate_scientific_analysis(data, api_key, "Analysis") #' } #' @importFrom httr POST add_headers content #' @importFrom utils capture.output #' @export -generate_scientific_analysis <- function(data, api_key, output_name = "Analysis", additional_prompt = "") { 
+openAIScientist_generate_scientific_analysis <- function(data, api_key, output_name = "Analysis", additional_prompt = "") { cat("Generating data summary...\n")
diff --git a/R/generate_visualization.R b/R/generate_visualization.R
new file mode 100644
index 0000000..8477a61
--- /dev/null
+++ b/R/generate_visualization.R
@@ -0,0 +1,170 @@
+#' Generate a Visualization Recommendation and ggplot Code in RMarkdown
+#'
+#' This function generates a visualization recommendation and the corresponding ggplot code
+#' for the given dataset using OpenAI's API. The generated R code with the ggplot is saved in a folder as an .Rmd file.
+#'
+#' Only the printed output of \code{summary(data)} is sent to the API. The .Rmd file is
+#' written to a new folder named after \code{output_name} plus a timestamp and starts with a
+#' chunk that recreates \code{data} as \code{dataset}. Token usage and an estimated cost are
+#' printed to the console.
+#'
+#' @param data A data frame containing the dataset to analyze.
+#' @param api_key Your OpenAI API key as a string.
+#' @param output_name The name of the output file for the R code with ggplot.
+#' @param additional_prompt Additional instructions for the OpenAI API.
+#' @return The generated RMarkdown text as a string, or \code{NULL} if the API response
+#'   contained no completion text.
+#' @examples
+#' \dontrun{
+#' data <- data.frame(
+#'   var1 = rnorm(100),
+#'   var2 = rnorm(100),
+#'   outcome = sample(c(0, 1), 100, replace = TRUE)
+#' )
+#' api_key <- "your_openai_api_key"
+#' openAIScientist_generate_visualization_rmd(data, api_key, "Visualization")
+#' }
+#' @importFrom httr POST add_headers content
+#' @importFrom utils capture.output
+#' @export
+openAIScientist_generate_visualization_rmd <- function(data, api_key, output_name = "Visualization", additional_prompt = "") {
+
+  # Check the API key before doing any work. The type, length and NA checks make
+  # a NULL or NA key produce this message instead of an error from `if` itself.
+  if (!is.character(api_key) || length(api_key) != 1L || is.na(api_key) || !nzchar(api_key)) {
+    stop("API key not found. Please provide a valid OpenAI API key.")
+  }
+
+  cat("Generating data summary...\n")
+
+  # Create a data summary
+  data_summary <- summary(data)
+  data_description <- paste(capture.output(data_summary), collapse = "\n")
+
+  # Clean up any problematic characters in the data description
+  data_description <- gsub("[`*]", "", data_description)
+
+  # Construct the prompt (paste() joins the pieces with single spaces)
+  prompt <- paste(
+    "You are provided with the following dataset summary: We are working in R \n\n",
+    data_description,
+    "\n\nYour tasks are enlisted below finish all of them one after another \n",
+    "- Explain what you are doing ",
+    "- Write your R code in Codeblocks with ```r ",
+    "- The data is created in the variable `dataset`. Reference it in your code. THIS IS VERY important. DO NOT OVERWRITE THE `dataset` variable (1:1).",
+    "- The data is created in the variable `dataset`. Reference it in your code. THIS IS VERY important. DO NOT OVERWRITE THE `dataset` variable (1:1).",
+    "- Analyze the data you and write a explenation about it ",
+    "- Now create plots with ggplot2 fitting to the analysiation you did before. Describe the plot and why you chose it.\n",
+    "- Structure your Response well with # Headers \n",
+    "- Write an if statement at the beginning to check if all the needed library is already installed and if not, install it.\n",
+    "- Always use the variable names, never use abbreviations.",
+    "- Try to ALWAYS cover all variables and correlations in a plot",
+    "- For every Plot write a Description and explenation on why this plot fits to the data",
+    "- Write an analysis of the data at the beginning",
+    "- Do not use grid.arrange",
+    "- Follow best practices while using colors for data visualization:\n",
+    " * Use Qualitative palettes for categorical data.\n",
+    " * Use Sequential palettes for numerical data with order.\n",
+    " * Use Diverging palettes for numerical data with a meaningful midpoint.\n",
+    " * Leverage the meaningfulness of color.\n",
+    " * Avoid unnecessary usage of color.\n",
+    " * Be consistent with color across charts.\n",
+    " * Try to not use bright neon colors\n",
+    "Think about using: scatter plots, line charts, box plots, heatmaps, bar charts, pie charts, histograms, area charts or barplots depending on the best usecase",
+    "Every attribute that can be plotted should be plotted",
+
+    additional_prompt,
+    "The data is created in the variable `dataset`. Reference it in your code. THIS IS VERY important. DO NOT OVERWRITE THE `dataset` variable (1:1).",
+    "Reference the dataset like this:\n",
+    "data <- dataset"
+  )
+
+  cat("Sending request to OpenAI API (this might take a while)...\n")
+
+  response <- POST(
+    url = "https://api.openai.com/v1/chat/completions",
+    add_headers(Authorization = paste("Bearer", api_key), "Content-Type" = "application/json"),
+    body = list(
+      model = "gpt-4o",
+      messages = list(list(role = "user", content = prompt))
+    ),
+    encode = "json"
+  )
+
+  # Kept in `parsed` rather than `content` so the httr function is not shadowed
+  parsed <- content(response, "parsed")
+
+  # Pull out the completion text; anything else is treated as a failed request
+  r_code <- NULL
+  if (is.list(parsed) && length(parsed$choices) > 0) {
+    r_code <- parsed$choices[[1]]$message$content
+  }
+
+  if (!is.character(r_code) || length(r_code) != 1L || is.na(r_code)) {
+    cat("Failed to generate visualization. No content returned from OpenAI API.\n")
+    # Show the reason when the response body carries an error message
+    if (is.list(parsed) && is.list(parsed$error) && is.character(parsed$error$message)) {
+      cat("OpenAI API error:", parsed$error$message, "\n")
+    }
+    return(NULL)
+  }
+
+  # Replace ```r and ```R with ```{r, message=FALSE}. The \b keeps fences of other
+  # languages that merely start with "r" (such as ```ruby) from being rewritten.
+  rmd_code <- gsub("```[rR]\\b", "```{r, message=FALSE}", r_code, perl = TRUE)
+
+  # Ensure there is a blank line after each code block. (?s) lets `.` span the
+  # lines of a chunk, and the newline is appended as a real "\n" because an
+  # escaped "\\n" in a gsub() replacement is emitted as the letter "n".
+  rmd_code <- gsub("(?s)(```\\{r, message=FALSE\\}\\n.*?(?<=\\n)```)", "\\1\n", rmd_code, perl = TRUE)
+
+  # Create a folder for the output RMarkdown file
+  time_stamp <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S")
+  folder_name <- paste0(output_name, "_", time_stamp)
+  dir.create(folder_name)
+
+  file_path <- file.path(folder_name, paste0(output_name, ".Rmd"))
+
+  # Write the header and dataset to the RMarkdown file. YAML nesting depends on
+  # indentation: code_folding has to sit one level below html_document.
+  rmd_header <- c(
+    "---",
+    "output:",
+    "  html_document:",
+    "    code_folding: hide",
+    "---",
+    "",
+    "# Dataset",
+    "```{r, message=FALSE}",
+    "dataset <- ", deparse(data),
+    "```",
+    ""
+  )
+
+  # Save the RMarkdown code
+  writeLines(c(rmd_header, rmd_code), file_path)
+  cat(paste("RMarkdown file for visualization saved in:", file_path, "\n"))
+
+  # Append additional text at the end of the file
+  additional_text <- "\n\n\n---\n\n\n\n\nThis analysis was created with [openAIScientist](https://github.com/noluyorAbi/openaAIScientist).\n\n Made with \u2665 by [noluyorAbi](https://github.com/noluyorAbi) for FortStaSoft @ LMU Munich"
+  cat(additional_text, file = file_path, append = TRUE)
+
+  # Token usage and cost calculation
+  usage <- parsed$usage
+  if (!is.null(usage)) {
+    total_tokens <- usage$total_tokens
+    total_input_tokens <- usage$prompt_tokens
+    total_output_tokens <- usage$completion_tokens
+
+    price_per_input_token <- 5.00 / 1e6 # $5 per 1M input tokens
+    price_per_output_token <- 15.00 / 1e6 # $15 per 1M output tokens
+
+    total_cost <- (total_input_tokens * price_per_input_token) + (total_output_tokens * price_per_output_token)
+    cat("Initial call - Total tokens used:", total_tokens, "\n")
+    cat("Initial call - Total cost (USD):", total_cost, "\n")
+  } else {
+    cat("Initial call - Token usage information not available.\n")
+  }
+
+  rmd_code
+}
diff --git a/README.md b/README.md index 807d8e2..1dcddb1 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,20 @@ +## Table of Contents + +- [Installation](#installation-) +- [Other Dependencies](#other-dependencies-) +- [Usage](#usage-) + - [Function Arguments](#function-arguments-) + - [Example](#example-) +- [Setting Up Your API Key](#setting-up-your-api-key-) + - [Step-by-Step Instructions](#step-by-step-instructions-) + - [Note on API Key Usage](#note-on-api-key-usage-) +- [Documentation](#documentation-) +- [Disclaimer](#disclaimer-) +- [Contributing](#contributing-) +- [License](#license-) ## Installation 💻 @@ -36,15 +50,24 @@ To use the `openAIScientist` package, follow these steps: 1. Load the package. 2. Load your API key from `.Renviron`. -3. Use the `generate_scientific_analysis` function to generate the analysis. +3.
Use the `openAIScientist_generate_scientific_analysis` or `openAIScientist_generate_visualization_rmd` function to generate the analysis. ### Function Arguments 📊 +#### `openAIScientist_generate_scientific_analysis` + - `data` (mandatory): A data frame containing the dataset to analyze. - `api_key` (mandatory): Your OpenAI API key as a string. - `output_name` (optional): The name of the output markdown file (default is "Analysis"). - `additional_prompt` (optional): Additional instructions for the OpenAI API. +#### `openAIScientist_generate_visualization_rmd` + +- `data` (mandatory): A data frame containing the dataset to analyze. +- `api_key` (mandatory): Your OpenAI API key as a string. +- `output_name` (optional): The name of the output RMarkdown file (default is "Visualization"). +- `additional_prompt` (optional): Additional instructions for the OpenAI API. + ### Example 📝 ```r @@ -65,13 +88,17 @@ data <- data.frame( api_key <- Sys.getenv("OPENAI_API_KEY") # Generate scientific analysis -analysis <- generate_scientific_analysis(data, api_key, "Analysis") +analysis <- openAIScientist_generate_scientific_analysis(data, api_key, "Analysis") + +# Generate scientific analysis with additional prompt +analysis <- openAIScientist_generate_scientific_analysis(data, api_key, "Analysis-ADDITIONAL-PROMPT","Write the analysis in German") + +# Generate visualization RMarkdown +visualization <- openAIScientist_generate_visualization_rmd(data, api_key, "Visualization") -# Additional Prompt for custom uses -# analysis <- generate_scientific_analysis(data, api_key, "Analysis", "Write the Analysis in German") +# Generate visualization RMarkdown with additional prompt +visualization <- openAIScientist_generate_visualization_rmd(data, api_key, "Visualization-ADDITIONAL-PROMPT", "make the visualizations for red-green colorblind") -# Print the analysis in the console if wanted -# cat(analysis) ``` ## Setting Up Your API Key 🔑 @@ -114,7 +141,7 @@ While you can directly paste your API 
key as an argument in the `generate_scient ```r # Directly pasting the API key as an argument (not recommended) -analysis <- generate_scientific_analysis(data, "your_openai_api_key", "Analysis") +analysis <- openAIScientist_generate_scientific_analysis(data, "your_openai_api_key", "Analysis") ``` Using environment variables as demonstrated in the previous examples is the recommended approach. @@ -124,12 +151,13 @@ Using environment variables as demonstrated in the previous examples is the reco For detailed documentation, please refer to the function documentation generated by Roxygen2. You can access the documentation within R: ```r -?generate_scientific_analysis +?openAIScientist_generate_scientific_analysis +?openAIScientist_generate_visualization_rmd ``` ## Disclaimer ⚠️ -The analysis is created with GPT-4o, a very powerful and fast AI. However, there can still be inaccuracies and formatting issues as AIs can be unpredictable sometimes. For formatting issues, try reanalyzing the dataset. +The analysis is created with GPT-4o, a very powerful and fast AI. However, there can still be inaccuracies and formatting issues as AIs can be unpredictable sometimes. For formatting issues, try reanalyzing the dataset. 
## Contributing 🤝 diff --git a/man/generate_scientific_analysis.Rd b/man/openAIScientist_generate_scientific_analysis.Rd similarity index 78% rename from man/generate_scientific_analysis.Rd rename to man/openAIScientist_generate_scientific_analysis.Rd index 71474e5..bf3eee9 100644 --- a/man/generate_scientific_analysis.Rd +++ b/man/openAIScientist_generate_scientific_analysis.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/create_analysis.R -\name{generate_scientific_analysis} -\alias{generate_scientific_analysis} +\name{openAIScientist_generate_scientific_analysis} +\alias{openAIScientist_generate_scientific_analysis} \title{Generate a Comprehensive Scientific Analysis} \usage{ -generate_scientific_analysis( +openAIScientist_generate_scientific_analysis( data, api_key, output_name = "Analysis", @@ -34,6 +34,6 @@ data <- data.frame( outcome = sample(c(0, 1), 100, replace = TRUE) ) api_key <- "your_openai_api_key" -generate_scientific_analysis(data, api_key, "Analysis") +openAIScientist_generate_scientific_analysis(data, api_key, "Analysis") } } diff --git a/man/openAIScientist_generate_visualization_rmd.Rd b/man/openAIScientist_generate_visualization_rmd.Rd new file mode 100644 index 0000000..fa78348 --- /dev/null +++ b/man/openAIScientist_generate_visualization_rmd.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/generate_visualization.R +\name{openAIScientist_generate_visualization_rmd} +\alias{openAIScientist_generate_visualization_rmd} +\title{Generate a Visualization Recommendation and ggplot Code in RMarkdown} +\usage{ +openAIScientist_generate_visualization_rmd( + data, + api_key, + output_name = "Visualization", + additional_prompt = "" +) +} +\arguments{ +\item{data}{A data frame containing the dataset to analyze.} + +\item{api_key}{Your OpenAI API key as a string.} + +\item{output_name}{The name of the output file for the R code with ggplot.} + 
+\item{additional_prompt}{Additional instructions for the OpenAI API.} +} +\value{ +The generated R code as a string. +} +\description{ +This function generates a visualization recommendation and the corresponding ggplot code +for the given dataset using OpenAI's API. The generated R code with the ggplot is saved in a folder as an .Rmd file. +} +\examples{ +\dontrun{ +data <- data.frame( + var1 = rnorm(100), + var2 = rnorm(100), + outcome = sample(c(0, 1), 100, replace = TRUE) +) +api_key <- "your_openai_api_key" +openAIScientist_generate_visualization_rmd(data, api_key, "Visualization") +} +}