diff --git a/examples/open_deep_research/analysis.ipynb b/examples/open_deep_research/analysis.ipynb
new file mode 100644
index 000000000..73b63dc2a
--- /dev/null
+++ b/examples/open_deep_research/analysis.ipynb
@@ -0,0 +1,10552 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !pip install plotly kaleido datasets nbformat -U -q"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/aymeric/venv/gaia/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n",
+ "Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "\n",
+ "import datasets\n",
+ "import pandas as pd\n",
+ "from dotenv import load_dotenv\n",
+ "from huggingface_hub import login\n",
+ "\n",
+ "\n",
+ "load_dotenv(override=True)\n",
+ "login(os.getenv(\"HF_TOKEN\"))\n",
+ "\n",
+ "pd.set_option(\"max_colwidth\", None)\n",
+ "\n",
+ "OUTPUT_DIR = \"output\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "eval_ds = datasets.load_dataset(\"gaia-benchmark/GAIA\", \"2023_all\")[\"validation\"]\n",
+ "eval_ds = eval_ds.rename_columns({\"Question\": \"question\", \"Final answer\": \"true_answer\", \"Level\": \"task\"})\n",
+ "eval_df = pd.DataFrame(eval_ds)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2 86\n",
+ "1 53\n",
+ "3 26\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.Series(eval_ds[\"task\"]).value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 1. Load all results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import glob\n",
+ "\n",
+ "\n",
+ "results = []\n",
+ "for f in glob.glob(f\"{OUTPUT_DIR}/validation/*.jsonl\"):\n",
+ " df = pd.read_json(f, lines=True)\n",
+ " df[\"agent_name\"] = f.split(\"/\")[-1].split(\".\")[0]\n",
+ " results.append(df)\n",
+ "\n",
+ "result_df = pd.concat(results)\n",
+ "result_df = result_df.drop(columns=[\"start_time\", \"end_time\"])\n",
+ "result_df[\"prediction\"] = result_df[\"prediction\"].fillna(\"No prediction\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/aymeric/Documents/Code/smolagents/examples/open_deep_research/scripts/gaia_scorer.py:52: UserWarning: Answer lists have different lengths, returning False.\n",
+ " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String 2 High fantasy A Song of Ice and Fire cannot be normalized to number str.\n",
+ "String cannot be normalized to number str.\n",
+ "String 94 CFM for Cheater cannot be normalized to number str.\n",
+ "String 93 CFM for Cheater beater cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String 3 or 4 cannot be normalized to number str.\n",
+ "String No year cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String 250 for Cheater cannot be normalized to number str.\n",
+ "String 220 for Cheater beater cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String CFM number for Cheater: not listed cannot be normalized to number str.\n",
+ "String CFM number for Cheater beater: 665 ft/min cannot be normalized to number str.\n",
+ "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String cannot be normalized to number str.\n",
+ "String August 1: 0 August 2: 0 August 3: 0 August 4: 0 August 5: 0 August 6: 0 August 7: 0 August 8: 0 August 9: 0 August 10: 0 August 11: 0 August 12: 0 August 13: 0 August 14: 0 August 15: 0 August 16: 0 August 17: 0 August 18: 0 August 19: 0 August 20: 0 August 21: 0 August 22: 0 August 23: 0 August 24: 0 August 25: 0 August 26: 0 August 27: 0 August 28: 0 August 29: 0 August 30: 0 August 31: 0 cannot be normalized to number str.\n",
+ "String cannot be normalized to number str.\n",
+ "String cannot be normalized to number str.\n",
+ "String cannot be normalized to number str.\n",
+ "String cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String 120.28 for Cheater cannot be normalized to number str.\n",
+ "String 119.04 for Cheater beater cannot be normalized to number str.\n",
+ "String 3 or 4 cannot be normalized to number str.\n",
+ "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String cannot be normalized to number str.\n",
+ "String 2017 Komo Mai Drive sold for 900000 cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String 2730-2740 cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String 89706.00 USD cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String No prediction cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String 6 The Lord of the Rings (book) J. R. R. Tolkien Author American literature Fantasy literature Publishers A Song of Ice and Fire cannot be normalized to number str.\n",
+ "String cannot be normalized to number str.\n",
+ "String cannot be normalized to number str.\n",
+ "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String 1.46 Å cannot be normalized to number str.\n",
+ "String cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String 94.5 for Cheater cannot be normalized to number str.\n",
+ "String 93.5 for Cheater beater cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String 776 for Cheater cannot be normalized to number str.\n",
+ "String Not specified for Cheater Beater cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String 5.75 for Cheater cannot be normalized to number str.\n",
+ "String 5.22 for Cheater Beater cannot be normalized to number str.\n",
+ "String 2017 Komo Mai Drive sold for 900000 cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String 33101 28557 cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "String Unable to determine cannot be normalized to number str.\n",
+ "Close call: Egalitarianism vs egalitarian\n",
+ "Close call: INT. THE CASTLE vs THE CASTLE\n",
+ "Close call: to be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by opposing end them vs To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n",
+ "Close call: Alfonso Cardinal Visconti vs Alfonso Visconti\n",
+ "Close call: Wes Craven's A Nightmare on Elm Street vs A Nightmare on Elm Street\n",
+ "Close call: God said let there be dragons vs Here be dragons\n",
+ "Close call: rockhopper penguins vs Rockhopper penguin\n",
+ "Close call: Harbinger, This Fire, Tidal vs Harbinger, Tidal\n",
+ "Close call: EC 3.1.3.1;EC 1.11.1.7 vs 3.1.3.1; 1.11.1.7\n",
+ "Close call: to be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by opposing end them vs To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n",
+ "Close call: Alfonso Cardinal Visconti vs Alfonso Visconti\n",
+ "Close call: to be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by opposing end them vs To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n",
+ "Close call: Out of the Silent Planet by C.S. Lewis vs Out of the Silent Planet\n",
+ "Close call: broccoli, celery, fresh basil, green beans, lettuce, sweet potatoes vs broccoli, celery, fresh basil, lettuce, sweet potatoes\n",
+ "Close call: To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by opposing end them vs To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n"
+ ]
+ }
+ ],
+ "source": [
+ "import re\n",
+ "from collections import Counter\n",
+ "\n",
+ "from scripts.gaia_scorer import check_close_call, question_scorer\n",
+ "\n",
+ "\n",
+ "result_df[\"is_correct\"] = result_df.apply(lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1)\n",
+ "result_df[\"is_near_correct\"] = result_df.apply(\n",
+ " lambda x: check_close_call(x[\"prediction\"], x[\"true_answer\"], x[\"is_correct\"]),\n",
+ " axis=1,\n",
+ ")\n",
+ "\n",
+ "result_df[\"count_steps\"] = result_df[\"intermediate_steps\"].apply(len)\n",
+ "\n",
+ "\n",
+ "def find_attachment(question):\n",
+ " matches = eval_df.loc[eval_df[\"question\"].apply(lambda x: x in question), \"file_name\"]\n",
+ "\n",
+ " if len(matches) == 0:\n",
+ " return \"Not found\"\n",
+ " file_path = matches.values[0]\n",
+ "\n",
+ " if isinstance(file_path, str) and len(file_path) > 0:\n",
+ " return file_path.split(\".\")[-1]\n",
+ " else:\n",
+ " return \"None\"\n",
+ "\n",
+ "\n",
+ "result_df[\"attachment_type\"] = result_df[\"question\"].apply(find_attachment)\n",
+ "\n",
+ "\n",
+ "def extract_tool_calls(code):\n",
+ " regex = r\"\\b(\\w+)\\(\"\n",
+ " function_calls = [el for el in re.findall(regex, code) if el.islower()]\n",
+ "\n",
+ " function_call_counter = Counter(function_calls)\n",
+ " return function_call_counter\n",
+ "\n",
+ "\n",
+ "def sum_tool_calls(steps):\n",
+ " total_count = Counter()\n",
+ " for step in steps:\n",
+ " if \"llm_output\" in step:\n",
+ " total_count += extract_tool_calls(step[\"llm_output\"])\n",
+ "\n",
+ " return total_count\n",
+ "\n",
+ "\n",
+ "# result_df[\"tool_calls\"] = result_df[\"intermediate_steps\"].apply(sum_tool_calls)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_thoughts(x):\n",
+ " try:\n",
+ " output = x[0][\"task\"]\n",
+ " for y in x[1:]:\n",
+ " try:\n",
+ " if \"observation\" in y:\n",
+ " output += y[\"llm_output\"] + \"\\nObservation:\" + y[\"observation\"]\n",
+ " else:\n",
+ " output += y[\"llm_output\"] + \"\\nError:\" + str(y[\"error\"])\n",
+ " except Exception:\n",
+ " pass\n",
+ " return output\n",
+ " except Exception:\n",
+ " return None\n",
+ "\n",
+ "\n",
+ "result_df[\"thoughts\"] = result_df[\"intermediate_steps\"].apply(lambda x: get_thoughts(x))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "agent_name\n",
+ "code_o3-mini_03_february_remove-navigational 165\n",
+ "code_o1_03_february_text_high-reasoning-effort 165\n",
+ "code_o1_01_february_text 165\n",
+ "code_gpt4o_03_february_text 165\n",
+ "code_o1_03_february_fix-print-outputs 164\n",
+ "code_o1_03_february_remove-navigational 164\n",
+ "code_o1_03_february_goodoldtext-unbroken 161\n",
+ "code_gpt4o_03_february_magenticbrowser 159\n",
+ "code_gpt4o_03_february_goodoldtext-unbroken 159\n",
+ "code_o1_03_february_fix-print-outputs2 156\n",
+ "code_gpt4o_03_february_magenticbrowser2 156\n",
+ "code_o1_29-01_text 105\n",
+ "code_llama-3 90\n",
+ "code_o1_22-01_managedagent-summary_planning 67\n",
+ "code_o1_25-01_visioon 53\n",
+ "code_gpt4o_03_february_goodoldtext 50\n",
+ "code_qwen-coder-32B_03_february_text 43\n",
+ "code_sonnet_03_february_goodoldtext-unbroken 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "result_df[\"agent_name\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 2. Inspect specific runs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "agent_name\n",
+ "code_o3-mini_03_february_remove-navigational 165\n",
+ "code_o1_03_february_text_high-reasoning-effort 165\n",
+ "code_o1_01_february_text 165\n",
+ "code_gpt4o_03_february_text 165\n",
+ "code_o1_03_february_fix-print-outputs 164\n",
+ "code_o1_03_february_remove-navigational 164\n",
+ "code_o1_03_february_goodoldtext-unbroken 161\n",
+ "code_gpt4o_03_february_magenticbrowser 159\n",
+ "code_gpt4o_03_february_goodoldtext-unbroken 159\n",
+ "code_o1_03_february_fix-print-outputs2 156\n",
+ "code_gpt4o_03_february_magenticbrowser2 156\n",
+ "code_o1_29-01_text 105\n",
+ "code_llama-3 90\n",
+ "code_o1_22-01_managedagent-summary_planning 67\n",
+ "code_o1_25-01_visioon 53\n",
+ "code_gpt4o_03_february_goodoldtext 50\n",
+ "code_qwen-coder-32B_03_february_text 43\n",
+ "code_sonnet_03_february_goodoldtext-unbroken 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "agent_name task\n",
+ "code_gpt4o_03_february_goodoldtext 2 26\n",
+ " 1 19\n",
+ " 3 5\n",
+ "code_gpt4o_03_february_goodoldtext-unbroken 2 84\n",
+ " 1 53\n",
+ " 3 22\n",
+ "code_gpt4o_03_february_magenticbrowser 2 83\n",
+ " 1 52\n",
+ " 3 24\n",
+ "code_gpt4o_03_february_magenticbrowser2 2 81\n",
+ " 1 52\n",
+ " 3 23\n",
+ "code_gpt4o_03_february_text 2 86\n",
+ " 1 53\n",
+ " 3 26\n",
+ "code_llama-3 2 50\n",
+ " 1 26\n",
+ " 3 14\n",
+ "code_o1_01_february_text 2 86\n",
+ " 1 53\n",
+ " 3 26\n",
+ "code_o1_03_february_fix-print-outputs 2 85\n",
+ " 1 53\n",
+ " 3 26\n",
+ "code_o1_03_february_fix-print-outputs2 2 79\n",
+ " 1 53\n",
+ " 3 24\n",
+ "code_o1_03_february_goodoldtext-unbroken 2 85\n",
+ " 1 53\n",
+ " 3 23\n",
+ "code_o1_03_february_remove-navigational 2 85\n",
+ " 1 53\n",
+ " 3 26\n",
+ "code_o1_03_february_text_high-reasoning-effort 2 86\n",
+ " 1 53\n",
+ " 3 26\n",
+ "code_o1_22-01_managedagent-summary_planning 2 36\n",
+ " 1 21\n",
+ " 3 10\n",
+ "code_o1_25-01_visioon 2 30\n",
+ " 1 17\n",
+ " 3 6\n",
+ "code_o1_29-01_text 2 58\n",
+ " 1 31\n",
+ " 3 16\n",
+ "code_o3-mini_03_february_remove-navigational 2 86\n",
+ " 1 53\n",
+ " 3 26\n",
+ "code_qwen-coder-32B_03_february_text 2 22\n",
+ " 1 14\n",
+ " 3 7\n",
+ "code_sonnet_03_february_goodoldtext-unbroken 2 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total length: 2188 - is complete: False\n"
+ ]
+ }
+ ],
+ "source": [
+ "o1_vision = \"code_o1_25-01_visioon\"\n",
+ "o1_next = \"code_o1_29-01_text\"\n",
+ "o1 = \"code_o1_01_february_text\"\n",
+ "\n",
+ "list_versions = [o1, o1_vision, o1_next]\n",
+ "\n",
+ "# submission_selection_name = \"react_code_llama3-70b_02-05_full-gaia-validation-code\"\n",
+ "sel_df = result_df\n",
+ "# sel_df = sel_df.loc[\n",
+ "# (result_df[\"agent_name\"].isin(list_versions))\n",
+ "# # & (~result_df[\"question\"].isin(UNSOLVED_QUESTIONS))\n",
+ "# ]\n",
+ "sel_df = sel_df.reset_index(drop=True)\n",
+ "display(sel_df[\"agent_name\"].value_counts())\n",
+ "sel_df = sel_df.drop_duplicates(subset=[\"agent_name\", \"question\"])\n",
+ "display(sel_df.groupby(\"agent_name\")[[\"task\"]].value_counts())\n",
+ "print(\"Total length:\", len(sel_df), \"- is complete:\", len(sel_df) == 165)\n",
+ "# assert sel_df[\"question\"].value_counts().max() == len(list_versions), \"Some questions are duplicate!\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'Average score:'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>is_correct</th>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>agent_name</th>\n",
+ "      <th></th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>code_gpt4o_03_february_goodoldtext</th>\n",
+ "      <td>0.440</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_gpt4o_03_february_goodoldtext-unbroken</th>\n",
+ "      <td>0.384</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_gpt4o_03_february_magenticbrowser</th>\n",
+ "      <td>0.352</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_gpt4o_03_february_magenticbrowser2</th>\n",
+ "      <td>0.365</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_gpt4o_03_february_text</th>\n",
+ "      <td>0.376</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_llama-3</th>\n",
+ "      <td>0.078</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_o1_01_february_text</th>\n",
+ "      <td>0.491</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_o1_03_february_fix-print-outputs</th>\n",
+ "      <td>0.518</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_o1_03_february_fix-print-outputs2</th>\n",
+ "      <td>0.526</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_o1_03_february_goodoldtext-unbroken</th>\n",
+ "      <td>0.534</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_o1_03_february_remove-navigational</th>\n",
+ "      <td>0.537</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_o1_03_february_text_high-reasoning-effort</th>\n",
+ "      <td>0.485</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_o1_22-01_managedagent-summary_planning</th>\n",
+ "      <td>0.418</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_o1_25-01_visioon</th>\n",
+ "      <td>0.340</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_o1_29-01_text</th>\n",
+ "      <td>0.390</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_o3-mini_03_february_remove-navigational</th>\n",
+ "      <td>0.291</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_qwen-coder-32B_03_february_text</th>\n",
+ "      <td>0.209</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>code_sonnet_03_february_goodoldtext-unbroken</th>\n",
+ "      <td>0.000</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " is_correct\n",
+ "agent_name \n",
+ "code_gpt4o_03_february_goodoldtext 0.440\n",
+ "code_gpt4o_03_february_goodoldtext-unbroken 0.384\n",
+ "code_gpt4o_03_february_magenticbrowser 0.352\n",
+ "code_gpt4o_03_february_magenticbrowser2 0.365\n",
+ "code_gpt4o_03_february_text 0.376\n",
+ "code_llama-3 0.078\n",
+ "code_o1_01_february_text 0.491\n",
+ "code_o1_03_february_fix-print-outputs 0.518\n",
+ "code_o1_03_february_fix-print-outputs2 0.526\n",
+ "code_o1_03_february_goodoldtext-unbroken 0.534\n",
+ "code_o1_03_february_remove-navigational 0.537\n",
+ "code_o1_03_february_text_high-reasoning-effort 0.485\n",
+ "code_o1_22-01_managedagent-summary_planning 0.418\n",
+ "code_o1_25-01_visioon 0.340\n",
+ "code_o1_29-01_text 0.390\n",
+ "code_o3-mini_03_february_remove-navigational 0.291\n",
+ "code_qwen-coder-32B_03_february_text 0.209\n",
+ "code_sonnet_03_february_goodoldtext-unbroken 0.000"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " is_correct | \n",
+ " is_near_correct | \n",
+ " count_steps | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " agent_name | \n",
+ " task | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " code_gpt4o_03_february_goodoldtext | \n",
+ " 1 | \n",
+ " 0.631579 | \n",
+ " 0.631579 | \n",
+ " 7.421053 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.346154 | \n",
+ " 0.384615 | \n",
+ " 7.346154 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.200000 | \n",
+ " 0.200000 | \n",
+ " 7.200000 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " code_gpt4o_03_february_goodoldtext-unbroken | \n",
+ " 1 | \n",
+ " 0.452830 | \n",
+ " 0.452830 | \n",
+ " 7.000000 | \n",
+ " 53 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.380952 | \n",
+ " 0.392857 | \n",
+ " 8.511905 | \n",
+ " 84 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.227273 | \n",
+ " 0.227273 | \n",
+ " 10.409091 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " code_gpt4o_03_february_magenticbrowser | \n",
+ " 1 | \n",
+ " 0.480769 | \n",
+ " 0.480769 | \n",
+ " 7.153846 | \n",
+ " 52 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.349398 | \n",
+ " 0.361446 | \n",
+ " 8.168675 | \n",
+ " 83 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.083333 | \n",
+ " 0.083333 | \n",
+ " 10.375000 | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ " code_gpt4o_03_february_magenticbrowser2 | \n",
+ " 1 | \n",
+ " 0.461538 | \n",
+ " 0.461538 | \n",
+ " 6.923077 | \n",
+ " 52 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.345679 | \n",
+ " 0.345679 | \n",
+ " 7.925926 | \n",
+ " 81 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.217391 | \n",
+ " 0.260870 | \n",
+ " 9.739130 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ " code_gpt4o_03_february_text | \n",
+ " 1 | \n",
+ " 0.433962 | \n",
+ " 0.452830 | \n",
+ " 5.924528 | \n",
+ " 53 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.406977 | \n",
+ " 0.418605 | \n",
+ " 7.255814 | \n",
+ " 86 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.153846 | \n",
+ " 0.153846 | \n",
+ " 8.115385 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " code_llama-3 | \n",
+ " 1 | \n",
+ " 0.192308 | \n",
+ " 0.192308 | \n",
+ " 1.230769 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.040000 | \n",
+ " 0.040000 | \n",
+ " 1.080000 | \n",
+ " 50 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.285714 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " code_o1_01_february_text | \n",
+ " 1 | \n",
+ " 0.547170 | \n",
+ " 0.566038 | \n",
+ " 2.849057 | \n",
+ " 53 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.534884 | \n",
+ " 0.534884 | \n",
+ " 3.325581 | \n",
+ " 86 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.230769 | \n",
+ " 0.230769 | \n",
+ " 4.269231 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " code_o1_03_february_fix-print-outputs | \n",
+ " 1 | \n",
+ " 0.622642 | \n",
+ " 0.622642 | \n",
+ " 4.018868 | \n",
+ " 53 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.505882 | \n",
+ " 0.505882 | \n",
+ " 4.270588 | \n",
+ " 85 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.346154 | \n",
+ " 0.346154 | \n",
+ " 5.500000 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " code_o1_03_february_fix-print-outputs2 | \n",
+ " 1 | \n",
+ " 0.641509 | \n",
+ " 0.641509 | \n",
+ " 3.811321 | \n",
+ " 53 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.506329 | \n",
+ " 0.506329 | \n",
+ " 3.784810 | \n",
+ " 79 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.333333 | \n",
+ " 0.333333 | \n",
+ " 3.875000 | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ " code_o1_03_february_goodoldtext-unbroken | \n",
+ " 1 | \n",
+ " 0.622642 | \n",
+ " 0.622642 | \n",
+ " 4.132075 | \n",
+ " 53 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.541176 | \n",
+ " 0.541176 | \n",
+ " 4.152941 | \n",
+ " 85 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.304348 | \n",
+ " 0.304348 | \n",
+ " 4.391304 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ " code_o1_03_february_remove-navigational | \n",
+ " 1 | \n",
+ " 0.641509 | \n",
+ " 0.641509 | \n",
+ " 3.962264 | \n",
+ " 53 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.541176 | \n",
+ " 0.552941 | \n",
+ " 4.164706 | \n",
+ " 85 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.307692 | \n",
+ " 0.307692 | \n",
+ " 5.692308 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " code_o1_03_february_text_high-reasoning-effort | \n",
+ " 1 | \n",
+ " 0.547170 | \n",
+ " 0.547170 | \n",
+ " 3.037736 | \n",
+ " 53 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.523256 | \n",
+ " 0.534884 | \n",
+ " 2.930233 | \n",
+ " 86 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.230769 | \n",
+ " 0.230769 | \n",
+ " 3.653846 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " code_o1_22-01_managedagent-summary_planning | \n",
+ " 1 | \n",
+ " 0.476190 | \n",
+ " 0.523810 | \n",
+ " 5.047619 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.472222 | \n",
+ " 0.500000 | \n",
+ " 5.222222 | \n",
+ " 36 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.100000 | \n",
+ " 0.100000 | \n",
+ " 5.500000 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " code_o1_25-01_visioon | \n",
+ " 1 | \n",
+ " 0.411765 | \n",
+ " 0.411765 | \n",
+ " 5.294118 | \n",
+ " 17 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.366667 | \n",
+ " 0.366667 | \n",
+ " 5.333333 | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 6.666667 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " code_o1_29-01_text | \n",
+ " 1 | \n",
+ " 0.516129 | \n",
+ " 0.516129 | \n",
+ " 4.967742 | \n",
+ " 31 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.379310 | \n",
+ " 0.431034 | \n",
+ " 5.241379 | \n",
+ " 58 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.187500 | \n",
+ " 0.187500 | \n",
+ " 6.500000 | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " code_o3-mini_03_february_remove-navigational | \n",
+ " 1 | \n",
+ " 0.452830 | \n",
+ " 0.452830 | \n",
+ " 5.056604 | \n",
+ " 53 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.232558 | \n",
+ " 0.244186 | \n",
+ " 4.976744 | \n",
+ " 86 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.153846 | \n",
+ " 0.153846 | \n",
+ " 6.615385 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " code_qwen-coder-32B_03_february_text | \n",
+ " 1 | \n",
+ " 0.357143 | \n",
+ " 0.357143 | \n",
+ " 5.428571 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.136364 | \n",
+ " 0.136364 | \n",
+ " 6.409091 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.142857 | \n",
+ " 0.142857 | \n",
+ " 6.571429 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " code_sonnet_03_february_goodoldtext-unbroken | \n",
+ " 2 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 5.000000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " is_correct \\\n",
+ "agent_name task \n",
+ "code_gpt4o_03_february_goodoldtext 1 0.631579 \n",
+ " 2 0.346154 \n",
+ " 3 0.200000 \n",
+ "code_gpt4o_03_february_goodoldtext-unbroken 1 0.452830 \n",
+ " 2 0.380952 \n",
+ " 3 0.227273 \n",
+ "code_gpt4o_03_february_magenticbrowser 1 0.480769 \n",
+ " 2 0.349398 \n",
+ " 3 0.083333 \n",
+ "code_gpt4o_03_february_magenticbrowser2 1 0.461538 \n",
+ " 2 0.345679 \n",
+ " 3 0.217391 \n",
+ "code_gpt4o_03_february_text 1 0.433962 \n",
+ " 2 0.406977 \n",
+ " 3 0.153846 \n",
+ "code_llama-3 1 0.192308 \n",
+ " 2 0.040000 \n",
+ " 3 0.000000 \n",
+ "code_o1_01_february_text 1 0.547170 \n",
+ " 2 0.534884 \n",
+ " 3 0.230769 \n",
+ "code_o1_03_february_fix-print-outputs 1 0.622642 \n",
+ " 2 0.505882 \n",
+ " 3 0.346154 \n",
+ "code_o1_03_february_fix-print-outputs2 1 0.641509 \n",
+ " 2 0.506329 \n",
+ " 3 0.333333 \n",
+ "code_o1_03_february_goodoldtext-unbroken 1 0.622642 \n",
+ " 2 0.541176 \n",
+ " 3 0.304348 \n",
+ "code_o1_03_february_remove-navigational 1 0.641509 \n",
+ " 2 0.541176 \n",
+ " 3 0.307692 \n",
+ "code_o1_03_february_text_high-reasoning-effort 1 0.547170 \n",
+ " 2 0.523256 \n",
+ " 3 0.230769 \n",
+ "code_o1_22-01_managedagent-summary_planning 1 0.476190 \n",
+ " 2 0.472222 \n",
+ " 3 0.100000 \n",
+ "code_o1_25-01_visioon 1 0.411765 \n",
+ " 2 0.366667 \n",
+ " 3 0.000000 \n",
+ "code_o1_29-01_text 1 0.516129 \n",
+ " 2 0.379310 \n",
+ " 3 0.187500 \n",
+ "code_o3-mini_03_february_remove-navigational 1 0.452830 \n",
+ " 2 0.232558 \n",
+ " 3 0.153846 \n",
+ "code_qwen-coder-32B_03_february_text 1 0.357143 \n",
+ " 2 0.136364 \n",
+ " 3 0.142857 \n",
+ "code_sonnet_03_february_goodoldtext-unbroken 2 0.000000 \n",
+ "\n",
+ " is_near_correct \\\n",
+ "agent_name task \n",
+ "code_gpt4o_03_february_goodoldtext 1 0.631579 \n",
+ " 2 0.384615 \n",
+ " 3 0.200000 \n",
+ "code_gpt4o_03_february_goodoldtext-unbroken 1 0.452830 \n",
+ " 2 0.392857 \n",
+ " 3 0.227273 \n",
+ "code_gpt4o_03_february_magenticbrowser 1 0.480769 \n",
+ " 2 0.361446 \n",
+ " 3 0.083333 \n",
+ "code_gpt4o_03_february_magenticbrowser2 1 0.461538 \n",
+ " 2 0.345679 \n",
+ " 3 0.260870 \n",
+ "code_gpt4o_03_february_text 1 0.452830 \n",
+ " 2 0.418605 \n",
+ " 3 0.153846 \n",
+ "code_llama-3 1 0.192308 \n",
+ " 2 0.040000 \n",
+ " 3 0.000000 \n",
+ "code_o1_01_february_text 1 0.566038 \n",
+ " 2 0.534884 \n",
+ " 3 0.230769 \n",
+ "code_o1_03_february_fix-print-outputs 1 0.622642 \n",
+ " 2 0.505882 \n",
+ " 3 0.346154 \n",
+ "code_o1_03_february_fix-print-outputs2 1 0.641509 \n",
+ " 2 0.506329 \n",
+ " 3 0.333333 \n",
+ "code_o1_03_february_goodoldtext-unbroken 1 0.622642 \n",
+ " 2 0.541176 \n",
+ " 3 0.304348 \n",
+ "code_o1_03_february_remove-navigational 1 0.641509 \n",
+ " 2 0.552941 \n",
+ " 3 0.307692 \n",
+ "code_o1_03_february_text_high-reasoning-effort 1 0.547170 \n",
+ " 2 0.534884 \n",
+ " 3 0.230769 \n",
+ "code_o1_22-01_managedagent-summary_planning 1 0.523810 \n",
+ " 2 0.500000 \n",
+ " 3 0.100000 \n",
+ "code_o1_25-01_visioon 1 0.411765 \n",
+ " 2 0.366667 \n",
+ " 3 0.000000 \n",
+ "code_o1_29-01_text 1 0.516129 \n",
+ " 2 0.431034 \n",
+ " 3 0.187500 \n",
+ "code_o3-mini_03_february_remove-navigational 1 0.452830 \n",
+ " 2 0.244186 \n",
+ " 3 0.153846 \n",
+ "code_qwen-coder-32B_03_february_text 1 0.357143 \n",
+ " 2 0.136364 \n",
+ " 3 0.142857 \n",
+ "code_sonnet_03_february_goodoldtext-unbroken 2 0.000000 \n",
+ "\n",
+ " count_steps count \n",
+ "agent_name task \n",
+ "code_gpt4o_03_february_goodoldtext 1 7.421053 19 \n",
+ " 2 7.346154 26 \n",
+ " 3 7.200000 5 \n",
+ "code_gpt4o_03_february_goodoldtext-unbroken 1 7.000000 53 \n",
+ " 2 8.511905 84 \n",
+ " 3 10.409091 22 \n",
+ "code_gpt4o_03_february_magenticbrowser 1 7.153846 52 \n",
+ " 2 8.168675 83 \n",
+ " 3 10.375000 24 \n",
+ "code_gpt4o_03_february_magenticbrowser2 1 6.923077 52 \n",
+ " 2 7.925926 81 \n",
+ " 3 9.739130 23 \n",
+ "code_gpt4o_03_february_text 1 5.924528 53 \n",
+ " 2 7.255814 86 \n",
+ " 3 8.115385 26 \n",
+ "code_llama-3 1 1.230769 26 \n",
+ " 2 1.080000 50 \n",
+ " 3 0.285714 14 \n",
+ "code_o1_01_february_text 1 2.849057 53 \n",
+ " 2 3.325581 86 \n",
+ " 3 4.269231 26 \n",
+ "code_o1_03_february_fix-print-outputs 1 4.018868 53 \n",
+ " 2 4.270588 85 \n",
+ " 3 5.500000 26 \n",
+ "code_o1_03_february_fix-print-outputs2 1 3.811321 53 \n",
+ " 2 3.784810 79 \n",
+ " 3 3.875000 24 \n",
+ "code_o1_03_february_goodoldtext-unbroken 1 4.132075 53 \n",
+ " 2 4.152941 85 \n",
+ " 3 4.391304 23 \n",
+ "code_o1_03_february_remove-navigational 1 3.962264 53 \n",
+ " 2 4.164706 85 \n",
+ " 3 5.692308 26 \n",
+ "code_o1_03_february_text_high-reasoning-effort 1 3.037736 53 \n",
+ " 2 2.930233 86 \n",
+ " 3 3.653846 26 \n",
+ "code_o1_22-01_managedagent-summary_planning 1 5.047619 21 \n",
+ " 2 5.222222 36 \n",
+ " 3 5.500000 10 \n",
+ "code_o1_25-01_visioon 1 5.294118 17 \n",
+ " 2 5.333333 30 \n",
+ " 3 6.666667 6 \n",
+ "code_o1_29-01_text 1 4.967742 31 \n",
+ " 2 5.241379 58 \n",
+ " 3 6.500000 16 \n",
+ "code_o3-mini_03_february_remove-navigational 1 5.056604 53 \n",
+ " 2 4.976744 86 \n",
+ " 3 6.615385 26 \n",
+ "code_qwen-coder-32B_03_february_text 1 5.428571 14 \n",
+ " 2 6.409091 22 \n",
+ " 3 6.571429 7 \n",
+ "code_sonnet_03_february_goodoldtext-unbroken 2 5.000000 1 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "display(\"Average score:\", sel_df.groupby(\"agent_name\")[[\"is_correct\"]].mean().round(3))\n",
+ "display(\n",
+ " sel_df.groupby([\"agent_name\", \"task\"])[[\"is_correct\", \"is_near_correct\", \"count_steps\", \"question\"]]\n",
+ " .agg(\n",
+ " {\n",
+ " \"is_correct\": \"mean\",\n",
+ " \"is_near_correct\": \"mean\",\n",
+ " \"count_steps\": \"mean\",\n",
+ " \"question\": \"count\",\n",
+ " }\n",
+ " )\n",
+ " .rename(columns={\"question\": \"count\"})\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.plotly.v1+json": {
+ "config": {
+ "plotlyServerURL": "https://plot.ly"
+ },
+ "data": [
+ {
+ "customdata": [
+ [
+ "The attached spreadsheet shows the inventory for a"
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "Each cell in the attached spreadsheet represents a"
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "How many applicants for the job in the PDF are onl"
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "The attached file contains a list of vendors in th"
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "Given this table defining * on the set S = {a, b, "
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "What writer is quoted by Merriam-Webster for the W"
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "The following numbers function similarly to ISBN 1"
+ ],
+ [
+ "The attached file shows a list of books in the col"
+ ],
+ [
+ "On a leap day before the year 2008, a joke was rem"
+ ]
+ ],
+ "hovertemplate": "agent_name=code_gpt4o_03_february_goodoldtext
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_gpt4o_03_february_goodoldtext",
+ "line": {
+ "color": "#636efa",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_gpt4o_03_february_goodoldtext",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDE=",
+ "dtype": "i1"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADgPxzHcRzHcdw/AAAAAAAA4D8XXXTRRRfdP6uqqqqqqto/ntiJndiJ3T/btm3btm3bP5qZmZmZmdk/AAAAAAAA2D+XlpaWlpbWP1VVVVVVVdU/Q3kN5TWU1z+amZmZmZnZP9u2bdu2bds/F1100UUX3T+96U1vetPbP6uqqqqqqto/mpmZmZmZ2T87sRM7sRPbPxzHcRzHcdw/btu2bdu23T/UCMs9jbDcP97d3d3d3d0/55xzzjnn3D8AAAAAAADcPxdddNFFF90/PDw8PDw83D8d1EEd1EHdP47jOI7jON4/KvJZN5gi3z8N5TWU11DeP9/yLd/yLd8/ZmZmZmZm3j+pXYnalajdP57neZ7ned4/cUfcEXfE3T+MLrroooveP97d3d3d3d0/05ve9KY33T9yBTG5gpjcPwAAAAAAANw/L6fg5RS83D8pXI/C9SjcPw==",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "The attached spreadsheet shows the inventory for a"
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "How many applicants for the job in the PDF are onl"
+ ],
+ [
+ "Each cell in the attached spreadsheet represents a"
+ ],
+ [
+ "The attached file contains a list of vendors in th"
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "Could you help me out with this assignment? Our pr"
+ ],
+ [
+ "Given this table defining * on the set S = {a, b, "
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ "The attached file shows a list of books in the col"
+ ],
+ [
+ "As a comma separated list with no whitespace, usin"
+ ],
+ [
+ "The following numbers function similarly to ISBN 1"
+ ],
+ [
+ "What writer is quoted by Merriam-Webster for the W"
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "Who nominated the only Featured Article on English"
+ ],
+ [
+ "Using bass clef notes, what is the age of someone "
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "The Metropolitan Museum of Art has a portrait in i"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "The attached file lists accommodations in the reso"
+ ],
+ [
+ "How many images are there in the latest 2022 Lego "
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "If there is anything that doesn't make sense in th"
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "According to Google Finance, when was the first ye"
+ ],
+ [
+ "You are a telecommunications engineer who wants to"
+ ],
+ [
+ "How many slides in this PowerPoint presentation me"
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "You are Van Helsing, a renowned vampire hunter. A "
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "In the 2018 VSCode blog post on replit.com, what w"
+ ],
+ [
+ "In the endnote found in the second-to-last paragra"
+ ],
+ [
+ "This is a secret message my friend gave me. It say"
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "Find the value of x to the nearest tenth: Lx = (d/"
+ ],
+ [
+ "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+ ],
+ [
+ "Examine the video at https://www.youtube.com/watch"
+ ],
+ [
+ "The attached spreadsheet contains the sales of men"
+ ],
+ [
+ "The attached file shows the locomotives in the col"
+ ],
+ [
+ "What is the area of the green polygon in the attac"
+ ],
+ [
+ "The Latin root of the Yola word \"gimlie\" shares a "
+ ],
+ [
+ "In the NIH translation of the original 1913 Michae"
+ ],
+ [
+ "I was referencing each of the tables in the file f"
+ ],
+ [
+ "I'm making a grocery list for my mom, but she's a "
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "Hi, I'm making a pie but I could use some help wit"
+ ],
+ [
+ "Look at the attached image. The quiz is scored as "
+ ],
+ [
+ "As of the 2020 census, what was the population dif"
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "Who composed the song that was performed by a roos"
+ ],
+ [
+ "I have the Standard plan in the image below, and I"
+ ],
+ [
+ "I was trying to remember how well the Cheater Beat"
+ ],
+ [
+ "I’m thinking about selling my home, so I want to l"
+ ],
+ [
+ "You are given this Excel file as a map. You start "
+ ],
+ [
+ "On July 15, 2008, Phys.org published an article ab"
+ ],
+ [
+ "The attached PDF lists accommodations in the resor"
+ ],
+ [
+ "How many pages if the 2023 IPCC report (85 pages v"
+ ],
+ [
+ "This spreadsheet contains a list of clients for a "
+ ],
+ [
+ "What percentage of the total penguin population ac"
+ ],
+ [
+ "On the BBC Earth YouTube video of the Top 5 Sillie"
+ ],
+ [
+ "In the Scikit-Learn July 2017 changelog, what othe"
+ ],
+ [
+ "How many edits were made to the Wikipedia page on "
+ ],
+ [
+ "According to wikipedia, how many Asian countries s"
+ ],
+ [
+ "On the DeepFruits fruit detection graph on Connect"
+ ],
+ [
+ "Pull out the sentence in the following 5x7 block o"
+ ],
+ [
+ "What is the final numeric output from the attached"
+ ],
+ [
+ "Bob was invited to participate in a game show, and"
+ ],
+ [
+ "The book with the doi 10.1353/book.24372 concerns "
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "In the year 2022, and before December, what does \""
+ ],
+ [
+ "Who did the actor who played Ray in the Polish-lan"
+ ],
+ [
+ "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n"
+ ],
+ [
+ "The attached image contains a Python script. Run t"
+ ],
+ [
+ "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
+ ],
+ [
+ "What is the last word before the second chorus of "
+ ],
+ [
+ "How many at bats did the Yankee with the most walk"
+ ],
+ [
+ "Which of the fruits shown in the 2008 painting \"Em"
+ ],
+ [
+ "What is the volume in milliliters of a system comp"
+ ],
+ [
+ "On a leap day before the year 2008, a joke was rem"
+ ],
+ [
+ "The attached spreadsheet lists the locomotives own"
+ ],
+ [
+ "The longest-lived vertebrate is named after an isl"
+ ],
+ [
+ "A 5-man group made up of one tank, one healer, and"
+ ],
+ [
+ "The attached file lists the locomotives owned by a"
+ ],
+ [
+ "The cover of the August 2021 issue of Vogue shows "
+ ],
+ [
+ "How many more blocks (also denoted as layers) in B"
+ ],
+ [
+ "Hi, I was out sick from my classes on Friday, so I"
+ ],
+ [
+ "How many nonindigenous crocodiles were found in Fl"
+ ],
+ [
+ "The work referenced in footnote 397 of Federico La"
+ ],
+ [
+ "Take the gender split from the 2011 Bulgarian cens"
+ ],
+ [
+ "The YouTube channel Game Grumps began a Let’s Play"
+ ],
+ [
+ "The year is 2022. I am at the National Air and Spa"
+ ],
+ [
+ "On June 6, 2023, an article by Carolyn Collins Pet"
+ ],
+ [
+ "The attached spreadsheet contains a list of books "
+ ],
+ [
+ "What is the absolute difference in tens of thousan"
+ ],
+ [
+ "It's May 2023, and I'm about to drive across the U"
+ ],
+ [
+ "How many times was a Twitter/X post cited as a ref"
+ ],
+ [
+ "During the first week of August 2015, one of the N"
+ ],
+ [
+ "Who are the pitchers with the number before and af"
+ ],
+ [
+ "Where were the Vietnamese specimens described by K"
+ ],
+ [
+ "A standard Rubik’s cube has been broken into cubes"
+ ],
+ [
+ "All of the individuals who formally held the posit"
+ ],
+ [
+ "What is the first name of the only Malko Competiti"
+ ],
+ [
+ "What is the latest chronological year date written"
+ ],
+ [
+ "If this whole pint is made up of ice cream, how ma"
+ ],
+ [
+ "According to Girls Who Code, how long did it take "
+ ],
+ [
+ "Of the cities within the United States where U.S. "
+ ],
+ [
+ "What was the actual enrollment count of the clinic"
+ ],
+ [
+ "What country had the least number of athletes at t"
+ ],
+ [
+ "In the 2015 Metropolitan Museum of Art exhibition "
+ ],
+ [
+ "On ScienceDirect, what is the difference to 3 deci"
+ ],
+ [
+ "On Cornell Law School website's legal information "
+ ],
+ [
+ "What is the surname of the equine veterinarian men"
+ ],
+ [
+ "According to Openreview.net, at the NeurIPS 2022 C"
+ ],
+ [
+ "I'd like to learn more about some popular reality "
+ ],
+ [
+ "The attached Excel file contains the sales of menu"
+ ],
+ [
+ "As of May 2023, how many stops are between South S"
+ ],
+ [
+ "In the film Goldfinger, what color was the object "
+ ],
+ [
+ "What was the complete title of the book in which t"
+ ],
+ [
+ "The brand that makes these harnesses the dogs are "
+ ],
+ [
+ "Eva Draconis has a personal website which can be a"
+ ],
+ [
+ "When was a picture of St. Thomas Aquinas first add"
+ ],
+ [
+ "In NASA's Astronomy Picture of the Day on 2006 Jan"
+ ],
+ [
+ "I'm curious about how much information is availabl"
+ ],
+ [
+ "At the two-minute mark in the YouTube video upload"
+ ],
+ [
+ "I read a paper about multiwavelength observations "
+ ],
+ [
+ "I thought we could try a fun word puzzle together "
+ ],
+ [
+ "According to the USGS, in what year was the Americ"
+ ],
+ [
+ "As of August 2023, who is the only winner of the U"
+ ]
+ ],
+ "hovertemplate": "agent_name=code_gpt4o_03_february_goodoldtext-unbroken
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_gpt4o_03_february_goodoldtext-unbroken",
+ "line": {
+ "color": "#EF553B",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_gpt4o_03_february_goodoldtext-unbroken",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4A",
+ "dtype": "i2"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA6D8zMzMzMzPjP1VVVVVVVeU/t23btm3b5j8AAAAAAADoP1VVVVVVVeU/MzMzMzMz4z9ddNFFF13kP1VVVVVVVeU/dmIndmIn5j8lSZIkSZLkPzMzMzMzM+M/AAAAAAAA4j/T0tLS0tLiP3Icx3Ecx+E/bCivobyG4j+amZmZmZnhP5IkSZIkSeI/dNFFF1104T8LWchCFrLgP1VVVVVVVeE/7FG4HoXr4T+xEzuxEzvhP3sJ7SW0l+A/AAAAAAAA4D/d0wjLPY3gPxEREREREeE/hBBCCCGE4D8AAAAAAADgPwgffPDBB98/AAAAAAAA4D9f8RVf8RXfP47jOI7jON4/fdYNpshn3T8bymsor6HcP1y+5Vu+5ds/zczMzMzM3D8ZnI/B+RjcPz3P8zzP89w/EnfEHXFH3D+jiy666KLbPxzHcRzHcdw/05ve9KY33T94Nuo7G/XdP1VVVVVVVd0/L6fg5RS83D9xPQrXo3DdP93c3Nzc3Nw/7MRO7MRO3D8iNcF4K/vcPxPaS2gvod0/F1100UUX3T8lSZIkSZLcPx/BfQT3Edw/GmG5pxGW2z91Xx5bETTcP7y7u7u7u9s/Q7CONu9T3D/fe++9997bP9u2bdu2bds/AAAAAAAA2z9bqZVaqZXaPyebbLLJJts/NSbSA5Wz2z88PDw8PDzcP8y1A3PtwNw/fMVXfMVX3D8LmwOJVtjcPxzHcRzHcdw/4MCBAwcO3D/QusEU+azbP08b6LSBTts/KK+hvIby2j++Y2pg75jaPxqkQRqkQdo/2TMQlY7s2T+amZmZmZnZP+Dp1vywSNk/+hicj8H52D+q82sPuazYP0mSJEmSJNk/2djY2NjY2D9T1pQ1ZU3ZPzv0m61Dv9k/L7rooosu2j+e8YxnPOPZP5qZmZmZmdk/WqAFWqAF2j+c3vSmN73ZP3bZZZdddtk/Z6O+s1Hf2T+amZmZmZnZPwAAAAAAANo/Grab5Ulk2j+IxvrQWB/aPywFav1Kgdo/PQrXo3A92j8ZvhEFJp3aP/v6+vr6+to/G0PTHey32j87sRM7sRPbP9u2bdu2bds/ln0OqQnG2z8T6J26loPbPya0l9BeQts/JrBpP1kC2z/D2jesfcPaP5ax/Y5eGds/27Zt27Zt2z80+bJBky/bPyivobyG8to/q8FzBIq22j8+jbDc0wjbP5u1WZu1Wds/BA0ndV8e2z+bCOSaCOTaPzMzMzMzM9s/hYn3I6f52j+f4pIhWEfbPw8b6bCRDts/W2uttdZa2z/ZzvdT46XbP/y+7/u+79s/7na73W632z8AAAAAAADcP/KGvCFvyNs/HLmRG7mR2z8j+oDq2FvbPyebbLLJJts/27Zt27Zt2z9YYyI9UDnbP1uwBVuwBds/09LS0tLS2j/TVwljs6DaP6c3velNb9o/D+jGPH202j87qIM6qIPaP2le/ImEU9o/gkQrbA4k2j9r/N08QvXZP3Icx3Ecx9k/mpmZmZmZ2T/Lli1btmzZP2x21CLkr9k/I591gyny2T9SkPx5lcXZP5qZmZmZmdk/y7hl3DJu2T82lNdQXkPZP9ouhNkuhNk/EWflJ8RZ2T8wWf6S5S/ZP2mQBmmQBtk/fo/ICcLd2D9rcRPmd7XYP3bpMX+vjdg/",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "The attached spreadsheet shows the inventory for a"
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "Each cell in the attached spreadsheet represents a"
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "How many applicants for the job in the PDF are onl"
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "In the 2018 VSCode blog post on replit.com, what w"
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "The attached file contains a list of vendors in th"
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "Given this table defining * on the set S = {a, b, "
+ ],
+ [
+ "Could you help me out with this assignment? Our pr"
+ ],
+ [
+ "What writer is quoted by Merriam-Webster for the W"
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "The Metropolitan Museum of Art has a portrait in i"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "The following numbers function similarly to ISBN 1"
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "As a comma separated list with no whitespace, usin"
+ ],
+ [
+ "The attached file shows a list of books in the col"
+ ],
+ [
+ "Who nominated the only Featured Article on English"
+ ],
+ [
+ "According to Google Finance, when was the first ye"
+ ],
+ [
+ "What animals that were mentioned in both Ilias Lag"
+ ],
+ [
+ "The attached file lists accommodations in the reso"
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "How many images are there in the latest 2022 Lego "
+ ],
+ [
+ "Using bass clef notes, what is the age of someone "
+ ],
+ [
+ "If there is anything that doesn't make sense in th"
+ ],
+ [
+ "In the NIH translation of the original 1913 Michae"
+ ],
+ [
+ "How many edits were made to the Wikipedia page on "
+ ],
+ [
+ "You are a telecommunications engineer who wants to"
+ ],
+ [
+ "On July 15, 2008, Phys.org published an article ab"
+ ],
+ [
+ "Find the value of x to the nearest tenth: Lx = (d/"
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "How many slides in this PowerPoint presentation me"
+ ],
+ [
+ "How many pages if the 2023 IPCC report (85 pages v"
+ ],
+ [
+ "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+ ],
+ [
+ "You are Van Helsing, a renowned vampire hunter. A "
+ ],
+ [
+ "This is a secret message my friend gave me. It say"
+ ],
+ [
+ "The attached file shows the locomotives in the col"
+ ],
+ [
+ "Examine the video at https://www.youtube.com/watch"
+ ],
+ [
+ "What is the area of the green polygon in the attac"
+ ],
+ [
+ "As of the 2020 census, what was the population dif"
+ ],
+ [
+ "What is the volume in milliliters of a system comp"
+ ],
+ [
+ "The attached spreadsheet contains the sales of men"
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "I was referencing each of the tables in the file f"
+ ],
+ [
+ "According to wikipedia, how many Asian countries s"
+ ],
+ [
+ "Who composed the song that was performed by a roos"
+ ],
+ [
+ "On a leap day before the year 2008, a joke was rem"
+ ],
+ [
+ "What percentage of the total penguin population ac"
+ ],
+ [
+ "Look at the attached image. The quiz is scored as "
+ ],
+ [
+ "You are given this Excel file as a map. You start "
+ ],
+ [
+ "The work referenced in footnote 397 of Federico La"
+ ],
+ [
+ "In the endnote found in the second-to-last paragra"
+ ],
+ [
+ "The Latin root of the Yola word \"gimlie\" shares a "
+ ],
+ [
+ "Hi, I'm making a pie but I could use some help wit"
+ ],
+ [
+ "I thought we could try a fun word puzzle together "
+ ],
+ [
+ "I have the Standard plan in the image below, and I"
+ ],
+ [
+ "I was trying to remember how well the Cheater Beat"
+ ],
+ [
+ "What is the last word before the second chorus of "
+ ],
+ [
+ "I’m thinking about selling my home, so I want to l"
+ ],
+ [
+ "The attached PDF lists accommodations in the resor"
+ ],
+ [
+ "This spreadsheet contains a list of clients for a "
+ ],
+ [
+ "The attached image contains a Python script. Run t"
+ ],
+ [
+ "On ScienceDirect, what is the difference to 3 deci"
+ ],
+ [
+ "In the year 2022, and before December, what does \""
+ ],
+ [
+ "What is the final numeric output from the attached"
+ ],
+ [
+ "How many nonindigenous crocodiles were found in Fl"
+ ],
+ [
+ "What is the surname of the equine veterinarian men"
+ ],
+ [
+ "It's May 2023, and I'm about to drive across the U"
+ ],
+ [
+ "Who did the actor who played Ray in the Polish-lan"
+ ],
+ [
+ "In the Scikit-Learn July 2017 changelog, what othe"
+ ],
+ [
+ "On the BBC Earth YouTube video of the Top 5 Sillie"
+ ],
+ [
+ "Pull out the sentence in the following 5x7 block o"
+ ],
+ [
+ "The YouTube channel Game Grumps began a Let’s Play"
+ ],
+ [
+ "How many times was a Twitter/X post cited as a ref"
+ ],
+ [
+ "Which of the fruits shown in the 2008 painting \"Em"
+ ],
+ [
+ "The attached spreadsheet contains a list of books "
+ ],
+ [
+ "The longest-lived vertebrate is named after an isl"
+ ],
+ [
+ "During the first week of August 2015, one of the N"
+ ],
+ [
+ "What is the latest chronological year date written"
+ ],
+ [
+ "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n"
+ ],
+ [
+ "Bob was invited to participate in a game show, and"
+ ],
+ [
+ "How many more blocks (also denoted as layers) in B"
+ ],
+ [
+ "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
+ ],
+ [
+ "According to Girls Who Code, how long did it take "
+ ],
+ [
+ "On the DeepFruits fruit detection graph on Connect"
+ ],
+ [
+ "All of the individuals who formally held the posit"
+ ],
+ [
+ "The attached file lists the locomotives owned by a"
+ ],
+ [
+ "The cover of the August 2021 issue of Vogue shows "
+ ],
+ [
+ "Hi, I was out sick from my classes on Friday, so I"
+ ],
+ [
+ "What is the absolute difference in tens of thousan"
+ ],
+ [
+ "The year is 2022. I am at the National Air and Spa"
+ ],
+ [
+ "The attached spreadsheet lists the locomotives own"
+ ],
+ [
+ "The brand that makes these harnesses the dogs are "
+ ],
+ [
+ "On June 6, 2023, an article by Carolyn Collins Pet"
+ ],
+ [
+ "Eva Draconis has a personal website which can be a"
+ ],
+ [
+ "Take the gender split from the 2011 Bulgarian cens"
+ ],
+ [
+ "A standard Rubik’s cube has been broken into cubes"
+ ],
+ [
+ "I'm making a grocery list for my mom, but she's a "
+ ],
+ [
+ "If this whole pint is made up of ice cream, how ma"
+ ],
+ [
+ "What country had the least number of athletes at t"
+ ],
+ [
+ "Where were the Vietnamese specimens described by K"
+ ],
+ [
+ "How many at bats did the Yankee with the most walk"
+ ],
+ [
+ "The attached Excel file contains the sales of menu"
+ ],
+ [
+ "What was the complete title of the book in which t"
+ ],
+ [
+ "What was the actual enrollment count of the clinic"
+ ],
+ [
+ "Who are the pitchers with the number before and af"
+ ],
+ [
+ "In the YouTube 360 VR video from March 2018 narrat"
+ ],
+ [
+ "As of May 2023, how many stops are between South S"
+ ],
+ [
+ "I'd like to learn more about some popular reality "
+ ],
+ [
+ "In the film Goldfinger, what color was the object "
+ ],
+ [
+ "A 5-man group made up of one tank, one healer, and"
+ ],
+ [
+ "According to the USGS, in what year was the Americ"
+ ],
+ [
+ "What is the first name of the only Malko Competiti"
+ ],
+ [
+ "I'm curious about how much information is availabl"
+ ],
+ [
+ "When was a picture of St. Thomas Aquinas first add"
+ ],
+ [
+ "I read a paper about multiwavelength observations "
+ ],
+ [
+ "At the two-minute mark in the YouTube video upload"
+ ],
+ [
+ "In the 2015 Metropolitan Museum of Art exhibition "
+ ],
+ [
+ "According to Openreview.net, at the NeurIPS 2022 C"
+ ],
+ [
+ "In NASA's Astronomy Picture of the Day on 2006 Jan"
+ ],
+ [
+ "As of August 2023, who is the only winner of the U"
+ ],
+ [
+ "Of the cities within the United States where U.S. "
+ ]
+ ],
+ "hovertemplate": "agent_name=code_gpt4o_03_february_magenticbrowser
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_gpt4o_03_february_magenticbrowser",
+ "line": {
+ "color": "#00cc96",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_gpt4o_03_february_magenticbrowser",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4A",
+ "dtype": "i2"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACamZmZmZnJP1VVVVVVVcU/kiRJkiRJwj8AAAAAAADAPxzHcRzHcbw/mpmZmZmZyT900UUXXXTRPwAAAAAAANA/FDuxEzux0z+3bdu2bdvWP1VVVVVVVdU/AAAAAAAA1D/T0tLS0tLSP3Icx3Ecx9E/eQ3lNZTX0D8AAAAAAADQP5IkSZIkSdI/dNFFF1100T84velNb3rTP1VVVVVVVdU/exSuR+F61D8UO7ETO7HTP2gvob2E9tI/kiRJkiRJ0j8Jyz2NsNzTPzMzMzMzM9M/lVJKKaWU0j8AAAAAAADUP1VVVVVVVdU/tbS0tLS01D/UQR3UQR3UP+Q4juM4jtM/HEyRz7rB1D9RXkN5DeXVP1VVVVVVVdU/ZmZmZmZm1j/blahdidrVP7dt27Zt29Y/lTVlTVlT1j9GF1100UXXPxdswRZswdY/etOb3vSm1z9dQUyuICbXPwAAAAAAANg/4eUUvJyC1z8K16NwPQrXP9jX19fX19c/J3ZiJ3Zi1z9ln0NqgvHWP0xoL6G9hNY/RhdddNFF1z+3bdu2bdvWP0xnMZ3FdNY/fBphuacR1j/QcFL35bHVP1VVVVVVVdU/yRCso8371D+ttdZaa63VP1VVVVVVVdU/AAAAAAAA1T/VSq3USq3UP1VVVVVVVdU/0gOVs1v41T+mpaWlpaXVP1VVVVVVVdU/Fl/xFV/x1T8g0QqbA4nWP47jOI7jONY/r169evXq1T/JZ91ginzWPwrXo3A9Ctc/ymsor6G81j8oxFn5CXHWP3ZiJ3ZiJ9Y/Xi1uwvyu1j9mZmZmZmbWP6QMPN2aH9Y/25WoXYna1T80dX7tIZfVP1VVVVVVVdU/FRUVFRUV1T82ZU1ZU9bUPy+QSfECmdQ/XXTRRRdd1D9CEYpQhCLUP5Q+6ZM+6dM/VEZlVEZl1D9DFrKQhSzUP6WUUkoppdQ/Ut/ZqO9s1D8mTv2eW+LUP1VVVVVVVdU/1g86KvDF1T9jfWisD43VP1DrVwrU+tU/w/UoXI/C1T+bB7nrZ4vVP/b19fX19dU/2xia7mC/1T+e2Imd2InVP1VVVVVVVdU/2eeQmmC81T/XcnCzX4jVP9FeQnsJ7dU//mQJbNpP1j8c1r5h7RvWP49eGdvv6NU/btu2bdu21T96amGlpxbWP0xnMZ3FdNY/bTV4jkDR1j9Y7mmE5Z7WP9ZmbdZmbdY/QcNJ3ZfH1j/XRCDXRCDXP3d3d3d3d9c/RhdddNFF1z8RrKPN+xTXP+UWT27x5NY/Ouecc8451z8K16NwPQrXP9d1Xdd1Xdc/7PV6vV6v1z8AAAAAAIDXP/QFfUFf0Nc/GHqhF3qh1z/f2jDNXfDXP8IHH3zwwdc/9oDZA2YP2D9JD1TObuHXP0J7Ce0ltNc/iIeHh4eH1z82C6o9J9PXP4K5dmCuHdg/6qPVJETx1z+ogzqogzrYP2C3x1qGDtg/Zfx2qSfj1z/MknJAZLjXP+Q4juM4jtc/J0p2baJk1z+6c+fOnTvXP+HlFLycgtc/n3WDKfJZ1z99GzBU0zHXP3d3d3d3d9c/uj5dn65P1z+H8hrKayjXP1esAVesAdc/t23btm3b1j+21lprrbXWPwdpkAZpkNY/dRhlKp5r1j9eLW7C/K7WP+EMCCV3itY/",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "The attached spreadsheet shows the inventory for a"
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "Each cell in the attached spreadsheet represents a"
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ "Could you help me out with this assignment? Our pr"
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "How many applicants for the job in the PDF are onl"
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "The attached file contains a list of vendors in th"
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "In the 2018 VSCode blog post on replit.com, what w"
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "What writer is quoted by Merriam-Webster for the W"
+ ],
+ [
+ "Given this table defining * on the set S = {a, b, "
+ ],
+ [
+ "What animals that were mentioned in both Ilias Lag"
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "The Metropolitan Museum of Art has a portrait in i"
+ ],
+ [
+ "The attached file shows a list of books in the col"
+ ],
+ [
+ "Who nominated the only Featured Article on English"
+ ],
+ [
+ "In the year 2022, and before December, what does \""
+ ],
+ [
+ "As a comma separated list with no whitespace, usin"
+ ],
+ [
+ "According to Google Finance, when was the first ye"
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "Using bass clef notes, what is the age of someone "
+ ],
+ [
+ "How many images are there in the latest 2022 Lego "
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "The attached file lists accommodations in the reso"
+ ],
+ [
+ "If there is anything that doesn't make sense in th"
+ ],
+ [
+ "The following numbers function similarly to ISBN 1"
+ ],
+ [
+ "You are a telecommunications engineer who wants to"
+ ],
+ [
+ "Find the value of x to the nearest tenth: Lx = (d/"
+ ],
+ [
+ "I was trying to remember how well the Cheater Beat"
+ ],
+ [
+ "What is the volume in milliliters of a system comp"
+ ],
+ [
+ "On July 15, 2008, Phys.org published an article ab"
+ ],
+ [
+ "How many slides in this PowerPoint presentation me"
+ ],
+ [
+ "In the endnote found in the second-to-last paragra"
+ ],
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "In the NIH translation of the original 1913 Michae"
+ ],
+ [
+ "You are Van Helsing, a renowned vampire hunter. A "
+ ],
+ [
+ "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+ ],
+ [
+ "This is a secret message my friend gave me. It say"
+ ],
+ [
+ "The attached file shows the locomotives in the col"
+ ],
+ [
+ "What is the area of the green polygon in the attac"
+ ],
+ [
+ "The attached spreadsheet contains the sales of men"
+ ],
+ [
+ "Examine the video at https://www.youtube.com/watch"
+ ],
+ [
+ "The Latin root of the Yola word \"gimlie\" shares a "
+ ],
+ [
+ "I was referencing each of the tables in the file f"
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "Who composed the song that was performed by a roos"
+ ],
+ [
+ "What percentage of the total penguin population ac"
+ ],
+ [
+ "How many edits were made to the Wikipedia page on "
+ ],
+ [
+ "As of the 2020 census, what was the population dif"
+ ],
+ [
+ "Look at the attached image. The quiz is scored as "
+ ],
+ [
+ "According to wikipedia, how many Asian countries s"
+ ],
+ [
+ "Hi, I'm making a pie but I could use some help wit"
+ ],
+ [
+ "On ScienceDirect, what is the difference to 3 deci"
+ ],
+ [
+ "I’m thinking about selling my home, so I want to l"
+ ],
+ [
+ "I have the Standard plan in the image below, and I"
+ ],
+ [
+ "You are given this Excel file as a map. You start "
+ ],
+ [
+ "The attached PDF lists accommodations in the resor"
+ ],
+ [
+ "How many pages if the 2023 IPCC report (85 pages v"
+ ],
+ [
+ "What is the last word before the second chorus of "
+ ],
+ [
+ "This spreadsheet contains a list of clients for a "
+ ],
+ [
+ "In the Scikit-Learn July 2017 changelog, what othe"
+ ],
+ [
+ "Who did the actor who played Ray in the Polish-lan"
+ ],
+ [
+ "What is the final numeric output from the attached"
+ ],
+ [
+ "The work referenced in footnote 397 of Federico La"
+ ],
+ [
+ "Which of the fruits shown in the 2008 painting \"Em"
+ ],
+ [
+ "The attached image contains a Python script. Run t"
+ ],
+ [
+ "On the BBC Earth YouTube video of the Top 5 Sillie"
+ ],
+ [
+ "What is the surname of the equine veterinarian men"
+ ],
+ [
+ "Pull out the sentence in the following 5x7 block o"
+ ],
+ [
+ "On the DeepFruits fruit detection graph on Connect"
+ ],
+ [
+ "The longest-lived vertebrate is named after an isl"
+ ],
+ [
+ "Bob was invited to participate in a game show, and"
+ ],
+ [
+ "It's May 2023, and I'm about to drive across the U"
+ ],
+ [
+ "The year is 2022. I am at the National Air and Spa"
+ ],
+ [
+ "During the first week of August 2015, one of the N"
+ ],
+ [
+ "On a leap day before the year 2008, a joke was rem"
+ ],
+ [
+ "What is the latest chronological year date written"
+ ],
+ [
+ "All of the individuals who formally held the posit"
+ ],
+ [
+ "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n"
+ ],
+ [
+ "How many more blocks (also denoted as layers) in B"
+ ],
+ [
+ "According to Girls Who Code, how long did it take "
+ ],
+ [
+ "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
+ ],
+ [
+ "According to the USGS, in what year was the Americ"
+ ],
+ [
+ "The attached spreadsheet contains a list of books "
+ ],
+ [
+ "On Cornell Law School website's legal information "
+ ],
+ [
+ "Of the cities within the United States where U.S. "
+ ],
+ [
+ "How many times was a Twitter/X post cited as a ref"
+ ],
+ [
+ "The attached file lists the locomotives owned by a"
+ ],
+ [
+ "The cover of the August 2021 issue of Vogue shows "
+ ],
+ [
+ "A 5-man group made up of one tank, one healer, and"
+ ],
+ [
+ "Eva Draconis has a personal website which can be a"
+ ],
+ [
+ "The brand that makes these harnesses the dogs are "
+ ],
+ [
+ "The attached spreadsheet lists the locomotives own"
+ ],
+ [
+ "What is the absolute difference in tens of thousan"
+ ],
+ [
+ "Hi, I was out sick from my classes on Friday, so I"
+ ],
+ [
+ "I'm curious about how much information is availabl"
+ ],
+ [
+ "Take the gender split from the 2011 Bulgarian cens"
+ ],
+ [
+ "A standard Rubik’s cube has been broken into cubes"
+ ],
+ [
+ "How many at bats did the Yankee with the most walk"
+ ],
+ [
+ "If this whole pint is made up of ice cream, how ma"
+ ],
+ [
+ "Where were the Vietnamese specimens described by K"
+ ],
+ [
+ "The attached Excel file contains the sales of menu"
+ ],
+ [
+ "What country had the least number of athletes at t"
+ ],
+ [
+ "I'd like to learn more about some popular reality "
+ ],
+ [
+ "The YouTube channel Game Grumps began a Let’s Play"
+ ],
+ [
+ "Who are the pitchers with the number before and af"
+ ],
+ [
+ "What is the first name of the only Malko Competiti"
+ ],
+ [
+ "In the YouTube 360 VR video from March 2018 narrat"
+ ],
+ [
+ "What was the complete title of the book in which t"
+ ],
+ [
+ "I thought we could try a fun word puzzle together "
+ ],
+ [
+ "As of August 2023, who is the only winner of the U"
+ ],
+ [
+ "I'm making a grocery list for my mom, but she's a "
+ ],
+ [
+ "In NASA's Astronomy Picture of the Day on 2006 Jan"
+ ],
+ [
+ "In the film Goldfinger, what color was the object "
+ ],
+ [
+ "In the 2015 Metropolitan Museum of Art exhibition "
+ ],
+ [
+ "As of May 2023, how many stops are between South S"
+ ],
+ [
+ "At the two-minute mark in the YouTube video upload"
+ ],
+ [
+ "According to Openreview.net, at the NeurIPS 2022 C"
+ ],
+ [
+ "When was a picture of St. Thomas Aquinas first add"
+ ],
+ [
+ "What was the actual enrollment count of the clinic"
+ ]
+ ],
+ "hovertemplate": "agent_name=code_gpt4o_03_february_magenticbrowser2
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_gpt4o_03_february_magenticbrowser2",
+ "line": {
+ "color": "#ab63fa",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_gpt4o_03_february_magenticbrowser2",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsA",
+ "dtype": "i2"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADgPxzHcRzHcdw/AAAAAAAA4D8XXXTRRRfdPwAAAAAAAOA/sRM7sRM74T+SJEmSJEniPxEREREREeE/AAAAAAAA4D8eHh4eHh7ePwAAAAAAAOA/DeU1lNdQ3j/NzMzMzMzcP57neZ7ned4/F1100UUX3T+96U1vetPbP1VVVVVVVd0/KVyPwvUo3D87sRM7sRPbPy+hvYT2Eto/27Zt27Zt2z9huacRlnvaP5qZmZmZmdk/11prrbXW2j8AAAAAAADcPxdddNFFF90/PDw8PDw83D/btm3btm3bP6uqqqqqqto/0LrBFPms2z8or6G8hvLaPxqkQRqkQdo/mpmZmZmZ2T+J2pWoXYnaP9u2bdu2bds/EnfEHXFH3D8XXXTRRRfdPxzHcRzHcdw/velNb3rT2z9yBTG5gpjcP1VVVVVVVd0/g5dT8HIK3j9xPQrXo3DdP93c3Nzc3Nw/7MRO7MRO3D8iNcF4K/vcPxzHcRzHcdw/7RvWvmHt2z8lSZIkSZLcPx/BfQT3Edw/1AjLPY2w3D91Xx5bETTcP83MzMzMzNw/532KS4Zg3T/nnHPOOefcP13XdV3Xdd0/AAAAAAAA3T/dyI3cyI3cPxdddNFFF90/rDGRHqic3T8tLS0tLS3dP8y1A3PtwNw/fMVXfMVX3D8yfrvUk/HbPxzHcRzHcdw/4MCBAwcO3D/QusEU+azbP08b6LSBTts/KK+hvIby2j/btm3btm3bP1y+5Vu+5ds/FzdhfleL2z8zMzMzMzPbP35YpAw83do/idqVqF2J2j/ksmKghDfaP3qe53me59k/mpmZmZmZ2T9T1pQ1ZU3ZPxUvkEnxAtk/dNFFF1102T+TlaxkJSvZP5qZmZmZmdk/GZVRGZVR2T+RhSxkIQvZP3bZZZdddtk/5QpicgUx2T+amZmZmZnZP1VVVVVVVdk/mYbtZnkS2T801ofG+tDYPzbZZJNNNtk/9ihcj8L12D+qeZC7frbYPxkZGRkZGdk/i/gEUsl52T+xEzuxEzvZP5qZmZmZmdk/fg6pCcZb2T/7hVhRGh/ZPzmO4ziO49g/koq51Rmp2D9wWPuGtW/YP6+M7Xf0ytg/JUmSJEmS2D/pqYWVnlrYPzqL6Syms9g/iHG/Lql82D/LPY2w3NPYP9mJndiJndg/6r48tiJo2D9YoTNYoTPYPwAAAAAAANg/Kky8HznN1z/jkiFYR5vXP2pXonYlatc/vvfee++91z+q8dJNYhDYP/h93/d939c/DAaDwWAw2D8AAAAAAADYPxT2hD1hT9g/+IEf+IEf2D/pA6pjb23YPz744IMPPtg/qYilIpaK2D8m0gOVs1vYP9iCLdiCLdg/AAAAAAAA2D9Q7TmZvkrYP4mfUeJnlNg/TGV71wHd2D/5iq/4iq/YP2JyBTG5gtg/0QqbA4lW2D/ZiZ3YiZ3YPzmO4ziO49g/0nmLIZ232D/EiBEjRozYPzTWh8b60Ng/YYp81g2m2D/oVRZntHvYP1K4HoXrUdg/waJgUbAo2D8AAAAAAADYP9jX19fX19c/1cDeMTWw1z+JV5F4FYnXPyd2Yid2Ytc/",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "The attached spreadsheet shows the inventory for a"
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "Using the Biopython library in Python, parse the P"
+ ],
+ [
+ "Which of the text elements under CATEGORIES in the"
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "In the 2018 VSCode blog post on replit.com, what w"
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "Each cell in the attached spreadsheet represents a"
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "The attached file contains a list of vendors in th"
+ ],
+ [
+ "How many applicants for the job in the PDF are onl"
+ ],
+ [
+ "Given this table defining * on the set S = {a, b, "
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "The following numbers function similarly to ISBN 1"
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "The attached file shows a list of books in the col"
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "What writer is quoted by Merriam-Webster for the W"
+ ],
+ [
+ "As a comma separated list with no whitespace, usin"
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "How many images are there in the latest 2022 Lego "
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "Using bass clef notes, what is the age of someone "
+ ],
+ [
+ "If there is anything that doesn't make sense in th"
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "The attached file lists accommodations in the reso"
+ ],
+ [
+ "Find the value of x to the nearest tenth: Lx = (d/"
+ ],
+ [
+ "How many slides in this PowerPoint presentation me"
+ ],
+ [
+ "You are a telecommunications engineer who wants to"
+ ],
+ [
+ "You are Van Helsing, a renowned vampire hunter. A "
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "This is a secret message my friend gave me. It say"
+ ],
+ [
+ "The Metropolitan Museum of Art has a portrait in i"
+ ],
+ [
+ "What animals that were mentioned in both Ilias Lag"
+ ],
+ [
+ "What is the volume in milliliters of a system comp"
+ ],
+ [
+ "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+ ],
+ [
+ "Examine the video at https://www.youtube.com/watch"
+ ],
+ [
+ "The attached file shows the locomotives in the col"
+ ],
+ [
+ "What is the area of the green polygon in the attac"
+ ],
+ [
+ "In the NIH translation of the original 1913 Michae"
+ ],
+ [
+ "The attached spreadsheet contains the sales of men"
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "On July 15, 2008, Phys.org published an article ab"
+ ],
+ [
+ "I'm making a grocery list for my mom, but she's a "
+ ],
+ [
+ "What is the average number of pre-2020 works on th"
+ ],
+ [
+ "Who composed the song that was performed by a roos"
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "You are given this Excel file as a map. You start "
+ ],
+ [
+ "In the year 2022, and before December, what does \""
+ ],
+ [
+ "Hi, I'm making a pie but I could use some help wit"
+ ],
+ [
+ "I was trying to remember how well the Cheater Beat"
+ ],
+ [
+ "According to wikipedia, how many Asian countries s"
+ ],
+ [
+ "Look at the attached image. The quiz is scored as "
+ ],
+ [
+ "I have the Standard plan in the image below, and I"
+ ],
+ [
+ "In the endnote found in the second-to-last paragra"
+ ],
+ [
+ "Who nominated the only Featured Article on English"
+ ],
+ [
+ "According to the World Bank, which countries had g"
+ ],
+ [
+ "The attached PDF lists accommodations in the resor"
+ ],
+ [
+ "I was referencing each of the tables in the file f"
+ ],
+ [
+ "What percentage of the total penguin population ac"
+ ],
+ [
+ "This spreadsheet contains a list of clients for a "
+ ],
+ [
+ "What is the final numeric output from the attached"
+ ],
+ [
+ "Could you help me out with this assignment? Our pr"
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "Who did the actor who played Ray in the Polish-lan"
+ ],
+ [
+ "The Latin root of the Yola word \"gimlie\" shares a "
+ ],
+ [
+ "As of the 2020 census, what was the population dif"
+ ],
+ [
+ "How many more blocks (also denoted as layers) in B"
+ ],
+ [
+ "Pull out the sentence in the following 5x7 block o"
+ ],
+ [
+ "Bob was invited to participate in a game show, and"
+ ],
+ [
+ "I’m thinking about selling my home, so I want to l"
+ ],
+ [
+ "Which of the fruits shown in the 2008 painting \"Em"
+ ],
+ [
+ "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n"
+ ],
+ [
+ "On the BBC Earth YouTube video of the Top 5 Sillie"
+ ],
+ [
+ "The YouTube channel Game Grumps began a Let’s Play"
+ ],
+ [
+ "The longest-lived vertebrate is named after an isl"
+ ],
+ [
+ "According to Girls Who Code, how long did it take "
+ ],
+ [
+ "How many times was a Twitter/X post cited as a ref"
+ ],
+ [
+ "What is the last word before the second chorus of "
+ ],
+ [
+ "The work referenced in footnote 397 of Federico La"
+ ],
+ [
+ "It's May 2023, and I'm about to drive across the U"
+ ],
+ [
+ "On the DeepFruits fruit detection graph on Connect"
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "The attached image contains a Python script. Run t"
+ ],
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ "The attached spreadsheet lists the locomotives own"
+ ],
+ [
+ "All of the individuals who formally held the posit"
+ ],
+ [
+ "What is the latest chronological year date written"
+ ],
+ [
+ "Hi, I was out sick from my classes on Friday, so I"
+ ],
+ [
+ "The attached file lists the locomotives owned by a"
+ ],
+ [
+ "A 5-man group made up of one tank, one healer, and"
+ ],
+ [
+ "The cover of the August 2021 issue of Vogue shows "
+ ],
+ [
+ "The year is 2022. I am at the National Air and Spa"
+ ],
+ [
+ "The attached spreadsheet contains a list of books "
+ ],
+ [
+ "What is the absolute difference in tens of thousan"
+ ],
+ [
+ "The book with the doi 10.1353/book.24372 concerns "
+ ],
+ [
+ "What was the complete title of the book in which t"
+ ],
+ [
+ "A standard Rubik’s cube has been broken into cubes"
+ ],
+ [
+ "If this whole pint is made up of ice cream, how ma"
+ ],
+ [
+ "On ScienceDirect, what is the difference to 3 deci"
+ ],
+ [
+ "The attached Excel file contains the sales of menu"
+ ],
+ [
+ "Where were the Vietnamese specimens described by K"
+ ],
+ [
+ "During the first week of August 2015, one of the N"
+ ],
+ [
+ "According to Google Finance, when was the first ye"
+ ],
+ [
+ "The brand that makes these harnesses the dogs are "
+ ],
+ [
+ "On Cornell Law School website's legal information "
+ ],
+ [
+ "Eva Draconis has a personal website which can be a"
+ ],
+ [
+ "As of August 2023, who is the only winner of the U"
+ ],
+ [
+ "What country had the least number of athletes at t"
+ ],
+ [
+ "Take the gender split from the 2011 Bulgarian cens"
+ ],
+ [
+ "I'm curious about how much information is availabl"
+ ],
+ [
+ "How many at bats did the Yankee with the most walk"
+ ],
+ [
+ "Who are the pitchers with the number before and af"
+ ],
+ [
+ "What is the first name of the only Malko Competiti"
+ ],
+ [
+ "In the film Goldfinger, what color was the object "
+ ],
+ [
+ "I'd like to learn more about some popular reality "
+ ],
+ [
+ "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
+ ],
+ [
+ "How many pages if the 2023 IPCC report (85 pages v"
+ ],
+ [
+ "According to Openreview.net, at the NeurIPS 2022 C"
+ ],
+ [
+ "What is the surname of the equine veterinarian men"
+ ],
+ [
+ "In the YouTube 360 VR video from March 2018 narrat"
+ ],
+ [
+ "On a leap day before the year 2008, a joke was rem"
+ ],
+ [
+ "In the Scikit-Learn July 2017 changelog, what othe"
+ ],
+ [
+ "In the 2015 Metropolitan Museum of Art exhibition "
+ ],
+ [
+ "What was the actual enrollment count of the clinic"
+ ],
+ [
+ "When was a picture of St. Thomas Aquinas first add"
+ ],
+ [
+ "How many edits were made to the Wikipedia page on "
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "At the two-minute mark in the YouTube video upload"
+ ],
+ [
+ "On June 6, 2023, an article by Carolyn Collins Pet"
+ ],
+ [
+ "I read a paper about multiwavelength observations "
+ ],
+ [
+ "I thought we could try a fun word puzzle together "
+ ],
+ [
+ "As of May 2023, how many stops are between South S"
+ ],
+ [
+ "In NASA's Astronomy Picture of the Day on 2006 Jan"
+ ],
+ [
+ "Of the cities within the United States where U.S. "
+ ],
+ [
+ "How many nonindigenous crocodiles were found in Fl"
+ ],
+ [
+ "According to the USGS, in what year was the Americ"
+ ]
+ ],
+ "hovertemplate": "agent_name=code_gpt4o_03_february_text
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_gpt4o_03_february_text",
+ "line": {
+ "color": "#FFA15A",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_gpt4o_03_february_text",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA",
+ "dtype": "i2"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVdU/AAAAAAAA4D8zMzMzMzPjP1VVVVVVVeU/kiRJkiRJ4j8AAAAAAADkP3Icx3Ecx+E/AAAAAAAA4D8XXXTRRRfdPwAAAAAAAOA/sRM7sRM74T8AAAAAAADgP97d3d3d3d0/AAAAAAAA3D9aWlpaWlraPzmO4ziO49g/KK+hvIby2j+amZmZmZnZP9u2bdu2bds/L7rooosu2j+96U1vetPbP6uqqqqqqto/mpmZmZmZ2T/ZiZ3YiZ3YPy+hvYT2Eto/27Zt27Zt2z/UCMs9jbDcP7y7u7u7u9s/55xzzjnn3D8AAAAAAADcPxdddNFFF90/PDw8PDw83D8d1EEd1EHdP47jOI7jON4/KvJZN5gi3z8N5TWU11DeP9/yLd/yLd8/AAAAAAAA4D84H4PzMTjfPwAAAAAAAOA/0Bf0BX1B3z8AAAAAAADgP5/0SZ/0Sd8/6k1vetOb3j94Nuo7G/XdP1VVVVVVVd0/L6fg5RS83D8pXI/C9SjcP93c3Nzc3Nw/7MRO7MRO3D+WfQ6pCcbbPya0l9BeQts/7RvWvmHt2z8lSZIkSZLcPx/BfQT3Edw/1AjLPY2w3D91Xx5bETTcP7y7u7u7u9s/Q7CONu9T3D/fe++9997bP9u2bdu2bds/AAAAAAAA2z8cuZEbuZHbPx988MEHH9w/NSbSA5Wz2z9LS0tLS0vbP73pTW9609s/27Zt27Zt2z8yfrvUk/HbP+Q4juM4jts/2bJly5Yt2z/QusEU+azbP08b6LSBTts/KK+hvIby2j++Y2pg75jaPxqkQRqkQdo/2TMQlY7s2T9mZmZmZmbaPy+hvYT2Eto/idqVqF2J2j+CEt5o6vzaP6uqqqqqqto/WlpaWlpa2j+zpqwpa8raP2G5pxGWe9o/L7rooosu2j+e8YxnPOPZP/qkT/qkT9o/WqAFWqAF2j+c3vSmN73ZPyeaaKKJJto/Z6O+s1Hf2T+amZmZmZnZPwAAAAAAANo/Wp5EpmG72T+IxvrQWB/aPzFvZ0jM29k/mpmZmZmZ2T96kLt+tljZP7q5ubm5udk/i/gEUsl52T+KndiJndjZP5qZmZmZmdk/fg6pCcZb2T+B3qlrObjZP7SX0F5Ce9k/XJ2RirnV2T+amZmZmZnZP+mVsf2OXtk/btu2bdu22T+hyZcNmnzZPzGdxXQW09k/mpmZmZmZ2T+oEZZ7GmHZP1qbtVmbtdk/lLovj60I2j8arNAZrNDZPyIiIiIiIto/vB85zdfq2T/8FJcMwTraP4nalahdido/U0oppZRS2j/pJjEIrBzaP3qe53me59k/bTabzWaz2T8AAAAAAIDZP3PGnDFnzNk/mpmZmZmZ2T8GfxUnpOTZP7LJJptsstk/wp8Jfyb82T+/GhPpgcrZP5qZmZmZmdk/aWlpaWlp2T+fk+mrhLHZP6BR4meU+Nk/rSYhir/I2T+amZmZmZnZP2bogN0ea9k/FjYHEq2w2T/lgMhwr4LZP3Icx3Ecx9k/famg1ZcK2j/SpEmTJk3aP4jG+tBYH9o/DqbIZ91g2j8h+fMqizPaPwc6baDTBto/z2pntbPa2T/zGsprKK/ZP9ouhNkuhNk/EWflJ8RZ2T8wWf6S5S/ZP2mQBmmQBtk/fo/ICcLd2D9rcRPmd7XYP3bpMX+vjdg/ZmZmZmZm2D+vUkzQXaXYP5Ey8HRrftg/GFuCb/NX2D8yOB+D8zHYPwyYxoBpDNg/",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "The attached spreadsheet shows the inventory for a"
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "Using the Biopython library in Python, parse the P"
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "What is the average number of pre-2020 works on th"
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ "What animals that were mentioned in both Ilias Lag"
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ "Which of the text elements under CATEGORIES in the"
+ ],
+ [
+ "Each cell in the attached spreadsheet represents a"
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "In the 2018 VSCode blog post on replit.com, what w"
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "Could you help me out with this assignment? Our pr"
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "How many applicants for the job in the PDF are onl"
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "The attached file contains a list of vendors in th"
+ ],
+ [
+ "The Metropolitan Museum of Art has a portrait in i"
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "According to Google Finance, when was the first ye"
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "Who nominated the only Featured Article on English"
+ ],
+ [
+ "In the year 2022, and before December, what does \""
+ ],
+ [
+ "How many pages if the 2023 IPCC report (85 pages v"
+ ],
+ [
+ "What writer is quoted by Merriam-Webster for the W"
+ ],
+ [
+ "Given this table defining * on the set S = {a, b, "
+ ],
+ [
+ "The following numbers function similarly to ISBN 1"
+ ],
+ [
+ "The attached file shows a list of books in the col"
+ ],
+ [
+ "How many images are there in the latest 2022 Lego "
+ ],
+ [
+ "As a comma separated list with no whitespace, usin"
+ ],
+ [
+ "I was trying to remember how well the Cheater Beat"
+ ],
+ [
+ "What is the volume in milliliters of a system comp"
+ ],
+ [
+ "On a leap day before the year 2008, a joke was rem"
+ ],
+ [
+ "Find the value of x to the nearest tenth: Lx = (d/"
+ ],
+ [
+ "The Latin root of the Yola word \"gimlie\" shares a "
+ ],
+ [
+ "In the endnote found in the second-to-last paragra"
+ ],
+ [
+ "Using bass clef notes, what is the age of someone "
+ ],
+ [
+ "The attached file lists accommodations in the reso"
+ ],
+ [
+ "In the NIH translation of the original 1913 Michae"
+ ],
+ [
+ "On July 15, 2008, Phys.org published an article ab"
+ ],
+ [
+ "How many edits were made to the Wikipedia page on "
+ ],
+ [
+ "If there is anything that doesn't make sense in th"
+ ],
+ [
+ "You are a telecommunications engineer who wants to"
+ ],
+ [
+ "I was referencing each of the tables in the file f"
+ ],
+ [
+ "How many nonindigenous crocodiles were found in Fl"
+ ],
+ [
+ "The work referenced in footnote 397 of Federico La"
+ ],
+ [
+ "As of the 2020 census, what was the population dif"
+ ],
+ [
+ "What percentage of the total penguin population ac"
+ ],
+ [
+ "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+ ],
+ [
+ "How many slides in this PowerPoint presentation me"
+ ],
+ [
+ "You are Van Helsing, a renowned vampire hunter. A "
+ ],
+ [
+ "The attached file shows the locomotives in the col"
+ ],
+ [
+ "This is a secret message my friend gave me. It say"
+ ],
+ [
+ "What is the area of the green polygon in the attac"
+ ]
+ ],
+ "hovertemplate": "agent_name=code_llama-3
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_llama-3",
+ "line": {
+ "color": "#19d3f3",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_llama-3",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZ",
+ "dtype": "i1"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABEREREREbE/AAAAAAAAsD8eHh4eHh6uPxzHcRzHcaw/KK+hvIbyqj+amZmZmZmpPxiGYRiGYag/RhdddNFFpz9kIQtZyEKmP1VVVVVVVaU/exSuR+F6pD8UO7ETO7GjP2gvob2E9qI/kiRJkiRJoj+WexphuaehPxEREREREaE/hBBCCCGEoD8AAAAAAACgPwgffPDBB58/Hh4eHh4erj8d1EEd1EGtPxzHcRzHcaw/0LrBFPmsqz8or6G8hvKqPxqkQRqkQao/MzMzMzMzsz+7ErUrUbuyP5IkSZIkSbI/d8QdcUfcsT900UUXXXSxPxEREREREbE/ZCELWchCtj9XEJMriMm1P1VVVVVVVbU/OQUvp+DltD97FK5H4Xq0PxQUFBQUFLQ/FDuxEzuxsz/BeCv7HFKzP2gvob2E9rI/nhLkKUGesj+SJEmSJEmyP3AfwX0E97E/fBphuacRtj/QcFL35bG1P1VVVVVVVbU/yRCso837tD/GGGOMMca4PxiGYRiGYbg/AAAAAAAAuD8YeqEXeqG3P0YXXXTRRbc/jYn0QOXstj+XlpaWlpa2P2QhC1nIQrY/Fl/xFV/xtT9ItMLmQKK1P1VVVVVVVbU/qFChQoUKtT8cTJHPusG0P3sUrkfherQ/XkN5DeU1tD/Oyk+Is/KzP5dv+ZZv+bY/Xi1uwvyutj9mZmZmZma2P6QMPN2aH7Y/25WoXYnatT80dX7tIZe1P1VVVVVVVbU/FRUVFRUVtT82ZU1ZU9a0Py+QSfECmbQ/XXTRRRddtD9CEYpQhCK0P5Q+6ZM+6bM/",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "The attached spreadsheet shows the inventory for a"
+ ],
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "Using the Biopython library in Python, parse the P"
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "Each cell in the attached spreadsheet represents a"
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "Could you help me out with this assignment? Our pr"
+ ],
+ [
+ "In the 2018 VSCode blog post on replit.com, what w"
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "The Metropolitan Museum of Art has a portrait in i"
+ ],
+ [
+ "The attached file contains a list of vendors in th"
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "In the year 2022, and before December, what does \""
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "According to Google Finance, when was the first ye"
+ ],
+ [
+ "What writer is quoted by Merriam-Webster for the W"
+ ],
+ [
+ "Who nominated the only Featured Article on English"
+ ],
+ [
+ "Given this table defining * on the set S = {a, b, "
+ ],
+ [
+ "The following numbers function similarly to ISBN 1"
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "The attached file shows a list of books in the col"
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "How many pages if the 2023 IPCC report (85 pages v"
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "As a comma separated list with no whitespace, usin"
+ ],
+ [
+ "What is the volume in milliliters of a system comp"
+ ],
+ [
+ "Find the value of x to the nearest tenth: Lx = (d/"
+ ],
+ [
+ "On July 15, 2008, Phys.org published an article ab"
+ ],
+ [
+ "Using bass clef notes, what is the age of someone "
+ ],
+ [
+ "The Latin root of the Yola word \"gimlie\" shares a "
+ ],
+ [
+ "In the NIH translation of the original 1913 Michae"
+ ],
+ [
+ "In the endnote found in the second-to-last paragra"
+ ],
+ [
+ "The attached file lists accommodations in the reso"
+ ],
+ [
+ "If there is anything that doesn't make sense in th"
+ ],
+ [
+ "You are a telecommunications engineer who wants to"
+ ],
+ [
+ "How many edits were made to the Wikipedia page on "
+ ],
+ [
+ "On a leap day before the year 2008, a joke was rem"
+ ],
+ [
+ "I was trying to remember how well the Cheater Beat"
+ ],
+ [
+ "How many slides in this PowerPoint presentation me"
+ ],
+ [
+ "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+ ],
+ [
+ "As of the 2020 census, what was the population dif"
+ ],
+ [
+ "You are Van Helsing, a renowned vampire hunter. A "
+ ],
+ [
+ "How many images are there in the latest 2022 Lego "
+ ],
+ [
+ "Examine the video at https://www.youtube.com/watch"
+ ],
+ [
+ "This is a secret message my friend gave me. It say"
+ ],
+ [
+ "According to wikipedia, how many Asian countries s"
+ ],
+ [
+ "What is the area of the green polygon in the attac"
+ ],
+ [
+ "Who composed the song that was performed by a roos"
+ ],
+ [
+ "The attached spreadsheet contains the sales of men"
+ ],
+ [
+ "What is the average number of pre-2020 works on th"
+ ],
+ [
+ "You are given this Excel file as a map. You start "
+ ],
+ [
+ "How many nonindigenous crocodiles were found in Fl"
+ ],
+ [
+ "I’m thinking about selling my home, so I want to l"
+ ],
+ [
+ "I'm making a grocery list for my mom, but she's a "
+ ],
+ [
+ "What is the surname of the equine veterinarian men"
+ ],
+ [
+ "How many times was a Twitter/X post cited as a ref"
+ ],
+ [
+ "The attached file shows the locomotives in the col"
+ ],
+ [
+ "I thought we could try a fun word puzzle together "
+ ],
+ [
+ "What is the last word before the second chorus of "
+ ],
+ [
+ "Look at the attached image. The quiz is scored as "
+ ],
+ [
+ "I was referencing each of the tables in the file f"
+ ],
+ [
+ "The attached image contains a Python script. Run t"
+ ],
+ [
+ "On ScienceDirect, what is the difference to 3 deci"
+ ],
+ [
+ "Hi, I'm making a pie but I could use some help wit"
+ ],
+ [
+ "According to the World Bank, which countries had g"
+ ],
+ [
+ "I have the Standard plan in the image below, and I"
+ ],
+ [
+ "The attached PDF lists accommodations in the resor"
+ ],
+ [
+ "The year is 2022. I am at the National Air and Spa"
+ ],
+ [
+ "The work referenced in footnote 397 of Federico La"
+ ],
+ [
+ "What percentage of the total penguin population ac"
+ ],
+ [
+ "This spreadsheet contains a list of clients for a "
+ ],
+ [
+ "It's May 2023, and I'm about to drive across the U"
+ ],
+ [
+ "What is the latest chronological year date written"
+ ],
+ [
+ "In the Scikit-Learn July 2017 changelog, what othe"
+ ],
+ [
+ "The longest-lived vertebrate is named after an isl"
+ ],
+ [
+ "On the BBC Earth YouTube video of the Top 5 Sillie"
+ ],
+ [
+ "What is the final numeric output from the attached"
+ ],
+ [
+ "How many more blocks (also denoted as layers) in B"
+ ],
+ [
+ "During the first week of August 2015, one of the N"
+ ],
+ [
+ "Pull out the sentence in the following 5x7 block o"
+ ],
+ [
+ "Which of the fruits shown in the 2008 painting \"Em"
+ ],
+ [
+ "All of the individuals who formally held the posit"
+ ],
+ [
+ "The YouTube channel Game Grumps began a Let’s Play"
+ ],
+ [
+ "Who did the actor who played Ray in the Polish-lan"
+ ],
+ [
+ "On the DeepFruits fruit detection graph on Connect"
+ ],
+ [
+ "Of the cities within the United States where U.S. "
+ ],
+ [
+ "The book with the doi 10.1353/book.24372 concerns "
+ ],
+ [
+ "Bob was invited to participate in a game show, and"
+ ],
+ [
+ "On Cornell Law School website's legal information "
+ ],
+ [
+ "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n"
+ ],
+ [
+ "As of August 2023, who is the only winner of the U"
+ ],
+ [
+ "Eva Draconis has a personal website which can be a"
+ ],
+ [
+ "According to Girls Who Code, how long did it take "
+ ],
+ [
+ "The attached spreadsheet lists the locomotives own"
+ ],
+ [
+ "How many at bats did the Yankee with the most walk"
+ ],
+ [
+ "What was the complete title of the book in which t"
+ ],
+ [
+ "The cover of the August 2021 issue of Vogue shows "
+ ],
+ [
+ "The attached file lists the locomotives owned by a"
+ ],
+ [
+ "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
+ ],
+ [
+ "Hi, I was out sick from my classes on Friday, so I"
+ ],
+ [
+ "A 5-man group made up of one tank, one healer, and"
+ ],
+ [
+ "According to Openreview.net, at the NeurIPS 2022 C"
+ ],
+ [
+ "Take the gender split from the 2011 Bulgarian cens"
+ ],
+ [
+ "When was a picture of St. Thomas Aquinas first add"
+ ],
+ [
+ "If this whole pint is made up of ice cream, how ma"
+ ],
+ [
+ "What is the absolute difference in tens of thousan"
+ ],
+ [
+ "I'd like to learn more about some popular reality "
+ ],
+ [
+ "The attached spreadsheet contains a list of books "
+ ],
+ [
+ "Where were the Vietnamese specimens described by K"
+ ],
+ [
+ "A standard Rubik’s cube has been broken into cubes"
+ ],
+ [
+ "Who are the pitchers with the number before and af"
+ ],
+ [
+ "What is the first name of the only Malko Competiti"
+ ],
+ [
+ "What was the actual enrollment count of the clinic"
+ ],
+ [
+ "On June 6, 2023, an article by Carolyn Collins Pet"
+ ],
+ [
+ "I'm curious about how much information is availabl"
+ ],
+ [
+ "In the film Goldfinger, what color was the object "
+ ],
+ [
+ "In the YouTube 360 VR video from March 2018 narrat"
+ ],
+ [
+ "What country had the least number of athletes at t"
+ ],
+ [
+ "In NASA's Astronomy Picture of the Day on 2006 Jan"
+ ],
+ [
+ "As of May 2023, how many stops are between South S"
+ ],
+ [
+ "I read a paper about multiwavelength observations "
+ ],
+ [
+ "At the two-minute mark in the YouTube video upload"
+ ],
+ [
+ "According to the USGS, in what year was the Americ"
+ ],
+ [
+ "In the 2015 Metropolitan Museum of Art exhibition "
+ ],
+ [
+ "The attached Excel file contains the sales of menu"
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "What animals that were mentioned in both Ilias Lag"
+ ],
+ [
+ "Which of the text elements under CATEGORIES in the"
+ ],
+ [
+ "How many applicants for the job in the PDF are onl"
+ ],
+ [
+ "The brand that makes these harnesses the dogs are "
+ ]
+ ],
+ "hovertemplate": "agent_name=code_o1_01_february_text
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_o1_01_february_text",
+ "line": {
+ "color": "#FF6692",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_o1_01_february_text",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA",
+ "dtype": "i2"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVeU/AAAAAAAA6D+amZmZmZnpP1VVVVVVVeU/t23btm3b5j8AAAAAAADoP1VVVVVVVeU/ZmZmZmZm5j9ddNFFF13kP6uqqqqqquI/FDuxEzux4z+SJEmSJEniPxEREREREeE/AAAAAAAA4j/x8PDw8PDgPwAAAAAAAOA/DeU1lNdQ3j/NzMzMzMzcP57neZ7ned4/F1100UUX3T+96U1vetPbP6uqqqqqqto/KVyPwvUo3D+e2Imd2IndPxzHcRzHcdw/btu2bdu23T9HWO5phOXePwAAAAAAAOA/hBBCCCGE4D8AAAAAAADgP3zwwQcffOA/AAAAAAAA4D9QB3VQB3XgPzmO4ziO4+A/whT5rBtM4T95DeU1lNfgP7ETO7ETO+E/zczMzMzM4D8sUbsStSvhPzEMwzAMw+A/R9wRd8Qd4T900UUXXXThPxEREREREeE/C1nIQhay4D/E5ApicgXhP6uqqqqqquA/FbycgpdT4D+kcD0K16PgP/Hw8PDw8OA/sRM7sRM74T9vZZ9DaoLhP3Icx3Ecx+E/CfKUIE8J4j9u27Zt27bhP3AfwX0E9+E/lnsaYbmn4T8NJ3VfHlvhPxEREREREeE/DcE62rxP4T+MMcYYY4zhP1EURVEUReE/AAAAAAAA4T/RC73QC73gP/jggw8++OA/TKQHKme34D/x8PDw8PDgPxM/o8TPKOE/8RVf8RVf4T8OJFphcyDhPzmO4ziO4+A/iREjRowY4T/CFPmsG0zhPxEREREREeE/NpTXUF5D4T/lJ8RZ+QnhP7ETO7ETO+E/BqLSkT0D4T8zMzMzMzPhPyNl4OnW/OA/LFG7ErUr4T9T59ceclnhP0mSJEmSJOE/8fDw8PDw4D8w6Av6gr7gP93TCMs9jeA/XXTRRRdd4D8DF7jABS7gPwAAAAAAAOA/0AIt0AIt4D+GLGQhC1ngP4QQQgghhOA/QUyuICZX4D+yAmGkHSvgP1VVVVVVVeA/8MXVDzoq4D8VvJyCl1PgP3+lQK1fKeA/UrgehetR4D8cUWDSqXngP6GgoKCgoOA/9lttDE134D/sxE7sxE7gP3ACJ3ACJ+A/463sc0hN4D8hVpTGRybgPwAAAAAAAOA/WQKb9pMl4D8AAAAAAADgP04CcaHmJOA/kiRJkiRJ4D/3QwJvPyTgP34E9xHcR+A/AkVbDZ4j4D/uaYTlnkbgPzACIzACI+A/AAAAAAAA4D/gKLvfKLvfP3d3d3d3d98/jmVQKky83z8uGYJ1tHnfPzgfg/MxON8/+N5777333j8IrBxaZDvfP7/v+77v+94/0Ofz+Xw+3z8AAAAAAIDfP/AH/AF/wN8/IPiBH/iB3z97a8M0d8HfP4QPPvjgg98/c/TN0TdH3z9FeqBydgvfP5/0SZ/0Sd8/Dw8PDw8P3z/ZLKj2nEzfP/EzSvyMEt8/Rs6w4FLZ3j9f8RVf8RXfP31no76zUd8/lPHbpZ6M3z89QvWZtsbfPwAAAAAAAOA/Dnj84YDH3z8AAAAAAADgP/LX7KhFyN8/AAAAAAAA4D+ZS4QnBcnfPwAAAAAAAOA//iZ/k7/J3z8AAAAAAADgPyB1yh91yt8/cVZ+QpyV3z9hHxf2cWHfP9/yLd/yLd8/PiInCHdj3z9hfleLmzDfPzqkJhhvZd8/mpmZmZmZ3z+P5g82Hs3fP1ikDDzdmt8/sxpFHDpp3z+cj8H5GJzfP2vfsPYNa98/",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "Using the Biopython library in Python, parse the P"
+ ],
+ [
+ "The attached spreadsheet shows the inventory for a"
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "Each cell in the attached spreadsheet represents a"
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ "Which of the text elements under CATEGORIES in the"
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "How many applicants for the job in the PDF are onl"
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "The attached file contains a list of vendors in th"
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "Could you help me out with this assignment? Our pr"
+ ],
+ [
+ "In the 2018 VSCode blog post on replit.com, what w"
+ ],
+ [
+ "Given this table defining * on the set S = {a, b, "
+ ],
+ [
+ "Who nominated the only Featured Article on English"
+ ],
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "What writer is quoted by Merriam-Webster for the W"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "According to Google Finance, when was the first ye"
+ ],
+ [
+ "The following numbers function similarly to ISBN 1"
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "In the year 2022, and before December, what does \""
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "The Metropolitan Museum of Art has a portrait in i"
+ ],
+ [
+ "How many slides in this PowerPoint presentation me"
+ ],
+ [
+ "As a comma separated list with no whitespace, usin"
+ ],
+ [
+ "I was trying to remember how well the Cheater Beat"
+ ],
+ [
+ "How many pages if the 2023 IPCC report (85 pages v"
+ ],
+ [
+ "Find the value of x to the nearest tenth: Lx = (d/"
+ ],
+ [
+ "The attached file lists accommodations in the reso"
+ ],
+ [
+ "The attached file shows a list of books in the col"
+ ],
+ [
+ "You are a telecommunications engineer who wants to"
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "In the NIH translation of the original 1913 Michae"
+ ],
+ [
+ "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+ ],
+ [
+ "This is a secret message my friend gave me. It say"
+ ],
+ [
+ "You are Van Helsing, a renowned vampire hunter. A "
+ ],
+ [
+ "If there is anything that doesn't make sense in th"
+ ],
+ [
+ "According to wikipedia, how many Asian countries s"
+ ],
+ [
+ "The attached file shows the locomotives in the col"
+ ],
+ [
+ "Who composed the song that was performed by a roos"
+ ],
+ [
+ "On a leap day before the year 2008, a joke was rem"
+ ],
+ [
+ "The attached spreadsheet contains the sales of men"
+ ],
+ [
+ "In the endnote found in the second-to-last paragra"
+ ],
+ [
+ "On July 15, 2008, Phys.org published an article ab"
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "What is the area of the green polygon in the attac"
+ ],
+ [
+ "I'm making a grocery list for my mom, but she's a "
+ ],
+ [
+ "Using bass clef notes, what is the age of someone "
+ ],
+ [
+ "The Latin root of the Yola word \"gimlie\" shares a "
+ ],
+ [
+ "Examine the video at https://www.youtube.com/watch"
+ ],
+ [
+ "Look at the attached image. The quiz is scored as "
+ ],
+ [
+ "I have the Standard plan in the image below, and I"
+ ],
+ [
+ "Hi, I'm making a pie but I could use some help wit"
+ ],
+ [
+ "The attached PDF lists accommodations in the resor"
+ ],
+ [
+ "What is the volume in milliliters of a system comp"
+ ],
+ [
+ "I’m thinking about selling my home, so I want to l"
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "The attached image contains a Python script. Run t"
+ ],
+ [
+ "You are given this Excel file as a map. You start "
+ ],
+ [
+ "This spreadsheet contains a list of clients for a "
+ ],
+ [
+ "Who did the actor who played Ray in the Polish-lan"
+ ],
+ [
+ "What is the final numeric output from the attached"
+ ],
+ [
+ "How many more blocks (also denoted as layers) in B"
+ ],
+ [
+ "The year is 2022. I am at the National Air and Spa"
+ ],
+ [
+ "On the BBC Earth YouTube video of the Top 5 Sillie"
+ ],
+ [
+ "As of the 2020 census, what was the population dif"
+ ],
+ [
+ "All of the individuals who formally held the posit"
+ ],
+ [
+ "It's May 2023, and I'm about to drive across the U"
+ ],
+ [
+ "On ScienceDirect, what is the difference to 3 deci"
+ ],
+ [
+ "What is the last word before the second chorus of "
+ ],
+ [
+ "Pull out the sentence in the following 5x7 block o"
+ ],
+ [
+ "On Cornell Law School website's legal information "
+ ],
+ [
+ "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n"
+ ],
+ [
+ "Of the cities within the United States where U.S. "
+ ],
+ [
+ "What percentage of the total penguin population ac"
+ ],
+ [
+ "How many edits were made to the Wikipedia page on "
+ ],
+ [
+ "The book with the doi 10.1353/book.24372 concerns "
+ ],
+ [
+ "The YouTube channel Game Grumps began a Let’s Play"
+ ],
+ [
+ "As of August 2023, who is the only winner of the U"
+ ],
+ [
+ "On the DeepFruits fruit detection graph on Connect"
+ ],
+ [
+ "The longest-lived vertebrate is named after an isl"
+ ],
+ [
+ "The cover of the August 2021 issue of Vogue shows "
+ ],
+ [
+ "The attached file lists the locomotives owned by a"
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "How many nonindigenous crocodiles were found in Fl"
+ ],
+ [
+ "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
+ ],
+ [
+ "During the first week of August 2015, one of the N"
+ ],
+ [
+ "The attached spreadsheet lists the locomotives own"
+ ],
+ [
+ "According to Girls Who Code, how long did it take "
+ ],
+ [
+ "How many at bats did the Yankee with the most walk"
+ ],
+ [
+ "How many times was a Twitter/X post cited as a ref"
+ ],
+ [
+ "What was the complete title of the book in which t"
+ ],
+ [
+ "In the Scikit-Learn July 2017 changelog, what othe"
+ ],
+ [
+ "According to the USGS, in what year was the Americ"
+ ],
+ [
+ "Hi, I was out sick from my classes on Friday, so I"
+ ],
+ [
+ "What country had the least number of athletes at t"
+ ],
+ [
+ "The work referenced in footnote 397 of Federico La"
+ ],
+ [
+ "Eva Draconis has a personal website which can be a"
+ ],
+ [
+ "A 5-man group made up of one tank, one healer, and"
+ ],
+ [
+ "Which of the fruits shown in the 2008 painting \"Em"
+ ],
+ [
+ "I was referencing each of the tables in the file f"
+ ],
+ [
+ "Take the gender split from the 2011 Bulgarian cens"
+ ],
+ [
+ "What is the absolute difference in tens of thousan"
+ ],
+ [
+ "If this whole pint is made up of ice cream, how ma"
+ ],
+ [
+ "The brand that makes these harnesses the dogs are "
+ ],
+ [
+ "Where were the Vietnamese specimens described by K"
+ ],
+ [
+ "A standard Rubik’s cube has been broken into cubes"
+ ],
+ [
+ "What was the actual enrollment count of the clinic"
+ ],
+ [
+ "What animals that were mentioned in both Ilias Lag"
+ ],
+ [
+ "The attached Excel file contains the sales of menu"
+ ],
+ [
+ "Who are the pitchers with the number before and af"
+ ],
+ [
+ "In the film Goldfinger, what color was the object "
+ ],
+ [
+ "Bob was invited to participate in a game show, and"
+ ],
+ [
+ "When was a picture of St. Thomas Aquinas first add"
+ ],
+ [
+ "What is the first name of the only Malko Competiti"
+ ],
+ [
+ "I thought we could try a fun word puzzle together "
+ ],
+ [
+ "In NASA's Astronomy Picture of the Day on 2006 Jan"
+ ],
+ [
+ "What is the average number of pre-2020 works on th"
+ ],
+ [
+ "As of May 2023, how many stops are between South S"
+ ],
+ [
+ "In the YouTube 360 VR video from March 2018 narrat"
+ ],
+ [
+ "What is the latest chronological year date written"
+ ],
+ [
+ "In the 2015 Metropolitan Museum of Art exhibition "
+ ],
+ [
+ "What is the surname of the equine veterinarian men"
+ ],
+ [
+ "I'd like to learn more about some popular reality "
+ ],
+ [
+ "On June 6, 2023, an article by Carolyn Collins Pet"
+ ],
+ [
+ "I read a paper about multiwavelength observations "
+ ],
+ [
+ "How many images are there in the latest 2022 Lego "
+ ],
+ [
+ "According to Openreview.net, at the NeurIPS 2022 C"
+ ],
+ [
+ "At the two-minute mark in the YouTube video upload"
+ ],
+ [
+ "I'm curious about how much information is availabl"
+ ],
+ [
+ "The attached spreadsheet contains a list of books "
+ ]
+ ],
+ "hovertemplate": "agent_name=code_o1_03_february_fix-print-outputs
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_o1_03_february_fix-print-outputs",
+ "line": {
+ "color": "#B6E880",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_o1_03_february_fix-print-outputs",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAA==",
+ "dtype": "i2"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA6D8zMzMzMzPjPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADkP1VVVVVVVeU/ZmZmZmZm5j9GF1100UXnP1VVVVVVVeU/dmIndmIn5j+3bdu2bdvmP1VVVVVVVeU/AAAAAAAA5j+1tLS0tLTkP1VVVVVVVeU/UV5DeQ3l5T9mZmZmZmbmP1VVVVVVVeU/XXTRRRdd5D9Ob3rTm97kPwAAAAAAAOQ/MzMzMzMz4z9iJ3ZiJ3biP3Icx3Ecx+E/SZIkSZIk4T+WexphuafhPyIiIiIiIuI/jDHGGGOM4T8AAAAAAADhP3TRRRdddOE/4uHh4eHh4T+SJEmSJEniP3Icx3Ecx+E/mCKfdYMp4j/zGsprKK/hP7ETO7ETO+E/zczMzMzM4D8sUbsStSvhP2IYhmEYhuE/d8QdcUfc4T8vuuiiiy7iP9InfdInfeI/IQtZyEIW4j9HfWejvrPhPwAAAAAAAOI/aKwPjfWh4T9I4XoUrkfhP5KRkZGRkeE/sRM7sRM74T9vZZ9DaoLhP3Icx3Ecx+E/dNFFF1104T9JkiRJkiThP3UW01lMZ+E/uacRlnsa4T/VfXlsRdDgPxEREREREeE/DcE62rxP4T8IIYQQQgjhPzEMwzAMw+A/AAAAAAAA4T/RC73QC73gP3zwwQcffOA/TKQHKme34D94eHh4eHjgPwtZyEIWsuA/oQ7qoA7q4D8OJFphcyDhP1VVVVVVVeE/jBgxYsSI4T/CFPmsG0zhP36x5BdLfuE/8xrKayiv4T8De8fUwN7hP9IgDdIgDeI/pSN7BqLS4T+amZmZmZnhP8rA0635YeE/LFG7ErUr4T9T59ceclnhP0mSJEmSJOE/UVFRUVFR4T9f0Bf0BX3hP5Z7GmG5p+E/dNFFF1104T8UoQhFKELhPxEREREREeE/sRM7sRM74T8WspCFLGThPzTRRBNNNOE/BTG5gphc4T9BGGnHCoThP6uqqqqqquE/UoEvrn7Q4T99aKwPjfXhP29nSMzbGeI/PQrXo3A94j+LleEbUWDiPzIyMjIyMuI/Kjkvi/gE4j92Yid2YifiP7If+7Ef++E/UhOMt7LP4T+zX4gVpfHhP3Icx3Ecx+E/1hmpmFud4T+/Ye0b1r7hP18Z2+/oleE/btu2bdu24T+c6xjFuY7hP/Maymsor+E/HYGirQbP4T+E5Z5GWO7hP9IgDdIgDeI/RdBwUvfl4T9Sdr9Rdr/hP97d3d3d3eE/WQalwsT74T/ep7hkCNbhP/QxOB+D8+E/zjnnnHPO4T9Ei2zn+6nhP2IYhmEYhuE/aTQajUaj4T8AAAAAAMDhP2fMGXPGnOE/oRd6oRd64T9gxQkpeZbhP3TRRRdddOE/LBWxVMRS4T8qZ7fwqzHhP9wUo4a/TeE/aWlpaWlp4T/Ircs74EjhPxaykIUsZOE/P1pNQhR/4T+amZmZmZnhP8afSDileeE/RStsDiRa4T+xEzuxEzvhP8dxHMdxHOE/smsTJbs24T+JESNGjBjhPz801ofG+uA/TJHPusEU4T83YKimYy7hP0jhehSuR+E/ianEVGIq4T/YUF5DeQ3hP9F7JtF7JuE/5SfEWfkJ4T/uUN0O1e3gPyEN0iAN0uA/DVjSy5+24D+fgah0ZM/gP2dAKLlTtOA/mpmZmZmZ4D+aP9h4NH/gP6hb88MiZeA/axRx6KR94D+WqF2J2pXgPw==",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "Using the Biopython library in Python, parse the P"
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "The attached spreadsheet shows the inventory for a"
+ ],
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "Each cell in the attached spreadsheet represents a"
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "The attached file contains a list of vendors in th"
+ ],
+ [
+ "The Metropolitan Museum of Art has a portrait in i"
+ ],
+ [
+ "Could you help me out with this assignment? Our pr"
+ ],
+ [
+ "In the year 2022, and before December, what does \""
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "Given this table defining * on the set S = {a, b, "
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "What writer is quoted by Merriam-Webster for the W"
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "On July 15, 2008, Phys.org published an article ab"
+ ],
+ [
+ "In the 2018 VSCode blog post on replit.com, what w"
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "The following numbers function similarly to ISBN 1"
+ ],
+ [
+ "According to Google Finance, when was the first ye"
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "Who nominated the only Featured Article on English"
+ ],
+ [
+ "As a comma separated list with no whitespace, usin"
+ ],
+ [
+ "Which of the text elements under CATEGORIES in the"
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "If there is anything that doesn't make sense in th"
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "Using bass clef notes, what is the age of someone "
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "The attached file shows a list of books in the col"
+ ],
+ [
+ "Find the value of x to the nearest tenth: Lx = (d/"
+ ],
+ [
+ "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+ ],
+ [
+ "How many slides in this PowerPoint presentation me"
+ ],
+ [
+ "You are a telecommunications engineer who wants to"
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "How many pages if the 2023 IPCC report (85 pages v"
+ ],
+ [
+ "In the NIH translation of the original 1913 Michae"
+ ],
+ [
+ "This is a secret message my friend gave me. It say"
+ ],
+ [
+ "According to wikipedia, how many Asian countries s"
+ ],
+ [
+ "The attached spreadsheet contains the sales of men"
+ ],
+ [
+ "The attached file shows the locomotives in the col"
+ ],
+ [
+ "The attached file lists accommodations in the reso"
+ ],
+ [
+ "I was trying to remember how well the Cheater Beat"
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "You are Van Helsing, a renowned vampire hunter. A "
+ ],
+ [
+ "Examine the video at https://www.youtube.com/watch"
+ ],
+ [
+ "What is the area of the green polygon in the attac"
+ ],
+ [
+ "What animals that were mentioned in both Ilias Lag"
+ ],
+ [
+ "On a leap day before the year 2008, a joke was rem"
+ ],
+ [
+ "What is the last word before the second chorus of "
+ ],
+ [
+ "Who composed the song that was performed by a roos"
+ ],
+ [
+ "The Latin root of the Yola word \"gimlie\" shares a "
+ ],
+ [
+ "I'm making a grocery list for my mom, but she's a "
+ ],
+ [
+ "Hi, I'm making a pie but I could use some help wit"
+ ],
+ [
+ "I have the Standard plan in the image below, and I"
+ ],
+ [
+ "I was referencing each of the tables in the file f"
+ ],
+ [
+ "How many images are there in the latest 2022 Lego "
+ ],
+ [
+ "The year is 2022. I am at the National Air and Spa"
+ ],
+ [
+ "How many applicants for the job in the PDF are onl"
+ ],
+ [
+ "I’m thinking about selling my home, so I want to l"
+ ],
+ [
+ "The attached image contains a Python script. Run t"
+ ],
+ [
+ "What percentage of the total penguin population ac"
+ ],
+ [
+ "The attached PDF lists accommodations in the resor"
+ ],
+ [
+ "Look at the attached image. The quiz is scored as "
+ ],
+ [
+ "What is the final numeric output from the attached"
+ ],
+ [
+ "This spreadsheet contains a list of clients for a "
+ ],
+ [
+ "How many more blocks (also denoted as layers) in B"
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "How many times was a Twitter/X post cited as a ref"
+ ],
+ [
+ "It's May 2023, and I'm about to drive across the U"
+ ],
+ [
+ "The longest-lived vertebrate is named after an isl"
+ ],
+ [
+ "Pull out the sentence in the following 5x7 block o"
+ ],
+ [
+ "What is the surname of the equine veterinarian men"
+ ],
+ [
+ "All of the individuals who formally held the posit"
+ ],
+ [
+ "On the DeepFruits fruit detection graph on Connect"
+ ],
+ [
+ "On ScienceDirect, what is the difference to 3 deci"
+ ],
+ [
+ "The book with the doi 10.1353/book.24372 concerns "
+ ],
+ [
+ "What is the volume in milliliters of a system comp"
+ ],
+ [
+ "On the BBC Earth YouTube video of the Top 5 Sillie"
+ ],
+ [
+ "On Cornell Law School website's legal information "
+ ],
+ [
+ "Who did the actor who played Ray in the Polish-lan"
+ ],
+ [
+ "During the first week of August 2015, one of the N"
+ ],
+ [
+ "How many nonindigenous crocodiles were found in Fl"
+ ],
+ [
+ "The cover of the August 2021 issue of Vogue shows "
+ ],
+ [
+ "The attached spreadsheet lists the locomotives own"
+ ],
+ [
+ "Bob was invited to participate in a game show, and"
+ ],
+ [
+ "In the Scikit-Learn July 2017 changelog, what othe"
+ ],
+ [
+ "Hi, I was out sick from my classes on Friday, so I"
+ ],
+ [
+ "According to Girls Who Code, how long did it take "
+ ],
+ [
+ "What is the average number of pre-2020 works on th"
+ ],
+ [
+ "I'd like to learn more about some popular reality "
+ ],
+ [
+ "What was the complete title of the book in which t"
+ ],
+ [
+ "The attached file lists the locomotives owned by a"
+ ],
+ [
+ "What is the absolute difference in tens of thousan"
+ ],
+ [
+ "According to the USGS, in what year was the Americ"
+ ],
+ [
+ "If this whole pint is made up of ice cream, how ma"
+ ],
+ [
+ "A 5-man group made up of one tank, one healer, and"
+ ],
+ [
+ "What is the latest chronological year date written"
+ ],
+ [
+ "The YouTube channel Game Grumps began a Let’s Play"
+ ],
+ [
+ "How many edits were made to the Wikipedia page on "
+ ],
+ [
+ "Take the gender split from the 2011 Bulgarian cens"
+ ],
+ [
+ "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
+ ],
+ [
+ "Of the cities within the United States where U.S. "
+ ],
+ [
+ "The work referenced in footnote 397 of Federico La"
+ ],
+ [
+ "Eva Draconis has a personal website which can be a"
+ ],
+ [
+ "Where were the Vietnamese specimens described by K"
+ ],
+ [
+ "A standard Rubik’s cube has been broken into cubes"
+ ],
+ [
+ "Which of the fruits shown in the 2008 painting \"Em"
+ ],
+ [
+ "The attached Excel file contains the sales of menu"
+ ],
+ [
+ "According to Openreview.net, at the NeurIPS 2022 C"
+ ],
+ [
+ "As of August 2023, who is the only winner of the U"
+ ],
+ [
+ "What is the first name of the only Malko Competiti"
+ ],
+ [
+ "How many at bats did the Yankee with the most walk"
+ ],
+ [
+ "On June 6, 2023, an article by Carolyn Collins Pet"
+ ],
+ [
+ "The brand that makes these harnesses the dogs are "
+ ],
+ [
+ "When was a picture of St. Thomas Aquinas first add"
+ ],
+ [
+ "In NASA's Astronomy Picture of the Day on 2006 Jan"
+ ],
+ [
+ "What country had the least number of athletes at t"
+ ],
+ [
+ "Who are the pitchers with the number before and af"
+ ],
+ [
+ "In the YouTube 360 VR video from March 2018 narrat"
+ ],
+ [
+ "You are given this Excel file as a map. You start "
+ ],
+ [
+ "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n"
+ ],
+ [
+ "What was the actual enrollment count of the clinic"
+ ],
+ [
+ "As of May 2023, how many stops are between South S"
+ ],
+ [
+ "I read a paper about multiwavelength observations "
+ ]
+ ],
+ "hovertemplate": "agent_name=code_o1_03_february_fix-print-outputs2
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_o1_03_february_fix-print-outputs2",
+ "line": {
+ "color": "#FF97FF",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_o1_03_february_fix-print-outputs2",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsA",
+ "dtype": "i2"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAA8D8AAAAAAADgP1VVVVVVVeU/AAAAAAAA6D8zMzMzMzPjP1VVVVVVVeU/t23btm3b5j8AAAAAAADkP1VVVVVVVeU/ZmZmZmZm5j9GF1100UXnP1VVVVVVVeU/FDuxEzux4z8lSZIkSZLkPzMzMzMzM+M/AAAAAAAA4j/T0tLS0tLiP3Icx3Ecx+E/eQ3lNZTX4D8AAAAAAADgPzEMwzAMw+A/dNFFF1104T8LWchCFrLgP1VVVVVVVeE/pHA9Ctej4D+xEzuxEzvhP3Icx3Ecx+E/SZIkSZIk4T+WexphuafhPyIiIiIiIuI/lVJKKaWU4j8AAAAAAADiP3TRRRdddOE/4uHh4eHh4T/xFV/xFV/hP3Icx3Ecx+E/mCKfdYMp4j/zGsprKK/hP9IgDdIgDeI/ZmZmZmZm4j/0MTgfg/PhP5IkSZIkSeI/p6wpa8qa4j/poosuuujiPzMzMzMzM+M/LWQhC1nI4j9MriAmVxDjP6uqqqqqquI/kiRJkiRJ4j+PwvUoXI/iPzIyMjIyMuI/ip3YiZ3Y4T81wXgr+xziP3Icx3Ecx+E/CfKUIE8J4j9u27Zt27bhP3AfwX0E9+E/lnsaYbmn4T8NJ3VfHlvhPxEREREREeE/DcE62rxP4T+MMcYYY4zhP1EURVEUReE/AAAAAACA4T+SG7mRG7nhP/DBBx988OE/5ewWfjUm4j/i4eHh4eHhPxolfkaJn+E/Qh3UQR3U4T/nQKIVNgfiP47jOI7jOOI/kB8/fvz44T+tG0yRz7rhP36x5BdLfuE/8xrKayiv4T8De8fUwN7hP0IapEEapOE/1uImzO9q4T8zMzMzMzPhP8rA0635YeE/kMH5GJyP4T8ilxUDJbzhP3qe53me5+E/EhISEhIS4j+PuCPuiDviPyleIJPiBeI/0UUXXXTR4T8g/ehHP/rhPyIiIiIiIuI/8h7v8R7v4T8hC1nIQhbiP+SRRx555OE/iMkVxOQK4j+kHSsQRtrhPwAAAAAAAOI/UoEvrn7Q4T99aKwPjfXhP3Icx3Ecx+E/mpmZmZmZ4T+8frZYGb7hP5KRkZGRkeE/hqY72G+14T+e2Imd2InhP9IardEareE/b2WfQ2qC4T8tBzf7hVjhP7SX0F5Ce+E/IxVzqzNS4T8qQZ4S5CnhPyUQF2pOAuE/SZIkSZIk4T+uYxTnOkbhP/cR3EdwH+E/FG01eI5A4T+oEZZ7GmHhP7ETO7ETO+E/cVL35bEV4T8RyDURyDXhPxEREREREeE/kJzma/Xs4D+kzfsUlwzhP+mwkQ4b6eA/CCGEEEII4T/0/dR46SbhP1EURVEUReE/SSQSiUQi4T8AAAAAAADhPzjkDXlD3uA/0Qu90Au94D9Mcxf8VZzgP7rooosuuuA/oAl/JvyZ4D/ewq/GRHrgP7AFW7AFW+A/eHh4eHh44D+h2nMyfZXgPwtZyEIWsuA/3Zinj1aT4D/5iq/4iq/gP8IpzYs/keA/ohU2BxKt4D8rmCXlgMjgPzmO4ziO4+A/wOMPBzz+4D8HDhw4cODgPzEMwzAMw+A/1g2myGfd4D/QqyzOaPfgP9pApw102uA/iNBD6CH04D/YUF5DeQ3hP9F7JtF7JuE/5SfEWfkJ4T/uUN0O1e3gPyEN0iAN0uA/",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "The attached spreadsheet shows the inventory for a"
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "Each cell in the attached spreadsheet represents a"
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "In the 2018 VSCode blog post on replit.com, what w"
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "The attached file contains a list of vendors in th"
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "Could you help me out with this assignment? Our pr"
+ ],
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "The following numbers function similarly to ISBN 1"
+ ],
+ [
+ "Given this table defining * on the set S = {a, b, "
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "What writer is quoted by Merriam-Webster for the W"
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "How many applicants for the job in the PDF are onl"
+ ],
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "The attached file shows a list of books in the col"
+ ],
+ [
+ "As a comma separated list with no whitespace, usin"
+ ],
+ [
+ "Who nominated the only Featured Article on English"
+ ],
+ [
+ "The attached file lists accommodations in the reso"
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "According to Google Finance, when was the first ye"
+ ],
+ [
+ "The Metropolitan Museum of Art has a portrait in i"
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "How many slides in this PowerPoint presentation me"
+ ],
+ [
+ "Using bass clef notes, what is the age of someone "
+ ],
+ [
+ "If there is anything that doesn't make sense in th"
+ ],
+ [
+ "Find the value of x to the nearest tenth: Lx = (d/"
+ ],
+ [
+ "In the year 2022, and before December, what does \""
+ ],
+ [
+ "Who composed the song that was performed by a roos"
+ ],
+ [
+ "This is a secret message my friend gave me. It say"
+ ],
+ [
+ "You are Van Helsing, a renowned vampire hunter. A "
+ ],
+ [
+ "In the NIH translation of the original 1913 Michae"
+ ],
+ [
+ "The attached file shows the locomotives in the col"
+ ],
+ [
+ "I was trying to remember how well the Cheater Beat"
+ ],
+ [
+ "The attached spreadsheet contains the sales of men"
+ ],
+ [
+ "You are a telecommunications engineer who wants to"
+ ],
+ [
+ "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+ ],
+ [
+ "According to wikipedia, how many Asian countries s"
+ ],
+ [
+ "What is the area of the green polygon in the attac"
+ ],
+ [
+ "Examine the video at https://www.youtube.com/watch"
+ ],
+ [
+ "I'm making a grocery list for my mom, but she's a "
+ ],
+ [
+ "The Latin root of the Yola word \"gimlie\" shares a "
+ ],
+ [
+ "What is the last word before the second chorus of "
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "How many pages if the 2023 IPCC report (85 pages v"
+ ],
+ [
+ "On July 15, 2008, Phys.org published an article ab"
+ ],
+ [
+ "Look at the attached image. The quiz is scored as "
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "How many times was a Twitter/X post cited as a ref"
+ ],
+ [
+ "The year is 2022. I am at the National Air and Spa"
+ ],
+ [
+ "Hi, I'm making a pie but I could use some help wit"
+ ],
+ [
+ "The attached image contains a Python script. Run t"
+ ],
+ [
+ "This spreadsheet contains a list of clients for a "
+ ],
+ [
+ "The attached PDF lists accommodations in the resor"
+ ],
+ [
+ "What is the final numeric output from the attached"
+ ],
+ [
+ "I have the Standard plan in the image below, and I"
+ ],
+ [
+ "How many more blocks (also denoted as layers) in B"
+ ],
+ [
+ "What is the surname of the equine veterinarian men"
+ ],
+ [
+ "It's May 2023, and I'm about to drive across the U"
+ ],
+ [
+ "In the Scikit-Learn July 2017 changelog, what othe"
+ ],
+ [
+ "On the DeepFruits fruit detection graph on Connect"
+ ],
+ [
+ "Pull out the sentence in the following 5x7 block o"
+ ],
+ [
+ "I’m thinking about selling my home, so I want to l"
+ ],
+ [
+ "All of the individuals who formally held the posit"
+ ],
+ [
+ "You are given this Excel file as a map. You start "
+ ],
+ [
+ "On a leap day before the year 2008, a joke was rem"
+ ],
+ [
+ "Who did the actor who played Ray in the Polish-lan"
+ ],
+ [
+ "The longest-lived vertebrate is named after an isl"
+ ],
+ [
+ "Of the cities within the United States where U.S. "
+ ],
+ [
+ "On the BBC Earth YouTube video of the Top 5 Sillie"
+ ],
+ [
+ "The work referenced in footnote 397 of Federico La"
+ ],
+ [
+ "On ScienceDirect, what is the difference to 3 deci"
+ ],
+ [
+ "What is the volume in milliliters of a system comp"
+ ],
+ [
+ "The attached spreadsheet contains a list of books "
+ ],
+ [
+ "On Cornell Law School website's legal information "
+ ],
+ [
+ "The YouTube channel Game Grumps began a Let’s Play"
+ ],
+ [
+ "During the first week of August 2015, one of the N"
+ ],
+ [
+ "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n"
+ ],
+ [
+ "As of the 2020 census, what was the population dif"
+ ],
+ [
+ "I was referencing each of the tables in the file f"
+ ],
+ [
+ "The attached spreadsheet lists the locomotives own"
+ ],
+ [
+ "The attached file lists the locomotives owned by a"
+ ],
+ [
+ "According to Girls Who Code, how long did it take "
+ ],
+ [
+ "What was the complete title of the book in which t"
+ ],
+ [
+ "The book with the doi 10.1353/book.24372 concerns "
+ ],
+ [
+ "The cover of the August 2021 issue of Vogue shows "
+ ],
+ [
+ "How many nonindigenous crocodiles were found in Fl"
+ ],
+ [
+ "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
+ ],
+ [
+ "How many edits were made to the Wikipedia page on "
+ ],
+ [
+ "A 5-man group made up of one tank, one healer, and"
+ ],
+ [
+ "How many at bats did the Yankee with the most walk"
+ ],
+ [
+ "How many images are there in the latest 2022 Lego "
+ ],
+ [
+ "Which of the fruits shown in the 2008 painting \"Em"
+ ],
+ [
+ "According to the World Bank, which countries had g"
+ ],
+ [
+ "What is the absolute difference in tens of thousan"
+ ],
+ [
+ "Hi, I was out sick from my classes on Friday, so I"
+ ],
+ [
+ "Eva Draconis has a personal website which can be a"
+ ],
+ [
+ "What is the latest chronological year date written"
+ ],
+ [
+ "Bob was invited to participate in a game show, and"
+ ],
+ [
+ "Where were the Vietnamese specimens described by K"
+ ],
+ [
+ "In the endnote found in the second-to-last paragra"
+ ],
+ [
+ "A standard Rubik’s cube has been broken into cubes"
+ ],
+ [
+ "If this whole pint is made up of ice cream, how ma"
+ ],
+ [
+ "The attached Excel file contains the sales of menu"
+ ],
+ [
+ "I thought we could try a fun word puzzle together "
+ ],
+ [
+ "I'd like to learn more about some popular reality "
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "Take the gender split from the 2011 Bulgarian cens"
+ ],
+ [
+ "As of August 2023, who is the only winner of the U"
+ ],
+ [
+ "Who are the pitchers with the number before and af"
+ ],
+ [
+ "In the film Goldfinger, what color was the object "
+ ],
+ [
+ "What was the actual enrollment count of the clinic"
+ ],
+ [
+ "What is the first name of the only Malko Competiti"
+ ],
+ [
+ "In NASA's Astronomy Picture of the Day on 2006 Jan"
+ ],
+ [
+ "In the YouTube 360 VR video from March 2018 narrat"
+ ],
+ [
+ "As of May 2023, how many stops are between South S"
+ ],
+ [
+ "What country had the least number of athletes at t"
+ ],
+ [
+ "According to Openreview.net, at the NeurIPS 2022 C"
+ ],
+ [
+ "In the 2015 Metropolitan Museum of Art exhibition "
+ ],
+ [
+ "The brand that makes these harnesses the dogs are "
+ ],
+ [
+ "According to the USGS, in what year was the Americ"
+ ],
+ [
+ "I read a paper about multiwavelength observations "
+ ],
+ [
+ "What animals that were mentioned in both Ilias Lag"
+ ],
+ [
+ "When was a picture of St. Thomas Aquinas first add"
+ ],
+ [
+ "What percentage of the total penguin population ac"
+ ],
+ [
+ "I'm curious about how much information is availabl"
+ ],
+ [
+ "On June 6, 2023, an article by Carolyn Collins Pet"
+ ],
+ [
+ "Using the Biopython library in Python, parse the P"
+ ]
+ ],
+ "hovertemplate": "agent_name=code_o1_03_february_goodoldtext-unbroken
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_o1_03_february_goodoldtext-unbroken",
+ "line": {
+ "color": "#FECB52",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_o1_03_february_goodoldtext-unbroken",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAA==",
+ "dtype": "i2"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA6D8zMzMzMzPjP1VVVVVVVeU/t23btm3b5j8AAAAAAADoPzmO4ziO4+g/ZmZmZmZm5j9ddNFFF13kP1VVVVVVVeU/dmIndmIn5j+3bdu2bdvmP3d3d3d3d+c/AAAAAAAA5j+XlpaWlpbmP8dxHMdxHOc/UV5DeQ3l5T9mZmZmZmbmP7dt27Zt2+Y/0UUXXXTR5T9Ob3rTm97kP1VVVVVVVeU/w/UoXI/C5T/FTuzETuzkP1VVVVVVVeU/btu2bdu25T98GmG5pxHmP1VVVVVVVeU/rbXWWmut5T8AAAAAAADlP1VVVVVVVeU/tbS0tLS05D91UAd1UAflPxzHcRzHceQ/HEyRz7rB5D/YUF5DeQ3lPzVIgzRIg+Q/zczMzMzM5D9L1K5E7UrkPyVJkiRJkuQ/NmVNWVPW5D9ddNFFF13kP/VJn/RJn+Q/QxaykIUs5D/PRn1no77jPwAAAAAAAOQ/5hS8nIKX4z/Xo3A9CtfjP3Nzc3Nzc+M/O7ETO7ET4z/7HFITjLfiP+0ltJfQXuI/CfKUIE8J4j9u27Zt27bhP3AfwX0E9+E/lnsaYbmn4T8NJ3VfHlvhP5qZmZmZmeE/3qe4ZAjW4T8RQgghhBDiP3Icx3Ecx+E/AAAAAAAA4j+SG7mRG7nhP/DBBx988OE/CCpnt/Cr4T/i4eHh4eHhPyELWchCFuI/kiRJkiRJ4j9TT8Zvl3riP47jOI7jOOI/kyZNmjRp4j+YIp91gyniP+xRuB6F6+E/r6G8hvIa4j8De8fUwN7hP9IgDdIgDeI/dWTPQFQ64j8AAAAAAADiPxl4ujU/LOI/9DE4H4Pz4T/xRlPn1x7iP5IkSZIkSeI/cnJycnJy4j+PuCPuiDviP7xAJsULZOI/jC666KKL4j8rWclKVrLiP4Mt2IIt2OI/0y/90i/94j+ykIUsZCHjP+2yyy677OI/C2JyBTG54j/PLXHq99ziP6uqqqqqquI/8yQyDdvN4j+8nIKXU/DiP2r9SoFav+I/4XoUrkfh4j9brAzfiALjPyMjIyMjI+M/FvEJpJLz4j9P7MRO7MTiP3Mpl3Ipl+I/GG9ln0Nq4j85uNkvxIriP+0ltJfQXuI/OyMVc6sz4j9UgjwlyFPiP5gin3WDKeI/AAAAAAAA4j+Kcx2jONfhP3AfwX0E9+E/IQtZyEIW4j+E5Z5GWO7hP3Icx3Ecx+E/RdBwUvfl4T9yTQRyTQTiP97d3d3d3eE/52v17BC44T/ep7hkCNbhP7GRDhvpsOE/zjnnnHPO4T9Ei2zn+6nhP2IYhmEYhuE/WSwWi8Vi4T8AAAAAAIDhP2fMGXPGnOE/khu5kRu54T9gxQkpeZbhP3TRRRdddOE/BhkXZFyQ4T8IKme38KvhP3Icx3Ecx+E/pqWlpaWl4T/ij1uXd8DhPxolfkaJn+E/l8r2rgO64T+amZmZmZnhP8afSDileeE/ezJ+u9ST4T900UUXXXThP+Q4juM4juE/pPMWQzpv4T+MGDFixIjhP1uE/DU7auE/whT5rBtM4T83YKimYy7hP0jhehSuR+E/ianEVGIq4T/YUF5DeQ3hP9F7JtF7JuE/rfyEOCs/4T8j8SoSryLhP7ETO7ETO+E/OUG4G/se4T8GotKRPQPhP+vSY/5eG+E/MzMzMzMz4T/ti6jW2RfhPw==",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "The attached spreadsheet shows the inventory for a"
+ ],
+ [
+ "Using the Biopython library in Python, parse the P"
+ ],
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "Each cell in the attached spreadsheet represents a"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "Could you help me out with this assignment? Our pr"
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "The attached file contains a list of vendors in th"
+ ],
+ [
+ "In the 2018 VSCode blog post on replit.com, what w"
+ ],
+ [
+ "How many applicants for the job in the PDF are onl"
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "In the year 2022, and before December, what does \""
+ ],
+ [
+ "Given this table defining * on the set S = {a, b, "
+ ],
+ [
+ "What animals that were mentioned in both Ilias Lag"
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "What writer is quoted by Merriam-Webster for the W"
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "The Metropolitan Museum of Art has a portrait in i"
+ ],
+ [
+ "The following numbers function similarly to ISBN 1"
+ ],
+ [
+ "Who nominated the only Featured Article on English"
+ ],
+ [
+ "According to Google Finance, when was the first ye"
+ ],
+ [
+ "The attached file shows a list of books in the col"
+ ],
+ [
+ "As a comma separated list with no whitespace, usin"
+ ],
+ [
+ "I was trying to remember how well the Cheater Beat"
+ ],
+ [
+ "On July 15, 2008, Phys.org published an article ab"
+ ],
+ [
+ "The attached file lists accommodations in the reso"
+ ],
+ [
+ "What is the volume in milliliters of a system comp"
+ ],
+ [
+ "How many pages if the 2023 IPCC report (85 pages v"
+ ],
+ [
+ "Using bass clef notes, what is the age of someone "
+ ],
+ [
+ "The Latin root of the Yola word \"gimlie\" shares a "
+ ],
+ [
+ "In the NIH translation of the original 1913 Michae"
+ ],
+ [
+ "Find the value of x to the nearest tenth: Lx = (d/"
+ ],
+ [
+ "How many slides in this PowerPoint presentation me"
+ ],
+ [
+ "You are a telecommunications engineer who wants to"
+ ],
+ [
+ "If there is anything that doesn't make sense in th"
+ ],
+ [
+ "On a leap day before the year 2008, a joke was rem"
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "This is a secret message my friend gave me. It say"
+ ],
+ [
+ "You are Van Helsing, a renowned vampire hunter. A "
+ ],
+ [
+ "Examine the video at https://www.youtube.com/watch"
+ ],
+ [
+ "The attached file shows the locomotives in the col"
+ ],
+ [
+ "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+ ],
+ [
+ "What is the area of the green polygon in the attac"
+ ],
+ [
+ "How many images are there in the latest 2022 Lego "
+ ],
+ [
+ "In the endnote found in the second-to-last paragra"
+ ],
+ [
+ "According to wikipedia, how many Asian countries s"
+ ],
+ [
+ "Who composed the song that was performed by a roos"
+ ],
+ [
+ "The attached spreadsheet contains the sales of men"
+ ],
+ [
+ "You are given this Excel file as a map. You start "
+ ],
+ [
+ "I'm making a grocery list for my mom, but she's a "
+ ],
+ [
+ "How many times was a Twitter/X post cited as a ref"
+ ],
+ [
+ "What is the last word before the second chorus of "
+ ],
+ [
+ "Hi, I'm making a pie but I could use some help wit"
+ ],
+ [
+ "As of the 2020 census, what was the population dif"
+ ],
+ [
+ "What is the surname of the equine veterinarian men"
+ ],
+ [
+ "I’m thinking about selling my home, so I want to l"
+ ],
+ [
+ "The work referenced in footnote 397 of Federico La"
+ ],
+ [
+ "On ScienceDirect, what is the difference to 3 deci"
+ ],
+ [
+ "The attached image contains a Python script. Run t"
+ ],
+ [
+ "Look at the attached image. The quiz is scored as "
+ ],
+ [
+ "I have the Standard plan in the image below, and I"
+ ],
+ [
+ "The attached PDF lists accommodations in the resor"
+ ],
+ [
+ "How many nonindigenous crocodiles were found in Fl"
+ ],
+ [
+ "The year is 2022. I am at the National Air and Spa"
+ ],
+ [
+ "This spreadsheet contains a list of clients for a "
+ ],
+ [
+ "What is the final numeric output from the attached"
+ ],
+ [
+ "On the BBC Earth YouTube video of the Top 5 Sillie"
+ ],
+ [
+ "It's May 2023, and I'm about to drive across the U"
+ ],
+ [
+ "The longest-lived vertebrate is named after an isl"
+ ],
+ [
+ "How many more blocks (also denoted as layers) in B"
+ ],
+ [
+ "How many edits were made to the Wikipedia page on "
+ ],
+ [
+ "What percentage of the total penguin population ac"
+ ],
+ [
+ "On the DeepFruits fruit detection graph on Connect"
+ ],
+ [
+ "All of the individuals who formally held the posit"
+ ],
+ [
+ "Pull out the sentence in the following 5x7 block o"
+ ],
+ [
+ "I was referencing each of the tables in the file f"
+ ],
+ [
+ "The YouTube channel Game Grumps began a Let’s Play"
+ ],
+ [
+ "During the first week of August 2015, one of the N"
+ ],
+ [
+ "On Cornell Law School website's legal information "
+ ],
+ [
+ "In the Scikit-Learn July 2017 changelog, what othe"
+ ],
+ [
+ "Which of the fruits shown in the 2008 painting \"Em"
+ ],
+ [
+ "Bob was invited to participate in a game show, and"
+ ],
+ [
+ "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n"
+ ],
+ [
+ "Of the cities within the United States where U.S. "
+ ],
+ [
+ "The book with the doi 10.1353/book.24372 concerns "
+ ],
+ [
+ "The attached spreadsheet lists the locomotives own"
+ ],
+ [
+ "Who did the actor who played Ray in the Polish-lan"
+ ],
+ [
+ "The cover of the August 2021 issue of Vogue shows "
+ ],
+ [
+ "The attached spreadsheet contains a list of books "
+ ],
+ [
+ "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
+ ],
+ [
+ "The attached file lists the locomotives owned by a"
+ ],
+ [
+ "How many at bats did the Yankee with the most walk"
+ ],
+ [
+ "According to Girls Who Code, how long did it take "
+ ],
+ [
+ "What was the complete title of the book in which t"
+ ],
+ [
+ "What is the absolute difference in tens of thousan"
+ ],
+ [
+ "Eva Draconis has a personal website which can be a"
+ ],
+ [
+ "Hi, I was out sick from my classes on Friday, so I"
+ ],
+ [
+ "A 5-man group made up of one tank, one healer, and"
+ ],
+ [
+ "According to the USGS, in what year was the Americ"
+ ],
+ [
+ "If this whole pint is made up of ice cream, how ma"
+ ],
+ [
+ "I'd like to learn more about some popular reality "
+ ],
+ [
+ "Take the gender split from the 2011 Bulgarian cens"
+ ],
+ [
+ "The brand that makes these harnesses the dogs are "
+ ],
+ [
+ "As of August 2023, who is the only winner of the U"
+ ],
+ [
+ "Where were the Vietnamese specimens described by K"
+ ],
+ [
+ "A standard Rubik’s cube has been broken into cubes"
+ ],
+ [
+ "The attached Excel file contains the sales of menu"
+ ],
+ [
+ "What was the actual enrollment count of the clinic"
+ ],
+ [
+ "According to Openreview.net, at the NeurIPS 2022 C"
+ ],
+ [
+ "What country had the least number of athletes at t"
+ ],
+ [
+ "In the film Goldfinger, what color was the object "
+ ],
+ [
+ "What is the first name of the only Malko Competiti"
+ ],
+ [
+ "Who are the pitchers with the number before and af"
+ ],
+ [
+ "When was a picture of St. Thomas Aquinas first add"
+ ],
+ [
+ "In NASA's Astronomy Picture of the Day on 2006 Jan"
+ ],
+ [
+ "I read a paper about multiwavelength observations "
+ ],
+ [
+ "In the YouTube 360 VR video from March 2018 narrat"
+ ],
+ [
+ "I'm curious about how much information is availabl"
+ ],
+ [
+ "On June 6, 2023, an article by Carolyn Collins Pet"
+ ],
+ [
+ "As of May 2023, how many stops are between South S"
+ ],
+ [
+ "In the 2015 Metropolitan Museum of Art exhibition "
+ ],
+ [
+ "At the two-minute mark in the YouTube video upload"
+ ],
+ [
+ "I thought we could try a fun word puzzle together "
+ ],
+ [
+ "What is the average number of pre-2020 works on th"
+ ],
+ [
+ "Which of the text elements under CATEGORIES in the"
+ ],
+ [
+ "What is the latest chronological year date written"
+ ]
+ ],
+ "hovertemplate": "agent_name=code_o1_03_february_remove-navigational
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_o1_03_february_remove-navigational",
+ "line": {
+ "color": "#636efa",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_o1_03_february_remove-navigational",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAA==",
+ "dtype": "i2"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADkP1VVVVVVVeU/MzMzMzMz4z9ddNFFF13kP1VVVVVVVeU/FDuxEzux4z+SJEmSJEniPzMzMzMzM+M/AAAAAAAA4j/x8PDw8PDgPwAAAAAAAOA/DeU1lNdQ3j8AAAAAAADgPzEMwzAMw+A/dNFFF1104T8hC1nIQhbiP6uqqqqqquI/7FG4HoXr4T+xEzuxEzvhP3Icx3Ecx+E/SZIkSZIk4T+WexphuafhPyIiIiIiIuI/lVJKKaWU4j8AAAAAAADiP22yySabbOI/09LS0tLS4j+SJEmSJEniP3Icx3Ecx+E/mCKfdYMp4j/zGsprKK/hP7ETO7ETO+E/zczMzMzM4D8sUbsStSvhPzEMwzAMw+A/GPQFfUFf4D+66KKLLrrgPxEREREREeE/FrKQhSxk4T/E5ApicgXhP1VVVVVVVeE/aKwPjfWh4T/sUbgehevhP5KRkZGRkeE/sRM7sRM74T+pCcZb2efgP/cS2ktoL+E/37D2DWvf4D9JkiRJkiThP3UW01lMZ+E/lnsaYbmn4T8NJ3VfHlvhP5qZmZmZmeE/DcE62rxP4T8IIYQQQgjhP1EURVEUReE/AAAAAAAA4T/RC73QC73gP/jggw8++OA/TKQHKme34D/x8PDw8PDgPwtZyEIWsuA/oQ7qoA7q4D8OJFphcyDhP1VVVVVVVeE/jBgxYsSI4T/CFPmsG0zhPxEREREREeE/eQ3lNZTX4D/lJ8RZ+QnhP7ETO7ETO+E/1uImzO9q4T8zMzMzMzPhPyNl4OnW/OA/yOB8DM7H4D+FN5o6v/bgP0mSJEmSJOE/UVFRUVFR4T9f0Bf0BX3hPwOZFC+QSeE/dNFFF1104T8UoQhFKELhP8EWbMEWbOE/UhmVURmV4T8WspCFLGThPzTRRBNNNOE/xOQKYnIF4T95DeU1lNfgP6uqqqqqquA/sd0sTyLT4D8qeDkFL6fgP3o7Q2LezuA/9ihcj8L14D/sZ4uV4RvhP0FBQUFBQeE/PoFUcl4W4T+xEzuxEzvhP/EVX/EVX+E/b2WfQ2qC4T9ws1+IFaXhP3Icx3Ecx+E/1hmpmFud4T900UUXXXThP8IU+awbTOE/27Zt27Zt4T+c6xjFuY7hP3UW01lMZ+E/FG01eI5A4T+oEZZ7GmHhP7ETO7ETO+E/cVL35bEV4T/x8PDw8PDgP83MzMzMzOA/HgI3lkGp4D/S5n2KS4bgP6cQaAqBpuA/xhhjjDHG4D+kcD0K16PgPzEMwzAMw+A/OBwOh8Ph4D8AAAAAAADhP0fcEXfEHeE/sRM7sRM74T9WnJCSZxnhP/jggw8++OA/UxFLRSwV4T+7hV+NifTgPxEREREREeE/8fDw8PDw4D+7vAOOFA3hPw/MtQNz7eA/jnn6aDUJ4T9JkiRJkiThP8TkCmJyBeE/DiRaYXMg4T+xEzuxEzvhP1VVVVVVVeE/pPMWQzpv4T8LFSpUqFDhP01c6d6AMuE/whT5rBtM4T+eFCR/XmXhP36x5BdLfuE/jVvGLeOW4T+U11BeQ3nhP5KRkZGRkeE/dNFFF1104T+MMcYYY4zhP0IapEEapOE/+x6RE4S74T8+A1HpyJ7hP29ln0NqguE/ZmZmZmZm4T9epZigu0rhP/cS2ktoL+E/fJu/wqxG4T8sUbsStSvhPw==",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "The attached spreadsheet shows the inventory for a"
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "Using the Biopython library in Python, parse the P"
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "Each cell in the attached spreadsheet represents a"
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "Which of the text elements under CATEGORIES in the"
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "What animals that were mentioned in both Ilias Lag"
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "In the 2018 VSCode blog post on replit.com, what w"
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "Could you help me out with this assignment? Our pr"
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "The Metropolitan Museum of Art has a portrait in i"
+ ],
+ [
+ "The attached file contains a list of vendors in th"
+ ],
+ [
+ "How many applicants for the job in the PDF are onl"
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "In the year 2022, and before December, what does \""
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "What is the average number of pre-2020 works on th"
+ ],
+ [
+ "Given this table defining * on the set S = {a, b, "
+ ],
+ [
+ "Who nominated the only Featured Article on English"
+ ],
+ [
+ "According to Google Finance, when was the first ye"
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "What writer is quoted by Merriam-Webster for the W"
+ ],
+ [
+ "The following numbers function similarly to ISBN 1"
+ ],
+ [
+ "How many pages if the 2023 IPCC report (85 pages v"
+ ],
+ [
+ "As a comma separated list with no whitespace, usin"
+ ],
+ [
+ "What is the volume in milliliters of a system comp"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "The attached file shows a list of books in the col"
+ ],
+ [
+ "Using bass clef notes, what is the age of someone "
+ ],
+ [
+ "On a leap day before the year 2008, a joke was rem"
+ ],
+ [
+ "The Latin root of the Yola word \"gimlie\" shares a "
+ ],
+ [
+ "The attached file lists accommodations in the reso"
+ ],
+ [
+ "Find the value of x to the nearest tenth: Lx = (d/"
+ ],
+ [
+ "On July 15, 2008, Phys.org published an article ab"
+ ],
+ [
+ "I was trying to remember how well the Cheater Beat"
+ ],
+ [
+ "If there is anything that doesn't make sense in th"
+ ],
+ [
+ "You are a telecommunications engineer who wants to"
+ ],
+ [
+ "In the NIH translation of the original 1913 Michae"
+ ],
+ [
+ "In the endnote found in the second-to-last paragra"
+ ],
+ [
+ "How many slides in this PowerPoint presentation me"
+ ],
+ [
+ "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+ ],
+ [
+ "As of the 2020 census, what was the population dif"
+ ],
+ [
+ "You are Van Helsing, a renowned vampire hunter. A "
+ ],
+ [
+ "Examine the video at https://www.youtube.com/watch"
+ ],
+ [
+ "The attached file shows the locomotives in the col"
+ ],
+ [
+ "This is a secret message my friend gave me. It say"
+ ],
+ [
+ "According to wikipedia, how many Asian countries s"
+ ],
+ [
+ "What is the area of the green polygon in the attac"
+ ],
+ [
+ "Who composed the song that was performed by a roos"
+ ],
+ [
+ "The attached spreadsheet contains the sales of men"
+ ],
+ [
+ "How many edits were made to the Wikipedia page on "
+ ],
+ [
+ "How many nonindigenous crocodiles were found in Fl"
+ ],
+ [
+ "What percentage of the total penguin population ac"
+ ],
+ [
+ "The work referenced in footnote 397 of Federico La"
+ ],
+ [
+ "I was referencing each of the tables in the file f"
+ ],
+ [
+ "I'm making a grocery list for my mom, but she's a "
+ ],
+ [
+ "I’m thinking about selling my home, so I want to l"
+ ],
+ [
+ "How many images are there in the latest 2022 Lego "
+ ],
+ [
+ "According to the World Bank, which countries had g"
+ ],
+ [
+ "What is the last word before the second chorus of "
+ ],
+ [
+ "Look at the attached image. The quiz is scored as "
+ ],
+ [
+ "The attached image contains a Python script. Run t"
+ ],
+ [
+ "I have the Standard plan in the image below, and I"
+ ],
+ [
+ "Hi, I'm making a pie but I could use some help wit"
+ ],
+ [
+ "On ScienceDirect, what is the difference to 3 deci"
+ ],
+ [
+ "The attached PDF lists accommodations in the resor"
+ ],
+ [
+ "The year is 2022. I am at the National Air and Spa"
+ ],
+ [
+ "I thought we could try a fun word puzzle together "
+ ],
+ [
+ "How many times was a Twitter/X post cited as a ref"
+ ],
+ [
+ "What is the surname of the equine veterinarian men"
+ ],
+ [
+ "Which of the fruits shown in the 2008 painting \"Em"
+ ],
+ [
+ "It's May 2023, and I'm about to drive across the U"
+ ],
+ [
+ "What is the latest chronological year date written"
+ ],
+ [
+ "Who did the actor who played Ray in the Polish-lan"
+ ],
+ [
+ "You are given this Excel file as a map. You start "
+ ],
+ [
+ "What is the final numeric output from the attached"
+ ],
+ [
+ "This spreadsheet contains a list of clients for a "
+ ],
+ [
+ "How many more blocks (also denoted as layers) in B"
+ ],
+ [
+ "On the DeepFruits fruit detection graph on Connect"
+ ],
+ [
+ "The YouTube channel Game Grumps began a Let’s Play"
+ ],
+ [
+ "The book with the doi 10.1353/book.24372 concerns "
+ ],
+ [
+ "In the Scikit-Learn July 2017 changelog, what othe"
+ ],
+ [
+ "On the BBC Earth YouTube video of the Top 5 Sillie"
+ ],
+ [
+ "The longest-lived vertebrate is named after an isl"
+ ],
+ [
+ "During the first week of August 2015, one of the N"
+ ],
+ [
+ "All of the individuals who formally held the posit"
+ ],
+ [
+ "Pull out the sentence in the following 5x7 block o"
+ ],
+ [
+ "Of the cities within the United States where U.S. "
+ ],
+ [
+ "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n"
+ ],
+ [
+ "On Cornell Law School website's legal information "
+ ],
+ [
+ "According to Girls Who Code, how long did it take "
+ ],
+ [
+ "What was the complete title of the book in which t"
+ ],
+ [
+ "As of August 2023, who is the only winner of the U"
+ ],
+ [
+ "Eva Draconis has a personal website which can be a"
+ ],
+ [
+ "The cover of the August 2021 issue of Vogue shows "
+ ],
+ [
+ "The attached spreadsheet lists the locomotives own"
+ ],
+ [
+ "Bob was invited to participate in a game show, and"
+ ],
+ [
+ "The attached spreadsheet contains a list of books "
+ ],
+ [
+ "How many at bats did the Yankee with the most walk"
+ ],
+ [
+ "The attached file lists the locomotives owned by a"
+ ],
+ [
+ "Hi, I was out sick from my classes on Friday, so I"
+ ],
+ [
+ "A 5-man group made up of one tank, one healer, and"
+ ],
+ [
+ "What is the absolute difference in tens of thousan"
+ ],
+ [
+ "According to the USGS, in what year was the Americ"
+ ],
+ [
+ "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
+ ],
+ [
+ "If this whole pint is made up of ice cream, how ma"
+ ],
+ [
+ "I'd like to learn more about some popular reality "
+ ],
+ [
+ "Take the gender split from the 2011 Bulgarian cens"
+ ],
+ [
+ "The brand that makes these harnesses the dogs are "
+ ],
+ [
+ "According to Openreview.net, at the NeurIPS 2022 C"
+ ],
+ [
+ "What was the actual enrollment count of the clinic"
+ ],
+ [
+ "A standard Rubik’s cube has been broken into cubes"
+ ],
+ [
+ "On June 6, 2023, an article by Carolyn Collins Pet"
+ ],
+ [
+ "Where were the Vietnamese specimens described by K"
+ ],
+ [
+ "Who are the pitchers with the number before and af"
+ ],
+ [
+ "The attached Excel file contains the sales of menu"
+ ],
+ [
+ "When was a picture of St. Thomas Aquinas first add"
+ ],
+ [
+ "I'm curious about how much information is availabl"
+ ],
+ [
+ "In NASA's Astronomy Picture of the Day on 2006 Jan"
+ ],
+ [
+ "What is the first name of the only Malko Competiti"
+ ],
+ [
+ "What country had the least number of athletes at t"
+ ],
+ [
+ "In the film Goldfinger, what color was the object "
+ ],
+ [
+ "As of May 2023, how many stops are between South S"
+ ],
+ [
+ "I read a paper about multiwavelength observations "
+ ],
+ [
+ "In the YouTube 360 VR video from March 2018 narrat"
+ ],
+ [
+ "In the 2015 Metropolitan Museum of Art exhibition "
+ ],
+ [
+ "At the two-minute mark in the YouTube video upload"
+ ]
+ ],
+ "hovertemplate": "agent_name=code_o1_03_february_text_high-reasoning-effort
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_o1_03_february_text_high-reasoning-effort",
+ "line": {
+ "color": "#EF553B",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_o1_03_february_text_high-reasoning-effort",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA",
+ "dtype": "i2"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAA8D8AAAAAAADgP1VVVVVVVdU/AAAAAAAA4D+amZmZmZnZPwAAAAAAAOA/27Zt27Zt2z8AAAAAAADYPxzHcRzHcdw/AAAAAAAA4D900UUXXXThPwAAAAAAAOA/sRM7sRM74T8AAAAAAADgP97d3d3d3d0/AAAAAAAA3D8eHh4eHh7ePxzHcRzHcdw/KK+hvIby2j+amZmZmZnZP9u2bdu2bds/L7rooosu2j+96U1vetPbP6uqqqqqqto/mpmZmZmZ2T/ZiZ3YiZ3YPy+hvYT2Eto/27Zt27Zt2z9huacRlnvaP5qZmZmZmdk/xhhjjDHG2D8AAAAAAADaPyebbLLJJts/PDw8PDw83D8d1EEd1EHdPxzHcRzHcdw/0LrBFPms2z8or6G8hvLaPxqkQRqkQdo/mpmZmZmZ2T+J2pWoXYnaP9u2bdu2bds/s6asKWvK2j+jiy666KLbP1uwBVuwBds/velNb3rT2z9t1Hc26jvbPwAAAAAAANw/27Zt27Zt2z/hehSuR+HaP5ybm5ubm9s/O7ETO7ET2z8KxlvZ55DaPya0l9BeQts/w9o3rH3D2j/btm3btm3bPx/BfQT3Edw/GmG5pxGW2z8EDSd1Xx7bP7y7u7u7u9s/Q7CONu9T3D/nnHPOOefcPxzHcRzHcdw/AAAAAAAA3D/dyI3cyI3cPx988MEHH9w/NSbSA5Wz2z9LS0tLS0vbP64dmGsH5to/27Zt27Zt2z8yfrvUk/HbPxzHcRzHcdw/4MCBAwcO3D/QusEU+azbPylcj8L1KNw/oryG8hrK2z/5CXFWfkLcP33Lt3zLt9w/VDqyZyAq3T/NzMzMzMzcP2t+WKQMPN0/qV2J2pWo3T/3kMuKgRLeP27btm3btt0/Hh4eHh4e3j9xR9wRd8TdPyCT4gUyKd4/jC666KKL3j/vda973evePz/pkz7pk94/3uM93uM93j/f9KY3vendP5dddtlll90/eDbqOxv13T9G2rECYaTdPwAAAAAAAN4/3ixPItOw3T+Dl1PwcgrePw2JeTtDYt4/uB6F61G43j/IXT9brAzfP19fX19fX98/FEgl52UR3z8ndmIndmLfPyD7sR/7sd8/OqQmGG9l3z83+4VYURrfPwntJbSX0N4/hOjxXTiI3j/WvmHtG9beP/DolbH9jt4/kiRJkiRJ3j9bWOmphZXePwnuI7iP4N4/6k1vetOb3j9HWO5phOXePx/qoR7qod4/VwQNJ3Vf3j9fzKdezKfeP2ZmZmZmZt4/4MYyKBUm3j+KS4ZgHW3ePy6e3OLJLd4/dM4555xz3j+4HoXrUbjeP57neZ7ned4/j8fj8Xg83j8AAAAAAADeP3FH3BF3xN0/ntiJndiJ3T9Ux97aMM3dP5NNNtlkk90/Wt1pdafV3T+K9EDl7BbeP97d3d3d3d0/Hh4eHh4e3j+kaIg/bl3eP+JnlPgZJd4/le1dB3Rj3j++4iu+4iveP3rxJxJOad4/u9ST8dul3j+qz7Q1/m7eP47jOI7jON4/Y0jnLYZ03j/16tWrV6/eP7o3oExc6d4/PusGU+Sz3j8uEZ4UJH/eP7gehetRuN4/+MJ74b3w3j+H8hrKayjfP59J9J5J9N4/4qz8hDgr3z/43nvvvffeP0/sxE7sxN4/EjlBuBv73j9hfleLmzDfPzqkJhhvZd8/mpmZmZmZ3z+P5g82Hs3fP1ikDDzdmt8/sxpFHDpp3z84H4PzMTjfPwgffPDBB98/",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "Using the Biopython library in Python, parse the P"
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "What is the average number of pre-2020 works on th"
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ "What animals that were mentioned in both Ilias Lag"
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "In the 2018 VSCode blog post on replit.com, what w"
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "Could you help me out with this assignment? Our pr"
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "The Metropolitan Museum of Art has a portrait in i"
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "According to Google Finance, when was the first ye"
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "In the year 2022, and before December, what does \""
+ ],
+ [
+ "Who nominated the only Featured Article on English"
+ ],
+ [
+ "What writer is quoted by Merriam-Webster for the W"
+ ],
+ [
+ "How many pages if the 2023 IPCC report (85 pages v"
+ ],
+ [
+ "Given this table defining * on the set S = {a, b, "
+ ],
+ [
+ "The following numbers function similarly to ISBN 1"
+ ],
+ [
+ "How many images are there in the latest 2022 Lego "
+ ],
+ [
+ "The attached file shows a list of books in the col"
+ ],
+ [
+ "I was trying to remember how well the Cheater Beat"
+ ],
+ [
+ "As a comma separated list with no whitespace, usin"
+ ],
+ [
+ "On a leap day before the year 2008, a joke was rem"
+ ],
+ [
+ "What is the volume in milliliters of a system comp"
+ ],
+ [
+ "The Latin root of the Yola word \"gimlie\" shares a "
+ ],
+ [
+ "Find the value of x to the nearest tenth: Lx = (d/"
+ ],
+ [
+ "In the endnote found in the second-to-last paragra"
+ ]
+ ],
+ "hovertemplate": "agent_name=code_o1_22-01_managedagent-summary_planning
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_o1_22-01_managedagent-summary_planning",
+ "line": {
+ "color": "#00cc96",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_o1_22-01_managedagent-summary_planning",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQg==",
+ "dtype": "i1"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA6D8zMzMzMzPjP1VVVVVVVeU/t23btm3b5j8AAAAAAADkP3Icx3Ecx+E/AAAAAAAA4D8XXXTRRRfdPwAAAAAAAOA/sRM7sRM74T8AAAAAAADgPxEREREREeE/AAAAAAAA4D8eHh4eHh7ePxzHcRzHcdw/KK+hvIby2j+amZmZmZnZPxiGYRiGYdg/RhdddNFF1z+RhSxkIQvZPwAAAAAAANg/mpmZmZmZ2T/ZiZ3YiZ3YP0J7Ce0ltNc/t23btm3b1j98GmG5pxHWP1VVVVVVVdU/pZRSSiml1D8AAAAAAADUP2WTTTbZZNM/tbS0tLS01D8WX/EVX/HVP1VVVVVVVdU/yWfdYIp81j9DeQ3lNZTXP9mJndiJndg/mpmZmZmZ2T/6GJyPwfnYP3qe53me59k/s6asKWvK2j8vuuiiiy7aP5qZmZmZmdk/pze96U1v2j9t1Hc26jvbPwAAAAAAANw/27Zt27Zt2z/hehSuR+HaP1paWlpaWto/O7ETO7ET2z+WfQ6pCcbbPxzHcRzHcdw/F1100UUX3T8lSZIkSZLcPxbTWUxnMd0/jbDc0wjL3T/msRVBw0ndP83MzMzMzNw/Q7CONu9T3D/fe++9997bP9u2bdu2bds/AAAAAAAA2z9bqZVaqZXaPyebbLLJJts/eqBydgu/2j8=",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "Using the Biopython library in Python, parse the P"
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "What is the average number of pre-2020 works on th"
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ "What animals that were mentioned in both Ilias Lag"
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "In the 2018 VSCode blog post on replit.com, what w"
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "Could you help me out with this assignment? Our pr"
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "The Metropolitan Museum of Art has a portrait in i"
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "According to Google Finance, when was the first ye"
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "In the year 2022, and before December, what does \""
+ ]
+ ],
+ "hovertemplate": "agent_name=code_o1_25-01_visioon
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_o1_25-01_visioon",
+ "line": {
+ "color": "#ab63fa",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_o1_25-01_visioon",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ=",
+ "dtype": "i1"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAA8D8AAAAAAADgP1VVVVVVVdU/AAAAAAAA0D+amZmZmZnJP1VVVVVVVdU/27Zt27Zt2z8AAAAAAADYP1VVVVVVVdU/MzMzMzMz0z900UUXXXTRP1VVVVVVVdU/2Ymd2Imd2D+3bdu2bdvWP5qZmZmZmdk/AAAAAAAA2D+XlpaWlpbWPzmO4ziO49g/Q3kN5TWU1z9mZmZmZmbWP1VVVVVVVdU/XXTRRRdd1D9kIQtZyELWP1VVVVVVVdU/exSuR+F61D92Yid2YifWP1VVVVVVVdU/JUmSJEmS1D8Jyz2NsNzTPzMzMzMzM9M/lVJKKaWU0j8AAAAAAADSP3TRRRdddNE/09LS0tLS0j/UQR3UQR3UP+Q4juM4jtM/HEyRz7rB1D9RXkN5DeXVP1VVVVVVVdU/ZmZmZmZm1j/blahdidrVP1VVVVVVVdU/lTVlTVlT1j/RRRdddNHVP1VVVVVVVdU/ZCELWchC1j9dQUyuICbXP6uqqqqqqtY/jfWhsT401j/D9Shcj8LVP1VVVVVVVdU/xU7sxE7s1D/Z55CaYLzVPw==",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "Using the Biopython library in Python, parse the P"
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "What is the average number of pre-2020 works on th"
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ "What animals that were mentioned in both Ilias Lag"
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "In the 2018 VSCode blog post on replit.com, what w"
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "Could you help me out with this assignment? Our pr"
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "The Metropolitan Museum of Art has a portrait in i"
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "According to Google Finance, when was the first ye"
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "In the year 2022, and before December, what does \""
+ ],
+ [
+ "Who nominated the only Featured Article on English"
+ ],
+ [
+ "What writer is quoted by Merriam-Webster for the W"
+ ],
+ [
+ "How many pages if the 2023 IPCC report (85 pages v"
+ ],
+ [
+ "Given this table defining * on the set S = {a, b, "
+ ],
+ [
+ "The following numbers function similarly to ISBN 1"
+ ],
+ [
+ "How many images are there in the latest 2022 Lego "
+ ],
+ [
+ "The attached file shows a list of books in the col"
+ ],
+ [
+ "I was trying to remember how well the Cheater Beat"
+ ],
+ [
+ "As a comma separated list with no whitespace, usin"
+ ],
+ [
+ "On a leap day before the year 2008, a joke was rem"
+ ],
+ [
+ "What is the volume in milliliters of a system comp"
+ ],
+ [
+ "The Latin root of the Yola word \"gimlie\" shares a "
+ ],
+ [
+ "Find the value of x to the nearest tenth: Lx = (d/"
+ ],
+ [
+ "In the endnote found in the second-to-last paragra"
+ ],
+ [
+ "Using bass clef notes, what is the age of someone "
+ ],
+ [
+ "On July 15, 2008, Phys.org published an article ab"
+ ],
+ [
+ "The attached file lists accommodations in the reso"
+ ],
+ [
+ "I’m thinking about selling my home, so I want to l"
+ ],
+ [
+ "I'm making a grocery list for my mom, but she's a "
+ ],
+ [
+ "How many times was a Twitter/X post cited as a ref"
+ ],
+ [
+ "On ScienceDirect, what is the difference to 3 deci"
+ ],
+ [
+ "What is the last word before the second chorus of "
+ ],
+ [
+ "Look at the attached image. The quiz is scored as "
+ ],
+ [
+ "How many edits were made to the Wikipedia page on "
+ ],
+ [
+ "You are a telecommunications engineer who wants to"
+ ],
+ [
+ "If there is anything that doesn't make sense in th"
+ ],
+ [
+ "How many nonindigenous crocodiles were found in Fl"
+ ],
+ [
+ "The work referenced in footnote 397 of Federico La"
+ ],
+ [
+ "As of the 2020 census, what was the population dif"
+ ],
+ [
+ "How many slides in this PowerPoint presentation me"
+ ],
+ [
+ "What percentage of the total penguin population ac"
+ ],
+ [
+ "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+ ],
+ [
+ "You are Van Helsing, a renowned vampire hunter. A "
+ ],
+ [
+ "Examine the video at https://www.youtube.com/watch"
+ ],
+ [
+ "This is a secret message my friend gave me. It say"
+ ],
+ [
+ "What is the area of the green polygon in the attac"
+ ],
+ [
+ "According to wikipedia, how many Asian countries s"
+ ],
+ [
+ "Who composed the song that was performed by a roos"
+ ],
+ [
+ "I thought we could try a fun word puzzle together "
+ ],
+ [
+ "What is the surname of the equine veterinarian men"
+ ],
+ [
+ "According to the World Bank, which countries had g"
+ ],
+ [
+ "Which of the fruits shown in the 2008 painting \"Em"
+ ],
+ [
+ "Hi, I'm making a pie but I could use some help wit"
+ ],
+ [
+ "The attached image contains a Python script. Run t"
+ ],
+ [
+ "I have the Standard plan in the image below, and I"
+ ],
+ [
+ "The attached PDF lists accommodations in the resor"
+ ],
+ [
+ "The year is 2022. I am at the National Air and Spa"
+ ],
+ [
+ "In the Scikit-Learn July 2017 changelog, what othe"
+ ],
+ [
+ "It's May 2023, and I'm about to drive across the U"
+ ],
+ [
+ "Who did the actor who played Ray in the Polish-lan"
+ ],
+ [
+ "What is the latest chronological year date written"
+ ],
+ [
+ "The YouTube channel Game Grumps began a Let’s Play"
+ ]
+ ],
+ "hovertemplate": "agent_name=code_o1_29-01_text
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_o1_29-01_text",
+ "line": {
+ "color": "#FFA15A",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_o1_29-01_text",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZWltcXV5fYGFiY2RlZmdo",
+ "dtype": "i1"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVeU/AAAAAAAA4D+amZmZmZnZPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADgPxzHcRzHcdw/mpmZmZmZ2T9GF1100UXXP1VVVVVVVdU/2Ymd2Imd2D+3bdu2bdvWP5qZmZmZmdk/AAAAAAAA3D9aWlpaWlraPzmO4ziO49g/Q3kN5TWU1z9mZmZmZmbWP1VVVVVVVdU/XXTRRRdd1D9kIQtZyELWP1VVVVVVVdU/exSuR+F61D8UO7ETO7HTP2gvob2E9tI/kiRJkiRJ0j+WexphuafRPxEREREREdE/hBBCCCGE0D8AAAAAAADQPwgffPDBB88/8fDw8PDw0D+SJEmSJEnSP+Q4juM4jtM/HEyRz7rB1D9RXkN5DeXVP5dv+ZZv+dY/AAAAAAAA2D9qV6J2JWrXPxiGYRiGYdg/9AV9QV/Q1z9GF1100UXXPxdswRZswdY/etOb3vSm1z9icgUxuYLYPwAAAAAAANg/4eUUvJyC1z8K16NwPQrXP5eWlpaWltY/dmIndmIn1j9ln0NqgvHWP0J7Ce0ltNc/cFj7hrVv2D9JkiRJkiTZPzGdxXQW09k/YbmnEZZ72j+Uui+PrQjaP5qZmZmZmdk/WEeb9yku2T/GGGOMMcbYPxiGYRiGYdg/AAAAAAAA2D8YeqEXeqHXPz744IMPPtg/SQ9Uzm7h1z+Ih4eHh4fXP4K5dmCuHdg/+Yqv+Iqv2D/RCpsDiVbYPwAAAAAAANg/vXr16tWr1z+fdYMp8lnXP+UXS36x5Nc/Q3kN5TWU1z9kamDvmBrYP9mJndiJndg/OrJnICod2T/NzMzMzMzYP5Ey8HRrftg/Mjgfg/Mx2D+q82sPuazYPxiGYRiGYdg/GBgYGBgY2D8k7og74o7YP+5phOWeRtg/AAAAAAAA2D983ete97rXP9iCLdiCLdg/2Ymd2Imd2D+GLGQhC1nYP8YYY4wxxtg/YnIFMbmC2D8LhJF2rEDYPwAAAAAAANg/2G6WJ5Fp2D801ofG+tDYPzbZZJNNNtk/mpmZmZmZ2T96kLt+tljZP7q5ubm5udk/i/gEUsl52T+xEzuxEzvZP9mP/diP/dg/",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "Using the Biopython library in Python, parse the P"
+ ],
+ [
+ "The attached spreadsheet shows the inventory for a"
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "What's the last line of the rhyme under the flavor"
+ ],
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "What animals that were mentioned in both Ilias Lag"
+ ],
+ [
+ "What is the average number of pre-2020 works on th"
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "In Valentina Re’s contribution to the 2017 book “W"
+ ],
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "Each cell in the attached spreadsheet represents a"
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "Compute the check digit the Tropicos ID for the Or"
+ ],
+ [
+ "Could you help me out with this assignment? Our pr"
+ ],
+ [
+ "Given this table defining * on the set S = {a, b, "
+ ],
+ [
+ "What time was the Tri-Rail train that carried the "
+ ],
+ [
+ "In the fictional language of Tizin, basic sentence"
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "I was trying to remember how well the Cheater Beat"
+ ],
+ [
+ "The attached file contains a list of vendors in th"
+ ],
+ [
+ "Review the chess position provided in the image. I"
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "Who nominated the only Featured Article on English"
+ ],
+ [
+ "The Latin root of the Yola word \"gimlie\" shares a "
+ ],
+ [
+ "The attached file shows a list of books in the col"
+ ],
+ [
+ "According to Google Finance, when was the first ye"
+ ],
+ [
+ "Using bass clef notes, what is the age of someone "
+ ],
+ [
+ "On a leap day before the year 2008, a joke was rem"
+ ],
+ [
+ "On July 15, 2008, Phys.org published an article ab"
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "If there is anything that doesn't make sense in th"
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "The following numbers function similarly to ISBN 1"
+ ],
+ [
+ "In the year 2022, and before December, what does \""
+ ],
+ [
+ "What is the volume in milliliters of a system comp"
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "The attached file lists accommodations in the reso"
+ ],
+ [
+ "In the NIH translation of the original 1913 Michae"
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ],
+ [
+ "You are Van Helsing, a renowned vampire hunter. A "
+ ],
+ [
+ "Find the value of x to the nearest tenth: Lx = (d/"
+ ],
+ [
+ "You are a telecommunications engineer who wants to"
+ ],
+ [
+ "According to Box Office Mojo's 2020 Worldwide Box "
+ ],
+ [
+ "How many applicants for the job in the PDF are onl"
+ ],
+ [
+ "As of the 2020 census, what was the population dif"
+ ],
+ [
+ "The Metropolitan Museum of Art has a portrait in i"
+ ],
+ [
+ "How many slides in this PowerPoint presentation me"
+ ],
+ [
+ "This is a secret message my friend gave me. It say"
+ ],
+ [
+ "According to wikipedia, how many Asian countries s"
+ ],
+ [
+ "The work referenced in footnote 397 of Federico La"
+ ],
+ [
+ "I was referencing each of the tables in the file f"
+ ],
+ [
+ "In Nature journal's Scientific Reports conference "
+ ],
+ [
+ "The attached file shows the locomotives in the col"
+ ],
+ [
+ "How many nonindigenous crocodiles were found in Fl"
+ ],
+ [
+ "As a comma separated list with no whitespace, usin"
+ ],
+ [
+ "According to the World Bank, which countries had g"
+ ],
+ [
+ "The attached spreadsheet contains the sales of men"
+ ],
+ [
+ "Who composed the song that was performed by a roos"
+ ],
+ [
+ "I'm making a grocery list for my mom, but she's a "
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "In the 2018 VSCode blog post on replit.com, what w"
+ ],
+ [
+ "Look at the attached image. The quiz is scored as "
+ ],
+ [
+ "What writer is quoted by Merriam-Webster for the W"
+ ],
+ [
+ "Examine the video at https://www.youtube.com/watch"
+ ],
+ [
+ "Hi, I'm making a pie but I could use some help wit"
+ ],
+ [
+ "In the Scikit-Learn July 2017 changelog, what othe"
+ ],
+ [
+ "You are given this Excel file as a map. You start "
+ ],
+ [
+ "How many images are there in the latest 2022 Lego "
+ ],
+ [
+ "The attached image contains a Python script. Run t"
+ ],
+ [
+ "I thought we could try a fun word puzzle together "
+ ],
+ [
+ "On ScienceDirect, what is the difference to 3 deci"
+ ],
+ [
+ "What is the final numeric output from the attached"
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "How many more blocks (also denoted as layers) in B"
+ ],
+ [
+ "The longest-lived vertebrate is named after an isl"
+ ],
+ [
+ "On the DeepFruits fruit detection graph on Connect"
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ "The attached PDF lists accommodations in the resor"
+ ],
+ [
+ "This spreadsheet contains a list of clients for a "
+ ],
+ [
+ "How many times was a Twitter/X post cited as a ref"
+ ],
+ [
+ "During the first week of August 2015, one of the N"
+ ],
+ [
+ "What is the surname of the equine veterinarian men"
+ ],
+ [
+ "The YouTube channel Game Grumps began a Let’s Play"
+ ],
+ [
+ "What is the last word before the second chorus of "
+ ],
+ [
+ "Who did the actor who played Ray in the Polish-lan"
+ ],
+ [
+ "I have the Standard plan in the image below, and I"
+ ],
+ [
+ "In the endnote found in the second-to-last paragra"
+ ],
+ [
+ "The book with the doi 10.1353/book.24372 concerns "
+ ],
+ [
+ "Pull out the sentence in the following 5x7 block o"
+ ],
+ [
+ "What is the latest chronological year date written"
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ "Eva Draconis has a personal website which can be a"
+ ],
+ [
+ "How many at bats did the Yankee with the most walk"
+ ],
+ [
+ "According to Girls Who Code, how long did it take "
+ ],
+ [
+ "The attached spreadsheet contains a list of books "
+ ],
+ [
+ "How many pages if the 2023 IPCC report (85 pages v"
+ ],
+ [
+ "It's May 2023, and I'm about to drive across the U"
+ ],
+ [
+ "In Audre Lorde’s poem “Father Son and Holy Ghost”,"
+ ],
+ [
+ "On Cornell Law School website's legal information "
+ ],
+ [
+ "How many edits were made to the Wikipedia page on "
+ ],
+ [
+ "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n"
+ ],
+ [
+ "On the BBC Earth YouTube video of the Top 5 Sillie"
+ ],
+ [
+ "What is the absolute difference in tens of thousan"
+ ],
+ [
+ "The attached spreadsheet lists the locomotives own"
+ ],
+ [
+ "The attached file lists the locomotives owned by a"
+ ],
+ [
+ "I’m thinking about selling my home, so I want to l"
+ ],
+ [
+ "When was a picture of St. Thomas Aquinas first add"
+ ],
+ [
+ "As of August 2023, who is the only winner of the U"
+ ],
+ [
+ "Take the gender split from the 2011 Bulgarian cens"
+ ],
+ [
+ "All of the individuals who formally held the posit"
+ ],
+ [
+ "Hi, I was out sick from my classes on Friday, so I"
+ ],
+ [
+ "If this whole pint is made up of ice cream, how ma"
+ ],
+ [
+ "Which of the fruits shown in the 2008 painting \"Em"
+ ],
+ [
+ "What country had the least number of athletes at t"
+ ],
+ [
+ "In the YouTube 360 VR video from March 2018 narrat"
+ ],
+ [
+ "Which of the text elements under CATEGORIES in the"
+ ],
+ [
+ "Where were the Vietnamese specimens described by K"
+ ],
+ [
+ "The cover of the August 2021 issue of Vogue shows "
+ ],
+ [
+ "I'd like to learn more about some popular reality "
+ ],
+ [
+ "I read a paper about multiwavelength observations "
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
+ ],
+ [
+ "A standard Rubik’s cube has been broken into cubes"
+ ],
+ [
+ "According to the USGS, in what year was the Americ"
+ ],
+ [
+ "The attached Excel file contains the sales of menu"
+ ],
+ [
+ "I'm curious about how much information is availabl"
+ ],
+ [
+ "What percentage of the total penguin population ac"
+ ],
+ [
+ "As of May 2023, how many stops are between South S"
+ ],
+ [
+ "According to Openreview.net, at the NeurIPS 2022 C"
+ ],
+ [
+ "Of the cities within the United States where U.S. "
+ ],
+ [
+ "Who are the pitchers with the number before and af"
+ ],
+ [
+ "In the 2015 Metropolitan Museum of Art exhibition "
+ ],
+ [
+ "On June 6, 2023, an article by Carolyn Collins Pet"
+ ],
+ [
+ "What is the area of the green polygon in the attac"
+ ],
+ [
+ "What is the first name of the only Malko Competiti"
+ ],
+ [
+ "The brand that makes these harnesses the dogs are "
+ ],
+ [
+ "The year is 2022. I am at the National Air and Spa"
+ ],
+ [
+ "What was the actual enrollment count of the clinic"
+ ],
+ [
+ "What was the complete title of the book in which t"
+ ],
+ [
+ "Bob was invited to participate in a game show, and"
+ ],
+ [
+ "In NASA's Astronomy Picture of the Day on 2006 Jan"
+ ],
+ [
+ "At the two-minute mark in the YouTube video upload"
+ ],
+ [
+ "In the film Goldfinger, what color was the object "
+ ],
+ [
+ "A 5-man group made up of one tank, one healer, and"
+ ]
+ ],
+ "hovertemplate": "agent_name=code_o3-mini_03_february_remove-navigational
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_o3-mini_03_february_remove-navigational",
+ "line": {
+ "color": "#19d3f3",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_o3-mini_03_february_remove-navigational",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA",
+ "dtype": "i2"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVdU/AAAAAAAA0D+amZmZmZnJP1VVVVVVVcU/kiRJkiRJwj8AAAAAAADAPxzHcRzHccw/mpmZmZmZyT9GF1100UXHPwAAAAAAANA/FDuxEzux0z+SJEmSJEnSP1VVVVVVVdU/AAAAAAAA1D/T0tLS0tLSP3Icx3Ecx9E/XkN5DeU11D9mZmZmZmbWP1VVVVVVVdU/RhdddNFF1z9kIQtZyELWP1VVVVVVVdU/exSuR+F61D8UO7ETO7HTP2gvob2E9tI/kiRJkiRJ0j+WexphuafRPzMzMzMzM9M/lVJKKaWU0j8AAAAAAADSP2WTTTbZZNM/09LS0tLS0j+SJEmSJEnSP3Icx3Ecx9E/whT5rBtM0T9sKK+hvIbSP9IgDdIgDdI/mpmZmZmZ0T+7ErUrUbvSP5IkSZIkSdI/1pQ1ZU1Z0z9ddNFFF13UP5Q+6ZM+6dM/OL3pTW960z9MriAmVxDTP6uqqqqqqtI/kiRJkiRJ0j8zMzMzMzPTP9PS0tLS0tI/FDuxEzux0z/BeCv7HFLTP19CewntJdQ/yFOCPCXI0z/btm3btm3TP2cxncV0FtM/Ccs9jbDc0z/vy2MrgobTPzMzMzMzM9M/JkOwjjbv0z+llFJKKaXUP1VVVVVVVdU/AAAAAAAA1T+WWqmVWqnVP1VVVVVVVdU/F341JtID1T+mpaWlpaXVP1VVVVVVVdU/Fl/xFV/x1T9ItMLmQKLVP1VVVVVVVdU/r169evXq1T/yWTeYIp/VP1VVVVVVVdU/2FBeQ3kN1T/sHVMDe8fUP1VVVVVVVdU/ICod2TMQ1T/NzMzMzMzUPwaebs0Pi9Q/S9SuRO1K1D/6tYdcVgzUPyVJkiRJktQ/VFRUVFRU1D8GfUFf0BfUPwnLPY2w3NM/o4suuuii0z83talNbWrTP5Q+6ZM+6dM/VEZlVEZl1D9DFrKQhSzUP6WUUkoppdQ/Ut/ZqO9s1D8mTv2eW+LUP1VVVVVVVdU/Ffji6gcd1T85BS+n4OXUP1VVVVVVVdU/H4XrUbge1T/L8I0oMOnUP7W0tLS0tNQ/k/OyiE8g1T/FTuzETuzUP5VLuZRLudQ/E4y3ss8h1T9RGh+ZQO/UP1VVVVVVVdU/NFIxtzoj1T+HtW9Y+4bVP1VVVVVVVdU/SZIkSZIk1T/DSk8trPTUP1pMZzGdxdQ/SeXDuF+X1D/mnkZY7mnUP9RDPdRDPdQ/J3VfHlsR1D+U3W+U3W/UP0RERERERNQ/69khcGMZ1D8mQ7CONu/TP0vUrkTtStQ/IYQQQggh1D97FK5H4XrUPxRFURRFUdQ/CoVCoVAo1D8AAAAAAADUP/aEPWFP2NM/FDuxEzux0z+Hae6Cv4rTP+GDDz744NM/qzut7rS60z9+NSbSA5XTP/42xajhb9M/S0tLS0tL0z8xNguqPSfTPzDXDsy1A9M/UfxFzrDg0j8zMzMzMzPTP0yuICZXENM/K2wOJFph0z8UO7ETO7HTPwAAAAAAANQ/Ccs9jbDc0z+hQoUKFSrUPwJl4kr3BtQ/RT7rBlPk0z8M1XTMJcLTP6DTBjptoNM/n65P16fr0z+ivIbyGsrTP1T+qFP+qNM/zspPiLPy0z/SExw9wdHTPxQ7sRM7sdM/Qbgb+x6R0z/jJszvanHTP8F4K/scUtM/MzMzMzMz0z9Wigm6qxTTP2gvob2E9tI/n6lcd7zY0j+7ErUrUbvSP54S5ClBntI/",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "A paper about AI regulation that was originally su"
+ ],
+ [
+ "If we assume all articles published by Nature in 2"
+ ],
+ [
+ "In Unlambda, what exact charcter or text needs to "
+ ],
+ [
+ "I’m researching species that became invasive after"
+ ],
+ [
+ "The attached spreadsheet shows the inventory for a"
+ ],
+ [
+ "How many studio albums were published by Mercedes "
+ ],
+ [
+ "If Eliud Kipchoge could maintain his record-making"
+ ],
+ [
+ "The object in the British Museum's collection with"
+ ],
+ [
+ "According to github, when was Regression added to "
+ ],
+ [
+ "Here's a fun riddle that I think you'll enjoy.\n\nYo"
+ ],
+ [
+ "Using the Biopython library in Python, parse the P"
+ ],
+ [
+ "What are the EC numbers of the two most commonly u"
+ ],
+ [
+ "In July 2, 1959 United States standards for grades"
+ ],
+ [
+ "In April of 1977, who was the Prime Minister of th"
+ ],
+ [
+ "Use density measures from the chemistry materials "
+ ],
+ [
+ "What was the volume in m^3 of the fish bag that wa"
+ ],
+ [
+ "What is the average number of pre-2020 works on th"
+ ],
+ [
+ "In the video https://www.youtube.com/watch?v=L1vXC"
+ ],
+ [
+ "Of the authors (First M. Last) that worked on the "
+ ],
+ [
+ "When you take the average of the standard populati"
+ ],
+ [
+ "Assuming scientists in the famous youtube video Th"
+ ],
+ [
+ "In Series 9, Episode 11 of Doctor Who, the Doctor "
+ ],
+ [
+ "In terms of geographical distance between capital "
+ ],
+ [
+ "In the NCATS PubChem compound database for Food Ad"
+ ],
+ [
+ "I need to fact-check a citation. This is the citat"
+ ],
+ [
+ "Which contributor to the version of OpenCV where s"
+ ],
+ [
+ "What integer-rounded percentage of the total lengt"
+ ],
+ [
+ "An office held a Secret Santa gift exchange where "
+ ],
+ [
+ "What is the maximum length in meters of #9 in the "
+ ],
+ [
+ "What two-word type of model did Manash Pratim Kash"
+ ],
+ [
+ "What animals that were mentioned in both Ilias Lag"
+ ],
+ [
+ "How many High Energy Physics - Lattice articles li"
+ ],
+ [
+ "The photograph in the Whitney Museum of American A"
+ ],
+ [
+ ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
+ ],
+ [
+ "What is the minimum number of page links a person "
+ ],
+ [
+ "Each cell in the attached spreadsheet represents a"
+ ],
+ [
+ "Which of the text elements under CATEGORIES in the"
+ ],
+ [
+ "I went to Virtue restaurant & bar in Chicago for m"
+ ],
+ [
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) "
+ ],
+ [
+ "My family reunion is this week, and I was assigned"
+ ],
+ [
+ "In Emily Midkiff's June 2014 article in a journal "
+ ],
+ [
+ "It is 1999. Before you party like it is 1999, plea"
+ ],
+ [
+ "Under DDC 633 on Bielefeld University Library's BA"
+ ]
+ ],
+ "hovertemplate": "agent_name=code_qwen-coder-32B_03_february_text
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_qwen-coder-32B_03_february_text",
+ "line": {
+ "color": "#FF6692",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_qwen-coder-32B_03_february_text",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKg==",
+ "dtype": "i1"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVdU/AAAAAAAA0D+amZmZmZnZP1VVVVVVVdU/kiRJkiRJ0j8AAAAAAADQPxzHcRzHccw/mpmZmZmZyT9GF1100UXHP1VVVVVVVcU/FDuxEzuxwz+SJEmSJEnCP5qZmZmZmck/AAAAAAAA0D8eHh4eHh7OPxzHcRzHccw/KK+hvIbyyj+amZmZmZnJP57neZ7nec4/F1100UUXzT+96U1vetPLP6uqqqqqqso/mpmZmZmZyT/ZiZ3YiZ3IP0J7Ce0ltMc/t23btm3bxj98GmG5pxHGP1VVVVVVVcU/pZRSSimlxD8AAAAAAADEP2WTTTbZZMM/l5aWlpaWxj8WX/EVX/HFPzmO4ziO48g/doMp8lk3yD9DeQ3lNZTHPxqkQRqkQco/zczMzMzMzD8ZnI/B+RjMP9u2bdu2bcs/s6asKWvKyj8=",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "customdata": [
+ [
+ "The attached spreadsheet shows the inventory for a"
+ ]
+ ],
+ "hovertemplate": "agent_name=code_sonnet_03_february_goodoldtext-unbroken
index=%{x}
is_correct=%{y}
question=%{customdata[0]}",
+ "legendgroup": "code_sonnet_03_february_goodoldtext-unbroken",
+ "line": {
+ "color": "#B6E880",
+ "dash": "solid"
+ },
+ "marker": {
+ "symbol": "circle"
+ },
+ "mode": "lines",
+ "name": "code_sonnet_03_february_goodoldtext-unbroken",
+ "showlegend": true,
+ "type": "scattergl",
+ "x": {
+ "bdata": "AA==",
+ "dtype": "i1"
+ },
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAAAAA=",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ }
+ ],
+ "layout": {
+ "legend": {
+ "title": {
+ "text": "agent_name"
+ },
+ "tracegroupgap": 0
+ },
+ "margin": {
+ "t": 60
+ },
+ "template": {
+ "data": {
+ "bar": [
+ {
+ "error_x": {
+ "color": "#2a3f5f"
+ },
+ "error_y": {
+ "color": "#2a3f5f"
+ },
+ "marker": {
+ "line": {
+ "color": "#E5ECF6",
+ "width": 0.5
+ },
+ "pattern": {
+ "fillmode": "overlay",
+ "size": 10,
+ "solidity": 0.2
+ }
+ },
+ "type": "bar"
+ }
+ ],
+ "barpolar": [
+ {
+ "marker": {
+ "line": {
+ "color": "#E5ECF6",
+ "width": 0.5
+ },
+ "pattern": {
+ "fillmode": "overlay",
+ "size": 10,
+ "solidity": 0.2
+ }
+ },
+ "type": "barpolar"
+ }
+ ],
+ "carpet": [
+ {
+ "aaxis": {
+ "endlinecolor": "#2a3f5f",
+ "gridcolor": "white",
+ "linecolor": "white",
+ "minorgridcolor": "white",
+ "startlinecolor": "#2a3f5f"
+ },
+ "baxis": {
+ "endlinecolor": "#2a3f5f",
+ "gridcolor": "white",
+ "linecolor": "white",
+ "minorgridcolor": "white",
+ "startlinecolor": "#2a3f5f"
+ },
+ "type": "carpet"
+ }
+ ],
+ "choropleth": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "type": "choropleth"
+ }
+ ],
+ "contour": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "contour"
+ }
+ ],
+ "contourcarpet": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "type": "contourcarpet"
+ }
+ ],
+ "heatmap": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "heatmap"
+ }
+ ],
+ "histogram": [
+ {
+ "marker": {
+ "pattern": {
+ "fillmode": "overlay",
+ "size": 10,
+ "solidity": 0.2
+ }
+ },
+ "type": "histogram"
+ }
+ ],
+ "histogram2d": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "histogram2d"
+ }
+ ],
+ "histogram2dcontour": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "histogram2dcontour"
+ }
+ ],
+ "mesh3d": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "type": "mesh3d"
+ }
+ ],
+ "parcoords": [
+ {
+ "line": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "parcoords"
+ }
+ ],
+ "pie": [
+ {
+ "automargin": true,
+ "type": "pie"
+ }
+ ],
+ "scatter": [
+ {
+ "fillpattern": {
+ "fillmode": "overlay",
+ "size": 10,
+ "solidity": 0.2
+ },
+ "type": "scatter"
+ }
+ ],
+ "scatter3d": [
+ {
+ "line": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatter3d"
+ }
+ ],
+ "scattercarpet": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattercarpet"
+ }
+ ],
+ "scattergeo": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattergeo"
+ }
+ ],
+ "scattergl": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattergl"
+ }
+ ],
+ "scattermap": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattermap"
+ }
+ ],
+ "scattermapbox": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattermapbox"
+ }
+ ],
+ "scatterpolar": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatterpolar"
+ }
+ ],
+ "scatterpolargl": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatterpolargl"
+ }
+ ],
+ "scatterternary": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatterternary"
+ }
+ ],
+ "surface": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "surface"
+ }
+ ],
+ "table": [
+ {
+ "cells": {
+ "fill": {
+ "color": "#EBF0F8"
+ },
+ "line": {
+ "color": "white"
+ }
+ },
+ "header": {
+ "fill": {
+ "color": "#C8D4E3"
+ },
+ "line": {
+ "color": "white"
+ }
+ },
+ "type": "table"
+ }
+ ]
+ },
+ "layout": {
+ "annotationdefaults": {
+ "arrowcolor": "#2a3f5f",
+ "arrowhead": 0,
+ "arrowwidth": 1
+ },
+ "autotypenumbers": "strict",
+ "coloraxis": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "colorscale": {
+ "diverging": [
+ [
+ 0,
+ "#8e0152"
+ ],
+ [
+ 0.1,
+ "#c51b7d"
+ ],
+ [
+ 0.2,
+ "#de77ae"
+ ],
+ [
+ 0.3,
+ "#f1b6da"
+ ],
+ [
+ 0.4,
+ "#fde0ef"
+ ],
+ [
+ 0.5,
+ "#f7f7f7"
+ ],
+ [
+ 0.6,
+ "#e6f5d0"
+ ],
+ [
+ 0.7,
+ "#b8e186"
+ ],
+ [
+ 0.8,
+ "#7fbc41"
+ ],
+ [
+ 0.9,
+ "#4d9221"
+ ],
+ [
+ 1,
+ "#276419"
+ ]
+ ],
+ "sequential": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "sequentialminus": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ]
+ },
+ "colorway": [
+ "#636efa",
+ "#EF553B",
+ "#00cc96",
+ "#ab63fa",
+ "#FFA15A",
+ "#19d3f3",
+ "#FF6692",
+ "#B6E880",
+ "#FF97FF",
+ "#FECB52"
+ ],
+ "font": {
+ "color": "#2a3f5f"
+ },
+ "geo": {
+ "bgcolor": "white",
+ "lakecolor": "white",
+ "landcolor": "#E5ECF6",
+ "showlakes": true,
+ "showland": true,
+ "subunitcolor": "white"
+ },
+ "hoverlabel": {
+ "align": "left"
+ },
+ "hovermode": "closest",
+ "mapbox": {
+ "style": "light"
+ },
+ "paper_bgcolor": "white",
+ "plot_bgcolor": "#E5ECF6",
+ "polar": {
+ "angularaxis": {
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": ""
+ },
+ "bgcolor": "#E5ECF6",
+ "radialaxis": {
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": ""
+ }
+ },
+ "scene": {
+ "xaxis": {
+ "backgroundcolor": "#E5ECF6",
+ "gridcolor": "white",
+ "gridwidth": 2,
+ "linecolor": "white",
+ "showbackground": true,
+ "ticks": "",
+ "zerolinecolor": "white"
+ },
+ "yaxis": {
+ "backgroundcolor": "#E5ECF6",
+ "gridcolor": "white",
+ "gridwidth": 2,
+ "linecolor": "white",
+ "showbackground": true,
+ "ticks": "",
+ "zerolinecolor": "white"
+ },
+ "zaxis": {
+ "backgroundcolor": "#E5ECF6",
+ "gridcolor": "white",
+ "gridwidth": 2,
+ "linecolor": "white",
+ "showbackground": true,
+ "ticks": "",
+ "zerolinecolor": "white"
+ }
+ },
+ "shapedefaults": {
+ "line": {
+ "color": "#2a3f5f"
+ }
+ },
+ "ternary": {
+ "aaxis": {
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": ""
+ },
+ "baxis": {
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": ""
+ },
+ "bgcolor": "#E5ECF6",
+ "caxis": {
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": ""
+ }
+ },
+ "title": {
+ "x": 0.05
+ },
+ "xaxis": {
+ "automargin": true,
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": "",
+ "title": {
+ "standoff": 15
+ },
+ "zerolinecolor": "white",
+ "zerolinewidth": 2
+ },
+ "yaxis": {
+ "automargin": true,
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": "",
+ "title": {
+ "standoff": 15
+ },
+ "zerolinecolor": "white",
+ "zerolinewidth": 2
+ }
+ }
+ },
+ "xaxis": {
+ "anchor": "y",
+ "domain": [
+ 0,
+ 1
+ ],
+ "title": {
+ "text": "index"
+ }
+ },
+ "yaxis": {
+ "anchor": "x",
+ "domain": [
+ 0,
+ 1
+ ],
+ "title": {
+ "text": "is_correct"
+ }
+ }
+ }
+ }
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import plotly.express as px\n",
+ "\n",
+ "\n",
+ "cumulative_df = (\n",
+ " (\n",
+ " sel_df.groupby(\"agent_name\")[[\"is_correct\", \"is_near_correct\"]]\n",
+ " .expanding(min_periods=1, axis=0, method=\"single\")\n",
+ " .agg({\"is_correct\": \"mean\", \"is_near_correct\": \"count\"})\n",
+ " .reset_index()\n",
+ " )\n",
+ " .copy()\n",
+ " .rename(columns={\"is_near_correct\": \"index\"})\n",
+ ")\n",
+ "cumulative_df[\"index\"] = cumulative_df[\"index\"].astype(int) - 1\n",
+ "\n",
+ "\n",
+ "def find_question(row):\n",
+ " try:\n",
+ " res = sel_df.loc[sel_df[\"agent_name\"] == row[\"agent_name\"], \"question\"].iloc[row[\"index\"]][:50]\n",
+ " return res\n",
+ " except Exception:\n",
+ " return \"\"\n",
+ "\n",
+ "\n",
+ "cumulative_df[\"question\"] = cumulative_df.apply(find_question, axis=1)\n",
+ "# cumulative_df[\"question\"] = [el[:50] for el in sel_df[\"question\"].values]\n",
+ "\n",
+ "# cumulative_df[\"is_correct\"] = cumulative_df[\"is_correct\"] * (165 - 68) / 165\n",
+ "\n",
+ "px.line(\n",
+ " cumulative_df,\n",
+ " color=\"agent_name\",\n",
+ " x=\"index\",\n",
+ " y=\"is_correct\",\n",
+ " hover_data=\"question\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 3. Dive deeper into one run"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "165\n"
+ ]
+ }
+ ],
+ "source": [
+ "sel_df = result_df.loc[result_df[\"agent_name\"] == o1]\n",
+ "print(len(sel_df))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Count errors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_94354/2022001392.py:10: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ "\n",
+ "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_94354/2022001392.py:10: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ "\n",
+ "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_94354/2022001392.py:10: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ "\n",
+ "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_94354/2022001392.py:10: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ "\n",
+ "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_94354/2022001392.py:11: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "\n",
+ "error_types = [\n",
+ " \"AgentParsingError\",\n",
+ " \"AgentExecutionError\",\n",
+ " \"AgentMaxIterationsError\",\n",
+ " \"AgentGenerationError\",\n",
+ "]\n",
+ "sel_df[error_types] = 0\n",
+ "sel_df[\"Count steps\"] = np.nan\n",
+ "\n",
+ "\n",
+ "def count_errors(row):\n",
+ " if isinstance(row[\"intermediate_steps\"], list):\n",
+ " row[\"Count steps\"] = len(row[\"intermediate_steps\"])\n",
+ " for step in row[\"intermediate_steps\"]:\n",
+ " if isinstance(step, dict) and \"error\" in step:\n",
+ " try:\n",
+ " row[str(step[\"error\"][\"error_type\"])] += 1\n",
+ " except Exception:\n",
+ " pass\n",
+ " return row\n",
+ "\n",
+ "\n",
+ "sel_df = sel_df.apply(count_errors, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.plotly.v1+json": {
+ "config": {
+ "plotlyServerURL": "https://plot.ly"
+ },
+ "data": [
+ {
+ "hovertemplate": "is_correct=False
variable=%{x}
Average count=%{y}",
+ "legendgroup": "False",
+ "marker": {
+ "color": "#636efa",
+ "pattern": {
+ "shape": ""
+ }
+ },
+ "name": "False",
+ "orientation": "v",
+ "showlegend": true,
+ "textposition": "outside",
+ "type": "bar",
+ "x": [
+ "AgentParsingError",
+ "AgentExecutionError",
+ "AgentMaxIterationsError",
+ "AgentGenerationError",
+ "Count steps"
+ ],
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACSJEmSJEkMQA==",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ },
+ {
+ "hovertemplate": "is_correct=True
variable=%{x}
Average count=%{y}",
+ "legendgroup": "True",
+ "marker": {
+ "color": "#EF553B",
+ "pattern": {
+ "shape": ""
+ }
+ },
+ "name": "True",
+ "orientation": "v",
+ "showlegend": true,
+ "textposition": "outside",
+ "type": "bar",
+ "x": [
+ "AgentParsingError",
+ "AgentExecutionError",
+ "AgentMaxIterationsError",
+ "AgentGenerationError",
+ "Count steps"
+ ],
+ "xaxis": "x",
+ "y": {
+ "bdata": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABPt+aHRcoIQA==",
+ "dtype": "f8"
+ },
+ "yaxis": "y"
+ }
+ ],
+ "layout": {
+ "bargroupgap": 0,
+ "barmode": "group",
+ "height": 500,
+ "legend": {
+ "title": {
+ "text": "is_correct"
+ },
+ "tracegroupgap": 0
+ },
+ "margin": {
+ "t": 60
+ },
+ "template": {
+ "data": {
+ "bar": [
+ {
+ "error_x": {
+ "color": "#2a3f5f"
+ },
+ "error_y": {
+ "color": "#2a3f5f"
+ },
+ "marker": {
+ "line": {
+ "color": "#E5ECF6",
+ "width": 0.5
+ },
+ "pattern": {
+ "fillmode": "overlay",
+ "size": 10,
+ "solidity": 0.2
+ }
+ },
+ "type": "bar"
+ }
+ ],
+ "barpolar": [
+ {
+ "marker": {
+ "line": {
+ "color": "#E5ECF6",
+ "width": 0.5
+ },
+ "pattern": {
+ "fillmode": "overlay",
+ "size": 10,
+ "solidity": 0.2
+ }
+ },
+ "type": "barpolar"
+ }
+ ],
+ "carpet": [
+ {
+ "aaxis": {
+ "endlinecolor": "#2a3f5f",
+ "gridcolor": "white",
+ "linecolor": "white",
+ "minorgridcolor": "white",
+ "startlinecolor": "#2a3f5f"
+ },
+ "baxis": {
+ "endlinecolor": "#2a3f5f",
+ "gridcolor": "white",
+ "linecolor": "white",
+ "minorgridcolor": "white",
+ "startlinecolor": "#2a3f5f"
+ },
+ "type": "carpet"
+ }
+ ],
+ "choropleth": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "type": "choropleth"
+ }
+ ],
+ "contour": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "contour"
+ }
+ ],
+ "contourcarpet": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "type": "contourcarpet"
+ }
+ ],
+ "heatmap": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "heatmap"
+ }
+ ],
+ "histogram": [
+ {
+ "marker": {
+ "pattern": {
+ "fillmode": "overlay",
+ "size": 10,
+ "solidity": 0.2
+ }
+ },
+ "type": "histogram"
+ }
+ ],
+ "histogram2d": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "histogram2d"
+ }
+ ],
+ "histogram2dcontour": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "histogram2dcontour"
+ }
+ ],
+ "mesh3d": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "type": "mesh3d"
+ }
+ ],
+ "parcoords": [
+ {
+ "line": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "parcoords"
+ }
+ ],
+ "pie": [
+ {
+ "automargin": true,
+ "type": "pie"
+ }
+ ],
+ "scatter": [
+ {
+ "fillpattern": {
+ "fillmode": "overlay",
+ "size": 10,
+ "solidity": 0.2
+ },
+ "type": "scatter"
+ }
+ ],
+ "scatter3d": [
+ {
+ "line": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatter3d"
+ }
+ ],
+ "scattercarpet": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattercarpet"
+ }
+ ],
+ "scattergeo": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattergeo"
+ }
+ ],
+ "scattergl": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattergl"
+ }
+ ],
+ "scattermap": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattermap"
+ }
+ ],
+ "scattermapbox": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattermapbox"
+ }
+ ],
+ "scatterpolar": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatterpolar"
+ }
+ ],
+ "scatterpolargl": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatterpolargl"
+ }
+ ],
+ "scatterternary": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatterternary"
+ }
+ ],
+ "surface": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "surface"
+ }
+ ],
+ "table": [
+ {
+ "cells": {
+ "fill": {
+ "color": "#EBF0F8"
+ },
+ "line": {
+ "color": "white"
+ }
+ },
+ "header": {
+ "fill": {
+ "color": "#C8D4E3"
+ },
+ "line": {
+ "color": "white"
+ }
+ },
+ "type": "table"
+ }
+ ]
+ },
+ "layout": {
+ "annotationdefaults": {
+ "arrowcolor": "#2a3f5f",
+ "arrowhead": 0,
+ "arrowwidth": 1
+ },
+ "autotypenumbers": "strict",
+ "coloraxis": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "colorscale": {
+ "diverging": [
+ [
+ 0,
+ "#8e0152"
+ ],
+ [
+ 0.1,
+ "#c51b7d"
+ ],
+ [
+ 0.2,
+ "#de77ae"
+ ],
+ [
+ 0.3,
+ "#f1b6da"
+ ],
+ [
+ 0.4,
+ "#fde0ef"
+ ],
+ [
+ 0.5,
+ "#f7f7f7"
+ ],
+ [
+ 0.6,
+ "#e6f5d0"
+ ],
+ [
+ 0.7,
+ "#b8e186"
+ ],
+ [
+ 0.8,
+ "#7fbc41"
+ ],
+ [
+ 0.9,
+ "#4d9221"
+ ],
+ [
+ 1,
+ "#276419"
+ ]
+ ],
+ "sequential": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "sequentialminus": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ]
+ },
+ "colorway": [
+ "#636efa",
+ "#EF553B",
+ "#00cc96",
+ "#ab63fa",
+ "#FFA15A",
+ "#19d3f3",
+ "#FF6692",
+ "#B6E880",
+ "#FF97FF",
+ "#FECB52"
+ ],
+ "font": {
+ "color": "#2a3f5f"
+ },
+ "geo": {
+ "bgcolor": "white",
+ "lakecolor": "white",
+ "landcolor": "#E5ECF6",
+ "showlakes": true,
+ "showland": true,
+ "subunitcolor": "white"
+ },
+ "hoverlabel": {
+ "align": "left"
+ },
+ "hovermode": "closest",
+ "mapbox": {
+ "style": "light"
+ },
+ "paper_bgcolor": "white",
+ "plot_bgcolor": "#E5ECF6",
+ "polar": {
+ "angularaxis": {
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": ""
+ },
+ "bgcolor": "#E5ECF6",
+ "radialaxis": {
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": ""
+ }
+ },
+ "scene": {
+ "xaxis": {
+ "backgroundcolor": "#E5ECF6",
+ "gridcolor": "white",
+ "gridwidth": 2,
+ "linecolor": "white",
+ "showbackground": true,
+ "ticks": "",
+ "zerolinecolor": "white"
+ },
+ "yaxis": {
+ "backgroundcolor": "#E5ECF6",
+ "gridcolor": "white",
+ "gridwidth": 2,
+ "linecolor": "white",
+ "showbackground": true,
+ "ticks": "",
+ "zerolinecolor": "white"
+ },
+ "zaxis": {
+ "backgroundcolor": "#E5ECF6",
+ "gridcolor": "white",
+ "gridwidth": 2,
+ "linecolor": "white",
+ "showbackground": true,
+ "ticks": "",
+ "zerolinecolor": "white"
+ }
+ },
+ "shapedefaults": {
+ "line": {
+ "color": "#2a3f5f"
+ }
+ },
+ "ternary": {
+ "aaxis": {
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": ""
+ },
+ "baxis": {
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": ""
+ },
+ "bgcolor": "#E5ECF6",
+ "caxis": {
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": ""
+ }
+ },
+ "title": {
+ "x": 0.05
+ },
+ "xaxis": {
+ "automargin": true,
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": "",
+ "title": {
+ "standoff": 15
+ },
+ "zerolinecolor": "white",
+ "zerolinewidth": 2
+ },
+ "yaxis": {
+ "automargin": true,
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": "",
+ "title": {
+ "standoff": 15
+ },
+ "zerolinecolor": "white",
+ "zerolinewidth": 2
+ }
+ }
+ },
+ "width": 800,
+ "xaxis": {
+ "anchor": "y",
+ "domain": [
+ 0,
+ 1
+ ],
+ "title": {
+ "text": "variable"
+ }
+ },
+ "yaxis": {
+ "anchor": "x",
+ "domain": [
+ 0,
+ 1
+ ],
+ "title": {
+ "text": "Average count"
+ }
+ }
+ }
+ }
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import plotly.express as px\n",
+ "\n",
+ "\n",
+ "aggregate_errors = (\n",
+ " sel_df.groupby([\"is_correct\"])[error_types + [\"Count steps\"]].mean().reset_index().melt(id_vars=[\"is_correct\"])\n",
+ ")\n",
+ "\n",
+ "fig = px.bar(\n",
+ " aggregate_errors,\n",
+ " y=\"value\",\n",
+ " x=\"variable\",\n",
+ " color=\"is_correct\",\n",
+ " labels={\n",
+ " \"agent_name\": \"Model\",\n",
+ " \"task\": \"Level\",\n",
+ " \"aggregate_score\": \"Performance\",\n",
+ " \"value\": \"Average count\",\n",
+ " \"eval_score_GPT4\": \"Score\",\n",
+ " },\n",
+ ")\n",
+ "fig.update_layout(\n",
+ " height=500,\n",
+ " width=800,\n",
+ " barmode=\"group\",\n",
+ " bargroupgap=0.0,\n",
+ ")\n",
+ "fig.update_traces(textposition=\"outside\")\n",
+ "fig.write_image(\"aggregate_errors.png\", scale=3)\n",
+ "fig.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Inspect result by file extension type"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " is_correct | \n",
+ " count_steps | \n",
+ " question | \n",
+ "
\n",
+ " \n",
+ " agent_name | \n",
+ " attachment_type | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " code_o1_01_february_text | \n",
+ " None | \n",
+ " 0.496063 | \n",
+ " 3.362205 | \n",
+ " 127 | \n",
+ "
\n",
+ " \n",
+ " csv | \n",
+ " 0.000000 | \n",
+ " 7.000000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " docx | \n",
+ " 1.000000 | \n",
+ " 3.000000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " jpg | \n",
+ " 0.000000 | \n",
+ " 3.000000 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " jsonld | \n",
+ " 0.000000 | \n",
+ " 8.000000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " mp3 | \n",
+ " 0.333333 | \n",
+ " 2.333333 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " pdb | \n",
+ " 0.000000 | \n",
+ " 4.000000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " pdf | \n",
+ " 0.666667 | \n",
+ " 2.666667 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " png | \n",
+ " 0.250000 | \n",
+ " 2.375000 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " pptx | \n",
+ " 1.000000 | \n",
+ " 3.000000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " py | \n",
+ " 1.000000 | \n",
+ " 3.000000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " txt | \n",
+ " 1.000000 | \n",
+ " 4.000000 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " xlsx | \n",
+ " 0.615385 | \n",
+ " 3.153846 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " zip | \n",
+ " 0.500000 | \n",
+ " 4.000000 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " is_correct count_steps question\n",
+ "agent_name attachment_type \n",
+ "code_o1_01_february_text None 0.496063 3.362205 127\n",
+ " csv 0.000000 7.000000 1\n",
+ " docx 1.000000 3.000000 1\n",
+ " jpg 0.000000 3.000000 2\n",
+ " jsonld 0.000000 8.000000 1\n",
+ " mp3 0.333333 2.333333 3\n",
+ " pdb 0.000000 4.000000 1\n",
+ " pdf 0.666667 2.666667 3\n",
+ " png 0.250000 2.375000 8\n",
+ " pptx 1.000000 3.000000 1\n",
+ " py 1.000000 3.000000 1\n",
+ " txt 1.000000 4.000000 1\n",
+ " xlsx 0.615385 3.153846 13\n",
+ " zip 0.500000 4.000000 2"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "display(\n",
+ " sel_df.groupby([\"agent_name\", \"attachment_type\"])[[\"is_correct\", \"count_steps\", \"question\"]].agg(\n",
+ " {\"is_correct\": \"mean\", \"count_steps\": \"mean\", \"question\": \"count\"}\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 4. Ensembling methods"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "counts = result_df[\"agent_name\"].value_counts()\n",
+ "long_series = result_df.loc[result_df[\"agent_name\"].isin(counts[counts > 140].index)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "agent_name\n",
+ "code_gpt4o_03_february_goodoldtext-unbroken 38.36\n",
+ "code_gpt4o_03_february_magenticbrowser 35.22\n",
+ "code_gpt4o_03_february_magenticbrowser2 36.54\n",
+ "code_gpt4o_03_february_text 37.58\n",
+ "code_o1_01_february_text 49.09\n",
+ "code_o1_03_february_fix-print-outputs 51.83\n",
+ "code_o1_03_february_fix-print-outputs2 52.56\n",
+ "code_o1_03_february_goodoldtext-unbroken 53.42\n",
+ "code_o1_03_february_remove-navigational 53.66\n",
+ "code_o1_03_february_text_high-reasoning-effort 48.48\n",
+ "code_o3-mini_03_february_remove-navigational 29.09\n",
+ "Name: is_correct, dtype: float64"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Majority score: 58.18\n",
+ "Oracle score: 70.91\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_94354/2283375871.py:25: DeprecationWarning:\n",
+ "\n",
+ "DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "def majority_vote(df):\n",
+ " df = df[(df[\"prediction\"] != \"Unable to determine\") & (~df[\"prediction\"].isna()) & (df[\"prediction\"] != \"None\")]\n",
+ "\n",
+ " # First get the mode (most common answer) for each question\n",
+ " answer_modes = df.groupby(\"question\")[\"prediction\"].agg(lambda x: x.mode()[0]).reset_index()\n",
+ "\n",
+ " # For each question-answer pair, get the first occurrence's task and is_correct\n",
+ " first_occurrences = (\n",
+ " df.groupby([\"question\", \"prediction\"]).agg({\"task\": \"first\", \"is_correct\": \"first\"}).reset_index()\n",
+ " )\n",
+ "\n",
+ " # Merge the mode answers with their corresponding first occurrences\n",
+ " result = answer_modes.merge(first_occurrences, on=[\"question\", \"prediction\"], how=\"left\")\n",
+ "\n",
+ " return result\n",
+ "\n",
+ "\n",
+ "def oracle(df):\n",
+ " def get_first_correct_or_first_wrong(group):\n",
+ " correct_answers = group[group[\"is_correct\"]]\n",
+ " if len(correct_answers) > 0:\n",
+ " return correct_answers.iloc[0]\n",
+ " return group.iloc[0]\n",
+ "\n",
+ " result = df.groupby(\"question\").apply(get_first_correct_or_first_wrong)\n",
+ "\n",
+ " return result.reset_index(drop=True)\n",
+ "\n",
+ "\n",
+ "display((long_series.groupby(\"agent_name\")[\"is_correct\"].mean() * 100).round(2))\n",
+ "print(f\"Majority score: {majority_vote(long_series)['is_correct'].mean() * 100:.2f}\")\n",
+ "print(f\"Oracle score: {oracle(long_series)['is_correct'].mean() * 100:.2f}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "gaia",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/open_deep_research/requirements.txt b/examples/open_deep_research/requirements.txt
new file mode 100644
index 000000000..a18936ae4
--- /dev/null
+++ b/examples/open_deep_research/requirements.txt
@@ -0,0 +1,39 @@
+anthropic>=0.37.1
+beautifulsoup4>=4.12.3
+datasets>=2.21.0
+google_search_results>=2.4.2
+huggingface_hub>=0.23.4
+mammoth>=1.8.0
+markdownify>=0.13.1
+numexpr>=2.10.1
+numpy>=2.1.2
+openai>=1.52.2
+openpyxl
+pandas>=2.2.3
+pathvalidate>=3.2.1
+pdfminer>=20191125
+pdfminer.six>=20240706
+Pillow>=11.0.0
+puremagic>=1.28
+pypdf>=5.1.0
+python-dotenv>=1.0.1
+python_pptx>=1.0.2
+Requests>=2.32.3
+serpapi>=0.1.5
+tqdm>=4.66.4
+torch>=2.2.2
+torchvision>=0.17.2
+transformers>=4.46.0
+youtube_transcript_api>=0.6.2
+chess
+sympy
+pubchempy
+Bio
+scikit-learn
+scipy
+pydub
+PyPDF2
+python-pptx
+torch
+xlrd
+SpeechRecognition
\ No newline at end of file
diff --git a/examples/open_deep_research/run.py b/examples/open_deep_research/run.py
new file mode 100644
index 000000000..7e1231136
--- /dev/null
+++ b/examples/open_deep_research/run.py
@@ -0,0 +1,300 @@
+import argparse
+import json
+import os
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime
+from pathlib import Path
+from typing import List
+
+import datasets
+import pandas as pd
+from dotenv import load_dotenv
+from huggingface_hub import login
+from scripts.reformulator import prepare_response
+from scripts.run_agents import (
+ get_single_file_description,
+ get_zip_description,
+)
+from scripts.text_inspector_tool import TextInspectorTool
+from scripts.text_web_browser import (
+ ArchiveSearchTool,
+ FinderTool,
+ FindNextTool,
+ PageDownTool,
+ PageUpTool,
+ SearchInformationTool,
+ SimpleTextBrowser,
+ VisitTool,
+)
+from scripts.visual_qa import visualizer
+from tqdm import tqdm
+
+from smolagents import (
+ MANAGED_AGENT_PROMPT,
+ CodeAgent,
+ # HfApiModel,
+ LiteLLMModel,
+ Model,
+ ToolCallingAgent,
+)
+
+
+# Module names handed to the manager CodeAgent through `additional_authorized_imports`
+# (see create_agent_hierarchy).
+AUTHORIZED_IMPORTS = [
+    "requests",
+    "zipfile",
+    "os",
+    "pandas",
+    "numpy",
+    "sympy",
+    "json",
+    "bs4",
+    "pubchempy",
+    "xml",
+    "yahoo_finance",
+    "Bio",
+    "sklearn",
+    "scipy",
+    "pydub",
+    "io",
+    "PIL",
+    "chess",
+    "PyPDF2",
+    "pptx",
+    "torch",
+    "datetime",
+    "fractions",
+    "csv",
+]
+# Load variables from a .env file; override=True lets .env values replace variables
+# that are already set in the process environment.
+load_dotenv(override=True)
+# Log in to the Hugging Face Hub with HF_TOKEN (os.getenv yields None when it is unset).
+login(os.getenv("HF_TOKEN"))
+
+# Held by append_answer while it writes, so concurrent callers do not interleave lines.
+append_answer_lock = threading.Lock()
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--concurrency", type=int, default=8)
+ parser.add_argument("--model-id", type=str, default="o1")
+ parser.add_argument("--api-base", type=str, default=None)
+ parser.add_argument("--run-name", type=str, required=True)
+ return parser.parse_args()
+
+
+### IMPORTANT: EVALUATION SWITCHES
+
+# Printed at import time as a reminder to the operator.
+print("Make sure you deactivated Tailscale VPN, else some URLs will be blocked!")
+
+# NOTE(review): not referenced in the visible part of this file — confirm it is still used.
+USE_OPEN_MODELS = False
+
+# GAIA split to evaluate; also used by preprocess_file_paths to build attachment paths.
+SET = "validation"
+
+# Role remapping passed to the model constructor in answer_single_question.
+custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"}
+
+### LOAD EVALUATION DATASET
+
+eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")[SET]
+# Rename the GAIA columns to the names this script uses (question / true_answer / task).
+eval_ds = eval_ds.rename_columns({"Question": "question", "Final answer": "true_answer", "Level": "task"})
+
+
+def preprocess_file_paths(row):
+ if len(row["file_name"]) > 0:
+ row["file_name"] = f"data/gaia/{SET}/" + row["file_name"]
+ return row
+
+
+# Rewrite attachment names into local paths (see preprocess_file_paths).
+eval_ds = eval_ds.map(preprocess_file_paths)
+eval_df = pd.DataFrame(eval_ds)
+print("Loaded evaluation dataset:")
+# Number of questions per `task` value (the renamed GAIA "Level" column).
+print(eval_df["task"].value_counts())
+
+# Browser-like User-Agent string placed in the request headers of BROWSER_CONFIG.
+user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
+
+# Keyword arguments unpacked into SimpleTextBrowser in create_agent_hierarchy.
+BROWSER_CONFIG = {
+    "viewport_size": 1024 * 5,
+    "downloads_folder": "downloads_folder",
+    "request_kwargs": {
+        "headers": {"User-Agent": user_agent},
+        "timeout": 300,
+    },
+    "serpapi_key": os.getenv("SERPAPI_API_KEY"),  # None when the variable is unset
+}
+
+# Create the downloads folder up front, relative to the current working directory.
+os.makedirs(f"./{BROWSER_CONFIG['downloads_folder']}", exist_ok=True)
+
+
+def create_agent_hierarchy(model: Model):
+ text_limit = 100000
+ ti_tool = TextInspectorTool(model, text_limit)
+
+ browser = SimpleTextBrowser(**BROWSER_CONFIG)
+
+ WEB_TOOLS = [
+ SearchInformationTool(browser),
+ VisitTool(browser),
+ PageUpTool(browser),
+ PageDownTool(browser),
+ FinderTool(browser),
+ FindNextTool(browser),
+ ArchiveSearchTool(browser),
+ TextInspectorTool(model, text_limit),
+ ]
+ text_webbrowser_agent = ToolCallingAgent(
+ model=model,
+ tools=WEB_TOOLS,
+ max_steps=20,
+ verbosity_level=2,
+ planning_interval=4,
+ name="search_agent",
+ description="""A team member that will search the internet to answer your question.
+ Ask him for all your questions that require browsing the web.
+ Provide him as much context as possible, in particular if you need to search on a specific timeframe!
+ And don't hesitate to provide him with a complex search task, like finding a difference between two webpages.
+ Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords.
+ """,
+ provide_run_summary=True,
+ managed_agent_prompt=MANAGED_AGENT_PROMPT
+ + """You can navigate to .txt online files.
+ If a non-html page is in another format, especially .pdf or a Youtube video, use tool 'inspect_file_as_text' to inspect it.
+ Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information.""",
+ )
+
+ manager_agent = CodeAgent(
+ model=model,
+ tools=[visualizer, ti_tool],
+ max_steps=12,
+ verbosity_level=2,
+ additional_authorized_imports=AUTHORIZED_IMPORTS,
+ planning_interval=4,
+ managed_agents=[text_webbrowser_agent],
+ )
+ return manager_agent
+
+
+def append_answer(entry: dict, jsonl_file: str) -> None:
+ jsonl_file = Path(jsonl_file)
+ jsonl_file.parent.mkdir(parents=True, exist_ok=True)
+ with append_answer_lock, open(jsonl_file, "a", encoding="utf-8") as fp:
+ fp.write(json.dumps(entry) + "\n")
+ assert os.path.exists(jsonl_file), "File not found!"
+ print("Answer exported to file:", jsonl_file.resolve())
+
+
+def answer_single_question(example, model_id, answers_file, visual_inspection_tool):
+ model = LiteLLMModel(
+ model_id,
+ custom_role_conversions=custom_role_conversions,
+ max_completion_tokens=8192,
+ reasoning_effort="high",
+ )
+ # model = HfApiModel("Qwen/Qwen2.5-72B-Instruct", provider="together")
+ # "https://lnxyuvj02bpe6mam.us-east-1.aws.endpoints.huggingface.cloud",
+ # custom_role_conversions=custom_role_conversions,
+ # # provider="sambanova",
+ # max_tokens=8096,
+ # )
+ document_inspection_tool = TextInspectorTool(model, 100000)
+
+ agent = create_agent_hierarchy(model)
+
+ augmented_question = """You have one question to answer. It is paramount that you provide a correct answer.
+Give it all you can: I know for a fact that you have access to all the relevant tools to solve it and find the correct answer (the answer does exist). Failure or 'I cannot answer' or 'None found' will not be tolerated, success will be rewarded.
+Run verification steps if that's needed, you must make sure you find the correct answer!
+Here is the task:
+""" + example["question"]
+
+ if example["file_name"]:
+ if ".zip" in example["file_name"]:
+ prompt_use_files = "\n\nTo solve the task above, you will have to use these attached files:\n"
+ prompt_use_files += get_zip_description(
+ example["file_name"], example["question"], visual_inspection_tool, document_inspection_tool
+ )
+ else:
+ prompt_use_files = "\n\nTo solve the task above, you will have to use this attached file:"
+ prompt_use_files += get_single_file_description(
+ example["file_name"], example["question"], visual_inspection_tool, document_inspection_tool
+ )
+ augmented_question += prompt_use_files
+
+ start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+ try:
+ # Run agent 🚀
+ final_result = agent.run(augmented_question)
+
+ agent_memory = agent.write_memory_to_messages(summary_mode=True)
+
+ final_result = prepare_response(augmented_question, agent_memory, reformulation_model=model)
+
+ output = str(final_result)
+ for memory_step in agent.memory.steps:
+ memory_step.model_input_messages = None
+ intermediate_steps = [str(step) for step in agent.memory.steps]
+
+ # Check for parsing errors which indicate the LLM failed to follow the required format
+ parsing_error = True if any(["AgentParsingError" in step for step in intermediate_steps]) else False
+
+ # check if iteration limit exceeded
+ iteration_limit_exceeded = True if "Agent stopped due to iteration limit or time limit." in output else False
+ raised_exception = False
+
+ except Exception as e:
+ print("Error on ", augmented_question, e)
+ output = None
+ intermediate_steps = []
+ parsing_error = False
+ iteration_limit_exceeded = False
+ exception = e
+ raised_exception = True
+ end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+ annotated_example = {
+ "agent_name": model.model_id,
+ "question": example["question"],
+ "augmented_question": augmented_question,
+ "prediction": output,
+ "intermediate_steps": intermediate_steps,
+ "parsing_error": parsing_error,
+ "iteration_limit_exceeded": iteration_limit_exceeded,
+ "agent_error": str(exception) if raised_exception else None,
+ "start_time": start_time,
+ "end_time": end_time,
+ "task": example["task"],
+ "task_id": example["task_id"],
+ "true_answer": example["true_answer"],
+ }
+ append_answer(annotated_example, answers_file)
+
+
+def get_examples_to_answer(answers_file, eval_ds) -> List[dict]:
+ print(f"Loading answers from {answers_file}...")
+ try:
+ done_questions = pd.read_json(answers_file, lines=True)["question"].tolist()
+ print(f"Found {len(done_questions)} previous results!")
+ except Exception as e:
+ print("Error when loading records: ", e)
+ print("No usable records! ▶️ Starting new.")
+ done_questions = []
+ return [line for line in eval_ds.to_list() if line["question"] not in done_questions]
+
+
+def main():
+ args = parse_args()
+ print(f"Starting run with arguments: {args}")
+
+ answers_file = f"output/{SET}/{args.run_name}.jsonl"
+ tasks_to_run = get_examples_to_answer(answers_file, eval_ds)
+
+ with ThreadPoolExecutor(max_workers=args.concurrency) as exe:
+ futures = [
+ exe.submit(answer_single_question, example, args.model_id, answers_file, visualizer)
+ for example in tasks_to_run
+ ]
+ for f in tqdm(as_completed(futures), total=len(tasks_to_run), desc="Processing tasks"):
+ f.result()
+
+ # for example in tasks_to_run:
+ # answer_single_question(example, args.model_id, answers_file, visualizer)
+ print("All tasks processed.")
+
+
+# Start the evaluation only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+ main()
diff --git a/examples/open_deep_research/scripts/cookies.py b/examples/open_deep_research/scripts/cookies.py
new file mode 100644
index 000000000..8e4233356
--- /dev/null
+++ b/examples/open_deep_research/scripts/cookies.py
@@ -0,0 +1,715 @@
+from requests.cookies import RequestsCookieJar
+
+
+COOKIES_LIST = [
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1718884961,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "ST-xuwub9",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "session_logininfo=AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0%3AQUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1753004444.745411,
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "__Secure-YEC",
+ "path": "/",
+ "sameSite": "lax",
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "CgtRVnI5LW1zRHlQVSjbtNCzBjIhCgJGUhIbEhcSFRMLFBUWFwwYGRobHB0eHw4PIBAREiAk",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1753434620.050824,
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "__Secure-3PSID",
+ "path": "/",
+ "sameSite": "no_restriction",
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB4ezJ_bdWu46a7YwObVn44wACgYKAakSARQSFQHGX2MicJcTzecTKH6bHzqU6TMbTxoVAUF8yKqQYK-MoI6Ql3vI2oYTB3E-0076",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1750420959.974642,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "SIDCC",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "AKEyXzWQZauHKOo8t87zoEcjaVNIYUX54ohoWXT-tX4aAhEuZzIIptxZAcNkHuG2oDXYL6t-lw",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1753434620.050652,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "SID",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB6VHrZcC3gBAsFPbCQ0gF5AACgYKAYkSARQSFQHGX2Mi9kt0gHg5CxCYSkLQGHWaeBoVAUF8yKre_V6r3jZVak6JV4o2Q0FL0076",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1750420958.397534,
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "__Secure-1PSIDTS",
+ "path": "/",
+ "sameSite": None,
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1753433494.44729,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "_ga_M0180HEFCY",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "GS1.1.1718871908.1.0.1718873494.0.0.0",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1753434620.050933,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "SAPISID",
+ "path": "/",
+ "sameSite": None,
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1750420959.974764,
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "__Secure-1PSIDCC",
+ "path": "/",
+ "sameSite": None,
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "AKEyXzWHDSoXGCZpZhPxRrnC7B1s8zGIUjeMVyvgtQfsm1fs92lXPtFEI_td9LBUyqVUe0xK",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1753434620.050881,
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "SSID",
+ "path": "/",
+ "sameSite": None,
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "AmlwXHnQvOQ10LVd-",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1753434620.050959,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "__Secure-1PAPISID",
+ "path": "/",
+ "sameSite": None,
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1753434620.050795,
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "__Secure-1PSID",
+ "path": "/",
+ "sameSite": None,
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBBrlk7lRpKQGywAHEon7WGQAACgYKAQsSARQSFQHGX2MirAmnSRdZl6GPG6KLd4hOihoVAUF8yKoV17Tcj1a_OenIOkf2wBjO0076",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1753434620.050993,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "__Secure-3PAPISID",
+ "path": "/",
+ "sameSite": "no_restriction",
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1750420959.974815,
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "__Secure-3PSIDCC",
+ "path": "/",
+ "sameSite": "no_restriction",
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "AKEyXzXM5UjKUEXwSHVmRAIo6hGHA4G63adj3EE1VdNriD0f38jZQbsUKiD4LQbA3BValmTFDg",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1750420958.397647,
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "__Secure-3PSIDTS",
+ "path": "/",
+ "sameSite": "no_restriction",
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1753434620.050908,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "APISID",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "IlQWLPjdNqziwCrV/ANG7Z4x5FF-IBxbZk",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1753434620.050855,
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "HSID",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "AasA7hmRuTFv7vjoq",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1753435873.577793,
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "LOGIN_INFO",
+ "path": "/",
+ "sameSite": "no_restriction",
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0:QUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
+ },
+ {
+ "domain": ".youtube.com",
+ "expirationDate": 1753444956.555608,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "PREF",
+ "path": "/",
+ "sameSite": None,
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "f4=4000000&f6=40000000&tz=Europe.Paris&f5=30000&f7=100",
+ },
+]
+
+COOKIES_LIST += [
+ {
+ "domain": ".www.researchgate.net",
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "isInstIp",
+ "path": "/",
+ "sameSite": None,
+ "secure": True,
+ "session": True,
+ "storeId": None,
+ "value": "False",
+ },
+ {
+ "domain": ".researchgate.net",
+ "expirationDate": 1734423981,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "__eoi",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "ID=c26f752377373146:T=1718871981:RT=1718884914:S=AA-AfjZw-T_OOX2kW2LLaFzXImgc",
+ },
+ {
+ "domain": ".www.researchgate.net",
+ "expirationDate": 1753444909.646103,
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "ptc",
+ "path": "/",
+ "sameSite": None,
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "RG1.8947708639250500550.1718872043",
+ },
+ {
+ "domain": ".researchgate.net",
+ "expirationDate": 1750507578,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "euconsent-v2-didomi",
+ "path": "/",
+ "sameSite": "lax",
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "CQAgmoAQAgmoAAHABBENA5EsAP_gAEPgAAYgJ2pB5G5UTWlBIG53YMskIAUFhFBoQEAgAACAAwIBSBIAIIwEAGAAIAgAICACAAIAIBIAIABAGAAAAAAAYIAAIAAIAAAQIAAKIAAAAAAAAgBQAAgIAgggEAAAgEBEABAAgAAAEIIAQNgACgAAACCAAAAAAAABAAAAAAAAQAAAAAAAYCQAAAJIAAAAACAIABAIAAAAAAAAAAAAAAAABBAAIJ2wPIAFAAXABQAFQALgAcAA8ACAAEgALwAZAA0ACIAEcAJgAUgAqgBcADEAGgAPQAfgBEACOAE4AMMAZYA0QBsgDkAHOAO4AfsBBwEIAItARwBHQC6gHUAO2Ae0A_4CHQEXgJ2AUOAo8BT4CpQFqALYAXmAwQBkgDLAGXANjAhCBG8CbAE3gJ1gTtAA.f_wACHwAAAAA",
+ },
+ {
+ "domain": ".researchgate.net",
+ "expirationDate": 1718885236,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "_gat",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "1",
+ },
+ {
+ "domain": "www.researchgate.net",
+ "expirationDate": 1721477183,
+ "hostOnly": True,
+ "httpOnly": False,
+ "name": "_pbjs_userid_consent_data",
+ "path": "/",
+ "sameSite": "lax",
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "3524755945110770",
+ },
+ {
+ "domain": ".researchgate.net",
+ "expirationDate": 1752567981,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "__gads",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "ID=eca2adb88969c830:T=1718871981:RT=1718884914:S=ALNI_MY2qZchynrhWX6hWMlaI87Pcj9riQ",
+ },
+ {
+ "domain": ".researchgate.net",
+ "expirationDate": 1718886709.646173,
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "__cf_bm",
+ "path": "/",
+ "sameSite": "no_restriction",
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "IkQ_J4ciBzKQduRvjqsfSmQu8UygDWbHeROO5JVccfo-1718884909-1.0.1.1-qvNGEdbfI0HfhFP6kwe7R7mkTqODNhFuKhs72lLly6K2BOPMG3kbahpQFGvPK0U8FUfkznkq65gngd1sWj7sDA",
+ },
+ {
+ "domain": ".researchgate.net",
+ "expirationDate": 1752567981,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "__gpi",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "UID=00000e4e9aa2e6f2:T=1718871981:RT=1718884914:S=ALNI_MYFNrgzkKn7K6Bd2y8hC6GJCvDiSg",
+ },
+ {
+ "domain": ".researchgate.net",
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "_cfuvid",
+ "path": "/",
+ "sameSite": "no_restriction",
+ "secure": True,
+ "session": True,
+ "storeId": None,
+ "value": "_GPmGZkBymiH3UiqTqzakEpi98br3nfFUWC2_u_wqkc-1718884909785-0.0.1.1-604800000",
+ },
+ {
+ "domain": ".researchgate.net",
+ "expirationDate": 1753445177.271667,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "_ga",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "GA1.1.1525244793.1718885177",
+ },
+ {
+ "domain": ".researchgate.net",
+ "expirationDate": 1753445177.271482,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "_ga_4P31SJ70EJ",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "GS1.1.1718885177.1.0.1718885177.0.0.0",
+ },
+ {
+ "domain": ".researchgate.net",
+ "expirationDate": 1718971576,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "_gid",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "GA1.2.854907463.1718885177",
+ },
+ {
+ "domain": ".www.researchgate.net",
+ "expirationDate": 1750407982.506505,
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "did",
+ "path": "/",
+ "sameSite": None,
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "1dWLO3C6am8l667Q4VUlBo0O1LI49Qi2Vw21SJEXHavBDYT56DI9007W5rYGVFVH",
+ },
+ {
+ "domain": ".researchgate.net",
+ "expirationDate": 1750507578,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "didomi_token",
+ "path": "/",
+ "sameSite": "lax",
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "eyJ1c2VyX2lkIjoiMTkwMzU4YTUtNWU2My02Y2UzLWJlNzAtZGFjNzVmYjdiY2ExIiwiY3JlYXRlZCI6IjIwMjQtMDYtMjBUMTI6MDY6MTYuODA2WiIsInVwZGF0ZWQiOiIyMDI0LTA2LTIwVDEyOjA2OjE4Ljc4MVoiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsidHdpdHRlciIsImdvb2dsZSIsImM6bGlua2VkaW4tbWFya2V0aW5nLXNvbHV0aW9ucyIsImM6b3duZXJpcSIsImM6b21uaXR1cmUtYWRvYmUtYW5hbHl0aWNzIiwiYzp0ZWNobm9yYXRpLW1lZGlhIiwiYzppbnRlcmNvbSIsImM6aW50ZW50LWlxIiwiYzppcHJvbSIsImM6bGlua2VkaW4iLCJjOmFtYXpvbmFkdi16Y1hGTEI2WCIsImM6bWVkaWFuZXQtY1V3YUtFNnoiLCJjOmluZGV4ZXhjaC1OWkNRTTY4UCIsImM6emVvdGFwZ21iLWQ3YndtdGp3IiwiYzp0cmlwbGVsaWYtZGRKSDM0clkiLCJjOnJ0YmhvdXNlLWI4Y2RIOHRNIiwiYzptZHByaW1pcy1lYU4yOVdjUCIsImM6bG9vcG1lbGktVGRhWXRCUHEiLCJjOm1hZ25pdGVpbi05d1RZTHFSRCIsImM6Ymlkc3dpdGNoLWQ2N0V3N1c5IiwiYzpvcmFjbGVhZHYtcUhlREptQUwiLCJjOmdvb2dsZWFuYS00VFhuSmlnUiIsImM6bG90YW1lc29sLURIaTdMUmpNIiwiYzpuZXh0bWlsbGUtR0pyZlg4VWMiLCJjOm5yaWNodGVjLXFVVlEyUlFxIiwiYzpicml0ZXBvb2wtQldWeVdHeVUiLCJjOnRhcGFkaW5jLXFxY2tVN1BXIiwiYzppZDV0ZWNobi16Tk1KNGR3ZiIsImM6bWljcm9zb2Z0IiwiYzpwZXJtdXRpdmUtSjdpaHJlTWsiLCJjOm9wZXJhc29mdC1CY1hjRFZKTSIsImM6cG9zdGhvZy1Cakp4RmRGOSJdfSwicHVycG9zZXMiOnsiZW5hYmxlZCI6WyJnZW9sb2NhdGlvbl9kYXRhIiwiZGV2aWNlX2NoYXJhY3RlcmlzdGljcyJdfSwidmVuZG9yc19saSI6eyJlbmFibGVkIjpbImdvb2dsZSIsImM6b3BlcmFzb2Z0LUJjWGNEVkpNIl19LCJ2ZXJzaW9uIjoyLCJhYyI6IkRIU0FvQUZrQWNnQTVnSHFnUUhBeGdCNndEMTRJR0FRTkFqMEJJd0NTY0VyQUtCd1YtZ3MxQmgwREc0R09nQUEuREhTQW9BRmtBY2dBNWdIcWdRSEF4Z0I2d0QxNElHQVFOQWowQkl3Q1NjRXJBS0J3Vi1nczFCaDBERzRHT2dBQSJ9",
+ },
+ {
+ "domain": ".www.researchgate.net",
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "hasPdpNext",
+ "path": "/",
+ "sameSite": None,
+ "secure": True,
+ "session": True,
+ "storeId": None,
+ "value": "False",
+ },
+ {
+ "domain": ".researchgate.net",
+ "expirationDate": 1750421183,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "ph_phc_ma1XTQyee96N1GML6qUTgLQRiDifnRcE9STiHTZ0CfZ_posthog",
+ "path": "/",
+ "sameSite": "lax",
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "%7B%22distinct_id%22%3A%220190358a-56a1-7313-83b0-d13dddeac787%22%2C%22%24sesid%22%3A%5B1718885183223%2C%220190358a-56a1-7313-83b0-d13b2b87778d%22%2C1718885176993%5D%2C%22%24session_is_sampled%22%3Atrue%7D",
+ },
+ {
+ "domain": ".www.researchgate.net",
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "sid",
+ "path": "/",
+ "sameSite": None,
+ "secure": True,
+ "session": True,
+ "storeId": None,
+ "value": "qmH5Lc4f0CUJ3zeaxORcV0S8I8V1MuCFZtcIQqPYtv1XPejrbSLAQRbT50PL40TqeKQ1XsQDWt9gtYVzuL80bRmPjw6jn3cQ0ikNqW40maHcQ3JL2Vfa8ZZf0j7p35eJ",
+ },
+]
+
+COOKIES_LIST += [
+ {
+ "domain": "github.com",
+ "hostOnly": True,
+ "httpOnly": True,
+ "name": "_gh_sess",
+ "path": "/",
+ "sameSite": "lax",
+ "secure": True,
+ "session": True,
+ "storeId": None,
+ "value": "P%2Fmof1avuqwHaUQUIJR%2FZYn7jqbT7lgGuTGjp1BGAFIG5UpNDusEE3b8dRjz0eATE5xPdPjLYFqMs%2FI9AOalKX4YuYfSEEnxCMawU01099b4o9Xzzcv%2BmecrmO0Q8q%2Bdq1h8SIv6nvPP7HzlFesl8ysafb9b%2F0q6dTArKdSOurasza8UgLSYD08ofA50Pcm0IG7CTzF8ZCizrGgGTMi%2F%2B7L3E17jav5PM1Sf2vQKg15Gbg1QIOppJJHzlufgQoZigqFv%2BWznaws0Tt7Y2lSFCw%3D%3D--CJRhqMXJnwOaJgk4--DhUErlL4GdROikEjKD4O9g%3D%3D",
+ },
+ {
+ "domain": ".github.com",
+ "expirationDate": 1750408875.763785,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "_octo",
+ "path": "/",
+ "sameSite": "lax",
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "GH1.1.728652011.1718872875",
+ },
+ {
+ "domain": ".github.com",
+ "expirationDate": 1750408875.763926,
+ "hostOnly": False,
+ "httpOnly": True,
+ "name": "logged_in",
+ "path": "/",
+ "sameSite": "lax",
+ "secure": True,
+ "session": False,
+ "storeId": None,
+ "value": "no",
+ },
+ {
+ "domain": ".github.com",
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "preferred_color_mode",
+ "path": "/",
+ "sameSite": "lax",
+ "secure": True,
+ "session": True,
+ "storeId": None,
+ "value": "dark",
+ },
+ {
+ "domain": ".github.com",
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "tz",
+ "path": "/",
+ "sameSite": "lax",
+ "secure": True,
+ "session": True,
+ "storeId": None,
+ "value": "Europe%2FParis",
+ },
+]
+
+COOKIES_LIST += [
+ {
+ "domain": ".web.archive.org",
+ "expirationDate": 1718886430,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "_gat",
+ "path": "/web/20201123221659/http://orcid.org/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "1",
+ },
+ {
+ "domain": ".web.archive.org",
+ "expirationDate": 1718972770,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "_gid",
+ "path": "/web/20201123221659/http://orcid.org/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "GA1.2.402246368.1606169825",
+ },
+ {
+ "domain": ".web.archive.org",
+ "expirationDate": 1753446370.315621,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "_ga",
+ "path": "/web/20201123221659/http://orcid.org/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "GA1.2.1301409987.1606169825",
+ },
+ {
+ "domain": ".web.archive.org",
+ "expirationDate": 1750422367,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "_hjid",
+ "path": "/web/20201123221659/http://orcid.org/",
+ "sameSite": "lax",
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "07f80263-a631-4bf4-8ffd-8fc8912085e2",
+ },
+ {
+ "domain": ".web.archive.org",
+ "expirationDate": 1718888167,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "_hjFirstSeen",
+ "path": "/web/20201123221659/http://orcid.org/",
+ "sameSite": "lax",
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "1",
+ },
+]
+COOKIES_LIST += [
+ {
+ "domain": "orcid.org",
+ "hostOnly": True,
+ "httpOnly": False,
+ "name": "AWSELBCORS",
+ "path": "/",
+ "sameSite": "no_restriction",
+ "secure": True,
+ "session": True,
+ "storeId": None,
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
+ },
+ {
+ "domain": ".orcid.org",
+ "expirationDate": 1753452454.637671,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "_ga_9R61FWK9H5",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "GS1.1.1718892454.1.0.1718892454.0.0.0",
+ },
+ {
+ "domain": ".orcid.org",
+ "expirationDate": 1753452454.63421,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "_ga",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "GA1.1.2021310691.1718892455",
+ },
+ {
+ "domain": "orcid.org",
+ "hostOnly": True,
+ "httpOnly": False,
+ "name": "AWSELB",
+ "path": "/",
+ "sameSite": None,
+ "secure": False,
+ "session": True,
+ "storeId": None,
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
+ },
+ {
+ "domain": ".orcid.org",
+ "expirationDate": 1750428454,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "OptanonAlertBoxClosed",
+ "path": "/",
+ "sameSite": "lax",
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "2024-06-20T14:07:34.583Z",
+ },
+ {
+ "domain": ".orcid.org",
+ "expirationDate": 1750428454,
+ "hostOnly": False,
+ "httpOnly": False,
+ "name": "OptanonConsent",
+ "path": "/",
+ "sameSite": "lax",
+ "secure": False,
+ "session": False,
+ "storeId": None,
+ "value": "isGpcEnabled=0&datestamp=Thu+Jun+20+2024+16%3A07%3A34+GMT%2B0200+(heure+d%E2%80%99%C3%A9t%C3%A9+d%E2%80%99Europe+centrale)&version=202310.2.0&browserGpcFlag=0&isIABGlobal=False&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1",
+ },
+ {
+ "domain": "orcid.org",
+ "hostOnly": True,
+ "httpOnly": False,
+ "name": "XSRF-TOKEN",
+ "path": "/",
+ "sameSite": None,
+ "secure": True,
+ "session": True,
+ "storeId": None,
+ "value": "6957be7a-bcb4-4d59-a522-ea9b6b210ed9",
+ },
+]
+
+# Cookie jar exposing the entries of COOKIES_LIST under the module-level name COOKIES.
+# NOTE(review): several entries of COOKIES_LIST look like live session credentials
+# (e.g. "LOGIN_INFO", "__Secure-1PSID", "_gh_sess") — confirm they are revoked, or load
+# them from a private store instead of committing them.
+COOKIES = RequestsCookieJar()
+
+# Only name, value, domain and path are copied into the jar; the remaining keys of each
+# entry (expirationDate, secure, httpOnly, sameSite, ...) are not passed on.
+for cookie in COOKIES_LIST:
+ COOKIES.set(cookie["name"], cookie["value"], domain=cookie["domain"], path=cookie["path"])
diff --git a/examples/open_deep_research/scripts/gaia_scorer.py b/examples/open_deep_research/scripts/gaia_scorer.py
new file mode 100644
index 000000000..532e0c380
--- /dev/null
+++ b/examples/open_deep_research/scripts/gaia_scorer.py
@@ -0,0 +1,124 @@
+import re
+import string
+import warnings
+
+
+def normalize_number_str(number_str: str) -> float:
+ # we replace these common units and commas to allow
+ # conversion to float
+ for char in ["$", "%", ","]:
+ number_str = number_str.replace(char, "")
+ try:
+ return float(number_str)
+ except ValueError:
+ print(f"String {number_str} cannot be normalized to number str.")
+ return float("inf")
+
+
+def split_string(
+ s: str,
+ char_list: list[str] = [",", ";"],
+) -> list[str]:
+ pattern = f"[{''.join(char_list)}]"
+ return re.split(pattern, s)
+
+
+def is_float(element: any) -> bool:
+ try:
+ float(element)
+ return True
+ except ValueError:
+ return False
+
+
+def question_scorer(
+ model_answer: str,
+ ground_truth: str,
+) -> bool:
+ # if gt is a number
+ if is_float(ground_truth):
+ normalized_answer = normalize_number_str(str(model_answer))
+ return normalized_answer == float(ground_truth)
+
+ # if gt is a list
+ elif any(char in ground_truth for char in [",", ";"]):
+ # question with the fish: normalization removes punct
+
+ gt_elems = split_string(ground_truth)
+ ma_elems = split_string(model_answer)
+
+ # check length is the same
+ if len(gt_elems) != len(ma_elems):
+ warnings.warn("Answer lists have different lengths, returning False.", UserWarning)
+ return False
+
+ # compare each element as float or str
+ comparisons = []
+ for ma_elem, gt_elem in zip(ma_elems, gt_elems):
+ if is_float(gt_elem):
+ normalized_ma_elem = normalize_number_str(ma_elem)
+ comparisons.append(normalized_ma_elem == float(gt_elem))
+ else:
+ # we do not remove punct since comparisons can include punct
+ comparisons.append(
+ normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False)
+ )
+ return all(comparisons)
+
+ # if gt is a str
+ else:
+ return normalize_str(model_answer) == normalize_str(ground_truth)
+
+
+def check_prediction_contains_answer_letters_in_order(prediction, true_answer):
+ prediction = prediction.lower()
+ true_answer = true_answer.lower()
+ if len(prediction) > len(true_answer) * 3:
+ return False
+ i = 0
+ for letter in true_answer:
+ if letter in prediction[i:]:
+ i += prediction[i:].index(letter)
+ else:
+ return False
+ return True
+
+
+def check_close_call(prediction, true_answer, is_correct):
+ if is_correct:
+ return True
+ else:
+ if is_float(true_answer):
+ return is_correct
+ else:
+ if (
+ check_prediction_contains_answer_letters_in_order(str(prediction), str(true_answer))
+ and len(str(true_answer)) * 0.5 <= len(str(prediction)) <= len(str(true_answer)) * 2
+ ):
+ print(f"Close call: {prediction} vs {true_answer}")
+ return True
+ else:
+ return False
+
+
+def normalize_str(input_str, remove_punct=True) -> str:
+ """
+ Normalize a string by:
+ - Removing all white spaces
+ - Optionally removing punctuation (if remove_punct is True)
+ - Converting to lowercase
+ Parameters:
+ - input_str: str, the string to normalize
+ - remove_punct: bool, whether to remove punctuation (default: True)
+ Returns:
+ - str, the normalized string
+ """
+ # Remove all white spaces. Required e.g for seagull vs. sea gull
+ no_spaces = re.sub(r"\s", "", input_str)
+
+ # Remove punctuation, if specified.
+ if remove_punct:
+ translator = str.maketrans("", "", string.punctuation)
+ return no_spaces.lower().translate(translator)
+ else:
+ return no_spaces.lower()
diff --git a/examples/open_deep_research/scripts/mdconvert.py b/examples/open_deep_research/scripts/mdconvert.py
new file mode 100644
index 000000000..15df61875
--- /dev/null
+++ b/examples/open_deep_research/scripts/mdconvert.py
@@ -0,0 +1,949 @@
+# This is copied from Magentic-one's great repo: https://github.com/microsoft/autogen/blob/v0.4.4/python/packages/autogen-magentic-one/src/autogen_magentic_one/markdown_browser/mdconvert.py
+# Thanks to Microsoft researchers for open-sourcing this!
+# type: ignore
+import base64
+import copy
+import html
+import json
+import mimetypes
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import traceback
+from typing import Any, Dict, List, Optional, Union
+from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
+
+import mammoth
+import markdownify
+import pandas as pd
+import pdfminer
+import pdfminer.high_level
+import pptx
+
+# File-format detection
+import puremagic
+import pydub
+import requests
+import speech_recognition as sr
+from bs4 import BeautifulSoup
+from youtube_transcript_api import YouTubeTranscriptApi
+from youtube_transcript_api.formatters import SRTFormatter
+
+
+class _CustomMarkdownify(markdownify.MarkdownConverter):
+    """
+    A custom version of markdownify's MarkdownConverter. Changes include:
+
+    - Altering the default heading style to use '#', '##', etc.
+    - Removing javascript hyperlinks.
+    - Truncating images with large data:uri sources.
+    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
+    """
+
+    def __init__(self, **options: Any):
+        # Use ATX headings ('#', '##', ...) unless the caller picked a style.
+        options["heading_style"] = options.get("heading_style", markdownify.ATX)
+        super().__init__(**options)
+
+    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
+        """Same as usual, but be sure to start with a new line"""
+        if not convert_as_inline:
+            if not re.search(r"^\n", text):
+                return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
+
+        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
+
+    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
+        """Same as usual converter, but removes Javascript links and escapes URIs."""
+        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
+        if not text:
+            return ""
+        href = el.get("href")
+        title = el.get("title")
+
+        # Escape URIs and skip non-http or file schemes
+        if href:
+            try:
+                parsed_url = urlparse(href)  # type: ignore
+                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
+                    # Keep only the link text for javascript:, mailto:, etc.
+                    return "%s%s%s" % (prefix, text, suffix)
+                # Unquote first so an already-escaped path is not escaped twice.
+                href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
+            except ValueError:  # It's not clear if this ever gets thrown
+                return "%s%s%s" % (prefix, text, suffix)
+
+        # For the replacement see #29: text nodes underscores are escaped
+        if (
+            self.options["autolinks"]
+            and text.replace(r"\_", "_") == href
+            and not title
+            and not self.options["default_title"]
+        ):
+            # Shortcut syntax
+            return "<%s>" % href
+        if self.options["default_title"] and not title:
+            title = href
+        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
+        return "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) if href else text
+
+    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
+        """Same as usual converter, but removes data URIs"""
+
+        alt = el.attrs.get("alt", None) or ""
+        src = el.attrs.get("src", None) or ""
+        title = el.attrs.get("title", None) or ""
+        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
+        if convert_as_inline and el.parent.name not in self.options["keep_inline_images_in"]:
+            return alt
+
+        # Remove dataURIs: keep only the "data:<type>;base64" header, drop the payload
+        if src.startswith("data:"):
+            src = src.split(",")[0] + "..."
+
+        # Markdown image syntax. An empty template here would raise TypeError,
+        # because the three arguments would have no placeholders to fill.
+        return "![%s](%s%s)" % (alt, src, title_part)
+
+    def convert_soup(self, soup: Any) -> str:
+        return super().convert_soup(soup)  # type: ignore
+
+
+class DocumentConverterResult:
+    """Holds the outcome of a conversion: an optional title plus the extracted text."""
+
+    def __init__(self, title: Optional[str] = None, text_content: str = ""):
+        # Title of the document, or None when the converter found none.
+        self.title: Optional[str] = title
+        # Converted body; an empty string by default.
+        self.text_content: str = text_content
+
+
+class DocumentConverter:
+    """Base class for all converters; subclasses must override `convert`."""
+
+    def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+        """Convert the file at `local_path`. The converters in this file return None for files they do not handle."""
+        raise NotImplementedError
+
+
+class PlainTextConverter(DocumentConverter):
+    """Anything with content type text/plain"""
+
+    def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+        """Read the file as UTF-8 text when its extension maps to a known MIME type."""
+        # Guess the content type from any file extension that might be around
+        guessed_type, _ = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", ""))
+
+        # Unknown extensions are declined. The stricter "text/" filter is disabled:
+        # elif "text/" not in content_type.lower():
+        #     return None
+        if guessed_type is None:
+            return None
+
+        with open(local_path, "rt", encoding="utf-8") as fh:
+            return DocumentConverterResult(title=None, text_content=fh.read())
+
+
+class HtmlConverter(DocumentConverter):
+    """Anything with content type text/html"""
+
+    def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+        """Convert a local .html/.htm file to Markdown; return None for other extensions."""
+        if kwargs.get("file_extension", "").lower() not in [".html", ".htm"]:
+            return None
+
+        with open(local_path, "rt", encoding="utf-8") as fh:
+            return self._convert(fh.read())
+
+    def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
+        """Helper function that converts an HTML string."""
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        # Drop javascript and style blocks before rendering
+        for tag in soup(["script", "style"]):
+            tag.extract()
+
+        # Render only <body> when there is one, otherwise the whole document
+        body_elm = soup.find("body")
+        root = body_elm if body_elm else soup
+        webpage_text = _CustomMarkdownify().convert_soup(root)
+        assert isinstance(webpage_text, str)
+
+        page_title = None if soup.title is None else soup.title.string
+        return DocumentConverterResult(title=page_title, text_content=webpage_text)
+
+
+class WikipediaConverter(DocumentConverter):
+    """Handle Wikipedia pages separately, focusing only on the main document content."""
+
+    def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+        """Convert a saved Wikipedia page to Markdown.
+
+        Returns None unless `file_extension` is .html/.htm and `url` points at a
+        two- or three-letter language subdomain of wikipedia.org.
+        """
+        # Bail if not Wikipedia
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".html", ".htm"]:
+            return None
+        url = kwargs.get("url", "")
+        # Both dots are escaped so that e.g. "en.wikipediaXorg" is not accepted.
+        if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia\.org\/", url):
+            return None
+
+        # Parse the file
+        with open(local_path, "rt", encoding="utf-8") as fh:
+            soup = BeautifulSoup(fh.read(), "html.parser")
+
+        # Remove javascript and style blocks
+        for script in soup(["script", "style"]):
+            script.extract()
+
+        # Print only the main content
+        body_elm = soup.find("div", {"id": "mw-content-text"})
+        title_elm = soup.find("span", {"class": "mw-page-title-main"})
+
+        main_title = None if soup.title is None else soup.title.string
+
+        if body_elm:
+            # Prefer the on-page title, but only when it is plain text: `.string`
+            # is None when the span holds nested markup, and in that case the
+            # <title> value is kept instead of failing the conversion.
+            if title_elm and isinstance(title_elm.string, str):
+                main_title = title_elm.string  # type: ignore
+
+            # Convert the page
+            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(body_elm)
+        else:
+            webpage_text = _CustomMarkdownify().convert_soup(soup)
+
+        return DocumentConverterResult(
+            title=main_title,
+            text_content=webpage_text,
+        )
+
+
+class YouTubeConverter(DocumentConverter):
+    """Handle YouTube specially, focusing on the video title, description, and transcript."""
+
+    def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+        """Build a Markdown summary of a saved YouTube watch page.
+
+        Returns None unless `file_extension` is .html/.htm and `url` starts with
+        "https://www.youtube.com/watch?".
+        """
+        if kwargs.get("file_extension", "").lower() not in [".html", ".htm"]:
+            return None
+        url = kwargs.get("url", "")
+        if not url.startswith("https://www.youtube.com/watch?"):
+            return None
+
+        with open(local_path, "rt", encoding="utf-8") as fh:
+            soup = BeautifulSoup(fh.read(), "html.parser")
+
+        # Collect <meta> contents, keyed by the first itemprop/property/name attribute found
+        assert soup.title is not None and soup.title.string is not None
+        metadata: Dict[str, str] = {"title": soup.title.string}
+        for meta in soup(["meta"]):
+            for attr_name in meta.attrs:
+                if attr_name in ["itemprop", "property", "name"]:
+                    metadata[meta[attr_name]] = meta.get("content", "")
+                    break
+
+        # The full description lives in the page's ytInitialData script. This reaches
+        # into the page implementation, so any failure is deliberately ignored.
+        try:
+            for script in soup(["script"]):
+                content = script.text
+                if "ytInitialData" not in content:
+                    continue
+                first_line = re.split(r"\r?\n", content)[0]
+                obj_start = first_line.find("{")
+                obj_end = first_line.rfind("}")
+                if obj_start >= 0 and obj_end >= 0:
+                    data = json.loads(first_line[obj_start : obj_end + 1])
+                    attrdesc = self._findKey(data, "attributedDescriptionBodyText")  # type: ignore
+                    if attrdesc:
+                        metadata["description"] = str(attrdesc["content"])
+                break
+        except Exception:
+            pass
+
+        # Start preparing the page
+        webpage_text = "# YouTube\n"
+
+        title = self._get(metadata, ["title", "og:title", "name"])  # type: ignore
+        assert isinstance(title, str)
+        if title:
+            webpage_text += f"\n## {title}\n"
+
+        stat_lines = []
+        views = self._get(metadata, ["interactionCount"])  # type: ignore
+        if views:
+            stat_lines.append(f"- **Views:** {views}\n")
+        keywords = self._get(metadata, ["keywords"])  # type: ignore
+        if keywords:
+            stat_lines.append(f"- **Keywords:** {keywords}\n")
+        runtime = self._get(metadata, ["duration"])  # type: ignore
+        if runtime:
+            stat_lines.append(f"- **Runtime:** {runtime}\n")
+        stats = "".join(stat_lines)
+        if len(stats) > 0:
+            webpage_text += f"\n### Video Metadata\n{stats}\n"
+
+        description = self._get(metadata, ["description", "og:description"])  # type: ignore
+        if description:
+            webpage_text += f"\n### Description\n{description}\n"
+
+        # The transcript is fetched by video id; a failed lookup leaves the section out
+        transcript_text = ""
+        params = parse_qs(urlparse(url).query)  # type: ignore
+        if "v" in params:
+            assert isinstance(params["v"][0], str)
+            video_id = str(params["v"][0])
+            try:
+                # Must be a single transcript.
+                transcript = YouTubeTranscriptApi.get_transcript(video_id)  # type: ignore
+                transcript_text = SRTFormatter().format_transcript(transcript)
+            except Exception:
+                pass
+        if transcript_text:
+            webpage_text += f"\n### Transcript\n{transcript_text}\n"
+
+        title = title if title else soup.title.string
+        assert isinstance(title, str)
+
+        return DocumentConverterResult(title=title, text_content=webpage_text)
+
+    def _get(self, metadata: Dict[str, str], keys: List[str], default: Union[str, None] = None) -> Union[str, None]:
+        """Return the value of the first of `keys` present in `metadata`, else `default`."""
+        for candidate in keys:
+            if candidate in metadata:
+                return metadata[candidate]
+        return default
+
+    def _findKey(self, json: Any, key: str) -> Union[str, None]:  # TODO: Fix json type
+        """Depth-first search of nested lists/dicts for `key`; return its value, or None."""
+        if isinstance(json, list):
+            for item in json:
+                found = self._findKey(item, key)
+                if found is not None:
+                    return found
+        elif isinstance(json, dict):
+            for name, value in json.items():
+                if name == key:
+                    return value
+                found = self._findKey(value, key)
+                if found is not None:
+                    return found
+        return None
+
+
+class PdfConverter(DocumentConverter):
+    """
+    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        """Extract the text of a .pdf file with pdfminer; return None for other extensions."""
+        if kwargs.get("file_extension", "").lower() != ".pdf":
+            return None
+
+        extracted = pdfminer.high_level.extract_text(local_path)
+        return DocumentConverterResult(title=None, text_content=extracted)
+
+
+class DocxConverter(HtmlConverter):
+    """
+    Converts DOCX files to Markdown. Style information (e.g. headings) and tables are preserved where possible.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        """Turn a .docx file into HTML with mammoth, then into Markdown; None for other extensions."""
+        if kwargs.get("file_extension", "").lower() != ".docx":
+            return None
+
+        with open(local_path, "rb") as docx_file:
+            html_content = mammoth.convert_to_html(docx_file).value
+            return self._convert(html_content)
+
+
+class XlsxConverter(HtmlConverter):
+    """
+    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        """Render every sheet of an .xlsx/.xls workbook under a '## <sheet name>' heading."""
+        if kwargs.get("file_extension", "").lower() not in [".xlsx", ".xls"]:
+            return None
+
+        # sheet_name=None loads all sheets as a {name: DataFrame} mapping
+        sheets = pd.read_excel(local_path, sheet_name=None)
+        sections = []
+        for sheet_name in sheets:
+            table_html = sheets[sheet_name].to_html(index=False)
+            sections.append(f"## {sheet_name}\n")
+            sections.append(self._convert(table_html).text_content.strip() + "\n\n")
+
+        return DocumentConverterResult(title=None, text_content="".join(sections).strip())
+
+
+class PptxConverter(HtmlConverter):
+    """
+    Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        """Convert a .pptx presentation slide by slide; return None for other extensions.
+
+        Each slide starts with an HTML comment carrying its number. Pictures become
+        Markdown image references, tables go through the HTML converter, and text
+        frames are emitted as text, with the slide's title shape as a '#' heading.
+        Speaker notes, when present, follow under '### Notes:'.
+        """
+        # Bail if not a PPTX
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".pptx":
+            return None
+
+        md_content = ""
+
+        presentation = pptx.Presentation(local_path)
+        for slide_num, slide in enumerate(presentation.slides, start=1):
+            md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
+
+            title = slide.shapes.title
+            for shape in slide.shapes:
+                # Pictures
+                if self._is_picture(shape):
+                    # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
+                    alt_text = ""
+                    try:
+                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
+                    except Exception:
+                        pass
+
+                    # A placeholder name: the image itself is not exported
+                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                    md_content += "\n![" + (alt_text if alt_text else shape.name) + "](" + filename + ")\n"
+
+                # Tables
+                if self._is_table(shape):
+                    # Build an HTML table (first row as header) and reuse the HTML converter
+                    html_table = "<html><body><table>"
+                    first_row = True
+                    for row in shape.table.rows:
+                        html_table += "<tr>"
+                        for cell in row.cells:
+                            if first_row:
+                                html_table += "<th>" + html.escape(cell.text) + "</th>"
+                            else:
+                                html_table += "<td>" + html.escape(cell.text) + "</td>"
+                        html_table += "</tr>"
+                        first_row = False
+                    html_table += "</table></body></html>"
+                    md_content += "\n" + self._convert(html_table).text_content.strip() + "\n"
+
+                # Text areas
+                elif shape.has_text_frame:
+                    if shape == title:
+                        md_content += "# " + shape.text.lstrip() + "\n"
+                    else:
+                        md_content += shape.text + "\n"
+
+            md_content = md_content.strip()
+
+            if slide.has_notes_slide:
+                md_content += "\n\n### Notes:\n"
+                notes_frame = slide.notes_slide.notes_text_frame
+                if notes_frame is not None:
+                    md_content += notes_frame.text
+                md_content = md_content.strip()
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+
+    def _is_picture(self, shape):
+        """True for picture shapes and for placeholders that carry an image."""
+        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
+            return True
+        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
+            if hasattr(shape, "image"):
+                return True
+        return False
+
+    def _is_table(self, shape):
+        """True when the shape is a table."""
+        return shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE
+
+
+class MediaConverter(DocumentConverter):
+    """
+    Abstract class for multi-modal media (e.g., images and audio)
+    """
+
+    def _get_metadata(self, local_path):
+        """Return the first record of `exiftool -json <path>`, or None.
+
+        None is returned when exiftool is not on PATH, or when running it or
+        parsing its output fails.
+        """
+        exiftool = shutil.which("exiftool")
+        if not exiftool:
+            return None
+
+        try:
+            completed = subprocess.run([exiftool, "-json", local_path], capture_output=True, text=True)
+            return json.loads(completed.stdout)[0]
+        except Exception:
+            return None
+
+
+class WavConverter(MediaConverter):
+    """
+    Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        """List selected metadata fields, then append a transcript; None unless the extension is .wav."""
+        # Bail if not a WAV
+        if kwargs.get("file_extension", "").lower() != ".wav":
+            return None
+
+        md_content = ""
+
+        # Add metadata: only these fields, in this order, when present
+        metadata = self._get_metadata(local_path)
+        if metadata:
+            wanted_fields = [
+                "Title",
+                "Artist",
+                "Author",
+                "Band",
+                "Album",
+                "Genre",
+                "Track",
+                "DateTimeOriginal",
+                "CreateDate",
+                "Duration",
+            ]
+            md_content += "".join(f"{name}: {metadata[name]}\n" for name in wanted_fields if name in metadata)
+
+        # Transcribe; a failure is reported in the output rather than raised
+        try:
+            transcript = self._transcribe_audio(local_path)
+            spoken = "[No speech detected]" if transcript == "" else transcript
+            md_content += "\n\n### Audio Transcript:\n" + spoken
+        except Exception:
+            md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
+
+        return DocumentConverterResult(title=None, text_content=md_content.strip())
+
+    def _transcribe_audio(self, local_path) -> str:
+        """Transcribe the whole audio file with the recognizer's Google backend."""
+        recognizer = sr.Recognizer()
+        with sr.AudioFile(local_path) as source:
+            recording = recognizer.record(source)
+            return recognizer.recognize_google(recording).strip()
+
+
+class Mp3Converter(WavConverter):
+    """
+    Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        """List selected metadata fields, then append a transcript; None unless the extension is .mp3.
+
+        The MP3 is exported to a temporary WAV file for transcription, and that
+        file is always deleted afterwards. A transcription failure is reported in
+        the output; a failure to decode or export the MP3 propagates to the caller.
+        """
+        # Bail if not a MP3
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".mp3":
+            return None
+
+        md_content = ""
+
+        # Add metadata
+        metadata = self._get_metadata(local_path)
+        if metadata:
+            for f in [
+                "Title",
+                "Artist",
+                "Author",
+                "Band",
+                "Album",
+                "Genre",
+                "Track",
+                "DateTimeOriginal",
+                "CreateDate",
+                "Duration",
+            ]:
+                if f in metadata:
+                    md_content += f"{f}: {metadata[f]}\n"
+
+        # Transcribe. Close the descriptor right away: only the path is needed.
+        handle, temp_path = tempfile.mkstemp(suffix=".wav")
+        os.close(handle)
+        try:
+            sound = pydub.AudioSegment.from_mp3(local_path)
+            sound.export(temp_path, format="wav")
+
+            try:
+                transcript = super()._transcribe_audio(temp_path).strip()
+                md_content += "\n\n### Audio Transcript:\n" + (
+                    "[No speech detected]" if transcript == "" else transcript
+                )
+            except Exception:
+                md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
+
+        finally:
+            os.unlink(temp_path)
+
+        # Return the result
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+
+
+class ImageConverter(MediaConverter):
+    """
+    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        """List selected metadata fields and, when both `mlm_client` and `mlm_model`
+        are given, append a model-written description; None unless .jpg/.jpeg/.png."""
+        # Bail if not an image
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
+            return None
+
+        md_content = ""
+
+        # Add metadata: only these fields, in this order, when present
+        metadata = self._get_metadata(local_path)
+        if metadata:
+            wanted_fields = [
+                "ImageSize",
+                "Title",
+                "Caption",
+                "Description",
+                "Keywords",
+                "Artist",
+                "Author",
+                "DateTimeOriginal",
+                "CreateDate",
+                "GPSPosition",
+            ]
+            md_content += "".join(f"{name}: {metadata[name]}\n" for name in wanted_fields if name in metadata)
+
+        # Try describing the image with the multimodal model
+        mlm_client = kwargs.get("mlm_client")
+        mlm_model = kwargs.get("mlm_model")
+        if mlm_client is not None and mlm_model is not None:
+            description = self._get_mlm_description(
+                local_path, extension, mlm_client, mlm_model, prompt=kwargs.get("mlm_prompt")
+            )
+            md_content += "\n# Description:\n" + description.strip() + "\n"
+
+        return DocumentConverterResult(title=None, text_content=md_content)
+
+    def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
+        """Send the image as a base64 data URI to `client.chat.completions.create` and return the reply text."""
+        if prompt is None or prompt.strip() == "":
+            prompt = "Write a detailed caption for this image."
+
+        sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
+
+        # The MIME type comes from the extension; image/jpeg is the fallback
+        content_type, _ = mimetypes.guess_type("_dummy" + extension)
+        if content_type is None:
+            content_type = "image/jpeg"
+        with open(local_path, "rb") as image_file:
+            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
+        data_uri = f"data:{content_type};base64,{image_base64}"
+
+        image_part = {"type": "image_url", "image_url": {"url": data_uri}}
+        messages = [{"role": "user", "content": [{"type": "text", "text": prompt}, image_part]}]
+
+        response = client.chat.completions.create(model=model, messages=messages)
+        return response.choices[0].message.content
+
+
+class FileConversionException(BaseException):
+    """Raised when a file could not be converted and at least one converter raised an error."""
+
+
+class UnsupportedFormatException(BaseException):
+    """Raised when no converter accepts the file and none of them raised an error."""
+
+
+class MarkdownConverter:
+    """(In preview) An extremely simple text-based document reader, suitable for LLM use.
+    This reader will convert common file-types or webpages to Markdown."""
+
+    def __init__(
+        self,
+        requests_session: Optional[requests.Session] = None,
+        mlm_client: Optional[Any] = None,
+        mlm_model: Optional[Any] = None,
+    ):
+        """Create a converter, optionally reusing an HTTP session and a multimodal model client."""
+        if requests_session is None:
+            self._requests_session = requests.Session()
+        else:
+            self._requests_session = requests_session
+
+        self._mlm_client = mlm_client
+        self._mlm_model = mlm_model
+
+        self._page_converters: List[DocumentConverter] = []
+
+        # Register converters for successful browsing operations
+        # Later registrations are tried first / take higher priority than earlier registrations
+        # To this end, the most specific converters should appear below the most generic converters
+        self.register_page_converter(PlainTextConverter())
+        self.register_page_converter(HtmlConverter())
+        self.register_page_converter(WikipediaConverter())
+        self.register_page_converter(YouTubeConverter())
+        self.register_page_converter(DocxConverter())
+        self.register_page_converter(XlsxConverter())
+        self.register_page_converter(PptxConverter())
+        self.register_page_converter(WavConverter())
+        self.register_page_converter(Mp3Converter())
+        self.register_page_converter(ImageConverter())
+        self.register_page_converter(PdfConverter())
+
+    def convert(
+        self, source: Union[str, requests.Response], **kwargs: Any
+    ) -> DocumentConverterResult:  # TODO: deal with kwargs
+        """
+        Args:
+            - source: can be a string representing a path or url, or a requests.response object
+            - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
+
+        A source of any other type falls through and yields None.
+        """
+
+        # Local path or url
+        if isinstance(source, str):
+            if source.startswith(("http://", "https://", "file://")):
+                return self.convert_url(source, **kwargs)
+            else:
+                return self.convert_local(source, **kwargs)
+        # Request response
+        elif isinstance(source, requests.Response):
+            return self.convert_response(source, **kwargs)
+
+    def convert_local(self, path: str, **kwargs: Any) -> DocumentConverterResult:  # TODO: deal with kwargs
+        """Convert a file on disk, trying the given, path-derived and content-sniffed extensions in that order."""
+        # Prepare a list of extensions to try (in order of priority)
+        ext = kwargs.get("file_extension")
+        extensions = [ext] if ext is not None else []
+
+        # Get extension alternatives from the path and puremagic
+        base, ext = os.path.splitext(path)
+        self._append_ext(extensions, ext)
+        self._append_ext(extensions, self._guess_ext_magic(path))
+
+        # Convert
+        return self._convert(path, extensions, **kwargs)
+
+    # TODO what should stream's type be?
+    def convert_stream(self, stream: Any, **kwargs: Any) -> DocumentConverterResult:  # TODO: deal with kwargs
+        """Convert the contents of a readable stream by spooling it to a temporary file."""
+        # Prepare a list of extensions to try (in order of priority)
+        ext = kwargs.get("file_extension")
+        extensions = [ext] if ext is not None else []
+
+        # Save the file locally to a temporary file. It will be deleted before this method exits
+        handle, temp_path = tempfile.mkstemp()
+        fh = os.fdopen(handle, "wb")
+        result = None
+        try:
+            # Write to the temporary file
+            content = stream.read()
+            if isinstance(content, str):
+                fh.write(content.encode("utf-8"))
+            else:
+                fh.write(content)
+            fh.close()
+
+            # Use puremagic to check for more extension options
+            self._append_ext(extensions, self._guess_ext_magic(temp_path))
+
+            # Convert
+            result = self._convert(temp_path, extensions, **kwargs)
+        # Clean up
+        finally:
+            try:
+                fh.close()
+            except Exception:
+                pass
+            os.unlink(temp_path)
+
+        return result
+
+    def convert_url(self, url: str, **kwargs: Any) -> DocumentConverterResult:  # TODO: fix kwargs type
+        """Fetch `url` with a browser-like User-Agent and convert the response; HTTP errors are raised."""
+        # Send a HTTP request to the URL
+        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
+        response = self._requests_session.get(url, stream=True, headers={"User-Agent": user_agent})
+        response.raise_for_status()
+        return self.convert_response(response, **kwargs)
+
+    def convert_response(
+        self, response: requests.Response, **kwargs: Any
+    ) -> DocumentConverterResult:  # TODO fix kwargs type
+        """Download a response body to a temporary file and convert it.
+
+        Candidate extensions come from `file_extension`, the Content-Type header,
+        the Content-Disposition filename, the URL path and content sniffing.
+        Errors derived from Exception are printed and None is returned. The two
+        conversion exceptions derive from BaseException and therefore propagate.
+        """
+        # Prepare a list of extensions to try (in order of priority)
+        ext = kwargs.get("file_extension")
+        extensions = [ext] if ext is not None else []
+
+        # Guess from the mimetype
+        content_type = response.headers.get("content-type", "").split(";")[0]
+        self._append_ext(extensions, mimetypes.guess_extension(content_type))
+
+        # Read the content disposition if there is one
+        content_disposition = response.headers.get("content-disposition", "")
+        m = re.search(r"filename=([^;]+)", content_disposition)
+        if m:
+            base, ext = os.path.splitext(m.group(1).strip("\"'"))
+            self._append_ext(extensions, ext)
+
+        # Read from the extension from the path
+        base, ext = os.path.splitext(urlparse(response.url).path)
+        self._append_ext(extensions, ext)
+
+        # Save the file locally to a temporary file. It will be deleted before this method exits
+        handle, temp_path = tempfile.mkstemp()
+        fh = os.fdopen(handle, "wb")
+        result = None
+        try:
+            # Download the file
+            for chunk in response.iter_content(chunk_size=512):
+                fh.write(chunk)
+            fh.close()
+
+            # Use puremagic to check for more extension options
+            self._append_ext(extensions, self._guess_ext_magic(temp_path))
+
+            # Convert
+            result = self._convert(temp_path, extensions, url=response.url)
+        except Exception as e:
+            print(f"Error in converting: {e}")
+
+        # Clean up
+        finally:
+            try:
+                fh.close()
+            except Exception:
+                pass
+            os.unlink(temp_path)
+
+        return result
+
+    def _convert(self, local_path: str, extensions: List[Union[str, None]], **kwargs) -> DocumentConverterResult:
+        """Try every candidate extension against every converter and return the first result.
+
+        Raises:
+            FileConversionException: nothing succeeded and a converter raised an error
+                (the traceback of the last such error is included).
+            UnsupportedFormatException: every converter declined the file.
+        """
+        error_trace = ""
+        for ext in extensions + [None]:  # Try last with no extension
+            for converter in self._page_converters:
+                _kwargs = copy.deepcopy(kwargs)
+
+                # Overwrite file_extension appropriately
+                if ext is None:
+                    if "file_extension" in _kwargs:
+                        del _kwargs["file_extension"]
+                else:
+                    _kwargs.update({"file_extension": ext})
+
+                # Copy any additional global options
+                if "mlm_client" not in _kwargs and self._mlm_client is not None:
+                    _kwargs["mlm_client"] = self._mlm_client
+
+                if "mlm_model" not in _kwargs and self._mlm_model is not None:
+                    _kwargs["mlm_model"] = self._mlm_model
+
+                # Reset for each attempt: without this, a converter that raises leaves
+                # `res` unbound on the very first attempt (UnboundLocalError).
+                res = None
+
+                # If we hit an error log it and keep trying
+                try:
+                    res = converter.convert(local_path, **_kwargs)
+                except Exception:
+                    error_trace = ("\n\n" + traceback.format_exc()).strip()
+
+                if res is not None:
+                    # Normalize the content: strip trailing spaces, collapse runs of blank lines
+                    res.text_content = "\n".join([line.rstrip() for line in re.split(r"\r?\n", res.text_content)])
+                    res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
+
+                    # Todo
+                    return res
+
+        # If we got this far without success, report any exceptions
+        if len(error_trace) > 0:
+            raise FileConversionException(
+                f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
+            )
+
+        # Nothing can handle it!
+        raise UnsupportedFormatException(
+            f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
+        )
+
+    def _append_ext(self, extensions, ext):
+        """Append a non-None, non-empty extension to a list of extensions."""
+        if ext is None:
+            return
+        ext = ext.strip()
+        if ext == "":
+            return
+        # Duplicates are appended too: the `if ext not in extensions` check is disabled.
+        extensions.append(ext)
+
+    def _guess_ext_magic(self, path):
+        """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
+        try:
+            guesses = puremagic.magic_file(path)
+            if len(guesses) > 0:
+                ext = guesses[0].extension.strip()
+                if len(ext) > 0:
+                    return ext
+        except (FileNotFoundError, IsADirectoryError, PermissionError):
+            # An unreadable path simply yields no guess
+            pass
+        return None
+
+    def register_page_converter(self, converter: DocumentConverter) -> None:
+        """Register a page text converter. It is inserted at the front, so it is tried before earlier ones."""
+        self._page_converters.insert(0, converter)
diff --git a/examples/open_deep_research/scripts/reformulator.py b/examples/open_deep_research/scripts/reformulator.py
new file mode 100644
index 000000000..db41704d8
--- /dev/null
+++ b/examples/open_deep_research/scripts/reformulator.py
@@ -0,0 +1,86 @@
+# Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
+# https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
+import copy
+
+from smolagents.models import MessageRole, Model
+
+
+def prepare_response(original_task: str, inner_messages, reformulation_model: Model) -> str:
+    """Ask `reformulation_model` to distil a conversation into one final answer.
+
+    The prompt is a system message restating the task, a copy of every
+    non-empty message from `inner_messages` re-labelled as a user message, and
+    a closing instruction describing the answer format. The text after the last
+    "FINAL ANSWER: " marker in the reply is printed and returned, stripped.
+    """
+    system_text = f"""Earlier you were asked the following:
+
+{original_task}
+
+Your team then worked diligently to address that request. Read below a transcript of that conversation:"""
+    messages = [{"role": MessageRole.SYSTEM, "content": [{"type": "text", "text": system_text}]}]
+
+    # The first message just repeats the question, so remove it
+    # if len(inner_messages) > 1:
+    #     del inner_messages[0]
+
+    # Copy the transcript into this context. If the messages cannot be handled
+    # one by one, the whole transcript is attached as a single string instead.
+    try:
+        for original in inner_messages:
+            if original.get("content"):
+                relabelled = copy.deepcopy(original)
+                relabelled["role"] = MessageRole.USER
+                messages.append(relabelled)
+    except Exception:
+        messages.append({"role": MessageRole.ASSISTANT, "content": str(inner_messages)})
+
+    # ask for the final answer
+    closing_text = f"""
+Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience:
+
+{original_task}
+
+To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
+Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
+ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
+If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise.
+If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
+If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
+If you are unable to determine the final answer, output 'FINAL ANSWER: Unable to determine'
+"""
+    messages.append({"role": MessageRole.USER, "content": [{"type": "text", "text": closing_text}]})
+
+    response = reformulation_model(messages).content
+
+    # Keep whatever follows the last marker; with no marker the whole reply is kept
+    final_answer = response.split("FINAL ANSWER: ")[-1].strip()
+    print("> Reformulated answer: ", final_answer)
+
+    # if "unable to determine" in final_answer.lower():
+    #     messages.append({"role": MessageRole.ASSISTANT, "content": response })
+    #     messages.append({"role": MessageRole.USER, "content": [{"type": "text", "text": """
+    # I understand that a definitive answer could not be determined. Please make a well-informed EDUCATED GUESS based on the conversation.
+
+    # To output the educated guess, use the following template: EDUCATED GUESS: [YOUR EDUCATED GUESS]
+    # Your EDUCATED GUESS should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. DO NOT OUTPUT 'I don't know', 'Unable to determine', etc.
+    # ADDITIONALLY, your EDUCATED GUESS MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
+    # If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
+    # If you are asked for a string, don't use articles or abbreviations (e.g. cit for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
+    # If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
+    # """.strip()}]})
+
+    #     response = model(messages).content
+    #     print("\n>>>Making an educated guess.\n", response)
+    #     final_answer = response.split("EDUCATED GUESS: ")[-1].strip()
+    return final_answer
diff --git a/examples/open_deep_research/scripts/run_agents.py b/examples/open_deep_research/scripts/run_agents.py
new file mode 100644
index 000000000..37da8a40e
--- /dev/null
+++ b/examples/open_deep_research/scripts/run_agents.py
@@ -0,0 +1,87 @@
+import json
+import os
+import shutil
+import textwrap
+from pathlib import Path
+
+# import tqdm.asyncio
+from smolagents.utils import AgentError
+
+
+def serialize_agent_error(obj):
+    """Turn *obj* into a JSON-friendly value.
+
+    An ``AgentError`` becomes a dict with its class name and ``message``;
+    anything else is rendered with ``str``.
+    """
+    if not isinstance(obj, AgentError):
+        return str(obj)
+    return {"error_type": type(obj).__name__, "message": obj.message}
+
+
+def get_image_description(file_name: str, question: str, visual_inspection_tool) -> str:
+    """Ask the visual tool for a five-sentence caption of an image.
+
+    Args:
+        file_name: Path of the image, passed to the tool as ``image_path``.
+        question: The question the caption should help with (not answer).
+        visual_inspection_tool: Callable accepting ``image_path`` and ``question`` keywords.
+
+    Returns:
+        Whatever the visual tool returns for the captioning prompt.
+    """
+    caption_request = (
+        "Write a caption of 5 sentences for this image. Pay special attention to any details that might be useful for someone answering the following question:\n"
+        f"{question}. But do not try to answer the question directly!\n"
+        "Do not add any information that is not present in the image."
+    )
+    return visual_inspection_tool(image_path=file_name, question=caption_request)
+
+
+def get_document_description(file_path: str, question: str, document_inspection_tool) -> str:
+    """Ask the document tool for a five-sentence caption of a document.
+
+    Args:
+        file_path: Path of the document to describe.
+        question: The question the caption should help with (not answer).
+        document_inspection_tool: Object exposing ``forward_initial_exam_mode(file_path=..., question=...)``.
+
+    Returns:
+        Whatever ``forward_initial_exam_mode`` returns for the captioning prompt.
+    """
+    caption_request = (
+        "Write a caption of 5 sentences for this document. Pay special attention to any details that might be useful for someone answering the following question:\n"
+        f"{question}. But do not try to answer the question directly!\n"
+        "Do not add any information that is not present in the document."
+    )
+    return document_inspection_tool.forward_initial_exam_mode(file_path=file_path, question=caption_request)
+
+
+def get_single_file_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
+    """Build the prompt snippet describing one attached file.
+
+    Images get a caption from the visual tool. Documents get a caption either
+    from a sibling ``<same stem>.png`` rendering (when that file exists) or from
+    the document tool. Audio and unknown files are only listed by path.
+
+    Args:
+        file_path: Path of the attached file.
+        question: The question the description should help with.
+        visual_inspection_tool: Tool used to caption images.
+        document_inspection_tool: Tool used to caption documents.
+
+    Returns:
+        A string starting with `` - Attached ...`` describing the file.
+    """
+    # splitext only inspects the last path component, so "./runs.v2/report.pdf"
+    # yields ("./runs.v2/report", ".pdf") instead of being cut at the first dot.
+    base_path, dotted_extension = os.path.splitext(file_path)
+    file_extension = dotted_extension.lstrip(".").lower()
+
+    if file_extension in ("png", "jpg", "jpeg"):
+        file_description = f" - Attached image: {file_path}"
+        file_description += (
+            f"\n -> Image description: {get_image_description(file_path, question, visual_inspection_tool)}"
+        )
+        return file_description
+
+    if file_extension in ("pdf", "xls", "xlsx", "docx", "doc", "xml"):
+        file_description = f" - Attached document: {file_path}"
+        # Prefer a pre-rendered image of the document when one sits next to it.
+        image_path = base_path + ".png"
+        if os.path.exists(image_path):
+            description = get_image_description(image_path, question, visual_inspection_tool)
+        else:
+            description = get_document_description(file_path, question, document_inspection_tool)
+        file_description += f"\n -> File description: {description}"
+        return file_description
+
+    if file_extension in ("mp3", "m4a", "wav"):
+        return f" - Attached audio: {file_path}"
+
+    return f" - Attached file: {file_path}"
+
+
+def get_zip_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
+    """Unpack an archive next to itself and describe every file it contains.
+
+    Args:
+        file_path: Path of the archive (normally ending in ``.zip``).
+        question: The question the descriptions should help with.
+        visual_inspection_tool: Tool used to caption images.
+        document_inspection_tool: Tool used to caption documents.
+
+    Returns:
+        One indented description per extracted file, each preceded by a newline.
+    """
+    if file_path.endswith(".zip"):
+        # Strip only the trailing suffix; ".zip" elsewhere in the path must survive.
+        folder_path = file_path[: -len(".zip")]
+    else:
+        folder_path = file_path.replace(".zip", "")
+    os.makedirs(folder_path, exist_ok=True)
+    shutil.unpack_archive(file_path, folder_path)
+
+    descriptions = []
+    for root, _dirs, files in os.walk(folder_path):
+        for member_name in files:
+            member_path = os.path.join(root, member_name)
+            descriptions.append(
+                "\n"
+                + textwrap.indent(
+                    get_single_file_description(
+                        member_path, question, visual_inspection_tool, document_inspection_tool
+                    ),
+                    prefix=" ",
+                )
+            )
+    return "".join(descriptions)
+
+
+def get_tasks_to_run(data, total: int, base_filename: Path, tasks_ids: list[int]):
+    """Select the tasks that still have no recorded answer.
+
+    Answers are read from ``<stem>_answers.jsonl`` next to ``base_filename``.
+    Task ids are compared as integers on both sides, so an id stored as ``"3"``
+    in the answers file matches task ``3`` in ``data``.
+
+    Args:
+        data: Indexable collection of task records with a ``"task_id"`` entry.
+        total: Number of leading records of ``data`` to consider.
+        base_filename: Path whose parent and stem locate the answers file.
+        tasks_ids: Optional whitelist of task ids; ``None`` keeps every pending task.
+
+    Returns:
+        The pending task records, in their original order.
+    """
+    answers_file = base_filename.parent / f"{base_filename.stem}_answers.jsonl"
+    done = set()
+    if answers_file.exists():
+        with open(answers_file, encoding="utf-8") as fh:
+            # Normalise exactly like the ids read from `data` below.
+            done = {int(json.loads(line)["task_id"]) for line in fh if line.strip()}
+
+    wanted = None if tasks_ids is None else set(tasks_ids)
+
+    tasks = []
+    for i in range(total):
+        task_id = int(data[i]["task_id"])
+        if task_id in done:
+            continue
+        if wanted is None or task_id in wanted:
+            tasks.append(data[i])
+    return tasks
diff --git a/examples/open_deep_research/scripts/text_inspector_tool.py b/examples/open_deep_research/scripts/text_inspector_tool.py
new file mode 100644
index 000000000..09e7c1191
--- /dev/null
+++ b/examples/open_deep_research/scripts/text_inspector_tool.py
@@ -0,0 +1,122 @@
+from typing import Optional
+
+from smolagents import Tool
+from smolagents.models import MessageRole, Model
+
+from .mdconvert import MarkdownConverter
+
+
+class TextInspectorTool(Tool):
+    """Tool that converts a file to markdown text and optionally asks a model about it."""
+
+    name = "inspect_file_as_text"
+    description = """
+You cannot load files yourself: instead call this tool to read a file as markdown text and ask questions about it.
+This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES."""
+
+    inputs = {
+        "file_path": {
+            "description": "The path to the file you want to read as text. Must be a '.something' file, like '.pdf'. If it is an image, use the visualizer tool instead! DO NOT use this tool for an HTML webpage: use the web_search tool instead!",
+            "type": "string",
+        },
+        "question": {
+            "description": "[Optional]: Your question, as a natural language sentence. Provide as much context as possible. Do not pass this parameter if you just want to directly return the content of the file.",
+            "type": "string",
+            "nullable": True,
+        },
+    }
+    output_type = "string"
+    md_converter = MarkdownConverter()
+
+    def __init__(self, model: Model, text_limit: int):
+        """Store the model used for questions and the maximum number of characters sent to it."""
+        super().__init__()
+        self.model = model
+        self.text_limit = text_limit
+
+    def forward_initial_exam_mode(self, file_path, question):
+        """Return the file text, or a model-written caption oriented towards *question*.
+
+        Archives and calls without a question return the raw converted text; texts
+        shorter than 4000 characters are returned prefixed with "Document content: ".
+
+        Raises:
+            Exception: if ``file_path`` looks like an image.
+        """
+        # Reject images before spending time on (or failing inside) the conversion.
+        if file_path.lower().endswith((".png", ".jpg", ".jpeg")):
+            raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")
+
+        result = self.md_converter.convert(file_path)
+
+        if ".zip" in file_path:
+            return result.text_content
+
+        if not question:
+            return result.text_content
+
+        if len(result.text_content) < 4000:
+            return "Document content: " + result.text_content
+
+        messages = [
+            {
+                "role": MessageRole.SYSTEM,
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "Here is a file:\n### "
+                        + str(result.title)
+                        + "\n\n"
+                        + result.text_content[: self.text_limit],
+                    }
+                ],
+            },
+            {
+                "role": MessageRole.USER,
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "Now please write a short, 5 sentence caption for this document, that could help someone asking this question: "
+                        + question
+                        + "\n\nDon't answer the question yourself! Just provide useful notes on the document",
+                    }
+                ],
+            },
+        ]
+        return self.model(messages).content
+
+    def forward(self, file_path, question: Optional[str] = None) -> str:
+        """Return the file text, or the model's answer to *question* about the file.
+
+        Archives and calls without a question return the raw converted text.
+
+        Raises:
+            Exception: if ``file_path`` looks like an image.
+        """
+        # Reject images before spending time on (or failing inside) the conversion.
+        if file_path.lower().endswith((".png", ".jpg", ".jpeg")):
+            raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")
+
+        result = self.md_converter.convert(file_path)
+
+        if ".zip" in file_path:
+            return result.text_content
+
+        if not question:
+            return result.text_content
+
+        messages = [
+            {
+                "role": MessageRole.SYSTEM,
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "You will have to write a short caption for this file, then answer this question:"
+                        + question,
+                    }
+                ],
+            },
+            {
+                "role": MessageRole.USER,
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "Here is the complete file:\n### "
+                        + str(result.title)
+                        + "\n\n"
+                        + result.text_content[: self.text_limit],
+                    }
+                ],
+            },
+            {
+                "role": MessageRole.USER,
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "Now answer the question below. Use these three headings: '1. Short answer', '2. Extremely detailed answer', '3. Additional Context on the document and question asked'."
+                        + question,
+                    }
+                ],
+            },
+        ]
+        return self.model(messages).content
diff --git a/examples/open_deep_research/scripts/text_web_browser.py b/examples/open_deep_research/scripts/text_web_browser.py
new file mode 100644
index 000000000..4a95a6a3d
--- /dev/null
+++ b/examples/open_deep_research/scripts/text_web_browser.py
@@ -0,0 +1,563 @@
+# Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
+# https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
+import mimetypes
+import os
+import pathlib
+import re
+import time
+import uuid
+from typing import Any, Dict, List, Optional, Tuple, Union
+from urllib.parse import unquote, urljoin, urlparse
+
+import pathvalidate
+import requests
+from serpapi import GoogleSearch
+
+from smolagents import Tool
+
+from .cookies import COOKIES
+from .mdconvert import FileConversionException, MarkdownConverter, UnsupportedFormatException
+
+
+class SimpleTextBrowser:
+ """(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use."""
+
+    def __init__(
+        self,
+        start_page: Optional[str] = None,
+        viewport_size: Optional[int] = 1024 * 8,
+        downloads_folder: Optional[Union[str, None]] = None,
+        serpapi_key: Optional[Union[str, None]] = None,
+        request_kwargs: Optional[Union[Dict[str, Any], None]] = None,
+    ):
+        """Create the browser and load the start page.
+
+        Args:
+            start_page: Address to open first; defaults to ``about:blank``.
+            viewport_size: Approximate number of characters shown per viewport page.
+            downloads_folder: Directory where non-text responses are saved.
+            serpapi_key: API key used for ``google:`` searches.
+            request_kwargs: Extra keyword arguments for ``requests.get``; copied,
+                then extended with the shared ``COOKIES``.
+        """
+        self.start_page: str = start_page if start_page else "about:blank"
+        self.viewport_size = viewport_size  # Applies only to the standard uri types
+        self.downloads_folder = downloads_folder
+        self.history: List[Tuple[str, float]] = list()
+        self.page_title: Optional[str] = None
+        self.viewport_current_page = 0
+        self.viewport_pages: List[Tuple[int, int]] = list()
+        self.serpapi_key = serpapi_key
+        # Copy so the caller's dict is left untouched, and accept the default None.
+        self.request_kwargs: Dict[str, Any] = dict(request_kwargs) if request_kwargs is not None else {}
+        self.request_kwargs["cookies"] = COOKIES
+        self._mdconvert = MarkdownConverter()
+        self._page_content: str = ""
+
+        self._find_on_page_query: Union[str, None] = None
+        self._find_on_page_last_result: Union[int, None] = None  # Location of the last result
+
+        # Load the start page last: fetching needs request_kwargs, the converter and
+        # the search key, and the loaded content must not be reset afterwards.
+        self.set_address(self.start_page)
+
+    @property
+    def address(self) -> str:
+        """Address of the page currently shown (the newest history entry)."""
+        current_uri, _visited_at = self.history[-1]
+        return current_uri
+
+    def set_address(self, uri_or_path: str, filter_year: Optional[int] = None) -> None:
+        """Navigate to *uri_or_path*, record it in the history and reset the view.
+
+        ``about:blank`` clears the page, ``google:<query>`` runs a search, and
+        anything else is fetched. An address without an ``http:``, ``https:`` or
+        ``file:`` scheme is resolved against the previously visited address.
+
+        Args:
+            uri_or_path: Target address, special URI, or relative path.
+            filter_year: Optional year restriction, used only for searches.
+        """
+        # TODO: Handle anchors
+        self.history.append((uri_or_path, time.time()))
+
+        # Handle special URIs
+        if uri_or_path == "about:blank":
+            self._set_page_content("")
+        elif uri_or_path.startswith("google:"):
+            self._serpapi_search(uri_or_path[len("google:") :].strip(), filter_year=filter_year)
+        else:
+            if not uri_or_path.startswith(("http:", "https:", "file:")):
+                if len(self.history) > 1:
+                    prior_address = self.history[-2][0]
+                    uri_or_path = urljoin(prior_address, uri_or_path)
+                    # Update the address with the fully-qualified path
+                    self.history[-1] = (uri_or_path, self.history[-1][1])
+            self._fetch_page(uri_or_path)
+
+        self.viewport_current_page = 0
+        # Reset the attributes find_on_page/find_next actually read, so a search
+        # started on the previous page does not carry over to this one.
+        self._find_on_page_query = None
+        self._find_on_page_last_result = None
+
+    @property
+    def viewport(self) -> str:
+        """Text of the viewport page currently selected."""
+        start, end = self.viewport_pages[self.viewport_current_page]
+        return self.page_content[start:end]
+
+    @property
+    def page_content(self) -> str:
+        """Entire text of the current page, across all viewport pages."""
+        return self._page_content
+
+    def _set_page_content(self, content: str) -> None:
+        """Replace the page text, re-paginate it, and clamp the current page index."""
+        self._page_content = content
+        self._split_pages()
+        last_page = len(self.viewport_pages) - 1
+        if self.viewport_current_page > last_page:
+            self.viewport_current_page = last_page
+
+    def page_down(self) -> None:
+        """Advance one viewport page, stopping at the last page."""
+        last_page = len(self.viewport_pages) - 1
+        next_page = self.viewport_current_page + 1
+        self.viewport_current_page = next_page if next_page < last_page else last_page
+
+    def page_up(self) -> None:
+        """Go back one viewport page, stopping at the first page."""
+        previous_page = self.viewport_current_page - 1
+        self.viewport_current_page = previous_page if previous_page > 0 else 0
+
+    def find_on_page(self, query: str) -> Union[str, None]:
+        """Jump to the first viewport page matching *query*, searching forward with wrap-around.
+
+        Returns the matching viewport text, or ``None`` when nothing matches.
+        """
+        # Repeating the same query while still parked on its last hit means "next match".
+        is_repeat = query == self._find_on_page_query and self.viewport_current_page == self._find_on_page_last_result
+        if is_repeat:
+            return self.find_next()
+
+        # Otherwise this is a fresh search starting at the current viewport page.
+        self._find_on_page_query = query
+        match_index = self._find_next_viewport(query, self.viewport_current_page)
+        self._find_on_page_last_result = match_index
+        if match_index is None:
+            return None
+        self.viewport_current_page = match_index
+        return self.viewport
+
+    def find_next(self) -> Union[str, None]:
+        """Move to the next viewport page matching the remembered query.
+
+        Returns the matching viewport text, or ``None`` when there is no
+        remembered query or no page matches.
+        """
+        if self._find_on_page_query is None:
+            return None
+
+        # Resume just after the previous hit, wrapping to the first page at the end.
+        previous_hit = self._find_on_page_last_result
+        start = 0 if previous_hit is None else previous_hit + 1
+        if start >= len(self.viewport_pages):
+            start = 0
+
+        match_index = self._find_next_viewport(self._find_on_page_query, start)
+        self._find_on_page_last_result = match_index
+        if match_index is None:
+            return None
+        self.viewport_current_page = match_index
+        return self.viewport
+
+    def _find_next_viewport(self, query: str, starting_viewport: int) -> Union[int, None]:
+        """Return the index of the first viewport page matching *query*, or ``None``.
+
+        Pages are scanned from ``starting_viewport`` to the end, then from the
+        first page up to ``starting_viewport``.
+        """
+        if query is None:
+            return None
+
+        # Normalize the query into a lowercase regex: words separated by single
+        # spaces, with '*' acting as a wildcard.
+        starred = re.sub(r"\*", "__STAR__", query)
+        words = " ".join(re.split(r"\W+", starred)).strip()
+        pattern = f" {words} "
+        pattern = pattern.replace(" __STAR__ ", "__STAR__ ")  # Merge isolated stars with prior word
+        pattern = pattern.replace("__STAR__", ".*").lower()
+
+        if pattern.strip() == "":
+            return None
+
+        page_count = len(self.viewport_pages)
+        search_order = [*range(starting_viewport, page_count), *range(0, starting_viewport)]
+
+        for index in search_order:
+            start, end = self.viewport_pages[index]
+            page_text = self.page_content[start:end]
+
+            # TODO: Remove markdown links and images
+            normalized = " " + " ".join(re.split(r"\W+", page_text)).strip().lower() + " "
+            if re.search(pattern, normalized):
+                return index
+
+        return None
+
+    def visit_page(self, path_or_uri: str, filter_year: Optional[int] = None) -> str:
+        """Navigate to *path_or_uri* and return the text of the resulting viewport."""
+        self.set_address(path_or_uri, filter_year=filter_year)
+        return self.viewport
+
+    def _split_pages(self) -> None:
+        """Recompute ``viewport_pages`` as (start, end) character ranges of the page text."""
+        text = self._page_content
+        total = len(text)
+
+        # Search results are always shown as a single page.
+        if self.address.startswith("google:"):
+            self.viewport_pages = [(0, total)]
+            return
+
+        # An empty page still gets one (empty) viewport.
+        if total == 0:
+            self.viewport_pages = [(0, 0)]
+            return
+
+        self.viewport_pages = pages = []
+        start = 0
+        while start < total:
+            end = min(start + self.viewport_size, total)  # type: ignore[operator]
+            # Extend the page until it ends on whitespace, so words are not cut in half.
+            while end < total and text[end - 1] not in " \t\r\n":
+                end += 1
+            pages.append((start, end))
+            start = end
+
+    def _serpapi_search(self, query: str, filter_year: Optional[int] = None) -> None:
+        """Run a Google search through SerpAPI and render the organic results as the page.
+
+        Args:
+            query: Search terms.
+            filter_year: When given, restrict results to that calendar year.
+
+        Raises:
+            ValueError: if no SerpAPI key was configured.
+            Exception: if the response has no ``organic_results`` entry.
+        """
+        if self.serpapi_key is None:
+            raise ValueError("Missing SerpAPI key.")
+
+        params = {"engine": "google", "q": query, "api_key": self.serpapi_key}
+        if filter_year is not None:
+            params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}"
+
+        results = GoogleSearch(params).get_dict()
+        self.page_title = f"{query} - Search"
+        if "organic_results" not in results.keys():
+            raise Exception(f"No results found for query: '{query}'. Use a less specific query.")
+
+        organic_results = results["organic_results"]
+        if len(organic_results) == 0:
+            year_filter_message = f" with filter year={filter_year}" if filter_year is not None else ""
+            self._set_page_content(
+                f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter."
+            )
+            return
+
+        def _prev_visit(url):
+            # Note for the newest history entry with this exact address, if any.
+            for visited, visited_at in reversed(self.history):
+                if visited == url:
+                    return f"You previously visited this page {round(time.time() - visited_at)} seconds ago.\n"
+            return ""
+
+        web_snippets: List[str] = []
+        for idx, page in enumerate(organic_results, start=1):
+            date_published = ("\nDate published: " + page["date"]) if "date" in page else ""
+            source = ("\nSource: " + page["source"]) if "source" in page else ""
+            snippet = ("\n" + page["snippet"]) if "snippet" in page else ""
+
+            redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{_prev_visit(page['link'])}{snippet}"
+            web_snippets.append(redacted_version.replace("Your browser can't play this video.", ""))
+
+        content = (
+            f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n"
+            + "\n\n".join(web_snippets)
+        )
+
+        self._set_page_content(content)
+
+    def _fetch_page(self, url: str) -> None:
+        """Load *url* into the browser.
+
+        ``file://`` URLs are converted to text locally. Other URLs are requested
+        over HTTP: text responses are converted and shown, anything else is saved
+        into ``downloads_folder`` and then opened through its ``file://`` URI.
+        Failures are rendered as page content rather than raised for the
+        exception types handled below.
+        """
+        download_path = ""
+        response = None  # stays None when the request itself never produced a response
+        try:
+            if url.startswith("file://"):
+                download_path = os.path.normcase(os.path.normpath(unquote(url[7:])))
+                res = self._mdconvert.convert_local(download_path)
+                self.page_title = res.title
+                self._set_page_content(res.text_content)
+            else:
+                # Prepare the request parameters
+                request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}
+                request_kwargs["stream"] = True
+
+                # Send a HTTP request to the URL
+                response = requests.get(url, **request_kwargs)
+                response.raise_for_status()
+
+                # If the HTTP request was successful
+                content_type = response.headers.get("content-type", "")
+
+                # Text or HTML
+                if "text/" in content_type.lower():
+                    res = self._mdconvert.convert_response(response)
+                    self.page_title = res.title
+                    self._set_page_content(res.text_content)
+                # A download
+                else:
+                    # Try producing a safe filename from the URL path
+                    fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip()
+
+                    # No suitable name (e.g. the URL path ends with "/"), so make one
+                    if not fname:
+                        # Drop parameters such as "; charset=..." before the lookup
+                        extension = mimetypes.guess_extension(content_type.split(";", 1)[0].strip())
+                        if extension is None:
+                            extension = ".download"
+                        fname = str(uuid.uuid4()) + extension
+
+                    download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
+
+                    # Avoid overwriting an earlier download with the same name
+                    base, ext = os.path.splitext(fname)
+                    suffix = 0
+                    while os.path.exists(download_path) and suffix < 1000:
+                        suffix += 1
+                        new_fname = f"{base}__{suffix}{ext}"
+                        download_path = os.path.abspath(os.path.join(self.downloads_folder, new_fname))
+
+                    # Open a file for writing
+                    with open(download_path, "wb") as fh:
+                        for chunk in response.iter_content(chunk_size=512):
+                            fh.write(chunk)
+
+                    # Render it
+                    local_uri = pathlib.Path(download_path).as_uri()
+                    self.set_address(local_uri)
+
+        except (UnsupportedFormatException, FileConversionException) as e:
+            # The file was saved but cannot be rendered as text: report where it is.
+            print(e)
+            self.page_title = "Download complete."
+            self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
+        except FileNotFoundError:
+            self.page_title = "Error 404"
+            self._set_page_content(f"## Error 404\n\nFile not found: {download_path}")
+        except requests.exceptions.RequestException as request_exception:
+            if response is None:
+                # The request failed before any response was received.
+                self.page_title = "Error"
+                self._set_page_content(f"## Error\n\n{str(request_exception)}")
+                return
+
+            self.page_title = f"Error {response.status_code}"
+
+            # If the error was rendered in HTML we might as well render it
+            content_type = response.headers.get("content-type", "")
+            if content_type is not None and "text/html" in content_type.lower():
+                res = self._mdconvert.convert(response)
+                self._set_page_content(f"## Error {response.status_code}\n\n{res.text_content}")
+            else:
+                # iter_content may still yield bytes when no encoding is known
+                text = "".join(
+                    chunk if isinstance(chunk, str) else chunk.decode("utf-8", errors="replace")
+                    for chunk in response.iter_content(chunk_size=512, decode_unicode=True)
+                )
+                self._set_page_content(f"## Error {response.status_code}\n\n{text}")
+
+    def _state(self) -> Tuple[str, str]:
+        """Return ``(header, viewport_text)`` describing what the browser currently shows.
+
+        The header lists the address, the title when set, a note when the address
+        appears earlier in the history, and the viewport position.
+        """
+        address = self.address
+        header_lines = [f"Address: {address}\n"]
+        if self.page_title is not None:
+            header_lines.append(f"Title: {self.page_title}\n")
+
+        # Walk the history backwards, skipping the current (last) entry.
+        for visited, visited_at in reversed(self.history[:-1]):
+            if visited == address:
+                header_lines.append(
+                    f"You previously visited this page {round(time.time() - visited_at)} seconds ago.\n"
+                )
+                break
+
+        current_page = self.viewport_current_page
+        total_pages = len(self.viewport_pages)
+        header_lines.append(f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n")
+        return ("".join(header_lines), self.viewport)
+
+
+class SearchInformationTool(Tool):
+    """Tool that runs a web search through the shared text browser."""
+
+    name = "web_search"
+    description = "Perform a web search query (think a google search) and returns the search results."
+    inputs = {
+        "query": {"type": "string", "description": "The web search query to perform."},
+        "filter_year": {
+            "type": "string",
+            "description": "[Optional parameter]: filter the search results to only include pages from a specific year. For example, '2020' will only include pages from 2020. Make sure to use this parameter if you're trying to search for articles from a specific date!",
+            "nullable": True,
+        },
+    }
+    output_type = "string"
+
+    def __init__(self, browser):
+        """Keep a reference to the browser that performs the search."""
+        super().__init__()
+        self.browser = browser
+
+    def forward(self, query: str, filter_year: Optional[int] = None) -> str:
+        """Search for *query* and return the browser header followed by the results."""
+        self.browser.visit_page(f"google: {query}", filter_year=filter_year)
+        header, content = self.browser._state()
+        return "\n=======================\n".join((header.strip(), content))
+
+
+class VisitTool(Tool):
+    """Tool that opens a URL in the shared text browser."""
+
+    name = "visit_page"
+    description = "Visit a webpage at a given URL and return its text. Given a url to a YouTube video, this returns the transcript."
+    inputs = {
+        "url": {
+            "type": "string",
+            "description": "The relative or absolute url of the webapge to visit.",
+        }
+    }
+    output_type = "string"
+
+    def __init__(self, browser):
+        """Keep a reference to the browser used for navigation."""
+        super().__init__()
+        self.browser = browser
+
+    def forward(self, url: str) -> str:
+        """Open *url* and return the browser header followed by the first viewport."""
+        self.browser.visit_page(url)
+        header, content = self.browser._state()
+        return "\n=======================\n".join((header.strip(), content))
+
+
+class DownloadTool(Tool):
+    """Tool that downloads a binary file into ``./downloads``."""
+
+    name = "download_file"
+    description = """
+Download a file at a given URL. The file should be of this format: [".xlsx", ".pptx", ".wav", ".mp3", ".png", ".docx"]
+After using this tool, for further inspection of this page you should return the download path to your manager via final_answer, and they will be able to inspect it.
+DO NOT use this tool for .pdf or .txt or .htm files: for these types of files use visit_page with the file url instead."""
+    inputs = {"url": {"type": "string", "description": "The relative or absolute url of the file to be downloaded."}}
+    output_type = "string"
+
+    def __init__(self, browser):
+        """Keep a reference to the shared browser."""
+        super().__init__()
+        self.browser = browser
+
+    def forward(self, url: str) -> str:
+        """Download *url* and return a message naming the saved path.
+
+        Raises:
+            Exception: if the response looks like a pdf, txt or html file, which
+                should be opened with ``visit_page`` instead.
+        """
+        if "arxiv" in url:
+            # Only rewrite the path segment, not every "abs" substring of the URL.
+            url = url.replace("/abs/", "/pdf/")
+        response = requests.get(url)
+        content_type = response.headers.get("content-type", "")
+        # Drop parameters such as "; charset=utf-8" so the MIME lookup can succeed.
+        extension = mimetypes.guess_extension(content_type.split(";", 1)[0].strip()) or ""
+
+        # Reject unsupported types before anything is written to disk.
+        if "pdf" in extension or "txt" in extension or "htm" in extension:
+            raise Exception("Do not use this tool for pdf or txt or html files: use visit_page instead.")
+
+        new_path = f"./downloads/file{extension}" if extension else "./downloads/file.object"
+        os.makedirs(os.path.dirname(new_path), exist_ok=True)
+
+        with open(new_path, "wb") as f:
+            f.write(response.content)
+
+        return f"File was downloaded and saved under path {new_path}."
+
+
+class ArchiveSearchTool(Tool):
+    """Tool that opens the Wayback Machine snapshot of a URL closest to a date."""
+
+    name = "find_archived_url"
+    description = "Given a url, searches the Wayback Machine and returns the archived version of the url that's closest in time to the desired date."
+    inputs = {
+        "url": {"type": "string", "description": "The url you need the archive for."},
+        "date": {
+            "type": "string",
+            "description": "The date that you want to find the archive for. Give this date in the format 'YYYYMMDD', for instance '27 June 2008' is written as '20080627'.",
+        },
+    }
+    output_type = "string"
+
+    def __init__(self, browser):
+        """Keep a reference to the browser used to open the snapshot."""
+        super().__init__()
+        self.browser = browser
+
+    def forward(self, url, date) -> str:
+        """Look up the snapshot of *url* nearest *date*, open it, and return its content.
+
+        Raises:
+            Exception: if neither the dated nor the undated lookup reports a snapshot.
+        """
+        no_timestamp_url = f"https://archive.org/wayback/available?url={url}"
+        archive_url = no_timestamp_url + f"&timestamp={date}"
+
+        snapshots = requests.get(archive_url).json().get("archived_snapshots", {})
+        if "closest" not in snapshots:
+            # Fall back to the undated lookup only when the dated one found nothing.
+            snapshots = requests.get(no_timestamp_url).json().get("archived_snapshots", {})
+        if "closest" not in snapshots:
+            raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.")
+
+        closest = snapshots["closest"]
+        print("Archive found!", closest)
+
+        target_url = closest["url"]
+        self.browser.visit_page(target_url)
+        header, content = self.browser._state()
+        return (
+            f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n"
+            + header.strip()
+            + "\n=======================\n"
+            + content
+        )
+
+
+class PageUpTool(Tool):
+    """Tool that scrolls the shared browser up by one viewport page."""
+
+    name = "page_up"
+    description = "Scroll the viewport UP one page-length in the current webpage and return the new viewport content."
+    inputs = {}
+    output_type = "string"
+
+    def __init__(self, browser):
+        """Keep a reference to the browser being scrolled."""
+        super().__init__()
+        self.browser = browser
+
+    def forward(self) -> str:
+        """Scroll up and return the browser header followed by the new viewport."""
+        self.browser.page_up()
+        header, content = self.browser._state()
+        return "\n=======================\n".join((header.strip(), content))
+
+
+class PageDownTool(Tool):
+    """Tool that scrolls the shared browser down by one viewport page."""
+
+    name = "page_down"
+    description = "Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content."
+    inputs = {}
+    output_type = "string"
+
+    def __init__(self, browser):
+        """Keep a reference to the browser being scrolled."""
+        super().__init__()
+        self.browser = browser
+
+    def forward(self) -> str:
+        """Scroll down and return the browser header followed by the new viewport."""
+        self.browser.page_down()
+        header, content = self.browser._state()
+        return "\n=======================\n".join((header.strip(), content))
+
+
+class FinderTool(Tool):
+    """Tool that searches the current page for a string, like Ctrl+F."""
+
+    name = "find_on_page_ctrl_f"
+    description = "Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F."
+    inputs = {
+        "search_string": {
+            "type": "string",
+            "description": "The string to search for on the page. This search string supports wildcards like '*'",
+        }
+    }
+    output_type = "string"
+
+    def __init__(self, browser):
+        """Keep a reference to the browser whose page is searched."""
+        super().__init__()
+        self.browser = browser
+
+    def forward(self, search_string: str) -> str:
+        """Search for *search_string*; return the matching viewport or a not-found note."""
+        find_result = self.browser.find_on_page(search_string)
+        header, content = self.browser._state()
+
+        if find_result is not None:
+            return header.strip() + "\n=======================\n" + content
+        return (
+            header.strip()
+            + f"\n=======================\nThe search string '{search_string}' was not found on this page."
+        )
+
+
+class FindNextTool(Tool):
+    """Tool that jumps to the next match of the current page search."""
+
+    name = "find_next"
+    description = "Scroll the viewport to next occurrence of the search string. This is equivalent to finding the next match in a Ctrl+F search."
+    inputs = {}
+    output_type = "string"
+
+    def __init__(self, browser):
+        """Keep a reference to the browser whose page is searched."""
+        super().__init__()
+        self.browser = browser
+
+    def forward(self) -> str:
+        """Move to the next match; return that viewport or a not-found note."""
+        find_result = self.browser.find_next()
+        header, content = self.browser._state()
+
+        if find_result is not None:
+            return header.strip() + "\n=======================\n" + content
+        return header.strip() + "\n=======================\nThe search string was not found on this page."
diff --git a/examples/open_deep_research/scripts/visual_qa.py b/examples/open_deep_research/scripts/visual_qa.py
new file mode 100644
index 000000000..84d240b66
--- /dev/null
+++ b/examples/open_deep_research/scripts/visual_qa.py
@@ -0,0 +1,187 @@
+import base64
+import json
+import mimetypes
+import os
+import uuid
+from io import BytesIO
+from typing import Optional
+
+import requests
+from dotenv import load_dotenv
+from huggingface_hub import InferenceClient
+from PIL import Image
+from transformers import AutoProcessor
+
+from smolagents import Tool, tool
+
+
+# Load variables from a local .env file; override=True lets them replace values
+# already present in the environment.
+load_dotenv(override=True)
+
+# NOTE(review): created at import time, so importing this module loads the
+# processor (presumably fetching it on first use; confirm). It is used by
+# process_images_and_text to apply the chat template.
+idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty")
+
+
+def process_images_and_text(image_path, query, client):
+    """Send one local image plus a text query to the inference endpoint.
+
+    Args:
+        image_path: Path of a local image file.
+        query: Text prompt to accompany the image.
+        client: Object whose ``post(json=...)`` returns the raw response bytes.
+
+    Returns:
+        The first element of the decoded JSON response.
+    """
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": query},
+            ],
+        },
+    ]
+
+    prompt_with_template = idefics_processor.apply_chat_template(messages, add_generation_prompt=True)
+
+    # encode images to strings which can be sent to the endpoint
+    def encode_local_image(image_path):
+        # Re-encode as JPEG in memory; the context manager closes the source file.
+        with Image.open(image_path) as image:
+            buffer = BytesIO()
+            image.convert("RGB").save(buffer, format="JPEG")
+        base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
+
+        # add string formatting required by the endpoint
+        return f"data:image/jpeg;base64,{base64_image}"
+
+    image_string = encode_local_image(image_path)
+    # NOTE(review): assumes the chat template marks the image slot with "<image>"
+    # and that the endpoint expects a markdown image link there; confirm.
+    # Substituting directly (rather than via str.format) keeps braces in the query intact.
+    prompt_with_images = prompt_with_template.replace("<image>", f"![]({image_string}) ")
+
+    payload = {
+        "inputs": prompt_with_images,
+        "parameters": {
+            "return_full_text": False,
+            "max_new_tokens": 200,
+        },
+    }
+
+    return json.loads(client.post(json=payload).decode())[0]
+
+
+# Function to encode the image
+def encode_image(image_path):
+    """Return the base64 text of an image given as a local path or an http(s) URL.
+
+    A URL is first downloaded into the ``downloads`` directory (created when
+    missing) under a random name, then read back like a local file.
+
+    Raises:
+        requests.HTTPError: if the download answers with an error status.
+    """
+    if image_path.startswith("http"):
+        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
+        request_kwargs = {
+            "headers": {"User-Agent": user_agent},
+            "stream": True,
+        }
+
+        # Send a HTTP request to the URL
+        response = requests.get(image_path, **request_kwargs)
+        response.raise_for_status()
+        content_type = response.headers.get("content-type", "")
+
+        # Drop parameters such as "; charset=..." so the MIME lookup can succeed.
+        extension = mimetypes.guess_extension(content_type.split(";", 1)[0].strip())
+        if extension is None:
+            extension = ".download"
+
+        fname = str(uuid.uuid4()) + extension
+        download_path = os.path.abspath(os.path.join("downloads", fname))
+        os.makedirs(os.path.dirname(download_path), exist_ok=True)
+
+        with open(download_path, "wb") as fh:
+            for chunk in response.iter_content(chunk_size=512):
+                fh.write(chunk)
+
+        image_path = download_path
+
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+
+
+# Request headers for the OpenAI call made by `visualizer`; the API key is read
+# from the environment once, when this module is imported.
+headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"}
+
+
+def resize_image(image_path):
+    """Save a half-size copy of the image next to the original and return its path.
+
+    The copy is named ``resized_<original file name>`` in the same directory.
+    """
+    # Prefix the file name only; prefixing the whole path would point into a
+    # non-existent "resized_<dir>" directory for any path with a folder part.
+    directory, file_name = os.path.split(image_path)
+    new_image_path = os.path.join(directory, f"resized_{file_name}")
+
+    with Image.open(image_path) as img:
+        width, height = img.size
+        # Never shrink a dimension to zero pixels.
+        half_size = (max(1, width // 2), max(1, height // 2))
+        img.resize(half_size).save(new_image_path)
+    return new_image_path
+
+
+class VisualQATool(Tool):
+    """Tool that answers questions about an image through the inference client."""
+
+    name = "visualizer"
+    description = "A tool that can answer questions about attached images."
+    inputs = {
+        "image_path": {
+            "description": "The path to the image on which to answer the question",
+            "type": "string",
+        },
+        "question": {"description": "the question to answer", "type": "string", "nullable": True},
+    }
+    output_type = "string"
+
+    client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")
+
+    def forward(self, image_path: str, question: Optional[str] = None) -> str:
+        """Answer *question* about the image, or caption it when no question is given.
+
+        A failure whose message contains "Payload Too Large" triggers one retry
+        with a resized image; any other failure is printed and leaves the answer empty.
+        """
+        output = ""
+        add_note = not question
+        if add_note:
+            question = "Please write a detailed caption for this image."
+
+        try:
+            output = process_images_and_text(image_path, question, self.client)
+        except Exception as e:
+            print(e)
+            if "Payload Too Large" in str(e):
+                smaller_image_path = resize_image(image_path)
+                output = process_images_and_text(smaller_image_path, question, self.client)
+
+        if not add_note:
+            return output
+        return f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
+
+
+@tool
+def visualizer(image_path: str, question: Optional[str] = None) -> str:
+    """A tool that can answer questions about attached images.
+
+    Args:
+        image_path: The path to the image on which to answer the question. This should be a local path to downloaded image.
+        question: The question to answer.
+    """
+    # Validate the argument before doing anything else with it.
+    if not isinstance(image_path, str):
+        raise Exception("You should provide at least `image_path` string argument to this tool!")
+
+    add_note = False
+    if not question:
+        add_note = True
+        question = "Please write a detailed caption for this image."
+
+    mime_type, _ = mimetypes.guess_type(image_path)
+    if mime_type is None:
+        # Avoid emitting "data:None;base64,..." when the path has no known extension.
+        # NOTE(review): "image/jpeg" is a guess at a workable default; confirm.
+        mime_type = "image/jpeg"
+    base64_image = encode_image(image_path)
+
+    payload = {
+        "model": "gpt-4o",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": question},
+                    {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
+                ],
+            }
+        ],
+        "max_tokens": 1000,
+    }
+    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+    try:
+        output = response.json()["choices"][0]["message"]["content"]
+    except Exception as err:
+        # Report the raw body: parsing it again here could fail and hide the real problem.
+        raise Exception(f"Response format unexpected: {response.text}") from err
+
+    if add_note:
+        output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
+
+    return output
diff --git a/examples/open_deep_research/visual_vs_text_browser.ipynb b/examples/open_deep_research/visual_vs_text_browser.ipynb
new file mode 100644
index 000000000..2eece88a3
--- /dev/null
+++ b/examples/open_deep_research/visual_vs_text_browser.ipynb
@@ -0,0 +1,350 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install \"smolagents[litellm]\" -q"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import datasets\n",
+ "\n",
+ "\n",
+ "eval_ds = datasets.load_dataset(\"gaia-benchmark/GAIA\", \"2023_all\")[\"validation\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "to_keep = [\n",
+ " \"What's the last line of the rhyme under the flavor\",\n",
+ " 'Of the authors (First M. Last) that worked on the paper \"Pie Menus or Linear Menus',\n",
+ " \"In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.\",\n",
+ " \"Which contributor to the version of OpenCV where support was added for the Mask-RCNN model has the same name as a former Chinese head of government when the names are transliterated to the Latin alphabet?\",\n",
+ " \"The photograph in the Whitney Museum of American Art's collection with accession number 2022.128 shows a person holding a book. Which military unit did the author of this book join in 1813? Answer without using articles.\",\n",
+ " \"I went to Virtue restaurant & bar in Chicago for my birthday on March 22, 2021 and the main course I had was delicious! Unfortunately, when I went back about a month later on April 21, it was no longer on the dinner menu.\",\n",
+ " \"In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's \",\n",
+ " \"Under DDC 633 on Bielefeld University Library's BASE, as of 2020\",\n",
+ " \"In the 2018 VSCode blog post on replit.com, what was the command they clicked on in the last video to remove extra lines?\",\n",
+ " \"The Metropolitan Museum of Art has a portrait in its collection with an accession number of 29.100.5. Of the consecrators and co-consecrators\",\n",
+ " \"In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied?\",\n",
+ " 'In the year 2022, and before December, what does \"R\" stand for in the three core policies of the type of content',\n",
+ " \"Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?\",\n",
+ "]\n",
+ "eval_ds = eval_ds.filter(lambda row: any([el in row[\"Question\"] for el in to_keep]))\n",
+ "eval_ds = eval_ds.rename_columns({\"Question\": \"question\", \"Final answer\": \"true_answer\", \"Level\": \"task\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "from dotenv import load_dotenv\n",
+ "from huggingface_hub import login\n",
+ "\n",
+ "\n",
+ "load_dotenv(override=True)\n",
+ "\n",
+ "login(os.getenv(\"HF_TOKEN\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Text browser"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from scripts.run_agents import answer_questions\n",
+ "from scripts.text_inspector_tool import TextInspectorTool\n",
+ "from scripts.text_web_browser import (\n",
+ " ArchiveSearchTool,\n",
+ " FinderTool,\n",
+ " FindNextTool,\n",
+ " NavigationalSearchTool,\n",
+ " PageDownTool,\n",
+ " PageUpTool,\n",
+ " SearchInformationTool,\n",
+ " VisitTool,\n",
+ ")\n",
+ "from scripts.visual_qa import VisualQAGPT4Tool\n",
+ "\n",
+ "from smolagents import CodeAgent, LiteLLMModel\n",
+ "\n",
+ "\n",
+ "proprietary_model = LiteLLMModel(\"gpt-4o\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### BUILD AGENTS & TOOLS\n",
+ "\n",
+ "WEB_TOOLS = [\n",
+ " SearchInformationTool(),\n",
+ " NavigationalSearchTool(),\n",
+ " VisitTool(),\n",
+ " PageUpTool(),\n",
+ " PageDownTool(),\n",
+ " FinderTool(),\n",
+ " FindNextTool(),\n",
+ " ArchiveSearchTool(),\n",
+ "]\n",
+ "\n",
+ "\n",
+ "surfer_agent = CodeAgent(\n",
+ " model=proprietary_model,\n",
+ " tools=WEB_TOOLS,\n",
+ " max_steps=20,\n",
+ " verbosity_level=2,\n",
+ ")\n",
+ "\n",
+ "results_text = answer_questions(\n",
+ " eval_ds,\n",
+ " surfer_agent,\n",
+ " \"code_gpt4o_27-01_text\",\n",
+ " reformulation_model=proprietary_model,\n",
+ " output_folder=\"output_browsers\",\n",
+ " visual_inspection_tool=VisualQAGPT4Tool(),\n",
+ " text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Vision browser"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install helium -q"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from scripts.visual_qa import VisualQAGPT4Tool\n",
+ "\n",
+ "from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel\n",
+ "from smolagents.vision_web_browser import (\n",
+ " close_popups,\n",
+ " go_back,\n",
+ " helium_instructions,\n",
+ " initialize_agent,\n",
+ " save_screenshot,\n",
+ " search_item_ctrl_f,\n",
+ ")\n",
+ "\n",
+ "\n",
+ "proprietary_model = LiteLLMModel(\"gpt-4o\")\n",
+ "vision_browser_agent = initialize_agent(proprietary_model)\n",
+ "### BUILD AGENTS & TOOLS\n",
+ "\n",
+ "CodeAgent(\n",
+ " tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],\n",
+ " model=proprietary_model,\n",
+ " additional_authorized_imports=[\"helium\"],\n",
+ " step_callbacks=[save_screenshot],\n",
+ " max_steps=20,\n",
+ " verbosity_level=2,\n",
+ ")\n",
+ "\n",
+ "results_vision = answer_questions(\n",
+ " eval_ds,\n",
+ " vision_browser_agent,\n",
+ " \"code_gpt4o_27-01_vision\",\n",
+ " reformulation_model=proprietary_model,\n",
+ " output_folder=\"output_browsers\",\n",
+ " visual_inspection_tool=VisualQAGPT4Tool(),\n",
+ " text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n",
+ " postprompt=helium_instructions\n",
+ " + \"Any web browser controls won't work on .pdf urls, rather use the tool 'inspect_file_as_text' to read them\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Browser-use browser"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install browser-use lxml_html_clean -q\n",
+ "!playwright install"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import asyncio\n",
+ "\n",
+ "import nest_asyncio\n",
+ "\n",
+ "\n",
+ "nest_asyncio.apply()\n",
+ "\n",
+ "from browser_use import Agent\n",
+ "from dotenv import load_dotenv\n",
+ "from langchain_openai import ChatOpenAI\n",
+ "\n",
+ "\n",
+ "load_dotenv()\n",
+ "\n",
+ "\n",
+ "class BrowserUseAgent:\n",
+ " logs = []\n",
+ "\n",
+ " def write_inner_memory_from_logs(self, summary_mode):\n",
+ " return self.results\n",
+ "\n",
+ " def run(self, task, **kwargs):\n",
+ " agent = Agent(\n",
+ " task=task,\n",
+ " llm=ChatOpenAI(model=\"gpt-4o\"),\n",
+ " )\n",
+ " self.results = asyncio.get_event_loop().run_until_complete(agent.run())\n",
+ " return self.results.history[-1].result[0].extracted_content\n",
+ "\n",
+ "\n",
+ "browser_use_agent = BrowserUseAgent()\n",
+ "\n",
+ "results_browseruse = answer_questions(\n",
+ " eval_ds,\n",
+ " browser_use_agent,\n",
+ " \"gpt-4o_27-01_browseruse\",\n",
+ " reformulation_model=proprietary_model,\n",
+ " output_folder=\"output_browsers\",\n",
+ " visual_inspection_tool=VisualQAGPT4Tool(),\n",
+ " text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n",
+ " postprompt=\"\",\n",
+ " run_simple=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Get results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from scripts.gaia_scorer import question_scorer\n",
+ "\n",
+ "\n",
+ "results_vision, results_text, results_browseruse = (\n",
+ " pd.DataFrame(results_vision),\n",
+ " pd.DataFrame(results_text),\n",
+ " pd.DataFrame(results_browseruse),\n",
+ ")\n",
+ "\n",
+ "results_vision[\"is_correct\"] = results_vision.apply(\n",
+ " lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1\n",
+ ")\n",
+ "results_text[\"is_correct\"] = results_text.apply(lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1)\n",
+ "results_browseruse[\"is_correct\"] = results_browseruse.apply(\n",
+ " lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results = pd.concat([results_vision, results_text, results_browseruse])\n",
+ "results.groupby(\"agent_name\")[\"is_correct\"].mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "correct_vision_results = results_vision.loc[results_vision[\"is_correct\"]]\n",
+ "correct_vision_results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "false_text_results = results_text.loc[~results_text[\"is_correct\"]]\n",
+ "false_text_results"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "gaia",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py
index d73072f90..647721754 100644
--- a/src/smolagents/agents.py
+++ b/src/smolagents/agents.py
@@ -143,6 +143,7 @@ class MultiStepAgent:
name (`str`, *optional*): Necessary for a managed agent only - the name by which this agent can be called.
description (`str`, *optional*): Necessary for a managed agent only - the description of this agent.
managed_agent_prompt (`str`, *optional*): Custom prompt for the managed agent. Defaults to None.
+ provide_run_summary (`bool`, *optional*): Whether to provide a run summary when called as a managed agent. Defaults to False.
"""
def __init__(
@@ -162,6 +163,7 @@ def __init__(
name: Optional[str] = None,
description: Optional[str] = None,
managed_agent_prompt: Optional[str] = None,
+ provide_run_summary: bool = False,
):
if system_prompt is None:
system_prompt = CODE_SYSTEM_PROMPT
@@ -181,6 +183,7 @@ def __init__(
self.name = name
self.description = description
self.managed_agent_prompt = managed_agent_prompt if managed_agent_prompt else MANAGED_AGENT_PROMPT
+ self.provide_run_summary = provide_run_summary
self.managed_agents = {}
if managed_agents is not None:
@@ -356,7 +359,7 @@ def execute_tool_call(self, tool_name: str, arguments: Union[Dict[str, str], str
if tool_name in self.tools:
tool_description = get_tool_description_with_args(available_tools[tool_name])
error_msg = (
- f"Error in tool call execution: {e}\nYou should only use this tool with a correct input.\n"
+ f"Error in tool call execution: {type(e).__name__}: {e}\nYou should only use this tool with a correct input.\n"
f"As a reminder, this tool's description is the following:\n{tool_description}"
)
raise AgentExecutionError(error_msg, self.logger)
@@ -453,10 +456,10 @@ def _run(self, task: str, images: List[str] | None = None) -> Generator[ActionSt
observations_images=images,
)
try:
- if self.planning_interval is not None and self.step_number % self.planning_interval == 0:
+ if self.planning_interval is not None and self.step_number % self.planning_interval == 1:
self.planning_step(
task,
- is_first_step=(self.step_number == 0),
+ is_first_step=(self.step_number == 1),
step=self.step_number,
)
self.logger.log_rule(f"Step {self.step_number}", level=LogLevel.INFO)
@@ -651,21 +654,21 @@ def replay(self, detailed: bool = False):
"""
self.memory.replay(self.logger, detailed=detailed)
- def __call__(self, request, provide_run_summary=False, **kwargs):
- """Adds additional prompting for the managed agent, and runs it."""
+ def __call__(self, request: str, **kwargs):
+ """
+ This method is called only by a manager agent.
+ Adds additional prompting for the managed agent, runs it, and wraps the output.
+ """
full_task = self.managed_agent_prompt.format(name=self.name, task=request).strip()
output = self.run(full_task, **kwargs)
- if provide_run_summary:
- answer = f"Here is the final answer from your managed agent '{self.name}':\n"
- answer += str(output)
+ answer = f"Here is the final answer from your managed agent '{self.name}':\n{str(output)}"
+ if self.provide_run_summary:
answer += f"\n\nFor more detail, find below a summary of this agent's work:\nSUMMARY OF WORK FROM AGENT '{self.name}':\n"
for message in self.write_memory_to_messages(summary_mode=True):
content = message["content"]
answer += "\n" + truncate_content(str(content)) + "\n---"
answer += f"\nEND OF SUMMARY OF WORK FROM AGENT '{self.name}'."
- return answer
- else:
- return output
+ return answer
class ToolCallingAgent(MultiStepAgent):
@@ -925,8 +928,8 @@ def step(self, memory_step: ActionStep) -> Union[None, Any]:
]
observation = "Execution logs:\n" + execution_logs
except Exception as e:
- if hasattr(self.python_executor, "state") and "print_outputs" in self.python_executor.state:
- execution_logs = self.python_executor.state["print_outputs"]
+ if hasattr(self.python_executor, "state") and "_print_outputs" in self.python_executor.state:
+ execution_logs = str(self.python_executor.state["_print_outputs"])
if len(execution_logs) > 0:
execution_outputs_console = [
Text("Execution logs:", style="bold"),
diff --git a/src/smolagents/default_tools.py b/src/smolagents/default_tools.py
index d290e6f7b..a36775886 100644
--- a/src/smolagents/default_tools.py
+++ b/src/smolagents/default_tools.py
@@ -76,7 +76,7 @@ def forward(self, code: str) -> str:
authorized_imports=self.authorized_imports,
)[0] # The second element is boolean is_final_answer
)
- return f"Stdout:\n{state['print_outputs']}\nOutput: {output}"
+ return f"Stdout:\n{str(state['_print_outputs'])}\nOutput: {output}"
class FinalAnswerTool(Tool):
@@ -169,10 +169,10 @@ def forward(self, query: str, filter_year: Optional[int] = None) -> str:
if "organic_results" not in results.keys():
if filter_year is not None:
raise Exception(
- f"'organic_results' key not found for query: '{query}' with filtering on year={filter_year}. Use a less restrictive query or do not filter on year."
+ f"No results found for query: '{query}' with filtering on year={filter_year}. Use a less restrictive query or do not filter on year."
)
else:
- raise Exception(f"'organic_results' key not found for query: '{query}'. Use a less restrictive query.")
+ raise Exception(f"No results found for query: '{query}'. Use a less restrictive query.")
if len(results["organic_results"]) == 0:
year_filter_message = f" with filter year={filter_year}" if filter_year is not None else ""
return f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter."
diff --git a/src/smolagents/local_python_executor.py b/src/smolagents/local_python_executor.py
index dd4367b78..5efa620a7 100644
--- a/src/smolagents/local_python_executor.py
+++ b/src/smolagents/local_python_executor.py
@@ -50,8 +50,8 @@ class InterpreterError(ValueError):
if isinstance(getattr(builtins, name), type) and issubclass(getattr(builtins, name), BaseException)
}
-PRINT_OUTPUTS, DEFAULT_MAX_LEN_OUTPUT = "", 50000
-OPERATIONS_COUNT, MAX_OPERATIONS = 0, 10000000
+DEFAULT_MAX_LEN_OUTPUT = 50000
+MAX_OPERATIONS = 10000000
def custom_print(*args):
@@ -114,6 +114,32 @@ def custom_print(*args):
}
+class PrintContainer:
+ def __init__(self):
+ self.value = ""
+
+ def append(self, text):
+ self.value += text
+ return self
+
+ def __iadd__(self, other):
+ """Implements the += operator"""
+ self.value += str(other)
+ return self
+
+ def __str__(self):
+ """String representation"""
+ return self.value
+
+ def __repr__(self):
+ """Representation for debugging"""
+ return f"PrintContainer({self.value})"
+
+ def __len__(self):
+ """Implements len() function support"""
+ return len(self.value)
+
+
class BreakException(Exception):
pass
@@ -215,7 +241,7 @@ def evaluate_while(
custom_tools: Dict[str, Callable],
authorized_imports: List[str],
) -> None:
- max_iterations = 1000
+ max_iterations = 1000000
iterations = 0
while evaluate_ast(while_loop.test, state, static_tools, custom_tools, authorized_imports):
for node in while_loop.body:
@@ -603,10 +629,7 @@ def evaluate_call(
raise InterpreterError("super() takes at most 2 arguments")
else:
if func_name == "print":
- output = " ".join(map(str, args))
- global PRINT_OUTPUTS
- PRINT_OUTPUTS += output + "\n"
- # cap the number of lines
+ state["_print_outputs"] += " ".join(map(str, args)) + "\n"
return None
else: # Assume it's a callable object
if (
@@ -1090,6 +1113,42 @@ def evaluate_dictcomp(
return result
+def evaluate_delete(
+ delete_node: ast.Delete,
+ state: Dict[str, Any],
+ static_tools: Dict[str, Callable],
+ custom_tools: Dict[str, Callable],
+ authorized_imports: List[str],
+) -> None:
+ """
+ Evaluate a delete statement (del x, del x[y]).
+
+ Args:
+ delete_node: The AST Delete node to evaluate
+ state: The current state dictionary
+ static_tools: Dictionary of static tools
+ custom_tools: Dictionary of custom tools
+ authorized_imports: List of authorized imports
+ """
+ for target in delete_node.targets:
+ if isinstance(target, ast.Name):
+ # Handle simple variable deletion (del x)
+ if target.id in state:
+ del state[target.id]
+ else:
+ raise InterpreterError(f"Cannot delete name '{target.id}': name is not defined")
+ elif isinstance(target, ast.Subscript):
+ # Handle index/key deletion (del x[y])
+ obj = evaluate_ast(target.value, state, static_tools, custom_tools, authorized_imports)
+ index = evaluate_ast(target.slice, state, static_tools, custom_tools, authorized_imports)
+ try:
+ del obj[index]
+ except (TypeError, KeyError, IndexError) as e:
+ raise InterpreterError(f"Cannot delete index/key: {str(e)}")
+ else:
+ raise InterpreterError(f"Deletion of {type(target).__name__} targets is not supported")
+
+
def evaluate_ast(
expression: ast.AST,
state: Dict[str, Any],
@@ -1117,12 +1176,11 @@ def evaluate_ast(
The list of modules that can be imported by the code. By default, only a few safe modules are allowed.
If it contains "*", it will authorize any import. Use this at your own risk!
"""
- global OPERATIONS_COUNT
- if OPERATIONS_COUNT >= MAX_OPERATIONS:
+ if state["_operations_count"] >= MAX_OPERATIONS:
raise InterpreterError(
f"Reached the max number of operations of {MAX_OPERATIONS}. Maybe there is an infinite loop somewhere in the code, or you're just asking too many calculations."
)
- OPERATIONS_COUNT += 1
+ state["_operations_count"] += 1
if isinstance(expression, ast.Assign):
# Assignment -> we evaluate the assignment which should update the state
# We return the variable assigned as it may be used to determine the final result.
@@ -1241,6 +1299,8 @@ def evaluate_ast(
)
elif isinstance(expression, ast.Pass):
return None
+ elif isinstance(expression, ast.Delete):
+ return evaluate_delete(expression, state, static_tools, custom_tools, authorized_imports)
else:
# For now we refuse anything else. Let's add things as we need them.
raise InterpreterError(f"{expression.__class__.__name__} is not supported.")
@@ -1277,7 +1337,7 @@ def evaluate_python_code(
state (`Dict[str, Any]`):
A dictionary mapping variable names to values. The `state` should contain the initial inputs but will be
updated by this function to contain all variables as they are evaluated.
- The print outputs will be stored in the state under the key 'print_outputs'.
+ The print outputs will be stored in the state under the key "_print_outputs".
"""
try:
expression = ast.parse(code)
@@ -1294,10 +1354,8 @@ def evaluate_python_code(
static_tools = static_tools.copy() if static_tools is not None else {}
custom_tools = custom_tools if custom_tools is not None else {}
result = None
- global PRINT_OUTPUTS
- PRINT_OUTPUTS = ""
- global OPERATIONS_COUNT
- OPERATIONS_COUNT = 0
+ state["_print_outputs"] = PrintContainer()
+ state["_operations_count"] = 0
def final_answer(value):
raise FinalAnswerException(value)
@@ -1307,16 +1365,22 @@ def final_answer(value):
try:
for node in expression.body:
result = evaluate_ast(node, state, static_tools, custom_tools, authorized_imports)
- state["print_outputs"] = truncate_content(PRINT_OUTPUTS, max_length=max_print_outputs_length)
+ state["_print_outputs"].value = truncate_content(
+ str(state["_print_outputs"]), max_length=max_print_outputs_length
+ )
is_final_answer = False
return result, is_final_answer
except FinalAnswerException as e:
- state["print_outputs"] = truncate_content(PRINT_OUTPUTS, max_length=max_print_outputs_length)
+ state["_print_outputs"].value = truncate_content(
+ str(state["_print_outputs"]), max_length=max_print_outputs_length
+ )
is_final_answer = True
return e.value, is_final_answer
except Exception as e:
exception_type = type(e).__name__
- state["print_outputs"] = truncate_content(PRINT_OUTPUTS, max_length=max_print_outputs_length)
+ state["_print_outputs"].value = truncate_content(
+ str(state["_print_outputs"]), max_length=max_print_outputs_length
+ )
raise InterpreterError(
f"Code execution failed at line '{ast.get_source_segment(code, node)}' due to: {exception_type}:{str(e)}"
)
@@ -1353,7 +1417,7 @@ def __call__(self, code_action: str, additional_variables: Dict) -> Tuple[Any, s
authorized_imports=self.authorized_imports,
max_print_outputs_length=self.max_print_outputs_length,
)
- logs = self.state["print_outputs"]
+ logs = str(self.state["_print_outputs"])
return output, logs, is_final_answer
diff --git a/src/smolagents/models.py b/src/smolagents/models.py
index dc1c7c609..28de2eefa 100644
--- a/src/smolagents/models.py
+++ b/src/smolagents/models.py
@@ -347,6 +347,9 @@ class HfApiModel(Model):
If not provided, the class will try to use environment variable 'HF_TOKEN', else use the token stored in the Hugging Face CLI configuration.
timeout (`int`, *optional*, defaults to 120):
Timeout for the API request, in seconds.
+ custom_role_conversions (`dict[str, str]`, *optional*):
+ Custom role conversion mapping to convert message roles into others.
+ Useful for specific models that do not support specific message roles like "system".
**kwargs:
Additional keyword arguments to pass to the Hugging Face API.
@@ -374,6 +377,7 @@ def __init__(
provider: Optional[str] = None,
token: Optional[str] = None,
timeout: Optional[int] = 120,
+ custom_role_conversions: Optional[Dict[str, str]] = None,
**kwargs,
):
super().__init__(**kwargs)
@@ -382,6 +386,7 @@ def __init__(
if token is None:
token = os.getenv("HF_TOKEN")
self.client = InferenceClient(self.model_id, provider=provider, token=token, timeout=timeout)
+ self.custom_role_conversions = custom_role_conversions
def __call__(
self,
@@ -397,9 +402,9 @@ def __call__(
grammar=grammar,
tools_to_call_from=tools_to_call_from,
convert_images_to_image_urls=True,
+ custom_role_conversions=self.custom_role_conversions,
**kwargs,
)
-
response = self.client.chat_completion(**completion_kwargs)
self.last_input_token_count = response.usage.prompt_tokens
diff --git a/src/smolagents/prompts.py b/src/smolagents/prompts.py
index 7d05be723..b3686e946 100644
--- a/src/smolagents/prompts.py
+++ b/src/smolagents/prompts.py
@@ -351,7 +351,8 @@
print("Pope age as per google search:", pope_age_search)
```
Observation:
-Pope age: "The pope Francis is currently 88 years old."
+Pope age as per wikipedia: "The pope Francis is currently 88 years old."
+Pope age as per google search: "The current pope, Francis, just turned 88."
Thought: I know that the pope is 88 years old. Let's compute the result using python code.
Code:
@@ -501,7 +502,7 @@
Task:
{task}
---
-You're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible to give them a clear understanding of the answer.
+You're helping your manager solve a wider task: so do not just provide a one-line answer, instead give as much information as possible to give them a clear understanding of the answer.
Your final_answer WILL HAVE to contain these parts:
### 1. Task outcome (short version):
diff --git a/tests/test_python_interpreter.py b/tests/test_python_interpreter.py
index ad8b99d41..3e95711f6 100644
--- a/tests/test_python_interpreter.py
+++ b/tests/test_python_interpreter.py
@@ -35,19 +35,25 @@ def add_two(x):
class PythonInterpreterTester(unittest.TestCase):
+ def assertDictEqualNoPrint(self, dict1, dict2):
+ return self.assertDictEqual(
+ {k: v for k, v in dict1.items() if k != "_print_outputs"},
+ {k: v for k, v in dict2.items() if k != "_print_outputs"},
+ )
+
def test_evaluate_assign(self):
code = "x = 3"
state = {}
result, _ = evaluate_python_code(code, {}, state=state)
assert result == 3
- self.assertDictEqual(state, {"x": 3, "print_outputs": ""})
+ self.assertDictEqualNoPrint(state, {"x": 3, "_operations_count": 2})
code = "x = y"
state = {"y": 5}
result, _ = evaluate_python_code(code, {}, state=state)
# evaluate returns the value of the last assignment.
assert result == 5
- self.assertDictEqual(state, {"x": 5, "y": 5, "print_outputs": ""})
+ self.assertDictEqualNoPrint(state, {"x": 5, "y": 5, "_operations_count": 2})
code = "a=1;b=None"
result, _ = evaluate_python_code(code, {}, state={})
@@ -73,7 +79,7 @@ def test_evaluate_call(self):
state = {"x": 3}
result, _ = evaluate_python_code(code, {"add_two": add_two}, state=state)
assert result == 5
- self.assertDictEqual(state, {"x": 3, "y": 5, "print_outputs": ""})
+ self.assertDictEqualNoPrint(state, {"x": 3, "y": 5, "_operations_count": 3})
# Should not work without the tool
with pytest.raises(InterpreterError) as e:
@@ -85,14 +91,14 @@ def test_evaluate_constant(self):
state = {}
result, _ = evaluate_python_code(code, {}, state=state)
assert result == 3
- self.assertDictEqual(state, {"x": 3, "print_outputs": ""})
+ self.assertDictEqualNoPrint(state, {"x": 3, "_operations_count": 2})
def test_evaluate_dict(self):
code = "test_dict = {'x': x, 'y': add_two(x)}"
state = {"x": 3}
result, _ = evaluate_python_code(code, {"add_two": add_two}, state=state)
self.assertDictEqual(result, {"x": 3, "y": 5})
- self.assertDictEqual(state, {"x": 3, "test_dict": {"x": 3, "y": 5}, "print_outputs": ""})
+ self.assertDictEqualNoPrint(state, {"x": 3, "test_dict": {"x": 3, "y": 5}, "_operations_count": 7})
def test_evaluate_expression(self):
code = "x = 3\ny = 5"
@@ -100,7 +106,7 @@ def test_evaluate_expression(self):
result, _ = evaluate_python_code(code, {}, state=state)
# evaluate returns the value of the last assignment.
assert result == 5
- self.assertDictEqual(state, {"x": 3, "y": 5, "print_outputs": ""})
+ self.assertDictEqualNoPrint(state, {"x": 3, "y": 5, "_operations_count": 4})
def test_evaluate_f_string(self):
code = "text = f'This is x: {x}.'"
@@ -108,7 +114,7 @@ def test_evaluate_f_string(self):
result, _ = evaluate_python_code(code, {}, state=state)
# evaluate returns the value of the last assignment.
assert result == "This is x: 3."
- self.assertDictEqual(state, {"x": 3, "text": "This is x: 3.", "print_outputs": ""})
+ self.assertDictEqualNoPrint(state, {"x": 3, "text": "This is x: 3.", "_operations_count": 6})
def test_evaluate_if(self):
code = "if x <= 3:\n y = 2\nelse:\n y = 5"
@@ -116,40 +122,40 @@ def test_evaluate_if(self):
result, _ = evaluate_python_code(code, {}, state=state)
# evaluate returns the value of the last assignment.
assert result == 2
- self.assertDictEqual(state, {"x": 3, "y": 2, "print_outputs": ""})
+ self.assertDictEqualNoPrint(state, {"x": 3, "y": 2, "_operations_count": 6})
state = {"x": 8}
result, _ = evaluate_python_code(code, {}, state=state)
# evaluate returns the value of the last assignment.
assert result == 5
- self.assertDictEqual(state, {"x": 8, "y": 5, "print_outputs": ""})
+ self.assertDictEqualNoPrint(state, {"x": 8, "y": 5, "_operations_count": 6})
def test_evaluate_list(self):
code = "test_list = [x, add_two(x)]"
state = {"x": 3}
result, _ = evaluate_python_code(code, {"add_two": add_two}, state=state)
self.assertListEqual(result, [3, 5])
- self.assertDictEqual(state, {"x": 3, "test_list": [3, 5], "print_outputs": ""})
+ self.assertDictEqualNoPrint(state, {"x": 3, "test_list": [3, 5], "_operations_count": 5})
def test_evaluate_name(self):
code = "y = x"
state = {"x": 3}
result, _ = evaluate_python_code(code, {}, state=state)
assert result == 3
- self.assertDictEqual(state, {"x": 3, "y": 3, "print_outputs": ""})
+ self.assertDictEqualNoPrint(state, {"x": 3, "y": 3, "_operations_count": 2})
def test_evaluate_subscript(self):
code = "test_list = [x, add_two(x)]\ntest_list[1]"
state = {"x": 3}
result, _ = evaluate_python_code(code, {"add_two": add_two}, state=state)
assert result == 5
- self.assertDictEqual(state, {"x": 3, "test_list": [3, 5], "print_outputs": ""})
+ self.assertDictEqualNoPrint(state, {"x": 3, "test_list": [3, 5], "_operations_count": 9})
code = "test_dict = {'x': x, 'y': add_two(x)}\ntest_dict['y']"
state = {"x": 3}
result, _ = evaluate_python_code(code, {"add_two": add_two}, state=state)
assert result == 5
- self.assertDictEqual(state, {"x": 3, "test_dict": {"x": 3, "y": 5}, "print_outputs": ""})
+ self.assertDictEqualNoPrint(state, {"x": 3, "test_dict": {"x": 3, "y": 5}, "_operations_count": 11})
code = "vendor = {'revenue': 31000, 'rent': 50312}; vendor['ratio'] = round(vendor['revenue'] / vendor['rent'], 2)"
state = {}
@@ -173,14 +179,14 @@ def test_evaluate_for(self):
state = {}
result, _ = evaluate_python_code(code, {"range": range}, state=state)
assert result == 2
- self.assertDictEqual(state, {"x": 2, "i": 2, "print_outputs": ""})
+ self.assertDictEqualNoPrint(state, {"x": 2, "i": 2, "_operations_count": 11})
def test_evaluate_binop(self):
code = "y + x"
state = {"x": 3, "y": 6}
result, _ = evaluate_python_code(code, {}, state=state)
assert result == 9
- self.assertDictEqual(state, {"x": 3, "y": 6, "print_outputs": ""})
+ self.assertDictEqualNoPrint(state, {"x": 3, "y": 6, "_operations_count": 4})
def test_recursive_function(self):
code = """
@@ -377,7 +383,7 @@ def test_if_conditions(self):
print('2')"""
state = {}
evaluate_python_code(code, BASE_PYTHON_TOOLS, state=state)
- assert state["print_outputs"] == "2\n"
+ assert state["_print_outputs"].value == "2\n"
def test_imports(self):
code = "import math\nmath.sqrt(4)"
@@ -456,9 +462,9 @@ def test_print_output(self):
state = {}
result, _ = evaluate_python_code(code, BASE_PYTHON_TOOLS, state=state)
assert result is None
- assert state["print_outputs"] == "Hello world!\nOk no one cares\n"
+ assert state["_print_outputs"].value == "Hello world!\nOk no one cares\n"
- # test print in function
+ # Test print in function (state copy)
code = """
print("1")
def function():
@@ -466,7 +472,17 @@ def function():
function()"""
state = {}
evaluate_python_code(code, {"print": print}, state=state)
- assert state["print_outputs"] == "1\n2\n"
+ assert state["_print_outputs"].value == "1\n2\n"
+
+ # Test print in list comprehension (state copy)
+ code = """
+print("1")
+def function():
+ print("2")
+[function() for i in range(10)]"""
+ state = {}
+ evaluate_python_code(code, {"print": print, "range": range}, state=state)
+ assert state["_print_outputs"].value == "1\n2\n2\n2\n2\n2\n2\n2\n2\n2\n2\n"
def test_tuple_target_in_iterator(self):
code = "for a, b in [('Ralf Weikert', 'Austria'), ('Samuel Seungwon Lee', 'South Korea')]:res = a.split()[0]"
@@ -588,7 +604,7 @@ def test_print(self):
code = "print(min([1, 2, 3]))"
state = {}
evaluate_python_code(code, {"min": min, "print": print}, state=state)
- assert state["print_outputs"] == "1\n"
+ assert state["_print_outputs"].value == "1\n"
def test_types_as_objects(self):
code = "type_a = float(2); type_b = str; type_c = int"