
Open Deep Research #317

Merged: 47 commits from gaia-submission-r1 into main, Feb 4, 2025

Commits (47)
394fe57
Start GAIA benchmark
aymeric-roucher Jan 22, 2025
2adb556
Fixes
aymeric-roucher Jan 22, 2025
17a793c
Merge branch 'main' into gaia-submission-r1
aymeric-roucher Jan 22, 2025
02a5bb5
Merge branch 'main' into gaia-submission-r1
aymeric-roucher Jan 22, 2025
4bd9e3c
Up
aymeric-roucher Jan 22, 2025
850b4c3
Update
aymeric-roucher Jan 23, 2025
7f3ce93
Update gaia for visual web browser
aymeric-roucher Jan 25, 2025
d8c481d
Merge branch 'main' into gaia-submission-r1
aymeric-roucher Jan 25, 2025
9699e2e
Update reformulator
aymeric-roucher Jan 27, 2025
6a32465
Add visual text browser comparator
aymeric-roucher Jan 28, 2025
8309740
Add browser use test
aymeric-roucher Jan 29, 2025
31e0a48
Merge branch 'main' into gaia-submission-r1
aymeric-roucher Jan 29, 2025
a803b27
Improvements
aymeric-roucher Jan 29, 2025
74ff788
Improve analysis
aymeric-roucher Jan 30, 2025
28bb072
Merge branch 'main' into gaia-submission-r1
aymeric-roucher Jan 30, 2025
584fd6c
Fix pyproject
aymeric-roucher Jan 30, 2025
376a074
Merge branch 'main' into gaia-submission-r1
aymeric-roucher Feb 1, 2025
f2c5bec
Fix reformulator
aymeric-roucher Feb 1, 2025
078c4d9
Multithreaded and revamped gaia, with nice scores 🚀
aymeric-roucher Feb 1, 2025
08efff7
Update analysis notebook
aymeric-roucher Feb 2, 2025
7cf4234
Rename folder to open_deep_research
aymeric-roucher Feb 3, 2025
2650bf0
Merge branch 'main' into gaia-submission-r1
aymeric-roucher Feb 3, 2025
2af67f1
Adapt to new multiagent structure
aymeric-roucher Feb 3, 2025
e70d817
Reach 53% score
aymeric-roucher Feb 4, 2025
dd11a3c
Fix format
aymeric-roucher Feb 4, 2025
9be8854
Remove navigational web search tool
aymeric-roucher Feb 4, 2025
3a0f8ce
Add smolagents vision web browser rather than script one
aymeric-roucher Feb 4, 2025
74594d7
Revert markdownify version
aymeric-roucher Feb 4, 2025
a15dfd1
Format notebook
aymeric-roucher Feb 4, 2025
e4968e9
Update examples/open_deep_research/run.py
aymeric-roucher Feb 4, 2025
993778a
Update examples/open_deep_research/scripts/run_agents.py
aymeric-roucher Feb 4, 2025
4a87ecf
Update examples/open_deep_research/scripts/visual_qa.py
aymeric-roucher Feb 4, 2025
d9f2e5e
Update examples/open_deep_research/scripts/text_web_browser.py
aymeric-roucher Feb 4, 2025
bd8d439
Update examples/open_deep_research/scripts/visual_qa.py
aymeric-roucher Feb 4, 2025
2068edf
Fix leaky print outputs
aymeric-roucher Feb 4, 2025
f7dfcd4
Fix tests print outputs
aymeric-roucher Feb 4, 2025
bc2c63d
Logic fixes
aymeric-roucher Feb 4, 2025
5d02247
More fixes
aymeric-roucher Feb 4, 2025
38153a8
Update examples/open_deep_research/requirements.txt
aymeric-roucher Feb 4, 2025
bfefdb1
Apply suggestions from code review
aymeric-roucher Feb 4, 2025
36f0f39
Update examples/open_deep_research/run.py
aymeric-roucher Feb 4, 2025
9c2af92
Update examples/open_deep_research/run.py
aymeric-roucher Feb 4, 2025
87cd2fb
Update examples/open_deep_research/run.py
aymeric-roucher Feb 4, 2025
fa41ea2
Update examples/open_deep_research/run.py
aymeric-roucher Feb 4, 2025
f9ec253
Pass interpreter tests
aymeric-roucher Feb 4, 2025
a9d1769
Reverse benchmark
aymeric-roucher Feb 4, 2025
c41a50a
Revert issues on web browsers
aymeric-roucher Feb 4, 2025
10,552 changes: 10,552 additions & 0 deletions examples/open_deep_research/analysis.ipynb

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions examples/open_deep_research/requirements.txt
@@ -0,0 +1,39 @@
anthropic>=0.37.1
beautifulsoup4>=4.12.3
datasets>=2.21.0
google_search_results>=2.4.2
huggingface_hub>=0.23.4
mammoth>=1.8.0
markdownify>=0.13.1
numexpr>=2.10.1
numpy>=2.1.2
openai>=1.52.2
openpyxl
pandas>=2.2.3
pathvalidate>=3.2.1
pdfminer>=20191125
pdfminer.six>=20240706
Pillow>=11.0.0
puremagic>=1.28
pypdf>=5.1.0
python-dotenv>=1.0.1
python_pptx>=1.0.2
Requests>=2.32.3
serpapi>=0.1.5
tqdm>=4.66.4
torch>=2.2.2
torchvision>=0.17.2
transformers>=4.46.0
youtube_transcript_api>=0.6.2
chess
sympy
pubchempy
Bio
scikit-learn
scipy
pydub
PyPDF2
python-pptx
torch
xlrd
SpeechRecognition
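
Assuming a standard Python environment, these dependencies would typically be installed with "pip install -r examples/open_deep_research/requirements.txt" before running the script below.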
300 changes: 300 additions & 0 deletions examples/open_deep_research/run.py
@@ -0,0 +1,300 @@
import argparse
import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import List

import datasets
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import login
from scripts.reformulator import prepare_response
from scripts.run_agents import (
    get_single_file_description,
    get_zip_description,
)
from scripts.text_inspector_tool import TextInspectorTool
from scripts.text_web_browser import (
    ArchiveSearchTool,
    FinderTool,
    FindNextTool,
    PageDownTool,
    PageUpTool,
    SearchInformationTool,
    SimpleTextBrowser,
    VisitTool,
)
from scripts.visual_qa import visualizer
from tqdm import tqdm

from smolagents import (
    MANAGED_AGENT_PROMPT,
    CodeAgent,
    # HfApiModel,
    LiteLLMModel,
    Model,
    ToolCallingAgent,
)


AUTHORIZED_IMPORTS = [
    "requests",
    "zipfile",
    "os",
    "pandas",
    "numpy",
    "sympy",
    "json",
    "bs4",
    "pubchempy",
    "xml",
    "yahoo_finance",
    "Bio",
    "sklearn",
    "scipy",
    "pydub",
    "io",
    "PIL",
    "chess",
    "PyPDF2",
    "pptx",
    "torch",
    "datetime",
    "fractions",
    "csv",
]
load_dotenv(override=True)
login(os.getenv("HF_TOKEN"))

append_answer_lock = threading.Lock()


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--concurrency", type=int, default=8)
    parser.add_argument("--model-id", type=str, default="o1")
    parser.add_argument("--api-base", type=str, default=None)
    parser.add_argument("--run-name", type=str, required=True)
    return parser.parse_args()
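
# Example invocation (values are illustrative; --run-name is required,
# the other flags show their defaults):
#   python run.py --run-name gaia_o1 --model-id o1 --concurrency 8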


### IMPORTANT: EVALUATION SWITCHES

print("Make sure you deactivated Tailscale VPN, else some URLs will be blocked!")

USE_OPEN_MODELS = False

SET = "validation"

custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"}

### LOAD EVALUATION DATASET

eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")[SET]
Member:
I see you reverted my fix to FileNotFoundError...

albertvillanova (Member), Feb 4, 2025:
Note that the datasets library by default loads the viewer's auto-generated Parquet version of the dataset (https://huggingface.co/datasets/gaia-benchmark/GAIA/tree/refs%2Fconvert%2Fparquet), and in this version:

  • the file_path field of the dataset makes no sense (it points to a path on the viewer server)
  • files are not downloaded: the user needs to download them by hand

Member:
Before, you manually downloaded the data files to:

  • data/gaia/validation/

With the fix, the data files are now in:

  • data/gaia/2023/validation/

Collaborator (author):
Thank you. Let's note for a later PR to integrate the improved file management that you propose.

eval_ds = eval_ds.rename_columns({"Question": "question", "Final answer": "true_answer", "Level": "task"})


def preprocess_file_paths(row):
    if len(row["file_name"]) > 0:
        row["file_name"] = f"data/gaia/{SET}/" + row["file_name"]
    return row


eval_ds = eval_ds.map(preprocess_file_paths)
Comment on lines +99 to +106

Member — suggested change: delete the preprocess_file_paths function and the eval_ds.map(preprocess_file_paths) call above.
aymeric-roucher (Collaborator, author), Feb 4, 2025:
@albertvillanova actually I'd prefer to keep local files, because I've made manual screenshots of many documents (hence there is a check: if the image path exists in a .png version, it is preferred over the base extension). I could have done this automatically but wanted to save time. I think this manually edited folder can be shared later on to make the code fully reproducible!
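
The check described here might look roughly like the following (a minimal sketch; resolve_attachment is a hypothetical helper, not part of this diff):

from pathlib import Path

def resolve_attachment(file_name: str) -> str:
    # Hypothetical helper: prefer a manually captured .png screenshot
    # of the document, if one exists alongside the original file.
    png_candidate = Path(file_name).with_suffix(".png")
    return str(png_candidate) if png_candidate.exists() else file_name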

eval_df = pd.DataFrame(eval_ds)
print("Loaded evaluation dataset:")
print(eval_df["task"].value_counts())

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"

BROWSER_CONFIG = {
    "viewport_size": 1024 * 5,
    "downloads_folder": "downloads_folder",
    "request_kwargs": {
        "headers": {"User-Agent": user_agent},
        "timeout": 300,
    },
    "serpapi_key": os.getenv("SERPAPI_API_KEY"),
}

os.makedirs(f"./{BROWSER_CONFIG['downloads_folder']}", exist_ok=True)


def create_agent_hierarchy(model: Model):
    text_limit = 100000
    ti_tool = TextInspectorTool(model, text_limit)

    browser = SimpleTextBrowser(**BROWSER_CONFIG)

    WEB_TOOLS = [
        SearchInformationTool(browser),
        VisitTool(browser),
        PageUpTool(browser),
        PageDownTool(browser),
        FinderTool(browser),
        FindNextTool(browser),
        ArchiveSearchTool(browser),
        TextInspectorTool(model, text_limit),
    ]
    text_webbrowser_agent = ToolCallingAgent(
        model=model,
        tools=WEB_TOOLS,
        max_steps=20,
        verbosity_level=2,
        planning_interval=4,
        name="search_agent",
        description="""A team member that will search the internet to answer your question.
Ask him for all your questions that require browsing the web.
Provide him as much context as possible, in particular if you need to search on a specific timeframe!
And don't hesitate to provide him with a complex search task, like finding a difference between two webpages.
Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords.
""",
        provide_run_summary=True,
        managed_agent_prompt=MANAGED_AGENT_PROMPT
        + """You can navigate to .txt online files.
If a non-html page is in another format, especially .pdf or a Youtube video, use tool 'inspect_file_as_text' to inspect it.
Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information.""",
    )

    manager_agent = CodeAgent(
        model=model,
        tools=[visualizer, ti_tool],
        max_steps=12,
        verbosity_level=2,
        additional_authorized_imports=AUTHORIZED_IMPORTS,
        planning_interval=4,
        managed_agents=[text_webbrowser_agent],
    )
    return manager_agent


def append_answer(entry: dict, jsonl_file: str) -> None:
    jsonl_file = Path(jsonl_file)
    jsonl_file.parent.mkdir(parents=True, exist_ok=True)
    with append_answer_lock, open(jsonl_file, "a", encoding="utf-8") as fp:
        fp.write(json.dumps(entry) + "\n")
    assert os.path.exists(jsonl_file), "File not found!"
    print("Answer exported to file:", jsonl_file.resolve())


def answer_single_question(example, model_id, answers_file, visual_inspection_tool):
    model = LiteLLMModel(
        model_id,
        custom_role_conversions=custom_role_conversions,
        max_completion_tokens=8192,
        reasoning_effort="high",
    )
    # model = HfApiModel("Qwen/Qwen2.5-72B-Instruct", provider="together")
    # "https://lnxyuvj02bpe6mam.us-east-1.aws.endpoints.huggingface.cloud",
    # custom_role_conversions=custom_role_conversions,
    # # provider="sambanova",
    # max_tokens=8096,
    # )
    document_inspection_tool = TextInspectorTool(model, 100000)

    agent = create_agent_hierarchy(model)

    augmented_question = """You have one question to answer. It is paramount that you provide a correct answer.
Give it all you can: I know for a fact that you have access to all the relevant tools to solve it and find the correct answer (the answer does exist). Failure or 'I cannot answer' or 'None found' will not be tolerated, success will be rewarded.
Run verification steps if that's needed, you must make sure you find the correct answer!
Here is the task:
""" + example["question"]

    if example["file_name"]:
        if ".zip" in example["file_name"]:
            prompt_use_files = "\n\nTo solve the task above, you will have to use these attached files:\n"
            prompt_use_files += get_zip_description(
                example["file_name"], example["question"], visual_inspection_tool, document_inspection_tool
            )
        else:
            prompt_use_files = "\n\nTo solve the task above, you will have to use this attached file:"
            prompt_use_files += get_single_file_description(
                example["file_name"], example["question"], visual_inspection_tool, document_inspection_tool
            )
        augmented_question += prompt_use_files

    start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    try:
        # Run agent 🚀
        final_result = agent.run(augmented_question)
Member:
This final_result is never used?

Collaborator (author):
No indeed! Since it's already in the last step of the agent's memory, which is used in prepare_response, I didn't include it. But maybe it would improve results to mention the response explicitly, good point! I'll try adding it.

        agent_memory = agent.write_memory_to_messages(summary_mode=True)

        final_result = prepare_response(augmented_question, agent_memory, reformulation_model=model)

        output = str(final_result)
        for memory_step in agent.memory.steps:
            memory_step.model_input_messages = None
        intermediate_steps = [str(step) for step in agent.memory.steps]

        # Check for parsing errors which indicate the LLM failed to follow the required format
        parsing_error = True if any(["AgentParsingError" in step for step in intermediate_steps]) else False

        # check if iteration limit exceeded
        iteration_limit_exceeded = True if "Agent stopped due to iteration limit or time limit." in output else False
        raised_exception = False

    except Exception as e:
        print("Error on ", augmented_question, e)
        output = None
        intermediate_steps = []
        parsing_error = False
        iteration_limit_exceeded = False
        exception = e
        raised_exception = True
    end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    annotated_example = {
        "agent_name": model.model_id,
        "question": example["question"],
        "augmented_question": augmented_question,
        "prediction": output,
        "intermediate_steps": intermediate_steps,
        "parsing_error": parsing_error,
        "iteration_limit_exceeded": iteration_limit_exceeded,
        "agent_error": str(exception) if raised_exception else None,
        "start_time": start_time,
        "end_time": end_time,
        "task": example["task"],
        "task_id": example["task_id"],
        "true_answer": example["true_answer"],
    }
    append_answer(annotated_example, answers_file)


def get_examples_to_answer(answers_file, eval_ds) -> List[dict]:
    print(f"Loading answers from {answers_file}...")
    try:
        done_questions = pd.read_json(answers_file, lines=True)["question"].tolist()
        print(f"Found {len(done_questions)} previous results!")
    except Exception as e:
        print("Error when loading records: ", e)
        print("No usable records! ▶️ Starting new.")
        done_questions = []
    return [line for line in eval_ds.to_list() if line["question"] not in done_questions]


def main():
    args = parse_args()
    print(f"Starting run with arguments: {args}")

    answers_file = f"output/{SET}/{args.run_name}.jsonl"
    tasks_to_run = get_examples_to_answer(answers_file, eval_ds)

    with ThreadPoolExecutor(max_workers=args.concurrency) as exe:
        futures = [
            exe.submit(answer_single_question, example, args.model_id, answers_file, visualizer)
            for example in tasks_to_run
        ]
        for f in tqdm(as_completed(futures), total=len(tasks_to_run), desc="Processing tasks"):
            f.result()

    # for example in tasks_to_run:
    #     answer_single_question(example, args.model_id, answers_file, visualizer)
    print("All tasks processed.")


if __name__ == "__main__":
    main()