Commit 766380e: Adding benchmark.py EVO-style

gabrielfior committed Apr 5, 2024 · 1 parent 996f94a
Showing 4 changed files with 65 additions and 122 deletions.
162 changes: 53 additions & 109 deletions prediction_market_agent/agents/crewai_subsequential_agent/benchmark.py
@@ -1,33 +1,21 @@
import time
from datetime import timedelta, datetime
import typing as t
from datetime import datetime

import typer
from dotenv import load_dotenv
from prediction_market_agent_tooling.benchmark.agents import AbstractBenchmarkedAgent
from prediction_market_agent_tooling.benchmark.agents import (
AbstractBenchmarkedAgent, RandomAgent, FixedAgent,
)
from prediction_market_agent_tooling.benchmark.benchmark import Benchmarker
from prediction_market_agent_tooling.benchmark.utils import (
OutcomePrediction,
Prediction,
)
from prediction_market_agent_tooling.gtypes import Probability, DatetimeWithTimezone
from prediction_market_agent_tooling.markets.markets import AgentMarket
from prediction_market_agent_tooling.tools.utils import utcnow
from pydantic import BaseModel
from prediction_market_agent_tooling.gtypes import Probability
from prediction_market_agent_tooling.markets.agent_market import SortBy, FilterBy, AgentMarket
from prediction_market_agent_tooling.markets.markets import get_binary_markets, MarketType

from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import CrewAIAgentSubquestions
from prediction_market_agent.agents.known_outcome_agent.known_outcome_agent import (
Result,
get_known_outcome,
)


def build_market_from_question_without_validation(question: str) -> Market:
return Market(url=question,
question=question, p_yes=0.5,
source=MarketSource.MANIFOLD,
volume=0,
created_time=DatetimeWithTimezone(datetime(2024, 1, 1)),
close_time=DatetimeWithTimezone(datetime(2024, 3, 15))
)


def build_binary_agent_market_from_question(question: str) -> AgentMarket:
@@ -41,12 +29,6 @@ def build_binary_agent_market_from_question(question: str) -> AgentMarket:
)


class QuestionAndAnswer(BaseModel):
question: str
result: Result
bet_correct: bool


class CrewAIAgentSubquestionsBenchmark(AbstractBenchmarkedAgent):
def __init__(
self,
@@ -61,99 +43,61 @@ def __init__(
super().__init__(agent_name=agent_name, max_workers=max_workers)

def predict(self, market_question: str) -> Prediction:

market = build_binary_agent_market_from_question(market_question)
result = self.agent.answer_binary_market(market)

answer = get_known_outcome(
model=self.model,
question=market_question,
max_tries=self.max_tries,
)
print(f"Answered {market_question=} with {answer.result=}, {answer.reasoning=}")
if not answer.has_known_result():
return Prediction(
is_predictable=False,
outcome_prediction=None,
)
else:
return Prediction(
is_predictable=True,
outcome_prediction=OutcomePrediction(
p_yes=answer.result.to_p_yes(),
confidence=1.0,
info_utility=None,
),
)


if __name__ == "__main__":
result = self.agent.answer_binary_market(market_question)
return Prediction(outcome_prediction=OutcomePrediction(
p_yes=result.p_yes,
confidence=result.confidence))


def main(
n: int = 10,
output: str = "./benchmark_report.md",
reference: MarketType = MarketType.MANIFOLD,
filter: FilterBy = FilterBy.OPEN,
sort: SortBy = SortBy.NONE,
max_workers: int = 1,
cache_path: t.Optional[str] = "predictions_cache.json",
only_cached: bool = False,
) -> None:
"""
Polymarket usually contains higher-quality questions,
but on Manifold, in addition to filtering by MarketFilter.resolved, you can sort by MarketSort.newest.
"""
load_dotenv()
tomorrow_str = (utcnow() + timedelta(days=1)).strftime("%d %B %Y")
markets = get_binary_markets(n, reference, filter_by=filter, sort_by=sort)
markets = markets[:1]
markets_deduplicated = list(({m.question: m for m in markets}.values()))
if len(markets) != len(markets_deduplicated):
print(
f"Warning: Deduplicated markets from {len(markets)} to {len(markets_deduplicated)}."
)

# Fetch example questions which our agents answered in the past.
questions = [
QuestionAndAnswer(
question="Will the stock price of Donald Trump's media company exceed $100 on 1 April 2024?",
result=Result.NO,
bet_correct=True
),
QuestionAndAnswer(
question="Will Andy Murray return to professional tennis from his ankle injury on or before 31 March 2024?",
result=Result.NO,
bet_correct=True
),
QuestionAndAnswer(
question="Will any legislation be signed by President Biden that could potentially lead to the ban of TikTok by 1 April 2024?",
result=Result.YES,
bet_correct=False
),
QuestionAndAnswer(
question="Will the United States v. Apple case have a verdict by 1 April 2024?",
result=Result.NO,
bet_correct=True
),
QuestionAndAnswer(
question="Will Microsoft Teams launch the announced Copilot AI features by 1 April 2024?",
result=Result.YES,
bet_correct=True
),
QuestionAndAnswer(
question="Will the Francis Scott Key Bridge in Baltimore be fully rebuilt by 2 April 2024?",
result=Result.NO,
bet_correct=True
),
QuestionAndAnswer(
question="Will iOS 18 break the iPhone's iconic app grid by 1 April 2024?",
result=Result.YES,
bet_correct=False
),
QuestionAndAnswer(
question="Will a winner of the Mega Millions jackpot be announced by 26 March 2024?",
result=Result.YES,
bet_correct=False
),
]
print(f"Found {len(markets_deduplicated)} markets.")

benchmarker = Benchmarker(
markets=[build_market_from_question_without_validation(q.question) for q in questions][:1],
markets=markets_deduplicated,
agents=[
CrewAIAgentSubquestionsBenchmark(
agent_name="subsequential_questions",
model="gpt-3.5-turbo-0125",
max_tries=3,
max_workers=1,
CrewAIAgentSubquestionsBenchmark("subsequential-questions-crewai", max_workers=max_workers, max_tries=3,
model="gpt-3.5-turbo-0125"),
RandomAgent(agent_name="random", max_workers=max_workers),
FixedAgent(
fixed_answer=False, agent_name="fixed-no", max_workers=max_workers
),
FixedAgent(
fixed_answer=True, agent_name="fixed-yes", max_workers=max_workers
),
],
cache_path=cache_path,
only_cached=only_cached,
)
benchmarker.run_agents()

benchmarker.run_agents(enable_timing=False) # Caching of search etc. can distort timings
md = benchmarker.generate_markdown_report()

output = f"./subsequential_questions_agent_benchmark_report.{int(time.time())}.md"
with open(output, "w") as f:
print(f"Writing benchmark report to: {output}")
f.write(md)

# Check all predictions are correct, i.e. mean-squared-error == 0
metrics = benchmarker.compute_metrics()
assert metrics["MSE for `p_yes`"][0] == 0.0

if __name__ == "__main__":
typer.run(main)
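
For orientation, here is a minimal sketch of how the reworked Typer entrypoint might be driven. The flag names are what Typer derives from main()'s signature, and the module path follows the file header above; both are assumptions for illustration, not part of the commit.

# Hypothetical CLI invocation (Typer maps main()'s arguments to flags):
#   python prediction_market_agent/agents/crewai_subsequential_agent/benchmark.py \
#       --n 5 --max-workers 1 --cache-path predictions_cache.json

# Or drive it programmatically with the same defaults:
from prediction_market_agent.agents.crewai_subsequential_agent.benchmark import main
from prediction_market_agent_tooling.markets.agent_market import FilterBy, SortBy
from prediction_market_agent_tooling.markets.markets import MarketType

main(
    n=5,
    reference=MarketType.MANIFOLD,
    filter=FilterBy.OPEN,
    sort=SortBy.NONE,
    max_workers=1,
    cache_path="predictions_cache.json",
    only_cached=False,
)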
prediction_market_agent/agents/crewai_subsequential_agent/crewai_agent_subquestions.py
@@ -1,12 +1,8 @@
import typing as t

from crewai import Agent, Task, Process, Crew
from crewai_tools import SerperDevTool
from langchain_openai import OpenAI
from prediction_market_agent_tooling.markets.agent_market import AgentMarket
from pydantic import BaseModel

from prediction_market_agent.agents.abstract import AbstractAgent
from prediction_market_agent.agents.crewai_subsequential_agent.prompts import *
from prediction_market_agent.tools.crewai_tools import TavilyDevTool

@@ -25,7 +21,7 @@ class ProbabilityOutput(BaseModel):
confidence: float


class CrewAIAgentSubquestions(AbstractAgent):
class CrewAIAgentSubquestions:
def __init__(self) -> None:
# openai_model_name as str automatically interpreted by CrewAI, else create LLM object.
self.researcher = Agent(
@@ -121,9 +117,9 @@ def generate_final_decision(self, outcomes_with_probabilities: list[t.Tuple[str,
'outcome_to_assess': outcomes_with_probabilities[0][0]})
return ProbabilityOutput.model_validate_json(task_final_decision.output.raw_output)

def answer_binary_market(self, market: AgentMarket) -> bool:
def answer_binary_market(self, question: str) -> ProbabilityOutput:

outcomes = self.split_research_into_outcomes(market.question)
outcomes = self.split_research_into_outcomes(question)
print ("outcomes ", outcomes)

outcomes_with_probs = []
@@ -155,4 +151,4 @@ def answer_binary_market(self, market: AgentMarket) -> bool:
outcomes_with_probs.append((outcome, prediction_result))

final_answer = self.generate_final_decision(outcomes_with_probs)
return True if final_answer.decision == "y" else False
return final_answer
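
A quick sketch of the adjusted call pattern after this change. The question text is made up, and only the p_yes/confidence fields that the benchmark's predict() reads above are assumed on ProbabilityOutput.

from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import (
    CrewAIAgentSubquestions,
)

agent = CrewAIAgentSubquestions()
# The method now takes the bare question string instead of an AgentMarket...
result = agent.answer_binary_market("Will GNO trade above $400 by 1 June 2024?")
# ...and returns a structured ProbabilityOutput rather than a bool.
print(result.p_yes, result.confidence)
# Callers that still need the old boolean answer can threshold it:
outcome = result.p_yes > 0.5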
@@ -17,7 +17,7 @@

class DeployableThinkThoroughlyAgent(DeployableAgent):
# For cheaper credits at this experimental stage
model = "gpt-3.5-turbo"
model = "gpt-4-turbo-preview"

def load(self) -> None:
self.markets_with_known_outcomes: dict[str, Result] = {}
prediction_market_agent/agents/crewai_subsequential_agent/prompts.py
@@ -58,6 +58,7 @@
{{"report": <REPORT>}}
where <REPORT> is a free text field that contains a well thought out justification
for the prediction based on the summary of your findings.
The report must not be longer than 7000 characters.
"""

FINAL_DECISION_PROMPT = """
@@ -83,16 +84,18 @@
OUTCOME_TO_ASSESS: {outcome_to_assess}
OUTCOMES_WITH_PROBABILITIES: {outcomes_with_probabilities}
"""

PROBABILITY_CLASS_OUTPUT = """
Your response should be a JSON string containing the following keys:
- "decision": The decision you made. Either `y` (for `Yes`) or `n` (for `No`).
- "p_yes": Probability that the sentence outcome will be `Yes`. Ranging from 0 (lowest probability) to 1 (maximum probability).
- "p_no": Probability that the sentence outcome will be `No`. Ranging from 0 (lowest probability) to 1 (maximum probability).
- "confidence": Indicating the confidence in the estimated probabilities you provided ranging from 0 (lowest confidence) to
1 (maximum confidence). Confidence can be calculated based on the quality and quantity of data used for the estimation.
A valid JSON string as output could look like the example below:
Example output: {"decision": "y","p_yes": 0.1, "p_no": 0.9, "confidence": 0.4}
Do not use escape quotes and line breaks. Do not output any reasoning, only the JSON object.
Ensure p_yes + p_no equals 1.
Format your response in JSON format, including the keys "decision", "p_yes", "p_no" and "confidence".
Only output the JSON-formatted string, nothing else.
"""
