From 766380e33b198d439e7525c380ffcf341271c107 Mon Sep 17 00:00:00 2001
From: Gabriel Fior
Date: Fri, 5 Apr 2024 20:03:36 -0300
Subject: [PATCH] Adding benchmark.py EVO-style

---
 .../crewai_subsequential_agent/benchmark.py | 162 ++++++------------
 .../crewai_agent_subquestions.py            |  12 +-
 .../crewai_subsequential_agent/deploy.py    |   2 +-
 .../crewai_subsequential_agent/prompts.py   |  11 +-
 4 files changed, 65 insertions(+), 122 deletions(-)

diff --git a/prediction_market_agent/agents/crewai_subsequential_agent/benchmark.py b/prediction_market_agent/agents/crewai_subsequential_agent/benchmark.py
index 30971597..cab1deb5 100644
--- a/prediction_market_agent/agents/crewai_subsequential_agent/benchmark.py
+++ b/prediction_market_agent/agents/crewai_subsequential_agent/benchmark.py
@@ -1,33 +1,21 @@
-import time
-from datetime import timedelta, datetime
+import typing as t
+from datetime import datetime
 
+import typer
 from dotenv import load_dotenv
-from prediction_market_agent_tooling.benchmark.agents import AbstractBenchmarkedAgent
+from prediction_market_agent_tooling.benchmark.agents import (
+    AbstractBenchmarkedAgent, RandomAgent, FixedAgent,
+)
 from prediction_market_agent_tooling.benchmark.benchmark import Benchmarker
 from prediction_market_agent_tooling.benchmark.utils import (
     OutcomePrediction,
     Prediction,
 )
-from prediction_market_agent_tooling.gtypes import Probability, DatetimeWithTimezone
-from prediction_market_agent_tooling.markets.markets import AgentMarket
-from prediction_market_agent_tooling.tools.utils import utcnow
-from pydantic import BaseModel
+from prediction_market_agent_tooling.gtypes import Probability
+from prediction_market_agent_tooling.markets.agent_market import SortBy, FilterBy, AgentMarket
+from prediction_market_agent_tooling.markets.markets import get_binary_markets, MarketType
 
 from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import CrewAIAgentSubquestions
-from prediction_market_agent.agents.known_outcome_agent.known_outcome_agent import (
-    Result,
-    get_known_outcome,
-)
-
-
-def build_market_from_question_without_validation(question: str) -> Market:
-    return Market(url=question,
-                  question=question, p_yes=0.5,
-                  source=MarketSource.MANIFOLD,
-                  volume=0,
-                  created_time=DatetimeWithTimezone(datetime(2024, 1, 1)),
-                  close_time=DatetimeWithTimezone(datetime(2024, 3, 15))
-                  )
 
 
 def build_binary_agent_market_from_question(question: str) -> AgentMarket:
@@ -41,12 +29,6 @@ def build_binary_agent_market_from_question(question: str) -> AgentMarket:
     )
 
 
-class QuestionAndAnswer(BaseModel):
-    question: str
-    result: Result
-    bet_correct: bool
-
-
 class CrewAIAgentSubquestionsBenchmark(AbstractBenchmarkedAgent):
     def __init__(
         self,
@@ -61,99 +43,61 @@ def __init__(
         super().__init__(agent_name=agent_name, max_workers=max_workers)
 
     def predict(self, market_question: str) -> Prediction:
-
-        market = build_binary_agent_market_from_question(market_question)
-        result = self.agent.answer_binary_market(market)
-
-        answer = get_known_outcome(
-            model=self.model,
-            question=market_question,
-            max_tries=self.max_tries,
-        )
-        print(f"Answered {market_question=} with {answer.result=}, {answer.reasoning=}")
-        if not answer.has_known_result():
-            return Prediction(
-                is_predictable=False,
-                outcome_prediction=None,
-            )
-        else:
-            return Prediction(
-                is_predictable=True,
-                outcome_prediction=OutcomePrediction(
-                    p_yes=answer.result.to_p_yes(),
-                    confidence=1.0,
-                    info_utility=None,
-                ),
-            )
-
-
-if __name__ == "__main__":
+        result = self.agent.answer_binary_market(market_question)
+        return Prediction(outcome_prediction=OutcomePrediction(
+            p_yes=result.p_yes,
+            confidence=result.confidence))
+
+
+def main(
+    n: int = 10,
+    output: str = "./benchmark_report.md",
+    reference: MarketType = MarketType.MANIFOLD,
+    filter: FilterBy = FilterBy.OPEN,
+    sort: SortBy = SortBy.NONE,
+    max_workers: int = 1,
+    cache_path: t.Optional[str] = "predictions_cache.json",
+    only_cached: bool = False,
+) -> None:
+    """
+    Polymarket usually contains higher quality questions,
+    but on Manifold, in addition to filtering by MarketFilter.resolved, you can sort by MarketSort.newest.
+    """
     load_dotenv()
-    tomorrow_str = (utcnow() + timedelta(days=1)).strftime("%d %B %Y")
+    markets = get_binary_markets(n, reference, filter_by=filter, sort_by=sort)
+    markets = markets[:1]
+    markets_deduplicated = list(({m.question: m for m in markets}.values()))
+    if len(markets) != len(markets_deduplicated):
+        print(
+            f"Warning: Deduplicated markets from {len(markets)} to {len(markets_deduplicated)}."
+        )
 
-    # Fetch example questions which our agents answered in the past.
-    questions = [
-        QuestionAndAnswer(
-            question="Will the stock price of Donald Trump's media company exceed $100 on 1 April 2024?",
-            result=Result.NO,
-            bet_correct=True
-        ),
-        QuestionAndAnswer(
-            question="Will Andy Murray return to professional tennis from his ankle injury on or before 31 March 2024?",
-            result=Result.NO,
-            bet_correct=True
-        ),
-        QuestionAndAnswer(
-            question="Will any legislation be signed by President Biden that could potentially lead to the ban of TikTok by 1 April 2024?",
-            result=Result.YES,
-            bet_correct=False
-        ),
-        QuestionAndAnswer(
-            question="Will the United States v. Apple case have a verdict by 1 April 2024?",
-            result=Result.NO,
-            bet_correct=True
-        ),
-        QuestionAndAnswer(
-            question="Will Microsoft Teams launch the announced Copilot AI features by 1 April 2024?",
-            result=Result.YES,
-            bet_correct=True
-        ),
-        QuestionAndAnswer(
-            question="Will the Francis Scott Key Bridge in Baltimore be fully rebuilt by 2 April 2024?",
-            result=Result.NO,
-            bet_correct=True
-        ),
-        QuestionAndAnswer(
-            question="Will iOS 18 break the iPhone's iconic app grid by 1 April 2024?",
-            result=Result.YES,
-            bet_correct=False
-        ),
-        QuestionAndAnswer(
-            question="Will a winner of the Mega Millions jackpot be announced by 26 March 2024?",
-            result=Result.YES,
-            bet_correct=False
-        ),
-    ]
+    print(f"Found {len(markets_deduplicated)} markets.")
 
     benchmarker = Benchmarker(
-        markets=[build_market_from_question_without_validation(q.question) for q in questions][:1],
+        markets=markets_deduplicated,
         agents=[
-            CrewAIAgentSubquestionsBenchmark(
-                agent_name="subsequential_questions",
-                model="gpt-3.5-turbo-0125",
-                max_tries=3,
-                max_workers=1,
+            CrewAIAgentSubquestionsBenchmark("subsequential-questions-crewai", max_workers=max_workers, max_tries=3,
+                                             model="gpt-3.5-turbo-0125"),
+            RandomAgent(agent_name="random", max_workers=max_workers),
+            FixedAgent(
+                fixed_answer=False, agent_name="fixed-no", max_workers=max_workers
+            ),
+            FixedAgent(
+                fixed_answer=True, agent_name="fixed-yes", max_workers=max_workers
             ),
         ],
+        cache_path=cache_path,
+        only_cached=only_cached,
     )
-    benchmarker.run_agents()
+
+    benchmarker.run_agents(enable_timing=False)  # Caching of search etc. can distort timings
 
     md = benchmarker.generate_markdown_report()
-    output = f"./subsequential_questions_agent_benchmark_report.{int(time.time())}.md"
     with open(output, "w") as f:
        print(f"Writing benchmark report to: {output}")
        f.write(md)
 
-    # Check all predictions are correct, i.e. mean-squared-error == 0
-    metrics = benchmarker.compute_metrics()
-    assert metrics["MSE for `p_yes`"][0] == 0.0
+
+if __name__ == "__main__":
+    typer.run(main)
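Note: the new entry point is a plain function handed to typer, so besides the CLI it can be driven programmatically. A minimal sketch, assuming the repo's dependencies are installed and the API keys that load_dotenv expects are present in .env (argument values here are only illustrative, and exact CLI option spellings depend on typer's conversion):

    # Sketch: running the new benchmark without the CLI wrapper.
    from prediction_market_agent.agents.crewai_subsequential_agent.benchmark import main
    from prediction_market_agent_tooling.markets.agent_market import FilterBy, SortBy
    from prediction_market_agent_tooling.markets.markets import MarketType

    main(
        n=5,                             # note: the current body still slices markets[:1]
        output="./benchmark_report.md",
        reference=MarketType.MANIFOLD,
        filter=FilterBy.OPEN,
        sort=SortBy.NONE,
        max_workers=1,
    )
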
diff --git a/prediction_market_agent/agents/crewai_subsequential_agent/crewai_agent_subquestions.py b/prediction_market_agent/agents/crewai_subsequential_agent/crewai_agent_subquestions.py
index 81dcb245..be9fa0dd 100644
--- a/prediction_market_agent/agents/crewai_subsequential_agent/crewai_agent_subquestions.py
+++ b/prediction_market_agent/agents/crewai_subsequential_agent/crewai_agent_subquestions.py
@@ -1,12 +1,8 @@
 import typing as t
 
 from crewai import Agent, Task, Process, Crew
-from crewai_tools import SerperDevTool
-from langchain_openai import OpenAI
-from prediction_market_agent_tooling.markets.agent_market import AgentMarket
 from pydantic import BaseModel
 
-from prediction_market_agent.agents.abstract import AbstractAgent
 from prediction_market_agent.agents.crewai_subsequential_agent.prompts import *
 from prediction_market_agent.tools.crewai_tools import TavilyDevTool
 
@@ -25,7 +21,7 @@ class ProbabilityOutput(BaseModel):
     confidence: float
 
 
-class CrewAIAgentSubquestions(AbstractAgent):
+class CrewAIAgentSubquestions:
     def __init__(self) -> None:
         # openai_model_name as str automatically interpreted by CrewAI, else create LLM object.
         self.researcher = Agent(
@@ -121,9 +117,9 @@ def generate_final_decision(self, outcomes_with_probabilities: list[t.Tuple[str,
                                          'outcome_to_assess': outcomes_with_probabilities[0][0]})
         return ProbabilityOutput.model_validate_json(task_final_decision.output.raw_output)
 
-    def answer_binary_market(self, market: AgentMarket) -> bool:
+    def answer_binary_market(self, question: str) -> ProbabilityOutput:
 
-        outcomes = self.split_research_into_outcomes(market.question)
+        outcomes = self.split_research_into_outcomes(question)
         print ("outcomes ", outcomes)
 
         outcomes_with_probs = []
@@ -155,4 +151,4 @@ def answer_binary_market(self, market: AgentMarket) -> bool:
             outcomes_with_probs.append((outcome, prediction_result))
 
         final_answer = self.generate_final_decision(outcomes_with_probs)
-        return True if final_answer.decision == "y" else False
+        return final_answer
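Note: answer_binary_market now takes a bare question string and returns the full ProbabilityOutput instead of a bool, so callers that still need a yes/no answer must derive it themselves. A minimal adapter sketch mirroring the removed `decision == "y"` check (the question string is only illustrative):

    # Sketch: recovering the old boolean behaviour from the new return type.
    from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import (
        CrewAIAgentSubquestions,
    )

    agent = CrewAIAgentSubquestions()
    result = agent.answer_binary_market("Will GNO trade above $400 by 31 December 2024?")
    binary_answer = result.decision == "y"  # what the removed bool return used to encode
    print(binary_answer, result.p_yes, result.confidence)
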
diff --git a/prediction_market_agent/agents/crewai_subsequential_agent/deploy.py b/prediction_market_agent/agents/crewai_subsequential_agent/deploy.py
index 356859c0..35c3ca7a 100644
--- a/prediction_market_agent/agents/crewai_subsequential_agent/deploy.py
+++ b/prediction_market_agent/agents/crewai_subsequential_agent/deploy.py
@@ -17,7 +17,7 @@
 
 class DeployableThinkThoroughlyAgent(DeployableAgent):
     # For cheaper credits at this experimental stage
-    model = "gpt-3.5-turbo"
+    model = "gpt-4-turbo-preview"
 
     def load(self) -> None:
         self.markets_with_known_outcomes: dict[str, Result] = {}
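Note: gpt-4-turbo-preview is the pricier option, so the "For cheaper credits" comment above the attribute no longer matches the new default. Since model is a plain class attribute, a cheaper run can override it without touching this file; a sketch with a hypothetical subclass name:

    # Sketch: reverting to the cheaper model for local experiments.
    from prediction_market_agent.agents.crewai_subsequential_agent.deploy import (
        DeployableThinkThoroughlyAgent,
    )

    class CheapThinkThoroughlyAgent(DeployableThinkThoroughlyAgent):
        model = "gpt-3.5-turbo"
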
diff --git a/prediction_market_agent/agents/crewai_subsequential_agent/prompts.py b/prediction_market_agent/agents/crewai_subsequential_agent/prompts.py
index 948495eb..9af3ac10 100644
--- a/prediction_market_agent/agents/crewai_subsequential_agent/prompts.py
+++ b/prediction_market_agent/agents/crewai_subsequential_agent/prompts.py
@@ -58,6 +58,7 @@
     {{"report": <report>}}
 
     where <report> is a free text field that contains a well thought out justification for the prediction based on the summary of your findings.
+    The report must not be longer than 7000 characters.
 """
 
 FINAL_DECISION_PROMPT = """
@@ -83,16 +84,18 @@
 OUTCOME_TO_ASSESS: {outcome_to_assess}
 OUTCOMES_WITH_PROBABILITIES: {outcomes_with_probabilities}
 """
+
 PROBABILITY_CLASS_OUTPUT = """
-    Your response should be a JSON string containing the following keys:
+    Your response should be a JSON string containing the following keys:
     - "decision": The decision you made. Either `y` (for `Yes`) or `n` (for `No`).
     - "p_yes": Probability that the sentence outcome will be `Yes`. Ranging from 0 (lowest probability) to 1 (maximum probability).
     - "p_no": Probability that the sentence outcome will be `No`. Ranging from 0 (lowest probability) to 1 (maximum probability).
     - "confidence": Indicating the confidence in the estimated probabilities you provided ranging from 0 (lowest confidence) to 1 (maximum confidence). Confidence can be calculated based on the quality and quantity of data used for the estimation.
+    A valid JSON string as output could look like the example below:
+    Example output: {"decision": "n", "p_yes": 0.1, "p_no": 0.9, "confidence": 0.4}
+    Do not use escaped quotes or line breaks. Do not output any reasoning, only the JSON object.
+    Ensure p_yes + p_no equals 1.
-
-    Format your response in JSON format, including the keys "decision", "p_yes", "p_no" and "confidence".
-    Only output the JSON-formatted string, nothing else.
 """
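Note: generate_final_decision parses the crew's raw output with ProbabilityOutput.model_validate_json, so the contract spelled out in PROBABILITY_CLASS_OUTPUT can be sanity-checked end to end. A small sketch (the assertions are extra checks layered on top of the prompt's wording, not part of this patch):

    # Sketch: validating a model response against the prompt contract.
    from pydantic import BaseModel

    class ProbabilityOutput(BaseModel):  # mirrors the class in crewai_agent_subquestions.py
        decision: str
        p_yes: float
        p_no: float
        confidence: float

    raw = '{"decision": "n", "p_yes": 0.1, "p_no": 0.9, "confidence": 0.4}'
    parsed = ProbabilityOutput.model_validate_json(raw)
    assert parsed.decision in ("y", "n")
    assert abs(parsed.p_yes + parsed.p_no - 1.0) < 1e-6  # the prompt requires p_yes + p_no == 1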