Commit 766380e: Adding benchmark.py EVO-style

gabrielfior committed Apr 5, 2024 · 1 parent 996f94a
Showing 4 changed files with 65 additions and 122 deletions.
162 changes: 53 additions & 109 deletions prediction_market_agent/agents/crewai_subsequential_agent/benchmark.py
@@ -1,33 +1,21 @@
import time
from datetime import timedelta, datetime
import typing as t
from datetime import datetime

import typer
from dotenv import load_dotenv
from prediction_market_agent_tooling.benchmark.agents import AbstractBenchmarkedAgent
from prediction_market_agent_tooling.benchmark.agents import (
AbstractBenchmarkedAgent, RandomAgent, FixedAgent,
)
from prediction_market_agent_tooling.benchmark.benchmark import Benchmarker
from prediction_market_agent_tooling.benchmark.utils import (
OutcomePrediction,
Prediction,
)
from prediction_market_agent_tooling.gtypes import Probability, DatetimeWithTimezone
from prediction_market_agent_tooling.markets.markets import AgentMarket
from prediction_market_agent_tooling.tools.utils import utcnow
from pydantic import BaseModel
from prediction_market_agent_tooling.gtypes import Probability
from prediction_market_agent_tooling.markets.agent_market import SortBy, FilterBy, AgentMarket
from prediction_market_agent_tooling.markets.markets import get_binary_markets, MarketType

from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import CrewAIAgentSubquestions
from prediction_market_agent.agents.known_outcome_agent.known_outcome_agent import (
Result,
get_known_outcome,
)


def build_market_from_question_without_validation(question: str) -> Market:
return Market(url=question,
question=question, p_yes=0.5,
source=MarketSource.MANIFOLD,
volume=0,
created_time=DatetimeWithTimezone(datetime(2024, 1, 1)),
close_time=DatetimeWithTimezone(datetime(2024, 3, 15))
)


def build_binary_agent_market_from_question(question: str) -> AgentMarket:
@@ -41,12 +29,6 @@ def build_binary_agent_market_from_question(question: str) -> AgentMarket:
)


class QuestionAndAnswer(BaseModel):
question: str
result: Result
bet_correct: bool


class CrewAIAgentSubquestionsBenchmark(AbstractBenchmarkedAgent):
def __init__(
self,
@@ -61,99 +43,61 @@ def __init__(
super().__init__(agent_name=agent_name, max_workers=max_workers)

def predict(self, market_question: str) -> Prediction:

market = build_binary_agent_market_from_question(market_question)
result = self.agent.answer_binary_market(market)

answer = get_known_outcome(
model=self.model,
question=market_question,
max_tries=self.max_tries,
)
print(f"Answered {market_question=} with {answer.result=}, {answer.reasoning=}")
if not answer.has_known_result():
return Prediction(
is_predictable=False,
outcome_prediction=None,
)
else:
return Prediction(
is_predictable=True,
outcome_prediction=OutcomePrediction(
p_yes=answer.result.to_p_yes(),
confidence=1.0,
info_utility=None,
),
)


if __name__ == "__main__":
result = self.agent.answer_binary_market(market_question)
return Prediction(outcome_prediction=OutcomePrediction(
p_yes=result.p_yes,
confidence=result.confidence))


def main(
n: int = 10,
output: str = "./benchmark_report.md",
reference: MarketType = MarketType.MANIFOLD,
filter: FilterBy = FilterBy.OPEN,
sort: SortBy = SortBy.NONE,
max_workers: int = 1,
cache_path: t.Optional[str] = "predictions_cache.json",
only_cached: bool = False,
) -> None:
"""
Polymarket usually contains higher-quality questions,
but on Manifold, in addition to filtering by MarketFilter.resolved, you can sort by MarketSort.newest.
"""
load_dotenv()
tomorrow_str = (utcnow() + timedelta(days=1)).strftime("%d %B %Y")
markets = get_binary_markets(n, reference, filter_by=filter, sort_by=sort)
markets = markets[:1]
markets_deduplicated = list(({m.question: m for m in markets}.values()))
if len(markets) != len(markets_deduplicated):
print(
f"Warning: Deduplicated markets from {len(markets)} to {len(markets_deduplicated)}."
)

# Fetch example questions which our agents answered in the past.
questions = [
QuestionAndAnswer(
question="Will the stock price of Donald Trump's media company exceed $100 on 1 April 2024?",
result=Result.NO,
bet_correct=True
),
QuestionAndAnswer(
question="Will Andy Murray return to professional tennis from his ankle injury on or before 31 March 2024?",
result=Result.NO,
bet_correct=True
),
QuestionAndAnswer(
question="Will any legislation be signed by President Biden that could potentially lead to the ban of TikTok by 1 April 2024?",
result=Result.YES,
bet_correct=False
),
QuestionAndAnswer(
question="Will the United States v. Apple case have a verdict by 1 April 2024?",
result=Result.NO,
bet_correct=True
),
QuestionAndAnswer(
question="Will Microsoft Teams launch the announced Copilot AI features by 1 April 2024?",
result=Result.YES,
bet_correct=True
),
QuestionAndAnswer(
question="Will the Francis Scott Key Bridge in Baltimore be fully rebuilt by 2 April 2024?",
result=Result.NO,
bet_correct=True
),
QuestionAndAnswer(
question="Will iOS 18 break the iPhone's iconic app grid by 1 April 2024?",
result=Result.YES,
bet_correct=False
),
QuestionAndAnswer(
question="Will a winner of the Mega Millions jackpot be announced by 26 March 2024?",
result=Result.YES,
bet_correct=False
),
]
print(f"Found {len(markets_deduplicated)} markets.")

benchmarker = Benchmarker(
markets=[build_market_from_question_without_validation(q.question) for q in questions][:1],
markets=markets_deduplicated,
agents=[
CrewAIAgentSubquestionsBenchmark(
agent_name="subsequential_questions",
model="gpt-3.5-turbo-0125",
max_tries=3,
max_workers=1,
CrewAIAgentSubquestionsBenchmark("subsequential-questions-crewai", max_workers=max_workers, max_tries=3,
model="gpt-3.5-turbo-0125"),
RandomAgent(agent_name="random", max_workers=max_workers),
FixedAgent(
fixed_answer=False, agent_name="fixed-no", max_workers=max_workers
),
FixedAgent(
fixed_answer=True, agent_name="fixed-yes", max_workers=max_workers
),
],
cache_path=cache_path,
only_cached=only_cached,
)
benchmarker.run_agents()

benchmarker.run_agents(enable_timing=False) # Caching of search etc. can distort timings
md = benchmarker.generate_markdown_report()

output = f"./subsequential_questions_agent_benchmark_report.{int(time.time())}.md"
with open(output, "w") as f:
print(f"Writing benchmark report to: {output}")
f.write(md)

# Check all predictions are correct, i.e. mean-squared-error == 0
metrics = benchmarker.compute_metrics()
assert metrics["MSE for `p_yes`"][0] == 0.0

if __name__ == "__main__":
typer.run(main)
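
For orientation, here is a minimal sketch of how the reworked Typer entrypoint might be driven. The flag names are what Typer derives from main()'s signature, and the module path follows the file header above; both are assumptions for illustration, not part of the commit.

# Hypothetical CLI invocation (Typer maps main()'s arguments to flags):
#   python prediction_market_agent/agents/crewai_subsequential_agent/benchmark.py \
#       --n 5 --max-workers 1 --cache-path predictions_cache.json

# Or drive it programmatically with the same defaults:
from prediction_market_agent.agents.crewai_subsequential_agent.benchmark import main
from prediction_market_agent_tooling.markets.agent_market import FilterBy, SortBy
from prediction_market_agent_tooling.markets.markets import MarketType

main(
    n=5,
    reference=MarketType.MANIFOLD,
    filter=FilterBy.OPEN,
    sort=SortBy.NONE,
    max_workers=1,
    cache_path="predictions_cache.json",
    only_cached=False,
)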
prediction_market_agent/agents/crewai_subsequential_agent/crewai_agent_subquestions.py
@@ -1,12 +1,8 @@
import typing as t

from crewai import Agent, Task, Process, Crew
from crewai_tools import SerperDevTool
from langchain_openai import OpenAI
from prediction_market_agent_tooling.markets.agent_market import AgentMarket
from pydantic import BaseModel

from prediction_market_agent.agents.abstract import AbstractAgent
from prediction_market_agent.agents.crewai_subsequential_agent.prompts import *
from prediction_market_agent.tools.crewai_tools import TavilyDevTool

@@ -25,7 +21,7 @@ class ProbabilityOutput(BaseModel):
confidence: float


class CrewAIAgentSubquestions(AbstractAgent):
class CrewAIAgentSubquestions:
def __init__(self) -> None:
# openai_model_name as str automatically interpreted by CrewAI, else create LLM object.
self.researcher = Agent(
@@ -121,9 +117,9 @@ def generate_final_decision(self, outcomes_with_probabilities: list[t.Tuple[str,
'outcome_to_assess': outcomes_with_probabilities[0][0]})
return ProbabilityOutput.model_validate_json(task_final_decision.output.raw_output)

def answer_binary_market(self, market: AgentMarket) -> bool:
def answer_binary_market(self, question: str) -> ProbabilityOutput:

outcomes = self.split_research_into_outcomes(market.question)
outcomes = self.split_research_into_outcomes(question)
print ("outcomes ", outcomes)

outcomes_with_probs = []
@@ -155,4 +151,4 @@ def answer_binary_market(self, market: AgentMarket) -> bool:
outcomes_with_probs.append((outcome, prediction_result))

final_answer = self.generate_final_decision(outcomes_with_probs)
return True if final_answer.decision == "y" else False
return final_answer
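
A quick sketch of the adjusted call pattern after this change. The question text is made up, and only the p_yes/confidence fields that the benchmark's predict() reads above are assumed on ProbabilityOutput.

from prediction_market_agent.agents.crewai_subsequential_agent.crewai_agent_subquestions import (
    CrewAIAgentSubquestions,
)

agent = CrewAIAgentSubquestions()
# The method now takes the bare question string instead of an AgentMarket...
result = agent.answer_binary_market("Will GNO trade above $400 by 1 June 2024?")
# ...and returns a structured ProbabilityOutput rather than a bool.
print(result.p_yes, result.confidence)
# Callers that still need the old boolean answer can threshold it:
outcome = result.p_yes > 0.5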
@@ -17,7 +17,7 @@

class DeployableThinkThoroughlyAgent(DeployableAgent):
# For cheaper credits at this experimental stage
model = "gpt-3.5-turbo"
model = "gpt-4-turbo-preview"

def load(self) -> None:
self.markets_with_known_outcomes: dict[str, Result] = {}
prediction_market_agent/agents/crewai_subsequential_agent/prompts.py
@@ -58,6 +58,7 @@
{{"report": <REPORT>}}
where <REPORT> is a free text field that contains a well thought out justification
for the prediction based on the summary of your findings.
The report must not be longer than 7000 characters.
"""

FINAL_DECISION_PROMPT = """
@@ -83,16 +84,18 @@
OUTCOME_TO_ASSESS: {outcome_to_assess}
OUTCOMES_WITH_PROBABILITIES: {outcomes_with_probabilities}
"""

PROBABILITY_CLASS_OUTPUT = """
Your response should be a JSON string containing the following keys:
- "decision": The decision you made. Either `y` (for `Yes`) or `n` (for `No`).
- "p_yes": Probability that the sentence outcome will be `Yes`. Ranging from 0 (lowest probability) to 1 (maximum probability).
- "p_no": Probability that the sentence outcome will be `No`. Ranging from 0 (lowest probability) to 1 (maximum probability).
- "confidence": Indicating the confidence in the estimated probabilities you provided ranging from 0 (lowest confidence) to
1 (maximum confidence). Confidence can be calculated based on the quality and quantity of data used for the estimation.
A valid JSON string as output could look like the example below:
Example output: {"decision": "y","p_yes": 0.1, "p_no": 0.9, "confidence": 0.4}
Do not use escape quotes and line breaks. Do not output any reasoning, only the JSON object.
Ensure p_yes + p_no equals 1.
Format your response in JSON format, including the keys "decision", "p_yes", "p_no" and "confidence".
Only output the JSON-formatted string, nothing else.
"""
