From cf85f9d5162f2e35ac90f25d5c89e6521c022398 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albert=20=C3=96rwall?= Date: Tue, 6 Aug 2024 09:05:49 +0200 Subject: [PATCH] Adjust report_v2 to new trajectory format --- moatless/benchmark/claude_evaluation.py | 6 +- moatless/benchmark/evaluation.py | 57 ++-- moatless/benchmark/report_v2.py | 324 ++++++++++--------- moatless/trajectory.py | 6 + moatless/types.py | 16 +- tests/benchmark/test_evaluation.py | 75 +++++ tests/benchmark/test_report_v2.py | 39 +++ tests/test_state.py | 2 +- tests/trajectories/django__django_16379.json | 2 +- 9 files changed, 345 insertions(+), 182 deletions(-) create mode 100644 tests/benchmark/test_evaluation.py create mode 100644 tests/benchmark/test_report_v2.py diff --git a/moatless/benchmark/claude_evaluation.py b/moatless/benchmark/claude_evaluation.py index 45c99b1c..6b44915f 100644 --- a/moatless/benchmark/claude_evaluation.py +++ b/moatless/benchmark/claude_evaluation.py @@ -4,7 +4,7 @@ import instructor -from moatless import Transitions +from moatless.transition_rules import TransitionRules from moatless.benchmark.evaluation import create_evaluation_name, Evaluation from moatless.edit.edit import EditCode from moatless.edit.plan import PlanToCode @@ -170,7 +170,7 @@ def run_evaluation(): def evaluate_search(): - transitions = Transitions( + transitions = TransitionRules( global_params=global_params, state_params={ SearchCode: {"max_search_results": 50, "provide_initial_context": True}, @@ -280,7 +280,7 @@ def evaluate_coding(): def evaluate_plan(previous_trajectory_dir: Optional[str] = None): - transitions = Transitions( + transitions = TransitionRules( global_params=global_params, state_params={ SearchCode: { diff --git a/moatless/benchmark/evaluation.py b/moatless/benchmark/evaluation.py index 78a73a18..c13ba71d 100644 --- a/moatless/benchmark/evaluation.py +++ b/moatless/benchmark/evaluation.py @@ -7,7 +7,7 @@ import traceback from collections import defaultdict from datetime import datetime, timezone -from typing import Optional +from typing import Optional, Tuple import instructor import litellm @@ -15,6 +15,7 @@ from tqdm.auto import tqdm from moatless.benchmark.report_v2 import to_result, generate_md_report +from moatless.trajectory import Trajectory from moatless.transition_rules import TransitionRules from moatless.benchmark.swebench import ( found_in_alternative_spans, @@ -82,6 +83,7 @@ def __init__( max_transitions: int = 25, max_expansions: int = 2, max_file_context_tokens: int = 16000, + markdown_report: bool = False, litellm_callback: Optional[str] = None, previous_trajectory_dir: Optional[str] = None, retry_state: Optional[str] = None, @@ -93,6 +95,7 @@ def __init__( self.evaluations_dir = evaluations_dir self.num_workers = num_workers self.detailed_report = detailed_report + self.markdown_report = markdown_report self.evaluation_name = evaluation_name self.max_file_context_tokens = max_file_context_tokens @@ -193,11 +196,11 @@ def run_single_instance( instance_id: str, dataset: str = "princeton-nlp/SWE-bench_Lite", split="test", - ): + ) -> dict: instance = load_instance(instance_id, dataset, split) return self._evaluate_instance(instance) - def _evaluate_instance(self, instance: dict, retry: bool = False) -> dict: + def _evaluate_instance(self, instance: dict, retry: bool = False) -> Trajectory: instance_id = instance["instance_id"] trajectory_path = os.path.join(self.trajectory_dir, f"{instance_id}.json") prompt_log_dir = os.path.join(self.logs_dir, f"{instance_id}") @@ -205,10 +208,8 @@ def 
_evaluate_instance(self, instance: dict, retry: bool = False) -> dict: os.makedirs(prompt_log_dir) if os.path.exists(trajectory_path) and not retry: - with open(trajectory_path) as file: - trajectory = json.load(file) - if trajectory["info"].get("status") or trajectory["info"].get("error"): - return trajectory + # TODO: Retry when failed or not finished? + return Trajectory.load(trajectory_path) repo_dir = setup_swebench_repo(instance) persist_dir = os.path.join(self.index_store_dir, get_repo_dir_name(instance_id)) @@ -284,31 +285,30 @@ def _evaluate_instance(self, instance: dict, retry: bool = False) -> dict: info["submission"] = diff loop.trajectory.save_info(info) - return loop.trajectory.to_dict() + return loop.trajectory - def _process_instance(self, instance): + def _process_instance(self, instance) -> Tuple[dict, str]: trajectory = self._evaluate_instance(instance) - if not trajectory: - return None, None, None - result, transition_result = to_result(instance, trajectory, self.report) - submission = trajectory["info"].get("submission", "") + result = to_result(instance, trajectory, self.report) + submission = trajectory.info.get("submission", "") - try: - md_report = generate_md_report(trajectory, instance) - if not os.path.exists(f"{self.evaluation_dir}/reports"): - os.makedirs(f"{self.evaluation_dir}/reports") - with open( - f"{self.evaluation_dir}/reports/{instance['instance_id']}.md", - "w", - ) as file: - file.write(md_report) - except Exception: - logging.exception( - f"Error in generating report for {instance['instance_id']} " - ) + if self.markdown_report: + try: + md_report = generate_md_report(trajectory, instance) + if not os.path.exists(f"{self.evaluation_dir}/reports"): + os.makedirs(f"{self.evaluation_dir}/reports") + with open( + f"{self.evaluation_dir}/reports/{instance['instance_id']}.md", + "w", + ) as file: + file.write(md_report) + except Exception: + logging.exception( + f"Error in generating report for {instance['instance_id']} " + ) - return result, transition_result, submission + return result, submission def _process_repo_group(self, repo, instances): results = [] @@ -322,9 +322,8 @@ def _process_repo_group(self, repo, instances): if not trajectory: return None, None - result, transition_result = to_result(instance, trajectory, report=self.report) + result = to_result(instance, trajectory, report=self.report) results.append(result) - transition_results.extend(transition_result) try: md_report = generate_md_report(trajectory, instance) diff --git a/moatless/benchmark/report_v2.py b/moatless/benchmark/report_v2.py index a98ce537..b8430cce 100644 --- a/moatless/benchmark/report_v2.py +++ b/moatless/benchmark/report_v2.py @@ -3,13 +3,37 @@ from moatless import FileRepository from moatless.benchmark.swebench import found_in_expected_spans, found_in_alternative_spans, setup_swebench_repo from moatless.benchmark.utils import get_missing_files +from moatless.edit.plan import ApplyChange from moatless.file_context import FileContext +from moatless.find.search import SearchRequest logger = logging.getLogger(__name__) +import logging + +from moatless import FileRepository +from moatless.benchmark.swebench import found_in_expected_spans, found_in_alternative_spans, setup_swebench_repo +from moatless.benchmark.utils import get_missing_files +from moatless.file_context import FileContext + +logger = logging.getLogger(__name__) + +import logging +from typing import Dict, List, Tuple, Optional + +from moatless import FileRepository +from moatless.benchmark.swebench import 
found_in_expected_spans, found_in_alternative_spans, setup_swebench_repo +from moatless.benchmark.utils import get_missing_files +from moatless.file_context import FileContext +from moatless.trajectory import Trajectory +from moatless.types import ActionTransaction, Usage, Content +from moatless.state import AgenticState + +logger = logging.getLogger(__name__) -def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[dict, list]: - info = trajectory["info"] + +def to_result(instance: Dict, trajectory: Trajectory, report: Optional[Dict] = None) -> Dict: + info = trajectory._info if report and "resolved_ids" in report and instance["instance_id"] in report["resolved_ids"]: result_status = "resolved" @@ -19,7 +43,6 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di resolved = result_status == "resolved" try: - transitions = [] result = { "instance_id": instance["instance_id"], "duration": info.get("duration", 0), @@ -27,7 +50,7 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di "resolved_by": (len(instance.get("resolved_by", []))), "status": None, "result_status": result_status, - "transitions": len(trajectory["transitions"]), + "transitions": len(trajectory.transitions), "edited": False, "planned": False, "identified": None, @@ -49,34 +72,29 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di } lint_codes = set() - search_results_spans = {} - identified_spans = {} - planned_spans = {} - edited_spans = {} + search_results_spans: Dict[str, List[str]] = {} + identified_spans: Dict[str, List[str]] = {} + planned_spans: Dict[str, List[str]] = {} + edited_spans: Dict[str, List[str]] = {} id_iterations = 0 search_iterations = 0 selected_transition_ids = [] - if "current_transition_id" in trajectory: - transitions_map = {t["id"]: t for t in trajectory["transitions"]} - - transition = transitions_map.get(trajectory["current_transition_id"]) - while transition: - selected_transition_ids.append(transition["id"]) - if "parent_id" in transition: - transition = transitions_map.get(transition["parent_id"]) - else: - break + current_state = trajectory.get_current_state() + while current_state: + selected_transition_ids.append(current_state.id) + current_state = current_state.previous_state logger.info(f"Selected transitions: {selected_transition_ids}") if instance.get("expected_spans"): - for transition in trajectory["transitions"]: - if selected_transition_ids and transition["id"] not in selected_transition_ids: + for transition in trajectory.transitions: + if selected_transition_ids and transition.id not in selected_transition_ids: continue - state_name = transition["state"]["name"] + state: AgenticState = transition.state + state_name = state.name if state_name not in result: result[state_name] = 0 @@ -88,76 +106,42 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di for file_path, span_ids in instance["expected_spans"].items(): expected_span_str += f"{file_path}: {span_ids} " - transition_result = { - "instance_id": instance["instance_id"], - "resolved": resolved, - "name": state_name, - "cost": 0, - "expected_spans": expected_span_str, - "actual_spans": "", - } - - if not transition["actions"]: + if not state._actions: continue - for traj_action in transition["actions"]: - result[f"{state_name}_cost"] += traj_action.get( - "completion_cost", 0 - ) - transition_result["cost"] += traj_action.get( - "completion_cost", 0 - ) + for action in state._actions: + 
result[f"{state_name}_cost"] += action.usage.completion_cost if action.usage else 0 if state_name == "SearchCode": search_iterations += 1 - action = transition["actions"][-1] + action = state._actions[-1] - if "search_requests" in action["action"]: - for search_request in action["action"]["search_requests"]: - if search_request.get("query"): + if isinstance(action.request, SearchRequest): + for search_request in action.request.search_requests: + if search_request.query: result["p_query"] += 1 - - if search_request.get("file_pattern"): + if search_request.file_pattern: result["p_file"] += 1 - - if search_request.get("code_snippet"): + if search_request.code_snippet: result["p_code"] += 1 - - if search_request.get( - "class_name" - ) or search_request.get("class_names"): + if search_request.class_name or search_request.class_names: result["p_class"] += 1 - - if search_request.get( - "function_name" - ) or search_request.get("function_names"): + if search_request.function_name or search_request.function_names: result["p_function"] += 1 if state_name == "IdentifyCode": id_iterations += 1 - state = transition["state"] - if state.get("ranked_spans"): - for ranked_span in state["ranked_spans"]: - if ( - ranked_span["file_path"] - not in search_results_spans - ): - search_results_spans[ - ranked_span["file_path"] - ] = [] - search_results_spans[ - ranked_span["file_path"] - ].append(ranked_span["span_id"]) + if state.ranked_spans: + for ranked_span in state.ranked_spans: + if ranked_span.file_path not in search_results_spans: + search_results_spans[ranked_span.file_path] = [] + search_results_spans[ranked_span.file_path].append(ranked_span.span_id) if not result["found_in_search"] and ( - found_in_expected_spans( - instance, search_results_spans - ) - or found_in_alternative_spans( - instance, search_results_spans - ) + found_in_expected_spans(instance, search_results_spans) + or found_in_alternative_spans(instance, search_results_spans) ): result["found_in_search"] = search_iterations @@ -169,24 +153,17 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di if not missing_files: result["file_in_search"] = search_iterations - action = transition["actions"][-1] - if action.get("action"): + if state._actions: + action = state._actions[-1] identified_str = "" - if action["action"].get("identified_spans"): - for span in action["action"]["identified_spans"]: - identified_str += ( - f"{span['file_path']}: {span['span_ids']} " - ) - if span["file_path"] not in identified_spans: - identified_spans[span["file_path"]] = [] - - transition_result["actual_spans"] += ( - f"{span['file_path']}: {','.join(span['span_ids'])} " - ) - for span_id in span["span_ids"]: - identified_spans[span["file_path"]].append( - span_id - ) + if action.request.identified_spans: + for span in action.request.identified_spans: + identified_str += f"{span.file_path}: {span.span_ids} " + if span.file_path not in identified_spans: + identified_spans[span.file_path] = [] + + for span_id in span.span_ids: + identified_spans[span.file_path].append(span_id) result["identified_spans"] = identified_str if not result["file_identified"]: @@ -197,92 +174,62 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di if not missing_files: result["file_identified"] = id_iterations - if result[ - "expected_identified" - ] is None and found_in_expected_spans( - instance, identified_spans - ): + if result["expected_identified"] is None and found_in_expected_spans(instance, identified_spans): 
result["expected_identified"] = id_iterations - if result[ - "alt_identified" - ] is None and found_in_alternative_spans( - instance, identified_spans - ): + if result["alt_identified"] is None and found_in_alternative_spans(instance, identified_spans): result["alt_identified"] = id_iterations - if result.get("alt_identified") or result.get( - "expected_identified" - ): + if result.get("alt_identified") or result.get("expected_identified"): result["identified"] = min( result.get("alt_identified") or 1000, result.get("expected_identified") or 1000, ) if state_name == "PlanToCode": - action = transition["actions"][-1]["action"] - if action.get("action") == "review": + action = state._actions[-1] + + if action.request.action == "review": result["review"] = True - if "file_path" in action: - if "span_id" not in action: - logger.warning( - f"Span id missing in planning action in {instance['instance_id']}" - ) - else: - file_path = action["file_path"] - if file_path not in planned_spans: - planned_spans[file_path] = [] - planned_spans[file_path].append(action["span_id"]) - transition_result["actual_spans"] = ( - f"{file_path}: {action['span_id']} " - ) + if action.request.file_path: + file_path = action.request.file_path + if file_path not in planned_spans: + planned_spans[file_path] = [] + planned_spans[file_path].append(action.request.span_id) if not result.get("planned") and ( - found_in_expected_spans( - instance, - planned_spans, - ) + found_in_expected_spans(instance, planned_spans) or found_in_alternative_spans(instance, planned_spans) ): result["planned"] = True if state_name == "EditCode": - result["edit_retries"] = len(transition["actions"]) - 1 + result["edit_retries"] = len(state._actions) - 1 - action = transition["actions"][-1] - edited = action.get("trigger") == "finish" + action = state._actions[-1] + edited = action.response and action.response.trigger == "finish" - if edited and "file_path" in transition["state"]: - file_path = transition["state"]["file_path"] + if edited and hasattr(state, 'file_path'): + file_path = state.file_path if file_path not in edited_spans: edited_spans[file_path] = [] - edited_spans[file_path].append( - transition["state"]["span_id"] - ) - transition_result["actual_spans"] = ( - f"{file_path}: {transition['state']['span_id']} " - ) + edited_spans[file_path].append(state.span_id) if not result.get("edited") and ( - found_in_expected_spans( - instance, - edited_spans, - ) + found_in_expected_spans(instance, edited_spans) or found_in_alternative_spans(instance, edited_spans) ): result["edited"] = True - output = action.get("output", {}) - if output: + if action.response and action.response.output: + output = action.response.output if edited: result["has_diff"] = True for lint in output.get("verification_errors", []): lint_codes.add(lint["code"]) - transitions.append(transition_result) - if result.get("alt_identified") or result.get("expected_identified"): result["identified"] = min( result.get("alt_identified") or 1000, @@ -291,9 +238,7 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di result["expected_files"] = list(instance["expected_spans"].keys()) result["edited_files"] = list(edited_spans.keys()) - result["identified_spans"] = sum( - [len(v) for v in identified_spans.values()] - ) + result["identified_spans"] = sum(len(v) for v in identified_spans.values()) result["lints"] = ",".join(lint_codes) @@ -316,7 +261,96 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di except Exception as 
e: raise e - return result, transitions + return result + +def generate_md_report(trajectory: Trajectory, instance: Dict) -> str: + info = trajectory._info + markdown = f"# {instance['instance_id']}\n" + + markdown += "\n## Problem statement\n" + markdown += f"```\n{instance['problem_statement']}\n```\n" + + if "error" in trajectory._info: + markdown += "\n## Error\n" + markdown += f"```\n{trajectory._info['error']}\n```\n" + else: + markdown += "\n## Prediction\n" + markdown += f"```diff\n{info['submission']}\n```\n" + + markdown += "\n## Golden patch\n" + markdown += f"```diff\n{instance['golden_patch']}\n```\n" + + markdown += "\n## Trajectory\n" + + repo_dir = setup_swebench_repo(instance) + file_repo = FileRepository(repo_dir) + + for j, transition in enumerate(trajectory.transitions): + state = transition.state + for i, action in enumerate(state._actions): + markdown += f"### {j+1} {state.name} ({i+1})\n\n" + + if state.name == "PlanToCode": + if action.request.file_path: + if action.request.instructions: + markdown += f"\n\n * {action.request.instructions}" + markdown += f"\n * {action.request.file_path}" + markdown += f"\n * {action.request.span_id}" + + markdown += "\n\n#### File context \n\n" + try: + file_context = FileContext(file_repo) + file_context.add_span_to_context( + action.request.file_path, + action.request.span_id, + ) + markdown += file_context.create_prompt( + show_outcommented_code=True + ) + except Exception as e: + logger.error(e) + + if state.name == "EditCode": + markdown += "#### LLM Response\n\n" + markdown += f"```\n{action.request.content if isinstance(action.request, Content) else ''}\n```\n" + + if action.response and action.response.output: + output = action.response.output + if output.get("diff"): + markdown += "#### Diff\n\n" + markdown += f"```diff\n{output['diff']}\n```\n" + + if output.get("errors"): + markdown += "#### Errors\n\n" + markdown += f"{output['errors']}\n\n" + + if output.get("message"): + markdown += "#### Message\n\n" + markdown += f"{output['message']}\n\n" + + if state.name == "ClarifyCodeChange": + + if action.request.scratch_pad: + markdown += f"*{action.request.scratch_pad}*" + + if action.response and action.response.output: + output = action.response.output + if output.get("start_line"): + markdown += f"\n* Start Line: {output['start_line']}\n" + markdown += f"\n* End Line: {output['end_line']}\n" + + if state.name == "Finished": + markdown += f"*{action.request.thoughts}*\n" + + if state.name == "Rejected": + markdown += f"*{action.request.thoughts}*\n" + + markdown += "## Alternative patches\n" + for alternative in instance["resolved_by"]: + markdown += f"### {alternative['name']}\n" + markdown += f"```diff\n{alternative['patch']}\n```\n" + + return markdown def generate_md_report(trajectory: dict, instance: dict): info = trajectory["info"] markdown = f"# {instance['instance_id']}\n" diff --git a/moatless/trajectory.py b/moatless/trajectory.py index bdebe4fe..8b5ac962 100644 --- a/moatless/trajectory.py +++ b/moatless/trajectory.py @@ -91,6 +91,8 @@ def load(cls, file_path: str): workspace=workspace ) + trajectory._info = data.get("info", {}) + trajectory._transitions = {} trajectory._current_transition_id = data.get("current_transition_id", 0) @@ -152,6 +154,10 @@ def load(cls, file_path: str): def initial_message(self): return self._initial_message + @property + def info(self): + return self._info + @property def states(self) -> List[dict]: return [t.state.model_dump() for t in self.transitions] diff --git 
a/moatless/types.py b/moatless/types.py index f338efaf..5546f140 100644 --- a/moatless/types.py +++ b/moatless/types.py @@ -28,9 +28,19 @@ def action_name(self): return self.__class__.__name__ class ActionResponse(BaseModel): - trigger: Optional[str] = None - output: Optional[dict[str, Any]] = None - retry_message: Optional[str] = None + trigger: Optional[str] = Field( + default=None, + description="Trigger to transition to the next state. If None, no transition is made.", + ) + output: Optional[dict[str, Any]] = Field( + default=None, + description="Output data to be passed to the next state.", + ) + + retry_message: Optional[str] = Field( + default=None, + description="Message to use in retry." + ) @classmethod def retry(cls, retry_message: str): diff --git a/tests/benchmark/test_evaluation.py b/tests/benchmark/test_evaluation.py new file mode 100644 index 00000000..28dea29c --- /dev/null +++ b/tests/benchmark/test_evaluation.py @@ -0,0 +1,75 @@ +import os +from datetime import datetime + +import pytest +from dotenv import load_dotenv + +from moatless.benchmark.evaluation import Evaluation +from moatless.edit import PlanToCode, EditCode +from moatless.find import SearchCode, IdentifyCode, DecideRelevance +from moatless.transitions import search_and_code_transitions + +load_dotenv() +moatless_dir = os.getenv("MOATLESS_DIR", "/tmp/moatless") +index_store_dir = os.getenv("INDEX_STORE_DIR", "/tmp/index_store") +repo_dir = os.getenv("REPO_DIR", "/tmp/repo") + +global_params = { + "model": "gpt-4o-mini-2024-07-18", # "azure/gpt-4o", + "temperature": 0.5, + "max_tokens": 2000, + "max_prompt_file_tokens": 8000, +} + +state_params = { + SearchCode: { + "provide_initial_context": True, + "max_search_results": 75, + "initial_context_tokens": 6000, + "initial_search_results": 100, + "initial_context_spans_per_file": 5, + }, + IdentifyCode: {"expand_context": True}, + DecideRelevance: { + "finish_after_relevant_count": 1, + }, + PlanToCode: { + "max_tokens_in_edit_prompt": 750, + "expand_context_with_related_spans": False, + "finish_on_review": True, + }, + EditCode: { + "chain_of_thought": False, + "show_file_context": False, + "max_prompt_file_tokens": 8000, + }, +} + +search_and_code = search_and_code_transitions( + global_params=global_params, state_params=state_params +) + +pytest.mark.llm_integration = pytest.mark.skipif( + "not config.getoption('--run-llm-integration')", + reason="need --run-llm-integration option to run tests that call LLMs", +) + + +@pytest.mark.llm_integration +def test_run_single_evaluation_mcts(): + datestr = datetime.now().strftime("%Y%m%d-%H%M%S") + dir = f"{moatless_dir}/eval_test" + evaluation_name = f"{datestr}_mcts" + + evaluation = Evaluation( + transitions=search_and_code, + evaluations_dir=dir, + evaluation_name=evaluation_name, + index_store_dir=index_store_dir, + repo_base_dir=repo_dir, + max_file_context_tokens=16000, + num_workers=1, + detailed_report=True, + ) + + evaluation.run_single_instance("django__django-16379") \ No newline at end of file diff --git a/tests/benchmark/test_report_v2.py b/tests/benchmark/test_report_v2.py new file mode 100644 index 00000000..e35c4771 --- /dev/null +++ b/tests/benchmark/test_report_v2.py @@ -0,0 +1,39 @@ +import json +from pathlib import Path + +import pytest + +from moatless.benchmark.report_v2 import to_result +from moatless.trajectory import Trajectory + + +@pytest.fixture +def django_trajectory(): + file_path = Path("tests/trajectories/django__django_16379.json") + return Trajectory.load(str(file_path)) + + 
+@pytest.fixture +def dataset(): + with open("moatless/benchmark/swebench_lite_all_evaluations.json") as f: + return json.load(f) + +@pytest.fixture +def django_instance(dataset): + for instance in dataset: + if instance["instance_id"] == "django__django-16379": + return instance + + return None + + +def test_to_result(django_trajectory, django_instance): + result = to_result(django_instance, django_trajectory) + + assert result["instance_id"] == "django__django-16379" + assert result["status"] == "edited" + assert result["transitions"] == len(django_trajectory.transitions) + assert result["edited"] + assert result["identified"] + assert result["found_in_search"] + assert result["file_identified"] diff --git a/tests/test_state.py b/tests/test_state.py index a32c0021..ef45cedd 100644 --- a/tests/test_state.py +++ b/tests/test_state.py @@ -160,4 +160,4 @@ def test_finished_state_creation_and_dump(): assert dumped_state["id"] == 1 assert dumped_state["message"] == message - assert dumped_state["output"] == output \ No newline at end of file + assert dumped_state["output"] == output diff --git a/tests/trajectories/django__django_16379.json b/tests/trajectories/django__django_16379.json index ab529b90..e8cf4cda 100644 --- a/tests/trajectories/django__django_16379.json +++ b/tests/trajectories/django__django_16379.json @@ -114,7 +114,7 @@ "type": "git", "repo_path": "/tmp/repos/swe-bench_django__django", "git_repo_url": "https://github.com/swe-bench/django__django.git", - "commit": "8c1c528fea1644fc1cf597288d047cd110b3febd" + "commit": "1d0fa848e084cad62d0bb6bde3b51e4862558e57" }, "file_context": { "max_tokens": 4000,
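Usage sketch (illustrative, not part of the diff above): how the reworked report_v2 entry
points fit together after this change, mirroring the new tests in
tests/benchmark/test_report_v2.py and the updated _process_instance. Paths assume a
repository checkout; adjust as needed.

    import json

    from moatless.benchmark.report_v2 import to_result, generate_md_report
    from moatless.trajectory import Trajectory

    # Load a stored trajectory in the new format (same fixture the tests use).
    trajectory = Trajectory.load("tests/trajectories/django__django_16379.json")

    # Look up the matching SWE-bench instance from the bundled evaluation dataset.
    with open("moatless/benchmark/swebench_lite_all_evaluations.json") as f:
        dataset = json.load(f)
    instance = next(i for i in dataset if i["instance_id"] == "django__django-16379")

    # to_result() now takes the Trajectory object directly and returns a single
    # result dict; the per-transition list it previously returned is gone.
    result = to_result(instance, trajectory)
    print(result["status"], result["transitions"])

    # Markdown reports are now opt-in (Evaluation(markdown_report=True)); the
    # helper itself is also called with the Trajectory object.
    markdown = generate_md_report(trajectory, instance)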