From cf85f9d5162f2e35ac90f25d5c89e6521c022398 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Albert=20=C3=96rwall?= Date: Tue, 6 Aug 2024 09:05:49 +0200 Subject: [PATCH] Adjust report_v2 to new trajectory format --- moatless/benchmark/claude_evaluation.py | 6 +- moatless/benchmark/evaluation.py | 57 ++-- moatless/benchmark/report_v2.py | 324 ++++++++++--------- moatless/trajectory.py | 6 + moatless/types.py | 16 +- tests/benchmark/test_evaluation.py | 75 +++++ tests/benchmark/test_report_v2.py | 39 +++ tests/test_state.py | 2 +- tests/trajectories/django__django_16379.json | 2 +- 9 files changed, 345 insertions(+), 182 deletions(-) create mode 100644 tests/benchmark/test_evaluation.py create mode 100644 tests/benchmark/test_report_v2.py diff --git a/moatless/benchmark/claude_evaluation.py b/moatless/benchmark/claude_evaluation.py index 45c99b1c..6b44915f 100644 --- a/moatless/benchmark/claude_evaluation.py +++ b/moatless/benchmark/claude_evaluation.py @@ -4,7 +4,7 @@ import instructor -from moatless import Transitions +from moatless.transition_rules import TransitionRules from moatless.benchmark.evaluation import create_evaluation_name, Evaluation from moatless.edit.edit import EditCode from moatless.edit.plan import PlanToCode @@ -170,7 +170,7 @@ def run_evaluation(): def evaluate_search(): - transitions = Transitions( + transitions = TransitionRules( global_params=global_params, state_params={ SearchCode: {"max_search_results": 50, "provide_initial_context": True}, @@ -280,7 +280,7 @@ def evaluate_coding(): def evaluate_plan(previous_trajectory_dir: Optional[str] = None): - transitions = Transitions( + transitions = TransitionRules( global_params=global_params, state_params={ SearchCode: { diff --git a/moatless/benchmark/evaluation.py b/moatless/benchmark/evaluation.py index 78a73a18..c13ba71d 100644 --- a/moatless/benchmark/evaluation.py +++ b/moatless/benchmark/evaluation.py @@ -7,7 +7,7 @@ import traceback from collections import defaultdict from datetime import datetime, timezone -from typing import Optional +from typing import Optional, Tuple import instructor import litellm @@ -15,6 +15,7 @@ from tqdm.auto import tqdm from moatless.benchmark.report_v2 import to_result, generate_md_report +from moatless.trajectory import Trajectory from moatless.transition_rules import TransitionRules from moatless.benchmark.swebench import ( found_in_alternative_spans, @@ -82,6 +83,7 @@ def __init__( max_transitions: int = 25, max_expansions: int = 2, max_file_context_tokens: int = 16000, + markdown_report: bool = False, litellm_callback: Optional[str] = None, previous_trajectory_dir: Optional[str] = None, retry_state: Optional[str] = None, @@ -93,6 +95,7 @@ def __init__( self.evaluations_dir = evaluations_dir self.num_workers = num_workers self.detailed_report = detailed_report + self.markdown_report = markdown_report self.evaluation_name = evaluation_name self.max_file_context_tokens = max_file_context_tokens @@ -193,11 +196,11 @@ def run_single_instance( instance_id: str, dataset: str = "princeton-nlp/SWE-bench_Lite", split="test", - ): + ) -> dict: instance = load_instance(instance_id, dataset, split) return self._evaluate_instance(instance) - def _evaluate_instance(self, instance: dict, retry: bool = False) -> dict: + def _evaluate_instance(self, instance: dict, retry: bool = False) -> Trajectory: instance_id = instance["instance_id"] trajectory_path = os.path.join(self.trajectory_dir, f"{instance_id}.json") prompt_log_dir = os.path.join(self.logs_dir, f"{instance_id}") @@ -205,10 +208,8 @@ def 
_evaluate_instance(self, instance: dict, retry: bool = False) -> dict: os.makedirs(prompt_log_dir) if os.path.exists(trajectory_path) and not retry: - with open(trajectory_path) as file: - trajectory = json.load(file) - if trajectory["info"].get("status") or trajectory["info"].get("error"): - return trajectory + # TODO: Retry when failed or not finished? + return Trajectory.load(trajectory_path) repo_dir = setup_swebench_repo(instance) persist_dir = os.path.join(self.index_store_dir, get_repo_dir_name(instance_id)) @@ -284,31 +285,30 @@ def _evaluate_instance(self, instance: dict, retry: bool = False) -> dict: info["submission"] = diff loop.trajectory.save_info(info) - return loop.trajectory.to_dict() + return loop.trajectory - def _process_instance(self, instance): + def _process_instance(self, instance) -> Tuple[dict, str]: trajectory = self._evaluate_instance(instance) - if not trajectory: - return None, None, None - result, transition_result = to_result(instance, trajectory, self.report) - submission = trajectory["info"].get("submission", "") + result = to_result(instance, trajectory, self.report) + submission = trajectory.info.get("submission", "") - try: - md_report = generate_md_report(trajectory, instance) - if not os.path.exists(f"{self.evaluation_dir}/reports"): - os.makedirs(f"{self.evaluation_dir}/reports") - with open( - f"{self.evaluation_dir}/reports/{instance['instance_id']}.md", - "w", - ) as file: - file.write(md_report) - except Exception: - logging.exception( - f"Error in generating report for {instance['instance_id']} " - ) + if self.markdown_report: + try: + md_report = generate_md_report(trajectory, instance) + if not os.path.exists(f"{self.evaluation_dir}/reports"): + os.makedirs(f"{self.evaluation_dir}/reports") + with open( + f"{self.evaluation_dir}/reports/{instance['instance_id']}.md", + "w", + ) as file: + file.write(md_report) + except Exception: + logging.exception( + f"Error in generating report for {instance['instance_id']} " + ) - return result, transition_result, submission + return result, submission def _process_repo_group(self, repo, instances): results = [] @@ -322,9 +322,8 @@ def _process_repo_group(self, repo, instances): if not trajectory: return None, None - result, transition_result = to_result(instance, trajectory, report=self.report) + result = to_result(instance, trajectory, report=self.report) results.append(result) - transition_results.extend(transition_result) try: md_report = generate_md_report(trajectory, instance) diff --git a/moatless/benchmark/report_v2.py b/moatless/benchmark/report_v2.py index a98ce537..b8430cce 100644 --- a/moatless/benchmark/report_v2.py +++ b/moatless/benchmark/report_v2.py @@ -3,13 +3,37 @@ from moatless import FileRepository from moatless.benchmark.swebench import found_in_expected_spans, found_in_alternative_spans, setup_swebench_repo from moatless.benchmark.utils import get_missing_files +from moatless.edit.plan import ApplyChange from moatless.file_context import FileContext +from moatless.find.search import SearchRequest logger = logging.getLogger(__name__) +import logging + +from moatless import FileRepository +from moatless.benchmark.swebench import found_in_expected_spans, found_in_alternative_spans, setup_swebench_repo +from moatless.benchmark.utils import get_missing_files +from moatless.file_context import FileContext + +logger = logging.getLogger(__name__) + +import logging +from typing import Dict, List, Tuple, Optional + +from moatless import FileRepository +from moatless.benchmark.swebench import 
found_in_expected_spans, found_in_alternative_spans, setup_swebench_repo +from moatless.benchmark.utils import get_missing_files +from moatless.file_context import FileContext +from moatless.trajectory import Trajectory +from moatless.types import ActionTransaction, Usage, Content +from moatless.state import AgenticState + +logger = logging.getLogger(__name__) -def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[dict, list]: - info = trajectory["info"] + +def to_result(instance: Dict, trajectory: Trajectory, report: Optional[Dict] = None) -> Dict: + info = trajectory._info if report and "resolved_ids" in report and instance["instance_id"] in report["resolved_ids"]: result_status = "resolved" @@ -19,7 +43,6 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di resolved = result_status == "resolved" try: - transitions = [] result = { "instance_id": instance["instance_id"], "duration": info.get("duration", 0), @@ -27,7 +50,7 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di "resolved_by": (len(instance.get("resolved_by", []))), "status": None, "result_status": result_status, - "transitions": len(trajectory["transitions"]), + "transitions": len(trajectory.transitions), "edited": False, "planned": False, "identified": None, @@ -49,34 +72,29 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di } lint_codes = set() - search_results_spans = {} - identified_spans = {} - planned_spans = {} - edited_spans = {} + search_results_spans: Dict[str, List[str]] = {} + identified_spans: Dict[str, List[str]] = {} + planned_spans: Dict[str, List[str]] = {} + edited_spans: Dict[str, List[str]] = {} id_iterations = 0 search_iterations = 0 selected_transition_ids = [] - if "current_transition_id" in trajectory: - transitions_map = {t["id"]: t for t in trajectory["transitions"]} - - transition = transitions_map.get(trajectory["current_transition_id"]) - while transition: - selected_transition_ids.append(transition["id"]) - if "parent_id" in transition: - transition = transitions_map.get(transition["parent_id"]) - else: - break + current_state = trajectory.get_current_state() + while current_state: + selected_transition_ids.append(current_state.id) + current_state = current_state.previous_state logger.info(f"Selected transitions: {selected_transition_ids}") if instance.get("expected_spans"): - for transition in trajectory["transitions"]: - if selected_transition_ids and transition["id"] not in selected_transition_ids: + for transition in trajectory.transitions: + if selected_transition_ids and transition.id not in selected_transition_ids: continue - state_name = transition["state"]["name"] + state: AgenticState = transition.state + state_name = state.name if state_name not in result: result[state_name] = 0 @@ -88,76 +106,42 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di for file_path, span_ids in instance["expected_spans"].items(): expected_span_str += f"{file_path}: {span_ids} " - transition_result = { - "instance_id": instance["instance_id"], - "resolved": resolved, - "name": state_name, - "cost": 0, - "expected_spans": expected_span_str, - "actual_spans": "", - } - - if not transition["actions"]: + if not state._actions: continue - for traj_action in transition["actions"]: - result[f"{state_name}_cost"] += traj_action.get( - "completion_cost", 0 - ) - transition_result["cost"] += traj_action.get( - "completion_cost", 0 - ) + for action in state._actions: + 
result[f"{state_name}_cost"] += action.usage.completion_cost if action.usage else 0 if state_name == "SearchCode": search_iterations += 1 - action = transition["actions"][-1] + action = state._actions[-1] - if "search_requests" in action["action"]: - for search_request in action["action"]["search_requests"]: - if search_request.get("query"): + if isinstance(action.request, SearchRequest): + for search_request in action.request.search_requests: + if search_request.query: result["p_query"] += 1 - - if search_request.get("file_pattern"): + if search_request.file_pattern: result["p_file"] += 1 - - if search_request.get("code_snippet"): + if search_request.code_snippet: result["p_code"] += 1 - - if search_request.get( - "class_name" - ) or search_request.get("class_names"): + if search_request.class_name or search_request.class_names: result["p_class"] += 1 - - if search_request.get( - "function_name" - ) or search_request.get("function_names"): + if search_request.function_name or search_request.function_names: result["p_function"] += 1 if state_name == "IdentifyCode": id_iterations += 1 - state = transition["state"] - if state.get("ranked_spans"): - for ranked_span in state["ranked_spans"]: - if ( - ranked_span["file_path"] - not in search_results_spans - ): - search_results_spans[ - ranked_span["file_path"] - ] = [] - search_results_spans[ - ranked_span["file_path"] - ].append(ranked_span["span_id"]) + if state.ranked_spans: + for ranked_span in state.ranked_spans: + if ranked_span.file_path not in search_results_spans: + search_results_spans[ranked_span.file_path] = [] + search_results_spans[ranked_span.file_path].append(ranked_span.span_id) if not result["found_in_search"] and ( - found_in_expected_spans( - instance, search_results_spans - ) - or found_in_alternative_spans( - instance, search_results_spans - ) + found_in_expected_spans(instance, search_results_spans) + or found_in_alternative_spans(instance, search_results_spans) ): result["found_in_search"] = search_iterations @@ -169,24 +153,17 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di if not missing_files: result["file_in_search"] = search_iterations - action = transition["actions"][-1] - if action.get("action"): + if state._actions: + action = state._actions[-1] identified_str = "" - if action["action"].get("identified_spans"): - for span in action["action"]["identified_spans"]: - identified_str += ( - f"{span['file_path']}: {span['span_ids']} " - ) - if span["file_path"] not in identified_spans: - identified_spans[span["file_path"]] = [] - - transition_result["actual_spans"] += ( - f"{span['file_path']}: {','.join(span['span_ids'])} " - ) - for span_id in span["span_ids"]: - identified_spans[span["file_path"]].append( - span_id - ) + if action.request.identified_spans: + for span in action.request.identified_spans: + identified_str += f"{span.file_path}: {span.span_ids} " + if span.file_path not in identified_spans: + identified_spans[span.file_path] = [] + + for span_id in span.span_ids: + identified_spans[span.file_path].append(span_id) result["identified_spans"] = identified_str if not result["file_identified"]: @@ -197,92 +174,62 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di if not missing_files: result["file_identified"] = id_iterations - if result[ - "expected_identified" - ] is None and found_in_expected_spans( - instance, identified_spans - ): + if result["expected_identified"] is None and found_in_expected_spans(instance, identified_spans): 
result["expected_identified"] = id_iterations - if result[ - "alt_identified" - ] is None and found_in_alternative_spans( - instance, identified_spans - ): + if result["alt_identified"] is None and found_in_alternative_spans(instance, identified_spans): result["alt_identified"] = id_iterations - if result.get("alt_identified") or result.get( - "expected_identified" - ): + if result.get("alt_identified") or result.get("expected_identified"): result["identified"] = min( result.get("alt_identified") or 1000, result.get("expected_identified") or 1000, ) if state_name == "PlanToCode": - action = transition["actions"][-1]["action"] - if action.get("action") == "review": + action = state._actions[-1] + + if action.request.action == "review": result["review"] = True - if "file_path" in action: - if "span_id" not in action: - logger.warning( - f"Span id missing in planning action in {instance['instance_id']}" - ) - else: - file_path = action["file_path"] - if file_path not in planned_spans: - planned_spans[file_path] = [] - planned_spans[file_path].append(action["span_id"]) - transition_result["actual_spans"] = ( - f"{file_path}: {action['span_id']} " - ) + if action.request.file_path: + file_path = action.request.file_path + if file_path not in planned_spans: + planned_spans[file_path] = [] + planned_spans[file_path].append(action.request.span_id) if not result.get("planned") and ( - found_in_expected_spans( - instance, - planned_spans, - ) + found_in_expected_spans(instance, planned_spans) or found_in_alternative_spans(instance, planned_spans) ): result["planned"] = True if state_name == "EditCode": - result["edit_retries"] = len(transition["actions"]) - 1 + result["edit_retries"] = len(state._actions) - 1 - action = transition["actions"][-1] - edited = action.get("trigger") == "finish" + action = state._actions[-1] + edited = action.response and action.response.trigger == "finish" - if edited and "file_path" in transition["state"]: - file_path = transition["state"]["file_path"] + if edited and hasattr(state, 'file_path'): + file_path = state.file_path if file_path not in edited_spans: edited_spans[file_path] = [] - edited_spans[file_path].append( - transition["state"]["span_id"] - ) - transition_result["actual_spans"] = ( - f"{file_path}: {transition['state']['span_id']} " - ) + edited_spans[file_path].append(state.span_id) if not result.get("edited") and ( - found_in_expected_spans( - instance, - edited_spans, - ) + found_in_expected_spans(instance, edited_spans) or found_in_alternative_spans(instance, edited_spans) ): result["edited"] = True - output = action.get("output", {}) - if output: + if action.response and action.response.output: + output = action.response.output if edited: result["has_diff"] = True for lint in output.get("verification_errors", []): lint_codes.add(lint["code"]) - transitions.append(transition_result) - if result.get("alt_identified") or result.get("expected_identified"): result["identified"] = min( result.get("alt_identified") or 1000, @@ -291,9 +238,7 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di result["expected_files"] = list(instance["expected_spans"].keys()) result["edited_files"] = list(edited_spans.keys()) - result["identified_spans"] = sum( - [len(v) for v in identified_spans.values()] - ) + result["identified_spans"] = sum(len(v) for v in identified_spans.values()) result["lints"] = ",".join(lint_codes) @@ -316,7 +261,96 @@ def to_result(instance: dict, trajectory: dict, report: dict | None) -> tuple[di except Exception as 
e: raise e - return result, transitions + return result + +def generate_md_report(trajectory: Trajectory, instance: Dict) -> str: + info = trajectory._info + markdown = f"# {instance['instance_id']}\n" + + markdown += "\n## Problem statement\n" + markdown += f"```\n{instance['problem_statement']}\n```\n" + + if "error" in trajectory._info: + markdown += "\n## Error\n" + markdown += f"```\n{trajectory._info['error']}\n```\n" + else: + markdown += "\n## Prediction\n" + markdown += f"```diff\n{info['submission']}\n```\n" + + markdown += "\n## Golden patch\n" + markdown += f"```diff\n{instance['golden_patch']}\n```\n" + + markdown += "\n## Trajectory\n" + + repo_dir = setup_swebench_repo(instance) + file_repo = FileRepository(repo_dir) + + for j, transition in enumerate(trajectory.transitions): + state = transition.state + for i, action in enumerate(state._actions): + markdown += f"### {j+1} {state.name} ({i+1})\n\n" + + if state.name == "PlanToCode": + if action.request.file_path: + if action.request.instructions: + markdown += f"\n\n * {action.request.instructions}" + markdown += f"\n * {action.request.file_path}" + markdown += f"\n * {action.request.span_id}" + + markdown += "\n\n#### File context \n\n" + try: + file_context = FileContext(file_repo) + file_context.add_span_to_context( + action.request.file_path, + action.request.span_id, + ) + markdown += file_context.create_prompt( + show_outcommented_code=True + ) + except Exception as e: + logger.error(e) + + if state.name == "EditCode": + markdown += "#### LLM Response\n\n" + markdown += f"```\n{action.request.content if isinstance(action.request, Content) else ''}\n```\n" + + if action.response and action.response.output: + output = action.response.output + if output.get("diff"): + markdown += "#### Diff\n\n" + markdown += f"```diff\n{output['diff']}\n```\n" + + if output.get("errors"): + markdown += "#### Errors\n\n" + markdown += f"{output['errors']}\n\n" + + if output.get("message"): + markdown += "#### Message\n\n" + markdown += f"{output['message']}\n\n" + + if state.name == "ClarifyCodeChange": + + if action.request.scratch_pad: + markdown += f"*{action.request.scratch_pad}*" + + if action.response and action.response.output: + output = action.response.output + if output.get("start_line"): + markdown += f"\n* Start Line: {output['start_line']}\n" + markdown += f"\n* End Line: {output['end_line']}\n" + + if state.name == "Finished": + markdown += f"*{action.request.thoughts}*\n" + + if state.name == "Rejected": + markdown += f"*{action.request.thoughts}*\n" + + markdown += "## Alternative patches\n" + for alternative in instance["resolved_by"]: + markdown += f"### {alternative['name']}\n" + markdown += f"```diff\n{alternative['patch']}\n```\n" + + return markdown def generate_md_report(trajectory: dict, instance: dict): info = trajectory["info"] markdown = f"# {instance['instance_id']}\n" diff --git a/moatless/trajectory.py b/moatless/trajectory.py index bdebe4fe..8b5ac962 100644 --- a/moatless/trajectory.py +++ b/moatless/trajectory.py @@ -91,6 +91,8 @@ def load(cls, file_path: str): workspace=workspace ) + trajectory._info = data.get("info", {}) + trajectory._transitions = {} trajectory._current_transition_id = data.get("current_transition_id", 0) @@ -152,6 +154,10 @@ def load(cls, file_path: str): def initial_message(self): return self._initial_message + @property + def info(self): + return self._info + @property def states(self) -> List[dict]: return [t.state.model_dump() for t in self.transitions] diff --git 
a/moatless/types.py b/moatless/types.py index f338efaf..5546f140 100644 --- a/moatless/types.py +++ b/moatless/types.py @@ -28,9 +28,19 @@ def action_name(self): return self.__class__.__name__ class ActionResponse(BaseModel): - trigger: Optional[str] = None - output: Optional[dict[str, Any]] = None - retry_message: Optional[str] = None + trigger: Optional[str] = Field( + default=None, + description="Trigger to transition to the next state. If None, no transition is made.", + ) + output: Optional[dict[str, Any]] = Field( + default=None, + description="Output data to be passed to the next state.", + ) + + retry_message: Optional[str] = Field( + default=None, + description="Message to use in retry." + ) @classmethod def retry(cls, retry_message: str): diff --git a/tests/benchmark/test_evaluation.py b/tests/benchmark/test_evaluation.py new file mode 100644 index 00000000..28dea29c --- /dev/null +++ b/tests/benchmark/test_evaluation.py @@ -0,0 +1,75 @@ +import os +from datetime import datetime + +import pytest +from dotenv import load_dotenv + +from moatless.benchmark.evaluation import Evaluation +from moatless.edit import PlanToCode, EditCode +from moatless.find import SearchCode, IdentifyCode, DecideRelevance +from moatless.transitions import search_and_code_transitions + +load_dotenv() +moatless_dir = os.getenv("MOATLESS_DIR", "/tmp/moatless") +index_store_dir = os.getenv("INDEX_STORE_DIR", "/tmp/index_store") +repo_dir = os.getenv("REPO_DIR", "/tmp/repo") + +global_params = { + "model": "gpt-4o-mini-2024-07-18", # "azure/gpt-4o", + "temperature": 0.5, + "max_tokens": 2000, + "max_prompt_file_tokens": 8000, +} + +state_params = { + SearchCode: { + "provide_initial_context": True, + "max_search_results": 75, + "initial_context_tokens": 6000, + "initial_search_results": 100, + "initial_context_spans_per_file": 5, + }, + IdentifyCode: {"expand_context": True}, + DecideRelevance: { + "finish_after_relevant_count": 1, + }, + PlanToCode: { + "max_tokens_in_edit_prompt": 750, + "expand_context_with_related_spans": False, + "finish_on_review": True, + }, + EditCode: { + "chain_of_thought": False, + "show_file_context": False, + "max_prompt_file_tokens": 8000, + }, +} + +search_and_code = search_and_code_transitions( + global_params=global_params, state_params=state_params +) + +pytest.mark.llm_integration = pytest.mark.skipif( + "not config.getoption('--run-llm-integration')", + reason="need --run-llm-integration option to run tests that call LLMs", +) + + +@pytest.mark.llm_integration +def test_run_single_evaluation_mcts(): + datestr = datetime.now().strftime("%Y%m%d-%H%M%S") + dir = f"{moatless_dir}/eval_test" + evaluation_name = f"{datestr}_mcts" + + evaluation = Evaluation( + transitions=search_and_code, + evaluations_dir=dir, + evaluation_name=evaluation_name, + index_store_dir=index_store_dir, + repo_base_dir=repo_dir, + max_file_context_tokens=16000, + num_workers=1, + detailed_report=True, + ) + + evaluation.run_single_instance("django__django-16379") \ No newline at end of file diff --git a/tests/benchmark/test_report_v2.py b/tests/benchmark/test_report_v2.py new file mode 100644 index 00000000..e35c4771 --- /dev/null +++ b/tests/benchmark/test_report_v2.py @@ -0,0 +1,39 @@ +import json +from pathlib import Path + +import pytest + +from moatless.benchmark.report_v2 import to_result +from moatless.trajectory import Trajectory + + +@pytest.fixture +def django_trajectory(): + file_path = Path("tests/trajectories/django__django_16379.json") + return Trajectory.load(str(file_path)) + + 
+@pytest.fixture +def dataset(): + with open("moatless/benchmark/swebench_lite_all_evaluations.json") as f: + return json.load(f) + +@pytest.fixture +def django_instance(dataset): + for instance in dataset: + if instance["instance_id"] == "django__django-16379": + return instance + + return None + + +def test_to_result(django_trajectory, django_instance): + result = to_result(django_instance, django_trajectory) + + assert result["instance_id"] == "django__django-16379" + assert result["status"] == "edited" + assert result["transitions"] == len(django_trajectory.transitions) + assert result["edited"] + assert result["identified"] + assert result["found_in_search"] + assert result["file_identified"] diff --git a/tests/test_state.py b/tests/test_state.py index a32c0021..ef45cedd 100644 --- a/tests/test_state.py +++ b/tests/test_state.py @@ -160,4 +160,4 @@ def test_finished_state_creation_and_dump(): assert dumped_state["id"] == 1 assert dumped_state["message"] == message - assert dumped_state["output"] == output \ No newline at end of file + assert dumped_state["output"] == output diff --git a/tests/trajectories/django__django_16379.json b/tests/trajectories/django__django_16379.json index ab529b90..e8cf4cda 100644 --- a/tests/trajectories/django__django_16379.json +++ b/tests/trajectories/django__django_16379.json @@ -114,7 +114,7 @@ "type": "git", "repo_path": "/tmp/repos/swe-bench_django__django", "git_repo_url": "https://github.com/swe-bench/django__django.git", - "commit": "8c1c528fea1644fc1cf597288d047cd110b3febd" + "commit": "1d0fa848e084cad62d0bb6bde3b51e4862558e57" }, "file_context": { "max_tokens": 4000,
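Usage sketch (illustrative, not part of the diff above): how the reworked report_v2 entry
points fit together after this change, mirroring the new tests in
tests/benchmark/test_report_v2.py and the updated _process_instance. Paths assume a
repository checkout; adjust as needed.

    import json

    from moatless.benchmark.report_v2 import to_result, generate_md_report
    from moatless.trajectory import Trajectory

    # Load a stored trajectory in the new format (same fixture the tests use).
    trajectory = Trajectory.load("tests/trajectories/django__django_16379.json")

    # Look up the matching SWE-bench instance from the bundled evaluation dataset.
    with open("moatless/benchmark/swebench_lite_all_evaluations.json") as f:
        dataset = json.load(f)
    instance = next(i for i in dataset if i["instance_id"] == "django__django-16379")

    # to_result() now takes the Trajectory object directly and returns a single
    # result dict; the per-transition list it previously returned is gone.
    result = to_result(instance, trajectory)
    print(result["status"], result["transitions"])

    # Markdown reports are now opt-in (Evaluation(markdown_report=True)); the
    # helper itself is also called with the Trajectory object.
    markdown = generate_md_report(trajectory, instance)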