Commit 0.0.4

aorwall committed Jan 17, 2025
1 parent eff49ea commit cd3c41b
Showing 12 changed files with 313 additions and 324 deletions.
2 changes: 0 additions & 2 deletions moatless/actions/verified_finish.py
@@ -29,11 +29,9 @@ class VerifiedFinishArgs(ActionArguments):
model_config = ConfigDict(title="Finish")

def to_prompt(self):
files_str = "\n".join(f"- {file}" for file in self.test_files_changed)
return (
f"Finish with reason: {self.finish_reason}\n"
f"Test verification: {self.test_verification}\n"
f"Modified/Created test files:\n{files_str}"
)

def equals(self, other: "ActionArguments") -> bool:
227 changes: 115 additions & 112 deletions moatless/benchmark/evaluation_runner.py
@@ -145,7 +145,6 @@ def evaluate_instance(self, instance_id: str):

instance_dir = os.path.join(self.get_evaluation_dir(), instance_id)
trajectory_path = os.path.join(instance_dir, "trajectory.json")
eval_result_path = os.path.join(instance_dir, "eval_result.json")
os.makedirs(instance_dir, exist_ok=True)
instance = self.evaluation.get_instance(instance_id)
if not instance:
@@ -157,36 +156,6 @@ def evaluate_instance(self, instance_id: str):
moatless_instance = get_moatless_instance(instance_id=instance_id)
problem_statement = f"<task>\nSolve the following reported issue in the {moatless_instance['repo']} repository:\n\n{moatless_instance['problem_statement']}\n</task>"

eval_result = None

if os.path.exists(eval_result_path):
try:
with open(eval_result_path) as f:
eval_result = json.load(f)
logger.info(f"Loading eval_result from {eval_result_path}")
if "node_results" not in eval_result:
if len(eval_result) > 0:
logger.info(f"Found node results with {eval_result.keys()} on root, fixing format")
eval_result = {
"node_results": eval_result,
"status": "started",
"start_time": datetime.now(timezone.utc).isoformat(),
}
else:
logger.info("No node_results found")
eval_result = None
else:
logger.info(f"Found evaluated nodes {eval_result['node_results'].keys()}")

except json.JSONDecodeError:
pass

if not eval_result:
eval_result = {
"node_results": {},
"status": "started",
"start_time": datetime.now(timezone.utc).isoformat(),
}
agentic_loop = self.create_and_run_agentic_loop(
problem_statement=problem_statement,
instance=instance,
@@ -195,30 +164,17 @@ def evaluate_instance(self, instance_id: str):
)
logger.info(f"Completed agentic loop for instance {instance_id}")

start_time = time.time()
try:
if self.use_testbed:
logger.info(f"Starting testbed evaluation for instance {instance_id}")
eval_result = self.evaluate_nodes(
instance_id=instance_id,
instance=moatless_instance,
agentic_loop=agentic_loop,
eval_result=eval_result,
)
logger.info(f"Completed testbed evaluation for instance {instance_id}")
except RuntimeError as e:
logger.error(f"Runtime error in instance {instance_id}: {str(e)}")
raise e

except Exception as e:
logger.error(f"Error in testbed evaluation for instance {instance_id}: {str(e)}")
eval_result["status"] = "error"
eval_result["error"] = traceback.format_exc()
eval_result["duration"] = time.time() - start_time
resolved = None
if self.use_testbed:
logger.info(f"Starting testbed evaluation for instance {instance_id}")
resolved = self.evaluate_nodes(
instance=moatless_instance,
agentic_loop=agentic_loop,
)
logger.info(f"Completed testbed evaluation for instance {instance_id}")

# Complete instance with result
instance.complete()
self.emit_event("instance_completed", {"instance_id": instance_id})
instance.complete(resolved=resolved)
self.emit_event("instance_completed", {"instance_id": instance_id, "resolved": resolved})
return

except Exception as e:
@@ -227,11 +183,6 @@ def evaluate_instance(self, instance_id: str):
self.emit_event("instance_error", {"instance_id": instance_id, "error": str(e)})
raise
finally:
if eval_result:
# Save evaluation result
with open(eval_result_path, "w") as f:
json.dump(eval_result, f, indent=2)

if self.remove_repo_after_evaluation:
repo_path = instance_repo_path(instance_id, self.repo_base_dir)
if os.path.exists(repo_path):
@@ -244,54 +195,6 @@ def evaluate_instance(self, instance_id: str):
del eval_result
gc.collect()

def evaluate_nodes(
self,
instance_id: str,
instance: dict,
agentic_loop: AgenticLoop,
eval_result: dict,
):
"""Evaluate leaf node using the testbed."""
leaf_node = agentic_loop.get_last_node()
patch_results = {}

if str(leaf_node.node_id) in eval_result.get("node_results", {}):
logger.info(f"Leaf node {leaf_node.node_id} for instance {instance_id} has already been evaluated")
return eval_result

logger.info(f"Evaluating node {leaf_node.node_id} for instance {instance_id}.")
patch = leaf_node.file_context.generate_git_patch(ignore_tests=True)
if patch and patch.strip():
patch_hash = create_sha256_hash(patch)

if patch_hash in patch_results:
eval_result["node_results"][str(leaf_node.node_id)] = patch_results[patch_hash]
else:
repository = create_repository(instance, repo_base_dir=self.repo_base_dir)
# TODO: Set run_id on testbed environment
run_id = hashlib.sha256(self.evaluation.evaluation_name.encode()).hexdigest()[:8]
runtime = TestbedEnvironment(
repository=repository,
instance=instance,
# run_id=run_id,
)

start_time = time.time()
result = runtime.evaluate(patch=patch)
if not result:
logger.error(f"Error in evaluating patch for {instance_id}")
return None

eval_result["node_results"][str(leaf_node.node_id)] = result.model_dump()
patch_results[patch_hash] = eval_result["node_results"][str(leaf_node.node_id)]
logger.info(
f"Evaluated patch for node {leaf_node.node_id} in {time.time() - start_time} seconds (resolved: {result.resolved})"
)
else:
logger.info(f"Skipping node {leaf_node.node_id} for instance {instance_id}: no patch to evaluate.")

return eval_result

def create_and_run_agentic_loop(
self,
problem_statement: str,
@@ -305,6 +208,8 @@ def create_and_run_agentic_loop(
"instance_id": instance.instance_id,
}

start_time = time.time()

agentic_loop = None
rerun_tree = False
if os.path.exists(trajectory_path):
@@ -383,39 +288,137 @@ def create_and_run_agentic_loop(
metadata=metadata,
)

last_node = agentic_loop.get_last_node()
if self.rerun_errors:
last_node = agentic_loop.get_last_node()
if last_node.error or (last_node.action and last_node.action.name == "Error" and last_node.parent):
# Remove error node from parent's children
last_node.parent.children = [c for c in last_node.parent.children if c.node_id != last_node.node_id]
logger.info(
f"Removed error node {last_node.node_id} from parent {last_node.parent.node_id} on instance {instance.instance_id}"
)


# Remove the last node if it has action steps without observation
last_node = agentic_loop.get_last_node()
no_observation = all(not step.observation for step in last_node.action_steps)
if last_node.action and no_observation:
last_node.parent.children = [c for c in last_node.parent.children if c.node_id != last_node.node_id]
logger.info(f"Removed last node {last_node.node_id} from instance {instance.instance_id} because it has no observation")

def tree_event_handler(event):
logger.info(f"Got event {event['event_type']}")
if event["event_type"] == "tree_iteration":
if event["event_type"] == "loop_iteration":
instance.usage = agentic_loop.total_usage()
instance.iterations = len(agentic_loop.root.get_all_nodes())

logger.info("Emit event tree_progress")
self.emit_event(
"tree_progress",
"loop_iteration",
{
"instance_id": instance.instance_id,
**event["data"],
},
)

instance.start()
self.emit_event("instance_started", {"instance_id": instance.instance_id})
self.emit_event("loop_started", {"instance_id": instance.instance_id})

agentic_loop.add_event_handler(tree_event_handler)
agentic_loop.run()

self.emit_event("instance_completed", {"instance_id": instance.instance_id})
duration = time.time() - start_time
self.emit_event("loop_completed", {"instance_id": instance.instance_id, "duration": duration})

return agentic_loop

def evaluate_nodes(
self,
instance: dict,
agentic_loop: AgenticLoop
) -> bool | None:
"""Evaluate the leaf node using the testbed and return whether its patch resolved the issue."""
instance_id = instance["instance_id"]

eval_result = None
instance_dir = os.path.join(self.get_evaluation_dir(), instance_id)
eval_result_path = os.path.join(instance_dir, "eval_result.json")

if os.path.exists(eval_result_path):
try:
with open(eval_result_path) as f:
eval_result = json.load(f)
logger.info(f"Loading eval_result from {eval_result_path}")
if "node_results" not in eval_result:
if len(eval_result) > 0:
logger.info(f"Found node results with {eval_result.keys()} on root, fixing format")
eval_result = {
"node_results": eval_result,
"status": "started",
"start_time": datetime.now(timezone.utc).isoformat(),
}
else:
logger.info("No node_results found")
eval_result = None
else:
logger.info(f"Found evaluated nodes {eval_result['node_results'].keys()}")

except json.JSONDecodeError:
pass

if not eval_result:
eval_result = {
"node_results": {},
"start_time": datetime.now(timezone.utc).isoformat(),
}

leaf_node = agentic_loop.get_last_node()

if str(leaf_node.node_id) in eval_result.get("node_results", {}):
result = eval_result.get("node_results", {})[str(leaf_node.node_id)]
if result.get("resolved") is not None:
logger.info(f"Leaf node {leaf_node.node_id} for instance {instance_id} has already been evaluated with resolved: {result['resolved']}")
return result.get("resolved")

logger.info(f"Evaluating node {leaf_node.node_id} for instance {instance_id}.")
patch = leaf_node.file_context.generate_git_patch(ignore_tests=True)
if patch and patch.strip():
start_time = time.time()
try:
repository = create_repository(instance, repo_base_dir=self.repo_base_dir)
# TODO: Set run_id on testbed environment
run_id = hashlib.sha256(self.evaluation.evaluation_name.encode()).hexdigest()[:8]
runtime = TestbedEnvironment(
repository=repository,
instance=instance,
# run_id=run_id,
)

result = runtime.evaluate(patch=patch)
if not result:
logger.error(f"Error in evaluating patch for {instance_id}")
return None

eval_result["node_results"][str(leaf_node.node_id)] = result.model_dump()
logger.info(
f"Evaluated patch for node {leaf_node.node_id} in {time.time() - start_time} seconds (resolved: {result.resolved})"
)
return result.resolved

except Exception as e:
logger.error(f"Error in testbed evaluation for instance {instance_id}: {str(e)}")
eval_result["error"] = traceback.format_exc()
finally:
eval_result["duration"] = time.time() - start_time
with open(eval_result_path, "w") as f:
json.dump(eval_result, f, indent=2)

del runtime
del repository
else:
logger.info(f"Skipping node {leaf_node.node_id} for instance {instance_id}: no patch to evaluate.")
return None

def get_evaluation_dir(self) -> str:
"""Get the directory path for an evaluation."""
return os.path.join(self.evaluations_dir, self.evaluation.evaluation_name)
27 changes: 3 additions & 24 deletions moatless/benchmark/run_evaluation.py
@@ -29,6 +29,7 @@
from moatless.completion.log_handler import LogHandler
from moatless.model_config import MODEL_CONFIGS
from moatless.schema import MessageHistoryType
from swesearch.evaluation_setups import create_evaluation_setup

# Default evaluation settings
DEFAULT_CONFIG = {
@@ -342,30 +343,8 @@ def run_evaluation(config: dict):
if not instance_ids:
raise ValueError("No instance IDs provided")

model_settings = BaseCompletionModel(
model=config["model"],
temperature=config.get("temperature", 0.0),
max_tokens=4000,
model_api_key=config.get("api_key"),
model_base_url=config.get("base_url"),
response_format=config.get("response_format"),
thoughts_in_action=config.get("thoughts_in_action", False),
)

agent_settings = AgentSettings(
completion_model=model_settings,
message_history_type=config.get("message_history_type", MessageHistoryType.MESSAGES),
system_prompt=None,
thoughts_in_action=config.get("thoughts_in_action", False),
)

tree_search_settings = TreeSearchSettings(
max_iterations=config["max_iterations"],
max_expansions=config["max_expansions"],
max_cost=config["max_cost"],
model=model_settings,
agent_settings=agent_settings,
)
# Create tree search settings using the evaluation setup function
tree_search_settings = create_evaluation_setup(config)

evaluation = create_evaluation(
repository=repository,
(Diffs for the remaining 9 changed files are not shown.)
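For reference, a minimal sketch of what an evaluation setup helper like create_evaluation_setup could look like, reconstructed from the settings construction that this commit removes from run_evaluation.py. The import paths for BaseCompletionModel, AgentSettings and TreeSearchSettings are assumptions; the actual swesearch.evaluation_setups implementation may differ.

# Hypothetical sketch, not the actual swesearch.evaluation_setups module.
# Import paths below are assumptions based on the names used in this diff.
from moatless.agent.settings import AgentSettings
from moatless.completion.base import BaseCompletionModel
from moatless.schema import MessageHistoryType
from moatless.search_tree import TreeSearchSettings


def create_evaluation_setup(config: dict) -> TreeSearchSettings:
    # Completion model settings, mirroring the block removed from run_evaluation.py.
    model_settings = BaseCompletionModel(
        model=config["model"],
        temperature=config.get("temperature", 0.0),
        max_tokens=4000,
        model_api_key=config.get("api_key"),
        model_base_url=config.get("base_url"),
        response_format=config.get("response_format"),
        thoughts_in_action=config.get("thoughts_in_action", False),
    )

    # Agent settings wrap the completion model and message-history behaviour.
    agent_settings = AgentSettings(
        completion_model=model_settings,
        message_history_type=config.get("message_history_type", MessageHistoryType.MESSAGES),
        system_prompt=None,
        thoughts_in_action=config.get("thoughts_in_action", False),
    )

    # Tree search settings tie the model and agent settings together.
    return TreeSearchSettings(
        max_iterations=config["max_iterations"],
        max_expansions=config["max_expansions"],
        max_cost=config["max_cost"],
        model=model_settings,
        agent_settings=agent_settings,
    )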