evaluate.py
# This implementation uses checkpoints, meaning that if the program
# is interrupted it can start again where it left off.
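# The checkpoint file stores the index of the last processed row, and predictions
# are appended to the output JSON as they are generated, so partial results survive a crash.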
import tempfile
import json
from tqdm import tqdm
import argparse
import pandas as pd
from datetime import datetime
# So that it recognizes the SmolCoder library
import sys
import os
sys.path.append(str(os.path.abspath('/SmolCoder/')))
from SmolCoder.src.agent_wrapper import AgentWrapper
from SmolCoder.src.toolkit import Toolkit
from SmolCoder.src.tools.list_methods import ListMethods
from SmolCoder.src.tools.list_classes import ListClasses
from SmolCoder.src.tools.list_files import ListFiles
from SmolCoder.src.tools.replace_method import ReplaceMethod
from SmolCoder.src.tools.finish import Finish
from SmolCoder.src.tools.execute_python import ExecutePythonCode
from SmolCoder.src.tools.show_method import ShowMethodBody
from SmolCoder.src.tools.move_folder import MoveFolder
from SmolCoder.src.tools.human_interaction import HumanInteraction
# Tool Definition
class_summary = ListMethods()
list_classes = ListClasses()
list_files = ListFiles()
replace_method = ReplaceMethod()
finish = Finish()
execute_python = ExecutePythonCode()
show_method = ShowMethodBody()
move_folder = MoveFolder()
human_interaction = HumanInteraction()
toolkit = Toolkit([list_classes, list_files, replace_method, show_method, move_folder, finish])
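# Note: class_summary (ListMethods), execute_python, and human_interaction are
# instantiated above but not registered in the toolkit passed to the agent.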
resume_index = 0
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluation Pipeline for the SmolCoder Agent on the SWE-Benchmark.")

    # Arguments for the CLI
    parser.add_argument('--model_name', type=str, default='phi3:latest',
                        help='Name of the LLM you want to use, as specified in ollama.')
    parser.add_argument('--dataset_location', type=str, default='Evaluation/test-00000-of-00001.parquet', help='File path to the SWE-Bench-Lite dataset.')
    parser.add_argument('--output_directory', type=str, default="output", help="File path to the output folder.")
    parser.add_argument('--logging_enabled', action='store_true', help="Enable logging for the agent.")
    parser.add_argument('--working_directory', type=str, default="repos", help="Working directory of the agent; the GitHub repositories will be downloaded here.")
    parser.add_argument('--openai_key', type=str, default=None, help="Set this to your OpenAI API key if you want to use OpenAI.")
    parser.add_argument('--dummy_model', action='store_true', help="If set, run the script with a stub/dummy model.")
    parser.add_argument('--n_samples', type=int, default=500, help="Number of samples to evaluate on; capped at the dataset size.")
    args = parser.parse_args()
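    # Example invocation (assumes an ollama server with phi3 available locally):
    #   python evaluate.py --model_name phi3:latest --n_samples 50 --logging_enabled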
    # df = pd.read_json(os.path.abspath(args.dataset_location))
    df = pd.read_parquet(os.path.abspath(args.dataset_location))

    # If we use the dummy model, we ignore the 'model_name' parameter
    if args.dummy_model:
        model_name = "dummy_model"
    else:
        model_name = args.model_name

    # We need to give the files distinct names in case we run multiple instances at once
    current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    checkpoint_file = f"{args.output_directory}/checkpoint_{model_name}_{current_time}.txt"

    if not os.path.exists(args.output_directory):
        os.makedirs(args.output_directory)
    if not args.openai_key:
        agent = AgentWrapper(
            agent_name="SmolCoder",
            toolkit=toolkit,
            mode=0,
            model=args.model_name,
            working_directory=args.working_directory,
            logging_enabled=args.logging_enabled,
            dummy_model=args.dummy_model,
        )
    else:
        agent = AgentWrapper(
            agent_name="SmolCoder",
            toolkit=toolkit,
            mode=0,
            model="gpt-4o-mini",
            dummy_model=False,
            working_directory=args.working_directory,
            logging_enabled=args.logging_enabled,
            openai=(True, args.openai_key),
        )
    # Check if a checkpoint file exists and read the last processed index;
    # otherwise resume_index stays 0 and the run starts from the beginning
    try:
        with open(checkpoint_file, 'r') as f:
            resume_index = int(f.read().strip())
    except FileNotFoundError:
        pass
    except Exception as e:
        print(f"Error reading checkpoint file: {e}")
    if resume_index < len(df) - 1:
        # Open a file to save predictions
        with open(f"{args.output_directory}/predictions_{model_name}_{current_time}.json", 'a', encoding="utf-8-sig") as json_file:
            if resume_index == 0:
                json_file.write('[')  # Start of JSON array
                json_file.write('\n')

            # Generate our solutions
            if args.n_samples > df.shape[0]:
                number_samples = df.shape[0]
            else:
                number_samples = args.n_samples
            for index, row in tqdm(df.iterrows(), total=number_samples):
                if index >= number_samples:
                    break
                if index % 10 == 0:
                    print("Current idx: " + str(index))

                # Skip rows that were already processed
                if index < resume_index:
                    continue
                predictions = {
                    "instance_id": row["instance_id"],
                    "model_patch": agent.predict(row),
                    "model_name_or_path": agent.name
                }

                # Convert the dictionary to a JSON-formatted string and write it to the file
                json_data = json.dumps(predictions, indent=4)
                json_file.write(json_data)
                if index < len(df) - 1:
                    json_file.write(',')
                json_file.write('\n')

                # Persist the last processed index so an interrupted run can resume
                with open(checkpoint_file, 'w') as f:
                    f.write(str(index))

                if index == len(df) - 1:
                    json_file.write(']')  # End of JSON array
    else:
        print(f"All tasks are finished; see '{args.output_directory}/predictions_{model_name}_{current_time}.json' for the results, or delete the checkpoint file to start a new run.")