run_eval.py
import argparse
import asyncio
import json
import os
from asyncio import to_thread

import numpy as np
import pandas as pd
from langchain.chains import LLMChain
from langchain.chat_models import AzureChatOpenAI, ChatOpenAI
from langchain.prompts import PromptTemplate
from openai.error import InvalidRequestError

from evaluation.metrics import compute_macro_scores, compute_overall_metrics
from evaluation.utils import create_df_reasoning

# Ground-truth statement categories evaluated for each question.
COLS = ["Must_have", "Nice_to_have"]
# Columns written to the per-question statistics CSV.
COLS_TO_SAVE = ["Question", "result", "recall", "precision", "contra"]
# NLI-style judgment tasks run by the LLM judge.
TASKS = ["is_entails", "is_contradict"]


async def run_experiment(
results_file: str,
version_name: str,
):
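    """Evaluate the model answers in `results_file` against the K-QA ground truth.

    Runs the LLM judge for every (statement category, NLI task) pair, computes
    per-question and macro metrics, and writes the outputs under `version_name/`.
    """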
    ans_results_dict = {}
    list_of_metric_dfs = []
# Load ground-truth data including statements
df_gt = pd.read_json(
"dataset/questions_w_answers.jsonl", orient="records", lines=True
)
results = pd.read_json(results_file, orient="records")
results = df_gt.loc[:, ["Question", "Must_have", "Nice_to_have"]].merge(
results, on="Question", how="inner"
)
print(f"Evaluated questions: {len(results)} (out of {len(df_gt)})\n-----")
# Evaluate each statement
for col_name in COLS:
for task in TASKS:
print(f"Computing for task: {task}, col_name: {col_name}")
await asyncio.gather(
*[
run_single_nli_answer(idx, row, ans_results_dict, col_name, task)
for idx, row in results.iterrows()
],
return_exceptions=True,
)
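            # Collect this pass's per-question labels and scores (keyed by row index) into a DataFrame.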
metric_res = pd.DataFrame(ans_results_dict).T
list_of_metric_dfs.append(metric_res)
ans_results = pd.concat(list_of_metric_dfs, axis=1)
results = pd.merge(results, ans_results, left_index=True, right_index=True)
# Compute overall metrics
results = compute_overall_metrics(results, cols=COLS)
macro_results = compute_macro_scores(results)
print(f"Macro results:\n{json.dumps(macro_results, indent=4)}")
# Save results
print("Saving results...")
if not os.path.exists(version_name):
os.makedirs(version_name)
with open(f"{version_name}/macro_results.json", "w") as fp:
json.dump(macro_results, fp)
results.loc[:, COLS_TO_SAVE].to_csv(f"{version_name}/dataset_statistics.csv")
results = pd.DataFrame(results).reset_index(drop=True)
df_reasoning = create_df_reasoning(results, TASKS, COLS)
df_reasoning.to_csv(f"{version_name}/reasoning.csv")


async def run_single_nli_answer(idx, row, ans_results_dict, col_name, task):
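    """Judge one question's statements with the LLM and store the labels in `ans_results_dict`.

    The entailment task is not scored for "Nice_to_have" statements; zero-valued
    placeholders are recorded for that combination instead.
    """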
answers, labels = row[col_name], []
    if task == "is_entails" and col_name == "Nice_to_have":
ans_results_dict[idx] = {
f"raw_label_{col_name}_{task}": ["False"] * len(answers),
f"label_{col_name}_{task}": [False] * len(answers),
f"avg_{col_name}_{task}": 0,
f"sum_{col_name}_{task}": 0,
f"n_{col_name}_{task}": 0,
}
else:
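        # Build an LLMChain judge from the task-specific prompt template.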
with open(f"evaluation/prompts/{task}.txt", "r") as f:
prompt_template = f.read()
prompt = PromptTemplate.from_template(prompt_template)
ans_checker = LLMChain(llm=LLM, prompt=prompt, output_key="label")
for answer in answers:
try:
res = await to_thread(
ans_checker,
{
"llm_answer": row["result"],
"answer": answer,
"question": row["Question"],
},
)
labels.append((res["label"], answer))
            except InvalidRequestError as e:
                print(
                    f"Invalid request error -- {e} \n"
                    f"Question: {row['Question']}, Hypothesis: {answer}"
                )
                # `res` is not defined when the request itself fails; record a "False" label instead.
                labels.append(("False", answer))
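        # The judge is expected to end its output with a True/False verdict;
        # look for "true" in the last 20 characters of each raw label.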
ans_true = ["true" in label[0].lower()[-20:] for label in labels]
ans_results_dict[idx] = {
f"raw_label_{col_name}_{task}": labels,
f"label_{col_name}_{task}": ans_true,
f"avg_{col_name}_{task}": np.mean(ans_true),
f"sum_{col_name}_{task}": np.sum(ans_true),
f"n_{col_name}_{task}": len(labels),
}


def parse_arguments():
"""Parse command-line arguments."""
parser = argparse.ArgumentParser(description="K-QA Benchmark Evaluation")
parser.add_argument("--result_file", type=str, default="dummy_res.json")
parser.add_argument("--version", type=str, default="dummy_exp")
parser.add_argument(
"--on_openai",
type=bool,
default=True,
help="Calling GPT-4 model from Azure or from OpenAI",
action=argparse.BooleanOptionalAction,
)
return parser.parse_args()


if __name__ == "__main__":
args = parse_arguments()
    # LLM is read as a module-level name by run_single_nli_answer.
    if args.on_openai:
        LLM = AzureChatOpenAI(
            deployment_name="gpt-4-8k",
            model_name="gpt-4-8k",
            temperature=0,
            max_retries=20,
            verbose=True,
        )
    else:
        # ChatOpenAI has no deployment_name parameter; only the model name is needed.
        LLM = ChatOpenAI(
            model_name="gpt-4",
            temperature=0,
            max_retries=20,
            verbose=True,
        )
asyncio.run(
run_experiment(
results_file=args.result_file,
version_name=args.version,
)
)
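
# Example invocation (the version name below is a placeholder):
#   python run_eval.py --result_file dummy_res.json --version my_eval_run --no-on_openai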