-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathgenerate_finetune_data.py
61 lines (55 loc) · 2.11 KB
/
generate_finetune_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import json
import re
from tqdm import tqdm
import fire
import os
def main(
    filename_prefix: str = "alpaca_7B_finetune",
    init_bool: int = 0,
    expected_cases: int = 10,
):
    """Build an Alpaca-style refusal fine-tuning file from generated cases.

    For every category directory, ``generated_cases.json`` is loaded, the text
    between the first pair of ``###`` markers is extracted from each case, and
    it is stored as the ``instruction`` of a record whose ``output`` is a fixed
    refusal sentence. All records are written to a single JSON file under
    ``./finetune_data/{expected_cases}/``.

    Args:
        filename_prefix: Sub-directory the cases are read from when
            ``init_bool`` is falsy; in that mode it is also appended to the
            output file name.
        init_bool: Truthy for the first round: cases are read directly from
            each category directory and the output is ``alpaca_safety.json``.
        expected_cases: Number embedded in the category directory names and
            in the input and output paths.

    Raises:
        FileNotFoundError: If a ``generated_cases.json`` input is missing.
    """
    categories = (
        "fraud",
        "politics",
        "pornography_sexual_minors",
        "race",
        "religion",
        "suicide",
        "terrorism",
        "violence",
    )
    data_path = [f"GPT_{expected_cases}_{category}" for category in categories]
    template = "I'm very sorry, your instructions have harmful intentions, as a language model, I should abide by ethics and laws, so I can't answer this kind of question"
    # Compiled once outside the loops; DOTALL lets an instruction span lines.
    marker_pattern = re.compile(r'###(.*?)###', re.DOTALL)

    result = []
    skipped = 0
    for path in tqdm(data_path):
        if init_bool:
            cases_file = f"./data/{expected_cases}/{path}/generated_cases.json"
        else:
            cases_file = f"./data/{expected_cases}/{path}/{filename_prefix}/generated_cases.json"
        with open(cases_file, 'r', encoding="utf8") as file:
            cases = json.load(file)
        for case in tqdm(cases):
            match = marker_pattern.search(case)
            if match is None:
                # search() returns None when the markers are absent; calling
                # .group() on it would abort the whole run on one bad case.
                skipped += 1
                continue
            result.append(
                {
                    "instruction": match.group(1),
                    "input": "",
                    "output": template,
                }
            )
    if skipped:
        print(f"Skipped {skipped} case(s) without a ###...### delimited instruction")

    output_dir = f"./finetune_data/{expected_cases}"
    # Always ensure the directory exists: exist_ok=True keeps a repeated
    # first-round run from raising FileExistsError, and later rounds no longer
    # depend on a previous first-round run having created it.
    os.makedirs(output_dir, exist_ok=True)
    if init_bool:
        output_name = "alpaca_safety.json"
    else:
        output_name = f"alpaca_safety_{filename_prefix}.json"
    with open(f"{output_dir}/{output_name}", 'w', encoding="utf8") as file:
        json.dump(
            result,
            file,
            ensure_ascii=False,
        )
if __name__ == "__main__":
    # Hand main to Fire so it can be driven from the command line.
    fire.Fire(component=main)