# fine_tuning.py
from utils import remove_release_number, encode_answer, generate_prompt, llm_inference, get_results_with_labels, update_package
# update transformers to the latest version
update_package('transformers')
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments, \
DataCollatorForLanguageModeling
import datasets
from datasets import Dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel, PeftConfig
import os
from transformers.trainer_utils import get_last_checkpoint
def tokenize_function(examples: datasets.arrow_dataset.Dataset):
    """
    Tokenize input.

    Args:
        examples (datasets.arrow_dataset.Dataset): Samples to tokenize

    Returns:
        tokenized_dataset (datasets.arrow_dataset.Dataset): Tokenized dataset
    """
    return tokenizer(examples['text'], max_length=512, padding='max_length', truncation=True)
# +++++++++++++++++++++++++++++++++ setup ++++++++++++++++++++++++++++++++++++++++++++++
weight_decay = 0.01
rank = 512
alpha = 1024
Quant = 16
batch_size = 8
dropout = 0.05
learning_rate = 1e-4
lr = "1e_4"
context_file = "results/context_all_train.pkl" # you get this file after running vector_store_for_rag with RAG_INFERENCE = True
model_name = f"peft_phi_2_Q{Quant}_B{batch_size}_r_{rank}_{alpha}_lr_{lr}_decay_{weight_decay}"
print(f"\n+++++++++++++ model name is {model_name}\n")
MODEL_PATH = 'microsoft/phi-2'
TUNED_MODEL_PATH = f'models/{model_name}'
if Quant == 4:
    bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                    bnb_4bit_quant_type='nf4',
                                    bnb_4bit_compute_dtype='float16',
                                    bnb_4bit_use_double_quant=True)
    model = AutoModelForCausalLM.from_pretrained(MODEL_PATH,
                                                 trust_remote_code=True,
                                                 quantization_config=bnb_config)
elif Quant == 8:
    bnb_config = BitsAndBytesConfig(load_in_4bit=False,
                                    load_in_8bit=True)
    model = AutoModelForCausalLM.from_pretrained(MODEL_PATH,
                                                 trust_remote_code=True,
                                                 quantization_config=bnb_config)
else:
    model = AutoModelForCausalLM.from_pretrained(MODEL_PATH,
                                                 trust_remote_code=True)
# +++++++++++++++++++++++++++++++++ load model and tokenizer +++++++++++++++++++++++++++
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH,
                                          trust_remote_code=True)
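# phi-2 does not define a pad token, so reuse the EOS token for padding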
tokenizer.pad_token = tokenizer.eos_token
# +++++++++++++++++++++++++++++++++ prepare data
print("\n+++++++++++++ preparing data")
train = pd.read_json('data/TeleQnA.txt').T
labels = pd.read_csv('data/questions_answers.csv')
labels = labels.fillna('')
# +++++++++++++++++++++++++++++++++ set question number
train['Question_ID'] = train.index.str.split(' ').str[-1].astype('int')
# Encode the numeric answer ID as a letter; LLMs tend to handle letter-labelled options better than numbered ones
labels['Answer_letter'] = labels.Answer_ID.apply(lambda x: encode_answer(x, False))
train = pd.merge(train,
                 labels[['Question_ID', 'Answer_letter']],
                 how='left',
                 on='Question_ID')
# +++++++++++++++++++++++++++++++++ format answer
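# The raw answer is stored as 'option <N>: <text>' (assumed TeleQnA format); drop the
# 9-character prefix and relabel it with the encoded letter, e.g. 'C) <text>'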
train['answer'] = train.Answer_letter + ')' + train.answer.str[9:]
labels = labels.astype(str)
# Remove [3GPP Release <number>] from question
train = remove_release_number(train, 'question')
# +++++++++++++++++++++++++++++++++ set context
context_all_train = pd.read_pickle(context_file)
train['Context_1'] = context_all_train['Context_1']
# Generate prompts with context and answers
train['text'] = train.apply(lambda x: generate_prompt(x, 'Context:\n' + x['Context_1'] + '\n') + x['answer'], axis=1)
# +++++++++++++++++++++++++++++++++ shuffle and split data
instruction_dataset = train['text'].sample(frac=1, random_state=22)
# Indices not sampled above are kept for final evaluation. Note that with frac=1 every sample is
# used for fine-tuning and this set is empty; lower frac (e.g. 0.7) to hold out the remaining 30%
test_idx = train[~train.index.isin(instruction_dataset.index)].index
# Convert Series to datasets and tokenize the dataset
instruction_dataset = instruction_dataset.reset_index(drop=True)
instruction_dataset = Dataset.from_pandas(pd.DataFrame(instruction_dataset))
tokenized_dataset = instruction_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
# Divide data into train and validation sets
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=22)
# +++++++++++++++++++++++++++++++++ configure fine-tuning hyper-parameters +++++++++++++
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
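# Gradient checkpointing is incompatible with the generation KV cache, so disable the cache
# explicitly for training (the Trainer would otherwise warn and disable it itself)
model.config.use_cache = False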
peft_config = LoraConfig(task_type="CAUSAL_LM",
                         r=rank,
                         lora_alpha=alpha,
                         target_modules=['q_proj', 'k_proj', 'v_proj', 'dense'],
                         lora_dropout=dropout)
peft_model = get_peft_model(model, peft_config)
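# Sanity check: report how many parameters the LoRA adapter trains vs. the frozen base model
peft_model.print_trainable_parameters()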
training_args = TrainingArguments(output_dir=TUNED_MODEL_PATH,
                                  learning_rate=learning_rate,
                                  per_device_train_batch_size=batch_size,
                                  num_train_epochs=1.1,
                                  weight_decay=weight_decay,
                                  logging_steps=20,
                                  fp16=True,
                                  warmup_steps=100,
                                  save_strategy="steps",
                                  save_steps=100,
                                  save_total_limit=2,
                                  eval_strategy="steps",
                                  eval_steps=100,
                                  load_best_model_at_end=True)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(model=peft_model,
                  args=training_args,
                  train_dataset=tokenized_dataset['train'],
                  eval_dataset=tokenized_dataset['test'],
                  tokenizer=tokenizer,
                  data_collator=data_collator)
print("\n+++++++++++++ fine-tuning model")
trainer.train()
print('\n+++++++++++++ saving model')
model_final = trainer.model
model_final.save_pretrained(TUNED_MODEL_PATH)
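# Optionally also save the tokenizer so the adapter directory is self-contained for later inference
tokenizer.save_pretrained(TUNED_MODEL_PATH)
# Minimal reload sketch (an assumption about downstream inference, not part of the training flow):
# attach the saved LoRA weights to a freshly loaded base model with PeftModel, imported above, e.g.:
# base = AutoModelForCausalLM.from_pretrained(MODEL_PATH, trust_remote_code=True)
# tuned = PeftModel.from_pretrained(base, TUNED_MODEL_PATH)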