train_LUW.py
import os
import pandas as pd
import torch
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from sklearn.utils import shuffle
# build the tokenizer and the model
# AutoModelForSequenceClassification adds a sequence-classification head on top of the base model.
# the difference from the previous script is that a LUW (long-unit-word) tokenizer is used here.
# the training loss comes out a little higher than with the previous script,
# but the test results are still reasonable; I will evaluate on other datasets in the future.
# change num_labels to match the number of labels in your own data.
tokenizer = AutoTokenizer.from_pretrained("KoichiYasuoka/bert-base-japanese-luw-upos")
# "robert" is a locally saved checkpoint; point it at a model compatible with the LUW tokenizer above
model = AutoModelForSequenceClassification.from_pretrained("robert", num_labels=9)
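# quick sanity check (optional; the sentence below is only an illustration, not part of the dataset):
# see how the LUW tokenizer splits a Japanese sentence into long-unit words before training.
# print(tokenizer.tokenize("国境の長いトンネルを抜けると雪国であった。"))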
# this label dict is based on the livedoor ニュース open dataset,
# which is distributed under CC BY-ND 2.1 JP: https://creativecommons.org/licenses/by-nd/2.1/jp/
label_dic = {
    'dokujo-tsushin': 0,
    'it-life-hack': 1,
    'kaden-channel': 2,
    'livedoor-homme': 3,
    'movie-enter': 4,
    'peachy': 5,
    'topic-news': 6,
    'smax': 7,
    'sports-watch': 8,
}
# you could also load the data some other way; it does not have to come from a csv file
def get_csv_train_data(file, label_dic):
    data = pd.read_csv(file)
    content = data["content"].to_list()
    # map category names to numeric ids; labels that are already numeric pass through unchanged
    label = [label_dic.get(l, l) for l in data["label"].to_list()]
    return content, label
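# assumed layout of processed_data.csv (an assumption, not checked by this script):
# a "content" column with the article text and a "label" column holding either the
# category name (e.g. "it-life-hack") or its numeric id from label_dic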
content, label = get_csv_train_data("processed_data.csv", label_dic)
data = pd.DataFrame({"label":label, "content":content})
# shuffle the data so each batch mixes the categories
data = shuffle(data)
# I trained the model on my cpu instead of a gpu, so I only used about 8000 rows to save time;
# you could also change how much of the data you use, as in the optional line below
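# optional (illustration of the subsetting mentioned above): keep only the first 8000 shuffled rows
# data = data.iloc[:8000]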
train_data = tokenizer(data.content.to_list(), padding="max_length", max_length=100, truncation=True, return_tensors="pt")
train_label = data.label.to_list()
# you can use a larger batch size if you have a better gpu
batch_size = 16
# build the dataloader
# TensorDataset wraps the tensors, and DataLoader is an iterator that yields them in batches
train = TensorDataset(train_data["input_ids"], train_data["attention_mask"], torch.tensor(train_label))
train_sampler = RandomSampler(train)
train_dataloader = DataLoader(train, sampler=train_sampler, batch_size=batch_size)
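# optional sanity check: each batch is (input_ids, attention_mask, labels),
# the first two of shape [batch_size, 100] and the labels of shape [batch_size].
# b = next(iter(train_dataloader))
# print(b[0].shape, b[1].shape, b[2].shape)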
# define the optimizer
# AdamW works well enough here, but you could also use another optimizer such as SGD
optimizer = AdamW(model.parameters(), lr=1e-4)
# define the number of epochs and the learning-rate schedule
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
# move the model to gpu if you have one
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# start training
# the gradients are zeroed before every batch because pytorch otherwise accumulates them
# b_input_ids are the tokenized inputs,
# b_input_mask is the attention mask that tells the model which positions are real tokens rather than padding,
# b_labels are the labels; passing labels makes the model compute and return the loss
# token_type_ids is left as None because this is a single-sentence task
for epoch in range(num_epochs):
    total_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        if step % 10 == 0 and step != 0:
            # running average loss per sample so far
            print("step: ", step, " loss:", total_loss / (step * batch_size))
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        # clip the gradients to keep training stable
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        lr_scheduler.step()
    # note: this average is per batch, while the per-step prints above are per sample
    avg_train_loss = total_loss / len(train_dataloader)
    print("avg_loss:", avg_train_loss)
# make every parameter contiguous so save_pretrained (which writes safetensors) does not reject non-contiguous tensors
for param in model.parameters():
    param.data = param.data.contiguous()
# save the model wherever you like
output_dir = "./model_save/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"the model has been saved at {output_dir}")
# step: 10 loss: 0.13954393118619918
# step: 20 loss: 0.13964745327830314
# step: 30 loss: 0.13929008295138676
# step: 40 loss: 0.13970570228993892
# step: 50 loss: 0.13944958180189132
# step: 60 loss: 0.13922391360004743
# step: 70 loss: 0.1392746395298413
# step: 80 loss: 0.139160917699337
# step: 90 loss: 0.13894355835186112
# step: 100 loss: 0.1388008949905634
# step: 110 loss: 0.13865940672430124
# step: 120 loss: 0.1383491560195883
# step: 130 loss: 0.13842499594275767
# step: 140 loss: 0.1384864976895707
# step: 150 loss: 0.13862608686089517
# step: 160 loss: 0.13856650316156446
# step: 170 loss: 0.13859204601715594
# step: 180 loss: 0.1386505912989378
# step: 190 loss: 0.13860745167261676
# step: 200 loss: 0.13859345983713867
# step: 210 loss: 0.1385004225586142
# step: 220 loss: 0.1384103136645122
# step: 230 loss: 0.1385161407939766
# step: 240 loss: 0.13853983404114842
# step: 250 loss: 0.13850072667002677
# step: 260 loss: 0.13842260668484063
# step: 270 loss: 0.13845941708595663
# step: 280 loss: 0.13849667898778403
# step: 290 loss: 0.13846129448763256
# step: 300 loss: 0.1384996289263169
# step: 310 loss: 0.13847313828526003
# step: 320 loss: 0.13837016054894774
# step: 330 loss: 0.13843489399913586
# step: 340 loss: 0.1384575102916535
# step: 350 loss: 0.13841540926269122
# step: 360 loss: 0.13839858217785755
# step: 370 loss: 0.13837410185385396
# step: 380 loss: 0.1383862859520473
# step: 390 loss: 0.13832726900776227
# step: 400 loss: 0.13829317854717374
# step: 410 loss: 0.13825558000584928
# step: 420 loss: 0.1382718673951569
# step: 430 loss: 0.1382480853345505
# step: 440 loss: 0.13819563255052675
# step: 450 loss: 0.13815274452169737
# step: 460 loss: 0.1381516316824633
# avg_loss: 2.210336681301837