# optimization.py
from src.utils.adamw_scaled import AdamWScale
from torch.optim.lr_scheduler import (
    SequentialLR,
    LinearLR,
    CosineAnnealingLR,
    ConstantLR,
)


def create_optimizer(model, lr, betas, eps, weight_decay, foreach=False, kahan_sum=False, use_state_dtype=None):
    """Build an AdamWScale optimizer, disabling weight decay for biases and normalization parameters."""
    no_decay = ["bias", "LayerNorm", "layernorm", "layer_norm", "ln"]
    # Split parameters into a decayed group and a non-decayed group based on name.
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamWScale(
        optimizer_grouped_parameters,
        lr=lr,
        betas=betas,
        eps=eps,
        weight_decay=weight_decay,
        kahan_sum=kahan_sum,
        foreach=foreach,
        use_state_dtype=use_state_dtype,
    )
    return optimizer
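
# Illustrative usage (a sketch; the hyperparameter values below are
# placeholders, not the project's defaults). Parameters whose names contain
# one of the `no_decay` substrings, e.g. "encoder.layer_norm.weight" or
# "fc.bias", fall into the zero-weight-decay group.
#
#   optimizer = create_optimizer(
#       model,
#       lr=1e-3,
#       betas=(0.9, 0.999),
#       eps=1e-8,
#       weight_decay=0.01,
#   )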


def create_cosine_scheduler(warmup_steps, warmup_ratio, num_training_steps, optimizer):
    """Linear warmup followed by cosine annealing; the warmup phase is skipped when it resolves to zero steps."""
    # If no explicit warmup length is given, derive it from the warmup ratio.
    warmup_steps = warmup_steps if warmup_steps != 0 else int(num_training_steps * warmup_ratio)
    if warmup_steps == 0:
        # No warmup: cosine-anneal over the whole run.
        lr_scheduler = CosineAnnealingLR(
            optimizer,
            T_max=num_training_steps - warmup_steps,
            eta_min=1e-5,
        )
    else:
        # Warmup: ramp the LR linearly from 0.5x to 1x of the base LR.
        scheduler1 = LinearLR(
            optimizer,
            start_factor=0.5,
            end_factor=1,
            total_iters=warmup_steps,
            last_epoch=-1,
        )
        # Then cosine-anneal over the remaining steps.
        scheduler2 = CosineAnnealingLR(
            optimizer,
            T_max=num_training_steps - warmup_steps,
            eta_min=1e-5,
        )
        lr_scheduler = SequentialLR(
            optimizer,
            schedulers=[scheduler1, scheduler2],
            milestones=[warmup_steps],
        )
    return lr_scheduler
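
# Illustrative configuration (a sketch, not project settings): with
# num_training_steps=10_000, warmup_steps=0 and warmup_ratio=0.05, the warmup
# length resolves to 500 steps, so the LR ramps from 0.5x to 1x over the first
# 500 steps and then follows a cosine decay toward eta_min=1e-5.
#
#   scheduler = create_cosine_scheduler(
#       warmup_steps=0,
#       warmup_ratio=0.05,
#       num_training_steps=10_000,
#       optimizer=optimizer,
#   )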


def create_wsd_scheduler(warmup_steps, warmup_ratio, num_training_steps, optimizer):
    """Warmup-stable-decay (WSD) schedule: linear warmup, constant plateau, then a linear cooldown of the same length as the warmup."""
    # If no explicit warmup length is given, derive it from the warmup ratio.
    warmup_steps = warmup_steps if warmup_steps != 0 else int(num_training_steps * warmup_ratio)
    # Warmup: ramp the LR linearly from 0.5x to 1x of the base LR.
    scheduler1 = LinearLR(
        optimizer,
        start_factor=0.5,
        end_factor=1,
        total_iters=warmup_steps,
        last_epoch=-1,
    )
    # Stable phase: hold the base LR constant.
    scheduler2 = ConstantLR(
        optimizer,
        factor=1.0,
        total_iters=num_training_steps - 2 * warmup_steps,
        last_epoch=-1,
    )
    # Decay phase: ramp the LR linearly back down to 0.5x of the base LR.
    scheduler3 = LinearLR(
        optimizer,
        start_factor=1.0,
        end_factor=0.5,
        total_iters=warmup_steps,
        last_epoch=-1,
    )
    lr_scheduler = SequentialLR(
        optimizer,
        schedulers=[scheduler1, scheduler2, scheduler3],
        milestones=[warmup_steps, num_training_steps - warmup_steps],
    )
    return lr_scheduler
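

if __name__ == "__main__":
    # Minimal self-contained sketch of the WSD schedule's LR trajectory.
    # torch.optim.AdamW stands in for the project's AdamWScale so the example
    # has no dependency on src.utils; the toy model, step counts and learning
    # rate are placeholders, not the project's training configuration.
    import torch

    toy_model = torch.nn.Linear(8, 8)
    stand_in_optimizer = torch.optim.AdamW(toy_model.parameters(), lr=1e-3)
    scheduler = create_wsd_scheduler(
        warmup_steps=10,
        warmup_ratio=0.0,
        num_training_steps=100,
        optimizer=stand_in_optimizer,
    )

    for step in range(100):
        stand_in_optimizer.step()  # normally preceded by a backward pass
        scheduler.step()
        if step % 10 == 0:
            print(f"step {step:3d}  lr {scheduler.get_last_lr()[0]:.6f}")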