feat: Added AMP to bfloat16, Added gradient clipping
Modified Sophia parameters, Modified slurm script
ItsNiklas committed Jun 26, 2023
1 parent ea20e92 commit c11553e
Showing 3 changed files with 20 additions and 13 deletions.
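
The two headline changes are mixed-precision autocasting to bfloat16 and gradient-norm clipping in the training loop of classifier.py. Below is a minimal sketch of how those pieces fit together in a single training step; model, optimizer, and train_dataloader are placeholder names, not the repository's actual objects:

import torch
import torch.nn.functional as F
from contextlib import nullcontext

# bfloat16 autocast keeps the forward pass in reduced precision; unlike
# float16 AMP it needs no GradScaler, since bfloat16 has the same exponent
# range as float32.
ctx = (torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16)
       if torch.cuda.is_available() else nullcontext())

for b_ids, b_mask, b_labels in train_dataloader:  # placeholder loader
    with ctx:                                     # forward pass in bfloat16
        logits = model(b_ids, b_mask)
        loss = F.cross_entropy(logits, b_labels.view(-1))
    loss.backward()                               # gradients stay in float32
    # cap the global gradient norm at 1.0 before the optimizer step
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)
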
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,4 +1,5 @@
.idea
__pycache__
sst-classifier.pt
logdir
logdir
slurm_files
24 changes: 15 additions & 9 deletions classifier.py
@@ -1,4 +1,5 @@
import time, random, numpy as np, argparse, sys, re, os
from contextlib import nullcontext
from datetime import datetime
from types import SimpleNamespace
import csv
@@ -271,13 +272,14 @@ def train(args):
}

config = SimpleNamespace(**config)
ctx = nullcontext() if not args.use_gpu else torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16)

model = BertSentimentClassifier(config)
model = model.to(device)

lr = args.lr
# optimizer = AdamW(model.parameters(), lr=lr)
optimizer = SophiaG(model.parameters(), lr=lr, eps=1e-12, rho=0.03, weight_decay=0.0)
optimizer = SophiaG(model.parameters(), lr=lr, eps=1e-12, rho=0.03, betas=(0.985, 0.99), weight_decay=2e-1)
hess_interval = 10
iter_num = 0

@@ -296,11 +298,13 @@ def train(args):
b_mask = b_mask.to(device)
b_labels = b_labels.to(device)

logits = model(b_ids, b_mask)
loss = F.cross_entropy(logits, b_labels.view(-1))
with ctx:
logits = model(b_ids, b_mask)
loss = F.cross_entropy(logits, b_labels.view(-1))
loss.backward()

# Potentially: Clip gradients using nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# Potentially: Clip gradients using
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()

optimizer.zero_grad(set_to_none=True)
@@ -311,13 +315,15 @@ def train(args):
and iter_num % hess_interval == hess_interval - 1
):
# Update the Hessian EMA
logits = model(b_ids, b_mask)
samp_dist = torch.distributions.Categorical(logits=logits)
y_sample = samp_dist.sample()
loss_sampled = F.cross_entropy(logits, y_sample.view(-1))
with ctx:
logits = model(b_ids, b_mask)
samp_dist = torch.distributions.Categorical(logits=logits)
y_sample = samp_dist.sample()
loss_sampled = F.cross_entropy(logits, y_sample.view(-1))
loss_sampled.backward()

# Potentially: Clip gradients using nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# Potentially: Clip gradients using
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.update_hessian(bs=args.batch_size)

optimizer.zero_grad(set_to_none=True)
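
The second hunk wires the same autocast context and clipping into Sophia's periodic Hessian refresh: every hess_interval steps the model's own logits are used to sample labels, the loss on those sampled labels is backpropagated, and the resulting gradients feed the optimizer's Hessian estimate. A condensed sketch of that path, assuming the SophiaG implementation bundled with this repository (the update_hessian(bs=...) signature is taken from the diff above, not from the upstream Sophia release):

if iter_num % hess_interval == hess_interval - 1:
    with ctx:
        logits = model(b_ids, b_mask)
        # sample labels from the model's own predictive distribution
        samp_dist = torch.distributions.Categorical(logits=logits)
        y_sample = samp_dist.sample()
        loss_sampled = F.cross_entropy(logits, y_sample.view(-1))
    loss_sampled.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    # refresh the optimizer's diagonal Hessian estimate
    optimizer.update_hessian(bs=args.batch_size)
    optimizer.zero_grad(set_to_none=True)
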
6 changes: 3 additions & 3 deletions run_train.sh
@@ -1,12 +1,12 @@
#!/bin/bash
#SBATCH --job-name=train-bert-token-tricksters
#SBATCH -t 05:00:00 # estimated time # TODO: adapt to your needs
#SBATCH -t 00:15:00 # estimated time # TODO: adapt to your needs
#SBATCH -p grete:shared # the partition you are training on (i.e., which nodes), for nodes see sinfo -p grete:shared --format=%N,%G
#SBATCH -G A100:1 # take 1 GPU, see https://www.hlrn.de/doc/display/PUB/GPU+Usage for more options
#SBATCH --mem-per-gpu=5G # setting the right constraints for the splitted gpu partitions
#SBATCH --nodes=1 # total number of nodes
#SBATCH --ntasks=1 # total number of tasks
#SBATCH --cpus-per-task=4 # number cores per task
#SBATCH --cpus-per-task=8 # number cores per task
#SBATCH --mail-type=all # send mail when job begins and ends
#SBATCH --mail-user=l.kaesberg@stud.uni-goettingen.de # TODO: change this to your mailaddress!
#SBATCH --output=./slurm_files/slurm-%x-%j.out # where to write output, %x give job name, %j names job id
@@ -28,4 +28,4 @@ python -m torch.utils.collect_env
nvcc -V

# Run the script:
python -u multitask_classifier.py --use_gpu
python -u classifier.py --use_gpu --batch_size 64 --lr 3e-4 --epochs 30
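
Taken together, the Slurm changes shrink the requested walltime from five hours to fifteen minutes, double the CPU cores per task to eight, and switch the entry point from multitask_classifier.py to classifier.py with explicit hyperparameters (--batch_size 64 --lr 3e-4 --epochs 30). As with any #SBATCH script, the job is submitted with sbatch run_train.sh, and its output lands in ./slurm_files/ per the --output directive.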
