From c11553e6acaf133d69a2433340d9793b50f7ffd5 Mon Sep 17 00:00:00 2001
From: ItsNiklas
Date: Mon, 26 Jun 2023 13:30:23 +0200
Subject: [PATCH] feat: Added AMP to bfloat16, Added gradient clipping,
 Modified Sophia parameters, Modified slurm script

---
 .gitignore    |  3 ++-
 classifier.py | 24 +++++++++++++++---------
 run_train.sh  |  6 +++---
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5c1b8ca..204fd7e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .idea
 __pycache__
 sst-classifier.pt
-logdir
\ No newline at end of file
+logdir
+slurm_files
diff --git a/classifier.py b/classifier.py
index f20a76b..d5ec405 100644
--- a/classifier.py
+++ b/classifier.py
@@ -1,4 +1,5 @@
 import time, random, numpy as np, argparse, sys, re, os
+from contextlib import nullcontext
 from datetime import datetime
 from types import SimpleNamespace
 import csv
@@ -271,13 +272,14 @@
     }
 
     config = SimpleNamespace(**config)
 
+    ctx = nullcontext() if not args.use_gpu else torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16)
     model = BertSentimentClassifier(config)
     model = model.to(device)
 
     lr = args.lr
     # optimizer = AdamW(model.parameters(), lr=lr)
-    optimizer = SophiaG(model.parameters(), lr=lr, eps=1e-12, rho=0.03, weight_decay=0.0)
+    optimizer = SophiaG(model.parameters(), lr=lr, eps=1e-12, rho=0.03, betas=(0.985, 0.99), weight_decay=2e-1)
 
     hess_interval = 10
     iter_num = 0
@@ -296,11 +298,13 @@
             b_mask = b_mask.to(device)
             b_labels = b_labels.to(device)
 
-            logits = model(b_ids, b_mask)
-            loss = F.cross_entropy(logits, b_labels.view(-1))
+            with ctx:
+                logits = model(b_ids, b_mask)
+                loss = F.cross_entropy(logits, b_labels.view(-1))
             loss.backward()
 
-            # Potentially: Clip gradients using nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+            # Clip gradients to stabilize training
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
             optimizer.step()
             optimizer.zero_grad(set_to_none=True)
 
@@ -311,13 +315,15 @@
                 and iter_num % hess_interval == hess_interval - 1
             ):
                 # Update the Hessian EMA
-                logits = model(b_ids, b_mask)
-                samp_dist = torch.distributions.Categorical(logits=logits)
-                y_sample = samp_dist.sample()
-                loss_sampled = F.cross_entropy(logits, y_sample.view(-1))
+                with ctx:
+                    logits = model(b_ids, b_mask)
+                    samp_dist = torch.distributions.Categorical(logits=logits)
+                    y_sample = samp_dist.sample()
+                    loss_sampled = F.cross_entropy(logits, y_sample.view(-1))
                 loss_sampled.backward()
 
-                # Potentially: Clip gradients using nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+                # Clip gradients to stabilize training
+                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                 optimizer.update_hessian(bs=args.batch_size)
                 optimizer.zero_grad(set_to_none=True)
 
diff --git a/run_train.sh b/run_train.sh
index 6084505..9ba8829 100644
--- a/run_train.sh
+++ b/run_train.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
 #SBATCH --job-name=train-bert-token-tricksters
-#SBATCH -t 05:00:00                   # estimated time # TODO: adapt to your needs
+#SBATCH -t 00:15:00                   # estimated time # TODO: adapt to your needs
 #SBATCH -p grete:shared               # the partition you are training on (i.e., which nodes), for nodes see sinfo -p grete:shared --format=%N,%G
 #SBATCH -G A100:1                     # take 1 GPU, see https://www.hlrn.de/doc/display/PUB/GPU+Usage for more options
 #SBATCH --mem-per-gpu=5G              # setting the right constraints for the splitted gpu partitions
 #SBATCH --nodes=1                     # total number of nodes
 #SBATCH --ntasks=1                    # total number of tasks
-#SBATCH --cpus-per-task=4             # number cores per task
+#SBATCH --cpus-per-task=8             # number cores per task
 #SBATCH --mail-type=all               # send mail when job begins and ends
 #SBATCH --mail-user=l.kaesberg@stud.uni-goettingen.de # TODO: change this to your mailaddress!
 #SBATCH --output=./slurm_files/slurm-%x-%j.out        # where to write output, %x give job name, %j names job id
@@ -28,4 +28,4 @@
 python -m torch.utils.collect_env
 nvcc -V
 # Run the script:
-python -u multitask_classifier.py --use_gpu
+python -u classifier.py --use_gpu --batch_size 64 --lr 3e-4 --epochs 30
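
Note (editorial, not part of the patch): the snippet below is a minimal, self-contained sketch of the bfloat16-autocast plus gradient-clipping training step that this patch adds to classifier.py. The toy linear model, random batch, and AdamW optimizer are illustrative stand-ins for the repository's BertSentimentClassifier and SophiaG; only the autocast/clipping pattern mirrors the patch.

# Sketch only: stand-in model, data, and optimizer; the autocast + clipping
# pattern follows the patch.
from contextlib import nullcontext

import torch
import torch.nn.functional as F

use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

# Run the forward pass in bfloat16 on GPU; fall back to a no-op context on CPU,
# matching the patch's ctx = nullcontext() if not args.use_gpu else ... line.
ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16) if use_gpu else nullcontext()

model = torch.nn.Linear(128, 5).to(device)                  # stand-in for BertSentimentClassifier
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)  # stand-in for SophiaG

inputs = torch.randn(64, 128, device=device)                # fake batch
labels = torch.randint(0, 5, (64,), device=device)

with ctx:                                                   # forward + loss in bf16
    logits = model(inputs)
    loss = F.cross_entropy(logits, labels.view(-1))

loss.backward()                                             # gradients kept in fp32
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)     # clip global grad norm to 1.0
optimizer.step()
optimizer.zero_grad(set_to_none=True)

Unlike float16 autocast, bfloat16 autocast does not require a GradScaler, which is presumably why the patch introduces none.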