From c11553e6acaf133d69a2433340d9793b50f7ffd5 Mon Sep 17 00:00:00 2001
From: ItsNiklas
Date: Mon, 26 Jun 2023 13:30:23 +0200
Subject: [PATCH] feat: Added AMP to bfloat16, Added gradient clipping,
 Modified Sophia parameters, Modified slurm script

---
 .gitignore    |  3 ++-
 classifier.py | 24 +++++++++++++++---------
 run_train.sh  |  6 +++---
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5c1b8ca..204fd7e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .idea
 __pycache__
 sst-classifier.pt
-logdir
\ No newline at end of file
+logdir
+slurm_files
diff --git a/classifier.py b/classifier.py
index f20a76b..d5ec405 100644
--- a/classifier.py
+++ b/classifier.py
@@ -1,4 +1,5 @@
 import time, random, numpy as np, argparse, sys, re, os
+from contextlib import nullcontext
 from datetime import datetime
 from types import SimpleNamespace
 import csv
@@ -271,13 +272,14 @@
     }
 
     config = SimpleNamespace(**config)
 
+    ctx = nullcontext() if not args.use_gpu else torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16)
     model = BertSentimentClassifier(config)
     model = model.to(device)
 
     lr = args.lr
     # optimizer = AdamW(model.parameters(), lr=lr)
-    optimizer = SophiaG(model.parameters(), lr=lr, eps=1e-12, rho=0.03, weight_decay=0.0)
+    optimizer = SophiaG(model.parameters(), lr=lr, eps=1e-12, rho=0.03, betas=(0.985, 0.99), weight_decay=2e-1)
 
     hess_interval = 10
     iter_num = 0
@@ -296,11 +298,13 @@
             b_mask = b_mask.to(device)
             b_labels = b_labels.to(device)
 
-            logits = model(b_ids, b_mask)
-            loss = F.cross_entropy(logits, b_labels.view(-1))
+            with ctx:
+                logits = model(b_ids, b_mask)
+                loss = F.cross_entropy(logits, b_labels.view(-1))
             loss.backward()
 
-            # Potentially: Clip gradients using nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+            # Clip gradients to stabilize training
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
             optimizer.step()
             optimizer.zero_grad(set_to_none=True)
 
@@ -311,13 +315,15 @@
                 and iter_num % hess_interval == hess_interval - 1
             ):
                 # Update the Hessian EMA
-                logits = model(b_ids, b_mask)
-                samp_dist = torch.distributions.Categorical(logits=logits)
-                y_sample = samp_dist.sample()
-                loss_sampled = F.cross_entropy(logits, y_sample.view(-1))
+                with ctx:
+                    logits = model(b_ids, b_mask)
+                    samp_dist = torch.distributions.Categorical(logits=logits)
+                    y_sample = samp_dist.sample()
+                    loss_sampled = F.cross_entropy(logits, y_sample.view(-1))
                 loss_sampled.backward()
 
-                # Potentially: Clip gradients using nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+                # Clip gradients to stabilize training
+                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                 optimizer.update_hessian(bs=args.batch_size)
                 optimizer.zero_grad(set_to_none=True)
 
diff --git a/run_train.sh b/run_train.sh
index 6084505..9ba8829 100644
--- a/run_train.sh
+++ b/run_train.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
 #SBATCH --job-name=train-bert-token-tricksters
-#SBATCH -t 05:00:00                   # estimated time # TODO: adapt to your needs
+#SBATCH -t 00:15:00                   # estimated time # TODO: adapt to your needs
 #SBATCH -p grete:shared               # the partition you are training on (i.e., which nodes), for nodes see sinfo -p grete:shared --format=%N,%G
 #SBATCH -G A100:1                     # take 1 GPU, see https://www.hlrn.de/doc/display/PUB/GPU+Usage for more options
 #SBATCH --mem-per-gpu=5G              # setting the right constraints for the splitted gpu partitions
 #SBATCH --nodes=1                     # total number of nodes
 #SBATCH --ntasks=1                    # total number of tasks
-#SBATCH --cpus-per-task=4             # number cores per task
+#SBATCH --cpus-per-task=8             # number cores per task
 #SBATCH --mail-type=all               # send mail when job begins and ends
 #SBATCH --mail-user=l.kaesberg@stud.uni-goettingen.de # TODO: change this to your mailaddress!
 #SBATCH --output=./slurm_files/slurm-%x-%j.out        # where to write output, %x give job name, %j names job id
@@ -28,4 +28,4 @@
 python -m torch.utils.collect_env
 nvcc -V
 # Run the script:
-python -u multitask_classifier.py --use_gpu
+python -u classifier.py --use_gpu --batch_size 64 --lr 3e-4 --epochs 30
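
Note (editorial, not part of the patch): the snippet below is a minimal, self-contained sketch of the bfloat16-autocast plus gradient-clipping training step that this patch adds to classifier.py. The toy linear model, random batch, and AdamW optimizer are illustrative stand-ins for the repository's BertSentimentClassifier and SophiaG; only the autocast/clipping pattern mirrors the patch.

# Sketch only: stand-in model, data, and optimizer; the autocast + clipping
# pattern follows the patch.
from contextlib import nullcontext

import torch
import torch.nn.functional as F

use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

# Run the forward pass in bfloat16 on GPU; fall back to a no-op context on CPU,
# matching the patch's ctx = nullcontext() if not args.use_gpu else ... line.
ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16) if use_gpu else nullcontext()

model = torch.nn.Linear(128, 5).to(device)                  # stand-in for BertSentimentClassifier
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)  # stand-in for SophiaG

inputs = torch.randn(64, 128, device=device)                # fake batch
labels = torch.randint(0, 5, (64,), device=device)

with ctx:                                                   # forward + loss in bf16
    logits = model(inputs)
    loss = F.cross_entropy(logits, labels.view(-1))

loss.backward()                                             # gradients kept in fp32
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)     # clip global grad norm to 1.0
optimizer.step()
optimizer.zero_grad(set_to_none=True)

Unlike float16 autocast, bfloat16 autocast does not require a GradScaler, which is presumably why the patch introduces none.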