Merge branch 'main' into feat/attention_layer
# Conflicts:
#	multitask_classifier.py
lkaesberg committed Jul 10, 2023
2 parents 4442f48 + b73e835 commit 4d1547a
Showing 5 changed files with 30 additions and 20 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -43,7 +43,7 @@ tensorboard --logdir logdir

If you want to test the multitask classifier, you can create an interactive session with
````
-#TODO
+srun -p grete:shared --pty -G A100:1 --interactive bash
````
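Once the session starts, training can be launched interactively with the same command that `run_train.sh` submits in batch mode, e.g. `python -u multitask_classifier.py --use_gpu --lr 1e-3 --batch_size 128`.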

## Introduction
2 changes: 1 addition & 1 deletion config.py
@@ -186,7 +186,7 @@ class BertConfig(PretrainedConfig):
def __init__(
self,
vocab_size=30522,
-hidden_size=768,
+hidden_size=1024,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
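(For reference: 1024 is the hidden width of bert-large-uncased, while bert-base-uncased uses 768, consistent with the hidden_size lookup added in multitask_classifier.py below.)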
18 changes: 13 additions & 5 deletions multitask_classifier.py
@@ -1,3 +1,4 @@
+from pprint import pformat
import time, random, numpy as np, argparse, sys, re, os
from datetime import datetime
from types import SimpleNamespace
@@ -32,7 +33,6 @@ def seed_everything(seed=11711):
torch.backends.cudnn.deterministic = True


-BERT_HIDDEN_SIZE = 768
N_SENTIMENT_CLASSES = 5


@@ -49,7 +49,7 @@ def __init__(self, config):
super(MultitaskBERT, self).__init__()
# You will want to add layers here to perform the downstream tasks.
# Pretrain mode does not require updating BERT parameters.
-self.bert = BertModel.from_pretrained('bert-base-uncased')
+self.bert = BertModel.from_pretrained(config.model)
for param in self.bert.parameters():
if config.option == 'pretrain':
param.requires_grad = False
@@ -165,23 +165,29 @@ def train_multitask(args):
para_dev_dataloader = DataLoader(para_dev_data, shuffle=False, batch_size=args.batch_size,
collate_fn=para_dev_data.collate_fn)

-sts_train_data = SentencePairDataset(sts_train_data, args)
-sts_dev_data = SentencePairDataset(sts_dev_data, args)
+sts_train_data = SentencePairDataset(sts_train_data, args, isRegression=True)
+sts_dev_data = SentencePairDataset(sts_dev_data, args, isRegression=True)

sts_train_dataloader = DataLoader(sts_train_data, shuffle=True, batch_size=args.batch_size,
collate_fn=sts_train_data.collate_fn)
sts_dev_dataloader = DataLoader(sts_dev_data, shuffle=False, batch_size=args.batch_size,
collate_fn=sts_dev_data.collate_fn)

print(f"Model: {args.model}")
hidden_size = {'bert-base-uncased': 768, 'bert-large-uncased': 1024}
# Init model
config = {'hidden_dropout_prob': args.hidden_dropout_prob,
'num_labels': num_labels,
-'hidden_size': 768,
+'model': args.model,
+'hidden_size': hidden_size[args.model],
'data_dir': '.',
'option': args.option}

config = SimpleNamespace(**config)

print("Multitask BERT model:", file=sys.stderr)
print(pformat(vars(args)), file=sys.stderr)

model = MultitaskBERT(config)
model = model.to(device)

@@ -340,6 +346,8 @@ def get_args():
parser.add_argument("--option", type=str,
help='pretrain: the BERT parameters are frozen; finetune: BERT parameters are updated',
choices=('pretrain', 'finetune'), default="pretrain")
parser.add_argument("--model", type=str,
choices=('bert-base-uncased', 'bert-large-uncased'), default="bert-base-uncased")
parser.add_argument("--use_gpu", action='store_true')

parser.add_argument("--sst_dev_out", type=str, default="predictions/sst-dev-output.csv")
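For context, here is a minimal sketch (not part of the commit) of how the new --model flag is wired through: it selects both the pretrained checkpoint passed to BertModel.from_pretrained and the matching hidden width for the downstream heads. Names follow the diff above; the argument list passed to parse_args is only an example.

````python
import argparse
from types import SimpleNamespace

parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str,
                    choices=('bert-base-uncased', 'bert-large-uncased'),
                    default="bert-base-uncased")
args = parser.parse_args(["--model", "bert-large-uncased"])  # example invocation

# Map the checkpoint name to its hidden width, as in train_multitask above.
hidden_size = {'bert-base-uncased': 768, 'bert-large-uncased': 1024}
config = SimpleNamespace(model=args.model, hidden_size=hidden_size[args.model])

print(config)  # namespace(model='bert-large-uncased', hidden_size=1024)
# Inside MultitaskBERT.__init__ the checkpoint is then loaded with:
#   self.bert = BertModel.from_pretrained(config.model)
````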
20 changes: 11 additions & 9 deletions optimizer.py
@@ -11,8 +11,8 @@ def __init__(
params: Iterable[torch.nn.parameter.Parameter],
lr: float = 1e-3,
betas: Tuple[float, float] = (0.9, 0.999),
-eps: float = 1e-6,
-weight_decay: float = 0.0,
+eps: float = 1e-8,
+weight_decay: float = 1e-2,
correct_bias: bool = True,
):
if lr < 0.0:
@@ -23,6 +23,8 @@ def __init__(
raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
if not 0.0 <= eps:
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
+if not 0.0 <= weight_decay:
+raise ValueError("Invalid weight_decay value: {} - should be >= 0.0".format(weight_decay))
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
super().__init__(params, defaults)
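(Note: the new defaults eps=1e-8 and weight_decay=1e-2 match the defaults of torch.optim.AdamW.)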

@@ -37,7 +39,7 @@ def step(self, closure: Callable = None):
if p.grad is None:
continue

-grad = p.grad.data
+grad = p.grad

if grad.is_sparse:
raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead")
@@ -76,22 +78,22 @@ def step(self, closure: Callable = None):

# 1- Update first and second moments of the gradients

state["m"] = beta_1 * state["m"] + (1 - beta_1) * grad
state["v"] = beta_2 * state["v"] + (1 - beta_2) * torch.square(grad)
state["m"].mul_(beta_1).add_(grad, alpha=1 - beta_1)

state["v"].mul_(beta_2).addcmul_(grad, grad, value=1 - beta_2)

# 2- Apply bias correction
# (using the "efficient version" given in https://arxiv.org/abs/1412.6980;
# also given in the pseudo-code in the project description).
if correct_bias:
-alpha = alpha * torch.sqrt(1 - beta_2 ** state["t"]) / (1 - beta_1 ** state["t"])
+alpha *= torch.sqrt(1 - beta_2 ** state["t"]) / (1 - beta_1 ** state["t"])

# 3- Update parameters (p.data).

-p.data = p.data - alpha * state["m"] / (torch.sqrt(state["v"]) + eps)
+p.data.sub_(alpha * state["m"] / (torch.sqrt(state["v"]) + eps))

# 4- After that main gradient-based update, update again using weight decay
# (incorporating the learning rate again).
+p.data.sub_(group["lr"] * p.data * weight_decay)

-p.data = p.data - group["lr"] * p.data * weight_decay

return loss
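Taken together, the optimizer changes amount to an in-place AdamW-style step: bias-corrected Adam followed by decoupled weight decay. A self-contained sketch under the same defaults as above (an illustration, not the project's optimizer):

````python
import torch

def adamw_step(p, grad, m, v, t, lr=1e-3, betas=(0.9, 0.999),
               eps=1e-8, weight_decay=1e-2, correct_bias=True):
    """One in-place AdamW-style update mirroring the logic in optimizer.py."""
    beta1, beta2 = betas
    m.mul_(beta1).add_(grad, alpha=1 - beta1)            # m <- b1*m + (1-b1)*g
    v.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)  # v <- b2*v + (1-b2)*g^2
    alpha = lr
    if correct_bias:
        # "efficient" bias correction from Kingma & Ba (2014)
        alpha *= (1 - beta2 ** t) ** 0.5 / (1 - beta1 ** t)
    p.sub_(alpha * m / (v.sqrt() + eps))                 # main Adam update
    p.sub_(lr * weight_decay * p)                        # decoupled weight decay
    return p

# Toy usage: a single step on a dummy parameter.
p, g = torch.zeros(3), torch.ones(3)
m, v = torch.zeros_like(p), torch.zeros_like(p)
adamw_step(p, g, m, v, t=1)
````

Subtracting lr * weight_decay * p directly from the already-updated parameter, rather than adding weight_decay * p to the gradient, is what makes the decay "decoupled" in the sense of AdamW (Loshchilov & Hutter, 2019).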
8 changes: 4 additions & 4 deletions run_train.sh
@@ -2,13 +2,13 @@
#SBATCH --job-name=train-bert-token-tricksters
#SBATCH -t 05:00:00 # estimated time # TODO: adapt to your needs
#SBATCH -p grete:shared # the partition you are training on (i.e., which nodes), for nodes see sinfo -p grete:shared --format=%N,%G
-#SBATCH -G A100:1 # take 1 GPU, see https://www.hlrn.de/doc/display/PUB/GPU+Usage for more options
+#SBATCH -G A100:1 # take 1 GPU, see https://www.hlrn.de/doc/display/PUB/GPU+Usage for more options
#SBATCH --mem-per-gpu=5G # setting the right constraints for the split GPU partitions
#SBATCH --nodes=1 # total number of nodes
#SBATCH --ntasks=1 # total number of tasks
#SBATCH --cpus-per-task=4 # number cores per task
-#SBATCH --mail-type=all # send mail when job begins and ends
-#SBATCH --mail-user=l.kaesberg@stud.uni-goettingen.de # TODO: change this to your mailaddress!
+#SBATCH --mail-type=END,FAIL # send mail when job begins and ends
+#SBATCH --mail-user=l.kaesberg@stud.uni-goettingen.de
#SBATCH --output=./slurm_files/slurm-%x-%j.out # where to write output, %x give job name, %j names job id
#SBATCH --error=./slurm_files/slurm-%x-%j.err # where to write slurm error

@@ -28,4 +28,4 @@ python -m torch.utils.collect_env
nvcc -V

# Run the script:
-python -u multitask_classifier.py --use_gpu --use_gpu --lr 1e-3 --batch_size 128
+python -u multitask_classifier.py --use_gpu --lr 1e-3 --batch_size 128
