Commit
🔀 Merge branch 'main' into feat/sophia
ItsNiklas committed Jul 5, 2023
2 parents 92ed0cc + 545d177 commit bd46581
Showing 7 changed files with 189 additions and 159 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -2,4 +2,4 @@
 __pycache__
 sst-classifier.pt
 logdir
-slurm_files
\ No newline at end of file
+slurm_files
21 changes: 21 additions & 0 deletions README.md
@@ -25,6 +25,27 @@ If you want to update using the upstream repository use the following commands
 git fetch upstream
 git rebase upstream/main
 ````
+
+## Usage
+
+To run the multitask classifier on the cluster, use
+````
+sbatch run_train.sh
+````
+
+If you want to use TensorBoard, you can forward it to your local machine:
+````
+ssh -L localhost:16006:localhost:6006 bzkurs42@glogin9.hlrn.de
+module load anaconda3
+source activate dnlp2
+tensorboard --logdir logdir
+````
+
+If you want to test the multitask classifier, you can create an interactive session with
+````
+srun -p grete:shared --pty -G A100:1 --interactive bash
+````
+
 ## Introduction
 
 This is the starting code for the default final project for the Deep Learning for Natural Language Processing course at the University of Göttingen. You can find the handout [here](https://1drv.ms/b/s!AkgwFZyClZ_qk718ObYhi8tF4cjSSQ?e=3gECnf).
5 changes: 3 additions & 2 deletions classifier.py
@@ -242,8 +242,6 @@ def save_model(model, optimizer, args, config, filepath):


 def train(args):
-    name = datetime.now().strftime("%Y%m%d-%H%M%S")
-    writer = SummaryWriter(log_dir=args.logdir + "/classifier/" + name)
     loss_idx_value = 0

     device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
@@ -288,6 +286,9 @@ def train(args):

     best_dev_acc = 0

+    name = f"{datetime.now().strftime('%Y%m%d-%H%M%S')}-lr={lr}-optimizer={type(optimizer).__name__}"
+    writer = SummaryWriter(log_dir=args.logdir + "/classifier/" + name)
+
     # Run for the specified number of epochs
     for epoch in range(args.epochs):
         model.train()
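The classifier.py change above defers creating the SummaryWriter until after the optimizer exists, so the TensorBoard run name can embed the learning rate and the optimizer class. A minimal sketch of the naming pattern, with a stand-in model and a hard-coded `lr` where the real script reads them from `args`:

```python
from datetime import datetime

import torch
from torch.utils.tensorboard import SummaryWriter

# Stand-ins for what train(args) builds; the real script gets these from args.
model = torch.nn.Linear(4, 2)
lr = 1e-3
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# Encode the hyperparameters in the run name so TensorBoard curves are
# self-describing, e.g. "20230705-120000-lr=0.001-optimizer=AdamW".
name = f"{datetime.now().strftime('%Y%m%d-%H%M%S')}-lr={lr}-optimizer={type(optimizer).__name__}"
writer = SummaryWriter(log_dir="logdir/classifier/" + name)
writer.add_scalar("loss", 0.0, 0)  # dummy scalar just to create the run
writer.close()
```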
192 changes: 95 additions & 97 deletions evaluation.py
@@ -25,8 +25,8 @@
     SentenceClassificationDataset, SentenceClassificationTestDataset, \
     SentencePairDataset, SentencePairTestDataset

-TQDM_DISABLE = False
+TQDM_DISABLE = True


 # Evaluate a multitask model for accuracy on SST only.
 def model_eval_sst(dataloader, model, device):
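Flipping TQDM_DISABLE to True silences the progress bars in the loops below, which keeps Slurm log files readable when jobs submitted via run_train.sh run non-interactively. A minimal illustration of the flag (the loop body is a placeholder):

```python
from tqdm import tqdm

TQDM_DISABLE = True  # as set in evaluation.py; True suppresses the bar entirely

for step in tqdm(range(100), desc='eval', disable=TQDM_DISABLE):
    pass  # placeholder for per-batch work
```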
@@ -36,8 +36,8 @@ def model_eval_sst(dataloader, model, device):
     sents = []
     sent_ids = []
     for step, batch in enumerate(tqdm(dataloader, desc=f'eval', disable=TQDM_DISABLE)):
-        b_ids, b_mask, b_labels, b_sents, b_sent_ids = batch['token_ids'],batch['attention_mask'], \
-                                                       batch['labels'], batch['sents'], batch['sent_ids']
+        b_ids, b_mask, b_labels, b_sents, b_sent_ids = batch['token_ids'], batch['attention_mask'], \
+            batch['labels'], batch['sents'], batch['sent_ids']

         b_ids = b_ids.to(device)
         b_mask = b_mask.to(device)
@@ -57,6 +57,7 @@ def model_eval_sst(dataloader, model, device):

     return acc, f1, y_pred, y_true, sents, sent_ids

+
 # Perform model evaluation by averaging accuracies across tasks.
 def model_eval_multitask(sentiment_dataloader,
                          paraphrase_dataloader,
@@ -74,8 +75,8 @@ def model_eval_multitask(sentiment_dataloader,
            (b_ids1, b_mask1,
             b_ids2, b_mask2,
             b_labels, b_sent_ids) = (batch['token_ids_1'], batch['attention_mask_1'],
-                                     batch['token_ids_2'], batch['attention_mask_2'],
-                                     batch['labels'], batch['sent_ids'])
+                                    batch['token_ids_2'], batch['attention_mask_2'],
+                                    batch['labels'], batch['sent_ids'])

            b_ids1 = b_ids1.to(device)
            b_mask1 = b_mask1.to(device)
@@ -96,14 +97,13 @@ def model_eval_multitask(sentiment_dataloader,
        sts_y_pred = []
        sts_sent_ids = []

-
        # Evaluate semantic textual similarity.
        for step, batch in enumerate(tqdm(sts_dataloader, desc=f'eval', disable=TQDM_DISABLE)):
            (b_ids1, b_mask1,
             b_ids2, b_mask2,
             b_labels, b_sent_ids) = (batch['token_ids_1'], batch['attention_mask_1'],
-                                     batch['token_ids_2'], batch['attention_mask_2'],
-                                     batch['labels'], batch['sent_ids'])
+                                    batch['token_ids_2'], batch['attention_mask_2'],
+                                    batch['labels'], batch['sent_ids'])

            b_ids1 = b_ids1.to(device)
            b_mask1 = b_mask1.to(device)
@@ -117,17 +117,17 @@ def model_eval_multitask(sentiment_dataloader,
            sts_y_pred.extend(y_hat)
            sts_y_true.extend(b_labels)
            sts_sent_ids.extend(b_sent_ids)
-        pearson_mat = np.corrcoef(sts_y_pred,sts_y_true)
+        pearson_mat = np.corrcoef(sts_y_pred, sts_y_true)
        sts_corr = pearson_mat[1][0]

-
        sst_y_true = []
        sst_y_pred = []
        sst_sent_ids = []

        # Evaluate sentiment classification.
        for step, batch in enumerate(tqdm(sentiment_dataloader, desc=f'eval', disable=TQDM_DISABLE)):
-            b_ids, b_mask, b_labels, b_sent_ids = batch['token_ids'], batch['attention_mask'], batch['labels'], batch['sent_ids']
+            b_ids, b_mask, b_labels, b_sent_ids = batch['token_ids'], batch['attention_mask'], batch['labels'], batch[
+                'sent_ids']

            b_ids = b_ids.to(device)
            b_mask = b_mask.to(device)
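The hunk above scores the STS task with np.corrcoef, which returns the 2×2 correlation matrix of its two inputs; either off-diagonal entry is the Pearson correlation, hence `pearson_mat[1][0]`. A quick self-contained check with toy values (not project data):

```python
import numpy as np

# Toy predictions and gold similarity scores, for illustration only.
sts_y_pred = [4.5, 2.0, 3.1, 0.5]
sts_y_true = [5.0, 1.8, 3.0, 1.0]

pearson_mat = np.corrcoef(sts_y_pred, sts_y_true)  # 2x2, symmetric, ones on the diagonal
sts_corr = pearson_mat[1][0]                       # same as pearson_mat[0][1]
print(f'Semantic Textual Similarity correlation: {sts_corr:.3f}')
```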
@@ -147,14 +147,15 @@ def model_eval_multitask(sentiment_dataloader,
        print(f'Semantic Textual Similarity correlation: {sts_corr:.3f}')

        return (paraphrase_accuracy, para_y_pred, para_sent_ids,
-                sentiment_accuracy,sst_y_pred, sst_sent_ids,
+                sentiment_accuracy, sst_y_pred, sst_sent_ids,
                sts_corr, sts_y_pred, sts_sent_ids)

+
 # Perform model evaluation by averaging accuracies across tasks.
 def model_eval_test_multitask(sentiment_dataloader,
-                            paraphrase_dataloader,
-                            sts_dataloader,
-                            model, device):
+                              paraphrase_dataloader,
+                              sts_dataloader,
+                              model, device):
     model.eval()  # switch to eval mode, will turn off randomness like dropout

     with torch.no_grad():
@@ -166,8 +167,8 @@ def model_eval_test_multitask(sentiment_dataloader,
            (b_ids1, b_mask1,
             b_ids2, b_mask2,
             b_sent_ids) = (batch['token_ids_1'], batch['attention_mask_1'],
-                           batch['token_ids_2'], batch['attention_mask_2'],
-                           batch['sent_ids'])
+                          batch['token_ids_2'], batch['attention_mask_2'],
+                          batch['sent_ids'])

            b_ids1 = b_ids1.to(device)
            b_mask1 = b_mask1.to(device)
@@ -180,18 +181,16 @@ def model_eval_test_multitask(sentiment_dataloader,
            para_y_pred.extend(y_hat)
            para_sent_ids.extend(b_sent_ids)

-
        sts_y_pred = []
        sts_sent_ids = []

-
        # Evaluate semantic textual similarity.
        for step, batch in enumerate(tqdm(sts_dataloader, desc=f'eval', disable=TQDM_DISABLE)):
            (b_ids1, b_mask1,
             b_ids2, b_mask2,
             b_sent_ids) = (batch['token_ids_1'], batch['attention_mask_1'],
-                           batch['token_ids_2'], batch['attention_mask_2'],
-                           batch['sent_ids'])
+                          batch['token_ids_2'], batch['attention_mask_2'],
+                          batch['sent_ids'])

            b_ids1 = b_ids1.to(device)
            b_mask1 = b_mask1.to(device)
@@ -204,13 +203,12 @@ def model_eval_test_multitask(sentiment_dataloader,
            sts_y_pred.extend(y_hat)
            sts_sent_ids.extend(b_sent_ids)

-
        sst_y_pred = []
        sst_sent_ids = []

        # Evaluate sentiment classification.
        for step, batch in enumerate(tqdm(sentiment_dataloader, desc=f'eval', disable=TQDM_DISABLE)):
-            b_ids, b_mask, b_sent_ids = batch['token_ids'], batch['attention_mask'], batch['sent_ids'] 
+            b_ids, b_mask, b_sent_ids = batch['token_ids'], batch['attention_mask'], batch['sent_ids']

            b_ids = b_ids.to(device)
            b_mask = b_mask.to(device)
@@ -227,77 +225,77 @@


 def test_model_multitask(args, model, device):
-    sst_test_data, num_labels,para_test_data, sts_test_data = \
-        load_multitask_data(args.sst_test,args.para_test, args.sts_test, split='test')
-
-    sst_dev_data, num_labels,para_dev_data, sts_dev_data = \
-        load_multitask_data(args.sst_dev,args.para_dev,args.sts_dev,split='dev')
-
-    sst_test_data = SentenceClassificationTestDataset(sst_test_data, args)
-    sst_dev_data = SentenceClassificationDataset(sst_dev_data, args)
-
-    sst_test_dataloader = DataLoader(sst_test_data, shuffle=True, batch_size=args.batch_size,
-                                     collate_fn=sst_test_data.collate_fn)
-    sst_dev_dataloader = DataLoader(sst_dev_data, shuffle=False, batch_size=args.batch_size,
-                                    collate_fn=sst_dev_data.collate_fn)
-
-    para_test_data = SentencePairTestDataset(para_test_data, args)
-    para_dev_data = SentencePairDataset(para_dev_data, args)
-
-    para_test_dataloader = DataLoader(para_test_data, shuffle=True, batch_size=args.batch_size,
-                                      collate_fn=para_test_data.collate_fn)
-    para_dev_dataloader = DataLoader(para_dev_data, shuffle=False, batch_size=args.batch_size,
-                                     collate_fn=para_dev_data.collate_fn)
-
-    sts_test_data = SentencePairTestDataset(sts_test_data, args)
-    sts_dev_data = SentencePairDataset(sts_dev_data, args, isRegression=True)
-
-    sts_test_dataloader = DataLoader(sts_test_data, shuffle=True, batch_size=args.batch_size,
-                                     collate_fn=sts_test_data.collate_fn)
-    sts_dev_dataloader = DataLoader(sts_dev_data, shuffle=False, batch_size=args.batch_size,
-                                    collate_fn=sts_dev_data.collate_fn)
-
-    dev_paraphrase_accuracy, dev_para_y_pred, dev_para_sent_ids, \
-        dev_sentiment_accuracy,dev_sst_y_pred, dev_sst_sent_ids, dev_sts_corr, \
-        dev_sts_y_pred, dev_sts_sent_ids = model_eval_multitask(sst_dev_dataloader,
-                                                                para_dev_dataloader,
-                                                                sts_dev_dataloader, model, device)
-
-    test_para_y_pred, test_para_sent_ids, test_sst_y_pred, \
-        test_sst_sent_ids, test_sts_y_pred, test_sts_sent_ids = \
-        model_eval_test_multitask(sst_test_dataloader,
-                                  para_test_dataloader,
-                                  sts_test_dataloader, model, device)
-
-    with open(args.sst_dev_out, "w+") as f:
-        print(f"dev sentiment acc :: {dev_sentiment_accuracy :.3f}")
-        f.write(f"id \t Predicted_Sentiment \n")
-        for p, s in zip(dev_sst_sent_ids, dev_sst_y_pred):
-            f.write(f"{p} , {s} \n")
-
-    with open(args.sst_test_out, "w+") as f:
-        f.write(f"id \t Predicted_Sentiment \n")
-        for p, s in zip(test_sst_sent_ids, test_sst_y_pred):
-            f.write(f"{p} , {s} \n")
-
-    with open(args.para_dev_out, "w+") as f:
-        print(f"dev paraphrase acc :: {dev_paraphrase_accuracy :.3f}")
-        f.write(f"id \t Predicted_Is_Paraphrase \n")
-        for p, s in zip(dev_para_sent_ids, dev_para_y_pred):
-            f.write(f"{p} , {s} \n")
-
-    with open(args.para_test_out, "w+") as f:
-        f.write(f"id \t Predicted_Is_Paraphrase \n")
-        for p, s in zip(test_para_sent_ids, test_para_y_pred):
-            f.write(f"{p} , {s} \n")
-
-    with open(args.sts_dev_out, "w+") as f:
-        print(f"dev sts corr :: {dev_sts_corr :.3f}")
-        f.write(f"id \t Predicted_Similiary \n")
-        for p, s in zip(dev_sts_sent_ids, dev_sts_y_pred):
-            f.write(f"{p} , {s} \n")
-
-    with open(args.sts_test_out, "w+") as f:
-        f.write(f"id \t Predicted_Similiary \n")
-        for p, s in zip(test_sts_sent_ids, test_sts_y_pred):
-            f.write(f"{p} , {s} \n")
+    sst_test_data, num_labels, para_test_data, sts_test_data = \
+        load_multitask_data(args.sst_test, args.para_test, args.sts_test, split='test')
+
+    sst_dev_data, num_labels, para_dev_data, sts_dev_data = \
+        load_multitask_data(args.sst_dev, args.para_dev, args.sts_dev, split='dev')
+
+    sst_test_data = SentenceClassificationTestDataset(sst_test_data, args)
+    sst_dev_data = SentenceClassificationDataset(sst_dev_data, args)
+
+    sst_test_dataloader = DataLoader(sst_test_data, shuffle=True, batch_size=args.batch_size,
+                                     collate_fn=sst_test_data.collate_fn)
+    sst_dev_dataloader = DataLoader(sst_dev_data, shuffle=False, batch_size=args.batch_size,
+                                    collate_fn=sst_dev_data.collate_fn)
+
+    para_test_data = SentencePairTestDataset(para_test_data, args)
+    para_dev_data = SentencePairDataset(para_dev_data, args)
+
+    para_test_dataloader = DataLoader(para_test_data, shuffle=True, batch_size=args.batch_size,
+                                      collate_fn=para_test_data.collate_fn)
+    para_dev_dataloader = DataLoader(para_dev_data, shuffle=False, batch_size=args.batch_size,
+                                     collate_fn=para_dev_data.collate_fn)
+
+    sts_test_data = SentencePairTestDataset(sts_test_data, args)
+    sts_dev_data = SentencePairDataset(sts_dev_data, args, isRegression=True)
+
+    sts_test_dataloader = DataLoader(sts_test_data, shuffle=True, batch_size=args.batch_size,
+                                     collate_fn=sts_test_data.collate_fn)
+    sts_dev_dataloader = DataLoader(sts_dev_data, shuffle=False, batch_size=args.batch_size,
+                                    collate_fn=sts_dev_data.collate_fn)
+
+    dev_paraphrase_accuracy, dev_para_y_pred, dev_para_sent_ids, \
+        dev_sentiment_accuracy, dev_sst_y_pred, dev_sst_sent_ids, dev_sts_corr, \
+        dev_sts_y_pred, dev_sts_sent_ids = model_eval_multitask(sst_dev_dataloader,
+                                                                para_dev_dataloader,
+                                                                sts_dev_dataloader, model, device)
+
+    test_para_y_pred, test_para_sent_ids, test_sst_y_pred, \
+        test_sst_sent_ids, test_sts_y_pred, test_sts_sent_ids = \
+        model_eval_test_multitask(sst_test_dataloader,
+                                  para_test_dataloader,
+                                  sts_test_dataloader, model, device)
+
+    with open(args.sst_dev_out, "w+") as f:
+        print(f"dev sentiment acc :: {dev_sentiment_accuracy :.3f}")
+        f.write(f"id \t Predicted_Sentiment \n")
+        for p, s in zip(dev_sst_sent_ids, dev_sst_y_pred):
+            f.write(f"{p} , {s} \n")
+
+    with open(args.sst_test_out, "w+") as f:
+        f.write(f"id \t Predicted_Sentiment \n")
+        for p, s in zip(test_sst_sent_ids, test_sst_y_pred):
+            f.write(f"{p} , {s} \n")
+
+    with open(args.para_dev_out, "w+") as f:
+        print(f"dev paraphrase acc :: {dev_paraphrase_accuracy :.3f}")
+        f.write(f"id \t Predicted_Is_Paraphrase \n")
+        for p, s in zip(dev_para_sent_ids, dev_para_y_pred):
+            f.write(f"{p} , {s} \n")
+
+    with open(args.para_test_out, "w+") as f:
+        f.write(f"id \t Predicted_Is_Paraphrase \n")
+        for p, s in zip(test_para_sent_ids, test_para_y_pred):
+            f.write(f"{p} , {s} \n")
+
+    with open(args.sts_dev_out, "w+") as f:
+        print(f"dev sts corr :: {dev_sts_corr :.3f}")
+        f.write(f"id \t Predicted_Similiary \n")
+        for p, s in zip(dev_sts_sent_ids, dev_sts_y_pred):
+            f.write(f"{p} , {s} \n")
+
+    with open(args.sts_test_out, "w+") as f:
+        f.write(f"id \t Predicted_Similiary \n")
+        for p, s in zip(test_sts_sent_ids, test_sts_y_pred):
+            f.write(f"{p} , {s} \n")
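test_model_multitask writes each prediction set as a small delimited file: a header line such as `id \t Predicted_Sentiment`, then one `id , prediction` row per example. A sketch of reading such a file back for inspection; the path is hypothetical, and the parsing assumes the comma-separated row format produced by the f.write calls above:

```python
# Hypothetical reader for the prediction files written by test_model_multitask.
def read_predictions(path):
    preds = {}
    with open(path) as f:
        next(f)  # skip the "id \t Predicted_..." header line
        for line in f:
            sent_id, value = line.split(',', 1)
            preds[sent_id.strip()] = value.strip()
    return preds

preds = read_predictions('predictions/sst-dev-output.csv')  # hypothetical path
print(len(preds), 'predictions loaded')
```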