Add GWDG setup
JonasLuehrs committed Jul 12, 2023
1 parent 25dd324 commit c6b33e8
Showing 6 changed files with 44 additions and 17 deletions.
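In short, the commit threads a new --local_files_only flag through every BertModel and BertTokenizer from_pretrained call, adds setup_gwdg.py to download and cache the pretrained files once, and adds setup_gwdg.sh to build a conda environment on the GWDG cluster.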
21 changes: 12 additions & 9 deletions classifier.py
@@ -35,7 +35,7 @@ class BertSentimentClassifier(torch.nn.Module):
def __init__(self, config):
super(BertSentimentClassifier, self).__init__()
self.num_labels = config.num_labels
-self.bert = BertModel.from_pretrained('bert-base-uncased')
+self.bert = BertModel.from_pretrained('bert-base-uncased', local_files_only=config.local_files_only)

# Pretrain mode does not require updating BERT parameters.
for param in self.bert.parameters():
@@ -62,7 +62,7 @@ class SentimentDataset(Dataset):
def __init__(self, dataset, args):
self.dataset = dataset
self.p = args
-self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', local_files_only=args.local_files_only)

def __len__(self):
return len(self.dataset)
@@ -100,7 +100,7 @@ class SentimentTestDataset(Dataset):
def __init__(self, dataset, args):
self.dataset = dataset
self.p = args
-self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', local_files_only=args.local_files_only)

def __len__(self):
return len(self.dataset)
@@ -247,7 +247,8 @@ def train(args):
'num_labels': num_labels,
'hidden_size': 768,
'data_dir': '.',
-'option': args.option}
+'option': args.option,
+'local_files_only': args.local_files_only}

config = SimpleNamespace(**config)

@@ -333,14 +334,15 @@ def get_args():
help='pretrain: the BERT parameters are frozen; finetune: BERT parameters are updated',
choices=('pretrain', 'finetune'), default="pretrain")
parser.add_argument("--use_gpu", action='store_true')
parser.add_argument("--dev_out", type=str, default="cfimdb-dev-output.txt")
parser.add_argument("--test_out", type=str, default="cfimdb-test-output.txt")
parser.add_argument("--dev_out", type=str, default="sst-dev-out.csv")
parser.add_argument("--test_out", type=str, default="sst-test-out.csv")


parser.add_argument("--batch_size", help='sst: 64, cfimdb: 8 can fit a 12GB GPU', type=int, default=8)
parser.add_argument("--batch_size", help='sst: 64 can fit a 12GB GPU', type=int, default=64)
parser.add_argument("--hidden_dropout_prob", type=float, default=0.3)
parser.add_argument("--lr", type=float, help="learning rate, default lr for 'pretrain': 1e-3, 'finetune': 1e-5",
-default=1e-5)
+default=1e-3)
+parser.add_argument("--local_files_only", action='store_true')

args = parser.parse_args()
return args
@@ -363,7 +365,8 @@ def get_args():
test='data/ids-sst-test-student.csv',
option=args.option,
dev_out = 'predictions/'+args.option+'-sst-dev-out.csv',
-test_out = 'predictions/'+args.option+'-sst-test-out.csv'
+test_out = 'predictions/'+args.option+'-sst-test-out.csv',
+local_files_only=args.local_files_only
)

train(config)
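The same pattern repeats in the files below: parse the flag, store it in the config, and forward it to from_pretrained. A minimal standalone sketch of that flow (illustrative scaffolding only, not code from the repo):

import argparse
from types import SimpleNamespace
from bert import BertModel

parser = argparse.ArgumentParser()
parser.add_argument("--local_files_only", action='store_true')
args = parser.parse_args()

config = SimpleNamespace(local_files_only=args.local_files_only)

# With the flag set, from_pretrained reads the cached weights instead of
# attempting a download.
bert = BertModel.from_pretrained('bert-base-uncased',
                                 local_files_only=config.local_files_only)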
8 changes: 4 additions & 4 deletions datasets.py
@@ -28,7 +28,7 @@ class SentenceClassificationDataset(Dataset):
def __init__(self, dataset, args):
self.dataset = dataset
self.p = args
-self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', local_files_only=args.local_files_only)

def __len__(self):
return len(self.dataset)
@@ -67,7 +67,7 @@ class SentenceClassificationTestDataset(Dataset):
def __init__(self, dataset, args):
self.dataset = dataset
self.p = args
-self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', local_files_only=args.local_files_only)

def __len__(self):
return len(self.dataset)
@@ -103,7 +103,7 @@ def __init__(self, dataset, args, isRegression =False):
self.dataset = dataset
self.p = args
self.isRegression = isRegression
-self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', local_files_only=args.local_files_only)

def __len__(self):
return len(self.dataset)
@@ -160,7 +160,7 @@ class SentencePairTestDataset(Dataset):
def __init__(self, dataset, args):
self.dataset = dataset
self.p = args
-self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', local_files_only=args.local_files_only)

def __len__(self):
return len(self.dataset)
10 changes: 6 additions & 4 deletions multitask_classifier.py
@@ -45,7 +45,7 @@ def __init__(self, config):
super(MultitaskBERT, self).__init__()
# You will want to add layers here to perform the downstream tasks.
# Pretrain mode does not require updating BERT parameters.
-self.bert = BertModel.from_pretrained('bert-base-uncased')
+self.bert = BertModel.from_pretrained('bert-base-uncased', local_files_only=config.local_files_only)
for param in self.bert.parameters():
if config.option == 'pretrain':
param.requires_grad = False
@@ -135,7 +135,8 @@ def train_multitask(args):
'num_labels': num_labels,
'hidden_size': 768,
'data_dir': '.',
-'option': args.option}
+'option': args.option,
+'local_files_only': args.local_files_only}

config = SimpleNamespace(**config)

@@ -227,10 +228,11 @@ def get_args():
parser.add_argument("--sts_test_out", type=str, default="predictions/sts-test-output.csv")

# hyper parameters
parser.add_argument("--batch_size", help='sst: 64, cfimdb: 8 can fit a 12GB GPU', type=int, default=8)
parser.add_argument("--batch_size", help='sst: 64 can fit a 12GB GPU', type=int, default=64)
parser.add_argument("--hidden_dropout_prob", type=float, default=0.3)
parser.add_argument("--lr", type=float, help="learning rate, default lr for 'pretrain': 1e-3, 'finetune': 1e-5",
-default=1e-5)
+default=1e-3)
+parser.add_argument("--local_files_only", action='store_true')

args = parser.parse_args()
return args
7 changes: 7 additions & 0 deletions setup_gwdg.py
@@ -0,0 +1,7 @@
+from tokenizer import BertTokenizer
+from bert import BertModel
+
+if __name__ == "__main__":
+    # Download files
+    BertTokenizer.from_pretrained('bert-base-uncased')
+    BertModel.from_pretrained('bert-base-uncased')
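Why a separate download step: presumably the GWDG compute nodes have no outbound internet access, so the pretrained files must be cached in advance. Run python setup_gwdg.py once on a login node with network access; training jobs can then pass the new flag and stay offline, e.g. python classifier.py --option finetune --use_gpu --local_files_only.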
14 changes: 14 additions & 0 deletions setup_gwdg.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+module load anaconda3
+conda create -n dnlp-gwdg python=3.8
+source activate dnlp-gwdg
+
+pip install torch==2.0.0+cu117 torchvision==0.15.1+cu117 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu117
+pip install tqdm==4.58.0
+pip install requests==2.25.1
+pip install importlib-metadata==3.7.0
+pip install filelock==3.0.12
+pip install sklearn==0.0
+pip install tokenizers==0.10.1
+pip install explainaboard_client==0.0.7
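Two usage caveats (assumptions, not stated in the commit): because the activation happens inside the script, the environment only stays active in your shell if you run it as source setup_gwdg.sh, otherwise re-run source activate dnlp-gwdg before training; and sklearn==0.0 is the legacy PyPI alias package that simply pulls in scikit-learn.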
1 change: 1 addition & 0 deletions utils.py
@@ -1,4 +1,5 @@
from typing import Dict, List, Optional, Union, Tuple, BinaryIO
+import fnmatch
import os
import sys
import json
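fnmatch from the standard library does shell-style wildcard matching; presumably the cache-lookup helpers in utils.py use it to match already-downloaded files by name. A minimal illustration with a hypothetical file name:

import fnmatch

# True: the name matches the shell-style pattern
fnmatch.fnmatch('pytorch_model.bin', '*.bin')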
