🔀 Merge branch 'main' into feat/sophia
ItsNiklas committed Aug 14, 2023
2 parents 8fcdf82 + 3cff0ff commit 55caa1c
Showing 15 changed files with 42,385 additions and 42,159 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -23,7 +23,7 @@ git remote -v
If you want to update using the upstream repository, use the following commands:
````
git fetch upstream
git rebase upstream/main
git merge upstream/main
````

## Usage
9 changes: 7 additions & 2 deletions bert.py
@@ -46,9 +46,14 @@ def attention(self, key: Tensor, query: Tensor, value: Tensor, attention_mask: T
# multiply the attention scores by the values to get back V'
# next, concatenate the heads to recover the original shape [bs, seq_len, num_attention_heads * attention_head_size = hidden_size]

# key, query, value: [bs, num_attention_heads, seq_len, attention_head_size]
# attention_mask: [bs, 1, 1, seq_len]
# output: [bs, seq_len, num_attention_heads * attention_head_size = hidden_size]
# Note: the attention_mask is used to mask out the padding tokens
bs, h, seq_len, d_k = key.shape
S = query @ torch.transpose(key, 2, 3) + attention_mask

# normalize the scores
result = torch.softmax((S / math.sqrt(d_k)), 3) @ value
return result.transpose(1, 2).reshape(bs, seq_len, h * d_k)
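
As a sanity check on the snippet above, here is a standalone sketch of the same scaled dot-product step with explicit tensor sizes; the sizes are illustrative and do not come from the model config:

```python
import math
import torch

# illustrative sizes, not the model's actual configuration
bs, h, seq_len, d_k = 2, 12, 8, 64
key = torch.randn(bs, h, seq_len, d_k)
query = torch.randn(bs, h, seq_len, d_k)
value = torch.randn(bs, h, seq_len, d_k)
attention_mask = torch.zeros(bs, 1, 1, seq_len)  # large negative entries would mask padding tokens

S = query @ key.transpose(2, 3) + attention_mask               # [bs, h, seq_len, seq_len]
result = torch.softmax(S / math.sqrt(d_k), dim=3) @ value      # [bs, h, seq_len, d_k]
result = result.transpose(1, 2).reshape(bs, seq_len, h * d_k)  # [bs, seq_len, hidden_size]
assert result.shape == (bs, seq_len, h * d_k)
```

As in the code above, the mask is added before the 1/√d_k scaling; with the usual large negative padding values this makes no practical difference to the softmax.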

@@ -94,7 +99,7 @@ def add_norm(self, input, output, dense_layer, dropout, ln_layer):
dropout: the dropout to be applied
ln_layer: the layer norm to be applied
"""
# Hint: Remember that BERT applies dropout to the output of each sub-layer, before it is added to the sub-layer input and normalized
# apply dropout and the dense layer to the sub-layer output, add the skip connection, then apply layer norm
return ln_layer(input + dense_layer(dropout(output)))

def forward(self, hidden_states, attention_mask):
@@ -107,7 +112,7 @@ def forward(self, hidden_states, attention_mask):
3. a feed forward layer
4. an add-norm that takes the input and output of the feed forward layer
"""
### TODO
# apply multi-head attention
multi_head = self.self_attention(hidden_states, attention_mask)

add_norm_1 = self.add_norm(hidden_states, multi_head, self.attention_dense, self.attention_dropout,
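The rest of this forward method is collapsed in the diff. For orientation only, here is a self-contained sketch of how the four steps in the docstring typically compose; apart from attention_dense and attention_dropout, which appear above, the layer names and sizes are illustrative guesses rather than this repository's code:

```python
import torch
from torch import nn


class EncoderLayerSketch(nn.Module):
    """Illustrative composition of the four steps; not the repository's BertLayer."""

    def __init__(self, hidden=8, heads=2, intermediate=16, p=0.1):
        super().__init__()
        self.self_attention = nn.MultiheadAttention(hidden, heads, batch_first=True)
        self.attention_dense = nn.Linear(hidden, hidden)
        self.attention_dropout = nn.Dropout(p)
        self.attention_layer_norm = nn.LayerNorm(hidden)
        self.interm_dense = nn.Linear(hidden, intermediate)
        self.interm_af = nn.GELU()
        self.out_dense = nn.Linear(intermediate, hidden)
        self.out_dropout = nn.Dropout(p)
        self.out_layer_norm = nn.LayerNorm(hidden)

    def add_norm(self, input, output, dense_layer, dropout, ln_layer):
        # same residual pattern as the add_norm helper above
        return ln_layer(input + dense_layer(dropout(output)))

    def forward(self, hidden_states, key_padding_mask=None):
        # 1. multi-head attention over the inputs
        multi_head, _ = self.self_attention(hidden_states, hidden_states, hidden_states,
                                            key_padding_mask=key_padding_mask)
        # 2. add-norm over the attention output
        attn_out = self.add_norm(hidden_states, multi_head,
                                 self.attention_dense, self.attention_dropout,
                                 self.attention_layer_norm)
        # 3. position-wise feed-forward layer
        interm = self.interm_af(self.interm_dense(attn_out))
        # 4. add-norm over the feed-forward output
        return self.add_norm(attn_out, interm,
                             self.out_dense, self.out_dropout, self.out_layer_norm)
```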
53 changes: 29 additions & 24 deletions classifier.py
@@ -11,6 +11,7 @@
from sklearn.metrics import classification_report, f1_score, recall_score, accuracy_score
from torch.utils.tensorboard import SummaryWriter

from layers.AttentionLayer import AttentionLayer
# changed with respect to the original model
from tokenizer import BertTokenizer
from bert import BertModel
@@ -42,7 +43,7 @@ class BertSentimentClassifier(torch.nn.Module):
def __init__(self, config):
super(BertSentimentClassifier, self).__init__()
self.num_labels = config.num_labels
self.bert = BertModel.from_pretrained("bert-base-uncased")
self.bert = BertModel.from_pretrained('bert-base-uncased', local_files_only=args.local_files_only)

# Pretrain mode does not require updating BERT parameters.
for param in self.bert.parameters():
@@ -52,6 +53,8 @@ def __init__(self, config):
param.requires_grad = True

# self.dropout = nn.Dropout(config.hidden_dropout_prob)
# attention layer over the token embeddings, followed by a linear layer to get logits
self.attention_layer = AttentionLayer(config.hidden_size)
self.linear_layer = nn.Linear(config.hidden_size, self.num_labels)

def forward(self, input_ids, attention_mask):
@@ -60,15 +63,18 @@ def forward(self, input_ids, attention_mask):
# HINT: you should consider what is the appropriate output to return given that
# the training loop currently uses F.cross_entropy as the loss function.
# Cross entropy already includes a softmax, therefore this should be okay

# No dropout here: this is the last layer before the softmax, and adding dropout hurt performance
result = self.bert(input_ids, attention_mask)
return self.linear_layer(result['pooler_output'])
attention_result = self.attention_layer(result['last_hidden_state'])
return self.linear_layer(attention_result)
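
The imported AttentionLayer is not part of this diff. A common way to implement such attention pooling over the token embeddings looks roughly like the sketch below; this is an assumption about its behavior, not the repository's layers/AttentionLayer.py:

```python
import torch
from torch import nn


class AttentionPoolingSketch(nn.Module):
    """Scores each token, then returns the attention-weighted sum of hidden states."""

    def __init__(self, hidden_size: int):
        super().__init__()
        self.scorer = nn.Linear(hidden_size, 1)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # hidden_states: [bs, seq_len, hidden_size]
        weights = torch.softmax(self.scorer(hidden_states), dim=1)  # [bs, seq_len, 1]
        return (weights * hidden_states).sum(dim=1)                 # [bs, hidden_size]
```

Whatever the exact implementation, its output must be [bs, hidden_size] so that the following linear layer can produce [bs, num_labels] logits.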


class SentimentDataset(Dataset):
def __init__(self, dataset, args):
self.dataset = dataset
self.p = args
self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', local_files_only=args.local_files_only)

def __len__(self):
return len(self.dataset)
@@ -106,7 +112,7 @@ class SentimentTestDataset(Dataset):
def __init__(self, dataset, args):
self.dataset = dataset
self.p = args
self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', local_files_only=args.local_files_only)

def __len__(self):
return len(self.dataset)
@@ -261,13 +267,12 @@ def train(args):
)

# Init model
config = {
"hidden_dropout_prob": args.hidden_dropout_prob,
"num_labels": num_labels,
"hidden_size": 768,
"data_dir": ".",
"option": args.option,
}
config = {'hidden_dropout_prob': args.hidden_dropout_prob,
'num_labels': num_labels,
'hidden_size': 768,
'data_dir': '.',
'option': args.option,
'local_files_only': args.local_files_only}

config = SimpleNamespace(**config)
ctx = nullcontext() if not args.use_gpu else torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16)
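
The autocast context built here is presumably entered around the forward/backward pass later in train; a standalone illustration of the pattern (placeholder model and data, not the script's training loop):

```python
import torch
from contextlib import nullcontext

use_gpu = torch.cuda.is_available()
device = 'cuda' if use_gpu else 'cpu'
ctx = nullcontext() if not use_gpu else torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16)

model = torch.nn.Linear(4, 2).to(device)            # placeholder model
x = torch.randn(8, 4, device=device)                # placeholder batch
labels = torch.randint(0, 2, (8,), device=device)   # placeholder labels

with ctx:  # matmuls run in bfloat16 on CUDA, full precision on CPU
    logits = model(x)
    loss = torch.nn.functional.cross_entropy(logits, labels)
loss.backward()
```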
@@ -286,6 +291,7 @@

best_dev_acc = 0

# Initialize the tensorboard writer
name = f"{datetime.now().strftime('%Y%m%d-%H%M%S')}-lr={lr}-optimizer={type(optimizer).__name__}"
writer = SummaryWriter(log_dir=args.logdir + "/classifier/" + name)

@@ -333,6 +339,7 @@ def train(args):
optimizer.zero_grad(set_to_none=True)

train_loss += loss.item()

writer.add_scalar("Loss/Minibatches", loss.item(), loss_idx_value)
loss_idx_value += 1
num_batches += 1
@@ -408,29 +415,26 @@ def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=11711)
parser.add_argument("--epochs", type=int, default=10)
parser.add_argument(
"--option",
type=str,
help="pretrain: the BERT parameters are frozen; finetune: BERT parameters are updated",
choices=("pretrain", "finetune"),
default="pretrain",
)
parser.add_argument("--use_gpu", action="store_true")
parser.add_argument("--dev_out", type=str, default="cfimdb-dev-output.txt")
parser.add_argument("--test_out", type=str, default="cfimdb-test-output.txt")
parser.add_argument("--option", type=str,
help='pretrain: the BERT parameters are frozen; finetune: BERT parameters are updated',
choices=('pretrain', 'finetune'), default="pretrain")
parser.add_argument("--use_gpu", action='store_true')

parser.add_argument("--logdir", type=str, default="logdir")
parser.add_argument("--dev_out", type=str, default="sst-dev-out.csv")
parser.add_argument("--test_out", type=str, default="sst-test-out.csv")

parser.add_argument("--batch_size", help='sst: 64, cfimdb: 8 can fit a 12GB GPU', type=int, default=8)
parser.add_argument("--batch_size", help='sst: 64 can fit a 12GB GPU', type=int, default=64)
parser.add_argument("--hidden_dropout_prob", type=float, default=0.3)
parser.add_argument("--optimizer", type=str, default="adamw")
parser.add_argument("--local_files_only", action='store_true')

args, _ = parser.parse_known_args()

# TODO: Possibly change defaults based on optimizer
parser.add_argument("--lr", type=float, help="learning rate, default lr for 'pretrain': 1e-3, 'finetune': 1e-5",
default=1e-5 if args.option == 'finetune' else 1e-3)

args = parser.parse_args()
return args

@@ -455,7 +459,8 @@ def get_args():
dev_out='predictions/' + args.option + '-sst-dev-out.csv',
test_out='predictions/' + args.option + '-sst-test-out.csv',
logdir=args.logdir,
optimizer=args.optimizer
optimizer=args.optimizer,
local_files_only=args.local_files_only
)

train(config)
