From 4442f48d02c392f0524e73bbb3ebe68a453bf94f Mon Sep 17 00:00:00 2001
From: lkaesberg
Date: Fri, 30 Jun 2023 14:03:52 +0200
Subject: [PATCH] feat: added attention layer to use all hidden states

---
 classifier.py            |  7 +++++--
 layers/AttentionLayer.py | 21 +++++++++++++++++++++
 layers/__init__.py       |  0
 multitask_classifier.py  |  6 +++++-
 4 files changed, 31 insertions(+), 3 deletions(-)
 create mode 100644 layers/AttentionLayer.py
 create mode 100644 layers/__init__.py

diff --git a/classifier.py b/classifier.py
index d008627..9d51ad5 100644
--- a/classifier.py
+++ b/classifier.py
@@ -10,6 +10,7 @@
 from sklearn.metrics import classification_report, f1_score, recall_score, accuracy_score
 from torch.utils.tensorboard import SummaryWriter
 
+from layers.AttentionLayer import AttentionLayer
 # change it with respect to the original model
 from tokenizer import BertTokenizer
 from bert import BertModel
@@ -51,6 +52,7 @@ def __init__(self, config):
                 param.requires_grad = True
 
         # self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.attention_layer = AttentionLayer(config.hidden_size)
         self.linear_layer = nn.Linear(config.hidden_size, self.num_labels)
@@ -60,7 +62,8 @@ def forward(self, input_ids, attention_mask):
         # the training loop currently uses F.cross_entropy as the loss function.
         # Cross entropy already has a softmax therefore this should be okay
         result = self.bert(input_ids, attention_mask)
-        return self.linear_layer(result['pooler_output'])
+        attention_result = self.attention_layer(result['last_hidden_state'])
+        return self.linear_layer(attention_result)
 
 
 class SentimentDataset(Dataset):
@@ -368,7 +371,7 @@ def get_args():
     parser.add_argument("--lr", type=float, help="learning rate, default lr for 'pretrain': 1e-3, 'finetune': 1e-5",
                         default=1e-5 if args.option == 'finetune' else 1e-3)
-    
+
     args = parser.parse_args()
     return args
diff --git a/layers/AttentionLayer.py b/layers/AttentionLayer.py
new file mode 100644
index 0000000..00ca177
--- /dev/null
+++ b/layers/AttentionLayer.py
@@ -0,0 +1,21 @@
+import torch
+import torch.nn as nn
+
+
+class AttentionLayer(nn.Module):
+    def __init__(self, input_size):
+        super(AttentionLayer, self).__init__()
+        self.W = nn.Linear(input_size, input_size)
+        self.v = nn.Linear(input_size, 1, bias=False)
+
+    def forward(self, embeddings):
+        # Apply linear transformation to the embeddings
+        transformed = torch.tanh(self.W(embeddings))
+
+        # Calculate attention weights
+        attention_weights = torch.softmax(self.v(transformed), dim=1)
+
+        # Apply attention weights to the embeddings
+        attended_embeddings = torch.sum(attention_weights * embeddings, dim=1)
+
+        return attended_embeddings
diff --git a/layers/__init__.py b/layers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/multitask_classifier.py b/multitask_classifier.py
index 214a5c1..3defe0f 100644
--- a/multitask_classifier.py
+++ b/multitask_classifier.py
@@ -9,6 +9,7 @@
 from torch.utils.tensorboard import SummaryWriter
 
 from bert import BertModel
+from layers.AttentionLayer import AttentionLayer
 from optimizer import AdamW
 from tqdm import tqdm
 
@@ -55,6 +56,8 @@ def __init__(self, config):
             elif config.option == 'finetune':
                 param.requires_grad = True
 
+        self.attention_layer = AttentionLayer(config.hidden_size)
+
         self.linear_layer = nn.Linear(config.hidden_size, N_SENTIMENT_CLASSES)
 
         self.paraphrase_linear = nn.Linear(config.hidden_size, config.hidden_size)
@@ -68,7 +71,8 @@ def forward(self, input_ids, attention_mask):
         # (e.g., by adding other layers).
         result = self.bert(input_ids, attention_mask)
-        return result['pooler_output']
+        attention_result = self.attention_layer(result["last_hidden_state"])
+        return attention_result
 
     def predict_sentiment(self, input_ids, attention_mask):
         '''Given a batch of sentences, outputs logits for classifying sentiment.
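
Notes (not part of the patch): the added AttentionLayer replaces the single [CLS] pooler output with additive-attention pooling over all hidden states. Below is a minimal, self-contained sketch of that mechanism; the class name AttentionPooling, the dummy batch size, sequence length, and hidden size are illustrative assumptions, not values taken from the project.

    import torch
    import torch.nn as nn


    class AttentionPooling(nn.Module):
        """Additive attention pooling: score each token, softmax over the
        sequence dimension, then return the weighted sum of hidden states."""

        def __init__(self, hidden_size):
            super().__init__()
            self.W = nn.Linear(hidden_size, hidden_size)
            self.v = nn.Linear(hidden_size, 1, bias=False)

        def forward(self, hidden_states):
            # hidden_states: (batch_size, seq_len, hidden_size)
            scores = self.v(torch.tanh(self.W(hidden_states)))      # (B, T, 1)
            weights = torch.softmax(scores, dim=1)                   # (B, T, 1), sums to 1 over T
            return torch.sum(weights * hidden_states, dim=1)         # (B, H)


    # Hypothetical usage with random tensors standing in for
    # result['last_hidden_state'] from the project's BertModel.
    hidden_states = torch.randn(8, 128, 768)   # batch of 8, 128 tokens, hidden size 768
    pooled = AttentionPooling(768)(hidden_states)
    print(pooled.shape)                        # torch.Size([8, 768])

One design point the sketch makes visible: the softmax runs over every position, including padding tokens, so masking padded positions (e.g. setting their scores to -inf based on attention_mask before the softmax) would be a possible refinement; the patched AttentionLayer does not do this.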