feat: added attention layer to use all hidden states
lkaesberg committed Jun 30, 2023
1 parent f62d98e · commit 4442f48
Showing 4 changed files with 31 additions and 3 deletions.
classifier.py: 7 changes (5 additions, 2 deletions)
@@ -10,6 +10,7 @@
 from sklearn.metrics import classification_report, f1_score, recall_score, accuracy_score
 from torch.utils.tensorboard import SummaryWriter

+from layers.AttentionLayer import AttentionLayer
 # change it with respect to the original model
 from tokenizer import BertTokenizer
 from bert import BertModel
@@ -51,6 +52,7 @@ def __init__(self, config):
                 param.requires_grad = True

         # self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.attention_layer = AttentionLayer(config.hidden_size)
         self.linear_layer = nn.Linear(config.hidden_size, self.num_labels)

     def forward(self, input_ids, attention_mask):
@@ -60,7 +62,8 @@ def forward(self, input_ids, attention_mask):
         # the training loop currently uses F.cross_entropy as the loss function.
         # Cross entropy already has a softmax therefore this should be okay
         result = self.bert(input_ids, attention_mask)
-        return self.linear_layer(result['pooler_output'])
+        attention_result = self.attention_layer(result['last_hidden_state'])
+        return self.linear_layer(attention_result)


 class SentimentDataset(Dataset):
@@ -368,7 +371,7 @@ def get_args():

parser.add_argument("--lr", type=float, help="learning rate, default lr for 'pretrain': 1e-3, 'finetune': 1e-5",
default=1e-5 if args.option == 'finetune' else 1e-3)

args = parser.parse_args()
return args

layers/AttentionLayer.py: 21 changes (21 additions, 0 deletions)
@@ -0,0 +1,21 @@
+import torch
+import torch.nn as nn
+
+
+class AttentionLayer(nn.Module):
+    def __init__(self, input_size):
+        super(AttentionLayer, self).__init__()
+        self.W = nn.Linear(input_size, input_size)
+        self.v = nn.Linear(input_size, 1, bias=False)
+
+    def forward(self, embeddings):
+        # Apply linear transformation to the embeddings
+        transformed = torch.tanh(self.W(embeddings))
+
+        # Calculate attention weights
+        attention_weights = torch.softmax(self.v(transformed), dim=1)
+
+        # Apply attention weights to the embeddings
+        attended_embeddings = torch.sum(attention_weights * embeddings, dim=1)
+
+        return attended_embeddings
layers/__init__.py: empty file added
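As a quick sanity check of what the new layer does, here is a standalone sketch (not part of the commit; the batch size, sequence length, and hidden size of 768 are illustrative assumptions matching BERT-base) that pools a dummy batch of hidden states down to one vector per sequence:

import torch

from layers.AttentionLayer import AttentionLayer

# Dummy batch: 2 sequences of 8 tokens, hidden size 768 (illustrative values only).
hidden_states = torch.randn(2, 8, 768)

layer = AttentionLayer(input_size=768)
pooled = layer(hidden_states)

# The attention weights sum to 1 over the sequence dimension, so the output
# is a weighted average of the token embeddings: one vector per sequence.
print(pooled.shape)  # torch.Size([2, 768])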
multitask_classifier.py: 6 changes (5 additions, 1 deletion)
@@ -9,6 +9,7 @@
 from torch.utils.tensorboard import SummaryWriter

 from bert import BertModel
+from layers.AttentionLayer import AttentionLayer
 from optimizer import AdamW
 from tqdm import tqdm

@@ -55,6 +56,8 @@ def __init__(self, config):
             elif config.option == 'finetune':
                 param.requires_grad = True

+        self.attention_layer = AttentionLayer(config.hidden_size)
+
         self.linear_layer = nn.Linear(config.hidden_size, N_SENTIMENT_CLASSES)

         self.paraphrase_linear = nn.Linear(config.hidden_size, config.hidden_size)
@@ -68,7 +71,8 @@ def forward(self, input_ids, attention_mask):
         # (e.g., by adding other layers).

         result = self.bert(input_ids, attention_mask)
-        return result['pooler_output']
+        attention_result = self.attention_layer(result["last_hidden_state"])
+        return attention_result

     def predict_sentiment(self, input_ids, attention_mask):
         '''Given a batch of sentences, outputs logits for classifying sentiment.
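Taken together, the commit replaces the [CLS] pooler output with an attention-weighted summary of every token's final hidden state before the classification head. A minimal sketch of the resulting flow (the batch size, sequence length, and label count of 5 are illustrative assumptions, not taken from the repository; 768 matches BERT-base):

import torch
import torch.nn as nn

from layers.AttentionLayer import AttentionLayer

# Illustrative sizes only.
batch_size, seq_len, hidden_size, num_labels = 4, 16, 768, 5

# Stand-in for result['last_hidden_state'] returned by self.bert(...).
last_hidden_state = torch.randn(batch_size, seq_len, hidden_size)

attention_layer = AttentionLayer(hidden_size)
linear_layer = nn.Linear(hidden_size, num_labels)

# Previously the head consumed result['pooler_output'] (the [CLS] vector);
# now it consumes an attention-weighted sum over all token states.
pooled = attention_layer(last_hidden_state)  # (batch_size, hidden_size)
logits = linear_layer(pooled)                # (batch_size, num_labels)
print(logits.shape)  # torch.Size([4, 5])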