Merge branch 'main' into 'feat/tensorboard'
# Conflicts:
#   .gitignore
#   multitask_classifier.py
ItsNiklas committed Jun 25, 2023
2 parents 9804eb7 + 8549c5a commit 5348a65
Showing 6 changed files with 54 additions and 26 deletions.
2 changes: 2 additions & 0 deletions base_bert.py
@@ -110,6 +110,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
# Instantiate model.
model = cls(config, *model_args, **model_kwargs)

+ print("Loading checkpoint weights from", resolved_archive_file, "...")
if state_dict is None:
try:
state_dict = torch.load(resolved_archive_file, map_location="cpu")
@@ -118,6 +119,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
f"Unable to load weights from pytorch checkpoint file for '{pretrained_model_name_or_path}' "
f"at '{resolved_archive_file}'"
)
+ print("Loaded weights.")

missing_keys = []
unexpected_keys = []
13 changes: 7 additions & 6 deletions classifier.py
@@ -50,10 +50,8 @@ def __init__(self, config):
elif config.option == 'finetune':
param.requires_grad = True

- ### TODO
- self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ # self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.linear_layer = nn.Linear(config.hidden_size, self.num_labels)
- # raise NotImplementedError

def forward(self, input_ids, attention_mask):
'''Takes a batch of sentences and returns logits for sentiment classes'''
@@ -62,7 +60,7 @@ def forward(self, input_ids, attention_mask):
# the training loop currently uses F.cross_entropy as the loss function.
# Cross entropy already has a softmax therefore this should be okay
result = self.bert(input_ids, attention_mask)
- return self.linear_layer(self.dropout(result['pooler_output']))
+ return self.linear_layer(result['pooler_output'])


class SentimentDataset(Dataset):
@@ -364,9 +362,12 @@ def get_args():

parser.add_argument("--batch_size", help='sst: 64, cfimdb: 8 can fit a 12GB GPU', type=int, default=8)
parser.add_argument("--hidden_dropout_prob", type=float, default=0.3)
- parser.add_argument("--lr", type=float, help="learning rate, default lr for 'pretrain': 1e-3, 'finetune': 1e-5",
- default=1e-5)

+ args, _ = parser.parse_known_args()
+
+ parser.add_argument("--lr", type=float, help="learning rate, default lr for 'pretrain': 1e-3, 'finetune': 1e-5",
+ default=1e-5 if args.option == 'finetune' else 1e-3)

args = parser.parse_args()
return args

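A note on the get_args() change above: parsing now happens in two passes so the --lr default can depend on --option. A minimal, self-contained sketch of the pattern (assuming --option is defined as elsewhere in get_args; all other arguments omitted):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--option", type=str, choices=("pretrain", "finetune"), default="pretrain")

# First pass: parse_known_args tolerates flags that are not defined yet (such as --lr),
# so the value of --option is available before the learning-rate default is chosen.
args, _ = parser.parse_known_args()

parser.add_argument("--lr", type=float,
                    help="learning rate, default lr for 'pretrain': 1e-3, 'finetune': 1e-5",
                    default=1e-5 if args.option == 'finetune' else 1e-3)

# Second pass: parse everything with the option-dependent default in place.
args = parser.parse_args()
print(args.option, args.lr)

Running the sketch with --option pretrain prints 0.001, with --option finetune it prints 1e-05, matching the help text.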
41 changes: 29 additions & 12 deletions multitask_classifier.py
@@ -55,43 +55,60 @@ def __init__(self, config):
elif config.option == 'finetune':
param.requires_grad = True

+ self.linear_layer = nn.Linear(config.hidden_size, config.num_labels)
+
+ self.paraphrase_linear = nn.Linear(config.hidden_size, 1)
+ self.similarity_linear = nn.Linear(config.hidden_size, 1)

def forward(self, input_ids, attention_mask):
'Takes a batch of sentences and produces embeddings for them.'
# The final BERT embedding is the hidden state of [CLS] token (the first token)
# Here, you can start by just returning the embeddings straight from BERT.
# When thinking of improvements, you can later try modifying this
# (e.g., by adding other layers).
- ### TODO
- raise NotImplementedError

+ result = self.bert(input_ids, attention_mask)
+ return result['pooler_output']

def predict_sentiment(self, input_ids, attention_mask):
'''Given a batch of sentences, outputs logits for classifying sentiment.
There are 5 sentiment classes:
(0 - negative, 1- somewhat negative, 2- neutral, 3- somewhat positive, 4- positive)
Thus, your output should contain 5 logits for each sentence.
'''
- ### TODO
- raise NotImplementedError
+ return self.linear_layer(self.forward(input_ids, attention_mask))

def predict_paraphrase(self,
input_ids_1, attention_mask_1,
input_ids_2, attention_mask_2):
- '''Given a batch of pairs of sentences, outputs a single logit for predicting whether they are paraphrases.
+ """
+ Given a batch of pairs of sentences, outputs a single logit for predicting whether they are paraphrases.
Note that your output should be unnormalized (a logit); it will be passed to the sigmoid function
during evaluation, and handled as a logit by the appropriate loss function.
- '''
- ### TODO
- raise NotImplementedError
+ """

+ bert_result_1 = self.forward(input_ids_1, attention_mask_1)
+ bert_result_2 = self.forward(input_ids_2, attention_mask_2)
+
+ diff = torch.cosine_similarity(bert_result_1, bert_result_2)
+
+ return self.paraphrase_linear(diff)

def predict_similarity(self,
input_ids_1, attention_mask_1,
input_ids_2, attention_mask_2):
- '''Given a batch of pairs of sentences, outputs a single logit corresponding to how similar they are.
+ """
+ Given a batch of pairs of sentences, outputs a single logit corresponding to how similar they are.
Note that your output should be unnormalized (a logit); it will be passed to the sigmoid function
during evaluation, and handled as a logit by the appropriate loss function.
- '''
- ### TODO
- raise NotImplementedError
+ """

+ bert_embeddings_1 = self.forward(input_ids_1, attention_mask_1)
+ bert_embeddings_2 = self.forward(input_ids_2, attention_mask_2)
+
+ diff = torch.cosine_similarity(bert_embeddings_1, bert_embeddings_2)
+
+ return self.similarity_linear(diff)


def save_model(model, optimizer, args, config, filepath):
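The paraphrase and similarity heads added above both encode each sentence with the shared forward() (the pooled [CLS] embedding), take the cosine similarity of the two embeddings, and feed the result to a small linear layer to get an unnormalized logit. A rough standalone sketch of that idea (names are illustrative; since cosine similarity yields one scalar per pair, this sketch maps it through a 1-to-1 linear head):

import torch
import torch.nn as nn

class PairHead(nn.Module):
    # Scores a sentence pair from two pooled BERT embeddings.
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(1, 1)  # cosine similarity is a single scalar per pair

    def forward(self, emb_1, emb_2):
        # emb_1, emb_2: (batch, hidden_size) pooler outputs
        sim = torch.cosine_similarity(emb_1, emb_2, dim=1)  # (batch,)
        return self.linear(sim.unsqueeze(-1)).squeeze(-1)   # (batch,) unnormalized logits

head = PairHead()
e1, e2 = torch.randn(4, 768), torch.randn(4, 768)  # stand-ins for pooler outputs
logits = head(e1, e2)  # pass to torch.sigmoid or BCEWithLogitsLoss downstream
print(logits.shape)    # torch.Size([4])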
4 changes: 2 additions & 2 deletions run_train.sh
@@ -14,7 +14,7 @@

module load anaconda3
module load cuda
- source activate dl-gpu # Or whatever you called your environment.
+ source activate dnlp2 # Or whatever you called your environment.

# Printing out some info.
echo "Submitting job with sbatch from directory: ${SLURM_SUBMIT_DIR}"
@@ -28,4 +28,4 @@ python -m torch.utils.collect_env
nvcc -V

# Run the script:
- python -u multitask_classifier.py --use_gpu
+ python -u multitask_classifier.py --use_gpu
8 changes: 4 additions & 4 deletions setup.sh
@@ -1,14 +1,14 @@
#!/usr/bin/env bash

- conda create -n dnlp python=3.8
- conda activate dnlp
+ conda create -n dnlp2 python=3.10
+ conda activate dnlp2

- conda install pytorch==1.8.0 torchvision torchaudio cudatoolkit=10.1 -c pytorch
+ conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
pip install tqdm==4.58.0
pip install requests==2.25.1
pip install importlib-metadata==3.7.0
pip install filelock==3.0.12
pip install sklearn==0.0
- pip install tokenizers==0.10.1
+ pip install tokenizers==0.13.2
pip install explainaboard_client==0.0.7
pip install tensorboard
12 changes: 10 additions & 2 deletions utils.py
@@ -1,4 +1,6 @@
from typing import Dict, List, Optional, Union, Tuple, BinaryIO
+ import fnmatch
+ import socket
import os
import sys
import json
@@ -145,7 +147,7 @@ def get_from_cache(

url_to_download = url
etag = None
- if not local_files_only:
+ if not 'gpu' in socket.gethostname() and not local_files_only:
try:
r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout)
r.raise_for_status()
@@ -205,6 +207,12 @@ def get_from_cache(
if os.path.exists(cache_path) and not force_download:
return cache_path

+ if 'gpu' in socket.gethostname():
+ raise FileNotFoundError(
+ "Cannot find the requested files in the cached path and outgoing traffic"
+ " is not enabled."
+ )

# Prevent parallel downloads of the same file with a lock.
lock_path = cache_path + ".lock"
with FileLock(lock_path):
@@ -343,4 +351,4 @@ def get_extended_attention_mask(attention_mask: Tensor, dtype) -> Tensor:
extended_attention_mask = attention_mask[:, None, None, :]
extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
- return extended_attention_mask
+ return extended_attention_mask
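The get_from_cache() changes above add an offline guard for the cluster: on nodes whose hostname contains "gpu" there is no outgoing traffic, so the ETag lookup is skipped and only files already in the local cache are accepted. A rough standalone sketch of that pattern (resolve_cached_file and its cache_path argument are illustrative, not the repo's API):

import os
import socket

def resolve_cached_file(cache_path):
    # Compute nodes whose hostname contains "gpu" are assumed to have no internet
    # access, so never attempt a download there -- rely on the local cache only.
    offline = "gpu" in socket.gethostname()

    if os.path.exists(cache_path):
        return cache_path

    if offline:
        raise FileNotFoundError(
            "Cannot find the requested files in the cached path and outgoing traffic is not enabled."
        )

    # Online case: the real get_from_cache() would download and cache the file here.
    raise NotImplementedError("download omitted in this sketch")

On a login node an uncached path falls through to the download branch; on a gpu* node the call fails fast instead of hanging on a network timeout.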
