diff --git a/bert.py b/bert.py
index e1766d7..dade026 100644
--- a/bert.py
+++ b/bert.py
@@ -46,9 +46,14 @@ def attention(self, key: Tensor, query: Tensor, value: Tensor, attention_mask: T
         # multiply the attention scores to the value and get back V'
         # next, we need to concat multi-heads and recover the original shape [bs, seq_len, num_attention_heads * attention_head_size = hidden_size]
+        # key, query, value: [bs, num_attention_heads, seq_len, attention_head_size]
+        # attention_mask: [bs, 1, 1, seq_len]
+        # output: [bs, seq_len, num_attention_heads * attention_head_size = hidden_size]
+        # Note: the attention_mask is used to mask out the padding tokens
         bs, h, seq_len, d_k = key.shape
         S = query @ torch.transpose(key, 2, 3) + attention_mask
+        # scale by sqrt(d_k) and normalize the scores with softmax
         result = torch.softmax((S / math.sqrt(d_k)), 3) @ value
         return result.transpose(1, 2).reshape(bs, seq_len, h * d_k)
@@ -94,7 +99,7 @@ def add_norm(self, input, output, dense_layer, dropout, ln_layer):
         dropout: the dropout to be applied
         ln_layer: the layer norm to be applied
         """
-        # Hint: Remember that BERT applies to the output of each sub-layer, before it is added to the sub-layer input and normalized
+        # dropout and project the sub-layer output, add the skip connection, then apply layer norm
         return ln_layer(input + dense_layer(dropout(output)))

     def forward(self, hidden_states, attention_mask):
@@ -107,7 +112,7 @@ def forward(self, hidden_states, attention_mask):
         3. a feed forward layer
         4. a add-norm that takes the input and output of the feed forward layer
         """
-        ### TODO
+        # apply multi-head attention
         multi_head = self.self_attention(hidden_states, attention_mask)
         add_norm_1 = self.add_norm(hidden_states, multi_head, self.attention_dense, self.attention_dropout,
diff --git a/classifier.py b/classifier.py
index 38bc7a5..1e6f77d 100644
--- a/classifier.py
+++ b/classifier.py
@@ -51,6 +51,7 @@ def __init__(self, config):
                 param.requires_grad = True

         # self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # linear layer to get logits
         self.linear_layer = nn.Linear(config.hidden_size, self.num_labels)

     def forward(self, input_ids, attention_mask):
@@ -59,6 +60,7 @@ def forward(self, input_ids, attention_mask):
         # HINT: you should consider what is the appropriate output to return given that
         # the training loop currently uses F.cross_entropy as the loss function.
         # Cross entropy already has a softmax therefore this should be okay
+        # no dropout here: it is the last layer before the softmax, and dropout there made performance worse
         result = self.bert(input_ids, attention_mask)
         return self.linear_layer(result['pooler_output'])
@@ -266,6 +268,7 @@ def train(args):
     optimizer = AdamW(model.parameters(), lr=lr)
     best_dev_acc = 0

+    # initialize the TensorBoard writer
     name = f"{datetime.now().strftime('%Y%m%d-%H%M%S')}-lr={lr}-optimizer={type(optimizer).__name__}"
     writer = SummaryWriter(log_dir=args.logdir + "/classifier/" + name)
@@ -290,6 +293,7 @@ def train(args):
             optimizer.step()

             train_loss += loss.item()
+            writer.add_scalar("Loss/Minibatches", loss.item(), loss_idx_value)
             loss_idx_value += 1
             num_batches += 1
@@ -362,7 +366,6 @@ def get_args():
     parser.add_argument("--logdir", type=str, default="logdir")
     parser.add_argument("--dev_out", type=str, default="sst-dev-out.csv")
     parser.add_argument("--test_out", type=str, default="sst-test-out.csv")
-
     parser.add_argument("--batch_size", help='sst: 64 can fit a 12GB GPU', type=int, default=64)
     parser.add_argument("--hidden_dropout_prob", type=float, default=0.3)
@@ -372,7 +375,7 @@ def get_args():
     parser.add_argument("--lr", type=float, help="learning rate, default lr for 'pretrain': 1e-3, 'finetune': 1e-5",
                         default=1e-5 if args.option == 'finetune' else 1e-3)
-
+
     args = parser.parse_args()
     return args
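Below is a small standalone sketch of the attention computation from the bert.py hunk above, under the shape assumptions stated in its comments ([bs, num_attention_heads, seq_len, attention_head_size] inputs and an additive [bs, 1, 1, seq_len] mask). The tensor sizes and padding positions are made up for illustration; only the math mirrors the diff.

import math
import torch

def attention_sketch(key, query, value, attention_mask):
    # key, query, value: [bs, num_attention_heads, seq_len, attention_head_size]
    # attention_mask: [bs, 1, 1, seq_len]; 0 for real tokens, a large negative
    # number for padding, so padded positions get ~0 weight after the softmax
    bs, h, seq_len, d_k = key.shape
    # raw scores plus additive mask: [bs, h, seq_len, seq_len]
    S = query @ torch.transpose(key, 2, 3) + attention_mask
    # scale by sqrt(d_k) and normalize over the key positions
    probs = torch.softmax(S / math.sqrt(d_k), dim=3)
    # weight the values, then concatenate the heads back to hidden_size
    result = probs @ value                                  # [bs, h, seq_len, d_k]
    return result.transpose(1, 2).reshape(bs, seq_len, h * d_k)

# made-up sizes: 2 sentences, 12 heads, 8 tokens, 64 dims per head
bs, h, seq_len, d_k = 2, 12, 8, 64
q = k = v = torch.randn(bs, h, seq_len, d_k)
mask = torch.zeros(bs, 1, 1, seq_len)
mask[:, :, :, -2:] = -10000.0                               # treat the last two tokens as padding
print(attention_sketch(k, q, v, mask).shape)                # torch.Size([2, 8, 768])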
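The classifier.py forward returns the raw logits from self.linear_layer because F.cross_entropy applies the (log-)softmax internally, as the diff's comment notes. A quick check with made-up tensors:

import torch
import torch.nn.functional as F

logits = torch.randn(4, 5)              # [batch, num_labels], as returned by the linear layer
labels = torch.tensor([0, 2, 1, 4])

loss = F.cross_entropy(logits, labels)  # softmax is applied inside the loss
manual = F.nll_loss(F.log_softmax(logits, dim=-1), labels)
print(torch.allclose(loss, manual))     # True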
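The train() hunks log the loss of every minibatch to TensorBoard under the tag "Loss/Minibatches". A minimal usage sketch, with a placeholder run name and dummy loss values standing in for the real training loop:

from datetime import datetime
from torch.utils.tensorboard import SummaryWriter

# run name follows the pattern used in train(); the lr/optimizer values here are placeholders
name = f"{datetime.now().strftime('%Y%m%d-%H%M%S')}-lr=1e-05-optimizer=AdamW"
writer = SummaryWriter(log_dir="logdir/classifier/" + name)

loss_idx_value = 0
for step in range(10):                  # dummy loop with fabricated loss values
    fake_loss = 1.0 / (step + 1)
    writer.add_scalar("Loss/Minibatches", fake_loss, loss_idx_value)
    loss_idx_value += 1
writer.close()
# inspect with: tensorboard --logdir logdir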