From 82d6bd47a4880f57500349c7faafbe6d3f19098b Mon Sep 17 00:00:00 2001
From: Sebastian Raschka
Date: Mon, 29 Apr 2024 21:56:05 -0500
Subject: [PATCH] use training set len (#137)

---
 .../02_bonus_additional-experiments/README.md | 34 +++++++++----------
 .../additional-experiments.py                 | 10 ++++--
 .../download-prepare-dataset.py               |  2 +-
 .../03_bonus_imdb-classification/train-gpt.py | 12 +++++--
 4 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/ch06/02_bonus_additional-experiments/README.md b/ch06/02_bonus_additional-experiments/README.md
index 47a215e2..59fa201a 100644
--- a/ch06/02_bonus_additional-experiments/README.md
+++ b/ch06/02_bonus_additional-experiments/README.md
@@ -9,16 +9,16 @@ For example,

 &nbsp;

-| | Model | Weights | Trainable token | Trainable layers | Context length | CPU/GPU | Training time | Training acc | Validation acc | Test acc |
-|---|--------------------|------------|-----------------|------------------|-------------------------|---------|---------------|--------------|----------------|----------|
-| 1 | gpt2-small (124M) | pretrained | last | last_block | longest train ex. (120) | V100 | 0.39 min | 96.63% | 97.99% | 94.33% |
-| 2 | gpt2-small (124M) | pretrained | first | last_block | longest train ex. (120) | V100 | 0.37 min | 78.46% | 80.54% | 75.00% |
-| 3 | gpt2-small (124M) | pretrained | last | last_layer | longest train ex. (120) | V100 | 0.33 min | 78.65% | 87.25% | 78.33% |
-| 4 | gpt2-small (124M) | pretrained | last | all | longest train ex. (120) | V100 | 0.94 min | 99.62% | 96.64% | 96.33% |
-| 5 | gpt2-medium (355M) | pretrained | last | last_block | longest train ex. (120) | V100 | 0.91 min | 87.50% | 51.01% | 56.67% |
-| 6 | gpt2-large (774M) | pretrained | last | last_block | longest train ex. (120) | V100 | 1.91 min | 99.52% | 98.66% | 96.67% |
-| 7 | gpt2-small (124M) | random | last | all | longest train ex. (120) | V100 | 0.93 min | 100% | 97.32% | 93.00% |
-| 8 | gpt2-small (124M) | pretrained | last | last_block | context length (1024) | V100 | 3.24 min | 83.08% | 87.92% | 78.33% |
+|      | Model              | Weights    | Trainable token | Trainable layers | Context length          | CPU/GPU | Training time | Training acc | Validation acc | Test acc |
+| ---- | ------------------ | ---------- | --------------- | ---------------- | ----------------------- | ------- | ------------- | ------------ | -------------- | -------- |
+| 1    | gpt2-small (124M)  | pretrained | last            | last_block       | longest train ex. (120) | V100    | 0.39 min      | 96.63%       | 99.33%         | 95.00%   |
+| 2    | gpt2-small (124M)  | pretrained | first           | last_block       | longest train ex. (120) | V100    | 0.37 min      | 78.46%       | 80.54%         | 75.00%   |
+| 3    | gpt2-small (124M)  | pretrained | last            | last_layer       | longest train ex. (120) | V100    | 0.33 min      | 78.65%       | 79.87%         | 72.00%   |
+| 4    | gpt2-small (124M)  | pretrained | last            | all              | longest train ex. (120) | V100    | 0.94 min      | 99.62%       | 96.64%         | 96.67%   |
+| 5    | gpt2-medium (355M) | pretrained | last            | last_block       | longest train ex. (120) | V100    | 0.91 min      | 87.50%       | 91.28%         | 84.67%   |
+| 6    | gpt2-large (774M)  | pretrained | last            | last_block       | longest train ex. (120) | V100    | 1.91 min      | 99.52%       | 98.66%         | 96.67%   |
+| 7    | gpt2-small (124M)  | random     | last            | all              | longest train ex. (120) | V100    | 0.93 min      | 100%         | 96.64%         | 93.67%   |
+| 8    | gpt2-small (124M)  | pretrained | last            | last_block       | context length (1024)   | V100    | 3.24 min      | 83.08%       | 87.92%         | 78.33%   |

 &nbsp;

@@ -30,24 +30,24 @@ You can use the following code to reproduce the experiments:

 - Row 2: `python additional-experiments.py --trainable_token first`
 - Row 3: `python additional-experiments.py --trainable_layers last_layer`
 - Row 4: `python additional-experiments.py --trainable_layers all`
-- Row 5: `python additional-experiments.py --model_size gpt2-medium (355M)`
-- Row 6: `python additional-experiments.py --model_size gpt2-large (774M)`
+- Row 5: `python additional-experiments.py --model_size "gpt2-medium (355M)"`
+- Row 6: `python additional-experiments.py --model_size "gpt2-large (774M)"`
 - Row 7: `python additional-experiments.py --weights random --trainable_layers all`
 - Row 8: `python additional-experiments.py --context_length "model_context_length"`

 I've kept the LLM and dataset small on purpose, so you can run the training on a regular laptop like a MacBook Air M3 in about 15 minutes in case you don't have access to a GPU.

-
+

 &nbsp;

 ### Interpretation

-1. **Training the Last vs. First Output Token (Row 1 vs. 2)**: Training the last output token results in significantly better performance compared to the first. This improvement is expected due to the causal self-attention mask.
+1. **Training the Last vs. First Output Token (Row 1 vs. 2)**: Training the last output token results in substantially better performance compared to the first. This improvement is expected due to the causal self-attention mask.

-2. **Training the Last Transformer Block vs. Last Layer (Row 1 vs. 3)**: Training the entire last transformer block is much more effective than training only the last layer.
+2. **Training the Last Transformer Block vs. Last Layer (Row 1 vs. 3)**: Training the entire last transformer block also yields substantially better results than training only the last layer.

-3. **Training All Layers vs. Last Transformer Block (Row 1 vs. 4)**: Training all layers shows a modest improvement of 2% over just training the last transformer block, but it requires almost three times longer in terms of training duration.
+3. **Training All Layers vs. Last Transformer Block (Row 1 vs. 4)**: Training all layers shows a modest improvement of ~2% over just training the last transformer block, but it requires almost three times longer in terms of training duration.

-4. **Using Larger Pretrained Models (Row 1 vs 5, and Row 1 vs. 6)**: Employing a 3x larger pretrained model leads to worse results. However, using a 5x larger model improves performance compared to the initial model, as was anticipated.
+4. **Using Larger Pretrained Models (Row 1 vs. 5, and Row 1 vs. 6)**: Employing a 3x larger pretrained model leads to worse results. However, using a 5x larger model improves performance compared to the initial model, as was anticipated. (The medium model may not have been pretrained very well, or the particular finetuning configuration may not work as well for this model.)

 5. **Using a Model with Random Weights vs. Pretrained Weights (Row 1 vs. 7)**: Utilizing a model with random weights yields results that are only slightly worse by 1.3% compared to using pretrained weights.
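For readers reproducing the table rows above: the `--trainable_layers` options control which parameters are unfrozen before finetuning. The sketch below is illustrative only; it assumes a GPT-style model exposing `trf_blocks`, `final_norm`, and `out_head` attributes (names follow the book's `GPTModel`), and the actual `additional-experiments.py` script may differ in detail.

```python
# Illustrative sketch (not the script's exact code): how the --trainable_layers
# options map onto freezing/unfreezing parameters of a GPT-style classifier.
# Assumes `model` has `trf_blocks`, `final_norm`, and `out_head` attributes.
def set_trainable_layers(model, trainable_layers="last_block"):
    # Freeze everything first.
    for param in model.parameters():
        param.requires_grad = False

    if trainable_layers == "last_layer":
        # Train only the classification head.
        for param in model.out_head.parameters():
            param.requires_grad = True
    elif trainable_layers == "last_block":
        # Train the classification head, the last transformer block,
        # and the final layer norm.
        for module in (model.out_head, model.trf_blocks[-1], model.final_norm):
            for param in module.parameters():
                param.requires_grad = True
    elif trainable_layers == "all":
        # Train every parameter in the model.
        for param in model.parameters():
            param.requires_grad = True
    else:
        raise ValueError(f"Invalid --trainable_layers argument: {trainable_layers}")
```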
diff --git a/ch06/02_bonus_additional-experiments/additional-experiments.py b/ch06/02_bonus_additional-experiments/additional-experiments.py
index b9e824dd..81809a26 100644
--- a/ch06/02_bonus_additional-experiments/additional-experiments.py
+++ b/ch06/02_bonus_additional-experiments/additional-experiments.py
@@ -123,6 +123,9 @@ def instantiate_model(choose_model, load_weights):
     }

     BASE_CONFIG.update(model_configs[choose_model])
+
+    if not load_weights:
+        torch.manual_seed(123)

     model = GPTModel(BASE_CONFIG)
     if load_weights:
@@ -354,17 +357,20 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device,

     tokenizer = tiktoken.get_encoding("gpt2")

+    train_dataset = None
     if args.context_length == "model_context_length":
         max_length = model.pos_emb.weight.shape[0]
     elif args.context_length == "longest_training_example":
-        max_length = None
+        train_dataset = SpamDataset(base_path / "train.csv", max_length=None, tokenizer=tokenizer)
+        max_length = train_dataset.max_length
     else:
         try:
             max_length = int(args.context_length)
         except ValueError:
             raise ValueError("Invalid --context_length argument")

-    train_dataset = SpamDataset(base_path / "train.csv", max_length=max_length, tokenizer=tokenizer)
+    if train_dataset is None:
+        train_dataset = SpamDataset(base_path / "train.csv", max_length=max_length, tokenizer=tokenizer)
     val_dataset = SpamDataset(base_path / "validation.csv", max_length=max_length, tokenizer=tokenizer)
     test_dataset = SpamDataset(base_path / "test.csv", max_length=max_length, tokenizer=tokenizer)

diff --git a/ch06/03_bonus_imdb-classification/download-prepare-dataset.py b/ch06/03_bonus_imdb-classification/download-prepare-dataset.py
index 28197e61..e3e60b47 100644
--- a/ch06/03_bonus_imdb-classification/download-prepare-dataset.py
+++ b/ch06/03_bonus_imdb-classification/download-prepare-dataset.py
@@ -68,7 +68,7 @@ def partition_and_save(df, sizes=(35000, 5000, 10000)):

     # Save to CSV files
     train.to_csv("train.csv", index=False)
-    val.to_csv("val.csv", index=False)
+    val.to_csv("validation.csv", index=False)
     test.to_csv("test.csv", index=False)

diff --git a/ch06/03_bonus_imdb-classification/train-gpt.py b/ch06/03_bonus_imdb-classification/train-gpt.py
index dda708b6..65da1983 100644
--- a/ch06/03_bonus_imdb-classification/train-gpt.py
+++ b/ch06/03_bonus_imdb-classification/train-gpt.py
@@ -67,6 +67,9 @@ def instantiate_model(choose_model, load_weights):
     }

     BASE_CONFIG.update(model_configs[choose_model])
+
+    if not load_weights:
+        torch.manual_seed(123)

     model = GPTModel(BASE_CONFIG)
     if load_weights:
@@ -294,18 +297,21 @@ def train_classifier_simple(model, train_loader, val_loader, optimizer, device,

     tokenizer = tiktoken.get_encoding("gpt2")

+    train_dataset = None
     if args.context_length == "model_context_length":
         max_length = model.pos_emb.weight.shape[0]
     elif args.context_length == "longest_training_example":
-        max_length = None
+        train_dataset = IMDBDataset(base_path / "train.csv", max_length=None, tokenizer=tokenizer)
+        max_length = train_dataset.max_length
     else:
         try:
             max_length = int(args.context_length)
         except ValueError:
             raise ValueError("Invalid --context_length argument")

-    train_dataset = IMDBDataset(base_path / "train.csv", max_length=max_length, tokenizer=tokenizer)
-    val_dataset = IMDBDataset(base_path / "val.csv", max_length=max_length, tokenizer=tokenizer)
+    if train_dataset is None:
+        train_dataset = IMDBDataset(base_path / "train.csv", max_length=max_length, tokenizer=tokenizer)
+    val_dataset = IMDBDataset(base_path / "validation.csv", max_length=max_length, tokenizer=tokenizer)
     test_dataset = IMDBDataset(base_path / "test.csv", max_length=max_length, tokenizer=tokenizer)

     num_workers = 0
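The common thread in both training scripts above is that, for `--context_length longest_training_example`, the padding length is now taken from the training set and then reused for the validation and test sets. Below is a minimal, self-contained sketch of that pattern; `CsvTextDataset` is a hypothetical stand-in for the repo's `SpamDataset`/`IMDBDataset` classes (it assumes CSVs with `text` and `label` columns), shown only to make the control flow explicit.

```python
# Minimal sketch of the "use training set length" pattern from this patch.
# CsvTextDataset is a hypothetical stand-in for SpamDataset/IMDBDataset:
# passing max_length=None makes it measure the longest encoded example itself.
import pandas as pd
import tiktoken
import torch
from torch.utils.data import Dataset


class CsvTextDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        df = pd.read_csv(csv_file)  # assumes "text" and "label" columns
        self.encoded = [tokenizer.encode(text) for text in df["text"]]
        self.labels = df["label"].tolist()

        if max_length is None:
            # Derive the padding length from the longest example in *this* split.
            self.max_length = max(len(row) for row in self.encoded)
        else:
            # Otherwise truncate to the externally supplied length.
            self.max_length = max_length
            self.encoded = [row[:max_length] for row in self.encoded]

        # Pad every sequence to self.max_length.
        self.encoded = [
            row + [pad_token_id] * (self.max_length - len(row)) for row in self.encoded
        ]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return torch.tensor(self.encoded[idx]), torch.tensor(self.labels[idx])


if __name__ == "__main__":
    tokenizer = tiktoken.get_encoding("gpt2")

    # Let the training set determine the padding length ...
    train_dataset = CsvTextDataset("train.csv", tokenizer, max_length=None)
    max_length = train_dataset.max_length

    # ... and reuse that same length for the validation and test splits,
    # mirroring the change to additional-experiments.py and train-gpt.py above.
    val_dataset = CsvTextDataset("validation.csv", tokenizer, max_length=max_length)
    test_dataset = CsvTextDataset("test.csv", tokenizer, max_length=max_length)
```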