Commit

Rename drop_resid to drop_shortcut (#136)
rasbt authored Apr 28, 2024
1 parent 70cd174 commit 97ed381
Showing 10 changed files with 37 additions and 37 deletions.
6 changes: 3 additions & 3 deletions appendix-D/01_main-chapter-code/previous_chapters.py
@@ -170,21 +170,21 @@ def __init__(self, cfg):
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
-       self.drop_resid = nn.Dropout(cfg["drop_rate"])
+       self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
-       x = self.drop_resid(x)
+       x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
-       x = self.drop_resid(x)
+       x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x
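The rename is purely mechanical: in both the old and new code, dropout is applied to each sub-layer's output right before it is added back to the shortcut (residual) connection, so drop_shortcut describes that role more accurately than drop_resid. Below is a minimal sketch of exercising the renamed block, assuming the TransformerBlock class from the previous_chapters.py above; the config values are illustrative GPT-2 124M-style assumptions and are not part of this commit.

```python
import torch
from previous_chapters import TransformerBlock  # class touched by the hunk above

# Illustrative GPT-2 124M-style settings (assumptions for this sketch, not part
# of the commit); only "emb_dim" and "drop_rate" are read directly by the block
# in the hunk above, the other keys are what its sub-modules are assumed to expect.
cfg = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False,
}

torch.manual_seed(123)
block = TransformerBlock(cfg)
x = torch.rand(2, 4, cfg["emb_dim"])  # [batch_size, num_tokens, emb_dim]

print(block(x).shape)       # shortcut additions preserve the shape: torch.Size([2, 4, 768])
print(block.drop_shortcut)  # renamed attribute: Dropout(p=0.1, inplace=False)
```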
8 changes: 4 additions & 4 deletions ch04/01_main-chapter-code/ch04.ipynb
@@ -950,21 +950,21 @@
" self.ff = FeedForward(cfg)\n",
" self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n",
" self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n",
" self.drop_resid = nn.Dropout(cfg[\"drop_rate\"])\n",
" self.drop_shortcut = nn.Dropout(cfg[\"drop_rate\"])\n",
"\n",
" def forward(self, x):\n",
" # Shortcut connection for attention block\n",
" shortcut = x\n",
" x = self.norm1(x)\n",
" x = self.att(x) # Shape [batch_size, num_tokens, emb_size]\n",
" x = self.drop_resid(x)\n",
" x = self.drop_shortcut(x)\n",
" x = x + shortcut # Add the original input back\n",
"\n",
" # Shortcut connection for feed forward block\n",
" shortcut = x\n",
" x = self.norm2(x)\n",
" x = self.ff(x)\n",
" x = self.drop_resid(x)\n",
" x = self.drop_shortcut(x)\n",
" x = x + shortcut # Add the original input back\n",
"\n",
" return x"
@@ -1489,7 +1489,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
"version": "3.10.6"
}
},
"nbformat": 4,
16 changes: 8 additions & 8 deletions ch04/01_main-chapter-code/exercise-solutions.ipynb
@@ -34,7 +34,7 @@
"metadata": {},
"outputs": [],
"source": [
"from gpt import Transfocontext_lengthmerBlock\n",
"from gpt import TransformerBlock\n",
"\n",
"GPT_CONFIG_124M = {\n",
" \"vocab_size\": 50257,\n",
@@ -264,9 +264,9 @@
" \"emb_dim\": 768,\n",
" \"n_heads\": 12,\n",
" \"n_layers\": 12,\n",
" \"drop_rate_emb\": 0.1, # NEW: dropout for embedding layers\n",
" \"drop_rate_attn\": 0.1, # NEW: dropout for multi-head attention \n",
" \"drop_rate_resid\": 0.1, # NEW: dropout for residual connections \n",
" \"drop_rate_emb\": 0.1, # NEW: dropout for embedding layers\n",
" \"drop_rate_attn\": 0.1, # NEW: dropout for multi-head attention \n",
" \"drop_rate_shortcut\": 0.1, # NEW: dropout for shortcut connections \n",
" \"qkv_bias\": False\n",
"}"
]
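As the next hunk shows, drop_rate_shortcut replaces drop_rate_resid inside TransformerBlock, while the other two rates feed the embedding and attention dropout. A minimal, self-contained sketch of that mapping follows; the module names in the comments follow the chapter's code, but the snippet itself is an illustration and not part of this diff.

```python
import torch
import torch.nn as nn

# Illustrative only: where each of the three NEW rates would be consumed.
cfg = {
    "emb_dim": 768,
    "drop_rate_emb": 0.1,       # GPTModel: dropout applied after token + positional embeddings
    "drop_rate_attn": 0.1,      # MultiHeadAttention: dropout on the attention weights
    "drop_rate_shortcut": 0.1,  # TransformerBlock: dropout before each shortcut addition
}

drop_emb = nn.Dropout(cfg["drop_rate_emb"])
drop_shortcut = nn.Dropout(cfg["drop_rate_shortcut"])
attn_dropout_p = cfg["drop_rate_attn"]  # would be passed as the dropout argument of MultiHeadAttention

x = torch.rand(2, 4, cfg["emb_dim"])  # [batch_size, num_tokens, emb_dim]
print(drop_emb(x).shape, drop_shortcut(x).shape)  # shapes unchanged: torch.Size([2, 4, 768]) each
```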
@@ -295,21 +295,21 @@
" self.ff = FeedForward(cfg)\n",
" self.norm1 = LayerNorm(cfg[\"emb_dim\"])\n",
" self.norm2 = LayerNorm(cfg[\"emb_dim\"])\n",
" self.drop_resid = nn.Dropout(cfg[\"drop_rate_resid\"])\n",
" self.drop_shortcut = nn.Dropout(cfg[\"drop_rate_shortcut\"])\n",
"\n",
" def forward(self, x):\n",
" # Shortcut connection for attention block\n",
" shortcut = x\n",
" x = self.norm1(x)\n",
" x = self.att(x) # Shape [batch_size, num_tokens, emb_size]\n",
" x = self.drop_resid(x)\n",
" x = self.drop_shortcut(x)\n",
" x = x + shortcut # Add the original input back\n",
"\n",
" # Shortcut connection for feed-forward block\n",
" shortcut = x\n",
" x = self.norm2(x)\n",
" x = self.ff(x)\n",
" x = self.drop_resid(x)\n",
" x = self.drop_shortcut(x)\n",
" x = x + shortcut # Add the original input back\n",
"\n",
" return x\n",
@@ -370,7 +370,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.11.4"
}
},
"nbformat": 4,
6 changes: 3 additions & 3 deletions ch04/01_main-chapter-code/gpt.py
@@ -162,21 +162,21 @@ def __init__(self, cfg):
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
-       self.drop_resid = nn.Dropout(cfg["drop_rate"])
+       self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
-       x = self.drop_resid(x)
+       x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
-       x = self.drop_resid(x)
+       x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x
8 changes: 4 additions & 4 deletions ch05/01_main-chapter-code/exercise-solutions.ipynb
@@ -519,7 +519,7 @@
"train_losses, val_losses, tokens_seen = train_model_simple(\n",
" model, train_loader, val_loader, optimizer, device,\n",
" num_epochs=num_epochs, eval_freq=5, eval_iter=5,\n",
" start_context=\"Every effort moves you\",\n",
" start_context=\"Every effort moves you\", tokenizer=tokenizer\n",
")"
]
},
@@ -605,7 +605,7 @@
"text": [
"File already exists and is up-to-date: gpt2/124M/checkpoint\n",
"File already exists and is up-to-date: gpt2/124M/encoder.json\n",
"File already exists and is up-to-date: gpt2/124M/settings.json\n",
"File already exists and is up-to-date: gpt2/124M/hparams.json\n",
"File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001\n",
"File already exists and is up-to-date: gpt2/124M/model.ckpt.index\n",
"File already exists and is up-to-date: gpt2/124M/model.ckpt.meta\n",
@@ -760,7 +760,7 @@
"text": [
"File already exists and is up-to-date: gpt2/1558M/checkpoint\n",
"File already exists and is up-to-date: gpt2/1558M/encoder.json\n",
"File already exists and is up-to-date: gpt2/1558M/settings.json\n",
"File already exists and is up-to-date: gpt2/1558M/hparams.json\n",
"File already exists and is up-to-date: gpt2/1558M/model.ckpt.data-00000-of-00001\n",
"File already exists and is up-to-date: gpt2/1558M/model.ckpt.index\n",
"File already exists and is up-to-date: gpt2/1558M/model.ckpt.meta\n",
@@ -859,7 +859,7 @@
"text": [
"File already exists and is up-to-date: gpt2/1558M/checkpoint\n",
"File already exists and is up-to-date: gpt2/1558M/encoder.json\n",
"File already exists and is up-to-date: gpt2/1558M/settings.json\n",
"File already exists and is up-to-date: gpt2/1558M/hparams.json\n",
"File already exists and is up-to-date: gpt2/1558M/model.ckpt.data-00000-of-00001\n",
"File already exists and is up-to-date: gpt2/1558M/model.ckpt.index\n",
"File already exists and is up-to-date: gpt2/1558M/model.ckpt.meta\n",
6 changes: 3 additions & 3 deletions ch05/01_main-chapter-code/previous_chapters.py
@@ -167,21 +167,21 @@ def __init__(self, cfg):
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
-       self.drop_resid = nn.Dropout(cfg["drop_rate"])
+       self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
-       x = self.drop_resid(x)
+       x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
-       x = self.drop_resid(x)
+       x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x
6 changes: 3 additions & 3 deletions ch05/02_alternative_weight_loading/previous_chapters.py
@@ -167,21 +167,21 @@ def __init__(self, cfg):
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
-       self.drop_resid = nn.Dropout(cfg["drop_rate"])
+       self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
-       x = self.drop_resid(x)
+       x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
-       x = self.drop_resid(x)
+       x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x
6 changes: 3 additions & 3 deletions ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py
@@ -164,21 +164,21 @@ def __init__(self, cfg):
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
-       self.drop_resid = nn.Dropout(cfg["drop_rate"])
+       self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
-       x = self.drop_resid(x)
+       x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
-       x = self.drop_resid(x)
+       x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x
6 changes: 3 additions & 3 deletions ch05/05_bonus_hparam_tuning/previous_chapters.py
@@ -167,21 +167,21 @@ def __init__(self, cfg):
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
-       self.drop_resid = nn.Dropout(cfg["drop_rate"])
+       self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
-       x = self.drop_resid(x)
+       x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
-       x = self.drop_resid(x)
+       x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x
6 changes: 3 additions & 3 deletions ch06/02_bonus_additional-experiments/previous_chapters.py
@@ -169,21 +169,21 @@ def __init__(self, cfg):
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
-       self.drop_resid = nn.Dropout(cfg["drop_rate"])
+       self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
-       x = self.drop_resid(x)
+       x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
-       x = self.drop_resid(x)
+       x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x
