Add alternative weight loading strategy as backup (#82)
Showing 10 changed files with 621 additions and 6 deletions.
@@ -1,3 +1,4 @@
ch05/02_alternative_weight_loading/checkpoints
ch05/01_main-chapter-code/the-verdict.txt

.DS_Store
@@ -0,0 +1,7 @@
# Chapter 5: Pretraining on Unlabeled Data

- [ch05.ipynb](ch05.ipynb) contains all the code as it appears in the chapter
- [previous_chapters.py](previous_chapters.py) is a Python module that contains the `MultiHeadAttention` module from the previous chapter, which we import in [ch05.ipynb](ch05.ipynb) to pretrain the GPT model
- [train.py](train.py) is a standalone Python script with the code that we implemented in [ch05.ipynb](ch05.ipynb) to train the GPT model
- [generate.py](generate.py) is a standalone Python script with the code that we implemented in [ch05.ipynb](ch05.ipynb) to load and use the pretrained model weights from OpenAI
@@ -0,0 +1,5 @@
# Alternative Weight Loading

This folder contains alternative weight loading strategies in case the weights become unavailable from OpenAI.

- [weight-loading-hf-transformers.ipynb](weight-loading-hf-transformers.ipynb): contains code to load the weights from the Hugging Face Model Hub via the `transformers` library (a minimal sketch of this approach follows below)
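For orientation, here is a minimal sketch of that loading strategy. It is not the notebook's exact code (the `cache_dir` value is illustrative), but `GPT2Model.from_pretrained` is the standard `transformers` entry point for downloading the pretrained 124M-parameter GPT-2 checkpoint from the Hub, whose tensors can then be copied into the book's `GPTModel`:

```python
# Minimal sketch (not the notebook's exact code): fetch pretrained GPT-2
# weights from the Hugging Face Model Hub via the `transformers` library.
from transformers import GPT2Model

gpt_hf = GPT2Model.from_pretrained("gpt2", cache_dir="checkpoints")  # 124M-parameter GPT-2
gpt_hf.eval()

# The downloaded state dict holds the pretrained tensors, e.g. the token
# embedding matrix `wte.weight` of shape (50257, 768); these tensors can
# then be copied into the corresponding parameters of the book's GPTModel.
print(gpt_hf.state_dict()["wte.weight"].shape)
```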
287 changes: 287 additions & 0 deletions
ch05/02_alternative_weight_loading/previous_chapters.py
@@ -0,0 +1,287 @@
# This file collects all the relevant code that we covered thus far
# throughout Chapters 2-4.
# This file can be run as a standalone script.

import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

#####################################
# Chapter 2
#####################################


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt)

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

    return dataloader
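
# Example usage (illustrative; `raw_text` is a placeholder for any training
# corpus string):
#
#   dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4,
#                                     stride=4, shuffle=False)
#   inputs, targets = next(iter(dataloader))
#   # inputs and targets each have shape (8, 4); targets are the inputs
#   # shifted by one token, as constructed in GPTDatasetV1 above.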

#####################################
# Chapter 3
#####################################
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, block_size, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by n_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(block_size, block_size), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.reshape(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec

#####################################
# Chapter 4
#####################################
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift


class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))


class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
            nn.Dropout(cfg["drop_rate"])
        )

    def forward(self, x):
        return self.layers(x)


class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            block_size=cfg["ctx_len"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_resid = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_resid(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_resid(x)
        x = x + shortcut  # Add the original input back

        return x


class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["ctx_len"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):

        # Crop current context if it exceeds the supported context size
        # E.g., if LLM supports only 5 tokens, and the context size is 10
        # then only the last 5 tokens are used as context
        idx_cond = idx[:, -context_size:]

        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond)

        # Focus only on the last time step
        # (batch, n_token, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]

        # Get the idx of the vocab entry with the highest logits value
        idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch, 1)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    return idx

#####################################
# Chapter 5
#####################################


def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text)
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add batch dimension
    return encoded_tensor


def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)  # remove batch dimension
    return tokenizer.decode(flat.tolist())


def generate(model, idx, max_new_tokens, context_size, temperature, top_k=None):

    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, vocab_size)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx
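
For reference, a minimal sketch of how the module above can be used. This is not part of the committed file; the configuration mirrors the GPT-2 124M settings from the chapter, while the prompt string and sampling parameters are illustrative:

```python
import tiktoken
import torch

# Illustrative usage sketch, assuming the module above is saved as
# ch05/02_alternative_weight_loading/previous_chapters.py
from previous_chapters import GPTModel, generate, text_to_token_ids, token_ids_to_text

GPT_CONFIG_124M = {
    "vocab_size": 50257,   # BPE vocabulary size of the GPT-2 tokenizer
    "ctx_len": 1024,       # maximum context length
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval()  # disable dropout for inference

tokenizer = tiktoken.get_encoding("gpt2")
token_ids = generate(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=15,
    context_size=GPT_CONFIG_124M["ctx_len"],
    temperature=1.4,
    top_k=25
)
# Note: without loading pretrained weights (e.g., via the Hugging Face
# strategy above), the randomly initialized model produces gibberish.
print(token_ids_to_text(token_ids, tokenizer))
```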