Relative paths and model configuration #28

Merged: 26 commits, Mar 27, 2023

1 change: 1 addition & 0 deletions .gitignore
@@ -5,6 +5,7 @@ __pycache__
 
 # data
 data
+checkpoints
 !data/shakespeare/prepare.py
 
 # downloaded by scripts/compare.py
5 changes: 1 addition & 4 deletions README.md
@@ -74,10 +74,7 @@ python scripts/convert_checkpoint.py \
 You can now run inference:
 
 ```bash
-python scripts/generate.py \
-    --prompt "Hello, my name is" \
-    --checkpoint_path checkpoints/lit-llama/7B/state_dict.pt \
-    --tokenizer_path checkpoints/lit-llama/tokenizer.model
+python scripts/generate.py --prompt "Hello, my name is"
 ```
 
 This will run using the 7B model and will require roughly 26 GB of GPU memory (A100 GPU).
18 changes: 13 additions & 5 deletions generate.py
@@ -1,11 +1,13 @@
 import os
 import sys
 import time
-import torch
+from typing import Optional
 
 import lightning as L
+import torch
 
-from model import LLaMA, LLaMAConfig
+from model import LLaMA
 from quantization.bnb import quantize as quantize_model
 from tokenizer import Tokenizer
 
@@ -66,8 +68,9 @@ def main(
     # compilation fails as it does not support torch.complex64 for RoPE
     # compile: bool = False,
     accelerator: str = "auto",
-    checkpoint_path: str = "/srv/data/checkpoints/llama/converted_nano/7B/state_dict.pth",
-    tokenizer_path: str = "/srv/data/checkpoints/llama/converted_nano/tokenizer.model",
+    checkpoint_path: Optional[str] = None,
+    tokenizer_path: Optional[str] = None,
+    model_size: str = "7B",
     quantize: bool = False,
 ):
     """Generates text samples based on a pre-trained LLaMA model and tokenizer.
@@ -86,6 +89,11 @@ def main(
         tokenizer_path: The tokenizer path to load.
         quantize: Whether to quantize the model using the `LLM.int8()` method
     """
+    if not checkpoint_path:
+        checkpoint_path = f"./checkpoints/lit-llama/{model_size}/state_dict.pth"
+    if not tokenizer_path:
+        tokenizer_path = "./checkpoints/lit-llama/tokenizer.model"
+
     assert os.path.isfile(checkpoint_path)
     assert os.path.isfile(tokenizer_path)
 
@@ -94,14 +102,14 @@ def main(
     if quantize:
         print("Running quantization. This may take a minute ...")
         # TODO: Initializing the model directly on the device does not work with quantization
-        model = LLaMA(LLaMAConfig())
+        model = LLaMA.from_name(model_size)
         # The output layer can be sensitive to quantization, we keep it in default precision
         model = quantize_model(model, skip=("lm_head", "output"))
         checkpoint = torch.load(checkpoint_path)
         model.load_state_dict(checkpoint)
     else:
         with fabric.device:
-            model = LLaMA(LLaMAConfig())
+            model = LLaMA.from_name(model_size)
             checkpoint = torch.load(checkpoint_path)
             model.load_state_dict(checkpoint)
 
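For readers skimming the diff: `checkpoint_path` and `tokenizer_path` are now optional and, when omitted, are derived from `model_size`. A minimal sketch of the new fallback logic, mirroring the hunk above (the helper function name is illustrative, not part of the PR):

```python
from typing import Optional


def resolve_default_paths(
    checkpoint_path: Optional[str],
    tokenizer_path: Optional[str],
    model_size: str = "7B",
) -> tuple:
    """Mirror the fallback added to generate.main(): paths are relative to the CWD."""
    if not checkpoint_path:
        checkpoint_path = f"./checkpoints/lit-llama/{model_size}/state_dict.pth"
    if not tokenizer_path:
        tokenizer_path = "./checkpoints/lit-llama/tokenizer.model"
    return checkpoint_path, tokenizer_path


# Leaving both paths unset selects the 13B checkpoint under ./checkpoints/lit-llama:
print(resolve_default_paths(None, None, model_size="13B"))
# ('./checkpoints/lit-llama/13B/state_dict.pth', './checkpoints/lit-llama/tokenizer.model')
```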
18 changes: 17 additions & 1 deletion model.py
@@ -140,14 +140,26 @@ def forward(self, x):
         return x
 
 
+llama_configs = {
+    "7B": dict(n_layer=32, n_head=32, n_embd=4096),
+    "13B": dict(n_layer=40, n_head=40, n_embd=5120),
+    "30B": dict(n_layer=60, n_head=52, n_embd=6656),
+    "65B": dict(n_layer=80, n_head=64, n_embd=8192),
+}
+
+
 @dataclass
 class LLaMAConfig:
-    block_size: int = 4096  # 7B
+    block_size: int = 4096
     vocab_size: int = 32000
     n_layer: int = 32
     n_head: int = 32
     n_embd: int = 4096
 
+    @classmethod
+    def from_name(cls, name: str):
+        return cls(**llama_configs[name])
+
 
 class LLaMA(nn.Module):
     def __init__(self, config):
@@ -200,3 +212,7 @@ def step(self, idx, targets):
         logits = self(idx)
         loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
         return loss
+
+    @classmethod
+    def from_name(cls, name: str):
+        return cls(LLaMAConfig.from_name(name))
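A quick sketch of how the new `from_name` helpers are intended to be used (illustrative usage, not part of the diff; only the config is instantiated here, since building the full model allocates real weights):

```python
from model import LLaMAConfig

# Look up a preset by name; an unknown name raises KeyError from the llama_configs dict.
config = LLaMAConfig.from_name("13B")
print(config.n_layer, config.n_head, config.n_embd)  # 40 40 5120

# LLaMA.from_name("13B") performs the same lookup and returns LLaMA(config),
# which is what generate.py now does instead of hard-coding LLaMA(LLaMAConfig()).
```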
6 changes: 3 additions & 3 deletions scripts/convert_checkpoint.py
@@ -55,9 +55,9 @@ def convert_state_dict(state_dict):
 
 def meta_weights_for_nano_model(
     *,
-    output_dir: Path,
-    ckpt_dir: Path = Path("/srv/data/checkpoints/llama/raw"),
-    tokenizer_path: Path = Path("/srv/data/checkpoints/llama/raw/tokenizer.model"),
+    output_dir: Path = Path("checkpoints/lit-llama"),
+    ckpt_dir: Path = Path("checkpoints/llama/"),
+    tokenizer_path: Path = Path("checkpoints/llama/tokenizer.model"),
     model_size: str = "7B",
 ):
     output_dir = output_dir / model_size
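For orientation, a sketch of how the new relative defaults compose; the directory layout in the comments is an assumption based on the defaults in this diff, not something the diff itself spells out:

```python
from pathlib import Path

# Defaults taken from meta_weights_for_nano_model above.
output_dir = Path("checkpoints/lit-llama")
ckpt_dir = Path("checkpoints/llama")
model_size = "7B"

# The original weights and tokenizer are expected under ckpt_dir ...
print(ckpt_dir / "tokenizer.model")  # checkpoints/llama/tokenizer.model
# ... while the converted output goes under output_dir / model_size,
# which lines up with the checkpoint path generate.py now falls back to:
print(output_dir / model_size / "state_dict.pth")  # checkpoints/lit-llama/7B/state_dict.pth
```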
2 changes: 1 addition & 1 deletion scripts/prepare_shakespeare.py
@@ -28,7 +28,7 @@
 
 
 def prepare(
-    tokenizer_path: str = "/srv/data/checkpoints/llama/converted_meta/tokenizer.model",
+    tokenizer_path: str = "checkpoints/llama/tokenizer.model",
    destination_path: str = "data/shakespeare",
 ):
     os.makedirs(destination_path, exist_ok=True)
2 changes: 1 addition & 1 deletion train.py
@@ -44,7 +44,7 @@ def main():
 
     train_data, val_data = load_datasets()
 
-    config = LLaMAConfig
+    config = LLaMAConfig.from_name("7B")
     config.block_size = block_size
 
     with fabric.device:
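Since `LLaMAConfig.from_name` returns a fresh config instance, train.py can now override fields such as `block_size` on that instance, instead of assigning attributes on the `LLaMAConfig` class itself as the old `config = LLaMAConfig` line did. A small illustrative sketch (values come from the `7B` preset shown earlier):

```python
from model import LLaMAConfig

config = LLaMAConfig.from_name("7B")
print(config.n_layer, config.n_head, config.n_embd)  # 32 32 4096

# Overriding block_size only touches this instance; a later from_name("7B")
# call still returns the unmodified preset default.
config.block_size = 1024
print(config.block_size)                       # 1024
print(LLaMAConfig.from_name("7B").block_size)  # 4096
```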