Add Gemma checkpoint support #941

Merged
37 commits merged on Feb 23, 2024
Changes from 24 commits
37 commits
034afac
gemma
rasbt Feb 21, 2024
d26ab02
add docs
rasbt Feb 21, 2024
5c1b029
update query head config
rasbt Feb 21, 2024
d77bd4a
apply keras geglu workaround
rasbt Feb 21, 2024
5ae2ad6
Carlos
carmocca Feb 21, 2024
e04be8a
An unfinished, but working 2b variant.
Andrei-Aksionov Feb 21, 2024
e61c449
Gemma-7b now works.
Andrei-Aksionov Feb 22, 2024
9a1f23c
add instruction-finetuned version
rasbt Feb 22, 2024
3ec329f
A test for config to check head_size
Andrei-Aksionov Feb 22, 2024
29a00c2
Update Gemma config
Andrei-Aksionov Feb 22, 2024
bf9d711
Adapter_v2 and LoRA: attn.proj size is head_size * num_heads
Andrei-Aksionov Feb 22, 2024
072c9f6
Adapter_v2 and LoRA: gemmamlp class
Andrei-Aksionov Feb 22, 2024
bed71f1
RMSNorm: unit offset is configurable
Andrei-Aksionov Feb 22, 2024
ec7d01e
Configurable wte output scaling
Andrei-Aksionov Feb 22, 2024
bd0864c
Update tests to support changes in Config class
Andrei-Aksionov Feb 22, 2024
bf8c9b5
Test for Gemma
Andrei-Aksionov Feb 22, 2024
c78bd6e
convert_hf: reuse llama copy function
Andrei-Aksionov Feb 22, 2024
50ad509
Test Gemma model: use llama weights copying
Andrei-Aksionov Feb 22, 2024
d98df3b
Update convert_lit + test
Andrei-Aksionov Feb 22, 2024
159df75
Merge branch 'main' into gemma
Andrei-Aksionov Feb 22, 2024
628c7bc
Restore accidentally deleted comment line
Andrei-Aksionov Feb 22, 2024
1a2f9f8
Prompt for Gemma it (instruct models)
Andrei-Aksionov Feb 22, 2024
d002695
RMSNorm: reduce computations when self.add_unit_offset is False
Andrei-Aksionov Feb 23, 2024
c915d57
Auto markdown formatting
Andrei-Aksionov Feb 23, 2024
32260e6
Drop `tie_weights` in convert_hf
Andrei-Aksionov Feb 23, 2024
78ad643
Comment explaining why head_size*num_head instead of n_embd
Andrei-Aksionov Feb 23, 2024
6f154ab
scale_wte_output --> scale_embeddings
Andrei-Aksionov Feb 23, 2024
cfe68bb
Config: drop `self.rmsnorm_add_unit_offset`
Andrei-Aksionov Feb 23, 2024
57db710
Comment on why we need a unit offset in RMSNorm
Andrei-Aksionov Feb 23, 2024
4c44085
Bump up min version of transformers in github CI
Andrei-Aksionov Feb 23, 2024
c1dc9d2
Merge branch 'main' into gemma
Andrei-Aksionov Feb 23, 2024
0fe9b3f
Merge branch 'main' into gemma
Andrei-Aksionov Feb 23, 2024
e9b0c5a
Update convert_hf test
Andrei-Aksionov Feb 23, 2024
d17bb34
Update lit_gpt/adapter_v2.py
Andrei-Aksionov Feb 23, 2024
86b5b7a
Update lit_gpt/lora.py
Andrei-Aksionov Feb 23, 2024
8854d14
Update lit_gpt/model.py
Andrei-Aksionov Feb 23, 2024
d219965
Bump up min transformers version in Azure workflow
Andrei-Aksionov Feb 23, 2024
1 change: 1 addition & 0 deletions README.md
@@ -33,6 +33,7 @@ Supports the following popular model checkpoints:
| [Falcon](tutorials/download_falcon.md) by TII UAE | 7B, 40B, 180B | [TII 2023](https://falconllm.tii.ae) |
| [FreeWilly2](tutorials/download_freewilly_2.md) (Stable Beluga 2) by Stability AI | 70B | [Stability AI 2023](https://stability.ai/blog/stable-beluga-large-instruction-fine-tuned-models) |
| [Function Calling Llama 2](tutorials/download_function_calling_llama_2.md) by Trelis | 7B | [Trelis et al. 2023](https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2) |
| [Gemma](tutorials/download_gemma.md) by Google | 2B, 7B | [Google Team, Google Deepmind](https://storage.googleapis.com/deepmind-media/gemma/gemma-report.pdf) |
| [Llama 2](tutorials/download_llama_2.md) by Meta AI | 7B, 13B, 70B | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) |
| [LongChat](tutorials/download_longchat.md) by LMSYS | 7B, 13B | [LongChat Team 2023](https://lmsys.org/blog/2023-06-29-longchat/) |
| [Mistral and Mixtral](tutorials/download_mistral.md) by Mistral AI | 7B | [Mistral website](https://mistral.ai/) |
5 changes: 5 additions & 0 deletions chat/base.py
@@ -361,6 +361,11 @@ def prompt_config(checkpoint_dir: Path, tokenizer: Tokenizer) -> Tuple[str, Tupl
stop_tokens = ([tokenizer.eos_id],)
return system_prompt, stop_tokens

if re.search(r"gemma.*-it", checkpoint_name):
system_prompt = "<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
stop_tokens = ([tokenizer.eos_id],)
return system_prompt, stop_tokens

# default format
return "{prompt}", ([tokenizer.eos_id],)

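The new branch above matches instruction-tuned Gemma checkpoints by name and wraps the user prompt in Gemma's turn markers. A minimal sketch of how the template behaves, using an assumed checkpoint name and prompt that are not part of the diff:

```python
import re

# Sketch only: reproduce the template added above for a hypothetical "-it" checkpoint.
checkpoint_name = "gemma-2b-it"  # assumed example; any name matching "gemma.*-it" works
if re.search(r"gemma.*-it", checkpoint_name):
    system_prompt = "<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
    print(system_prompt.format(prompt="What is the capital of France?"))
    # <start_of_turn>user
    # What is the capital of France?<end_of_turn>
    # <start_of_turn>model
```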
10 changes: 9 additions & 1 deletion lit_gpt/adapter_v2.py
@@ -123,7 +123,7 @@ def __init__(self, config: Config, block_idx: int) -> None:
# key, query, value projections for all heads, but in a batch
self.attn = AdapterV2Linear(in_features=config.n_embd, out_features=shape, bias=config.bias)
# output projection
self.proj = AdapterV2Linear(config.n_embd, config.n_embd, bias=config.bias)
self.proj = AdapterV2Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias)
# disabled by default
self.kv_cache: Optional[KVCache] = None

@@ -194,6 +194,14 @@ def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwa
super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)


class GemmaMLP(LLaMAMLP):
def forward(self, x: torch.Tensor) -> torch.Tensor:
x_fc_1 = self.fc_1(x)
x_fc_2 = self.fc_2(x)
x = torch.nn.functional.gelu(x_fc_1) * x_fc_2
return self.proj(x)


class LLaMAMoE(lit_gpt.model.LLaMAMoE):
def __init__(self, config: Config) -> None:
nn.Module.__init__(self)
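The output projection's input width changes from `n_embd` to `config.head_size * config.n_head` (here and in the matching classes in `lora.py` and `model.py`) because Gemma-7b's concatenated heads are wider than its embedding; for every previously supported config the two values coincide, so nothing else changes. A quick numeric check, using the Gemma-7b values from the config diff below:

```python
# Gemma-7b (per its HF config): 16 heads of size 256 concatenate to 4096 features,
# while the embedding size is only 3072, so proj can no longer assume n_embd inputs.
n_embd, n_head, head_size = 3072, 16, 256
assert head_size * n_head == 4096
assert head_size * n_head != n_embd

# For models where head_size is derived (e.g. a 4096/32 Llama layout) nothing changes.
n_embd, n_head = 4096, 32
assert (n_embd // n_head) * n_head == n_embd
```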
65 changes: 61 additions & 4 deletions lit_gpt/config.py
@@ -17,12 +17,14 @@
class Config:
name: str = ""
hf_config: dict = field(default_factory=dict)
scale_wte_output: bool = False
block_size: int = 4096
vocab_size: int = 50254
padding_multiple: int = 512
padded_vocab_size: Optional[int] = None
n_layer: int = 16
n_head: int = 32
head_size: Optional[int] = None
n_embd: int = 4096
rotary_percentage: float = 0.25
parallel_residual: bool = True
@@ -51,8 +53,9 @@ class Config:
n_query_groups: Optional[int] = None
shared_attention_norm: bool = False
_norm_class: Literal["LayerNorm", "RMSNorm"] = "LayerNorm"
rmsnorm_add_unit_offset: bool = False
norm_eps: float = 1e-5
_mlp_class: Literal["GptNeoxMLP", "LLaMAMLP"] = "GptNeoxMLP"
_mlp_class: Literal["GptNeoxMLP", "LLaMAMLP", "GemmaMLP", "LLaMAMoE"] = "GptNeoxMLP"
gelu_approximate: str = "none"
intermediate_size: Optional[int] = None
rope_condense_ratio: int = 1
@@ -64,8 +67,9 @@ def __post_init__(self):
if not self.name:
self.name = self.hf_config.get("name", self.name)

assert self.n_embd % self.n_head == 0
self.head_size = self.n_embd // self.n_head
if self.head_size is None:
assert self.n_embd % self.n_head == 0
self.head_size = self.n_embd // self.n_head

# vocab size should be a power of 2 to be optimal on hardware. compute the closest value
if self.padded_vocab_size is None:
@@ -138,9 +142,11 @@ def mlp_class(self) -> Type:
def norm_class(self) -> Type:
# `self._norm_class` cannot be the type to keep the config json serializable
if self._norm_class == "RMSNorm":
from functools import partial

from lit_gpt.rmsnorm import RMSNorm

return RMSNorm
return partial(RMSNorm, add_unit_offset=self.rmsnorm_add_unit_offset)
return getattr(torch.nn, self._norm_class)


@@ -781,6 +787,57 @@ def norm_class(self) -> Type:
configs.append(copy)


###############
# Google Gemma
###############
gemma = [
# https://huggingface.co/google/gemma-2b/blob/main/config.json
dict(
name="Gemma-2b",
hf_config=dict(org="google", name="gemma-2b"),
scale_wte_output=True,
vocab_size=256000,
padding_multiple=64,
n_embd=2048,
n_layer=18,
n_head=8,
n_query_groups=1,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
rmsnorm_add_unit_offset=True,
_mlp_class="GemmaMLP",
intermediate_size=16384,
),
# https://huggingface.co/google/gemma-7b/blob/main/config.json
dict(
name="Gemma-7b",
hf_config=dict(org="google", name="gemma-7b"),
scale_wte_output=True,
vocab_size=256000,
padding_multiple=64,
n_embd=3072,
n_layer=28,
n_head=16,
head_size=256,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
rmsnorm_add_unit_offset=True,
_mlp_class="GemmaMLP",
intermediate_size=24576,
),
]
configs.extend(gemma)
for c in gemma:
copy = deepcopy(c)
copy["name"] = f"{c['name']}-it"
copy["hf_config"]["name"] = f"{c['hf_config']['name']}-it"
configs.append(copy)


##########################
# Stability AI FreeWilly2
##########################
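Two details in this file are easy to miss: `head_size` is now only derived from `n_embd // n_head` when it is not set explicitly (Gemma-7b sets 256, where the derived value would be 192), and each base Gemma config is duplicated with an `-it` suffix for the instruction-tuned checkpoints. A minimal sketch of the fallback logic, mirroring the `__post_init__` change above:

```python
from typing import Optional

def resolve_head_size(n_embd: int, n_head: int, head_size: Optional[int] = None) -> int:
    # Mirrors the __post_init__ fallback: an explicit head_size wins, otherwise derive it.
    if head_size is None:
        assert n_embd % n_head == 0
        return n_embd // n_head
    return head_size

assert resolve_head_size(2048, 8) == 256        # Gemma-2b: derived from n_embd // n_head
assert resolve_head_size(3072, 16, 256) == 256  # Gemma-7b: explicit (derived would be 192)
```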
10 changes: 9 additions & 1 deletion lit_gpt/lora.py
@@ -599,7 +599,7 @@ def __init__(self, config: Config) -> None:
)
# output projection
self.proj = LoRALinear(
config.n_embd,
config.head_size * config.n_head,
config.n_embd,
bias=config.bias,
r=(config.r if config.to_projection else 0),
@@ -699,6 +699,14 @@ def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwa
super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)


class GemmaMLP(LLaMAMLP):
def forward(self, x: torch.Tensor) -> torch.Tensor:
x_fc_1 = self.fc_1(x)
x_fc_2 = self.fc_2(x)
x = torch.nn.functional.gelu(x_fc_1) * x_fc_2
return self.proj(x)


class LLaMAMoE(lit_gpt.model.LLaMAMoE):
def __init__(self, config: Config) -> None:
nn.Module.__init__(self)
15 changes: 13 additions & 2 deletions lit_gpt/model.py
@@ -87,6 +87,9 @@ def forward(self, idx: torch.Tensor, input_pos: Optional[torch.Tensor] = None) -
mask = None

x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
if self.config.scale_wte_output:
x = x * (self.config.n_embd**0.5)

for block in self.transformer.h:
x = block(x, cos, sin, mask, input_pos)
x = self.transformer.ln_f(x)
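When `scale_wte_output` is set (it is for all Gemma configs), the token embeddings are multiplied by sqrt(n_embd) before entering the transformer blocks, which the Gemma checkpoints expect. A short sketch with an assumed embedding size:

```python
import torch

n_embd = 2048                  # assumed; matches the Gemma-2b config
x = torch.randn(1, 4, n_embd)  # token embeddings as returned by transformer.wte
x = x * (n_embd ** 0.5)        # scale embeddings by sqrt(n_embd)
```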
@@ -174,7 +177,7 @@ def __init__(self, config: Config) -> None:
# key, query, value projections for all heads, but in a batch
self.attn = nn.Linear(config.n_embd, shape, bias=config.bias)
# output projection
self.proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
self.proj = nn.Linear(config.head_size * config.n_head, config.n_embd, bias=config.bias)
# disabled by default
self.kv_cache: Optional[KVCache] = None

@@ -224,7 +227,7 @@ def forward(

y = self.scaled_dot_product_attention(q, k, v, mask)

y = y.reshape(B, T, self.config.n_embd) # re-assemble all head outputs side by side
y = y.reshape(B, T, self.config.head_size * self.config.n_head) # re-assemble all head outputs side by side

# output projection
return self.proj(y)
@@ -290,6 +293,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.proj(x)


class GemmaMLP(LLaMAMLP):
def forward(self, x: torch.Tensor) -> torch.Tensor:
x_fc_1 = self.fc_1(x)
x_fc_2 = self.fc_2(x)
x = torch.nn.functional.gelu(x_fc_1) * x_fc_2
return self.proj(x)


class LLaMAMoE(nn.Module):
def __init__(self, config: Config) -> None:
super().__init__()
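`GemmaMLP` reuses `LLaMAMLP`'s three linear layers but gates with GELU instead of SiLU (GeGLU rather than SwiGLU); the same small class is mirrored in `adapter_v2.py` and `lora.py`. A standalone sketch of the forward pass with illustrative sizes:

```python
import torch
import torch.nn as nn

# Sketch of the GeGLU feed-forward used by GemmaMLP; the sizes are illustrative only.
n_embd, intermediate_size = 2048, 16384
fc_1 = nn.Linear(n_embd, intermediate_size, bias=False)  # gate projection
fc_2 = nn.Linear(n_embd, intermediate_size, bias=False)  # up projection
proj = nn.Linear(intermediate_size, n_embd, bias=False)  # down projection

x = torch.randn(1, 4, n_embd)
y = proj(torch.nn.functional.gelu(fc_1(x)) * fc_2(x))  # gelu(gate) * up, then project down
assert y.shape == (1, 4, n_embd)
```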
8 changes: 6 additions & 2 deletions lit_gpt/rmsnorm.py
@@ -10,19 +10,23 @@ class RMSNorm(torch.nn.Module):
https://github.com/bzhangGo/rmsnorm/blob/master/LICENSE.
"""

def __init__(self, size: int, dim: int = -1, eps: float = 1e-5) -> None:
def __init__(self, size: int, dim: int = -1, eps: float = 1e-6, add_unit_offset: bool = False) -> None:
super().__init__()
self.weight = torch.nn.Parameter(torch.ones(size))
self.eps = eps
self.dim = dim
self.add_unit_offset = add_unit_offset

def forward(self, x: torch.Tensor) -> torch.Tensor:
dtype = x.dtype
x = x.float()
# NOTE: the original RMSNorm paper implementation is not equivalent
norm_x = torch.mean(x * x, dim=self.dim, keepdim=True)
x_normed = x * torch.rsqrt(norm_x + self.eps)
return (self.weight * x_normed).to(dtype=dtype)
x_normed = x_normed.to(dtype=dtype)
if self.add_unit_offset:
return x_normed * (1 + self.weight)
return x_normed * self.weight

def reset_parameters(self) -> None:
torch.nn.init.ones_(self.weight)
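Gemma's checkpoints keep the RMSNorm scale zero-centred and the reference implementation multiplies by `(1 + weight)` at runtime, which is why the unit offset is made configurable here rather than baked into the weights at conversion time. A small sketch showing that with a zero weight the offset variant reduces to plain RMSNorm with a unit scale (illustrative values only):

```python
import torch

size = 4
x = torch.randn(2, size)
weight = torch.zeros(size)  # how a Gemma checkpoint stores the scale

norm_x = torch.mean(x * x, dim=-1, keepdim=True)
x_normed = x * torch.rsqrt(norm_x + 1e-6)

y_offset = x_normed * (1 + weight)     # add_unit_offset=True (Gemma)
y_plain = x_normed * torch.ones(size)  # classic RMSNorm with weight == 1
assert torch.allclose(y_offset, y_plain)
```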
12 changes: 9 additions & 3 deletions scripts/convert_hf_checkpoint.py
@@ -117,6 +117,7 @@ def copy_weights_hf_llama(
qkv_weights: Dict[int, List[Optional[NotYetLoadedTensor]]],
state_dict: Dict[str, torch.Tensor],
hf_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
tie_weights: bool = False,
saver: Optional[incremental_save] = None,
dtype: Optional[torch.dtype] = None,
) -> None:
@@ -144,7 +145,7 @@
"model.layers.{}.block_sparse_moe.experts.{}.w2.weight": "transformer.h.{l}.mlp.experts.{e}.proj.weight",
}
)
elif config._mlp_class == "LLaMAMLP":
elif config._mlp_class in ("LLaMAMLP", "GemmaMLP"):
weight_map.update(
{
"model.layers.{}.mlp.gate_proj.weight": "transformer.h.{l}.mlp.fc_1.weight",
@@ -179,6 +180,10 @@ def copy_weights_hf_llama(
param = saver.store_early(param)
state_dict[to_name] = param

if tie_weights:
state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"]

# convert separate q, k, v matrices into an interleaved qkv
for i, (q, k, v) in list(qkv_weights.items()):
if q is None or k is None or v is None:
# split across different .bin files
@@ -307,10 +312,11 @@ def convert_hf_checkpoint(

if "falcon" in model_name:
copy_fn = partial(copy_weights_falcon, model_name)
elif config._mlp_class in ("LLaMAMLP", "LLaMAMoE"):
elif config._mlp_class in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"):
# holder to reconstitute the split q, k, v
qkv_weights = {}
copy_fn = partial(copy_weights_hf_llama, config, qkv_weights)
tie_weights = "Gemma" in config.name
copy_fn = partial(copy_weights_hf_llama, config, qkv_weights, tie_weights=tie_weights)
elif "phi" in model_name:
# holder to reconstitute the split q, k, v
qkv_weights = {}
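Gemma's HF checkpoints tie the output head to the token embeddings, so there is no separate `lm_head.weight` tensor to copy; when `tie_weights` is set the converter points `lm_head.weight` at the already-copied `transformer.wte.weight`. A hedged sketch of the effect on a toy state dict (shapes are made up):

```python
import torch

# Toy stand-in for a partially converted Gemma checkpoint.
state_dict = {"transformer.wte.weight": torch.randn(256, 16)}

tie_weights = True  # set when "Gemma" is in the config name
if tie_weights:
    # The output head simply reuses the embedding matrix.
    state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"]

assert state_dict["lm_head.weight"] is state_dict["transformer.wte.weight"]
```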
10 changes: 7 additions & 3 deletions scripts/convert_lit_checkpoint.py
@@ -106,6 +106,7 @@ def copy_weights_llama(
config: Config,
state_dict: Dict[str, torch.Tensor],
lit_weights: Dict[str, Union[torch.Tensor, NotYetLoadedTensor]],
untie_weights: bool = False,
saver: Optional[incremental_save] = None,
) -> None:
weight_map = {
@@ -128,7 +129,7 @@
"transformer.h.{}.mlp.experts.{}.proj.weight": "model.layers.{l}.block_sparse_moe.experts.{e}.w2.weight",
}
)
elif config._mlp_class == "LLaMAMLP":
elif config._mlp_class in ("LLaMAMLP", "GemmaMLP"):
weight_map.update(
{
"transformer.h.{}.mlp.fc_1.weight": "model.layers.{l}.mlp.gate_proj.weight",
@@ -140,6 +141,8 @@
raise NotImplementedError

for name, param in lit_weights.items():
if name == "lm_head.weight" and untie_weights:
continue
if name.endswith(".attn.attn.weight"):
from_name, l = layer_template(name, 2)
q = "model.layers.{}.self_attn.q_proj.weight".format(l)
@@ -246,8 +249,9 @@ def convert_lit_checkpoint(checkpoint_path: Path, output_path: Path, config_path

if "falcon" in config.name:
copy_fn = partial(copy_weights_falcon, config.name)
elif config._mlp_class in ("LLaMAMLP", "LLaMAMoE"):
copy_fn = partial(copy_weights_llama, config)
elif config._mlp_class in ("LLaMAMLP", "GemmaMLP", "LLaMAMoE"):
untie_weights = "Gemma" in config.name
copy_fn = partial(copy_weights_llama, config, untie_weights=untie_weights)
elif "phi" in config.name:
copy_fn = partial(copy_weights_phi, config)
else:
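The reverse conversion mirrors the tying: since HF Gemma recreates `lm_head.weight` from the embeddings, the exported checkpoint must not contain it, so the copy loop skips that tensor when `untie_weights` is set. A short sketch of the skip on toy weights:

```python
import torch

# Toy Lit-GPT weights in which lm_head was tied to the embeddings at load time.
wte = torch.randn(256, 16)
lit_weights = {"transformer.wte.weight": wte, "lm_head.weight": wte}

untie_weights = True  # set when "Gemma" is in the config name
exported = {
    name: param
    for name, param in lit_weights.items()
    if not (name == "lm_head.weight" and untie_weights)
}
assert "lm_head.weight" not in exported  # HF re-ties it to the embeddings on load
```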
9 changes: 9 additions & 0 deletions tests/test_config.py
@@ -127,3 +127,12 @@ def test_from_checkpoint(tmp_path):
assert config.name == "pythia-14m"
assert config.block_size == 24
assert config.n_layer == 2


@pytest.mark.parametrize("head_size", [None, 128])
def test_head_size(head_size):
from lit_gpt import Config

config = Config(head_size=head_size)

assert config.head_size == (head_size or config.n_embd // config.n_head)