Add Gemma checkpoint support #941

Merged: 37 commits, merged on Feb 23, 2024

Changes from 1 commit

Commits (37)
034afac
gemma
rasbt Feb 21, 2024
d26ab02
add docs
rasbt Feb 21, 2024
5c1b029
update query head config
rasbt Feb 21, 2024
d77bd4a
apply keras geglu workaround
rasbt Feb 21, 2024
5ae2ad6
Carlos
carmocca Feb 21, 2024
e04be8a
An unfinished, but working 2b variant.
Andrei-Aksionov Feb 21, 2024
e61c449
Gemma-7b now works.
Andrei-Aksionov Feb 22, 2024
9a1f23c
add instruction-finetuned version
rasbt Feb 22, 2024
3ec329f
A test for config to check head_size
Andrei-Aksionov Feb 22, 2024
29a00c2
Update Gemma config
Andrei-Aksionov Feb 22, 2024
bf9d711
Adapter_v2 and LoRA: attn.proj size is head_size * num_heads
Andrei-Aksionov Feb 22, 2024
072c9f6
Adapter_v2 and LoRA: GemmaMLP class
Andrei-Aksionov Feb 22, 2024
bed71f1
RMSNorm: unit offset is configurable
Andrei-Aksionov Feb 22, 2024
ec7d01e
Configurable wte output scaling
Andrei-Aksionov Feb 22, 2024
bd0864c
Update tests to support changes in Config class
Andrei-Aksionov Feb 22, 2024
bf8c9b5
Test for Gemma
Andrei-Aksionov Feb 22, 2024
c78bd6e
convert_hf: reuse llama copy function
Andrei-Aksionov Feb 22, 2024
50ad509
Test Gemma model: use llama weights copying
Andrei-Aksionov Feb 22, 2024
d98df3b
Update convert_lit + test
Andrei-Aksionov Feb 22, 2024
159df75
Merge branch 'main' into gemma
Andrei-Aksionov Feb 22, 2024
628c7bc
Restore accidentally deleted comment line
Andrei-Aksionov Feb 22, 2024
1a2f9f8
Prompt for Gemma it (instruct models)
Andrei-Aksionov Feb 22, 2024
d002695
RMSNorm: reduce computations when self.add_unit_offset is False
Andrei-Aksionov Feb 23, 2024
c915d57
Auto markdown formatting
Andrei-Aksionov Feb 23, 2024
32260e6
Drop `tie_weights` in convert_hf
Andrei-Aksionov Feb 23, 2024
78ad643
Comment explaining why head_size*num_head instead of n_embd
Andrei-Aksionov Feb 23, 2024
6f154ab
scale_wte_output --> scale_embeddings
Andrei-Aksionov Feb 23, 2024
cfe68bb
Config: drop `self.rmsnorm_add_unit_offset`
Andrei-Aksionov Feb 23, 2024
57db710
Comment on why a unit offset is needed in RMSNorm
Andrei-Aksionov Feb 23, 2024
4c44085
Bump up min version of transformers in github CI
Andrei-Aksionov Feb 23, 2024
c1dc9d2
Merge branch 'main' into gemma
Andrei-Aksionov Feb 23, 2024
0fe9b3f
Merge branch 'main' into gemma
Andrei-Aksionov Feb 23, 2024
e9b0c5a
Update convert_hf test
Andrei-Aksionov Feb 23, 2024
d17bb34
Update lit_gpt/adapter_v2.py
Andrei-Aksionov Feb 23, 2024
86b5b7a
Update lit_gpt/lora.py
Andrei-Aksionov Feb 23, 2024
8854d14
Update lit_gpt/model.py
Andrei-Aksionov Feb 23, 2024
d219965
Bump up min transformers version in Azure workflow
Andrei-Aksionov Feb 23, 2024
43 changes: 43 additions & 0 deletions lit_gpt/config.py
@@ -781,6 +781,49 @@ def norm_class(self) -> Type:
        configs.append(copy)


###############
# Google Gemma
###############
gemma = [
    # https://huggingface.co/google/gemma-7b/blob/main/config.json
    dict(
        name="Gemma-7b-hf",
        hf_config=dict(org="google", name="gemma-7b"),
        vocab_size=256000,
        padding_multiple=64,
        n_embd=3072,
        n_layer=28,
        n_head=16,
        n_query_groups=1,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="GemmaMLP",
        intermediate_size=24576,
    ),
    # https://huggingface.co/google/gemma-2b/blob/main/config.json
    dict(
        name="Gemma-2b-hf",
        hf_config=dict(org="google", name="gemma-2b"),
        vocab_size=256000,
        padding_multiple=64,
        n_embd=2048,
        n_layer=18,
        n_head=8,
        n_query_groups=1,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="GemmaMLP",
        intermediate_size=16384,
    ),
]
configs.extend(gemma)



##########################
# Stability AI FreeWilly2
##########################
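For orientation, here is a minimal usage sketch of how one of the new entries could be looked up by name. It assumes the existing `Config.from_name` helper in `lit_gpt/config.py` and only restates values from the dicts above:

```python
# Minimal sketch, assuming Config.from_name from lit_gpt/config.py on main.
from lit_gpt.config import Config

config = Config.from_name("Gemma-2b-hf")

# Values restated from the "Gemma-2b-hf" dict above.
assert config.vocab_size == 256000
assert config.n_query_groups == 1       # multi-query attention: a single shared KV head group
assert config._mlp_class == "GemmaMLP"  # resolved to the new MLP class added in lit_gpt/model.py
```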
34 changes: 34 additions & 0 deletions lit_gpt/model.py
@@ -11,6 +11,7 @@

import torch
import torch.nn as nn
from torch import Tensor
from typing_extensions import Self

from lit_gpt.config import Config
@@ -290,6 +291,39 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.proj(x)


class GEGLU(nn.Module):
"""
Source: https://github.com/pfnet-research/deep-table/blob/237c8be8a405349ce6ab78075234c60d9bfe60b7/deep_table/nn/layers/activation.py#L22
License: MIT, https://github.com/pfnet-research/deep-table/blob/237c8be8a405349ce6ab78075234c60d9bfe60b7/LICENSE
References:
Shazeer et al., "GLU Variants Improve Transformer," 2020.
https://arxiv.org/abs/2002.05202
"""

def geglu(self, x: Tensor) -> Tensor:
assert x.shape[-1] % 2 == 0
a, b = x.chunk(2, dim=-1)
return a * torch.nn.functional.gelu(b)

def forward(self, x: Tensor) -> Tensor:
return self.geglu(x)


class GemmaMLP(nn.Module):
    def __init__(self, config: Config) -> None:
        super().__init__()
        self.fc_1 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
        self.fc_2 = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias)
        self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias)
        # GEGLU(); note that `approximate` takes the string "tanh" (or "none"), not a bool
        self.geglu = torch.nn.GELU(approximate="tanh")

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x_fc_1 = self.fc_1(x)
        x_fc_2 = self.fc_2(x)
        x = self.geglu(x_fc_1) * x_fc_2
        return self.proj(x)


class LLaMAMoE(nn.Module):
    def __init__(self, config: Config) -> None:
        super().__init__()
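As a side note on the two formulations above: `GEGLU` gates one fused projection by chunking it in half, while `GemmaMLP.forward` applies GELU to the `fc_1` branch and multiplies by the `fc_2` branch. The toy check below (hypothetical shapes, not part of the PR) shows the two are equivalent when the fused projection stacks the two weight matrices and the same GELU approximation is used on both sides:

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
x = torch.randn(2, 8)    # (batch, n_embd), toy sizes
w1 = torch.randn(32, 8)  # fc_1 weight: n_embd -> intermediate_size
w2 = torch.randn(32, 8)  # fc_2 weight: n_embd -> intermediate_size

# GemmaMLP-style gating: GELU on the fc_1 branch, multiplied by the fc_2 branch.
out_split = F.gelu(x @ w1.T, approximate="tanh") * (x @ w2.T)

# GEGLU-style gating: one fused projection, chunked into (value, gate) halves.
# GEGLU applies GELU to the second chunk, so fc_2's weights go first in the fusion.
fused = x @ torch.cat([w2, w1], dim=0).T
a, b = fused.chunk(2, dim=-1)
out_fused = a * F.gelu(b, approximate="tanh")

torch.testing.assert_close(out_split, out_fused)
```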
9 changes: 7 additions & 2 deletions scripts/convert_hf_checkpoint.py
@@ -138,7 +138,7 @@ def copy_weights_hf_llama(
"model.layers.{}.block_sparse_moe.experts.{}.w3.weight": "transformer.h.{l}.mlp.experts.{e}.fc_2.weight",
"model.layers.{}.block_sparse_moe.experts.{}.w2.weight": "transformer.h.{l}.mlp.experts.{e}.proj.weight",
})
elif config._mlp_class == "LLaMAMLP":
elif config._mlp_class in ("LLaMAMLP", "GemmaMLP"):
weight_map.update({
"model.layers.{}.mlp.gate_proj.weight": "transformer.h.{l}.mlp.fc_1.weight",
"model.layers.{}.mlp.up_proj.weight": "transformer.h.{l}.mlp.fc_2.weight",
@@ -171,6 +171,11 @@ def copy_weights_hf_llama(
            param = saver.store_early(param)
        state_dict[to_name] = param

    # If model uses weight tying:
    if "lm_head.weight" not in state_dict.keys():
        state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"]

    for i, (q, k, v) in list(qkv_weights.items()):
        if q is None or k is None or v is None:
            # split across different .bin files
@@ -299,7 +304,7 @@ def convert_hf_checkpoint(

if "falcon" in model_name:
copy_fn = partial(copy_weights_falcon, model_name)
elif config._mlp_class in ("LLaMAMLP", "LLaMAMoE"):
elif config._mlp_class in ("LLaMAMLP", "LLaMAMoE", "GemmaMLP"):
# holder to reconstitute the split q, k, v
qkv_weights = {}
copy_fn = partial(copy_weights_hf_llama, config, qkv_weights)
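A toy illustration of the weight-tying fallback added to `copy_weights_hf_llama` (hypothetical tensors and shapes; the real code operates on the converted HF state dict): if the checkpoint ships no separate `lm_head.weight`, the embedding matrix is reused for the output head.

```python
import torch

# Hypothetical converted state dict that only contains the embedding matrix.
state_dict = {"transformer.wte.weight": torch.randn(256000, 2048)}

# Same fallback as in the diff above.
if "lm_head.weight" not in state_dict:
    state_dict["lm_head.weight"] = state_dict["transformer.wte.weight"]

# The head now shares storage with the embeddings, i.e. the weights are tied.
assert state_dict["lm_head.weight"] is state_dict["transformer.wte.weight"]
```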