Merge pull request #2 from huggingface/add-llama3-convert-70b
Add llama3 convert 70b
pcuenca authored Apr 15, 2024
2 parents 3e4fac9 + 1f37d2b commit d95e60c
Showing 1 changed file with 5 additions and 3 deletions.
src/transformers/models/llama/convert_llama_weights_to_hf.py (8 changes: 5 additions & 3 deletions)
@@ -143,7 +143,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim):
     else:
         # Sharded
         loaded = [
-            torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu")
+            torch.load(os.path.join(input_base_path, f"consolidated.{i:01d}.pth"), map_location="cpu")
             for i in range(num_shards)
         ]
     param_count = 0
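
The only change in this hunk is the shard filename format: `{i:02d}` zero-pads the shard index to two digits, while `{i:01d}` does not, so the loader now expects single-digit names for the sharded checkpoints. A quick sanity check of the two format strings (shard count and filenames here are only illustrative):

    # assumption: shards named consolidated.<i>.pth live under input_base_path
    for i in range(3):
        print(f"consolidated.{i:02d}.pth")  # consolidated.00.pth, consolidated.01.pth, consolidated.02.pth
        print(f"consolidated.{i:01d}.pth")  # consolidated.0.pth, consolidated.1.pth, consolidated.2.pth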
@@ -190,7 +190,8 @@ def permute(w, n_heads, dim1=dim, dim2=dim):
                     for i in range(num_shards)
                 ],
                 dim=0,
-            ).reshape(dim, dim)
+            ).reshape(dim, dim),
+            n_heads=n_heads
         )
         state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute(
             torch.cat(
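
The added lines pass `n_heads` to `permute` by keyword; the signature in the hunk header (`def permute(w, n_heads, dim1=dim, dim2=dim)`) gives it no default, and the k_proj call below uses the key/value head count, which can differ from `n_heads` under grouped-query attention. For reference, a sketch of what the helper does, inferred from the signature shown in the hunk header rather than from this diff, so treat the body as an assumption:

    import torch

    def permute(w, n_heads, dim1, dim2):
        # Move interleaved rotary (cos/sin) pairs into the half-split layout
        # used by the Hugging Face Llama attention implementation.
        return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)

    # toy shapes: 2 heads, model dim 8
    print(permute(torch.arange(64.0).reshape(8, 8), n_heads=2, dim1=8, dim2=8).shape)  # torch.Size([8, 8])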
@@ -244,10 +245,11 @@ def permute(w, n_heads, dim1=dim, dim2=dim):
             "lm_head.weight": loaded["output.weight"],
         }
     else:
+        concat_dim = 0 if llama_version == 3 else 1
         state_dict = {
             "model.norm.weight": loaded[0]["norm.weight"],
             "model.embed_tokens.weight": torch.cat(
-                [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1
+                [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=concat_dim
             ),
             "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0),
         }
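
The new `concat_dim` switch implies that sharded Llama 3 checkpoints split `tok_embeddings.weight` along the vocabulary axis (dim 0), whereas the earlier layout split it along the hidden axis (dim 1); that is an inference from this diff, not something the commit states. A toy illustration of the two concatenations (all shapes invented):

    import torch

    # two shards of a (vocab=6, hidden=4) embedding table
    shards_dim1 = [torch.zeros(6, 2), torch.zeros(6, 2)]  # split along hidden dim (older layout)
    shards_dim0 = [torch.zeros(3, 4), torch.zeros(3, 4)]  # split along vocab dim (llama_version == 3)

    print(torch.cat(shards_dim1, dim=1).shape)  # torch.Size([6, 4])
    print(torch.cat(shards_dim0, dim=0).shape)  # torch.Size([6, 4])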
