From 0c6e4e6cd4902b5e192401902b3e04b77d94baed Mon Sep 17 00:00:00 2001
From: kirp
Date: Wed, 27 Sep 2023 05:47:05 -0400
Subject: [PATCH 1/4] remove falcon style rope

---
 llama2.mojo | 49 +++++++------------------------------------------
 1 file changed, 7 insertions(+), 42 deletions(-)

diff --git a/llama2.mojo b/llama2.mojo
index 102054e..e266b19 100644
--- a/llama2.mojo
+++ b/llama2.mojo
@@ -599,31 +599,6 @@ fn matmul(inout C: Matrix, A: Matrix, B: Matrix, rt: Runtime) -> None:
     matmul_parallelized(C, A, B, rt)


-# Apply RoPE rotation to the q and k vectors for each head
-# roate the first and second half
-fn rope_rotation_falcon(inout state: RunState, freq_cis_real_row: BufferPtrFloat32,
-        freq_cis_imag_row: BufferPtrFloat32, config: Config) -> None:
-    # tinyllama-1.1, llama model
-    let q = state.q.data
-    let k = state.k.data
-    let head_size = config.head_size
-    let off_rot = head_size // 2
-    for i in range(config.n_heads):
-        for j in range(config.head_size // 2):
-            let fcr = freq_cis_real_row.offset(j).load(0)
-            let fci = freq_cis_imag_row.offset(j).load(0)
-            let q0 = q.offset(i * head_size + j).load(0)
-            let q1 = q.offset(i * head_size + j + off_rot).load(0)
-            q.offset(i * head_size + j).store(0, q0 * fcr - q1 * fci)
-            q.offset(i * head_size + j + off_rot).store(0, q0 * fci + q1 * fcr)
-            if i < config.n_kv_heads:
-                let k0 = k.offset(i * head_size + j).load(0)
-                let k1 = k.offset(i * head_size + j + off_rot).load(0)
-                k.offset(i * head_size + j).store(0, k0 * fcr - k1 * fci)
-                k.offset(i * head_size + j + off_rot).store(
-                    0, k0 * fci + k1 * fcr
-                )
-
 # Apply RoPE rotation to the q and k vectors for each head
 # rotate odd and even dim
 fn rope_rotation_llama(inout state: RunState, freq_cis_real_row: BufferPtrFloat32,
@@ -632,28 +607,24 @@ fn rope_rotation_llama(inout state: RunState, freq_cis_real_row: BufferPtrFloat3
     let q = state.q.data
     let k = state.k.data
     let head_size = config.head_size
-    let off_rot = 1
     for i in range(config.n_heads):
         for j in range(0, config.head_size, 2):
             let fcr = freq_cis_real_row.offset(j // 2).load(0)
             let fci = freq_cis_imag_row.offset(j // 2).load(0)
             let q0 = q.offset(i * head_size + j).load(0)
-            let q1 = q.offset(i * head_size + j + off_rot).load(0)
+            let q1 = q.offset(i * head_size + j + 1).load(0)
             q.offset(i * head_size + j).store(0, q0 * fcr - q1 * fci)
-            q.offset(i * head_size + j + off_rot).store(0, q0 * fci + q1 * fcr)
+            q.offset(i * head_size + j + 1).store(0, q0 * fci + q1 * fcr)
             if i < config.n_kv_heads:
                 let k0 = k.offset(i * head_size + j).load(0)
-                let k1 = k.offset(i * head_size + j + off_rot).load(0)
+                let k1 = k.offset(i * head_size + j + 1).load(0)
                 k.offset(i * head_size + j).store(0, k0 * fcr - k1 * fci)
-                k.offset(i * head_size + j + off_rot).store(
+                k.offset(i * head_size + j + 1).store(
                     0, k0 * fci + k1 * fcr
                 )

 @always_inline
-fn transformer[
-    rope_rotation: fn (inout state: RunState, freq_cis_real_row: BufferPtrFloat32,
-        freq_cis_imag_row: BufferPtrFloat32, config: Config) -> None
-](
+fn transformer(
     token: Int,
     pos: Int,
     config: Config,
@@ -700,7 +671,7 @@ fn transformer[
     matmul(state.v, state.xb, tmpw, state.rt)

     # Apply RoPE rotation to the q and k vectors for each head
-    rope_rotation(state, freq_cis_real_row, freq_cis_imag_row, config)
+    rope_rotation_llama(state, freq_cis_real_row, freq_cis_imag_row, config)

     # Multihead attention. Iterate over all heads
     for h in range(config.n_heads):
@@ -885,7 +856,6 @@ fn print_usage():
     print(" -n number of steps to run for, default 256. 0 = max_seq_len")
     print(" -i input prompt")
     print(" -z tokenizer path")
-    print(" -r rope architecture, default 'llama' for llama rope, 'falcon' for falcon rope")


 fn main() raises:
@@ -897,7 +867,6 @@ fn main() raises:
     var steps = 256
     var prompt = String("")
     var rng_seed: Int = time.now()
-    var rope_arch = String("llama") # llama | falcon

     @parameter
     fn argparse() raises -> Int:
@@ -916,8 +885,6 @@ fn main() raises:
                rng_seed = atol(args[i + 1])
            if args[i] == "-i":
                prompt = args[i + 1]
-           if args[i] == "-r":
-               rope_arch = args[i + 1]
            if args[i] == "-t":
                let val = args[i + 1]
                temperature = 0.0
@@ -979,14 +946,12 @@ fn main() raises:
     var next_token = 0 # Will store the next token in the sequence
     # Initialize with token 1 (=BOS), as done in Llama-2 sentencepiece tokenizer
     var token = 1
-    let _transformer = transformer[rope_rotation_llama] if rope_arch == 'llama'
-        else transformer[rope_rotation_falcon]

     # Position in the sequence
     var pos = 0
     while pos < steps:
         # Forward the transformer to get logits for the next token
-        _transformer(token, pos, config, state, weights)
+        transformer(token, pos, config, state, weights)

         if pos < len(prompt_tokens):
             next_token = prompt_tokens[pos]
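
For reference, the retained rope_rotation_llama kernel rotates adjacent pairs (j, j+1) inside each head, whereas the removed falcon-style variant paired position j with j + head_size // 2. A plain-Python sketch of the retained behaviour, illustrative only and not part of the patch (q and k are assumed to be flat mutable sequences):

    def rope_rotation_llama_ref(q, k, fcr_row, fci_row, n_heads, n_kv_heads, head_size):
        # q: flat (n_heads * head_size) buffer; k: flat (n_kv_heads * head_size) buffer
        # fcr_row / fci_row: cos/sin rows for the current position, length head_size // 2
        for i in range(n_heads):
            for j in range(0, head_size, 2):
                fcr, fci = fcr_row[j // 2], fci_row[j // 2]
                q0, q1 = q[i * head_size + j], q[i * head_size + j + 1]
                q[i * head_size + j] = q0 * fcr - q1 * fci
                q[i * head_size + j + 1] = q0 * fci + q1 * fcr
                if i < n_kv_heads:
                    k0, k1 = k[i * head_size + j], k[i * head_size + j + 1]
                    k[i * head_size + j] = k0 * fcr - k1 * fci
                    k[i * head_size + j + 1] = k0 * fci + k1 * fcr
        return q, k
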
0 = max_seq_len") print(" -i input prompt") print(" -z tokenizer path") - print(" -r rope architecture, default 'llama' for llama rope, 'falcon' for falcon rope") fn main() raises: @@ -897,7 +867,6 @@ fn main() raises: var steps = 256 var prompt = String("") var rng_seed: Int = time.now() - var rope_arch = String("llama") # llama | falcon @parameter fn argparse() raises -> Int: @@ -916,8 +885,6 @@ fn main() raises: rng_seed = atol(args[i + 1]) if args[i] == "-i": prompt = args[i + 1] - if args[i] == "-r": - rope_arch = args[i + 1] if args[i] == "-t": let val = args[i + 1] temperature = 0.0 @@ -979,14 +946,12 @@ fn main() raises: var next_token = 0 # Will store the next token in the sequence # Initialize with token 1 (=BOS), as done in Llama-2 sentencepiece tokenizer var token = 1 - let _transformer = transformer[rope_rotation_llama] if rope_arch == 'llama' - else transformer[rope_rotation_falcon] # Position in the sequence var pos = 0 while pos < steps: # Forward the transformer to get logits for the next token - _transformer(token, pos, config, state, weights) + transformer(token, pos, config, state, weights) if pos < len(prompt_tokens): next_token = prompt_tokens[pos] From b602a81831b25965c3cfffa8a3b84937b080471e Mon Sep 17 00:00:00 2001 From: kirp Date: Wed, 27 Sep 2023 05:48:23 -0400 Subject: [PATCH 2/4] conver hf model to llama2.c format --- export.py | 472 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 472 insertions(+) create mode 100644 export.py diff --git a/export.py b/export.py new file mode 100644 index 0000000..ad1fdcd --- /dev/null +++ b/export.py @@ -0,0 +1,472 @@ +""" +This script has functions and utilties for model export. +Basically, we have a bunch of versions of the model, and we +want to export them to .bin files to be read from and inferenced in C. + +Among the "input" versions of PyTorch files/models: +- Official Llama 2 weights released by Meta +- Huggingface weights available on the hub +- llama2.c (this repo) trained models + +Among the "output" versions of .bin files: +- v0: Legacy files of the original llama2.c repo (will eventually be DEPRECATED) +- v1-vN: Improved .bin files with a proper header, cache alignment, etc. + +This script aspires to provide all of these conversions. +""" +import os +import gzip +import shutil +import struct +import argparse +import json +from pathlib import Path + +import numpy as np +import torch +from torch import nn + +from model import ModelArgs, Transformer + +# ----------------------------------------------------------------------------- +# common utilities + +def serialize_fp32(file, tensor): + """ writes one fp32 tensor to file that is open in wb mode """ + d = tensor.detach().cpu().view(-1).to(torch.float32).numpy() + b = struct.pack(f'{len(d)}f', *d) + file.write(b) + +def serialize_int8(file, tensor): + """ writes one int8 tensor to file that is open in wb mode """ + d = tensor.detach().cpu().view(-1).numpy().astype(np.int8) + b = struct.pack(f'{len(d)}b', *d) + file.write(b) + +def quantize_q80(w, group_size): + """ + takes a tensor and returns the Q8_0 quantized version + i.e. 
+
+# -----------------------------------------------------------------------------
+# legacy
+
+def legacy_export(model, filepath):
+    """ Original export of llama2.c bin files, i.e. version v0 """
+    out_file = open(filepath, 'wb')
+
+    # first write out the header
+    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
+    p = model.params
+    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
+    # legacy format uses negative/positive vocab size as a shared classifier flag
+    if not shared_classifier:
+        p.vocab_size = -p.vocab_size
+    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
+    header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
+                                    n_kv_heads, p.vocab_size, p.max_seq_len)
+    out_file.write(header)
+
+    # next write out the embedding weights
+    serialize_fp32(out_file, model.tok_embeddings.weight)
+
+    # now all the layers
+    # attention weights
+    for layer in model.layers:
+        serialize_fp32(out_file, layer.attention_norm.weight)
+    for layer in model.layers:
+        serialize_fp32(out_file, layer.attention.wq.weight)
+    for layer in model.layers:
+        serialize_fp32(out_file, layer.attention.wk.weight)
+    for layer in model.layers:
+        serialize_fp32(out_file, layer.attention.wv.weight)
+    for layer in model.layers:
+        serialize_fp32(out_file, layer.attention.wo.weight)
+    # ffn weights
+    for layer in model.layers:
+        serialize_fp32(out_file, layer.ffn_norm.weight)
+    for layer in model.layers:
+        serialize_fp32(out_file, layer.feed_forward.w1.weight)
+    for layer in model.layers:
+        serialize_fp32(out_file, layer.feed_forward.w2.weight)
+    for layer in model.layers:
+        serialize_fp32(out_file, layer.feed_forward.w3.weight)
+    # final rmsnorm
+    serialize_fp32(out_file, model.norm.weight)
+    # freqs_cis
+    serialize_fp32(out_file, model.freqs_cos[:p.max_seq_len])
+    serialize_fp32(out_file, model.freqs_sin[:p.max_seq_len])
+
+    # final classifier weights
+    if not shared_classifier:
+        serialize_fp32(out_file, model.output.weight)
+
+    # write to binary file
+    out_file.close()
+    print(f"wrote {filepath}")
+
+# -----------------------------------------------------------------------------
+# new version
+
+def version1_export(model, filepath):
+    """
+    Export the model weights in full float32 .bin file to be read from C.
+    This is same as legacy_export, but with a proper header.
+    """
+    version = 1
+
+    out_file = open(filepath, 'wb')
+    # first write out the header. the header will be 256 bytes
+    # 1) write magic, which will be uint32 of "ak42" in ASCII
+    out_file.write(struct.pack('I', 0x616b3432))
+    # 2) write version, which will be int
+    out_file.write(struct.pack('i', version))
+    # 3) write the params, which will be 7 ints
+    p = model.params
+    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
+    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
+    header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
+                                    n_kv_heads, p.vocab_size, p.max_seq_len)
+    out_file.write(header)
+    # 4) write some other flags
+    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
+    out_file.write(struct.pack('B', int(shared_classifier)))
+    pad = 256 - out_file.tell() # pad rest with zeros; tell returns current pos
+    assert pad >= 0
+    out_file.write(b'\0' * pad)
+
+    # now let's write out all the params
+    weights = [
+        *[layer.attention_norm.weight for layer in model.layers],
+        *[layer.ffn_norm.weight for layer in model.layers],
+        model.norm.weight,
+        model.tok_embeddings.weight,
+        *[layer.attention.wq.weight for layer in model.layers],
+        *[layer.attention.wk.weight for layer in model.layers],
+        *[layer.attention.wv.weight for layer in model.layers],
+        *[layer.attention.wo.weight for layer in model.layers],
+        *[layer.feed_forward.w1.weight for layer in model.layers],
+        *[layer.feed_forward.w2.weight for layer in model.layers],
+        *[layer.feed_forward.w3.weight for layer in model.layers],
+    ]
+    if not shared_classifier:
+        weights.append(model.output.weight)
+    for w in weights:
+        serialize_fp32(out_file, w)
+
+    # write to binary file
+    out_file.close()
+    print(f"wrote {filepath}")
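
The 256-byte header written above can be read back with plain struct.unpack; the offsets follow directly from the pack calls (4-byte magic, 4-byte version, seven int32 params, one flag byte). A sketch of a reader, with the file name as a placeholder:

    import struct

    with open("model_v1.bin", "rb") as f:
        header = f.read(256)
    magic, version = struct.unpack_from("Ii", header, 0)
    dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, max_seq_len = struct.unpack_from("7i", header, 8)
    shared_classifier = struct.unpack_from("B", header, 36)[0]
    assert magic == 0x616b3432 and version == 1
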
+
+def version2_export(model, filepath, group_size=64):
+    """
+    Export the model weights in Q8_0 into .bin file to be read from C.
+    That is:
+    - quantize all weights to symmetric int8, in range [-127, 127]
+    - all other tensors (the rmsnorm params) are kept and exported in fp32
+    - quantization is done in groups of group_size to reduce the effects of any outliers
+    """
+    version = 2
+
+    # let's first do some validation for this export type
+    while model.params.dim % group_size != 0:
+        group_size //= 2
+        print(f"BACKOFF: reducing group size to {group_size} to fit hidden_dim")
+    weights = [
+        model.tok_embeddings.weight,
+        *[layer.attention.wq.weight for layer in model.layers],
+        *[layer.attention.wk.weight for layer in model.layers],
+        *[layer.attention.wv.weight for layer in model.layers],
+        *[layer.attention.wo.weight for layer in model.layers],
+        *[layer.feed_forward.w1.weight for layer in model.layers],
+        *[layer.feed_forward.w2.weight for layer in model.layers],
+        *[layer.feed_forward.w3.weight for layer in model.layers],
+    ]
+    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
+    if not shared_classifier:
+        weights.append(model.output.weight)
+    for w in weights:
+        assert w.numel() % group_size == 0, f"weight {i} has numel {w.numel()}, not a multiple of group_size {group_size}"
+
+    # write
+    out_file = open(filepath, 'wb')
+    # first write out the header. the header will be 256 bytes
+    # 1) write magic, which will be uint32 of "ak42" in ASCII
+    out_file.write(struct.pack('I', 0x616b3432))
+    # 2) write version, which will be int
+    out_file.write(struct.pack('i', version))
+    # 3) write the params, which will be 7 ints
+    p = model.params
+    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
+    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
+    header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
+                                    n_kv_heads, p.vocab_size, p.max_seq_len)
+    out_file.write(header)
+    # 4) write some other flags
+    out_file.write(struct.pack('B', int(shared_classifier)))
+    out_file.write(struct.pack('i', group_size)) # group size used for quantization
+    pad = 256 - out_file.tell() # pad rest with zeros; tell returns current pos
+    assert pad >= 0
+    out_file.write(b'\0' * pad)
+    # now that the header is done, let's write out the model
+
+    # first let's write out all the params that we are keeping in fp32: the norms
+    for layer in model.layers: # attention norms
+        serialize_fp32(out_file, layer.attention_norm.weight)
+    for layer in model.layers: # MLP norms
+        serialize_fp32(out_file, layer.ffn_norm.weight)
+    serialize_fp32(out_file, model.norm.weight) # final pre-classifier norm
+
+    # now let's write out all the params that we are quantizing to Q8_0
+    # note we skip classifier weights, which are shared with the embedding
+    ew = []
+    scales = []
+    for i, w in enumerate(weights):
+        # quantize this weight
+        q, s, err = quantize_q80(w, group_size)
+        # save the int8 weights to file
+        serialize_int8(out_file, q) # save the tensor in int8
+        scales.append(s) # we'll do all the scales after all the qs
+        # logging
+        ew.append((err, w.shape))
+        print(f"{i+1}/{len(weights)} quantized {tuple(w.shape)} to Q8_0 with max error {err}")
+
+    # save the scaling factors in fp32 here
+    # this is done to keep all the weights contiquous, making pointer arithmetic easier in C
+    for s in scales:
+        serialize_fp32(out_file, s)
+
+    # print the highest error across all weights, should be very small, e.g. O(~0.001)
+    ew.sort(reverse=True)
+    print(f"max quantization group error across all weights: {ew[0][0]}")
+
+    # write to binary file
+    out_file.close()
+    print(f"wrote {filepath}")
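
Reading the Q8_0 payload back mirrors the loop above: for each quantized tensor the int8 groups come first and their fp32 scales follow, and each float is recovered as quant * scale. A minimal sketch under those assumptions (locating the right byte ranges in the file is left out):

    import numpy as np

    def dequantize_q80(int8_bytes, scale_bytes, group_size):
        # int8_bytes holds one tensor's serialized int8 groups, scale_bytes its fp32 group scales
        q = np.frombuffer(int8_bytes, dtype=np.int8).astype(np.float32).reshape(-1, group_size)
        s = np.frombuffer(scale_bytes, dtype=np.float32).reshape(-1, 1)
        return (q * s).reshape(-1)
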
+
+
+# -----------------------------------------------------------------------------
+# Load / import functions
+
+def load_checkpoint(checkpoint):
+
+    # load the provided model checkpoint
+    checkpoint_dict = torch.load(checkpoint, map_location='cpu')
+    gptconf = ModelArgs(**checkpoint_dict['model_args'])
+    model = Transformer(gptconf)
+    state_dict = checkpoint_dict['model']
+    unwanted_prefix = '_orig_mod.'
+    for k,v in list(state_dict.items()):
+        if k.startswith(unwanted_prefix):
+            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
+    model.load_state_dict(state_dict, strict=False)
+    model.eval()
+    return model
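
load_checkpoint expects the llama2.c training checkpoint layout: a dict with 'model_args' (keyword arguments for ModelArgs) and 'model' (the state dict), where keys may carry torch.compile's '_orig_mod.' prefix. A hypothetical sketch of writing a compatible checkpoint; the ModelArgs values shown are made up, not from the patch:

    import torch
    from model import ModelArgs, Transformer  # same training-repo modules export.py imports

    args = dict(dim=288, n_layers=6, n_heads=6)   # hypothetical ModelArgs values
    model = Transformer(ModelArgs(**args))
    torch.save({"model_args": args, "model": model.state_dict()}, "ckpt.pt")
    model = load_checkpoint("ckpt.pt")            # round-trips through the loader above
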
+
+def load_meta_model(model_path):
+    params_path = os.path.join(model_path, 'params.json')
+    with open(params_path) as f:
+        params = json.load(f)
+        print(params)
+
+    model_paths = sorted(list(Path(model_path).glob('consolidated.*.pth')))
+    models = [torch.load(p, map_location='cpu') for p in model_paths]
+
+    def concat_weights(models):
+        state_dict = {}
+        for name in list(models[0]):
+            tensors = [model[name] for model in models]
+            if len(tensors) == 1 or len(tensors[0].shape) == 1:
+                state_dict[name] = tensors[0]
+                continue
+            is_axis_1 = (
+                name.startswith('tok_embeddings.')
+                or name.endswith('.attention.wo.weight')
+                or name.endswith('.feed_forward.w2.weight')
+            )
+            axis = 1 if is_axis_1 else 0
+            state_dict[name] = torch.cat(tensors, dim=axis)
+            for model in models:
+                del model[name]
+        return state_dict
+
+    state_dict = concat_weights(models)
+    del models
+
+    # set ModelArgs
+    config = ModelArgs()
+    config.dim = params["dim"]
+    config.n_layers = params["n_layers"]
+    config.n_heads = params["n_heads"]
+    config.n_kv_heads = params.get('n_kv_heads') or params['n_heads']
+    config.multiple_of = params["multiple_of"]
+    config.norm_eps = params["norm_eps"]
+
+    config.vocab_size = state_dict['tok_embeddings.weight'].shape[0]
+    config.max_seq_len = 2048
+
+
+    # create a new Transformer object and set weights
+    model = Transformer(config)
+
+    model.tok_embeddings.weight = nn.Parameter(state_dict['tok_embeddings.weight'])
+    model.norm.weight = nn.Parameter(state_dict['norm.weight'])
+
+    for layer in model.layers:
+        i = layer.layer_id
+        layer.attention_norm.weight = nn.Parameter(state_dict[f'layers.{i}.attention_norm.weight'])
+        layer.attention.wq.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wq.weight'])
+        layer.attention.wk.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wk.weight'])
+        layer.attention.wv.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wv.weight'])
+        layer.attention.wo.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wo.weight'])
+        layer.ffn_norm.weight = nn.Parameter(state_dict[f'layers.{i}.ffn_norm.weight'])
+        layer.feed_forward.w1.weight = nn.Parameter(state_dict[f'layers.{i}.feed_forward.w1.weight'])
+        layer.feed_forward.w2.weight = nn.Parameter(state_dict[f'layers.{i}.feed_forward.w2.weight'])
+        layer.feed_forward.w3.weight = nn.Parameter(state_dict[f'layers.{i}.feed_forward.w3.weight'])
+
+    # final classifier
+    model.output.weight = nn.Parameter(state_dict['output.weight'])
+    model.eval()
+    return model
+
+def load_hf_model(model_path):
+
+    try:
+        from transformers import AutoModelForCausalLM
+    except ImportError:
+        print("Error: transformers package is required to load huggingface models")
+        print("Please run `pip install transformers` to install it")
+        return None
+
+    # load HF model
+    hf_model = AutoModelForCausalLM.from_pretrained(model_path)
+    hf_dict = hf_model.state_dict()
+
+    # convert LlamaConfig to ModelArgs
+    config = ModelArgs()
+    config.dim = hf_model.config.hidden_size
+    config.n_layers = hf_model.config.num_hidden_layers
+    config.n_heads = hf_model.config.num_attention_heads
+    config.n_kv_heads = hf_model.config.num_key_value_heads
+    config.vocab_size = hf_model.config.vocab_size
+    config.hidden_dim = hf_model.config.intermediate_size
+    config.norm_eps = hf_model.config.rms_norm_eps
+    config.max_seq_len = hf_model.config.max_position_embeddings
+    config.kv_dim = config.dim * config.n_kv_heads // config.n_heads
+
+    # create a new Transformer object and set weights
+    model = Transformer(config)
+
+    model.tok_embeddings.weight = nn.Parameter(hf_dict['model.embed_tokens.weight'])
+    model.norm.weight = nn.Parameter(hf_dict['model.norm.weight'])
+
+    # huggingface permutes WQ and WK, this function reverses it
+    def permute_reverse(w, n_heads=config.n_heads, dim1=config.dim, dim2=config.dim):
+        return w.view(n_heads, 2, dim1 // n_heads // 2, dim2).transpose(1, 2).reshape(dim1, dim2)
+
+    for layer in model.layers:
+        i = layer.layer_id
+        layer.attention_norm.weight = nn.Parameter(hf_dict[f'model.layers.{i}.input_layernorm.weight'])
+        layer.attention.wq.weight = nn.Parameter(permute_reverse(hf_dict[f'model.layers.{i}.self_attn.q_proj.weight']))
+        layer.attention.wk.weight = nn.Parameter(permute_reverse(hf_dict[f'model.layers.{i}.self_attn.k_proj.weight'], config.n_kv_heads, config.kv_dim, config.dim))
+        layer.attention.wv.weight = nn.Parameter(hf_dict[f'model.layers.{i}.self_attn.v_proj.weight'])
+        layer.attention.wo.weight = nn.Parameter(hf_dict[f'model.layers.{i}.self_attn.o_proj.weight'])
+        layer.ffn_norm.weight = nn.Parameter(hf_dict[f'model.layers.{i}.post_attention_layernorm.weight'])
+        layer.feed_forward.w1.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.gate_proj.weight'])
+        layer.feed_forward.w2.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.down_proj.weight'])
+        layer.feed_forward.w3.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.up_proj.weight'])
+
+    # final classifier
+    model.output.weight = nn.Parameter(hf_dict['lm_head.weight'])
+    model.eval()
+    return model
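
The permute_reverse helper undoes the head-wise reordering that the Huggingface conversion applies to the wq/wk matrices. A self-contained round-trip check; the forward permute shown here is an assumption about the HF layout, written as the mirror image of the helper above:

    import torch

    def permute(w, n_heads, dim1, dim2):          # assumed HF-side permutation
        return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)

    def permute_reverse(w, n_heads, dim1, dim2):  # copy of the helper in load_hf_model
        return w.view(n_heads, 2, dim1 // n_heads // 2, dim2).transpose(1, 2).reshape(dim1, dim2)

    dim, n_heads = 256, 8
    w = torch.randn(dim, dim)
    assert torch.equal(permute_reverse(permute(w, n_heads, dim, dim), n_heads, dim, dim), w)
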
+
+
+# -----------------------------------------------------------------------------
+# API entrypoint
+
+def model_export(model, filepath, version):
+    if version == 0:
+        legacy_export(model, filepath)
+    elif version == 1:
+        version1_export(model, filepath)
+    elif version == 2:
+        version2_export(model, filepath)
+    else:
+        raise ValueError(f"unknown version {version}")
+
+def torchscript_export(model, filepath, zero_params=False, gzip_output=False):
+    """
+    (This was submitted via a PR earlier. Leaving it here, but "orphaned" for now)
+    Saves the model as a TorchScript.
+    The resulting file can be loaded in C++ code and then used for training or
+    inference with:
+        #include <torch/script.h>
+        torch::jit::Module module = torch::jit::load("model.pt")
+    Note that the serialized model includes the initial parameters and with the default
+    ModelArgs the file is 59M and gzips down to 55M. If you want to serialize/distribute
+    the model parameters separately you can zero out the parameters before saving it and
+    it will gzip down to 780K.
+    """
+
+    # If requested zero params before saving the model. This is useful in
+    # conjunction with gzip_output.
+    if zero_params:
+        for p in model.parameters():
+            p.detach().zero_()
+
+    torch.jit.save(torch.jit.script(model), filepath)
+
+    if gzip_output:
+        with open(filepath, "rb") as f_in:
+            with gzip.open(f"{filepath}.gz", "wb") as f_out:
+                shutil.copyfileobj(f_in, f_out)
+        os.unlink(filepath)
+
+# -----------------------------------------------------------------------------
+# CLI entrypoint
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("filepath", type=str, help="the output filepath")
+    parser.add_argument("--version", default=0, type=int, help="the version to export with")
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("--checkpoint", type=str, help="model checkpoint, .pt file")
+    group.add_argument("--meta-llama", type=str, help="meta llama model path")
+    group.add_argument("--hf", type=str, help="huggingface model path")
+    args = parser.parse_args()
+
+    if args.checkpoint:
+        model = load_checkpoint(args.checkpoint)
+    elif args.meta_llama:
+        model = load_meta_model(args.meta_llama)
+    elif args.hf:
+        model = load_hf_model(args.hf)
+
+    if model is None:
+        parser.error("Can't load input model!")
+
+    # export
+    model_export(model, args.filepath, args.version)
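
With the script in place, a legacy-format .bin for llama2.mojo can also be produced from Python rather than through the CLI; version 0, the default, is the original llama2.c layout. A sketch with placeholder paths:

    from export import load_hf_model, model_export

    model = load_hf_model("path/to/hf-llama-checkpoint")  # placeholder HF checkpoint directory
    if model is not None:
        model_export(model, "tl-chat.bin", 0)  # version 0 = legacy llama2.c format
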
From a9010573e237905489a5e5a23c8c0f37ba162fd4 Mon Sep 17 00:00:00 2001
From: kirp
Date: Wed, 27 Sep 2023 19:11:07 -0400
Subject: [PATCH 3/4] update readme

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index eb5ca79..bc89968 100644
--- a/README.md
+++ b/README.md
@@ -168,7 +168,6 @@ Then, just run the Mojo

 ```bash
 mojo llama2.mojo tl-chat.bin \
-    -r falcon \
     -z tok_tl-chat.bin \
     -n 256 -t 0 -s 100 -i "<|im_start|>user\nGive me a python function to generate Fibonacci sequence<|im_end|>\n<|im_start|>assistant\n"
 ```

From 95b9b71257b0fe284be2c03e34c76f36135afd94 Mon Sep 17 00:00:00 2001
From: kirp
Date: Thu, 28 Sep 2023 19:18:21 -0400
Subject: [PATCH 4/4] .

---
 export.py | 472 ------------------------------------------------------
 1 file changed, 472 deletions(-)
 delete mode 100644 export.py

diff --git a/export.py b/export.py
deleted file mode 100644
index ad1fdcd..0000000
--- a/export.py
+++ /dev/null
@@ -1,472 +0,0 @@
-"""
-This script has functions and utilties for model export.
-Basically, we have a bunch of versions of the model, and we
-want to export them to .bin files to be read from and inferenced in C.
-
-Among the "input" versions of PyTorch files/models:
-- Official Llama 2 weights released by Meta
-- Huggingface weights available on the hub
-- llama2.c (this repo) trained models
-
-Among the "output" versions of .bin files:
-- v0: Legacy files of the original llama2.c repo (will eventually be DEPRECATED)
-- v1-vN: Improved .bin files with a proper header, cache alignment, etc.
-
-This script aspires to provide all of these conversions.
-"""
-import os
-import gzip
-import shutil
-import struct
-import argparse
-import json
-from pathlib import Path
-
-import numpy as np
-import torch
-from torch import nn
-
-from model import ModelArgs, Transformer
-
-# -----------------------------------------------------------------------------
-# common utilities
-
-def serialize_fp32(file, tensor):
-    """ writes one fp32 tensor to file that is open in wb mode """
-    d = tensor.detach().cpu().view(-1).to(torch.float32).numpy()
-    b = struct.pack(f'{len(d)}f', *d)
-    file.write(b)
-
-def serialize_int8(file, tensor):
-    """ writes one int8 tensor to file that is open in wb mode """
-    d = tensor.detach().cpu().view(-1).numpy().astype(np.int8)
-    b = struct.pack(f'{len(d)}b', *d)
-    file.write(b)
-
-def quantize_q80(w, group_size):
-    """
-    takes a tensor and returns the Q8_0 quantized version
-    i.e. symmetric quantization into int8, range [-127,127]
-    """
-    assert w.numel() % group_size == 0
-    ori_shape = w.shape
-    w = w.float() # convert to float32
-    w = w.reshape(-1, group_size)
-    # find the max in each group
-    wmax = torch.abs(w).max(dim=1).values
-    # calculate the scaling factor such that float = quant * scale
-    scale = wmax / 127.0
-    # scale into range [-127, 127]
-    quant = w / scale[:,None]
-    # round to nearest integer
-    int8val = torch.round(quant).to(torch.int8)
-    # dequantize by rescaling
-    fp32val = (int8val.float() * scale[:,None]).view(-1)
-    fp32valr = fp32val.reshape(-1, group_size)
-    # calculate the max error in each group
-    err = torch.abs(fp32valr - w).max(dim=1).values
-    # find the max error across all groups
-    maxerr = err.max().item()
-    return int8val, scale, maxerr
-
-# -----------------------------------------------------------------------------
-# legacy
-
-def legacy_export(model, filepath):
-    """ Original export of llama2.c bin files, i.e. version v0 """
-    out_file = open(filepath, 'wb')
-
-    # first write out the header
-    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
-    p = model.params
-    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
-    # legacy format uses negative/positive vocab size as a shared classifier flag
-    if not shared_classifier:
-        p.vocab_size = -p.vocab_size
-    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
-    header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
-                                    n_kv_heads, p.vocab_size, p.max_seq_len)
-    out_file.write(header)
-
-    # next write out the embedding weights
-    serialize_fp32(out_file, model.tok_embeddings.weight)
-
-    # now all the layers
-    # attention weights
-    for layer in model.layers:
-        serialize_fp32(out_file, layer.attention_norm.weight)
-    for layer in model.layers:
-        serialize_fp32(out_file, layer.attention.wq.weight)
-    for layer in model.layers:
-        serialize_fp32(out_file, layer.attention.wk.weight)
-    for layer in model.layers:
-        serialize_fp32(out_file, layer.attention.wv.weight)
-    for layer in model.layers:
-        serialize_fp32(out_file, layer.attention.wo.weight)
-    # ffn weights
-    for layer in model.layers:
-        serialize_fp32(out_file, layer.ffn_norm.weight)
-    for layer in model.layers:
-        serialize_fp32(out_file, layer.feed_forward.w1.weight)
-    for layer in model.layers:
-        serialize_fp32(out_file, layer.feed_forward.w2.weight)
-    for layer in model.layers:
-        serialize_fp32(out_file, layer.feed_forward.w3.weight)
-    # final rmsnorm
-    serialize_fp32(out_file, model.norm.weight)
-    # freqs_cis
-    serialize_fp32(out_file, model.freqs_cos[:p.max_seq_len])
-    serialize_fp32(out_file, model.freqs_sin[:p.max_seq_len])
-
-    # final classifier weights
-    if not shared_classifier:
-        serialize_fp32(out_file, model.output.weight)
-
-    # write to binary file
-    out_file.close()
-    print(f"wrote {filepath}")
-
-# -----------------------------------------------------------------------------
-# new version
-
-def version1_export(model, filepath):
-    """
-    Export the model weights in full float32 .bin file to be read from C.
-    This is same as legacy_export, but with a proper header.
-    """
-    version = 1
-
-    out_file = open(filepath, 'wb')
-    # first write out the header. the header will be 256 bytes
-    # 1) write magic, which will be uint32 of "ak42" in ASCII
-    out_file.write(struct.pack('I', 0x616b3432))
-    # 2) write version, which will be int
-    out_file.write(struct.pack('i', version))
-    # 3) write the params, which will be 7 ints
-    p = model.params
-    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
-    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
-    header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
-                                    n_kv_heads, p.vocab_size, p.max_seq_len)
-    out_file.write(header)
-    # 4) write some other flags
-    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
-    out_file.write(struct.pack('B', int(shared_classifier)))
-    pad = 256 - out_file.tell() # pad rest with zeros; tell returns current pos
-    assert pad >= 0
-    out_file.write(b'\0' * pad)
-
-    # now let's write out all the params
-    weights = [
-        *[layer.attention_norm.weight for layer in model.layers],
-        *[layer.ffn_norm.weight for layer in model.layers],
-        model.norm.weight,
-        model.tok_embeddings.weight,
-        *[layer.attention.wq.weight for layer in model.layers],
-        *[layer.attention.wk.weight for layer in model.layers],
-        *[layer.attention.wv.weight for layer in model.layers],
-        *[layer.attention.wo.weight for layer in model.layers],
-        *[layer.feed_forward.w1.weight for layer in model.layers],
-        *[layer.feed_forward.w2.weight for layer in model.layers],
-        *[layer.feed_forward.w3.weight for layer in model.layers],
-    ]
-    if not shared_classifier:
-        weights.append(model.output.weight)
-    for w in weights:
-        serialize_fp32(out_file, w)
-
-    # write to binary file
-    out_file.close()
-    print(f"wrote {filepath}")
-
-def version2_export(model, filepath, group_size=64):
-    """
-    Export the model weights in Q8_0 into .bin file to be read from C.
-    That is:
-    - quantize all weights to symmetric int8, in range [-127, 127]
-    - all other tensors (the rmsnorm params) are kept and exported in fp32
-    - quantization is done in groups of group_size to reduce the effects of any outliers
-    """
-    version = 2
-
-    # let's first do some validation for this export type
-    while model.params.dim % group_size != 0:
-        group_size //= 2
-        print(f"BACKOFF: reducing group size to {group_size} to fit hidden_dim")
-    weights = [
-        model.tok_embeddings.weight,
-        *[layer.attention.wq.weight for layer in model.layers],
-        *[layer.attention.wk.weight for layer in model.layers],
-        *[layer.attention.wv.weight for layer in model.layers],
-        *[layer.attention.wo.weight for layer in model.layers],
-        *[layer.feed_forward.w1.weight for layer in model.layers],
-        *[layer.feed_forward.w2.weight for layer in model.layers],
-        *[layer.feed_forward.w3.weight for layer in model.layers],
-    ]
-    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
-    if not shared_classifier:
-        weights.append(model.output.weight)
-    for w in weights:
-        assert w.numel() % group_size == 0, f"weight {i} has numel {w.numel()}, not a multiple of group_size {group_size}"
-
-    # write
-    out_file = open(filepath, 'wb')
-    # first write out the header. the header will be 256 bytes
-    # 1) write magic, which will be uint32 of "ak42" in ASCII
-    out_file.write(struct.pack('I', 0x616b3432))
-    # 2) write version, which will be int
-    out_file.write(struct.pack('i', version))
-    # 3) write the params, which will be 7 ints
-    p = model.params
-    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
-    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
-    header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
-                                    n_kv_heads, p.vocab_size, p.max_seq_len)
-    out_file.write(header)
-    # 4) write some other flags
-    out_file.write(struct.pack('B', int(shared_classifier)))
-    out_file.write(struct.pack('i', group_size)) # group size used for quantization
-    pad = 256 - out_file.tell() # pad rest with zeros; tell returns current pos
-    assert pad >= 0
-    out_file.write(b'\0' * pad)
-    # now that the header is done, let's write out the model
-
-    # first let's write out all the params that we are keeping in fp32: the norms
-    for layer in model.layers: # attention norms
-        serialize_fp32(out_file, layer.attention_norm.weight)
-    for layer in model.layers: # MLP norms
-        serialize_fp32(out_file, layer.ffn_norm.weight)
-    serialize_fp32(out_file, model.norm.weight) # final pre-classifier norm
-
-    # now let's write out all the params that we are quantizing to Q8_0
-    # note we skip classifier weights, which are shared with the embedding
-    ew = []
-    scales = []
-    for i, w in enumerate(weights):
-        # quantize this weight
-        q, s, err = quantize_q80(w, group_size)
-        # save the int8 weights to file
-        serialize_int8(out_file, q) # save the tensor in int8
-        scales.append(s) # we'll do all the scales after all the qs
-        # logging
-        ew.append((err, w.shape))
-        print(f"{i+1}/{len(weights)} quantized {tuple(w.shape)} to Q8_0 with max error {err}")
-
-    # save the scaling factors in fp32 here
-    # this is done to keep all the weights contiquous, making pointer arithmetic easier in C
-    for s in scales:
-        serialize_fp32(out_file, s)
-
-    # print the highest error across all weights, should be very small, e.g. O(~0.001)
-    ew.sort(reverse=True)
-    print(f"max quantization group error across all weights: {ew[0][0]}")
-
-    # write to binary file
-    out_file.close()
-    print(f"wrote {filepath}")
-
-
-# -----------------------------------------------------------------------------
-# Load / import functions
-
-def load_checkpoint(checkpoint):
-
-    # load the provided model checkpoint
-    checkpoint_dict = torch.load(checkpoint, map_location='cpu')
-    gptconf = ModelArgs(**checkpoint_dict['model_args'])
-    model = Transformer(gptconf)
-    state_dict = checkpoint_dict['model']
-    unwanted_prefix = '_orig_mod.'
-    for k,v in list(state_dict.items()):
-        if k.startswith(unwanted_prefix):
-            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
-    model.load_state_dict(state_dict, strict=False)
-    model.eval()
-    return model
-
-def load_meta_model(model_path):
-    params_path = os.path.join(model_path, 'params.json')
-    with open(params_path) as f:
-        params = json.load(f)
-        print(params)
-
-    model_paths = sorted(list(Path(model_path).glob('consolidated.*.pth')))
-    models = [torch.load(p, map_location='cpu') for p in model_paths]
-
-    def concat_weights(models):
-        state_dict = {}
-        for name in list(models[0]):
-            tensors = [model[name] for model in models]
-            if len(tensors) == 1 or len(tensors[0].shape) == 1:
-                state_dict[name] = tensors[0]
-                continue
-            is_axis_1 = (
-                name.startswith('tok_embeddings.')
-                or name.endswith('.attention.wo.weight')
-                or name.endswith('.feed_forward.w2.weight')
-            )
-            axis = 1 if is_axis_1 else 0
-            state_dict[name] = torch.cat(tensors, dim=axis)
-            for model in models:
-                del model[name]
-        return state_dict
-
-    state_dict = concat_weights(models)
-    del models
-
-    # set ModelArgs
-    config = ModelArgs()
-    config.dim = params["dim"]
-    config.n_layers = params["n_layers"]
-    config.n_heads = params["n_heads"]
-    config.n_kv_heads = params.get('n_kv_heads') or params['n_heads']
-    config.multiple_of = params["multiple_of"]
-    config.norm_eps = params["norm_eps"]
-
-    config.vocab_size = state_dict['tok_embeddings.weight'].shape[0]
-    config.max_seq_len = 2048
-
-
-    # create a new Transformer object and set weights
-    model = Transformer(config)
-
-    model.tok_embeddings.weight = nn.Parameter(state_dict['tok_embeddings.weight'])
-    model.norm.weight = nn.Parameter(state_dict['norm.weight'])
-
-    for layer in model.layers:
-        i = layer.layer_id
-        layer.attention_norm.weight = nn.Parameter(state_dict[f'layers.{i}.attention_norm.weight'])
-        layer.attention.wq.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wq.weight'])
-        layer.attention.wk.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wk.weight'])
-        layer.attention.wv.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wv.weight'])
-        layer.attention.wo.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wo.weight'])
-        layer.ffn_norm.weight = nn.Parameter(state_dict[f'layers.{i}.ffn_norm.weight'])
-        layer.feed_forward.w1.weight = nn.Parameter(state_dict[f'layers.{i}.feed_forward.w1.weight'])
-        layer.feed_forward.w2.weight = nn.Parameter(state_dict[f'layers.{i}.feed_forward.w2.weight'])
-        layer.feed_forward.w3.weight = nn.Parameter(state_dict[f'layers.{i}.feed_forward.w3.weight'])
-
-    # final classifier
-    model.output.weight = nn.Parameter(state_dict['output.weight'])
-    model.eval()
-    return model
-
-def load_hf_model(model_path):
-
-    try:
-        from transformers import AutoModelForCausalLM
-    except ImportError:
-        print("Error: transformers package is required to load huggingface models")
-        print("Please run `pip install transformers` to install it")
-        return None
-
-    # load HF model
-    hf_model = AutoModelForCausalLM.from_pretrained(model_path)
-    hf_dict = hf_model.state_dict()
-
-    # convert LlamaConfig to ModelArgs
-    config = ModelArgs()
-    config.dim = hf_model.config.hidden_size
-    config.n_layers = hf_model.config.num_hidden_layers
-    config.n_heads = hf_model.config.num_attention_heads
-    config.n_kv_heads = hf_model.config.num_key_value_heads
-    config.vocab_size = hf_model.config.vocab_size
-    config.hidden_dim = hf_model.config.intermediate_size
-    config.norm_eps = hf_model.config.rms_norm_eps
-    config.max_seq_len = hf_model.config.max_position_embeddings
-    config.kv_dim = config.dim * config.n_kv_heads // config.n_heads
-
-    # create a new Transformer object and set weights
-    model = Transformer(config)
-
-    model.tok_embeddings.weight = nn.Parameter(hf_dict['model.embed_tokens.weight'])
-    model.norm.weight = nn.Parameter(hf_dict['model.norm.weight'])
-
-    # huggingface permutes WQ and WK, this function reverses it
-    def permute_reverse(w, n_heads=config.n_heads, dim1=config.dim, dim2=config.dim):
-        return w.view(n_heads, 2, dim1 // n_heads // 2, dim2).transpose(1, 2).reshape(dim1, dim2)
-
-    for layer in model.layers:
-        i = layer.layer_id
-        layer.attention_norm.weight = nn.Parameter(hf_dict[f'model.layers.{i}.input_layernorm.weight'])
-        layer.attention.wq.weight = nn.Parameter(permute_reverse(hf_dict[f'model.layers.{i}.self_attn.q_proj.weight']))
-        layer.attention.wk.weight = nn.Parameter(permute_reverse(hf_dict[f'model.layers.{i}.self_attn.k_proj.weight'], config.n_kv_heads, config.kv_dim, config.dim))
-        layer.attention.wv.weight = nn.Parameter(hf_dict[f'model.layers.{i}.self_attn.v_proj.weight'])
-        layer.attention.wo.weight = nn.Parameter(hf_dict[f'model.layers.{i}.self_attn.o_proj.weight'])
-        layer.ffn_norm.weight = nn.Parameter(hf_dict[f'model.layers.{i}.post_attention_layernorm.weight'])
-        layer.feed_forward.w1.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.gate_proj.weight'])
-        layer.feed_forward.w2.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.down_proj.weight'])
-        layer.feed_forward.w3.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.up_proj.weight'])
-
-    # final classifier
-    model.output.weight = nn.Parameter(hf_dict['lm_head.weight'])
-    model.eval()
-    return model
-
-
-# -----------------------------------------------------------------------------
-# API entrypoint
-
-def model_export(model, filepath, version):
-    if version == 0:
-        legacy_export(model, filepath)
-    elif version == 1:
-        version1_export(model, filepath)
-    elif version == 2:
-        version2_export(model, filepath)
-    else:
-        raise ValueError(f"unknown version {version}")
-
-def torchscript_export(model, filepath, zero_params=False, gzip_output=False):
-    """
-    (This was submitted via a PR earlier. Leaving it here, but "orphaned" for now)
-    Saves the model as a TorchScript.
-    The resulting file can be loaded in C++ code and then used for training or
-    inference with:
-        #include <torch/script.h>
-        torch::jit::Module module = torch::jit::load("model.pt")
-    Note that the serialized model includes the initial parameters and with the default
-    ModelArgs the file is 59M and gzips down to 55M. If you want to serialize/distribute
-    the model parameters separately you can zero out the parameters before saving it and
-    it will gzip down to 780K.
-    """
-
-    # If requested zero params before saving the model. This is useful in
-    # conjunction with gzip_output.
-    if zero_params:
-        for p in model.parameters():
-            p.detach().zero_()
-
-    torch.jit.save(torch.jit.script(model), filepath)
-
-    if gzip_output:
-        with open(filepath, "rb") as f_in:
-            with gzip.open(f"{filepath}.gz", "wb") as f_out:
-                shutil.copyfileobj(f_in, f_out)
-        os.unlink(filepath)
-
-# -----------------------------------------------------------------------------
-# CLI entrypoint
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("filepath", type=str, help="the output filepath")
-    parser.add_argument("--version", default=0, type=int, help="the version to export with")
-    group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument("--checkpoint", type=str, help="model checkpoint, .pt file")
-    group.add_argument("--meta-llama", type=str, help="meta llama model path")
-    group.add_argument("--hf", type=str, help="huggingface model path")
-    args = parser.parse_args()
-
-    if args.checkpoint:
-        model = load_checkpoint(args.checkpoint)
-    elif args.meta_llama:
-        model = load_meta_model(args.meta_llama)
-    elif args.hf:
-        model = load_hf_model(args.hf)
-
-    if model is None:
-        parser.error("Can't load input model!")
-
-    # export
-    model_export(model, args.filepath, args.version)
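
For completeness, the legacy v0 header written by legacy_export (and presumably the layout llama2.mojo's config loader reads) is just seven int32 values, with the sign of vocab_size doubling as the shared-classifier flag. A sketch of reading it back, with the file name as a placeholder:

    import struct

    with open("tl-chat.bin", "rb") as f:
        dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, max_seq_len = struct.unpack("7i", f.read(28))
    shared_classifier = vocab_size > 0  # legacy format encodes the flag in the sign
    vocab_size = abs(vocab_size)
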