8-bit Quantization #298

Open · wants to merge 31 commits into base: master
Commits (31)
f1a7235  no gitignore (Aug 10, 2023)
22bda2e  adding initial weight exploration (Aug 10, 2023)
84ac19d  adding weight hist plots (Aug 10, 2023)
f5d08d2  adding example hist to readme (Aug 10, 2023)
ea9b018  adding quantize and runq (Aug 11, 2023)
6251785  update readme with model sizes (Aug 11, 2023)
f1aa0ff  use fabs(.) instead of max (Aug 11, 2023)
f45187d  speedup dequant_ints (kroggen, Aug 12, 2023)
9c6a5c6  runq: dequantize weights on-the-fly (kroggen, Aug 12, 2023)
cd3b918  quantize weights (kroggen, Aug 13, 2023)
2b3c1ce  fix pointer arithmetic (kroggen, Aug 13, 2023)
5f62209  fix quantization factor (kroggen, Aug 13, 2023)
f546dcf  print quantization info (kroggen, Aug 13, 2023)
6e73fc6  fix wcls weights (kroggen, Aug 13, 2023)
19d4b17  fix: use uint8_t instead of int8_t (kroggen, Aug 13, 2023)
a030c5c  quantize token embeddings (kroggen, Aug 13, 2023)
53370b7  rename get_max_vals to get_minmax (kroggen, Aug 13, 2023)
6f5e426  update quantization to incorporate changes upstream for multiquery (Aug 14, 2023)
7ff42dc  update quantization to incorporate changes upstream for multiquery (Aug 14, 2023)
dd09af7  Merge branch 'master' into quantization-q8 (kroggen, Aug 14, 2023)
ec64f5c  update runq.c with changes from run.c (kroggen, Aug 14, 2023)
96f471d  update printed output of quantize.c (kroggen, Aug 14, 2023)
424170e  Merge branch 'master' into quantization-q8 (kroggen, Aug 15, 2023)
61a8ba9  update runq with UTF-8 support (kroggen, Aug 15, 2023)
b961e3c  remove files for PR (kroggen, Aug 15, 2023)
9c0e59e  fix build (kroggen, Aug 16, 2023)
3bb4ed3  process wcls when not using shared weights (kroggen, Aug 17, 2023)
c141a3a  enhance output of quantize tool (kroggen, Aug 17, 2023)
557a321  rename runq.c to run-q8.c (kroggen, Aug 22, 2023)
c897efa  run-q8: compute freq_cis on-the-fly (kroggen, Aug 22, 2023)
b011870  do not quantize the freq_cis (kroggen, Aug 22, 2023)
quantize.c: 254 additions, 0 deletions
@@ -0,0 +1,254 @@
/*
Quantization of a Llama-2 Transformer model checkpoint to 8 bits, in pure C.

Example compile: (see README for more details)
$ gcc -O3 -o quantize quantize.c -lm

Then run with:
$ ./quantize <checkpoint_file>
*/

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <fcntl.h>
#include <stdint.h>
#if defined _WIN32
#include "win.h"
#else
#include <unistd.h>
#include <sys/mman.h>
#endif

// ----------------------------------------------------------------------------
// Transformer and RunState structs, and related memory management

typedef struct {
    int dim; // transformer dimension
    int hidden_dim; // for ffn layers
    int n_layers; // number of layers
    int n_heads; // number of query heads
    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
    int vocab_size; // vocabulary size, usually 256 (byte-level)
    int seq_len; // max sequence length
} Config;

typedef struct {
    // token embedding table
    float* token_embedding_table; // (vocab_size, dim)
    // weights for rmsnorms
    float* rms_att_weight; // (layer, dim) rmsnorm weights
    float* rms_ffn_weight; // (layer, dim)
    // weights for matmuls. note dim == n_heads * head_size
    float* wq; // (layer, dim, n_heads * head_size)
    float* wk; // (layer, dim, n_kv_heads * head_size)
    float* wv; // (layer, dim, n_kv_heads * head_size)
    float* wo; // (layer, n_heads * head_size, dim)
    // weights for ffn
    float* w1; // (layer, hidden_dim, dim)
    float* w2; // (layer, dim, hidden_dim)
    float* w3; // (layer, hidden_dim, dim)
    // final rmsnorm
    float* rms_final_weight; // (dim,)
    // freq_cis for RoPE relative positional embeddings
    float* freq_cis_real; // (seq_len, head_size/2)
    float* freq_cis_imag; // (seq_len, head_size/2)
    // (optional) classifier weights for the logits, on the last layer
    float* wcls;
} TransformerWeights;

// ----------------------------------------------------------------------------
// initialization: read from checkpoint

void checkpoint_init_weights(TransformerWeights *w, Config* p, float* f, int shared_weights) {
    int head_size = p->dim / p->n_heads;
    float* ptr = f;
    w->token_embedding_table = ptr;
    ptr += p->vocab_size * p->dim;
    w->rms_att_weight = ptr;
    ptr += p->n_layers * p->dim;
    w->wq = ptr;
    ptr += p->n_layers * p->dim * (p->n_heads * head_size);
    w->wk = ptr;
    ptr += p->n_layers * p->dim * (p->n_kv_heads * head_size);
    w->wv = ptr;
    ptr += p->n_layers * p->dim * (p->n_kv_heads * head_size);
    w->wo = ptr;
    ptr += p->n_layers * (p->n_heads * head_size) * p->dim;
    w->rms_ffn_weight = ptr;
    ptr += p->n_layers * p->dim;
    w->w1 = ptr;
    ptr += p->n_layers * p->dim * p->hidden_dim;
    w->w2 = ptr;
    ptr += p->n_layers * p->hidden_dim * p->dim;
    w->w3 = ptr;
    ptr += p->n_layers * p->dim * p->hidden_dim;
    w->rms_final_weight = ptr;
    ptr += p->dim;
    w->freq_cis_real = ptr;
    ptr += p->seq_len * head_size / 2;
    w->freq_cis_imag = ptr;
    ptr += p->seq_len * head_size / 2;
    w->wcls = shared_weights ? w->token_embedding_table : ptr;
}
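
// Note: the pointer walk above mirrors the float layout of the original run.c
// checkpoint format; freq_cis_real/imag occupy seq_len * head_size / 2 floats
// each, and wcls aliases the token embedding table when weights are shared.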

void get_minmax(float *ptr, int size, float* pmin, float* pmax){
    float min = INFINITY;
    float max = -INFINITY;

    for (int i = 0; i < size; i++){
        if (ptr[i] < min) min = ptr[i];
        if (ptr[i] > max) max = ptr[i];
    }

    *pmin = min;
    *pmax = max;
}
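
// Quantization scheme used below: affine (asymmetric) 8-bit mapping, per layer.
// Each weight w in [min, max] maps to q in [0, 255] via
//   q = round((w - min) / (max - min) * 255)
// and is reconstructed at load time as w ~= min + q * scale,
// where scale = (max - min) / 255, so the worst-case rounding error is scale / 2.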

void quantize_weights(FILE* file, float *weights, int n_layers, int layer_size, char *name) {

    puts("------------------------");
    printf("%s layer_size=%d\n", name, layer_size);

    // for each layer
    for (int l = 0; l < n_layers; l++) {
        // get the min and max values for this layer
        float min;
        float max;
        get_minmax(weights, layer_size, &min, &max);
        // compute the scale factor
        float scale = (max - min) / 255;
        printf("l=%d min=%f max=%f scale=%f\n", l, min, max, scale);
        // save min value and scale factor to file
        fwrite(&min, sizeof(float), 1, file);
        fwrite(&scale, sizeof(float), 1, file);
        // quantize the weights from this layer and save to file
        uint8_t qweight;
        for (int i = 0; i < layer_size; i++){
            qweight = round((weights[i] - min) / (max - min) * 255);
            fwrite(&qweight, sizeof(uint8_t), 1, file);
        }
        // advance to the weights of the next layer
        weights += layer_size;
    }

}

void write_weights(FILE* file, float *weights, int n_layers, int layer_size, char *name) {
    puts("------------------------");
    printf("%s layer_size=%d\n", name, layer_size);
    printf("%d layer(s) - not quantized\n", n_layers);
    fwrite(weights, sizeof(float), n_layers * layer_size, file);
}

int convert_weights_q8(TransformerWeights *w, Config *p, int shared_weights){

    FILE* file = fopen("data.bin", "wb");
    if (file == NULL) {
        perror("Error opening file");
        return 1;
    }

    int vocab_size = p->vocab_size;
    if (!shared_weights) {
        vocab_size = -vocab_size;
    }

    // write headers
    fwrite(&p->dim, sizeof(int), 1, file);
    fwrite(&p->hidden_dim, sizeof(int), 1, file);
    fwrite(&p->n_layers, sizeof(int), 1, file);
    fwrite(&p->n_heads, sizeof(int), 1, file);
    fwrite(&p->n_kv_heads, sizeof(int), 1, file);
    fwrite(&vocab_size, sizeof(int), 1, file);
    fwrite(&p->seq_len, sizeof(int), 1, file);

    // write quantized weights
    int head_size = p->dim / p->n_heads;

    quantize_weights(file, w->token_embedding_table, 1, p->vocab_size * p->dim, "token_embedding_table");

    quantize_weights(file, w->rms_att_weight, p->n_layers, p->dim, "rms_att_weight");

    quantize_weights(file, w->wq, p->n_layers, p->dim * (p->n_heads * head_size), "wq");
    quantize_weights(file, w->wk, p->n_layers, p->dim * (p->n_kv_heads * head_size), "wk");
    quantize_weights(file, w->wv, p->n_layers, p->dim * (p->n_kv_heads * head_size), "wv");
    quantize_weights(file, w->wo, p->n_layers, (p->n_heads * head_size) * p->dim, "wo");

    quantize_weights(file, w->rms_ffn_weight, p->n_layers, p->dim, "rms_ffn_weight");

    quantize_weights(file, w->w1, p->n_layers, p->dim * p->hidden_dim, "w1");
    quantize_weights(file, w->w2, p->n_layers, p->hidden_dim * p->dim, "w2");
    quantize_weights(file, w->w3, p->n_layers, p->dim * p->hidden_dim, "w3");

    quantize_weights(file, w->rms_final_weight, 1, p->dim, "rms_final_weight");

    if (!shared_weights) {
        quantize_weights(file, w->wcls, 1, p->vocab_size * p->dim, "wcls");
    }

    puts("------------------------");

    fclose(file);
    return 0;
}
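
// Resulting data.bin layout, as written above:
//   header: 7 int32 fields (dim, hidden_dim, n_layers, n_heads, n_kv_heads,
//           vocab_size [negated when classifier weights are not shared], seq_len)
//   then each tensor in the order above, stored per layer as:
//     float min, float scale, then layer_size quantized uint8 weights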

int main(int argc, char *argv[]) {

    // poor man's C argparse
    char *checkpoint = NULL; // e.g. out/model.bin

    // 'checkpoint' is necessary arg
    if (argc < 2) {
        printf("Usage: %s <checkpoint_file>\n", argv[0]);
        return 1;
    }
    if (argc >= 2) {
        checkpoint = argv[1];
    }

    // read in the model.bin file
    Config config;
    TransformerWeights weights;
    int fd = 0; // file descriptor for memory mapping
    float* data = NULL; // memory mapped data pointer
    long file_size; // size of the checkpoint file in bytes
    {
        FILE *file = fopen(checkpoint, "rb");
        if (!file) { printf("Couldn't open file %s\n", checkpoint); return 1; }

        // read in the config header
        if (fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
        // negative vocab size is hacky way of signaling unshared weights. bit yikes.
        int shared_weights = config.vocab_size > 0 ? 1 : 0;
        config.vocab_size = abs(config.vocab_size);
        printf("vocab size = %d shared_weights = %s\n", config.vocab_size, shared_weights ? "yes" : "no");

        // figure out the file size
        fseek(file, 0, SEEK_END); // move file pointer to end of file
        file_size = ftell(file); // get the file size, in bytes
        fclose(file);
        printf("Model file size = %ldMB\n", file_size/1024/1024);

        // memory map the Transformer weights into the data pointer
        fd = open(checkpoint, O_RDONLY); // open in read only mode
        if (fd == -1) { printf("open failed!\n"); return 1; }
        data = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (data == MAP_FAILED) { printf("mmap failed!\n"); return 1; }

        // fast-forward to weight data, skipping metadata
        float* weights_ptr = data + sizeof(Config)/sizeof(float);

        checkpoint_init_weights(&weights, &config, weights_ptr, shared_weights);

        int ret = convert_weights_q8(&weights, &config, shared_weights);
        if (ret == 0) printf("model converted and saved to data.bin\n");
    }

    // memory and file handles cleanup
    if (data != MAP_FAILED) munmap(data, file_size);
    if (fd != -1) close(fd);
    return 0;
}
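
For reference, reading data.bin back simply reverses the per-layer mapping: each quantized byte is reconstructed as min + q * scale. Below is a minimal sketch of that step; it is not the PR's actual run-q8.c loader, and the dequantize_layer helper and its buffering are illustrative assumptions only:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

// Illustrative helper (not part of this PR): read one quantized layer back into
// floats, assuming the on-disk layout [float min][float scale][layer_size bytes].
int dequantize_layer(FILE *file, float *out, int layer_size) {
    float min, scale;
    if (fread(&min, sizeof(float), 1, file) != 1) return 1;
    if (fread(&scale, sizeof(float), 1, file) != 1) return 1;
    uint8_t *q = malloc(layer_size);
    if (q == NULL) return 1;
    if (fread(q, sizeof(uint8_t), layer_size, file) != (size_t)layer_size) { free(q); return 1; }
    for (int i = 0; i < layer_size; i++) {
        out[i] = min + q[i] * scale; // inverse of the affine mapping in quantize_weights
    }
    free(q);
    return 0;
}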