ggml : new Q4 and Q5 quantization formats + backward ops
sync llama.cpp

- bump GGML_QNT_VERSION -> 1
- increase ggml object overhead size from 256 to 512 in examples
- drop Q4_2 support
- add tensor backend support for CUDA
ggerganov committed May 14, 2023
1 parent effcfa6 commit df6a3d3
Showing 14 changed files with 4,545 additions and 2,384 deletions.
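
Note on the GGML_QNT_VERSION bump: the quantization-format version is packed into the single ftype field of the model header and split back out at load time, as the whisper.cpp hunk further down does with GGML_QNT_VERSION_FACTOR. A minimal standalone sketch of that convention follows; the constant values and the example ftype are assumptions for illustration, the real definitions live in ggml.h.

#include <cstdint>
#include <cstdio>

// Stand-ins for the ggml.h constants; the factor value is assumed here.
static const int32_t QNT_VERSION        = 1;    // GGML_QNT_VERSION after this commit
static const int32_t QNT_VERSION_FACTOR = 1000; // GGML_QNT_VERSION_FACTOR (assumed)

int main() {
    // writer side: fold the quantization version into the stored ftype
    int32_t ftype        = 2; // e.g. a "mostly Q4_0" file type (illustrative)
    int32_t ftype_stored = ftype + QNT_VERSION*QNT_VERSION_FACTOR;

    // reader side, mirroring whisper_model_load below: divide first, then strip
    const int32_t qntvr = ftype_stored / QNT_VERSION_FACTOR;
    ftype_stored        = ftype_stored % QNT_VERSION_FACTOR;

    printf("qntvr = %d, ftype = %d\n", qntvr, ftype_stored);
    return 0;
}
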
6 changes: 0 additions & 6 deletions examples/common-ggml.cpp
@@ -6,7 +6,6 @@
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
{"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
@@ -46,7 +45,6 @@ bool ggml_common_quantize_0(
switch (ftype) {
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
@@ -171,10 +169,6 @@ bool ggml_common_quantize_0(
{
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_2:
{
cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_0:
{
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
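
For reference, the ggml_quantize_* helpers used in this switch all share the call shape visible above: source floats, a destination work buffer, the total element count, the row length, and a histogram array that ggml fills in. A small hedged sketch of one such call; buffer sizes and values are illustrative.

#include "ggml.h"

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // quantize 64 floats laid out as one row of length 64
    // (the row length must be a multiple of the Q5_0 block size)
    const int n = 64;
    const int k = 64;

    std::vector<float>   src(n, 0.5f);
    std::vector<uint8_t> dst(n*sizeof(float)); // generously sized output buffer
    std::vector<int64_t> hist(16, 0);          // per-value histogram filled by ggml

    const size_t cur_size = ggml_quantize_q5_0(src.data(), dst.data(), n, k, hist.data());
    printf("quantized %d floats into %zu bytes\n", n, cur_size);
    return 0;
}
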
5 changes: 3 additions & 2 deletions examples/dolly-v2/main.cpp
@@ -7,6 +7,7 @@
#include <cmath>
#include <cstdio>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <map>
#include <string>
@@ -199,7 +200,7 @@ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vo
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

ctx_size += (6 + 16*n_layer)*256; // object overhead
ctx_size += (6 + 16*n_layer)*512; // object overhead

printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
@@ -307,7 +308,7 @@ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vo

const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);

printf("%s: memory_size = %8.2f MB, n_mem = %ld\n", __func__, memory_size/1024.0/1024.0, n_mem);
printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
}

// load weights
3 changes: 2 additions & 1 deletion examples/gpt-2/main.cpp
@@ -186,8 +186,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

ctx_size += (6 + 12*n_layer)*256; // object overhead
ctx_size += (6 + 12*n_layer)*512; // object overhead

printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}

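
The 256 -> 512 change repeated across the examples is the per-object overhead budget: every tensor created in a ggml context costs a fixed amount of bookkeeping on top of its data (the object header plus the ggml_tensor struct itself, which is why gpt-2 now also prints sizeof(ggml_tensor)), and 256 bytes per object no longer covers it. A rough sketch of the sizing pattern the examples use; the helper name and the tensor counts are illustrative, not taken from ggml.

#include <cstddef>
#include <cstdio>

// Per-tensor bookkeeping budget used by the examples after this commit.
static const size_t OBJECT_OVERHEAD = 512;

// data_bytes: raw size of all tensor data
// n_objects : how many tensors the context is expected to hold
static size_t estimate_ctx_size(size_t data_bytes, size_t n_objects) {
    return data_bytes + n_objects*OBJECT_OVERHEAD;
}

int main() {
    // e.g. a 12-layer model: 6 global tensors plus 12 tensors per layer
    const int n_layer = 12;
    const size_t ctx_size = estimate_ctx_size(500ull*1024*1024, 6 + 12*n_layer);
    printf("ggml ctx size = %6.2f MB\n", ctx_size/(1024.0*1024.0));
    return 0;
}
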
2 changes: 1 addition & 1 deletion examples/gpt-j/main.cpp
@@ -185,7 +185,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v

ctx_size += (5 + 10*n_layer)*256; // object overhead
ctx_size += (5 + 10*n_layer)*512; // object overhead

printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
5 changes: 3 additions & 2 deletions examples/gpt-neox/main.cpp
@@ -7,6 +7,7 @@
#include <cmath>
#include <cstdio>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <map>
#include <string>
@@ -188,7 +189,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

ctx_size += (6 + 16*n_layer)*256; // object overhead
ctx_size += (6 + 16*n_layer)*512; // object overhead

printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
@@ -293,7 +294,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_

const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);

printf("%s: memory_size = %8.2f MB, n_mem = %ld\n", __func__, memory_size/1024.0/1024.0, n_mem);
printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
}

// load weights
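
The added <cinttypes> include and the PRId64 format in dolly-v2 and gpt-neox fix a portability bug: n_mem is an int64_t, and %ld matches it only where long is 64 bits (it is not on 32-bit targets or on Windows). A minimal sketch of the portable form, with an illustrative value:

#include <cinttypes> // PRId64
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_mem = 49152; // illustrative value (e.g. n_layer * n_ctx)
    // PRId64 expands to the correct conversion specifier for int64_t on every platform
    printf("n_mem = %" PRId64 "\n", n_mem);
    return 0;
}
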
1 change: 0 additions & 1 deletion examples/mnist/main.cpp
@@ -205,7 +205,6 @@ int main(int argc, char ** argv) {
exit(0);
}

mnist_hparams params;
int64_t t_load_us = 0;

mnist_model model;
2 changes: 1 addition & 1 deletion examples/starcoder/main.cpp
@@ -193,7 +193,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

ctx_size += (6 + 12*n_layer)*256; // object overhead
ctx_size += (6 + 12*n_layer)*512; // object overhead

printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
2 changes: 1 addition & 1 deletion examples/whisper/quantize.cpp
@@ -109,7 +109,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
fout.write((char *) &ftype, sizeof(hparams.ftype));
fout.write((char *) &ftype_dst, sizeof(hparams.ftype));
}

// load mel filters
68 changes: 28 additions & 40 deletions examples/whisper/whisper.cpp
@@ -291,15 +291,6 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
{ MODEL_LARGE, 1124ull*MB },
},
},
{ GGML_TYPE_Q4_2,
{
{ MODEL_TINY, 26ull*MB },
{ MODEL_BASE, 50ull*MB },
{ MODEL_SMALL, 154ull*MB },
{ MODEL_MEDIUM, 470ull*MB },
{ MODEL_LARGE, 940ull*MB },
},
},
{ GGML_TYPE_Q5_0,
{
{ MODEL_TINY, 30ull*MB },
@@ -861,6 +852,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
model.type = e_model::MODEL_LARGE;
}

const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;

hparams.ftype %= GGML_QNT_VERSION_FACTOR;

// for the big tensors, we have the option to store the data in 16-bit floats or quantized
@@ -873,8 +866,6 @@

const size_t scale = model.hparams.ftype ? 1 : 2;

const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;

fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
@@ -1111,7 +1102,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
}

ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead

fprintf(stderr, "%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
@@ -1559,14 +1550,14 @@ static bool whisper_encode_internal(
Qcur),
Qcur);

//Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
//Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

// note: no bias for Key
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
layer.attn_k_w,
cur);

//Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
//Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
layer.attn_v_w,
@@ -1626,12 +1617,12 @@ static bool whisper_encode_internal(
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
ggml_scale_inplace(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
);

struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);

struct ggml_tensor * V =
ggml_cpy(ctx0,
@@ -1814,7 +1805,7 @@ static bool whisper_encode_internal(
layer.cross_attn_k_w,
cur);

Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));

wstate.use_buf(ctx0, 1);

@@ -1961,14 +1952,14 @@ static bool whisper_decode_internal(
Qcur),
Qcur);

Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

// note: no bias for Key
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
layer.attn_k_w,
cur);

Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

// store key and value to memory
{
@@ -2017,14 +2008,14 @@ static bool whisper_decode_internal(
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

//struct ggml_tensor * KQ_scaled =
// ggml_scale(ctx0,
// ggml_scale_inplace(ctx0,
// KQ,
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
// );

struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);

struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);

struct ggml_tensor * V =
ggml_view_3d(ctx0, kv_self.v,
@@ -2088,7 +2079,7 @@ static bool whisper_decode_internal(
Qcur),
Qcur);

Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

// Kcross is already scaled
struct ggml_tensor * Kcross =
@@ -2128,15 +2119,15 @@ static bool whisper_decode_internal(
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

//struct ggml_tensor * KQ_scaled =
// ggml_scale(ctx0,
// ggml_scale_inplace(ctx0,
// KQ,
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
// );

// no masking for cross-attention
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);

struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);

struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

@@ -4908,15 +4899,14 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
// b: N*N*sizeof(float)
// c: N*N*sizeof(float)
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);

// put a bunch of random data in the buffer
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;

for (int j = 0; j < (int) sizes.size(); j++) {
int n_q4_0 = 0;
int n_q4_1 = 0;
int n_q4_2 = 0;
int n_q5_0 = 0;
int n_q5_1 = 0;
int n_q8_0 = 0;
@@ -4926,7 +4916,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
// GFLOPS/s
double s_q4_0 = 0.0;
double s_q4_1 = 0.0;
double s_q4_2 = 0.0;
double s_q5_0 = 0.0;
double s_q5_1 = 0.0;
double s_q8_0 = 0.0;
@@ -4935,18 +4924,17 @@

const size_t N = sizes[j];

for (int k = 0; k < 8; ++k) {
for (int k = 0; k < 7; ++k) {
const ggml_type wtype =
k == 0 ? GGML_TYPE_Q4_0 :
k == 1 ? GGML_TYPE_Q4_1 :
k == 2 ? GGML_TYPE_Q4_2 :
k == 3 ? GGML_TYPE_Q5_0 :
k == 4 ? GGML_TYPE_Q5_1 :
k == 5 ? GGML_TYPE_Q8_0 :
k == 6 ? GGML_TYPE_F16 : GGML_TYPE_F32;
k == 2 ? GGML_TYPE_Q5_0 :
k == 3 ? GGML_TYPE_Q5_1 :
k == 4 ? GGML_TYPE_Q8_0 :
k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32;

double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q4_2 : k == 3 ? s_q5_0 : k == 4 ? s_q5_1 : k == 5 ? s_q8_0 : k == 6 ? s_fp16 : /*k == 7*/ s_fp32;
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q4_2 : k == 3 ? n_q5_0 : k == 4 ? n_q5_1 : k == 5 ? n_q8_0 : k == 6 ? n_fp16 : /*k == 7*/ n_fp32;
double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;

struct ggml_init_params gparams = {
/*.mem_size =*/ buf.size(),
@@ -4990,9 +4978,9 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
s = ((2.0*N*N*N*n)/tsum)*1e-9;
}

// Q4_0 | Q4_1 | Q4_2
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs) | Q4_2 %7.1f GFLOPS (%3d runs)\n",
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_q4_2, n_q4_2);
// Q4_0 | Q4_1
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1);
s += strbuf;

// Q5_0 | Q5_1 | Q8_0
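
On the benchmark bookkeeping above: each timed run multiplies two N x N matrices, which costs about 2*N^3 floating-point operations (N^3 multiply-add pairs), so the reported figure is s = 2*N^3*n / tsum scaled to GFLOPS, exactly the expression in the loop. A tiny standalone check with made-up numbers:

#include <cstdio>

int main() {
    // illustrative: 512x512 matrices, 100 runs, 0.25 s total
    const double N    = 512.0;
    const double n    = 100.0;
    const double tsum = 0.25;

    const double gflops = ((2.0*N*N*N*n)/tsum)*1e-9; // same formula as the bench
    printf("%7.1f GFLOPS\n", gflops);                // ~107.4 for these numbers
    return 0;
}
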