ggml : new Q4 and Q5 quantization formats + backward ops
sync llama.cpp

- bump GGML_QNT_VERSION -> 1
- increase ggml object overhead size from 256 to 512 in examples
- drop Q4_2 support
- add tensor backend support for CUDA
ggerganov committed May 14, 2023
1 parent effcfa6 commit df6a3d3
Showing 14 changed files with 4,545 additions and 2,384 deletions.
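
Note on the GGML_QNT_VERSION bump: the quantization-format version is packed into the single ftype field of the model header and split back out at load time, as the whisper.cpp hunk further down does with GGML_QNT_VERSION_FACTOR. A minimal standalone sketch of that convention follows; the constant values and the example ftype are assumptions for illustration, the real definitions live in ggml.h.

#include <cstdint>
#include <cstdio>

// Stand-ins for the ggml.h constants; the factor value is assumed here.
static const int32_t QNT_VERSION        = 1;    // GGML_QNT_VERSION after this commit
static const int32_t QNT_VERSION_FACTOR = 1000; // GGML_QNT_VERSION_FACTOR (assumed)

int main() {
    // writer side: fold the quantization version into the stored ftype
    int32_t ftype        = 2; // e.g. a "mostly Q4_0" file type (illustrative)
    int32_t ftype_stored = ftype + QNT_VERSION*QNT_VERSION_FACTOR;

    // reader side, mirroring whisper_model_load below: divide first, then strip
    const int32_t qntvr = ftype_stored / QNT_VERSION_FACTOR;
    ftype_stored        = ftype_stored % QNT_VERSION_FACTOR;

    printf("qntvr = %d, ftype = %d\n", qntvr, ftype_stored);
    return 0;
}
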
6 changes: 0 additions & 6 deletions examples/common-ggml.cpp
@@ -6,7 +6,6 @@
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
{"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
@@ -46,7 +45,6 @@ bool ggml_common_quantize_0(
switch (ftype) {
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
@@ -171,10 +169,6 @@ bool ggml_common_quantize_0(
{
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_2:
{
cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_0:
{
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
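
For reference, the ggml_quantize_* helpers used in this switch all share the call shape visible above: source floats, a destination work buffer, the total element count, the row length, and a histogram array that ggml fills in. A small hedged sketch of one such call; buffer sizes and values are illustrative.

#include "ggml.h"

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // quantize 64 floats laid out as one row of length 64
    // (the row length must be a multiple of the Q5_0 block size)
    const int n = 64;
    const int k = 64;

    std::vector<float>   src(n, 0.5f);
    std::vector<uint8_t> dst(n*sizeof(float)); // generously sized output buffer
    std::vector<int64_t> hist(16, 0);          // per-value histogram filled by ggml

    const size_t cur_size = ggml_quantize_q5_0(src.data(), dst.data(), n, k, hist.data());
    printf("quantized %d floats into %zu bytes\n", n, cur_size);
    return 0;
}
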
5 changes: 3 additions & 2 deletions examples/dolly-v2/main.cpp
@@ -7,6 +7,7 @@
#include <cmath>
#include <cstdio>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <map>
#include <string>
@@ -199,7 +200,7 @@ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vo
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

ctx_size += (6 + 16*n_layer)*256; // object overhead
ctx_size += (6 + 16*n_layer)*512; // object overhead

printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
@@ -307,7 +308,7 @@ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vo

const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);

printf("%s: memory_size = %8.2f MB, n_mem = %ld\n", __func__, memory_size/1024.0/1024.0, n_mem);
printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
}

// load weights
3 changes: 2 additions & 1 deletion examples/gpt-2/main.cpp
@@ -186,8 +186,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

ctx_size += (6 + 12*n_layer)*256; // object overhead
ctx_size += (6 + 12*n_layer)*512; // object overhead

printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}

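
The 256 -> 512 change repeated across the examples is the per-object overhead budget: every tensor created in a ggml context costs a fixed amount of bookkeeping on top of its data (the object header plus the ggml_tensor struct itself, which is why gpt-2 now also prints sizeof(ggml_tensor)), and 256 bytes per object no longer covers it. A rough sketch of the sizing pattern the examples use; the helper name and the tensor counts are illustrative, not taken from ggml.

#include <cstddef>
#include <cstdio>

// Per-tensor bookkeeping budget used by the examples after this commit.
static const size_t OBJECT_OVERHEAD = 512;

// data_bytes: raw size of all tensor data
// n_objects : how many tensors the context is expected to hold
static size_t estimate_ctx_size(size_t data_bytes, size_t n_objects) {
    return data_bytes + n_objects*OBJECT_OVERHEAD;
}

int main() {
    // e.g. a 12-layer model: 6 global tensors plus 12 tensors per layer
    const int n_layer = 12;
    const size_t ctx_size = estimate_ctx_size(500ull*1024*1024, 6 + 12*n_layer);
    printf("ggml ctx size = %6.2f MB\n", ctx_size/(1024.0*1024.0));
    return 0;
}
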
2 changes: 1 addition & 1 deletion examples/gpt-j/main.cpp
@@ -185,7 +185,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v

ctx_size += (5 + 10*n_layer)*256; // object overhead
ctx_size += (5 + 10*n_layer)*512; // object overhead

printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
5 changes: 3 additions & 2 deletions examples/gpt-neox/main.cpp
@@ -7,6 +7,7 @@
#include <cmath>
#include <cstdio>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <map>
#include <string>
@@ -188,7 +189,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

ctx_size += (6 + 16*n_layer)*256; // object overhead
ctx_size += (6 + 16*n_layer)*512; // object overhead

printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
@@ -293,7 +294,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_

const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);

printf("%s: memory_size = %8.2f MB, n_mem = %ld\n", __func__, memory_size/1024.0/1024.0, n_mem);
printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
}

// load weights
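
The added <cinttypes> include and the PRId64 format in dolly-v2 and gpt-neox fix a portability bug: n_mem is an int64_t, and %ld matches it only where long is 64 bits (it is not on 32-bit targets or on Windows). A minimal sketch of the portable form, with an illustrative value:

#include <cinttypes> // PRId64
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_mem = 49152; // illustrative value (e.g. n_layer * n_ctx)
    // PRId64 expands to the correct conversion specifier for int64_t on every platform
    printf("n_mem = %" PRId64 "\n", n_mem);
    return 0;
}
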
1 change: 0 additions & 1 deletion examples/mnist/main.cpp
@@ -205,7 +205,6 @@ int main(int argc, char ** argv) {
exit(0);
}

mnist_hparams params;
int64_t t_load_us = 0;

mnist_model model;
2 changes: 1 addition & 1 deletion examples/starcoder/main.cpp
@@ -193,7 +193,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

ctx_size += (6 + 12*n_layer)*256; // object overhead
ctx_size += (6 + 12*n_layer)*512; // object overhead

printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
2 changes: 1 addition & 1 deletion examples/whisper/quantize.cpp
@@ -109,7 +109,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
fout.write((char *) &ftype, sizeof(hparams.ftype));
fout.write((char *) &ftype_dst, sizeof(hparams.ftype));
}

// load mel filters
68 changes: 28 additions & 40 deletions examples/whisper/whisper.cpp
@@ -291,15 +291,6 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
{ MODEL_LARGE, 1124ull*MB },
},
},
{ GGML_TYPE_Q4_2,
{
{ MODEL_TINY, 26ull*MB },
{ MODEL_BASE, 50ull*MB },
{ MODEL_SMALL, 154ull*MB },
{ MODEL_MEDIUM, 470ull*MB },
{ MODEL_LARGE, 940ull*MB },
},
},
{ GGML_TYPE_Q5_0,
{
{ MODEL_TINY, 30ull*MB },
@@ -861,6 +852,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
model.type = e_model::MODEL_LARGE;
}

const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;

hparams.ftype %= GGML_QNT_VERSION_FACTOR;

// for the big tensors, we have the option to store the data in 16-bit floats or quantized
@@ -873,8 +866,6 @@

const size_t scale = model.hparams.ftype ? 1 : 2;

const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;

fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
@@ -1111,7 +1102,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
}

ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead

fprintf(stderr, "%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
@@ -1559,14 +1550,14 @@ static bool whisper_encode_internal(
Qcur),
Qcur);

//Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
//Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

// note: no bias for Key
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
layer.attn_k_w,
cur);

//Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
//Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
layer.attn_v_w,
@@ -1626,12 +1617,12 @@ static bool whisper_encode_internal(
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
ggml_scale_inplace(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
);

struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);

struct ggml_tensor * V =
ggml_cpy(ctx0,
@@ -1814,7 +1805,7 @@ static bool whisper_encode_internal(
layer.cross_attn_k_w,
cur);

Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));

wstate.use_buf(ctx0, 1);

@@ -1961,14 +1952,14 @@ static bool whisper_decode_internal(
Qcur),
Qcur);

Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

// note: no bias for Key
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
layer.attn_k_w,
cur);

Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

// store key and value to memory
{
@@ -2017,14 +2008,14 @@ static bool whisper_decode_internal(
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

//struct ggml_tensor * KQ_scaled =
// ggml_scale(ctx0,
// ggml_scale_inplace(ctx0,
// KQ,
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
// );

struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);

struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);

struct ggml_tensor * V =
ggml_view_3d(ctx0, kv_self.v,
@@ -2088,7 +2079,7 @@ static bool whisper_decode_internal(
Qcur),
Qcur);

Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

// Kcross is already scaled
struct ggml_tensor * Kcross =
@@ -2128,15 +2119,15 @@ static bool whisper_decode_internal(
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

//struct ggml_tensor * KQ_scaled =
// ggml_scale(ctx0,
// ggml_scale_inplace(ctx0,
// KQ,
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
// );

// no masking for cross-attention
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);

struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);

struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

@@ -4908,15 +4899,14 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
// b: N*N*sizeof(float)
// c: N*N*sizeof(float)
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);

// put a bunch of random data in the buffer
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;

for (int j = 0; j < (int) sizes.size(); j++) {
int n_q4_0 = 0;
int n_q4_1 = 0;
int n_q4_2 = 0;
int n_q5_0 = 0;
int n_q5_1 = 0;
int n_q8_0 = 0;
@@ -4926,7 +4916,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
// GFLOPS/s
double s_q4_0 = 0.0;
double s_q4_1 = 0.0;
double s_q4_2 = 0.0;
double s_q5_0 = 0.0;
double s_q5_1 = 0.0;
double s_q8_0 = 0.0;
@@ -4935,18 +4924,17 @@

const size_t N = sizes[j];

for (int k = 0; k < 8; ++k) {
for (int k = 0; k < 7; ++k) {
const ggml_type wtype =
k == 0 ? GGML_TYPE_Q4_0 :
k == 1 ? GGML_TYPE_Q4_1 :
k == 2 ? GGML_TYPE_Q4_2 :
k == 3 ? GGML_TYPE_Q5_0 :
k == 4 ? GGML_TYPE_Q5_1 :
k == 5 ? GGML_TYPE_Q8_0 :
k == 6 ? GGML_TYPE_F16 : GGML_TYPE_F32;
k == 2 ? GGML_TYPE_Q5_0 :
k == 3 ? GGML_TYPE_Q5_1 :
k == 4 ? GGML_TYPE_Q8_0 :
k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32;

double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q4_2 : k == 3 ? s_q5_0 : k == 4 ? s_q5_1 : k == 5 ? s_q8_0 : k == 6 ? s_fp16 : /*k == 7*/ s_fp32;
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q4_2 : k == 3 ? n_q5_0 : k == 4 ? n_q5_1 : k == 5 ? n_q8_0 : k == 6 ? n_fp16 : /*k == 7*/ n_fp32;
double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;

struct ggml_init_params gparams = {
/*.mem_size =*/ buf.size(),
@@ -4990,9 +4978,9 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
s = ((2.0*N*N*N*n)/tsum)*1e-9;
}

// Q4_0 | Q4_1 | Q4_2
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs) | Q4_2 %7.1f GFLOPS (%3d runs)\n",
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_q4_2, n_q4_2);
// Q4_0 | Q4_1
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1);
s += strbuf;

// Q5_0 | Q5_1 | Q8_0
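
On the benchmark bookkeeping above: each timed run multiplies two N x N matrices, which costs about 2*N^3 floating-point operations (N^3 multiply-add pairs), so the reported figure is s = 2*N^3*n / tsum scaled to GFLOPS, exactly the expression in the loop. A tiny standalone check with made-up numbers:

#include <cstdio>

int main() {
    // illustrative: 512x512 matrices, 100 runs, 0.25 s total
    const double N    = 512.0;
    const double n    = 100.0;
    const double tsum = 0.25;

    const double gflops = ((2.0*N*N*N*n)/tsum)*1e-9; // same formula as the bench
    printf("%7.1f GFLOPS\n", gflops);                // ~107.4 for these numbers
    return 0;
}
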