From 8409f2f47ec71f25a51736769401c4d7ca2ae36f Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Tue, 4 Jun 2024 05:23:13 -0700
Subject: [PATCH] ggml: avoid rebuild of GGML graph for each token (#7456)

Introduces caching of the GGML graph to avoid an unnecessary full rebuild
between each token. KV cache parameters, which change with each token, are
updated directly in the cached GGML graph. Caching can be disabled with the
GGML_DISABLE_GRAPH_CACHING environment variable.

fix seg fault

restrict to nsplit=2

Improve identification of K and V nodes for param updates

Up LCPP Graph PR by Agray3
---
 ggml/include/ggml-backend.h |   5 ++
 ggml/include/ggml.h         |  11 ++-
 ggml/src/ggml-backend.c     |  31 ++++++++-
 ggml/src/ggml.c             |   3 +-
 src/llama.cpp               | 131 +++++++++++++++++++++++++++++++++---
 5 files changed, 168 insertions(+), 13 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 5f3f1e286990e4..a6ea2b06a5e5ad 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -232,6 +232,11 @@ extern "C" {
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
     GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
 
+    // Utility to query whether the cached GGML graph is in use
+    GGML_API bool ggml_use_cached_graph(ggml_backend_sched_t sched);
+
+    // Set whether or not to use GGML graph caching
+    GGML_API void ggml_set_cached_graph(ggml_backend_sched_t sched, bool set_value);
 
 #ifdef __cplusplus
 }
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 9bbf3cb20bc60c..6faf6b25598e4c 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -568,6 +568,13 @@ extern "C" {
         GGML_TENSOR_FLAG_PARAM = 4,
     };
 
+    // Flag (used on GGML_OP_CPY nodes) indicating whether a node is associated with the K or V cache
+    enum ggml_kv_cache_flag {
+        GGML_KV_CACHE_FLAG_NONE = 0,
+        GGML_KV_CACHE_FLAG_K = 1,
+        GGML_KV_CACHE_FLAG_V = 2
+    };
+
     // ggml object
     struct ggml_object {
         size_t offs;
@@ -602,6 +609,8 @@ extern "C" {
         // op params - allocated as int32_t for alignment
         int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
 
+        enum ggml_kv_cache_flag kv_cache_flag;
+
         int32_t flags;
 
         struct ggml_tensor * grad;
@@ -617,7 +626,7 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        // char padding[4];
+        char padding[1];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index 954ab20725acc9..c55f478a45aeed 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -1045,6 +1045,13 @@ struct ggml_backend_sched_split {
     struct ggml_cgraph graph;
 };
 
+// Object to facilitate GGML graph caching
+struct ggml_cached_graph {
+    bool is_active;
+    ggml_backend_t input_backend;
+    struct ggml_tensor * input_cpy[GGML_SCHED_MAX_SPLIT_INPUTS];
+};
+
 struct ggml_backend_sched {
     bool is_reset; // true if the scheduler has been reset since the last graph split
     bool is_alloc;
@@ -1767,6 +1774,14 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
 
+            if (!sched->cached_graph.is_active) {
+                sched->cached_graph.input_backend = input_backend;
+                sched->cached_graph.input_cpy[j] = input_cpy;
+            }
+            else {
+                input_backend = sched->cached_graph.input_backend;
+                input_cpy = sched->cached_graph.input_cpy[j];
+            }
             if (input->flags & GGML_TENSOR_FLAG_INPUT) {
                 // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
@@ -1888,6 +1903,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     ggml_backend_sched_reset(sched);
 
+    sched->cached_graph.is_active = false;
+
     return sched;
 }
 
@@ -1964,6 +1981,9 @@ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, st
 }
 
 enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+
+    if(!sched->cached_graph.is_active)
+    {
     if (!sched->is_reset && !sched->is_alloc) {
         ggml_backend_sched_reset(sched);
     }
@@ -1973,7 +1993,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sch
             return GGML_STATUS_ALLOC_FAILED;
         }
     }
-
+    }
     return ggml_backend_sched_compute_splits(sched);
 }
 
@@ -2238,3 +2258,12 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
 
     return true;
 }
+
+bool ggml_use_cached_graph(ggml_backend_sched_t sched) {
+    return sched->cached_graph.is_active;
+}
+
+void ggml_set_cached_graph(ggml_backend_sched_t sched, bool set_value) {
+    sched->cached_graph.is_active = set_value;
+}
+
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index dbe718679cde4c..053749e14c71a8 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3746,6 +3746,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.nb           =*/ { 0, 0, 0, 0 },
         /*.op           =*/ GGML_OP_NONE,
         /*.op_params    =*/ { 0 },
+        /*.kv_cache_flag=*/ GGML_KV_CACHE_FLAG_NONE,
         /*.flags        =*/ 0,
         /*.grad         =*/ NULL,
         /*.src          =*/ { NULL },
@@ -3754,7 +3755,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.data         =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
         /*.extra        =*/ NULL,
-        ///*.padding      =*/ { 0 },
+        /*.padding      =*/ { 0 },
     };
 
 #ifdef __clang__
diff --git a/src/llama.cpp b/src/llama.cpp
index b2dd601e03e141..9bfb1a3fe32311 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2648,6 +2648,17 @@ struct llama_model {
     }
 };
 
+// Object used to allow caching of the GGML graph between tokens where possible.
+struct ggml_cached_graph {
+    bool is_active = false;
+    ggml_cgraph * gf;
+    size_t n;
+    ggml_backend_t backend_res;
+    ggml_backend_t backend_embd;
+    struct ggml_tensor * res;
+    struct ggml_tensor * embd;
+};
+
 struct llama_context {
     llama_context(const llama_model & model)
         : model(model)
@@ -2748,6 +2759,9 @@ struct llama_context {
     struct ggml_tensor * inp_pos_bucket;    // I32 [n_batch|n_kv, n_batch]
     struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
     struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
+
+    // cached GGML graph
+    struct ggml_cached_graph cached_graph;
 };
 
 struct llama_lora_weight {
@@ -7900,7 +7914,9 @@ static void llm_build_kv_store(
     cb(k_cache_view, "k_cache_view", il);
 
     // note: storing RoPE-ed version of K in the KV cache
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
+    ggml_tensor * tmp = ggml_cpy(ctx, k_cur, k_cache_view);
+    tmp->kv_cache_flag = GGML_KV_CACHE_FLAG_K;
+    ggml_build_forward_expand(graph, tmp);
 
     assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
 
@@ -7918,8 +7934,9 @@ static void llm_build_kv_store(
         v_cur = ggml_transpose(ctx, v_cur);
     }
     cb(v_cache_view, "v_cache_view", il);
-
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
+    tmp = ggml_cpy(ctx, v_cur, v_cache_view);
+    tmp->kv_cache_flag = GGML_KV_CACHE_FLAG_V;
+    ggml_build_forward_expand(graph, tmp);
 }
 
 // do mat_mul, while optionally apply lora
@@ -14726,12 +14743,44 @@ static int llama_decode_internal(
         ggml_backend_sched_reset(lctx.sched);
         ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
 
-        ggml_cgraph * gf = llama_build_graph(lctx, u_batch, false);
-
+        ggml_cgraph * gf;
         // the output is always the last tensor in the graph
-        struct ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
-        struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
+        struct ggml_tensor * res;
+        struct ggml_tensor * embd;
+
+        bool n_has_changed_since_last_token = false;
+        if(lctx.cached_graph.n != kv_self.n) n_has_changed_since_last_token = true;
+        lctx.cached_graph.n = kv_self.n;
+
+        // Re-build graph only if graph caching is not possible
+        if(!ggml_use_cached_graph(lctx.sched) || n_has_changed_since_last_token) {
+
+            gf = llama_build_graph(lctx, u_batch, false);
+
+            // Set whether GGML graph caching is in use within the GGML module, based on
+            // whether caching was activated here during the previous token
+            ggml_set_cached_graph(lctx.sched, lctx.cached_graph.is_active);
+
+            // Disable future graph caching in the presence of the env var,
+            // if there are multiple devices, if the batch size is greater than 1,
+            // or if nsplits is not 2.
+            // TODO: enable graph caching for these cases
+            bool disable_cached_ggml_graph = (getenv("GGML_DISABLE_GRAPH_CACHING") != nullptr)
+                || (llama_get_device_count(model) > 1)
+                || (ggml_backend_sched_get_n_splits(lctx.sched) != 2);
+            for (int i = 0; i < gf->n_nodes; i++) {
+                if (gf->nodes[i]->op == GGML_OP_ADD && gf->nodes[i]->src[1] && gf->nodes[i]->src[1]->ne[1] > 1) {
+                    disable_cached_ggml_graph = true;
+                    break;
+                }
+            }
+
+            // Set whether graph caching should be used for future tokens
+            lctx.cached_graph.is_active = !disable_cached_ggml_graph;
+            // the output is always the last tensor in the graph
+            res  = gf->nodes[gf->n_nodes - 1];
+            embd = gf->nodes[gf->n_nodes - 2];
 
         if (lctx.n_outputs == 0) {
             // no output
             res  = nullptr;
@@ -14747,10 +14796,62 @@ static int llama_decode_internal(
             embd = nullptr; // do not extract embeddings when not needed
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
         }
+        lctx.cached_graph.res = res;
+        lctx.cached_graph.embd = embd;
 
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
+        }
+        else {
+            gf = lctx.cached_graph.gf;
+            res = lctx.cached_graph.res;
+            embd = lctx.cached_graph.embd;
+        }
+        lctx.cached_graph.gf = gf;
+
+        if(ggml_use_cached_graph(lctx.sched)) {
+
+            // Temporarily store the KV cache parameters that will need to be updated in the cached graph.
+            const struct llama_hparams & hparams = model.hparams;
+            const int64_t n_layer = hparams.n_layer;
+            const int64_t kv_head = kv_self.head;
+            std::vector<void *> k_cache_ptrs;
+            std::vector<void *> v_cache_ptrs;
+            for (int il = 0; il < n_layer; ++il) {
+                const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+                const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+                ggml_tensor * tmp_tensor = kv_self.k_l[il];
+                size_t tmp_offset = (ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa))*kv_head;
+                k_cache_ptrs.push_back(static_cast<char *>(tmp_tensor->data) + tmp_offset);
+                tmp_tensor = kv_self.v_l[il];
+                if (cparams.flash_attn) {
+                    tmp_offset = (kv_head)*ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+                } else {
+                    tmp_offset = (kv_head)*ggml_element_size(kv_self.v_l[il]);
+                }
+                v_cache_ptrs.push_back(static_cast<char *>(tmp_tensor->data) + tmp_offset);
+            }
+
+            // Update KV cache parameters in the cached graph.
+            int k_count = 0;
+            int v_count = 0;
+            if(gf != nullptr && gf->nodes != nullptr){
+                for (int i = 0; i < gf->n_nodes; i++) {
+                    ggml_tensor * node = gf->nodes[i];
+                    if (node->op == GGML_OP_CPY) {
+                        if (node->kv_cache_flag == GGML_KV_CACHE_FLAG_K) {
+                            node->src[1]->data = k_cache_ptrs[k_count++];
+                        }
+                        if (node->kv_cache_flag == GGML_KV_CACHE_FLAG_V) {
+                            node->src[1]->data = v_cache_ptrs[v_count++];
+                        }
+                    }
+                }
+            }
+
+        }
+
         llama_set_inputs(lctx, u_batch);
 
         llama_graph_compute(lctx, gf, n_threads);
 
@@ -14773,11 +14874,15 @@ static int llama_decode_internal(
         // extract logits
         if (res) {
             ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
-            GGML_ASSERT(backend_res != nullptr);
-            GGML_ASSERT(lctx.logits != nullptr);
-
             float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
             const int32_t n_outputs_new = lctx.n_outputs;
 
+            if(!ggml_use_cached_graph(lctx.sched))
+                lctx.cached_graph.backend_res = backend_res;
+            else
+                backend_res = lctx.cached_graph.backend_res;
+
+            GGML_ASSERT(backend_res != nullptr);
+            GGML_ASSERT(lctx.logits != nullptr);
 
             if (n_outputs_new) {
                 GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
@@ -14789,6 +14894,12 @@ static int llama_decode_internal(
         // extract embeddings
         if (embd) {
             ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
+
+
+            if(!ggml_use_cached_graph(lctx.sched))
+                lctx.cached_graph.backend_embd = backend_embd;
+            else
+                backend_embd = lctx.cached_graph.backend_embd;
             GGML_ASSERT(backend_embd != nullptr);
 
             switch (cparams.pooling_type) {
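
Usage sketch (illustrative only, not part of the patch): the new scheduler-level API is intended to be driven the way the llama_decode_internal changes above drive it: build and allocate the GGML graph once, mark the scheduler as using a cached graph, and on later tokens skip the rebuild and only patch per-token parameters (the K/V cache write pointers on the flagged CPY nodes) into the cached graph before computing. A minimal, hypothetical C caller is shown below; build_graph() and cached_gf are placeholders introduced here for illustration, not symbols added by this patch, and the real code also gates caching on device count, batch size and split count.

// Hedged sketch of a caller of the cached-graph API added above.
// build_graph() is a hypothetical placeholder for application-specific
// graph construction; the remaining calls are existing ggml-backend
// scheduler functions plus the two new ones from this patch.
#include "ggml.h"
#include "ggml-backend.h"

extern struct ggml_cgraph * build_graph(void); // placeholder: app-specific graph build

static struct ggml_cgraph * cached_gf = NULL;  // graph reused across tokens by the caller

enum ggml_status decode_one_token(ggml_backend_sched_t sched) {
    if (!ggml_use_cached_graph(sched)) {
        // first token (or caching disabled): build and allocate the full graph
        cached_gf = build_graph();
        ggml_backend_sched_alloc_graph(sched, cached_gf);
        // ask the scheduler to reuse this graph and its splits next time
        ggml_set_cached_graph(sched, true);
    } else {
        // later tokens: the cached graph is reused as-is; only per-token
        // parameters (e.g. KV-cache data pointers) are refreshed by the
        // caller before computing, as llama_decode_internal does above
    }
    // with an active cached graph, compute_async skips reset/alloc (see diff)
    return ggml_backend_sched_graph_compute_async(sched, cached_gf);
}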