From 61f0ae73ef87267a8453f4e917254ae215f27c94 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Thu, 28 Mar 2024 15:17:43 +0100
Subject: [PATCH 1/2] llama: remove redundant reshape in build_kv_store

This commit removes the reshape of the V matrix in llm_build_kv_store.

The motivation for this is that the V matrix already has the shape:

```console
(gdb) p *v_cur
$46 = {type = GGML_TYPE_F32, backend = GGML_BACKEND_TYPE_CPU, buffer = 0x0,
  ne = {4096, 512, 1, 1}, nb = {4, 16384, 8388608, 8388608},
  op = GGML_OP_MUL_MAT, op_params = {0 <repeats ... times>}, flags = 0,
  grad = 0x0, src = {0xb496b0, 0x7ffef1c40950, 0x0, 0x0, 0x0, 0x0, 0x0,
  0x0, 0x0, 0x0}, perf_runs = 0, perf_cycles = 0, perf_time_us = 0,
  view_src = 0x0, view_offs = 0, data = 0x0,
  name = "Vcur-0", '\000' <repeats ... times>, extra = 0x0,
  padding = "\000\000\000\000\000\000\000"}
```

And after reshaping this tensor we get:

```console
(gdb) p *ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens)
$44 = {type = GGML_TYPE_F32, backend = GGML_BACKEND_TYPE_CPU, buffer = 0x0,
  ne = {4096, 512, 1, 1}, nb = {4, 16384, 8388608, 8388608},
  op = GGML_OP_RESHAPE, op_params = {0 <repeats ... times>}, flags = 0,
  grad = 0x0, src = {0x7ffef1c40e00, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
  0x0, 0x0, 0x0}, perf_runs = 0, perf_cycles = 0, perf_time_us = 0,
  view_src = 0x7ffef1c40e00, view_offs = 0, data = 0x0,
  name = "Vcur-0 (reshaped)", '\000' <repeats ... times>, extra = 0x0,
  padding = "\000\000\000\000\000\000\000"}
```

I noticed that the `src` and `view_src` fields differ, but the dimensions
are identical. The existing TODO comment already suggests that the reshape
is not needed, and the identical dimensions above support removing it.

Signed-off-by: Daniel Bevenius
---
 llama.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 77ec9b7a1935d..4cefaf0da4eef 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5523,8 +5523,7 @@ static void llm_build_kv_store(
     GGML_ASSERT(kv.size == n_ctx);
 
     // compute the transposed [n_tokens, n_embd] V matrix
-    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
-    //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
+    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
     cb(v_cur_t, "v_cur_t", il);
 
     struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,

From d907f70b0bc3e3aaec0859f8787d4fc71f1c98f4 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 29 Mar 2024 09:22:38 +0200
Subject: [PATCH 2/2] llama : add assert

---
 llama.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama.cpp b/llama.cpp
index 4cefaf0da4eef..1875e24716841 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5523,6 +5523,7 @@ static void llm_build_kv_store(
     GGML_ASSERT(kv.size == n_ctx);
 
     // compute the transposed [n_tokens, n_embd] V matrix
+    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
     struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
     cb(v_cur_t, "v_cur_t", il);
 
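For reference, here is a minimal standalone sketch (not part of the patch series) of why the removed reshape was a no-op: in ggml, reshaping a tensor to its existing dimensions just creates a view with an identical `ne`, so transposing `v_cur` directly is equivalent. The 4096x512 dimensions are taken from the gdb session above; the `main()` harness and the `printf` are illustrative, and the snippet assumes a standalone ggml build rather than llama.cpp itself.

```c
// standalone illustration, not part of the llama.cpp patch above
#include <assert.h>
#include <stdio.h>

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024, // scratch for tensor metadata
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,         // metadata only, no tensor data
    };
    struct ggml_context * ctx = ggml_init(params);

    // stand-in for v_cur: ne = {4096, 512}, matching the gdb output
    struct ggml_tensor * v_cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 512);

    // the reshape the patch removes: same target dimensions as v_cur
    struct ggml_tensor * v_res = ggml_reshape_2d(ctx, v_cur, 4096, 512);

    // the shapes are identical and the result is just a view of v_cur
    assert(v_res->ne[0] == v_cur->ne[0] && v_res->ne[1] == v_cur->ne[1]);
    assert(v_res->view_src == v_cur);

    // so transposing v_cur directly yields the same [512, 4096] result,
    // which is what the second patch's assert pins down at runtime
    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
    assert(v_cur_t->ne[0] == 512 && v_cur_t->ne[1] == 4096);

    printf("reshape to identical dims is a view: op = %s\n",
           ggml_op_name(v_res->op));

    ggml_free(ctx);
    return 0;
}
```

Note that the added `assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens)` in the second patch makes the shape assumption explicit, so the reshape could only be reintroduced deliberately if a caller ever passed V in a different layout.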