From 61f0ae73ef87267a8453f4e917254ae215f27c94 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Thu, 28 Mar 2024 15:17:43 +0100
Subject: [PATCH 1/2] llama: remove redundant reshape in build_kv_store

This commit removes the reshape of the V matrix in llm_build_kv_store.

The motivation for this is that the V matrix already has the shape:

```console
(gdb) p *v_cur
$46 = {type = GGML_TYPE_F32, backend = GGML_BACKEND_TYPE_CPU, buffer = 0x0,
  ne = {4096, 512, 1, 1}, nb = {4, 16384, 8388608, 8388608},
  op = GGML_OP_MUL_MAT, op_params = {0 <repeats ... times>}, flags = 0,
  grad = 0x0, src = {0xb496b0, 0x7ffef1c40950, 0x0, 0x0, 0x0, 0x0, 0x0,
  0x0, 0x0, 0x0}, perf_runs = 0, perf_cycles = 0, perf_time_us = 0,
  view_src = 0x0, view_offs = 0, data = 0x0,
  name = "Vcur-0", '\000' <repeats ... times>, extra = 0x0,
  padding = "\000\000\000\000\000\000\000"}
```

And after reshaping this tensor we get:

```console
(gdb) p *ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens)
$44 = {type = GGML_TYPE_F32, backend = GGML_BACKEND_TYPE_CPU, buffer = 0x0,
  ne = {4096, 512, 1, 1}, nb = {4, 16384, 8388608, 8388608},
  op = GGML_OP_RESHAPE, op_params = {0 <repeats ... times>}, flags = 0,
  grad = 0x0, src = {0x7ffef1c40e00, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
  0x0, 0x0, 0x0}, perf_runs = 0, perf_cycles = 0, perf_time_us = 0,
  view_src = 0x7ffef1c40e00, view_offs = 0, data = 0x0,
  name = "Vcur-0 (reshaped)", '\000' <repeats ... times>, extra = 0x0,
  padding = "\000\000\000\000\000\000\000"}
```

I noticed that the `src` and `view_src` fields differ, but the dimensions
are identical. The existing TODO comment already suggests that the reshape
is not needed, and the identical dimensions above support removing it.

Signed-off-by: Daniel Bevenius
---
 llama.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 77ec9b7a1935d..4cefaf0da4eef 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5523,8 +5523,7 @@ static void llm_build_kv_store(
     GGML_ASSERT(kv.size == n_ctx);
 
     // compute the transposed [n_tokens, n_embd] V matrix
-    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
-    //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
+    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
     cb(v_cur_t, "v_cur_t", il);
 
     struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,

From d907f70b0bc3e3aaec0859f8787d4fc71f1c98f4 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 29 Mar 2024 09:22:38 +0200
Subject: [PATCH 2/2] llama : add assert

---
 llama.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama.cpp b/llama.cpp
index 4cefaf0da4eef..1875e24716841 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5523,6 +5523,7 @@ static void llm_build_kv_store(
     GGML_ASSERT(kv.size == n_ctx);
 
     // compute the transposed [n_tokens, n_embd] V matrix
+    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
     struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
     cb(v_cur_t, "v_cur_t", il);
 
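For reference, here is a minimal standalone sketch (not part of the patch series) of why the removed reshape was a no-op: in ggml, reshaping a tensor to its existing dimensions just creates a view with an identical `ne`, so transposing `v_cur` directly is equivalent. The 4096x512 dimensions are taken from the gdb session above; the `main()` harness and the `printf` are illustrative, and the snippet assumes a standalone ggml build rather than llama.cpp itself.

```c
// standalone illustration, not part of the llama.cpp patch above
#include <assert.h>
#include <stdio.h>

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024, // scratch for tensor metadata
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,         // metadata only, no tensor data
    };
    struct ggml_context * ctx = ggml_init(params);

    // stand-in for v_cur: ne = {4096, 512}, matching the gdb output
    struct ggml_tensor * v_cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 512);

    // the reshape the patch removes: same target dimensions as v_cur
    struct ggml_tensor * v_res = ggml_reshape_2d(ctx, v_cur, 4096, 512);

    // the shapes are identical and the result is just a view of v_cur
    assert(v_res->ne[0] == v_cur->ne[0] && v_res->ne[1] == v_cur->ne[1]);
    assert(v_res->view_src == v_cur);

    // so transposing v_cur directly yields the same [512, 4096] result,
    // which is what the second patch's assert pins down at runtime
    struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
    assert(v_cur_t->ne[0] == 512 && v_cur_t->ne[1] == 4096);

    printf("reshape to identical dims is a view: op = %s\n",
           ggml_op_name(v_res->op));

    ggml_free(ctx);
    return 0;
}
```

Note that the added `assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens)` in the second patch makes the shape assumption explicit, so the reshape could only be reintroduced deliberately if a caller ever passed V in a different layout.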