From 89a33636e4b96cdd9a2998ea0fc4c381c5ed720e Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Tue, 13 Feb 2024 18:29:23 +0100
Subject: [PATCH] fix(llama.cpp): disable infinite context shifting

Infinite context shifting can turn into an endless loop when the model
hallucinates and never stops answering. This has the unpleasant effect
that the prediction never terminates, which happens especially with
small models, which tend to hallucinate.

Works around https://github.com/mudler/LocalAI/issues/1333 by removing
context shifting.

See also the upstream issue: https://github.com/ggerganov/llama.cpp/issues/3969
---
 backend/cpp/llama/grpc-server.cpp | 34 +++++++++++-----------------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
index 35ca6ea55bd9..56eb30838d3c 100644
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -1387,30 +1387,18 @@ struct llama_server_context
             {
                 if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
                 {
-                    // Shift context
-                    const int n_left    = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
-                    const int n_discard = n_left / 2;
-
-                    LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
-                    llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1            , slot.params.n_keep + n_discard + 1);
-                    llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
-
-                    for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
-                    {
-                        slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
-                    }
-
-                    slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
-
-                    slot.n_past -= n_discard;
-
-                    slot.truncated = true;
+                    // LOCALAI:
+                    // Temporarily disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
+                    // Context is exhausted, release the slot
+                    slot.release();
+                    send_final_response(slot);
+                    slot.cache_tokens.clear();
+                    slot.n_past = 0;
+                    slot.truncated = false;
+                    slot.has_next_token = true; // Assuming this flag exists to indicate if more tokens are expected
+                    LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
 
-                    LOG_VERBOSE("context shift", {
-                        { "n_ctx",  n_ctx },
-                        { "n_keep", params.n_keep },
-                        { "n_left", n_left },
-                    });
+                    continue;
                 }
             }
         }
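
For illustration, the following is a minimal, self-contained sketch of the control
flow this patch switches to: once a slot's token cache fills its context window, the
slot is released and the response is finalized instead of shifting the KV cache. The
toy_slot struct and the stubbed send_final_response() are hypothetical stand-ins,
not the actual llama.cpp/LocalAI types or APIs.

// toy_context_exhaustion.cpp - simplified model of the patched behavior
#include <cstdio>
#include <vector>

struct toy_slot {
    int  id             = 0;
    int  n_ctx          = 8;      // per-slot context window (toy value)
    int  n_past         = 0;
    bool processing     = true;
    bool truncated      = false;
    bool has_next_token = true;
    std::vector<int> cache_tokens;

    bool is_processing() const { return processing; }
    void release() { processing = false; }
};

// Stand-in for send_final_response(): just reports that the answer is finalized.
static void send_final_response(const toy_slot & slot) {
    std::printf("slot %d: final response sent after %zu tokens\n",
                slot.id, slot.cache_tokens.size());
}

int main() {
    toy_slot slot;

    // Generation loop: append tokens until the context is exhausted.
    for (int token = 0; ; ++token) {
        if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx) {
            // Patched behavior: no context shift; terminate the prediction
            // and free the slot for the next request.
            slot.release();
            send_final_response(slot);
            slot.cache_tokens.clear();
            slot.n_past    = 0;
            slot.truncated = false;
            std::printf("Context exhausted. Slot %d released\n", slot.id);
            break;
        }
        slot.cache_tokens.push_back(token);
        slot.n_past++;
    }
    return 0;
}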