From f8e9f1142893240102e32de06c7b55ea02ad4e2a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 23 Nov 2023 18:47:56 +0200
Subject: [PATCH] common : add -dkvc arg for enabling kv cache dumps

---
 common/common.cpp              | 4 ++++
 common/common.h                | 1 +
 examples/parallel/parallel.cpp | 8 ++++++--
 llama.h                        | 6 +++---
 4 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 636ed35627f4a..1dcc235eac0e6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -496,6 +496,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             params.chatml = true;
         } else if (arg == "--infill") {
             params.infill = true;
+        } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
+            params.dump_kv_cache = true;
         } else if (arg == "--multiline-input") {
             params.multiline_input = true;
         } else if (arg == "--simple-io") {
@@ -836,6 +838,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif // GGML_USE_CUBLAS
 #endif
     printf("  --verbose-prompt      print prompt before generation\n");
+    printf("  -dkvc, --dump-kv-cache\n");
+    printf("                        verbose print of the KV cache\n");
     printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
     printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
     printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
diff --git a/common/common.h b/common/common.h
index dc57b0ae6e174..2f6fe48ab53d3 100644
--- a/common/common.h
+++ b/common/common.h
@@ -122,6 +122,7 @@ struct gpt_params {
     bool numa            = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt  = false; // print prompt tokens before generation
     bool infill          = false; // use infill mode
+    bool dump_kv_cache   = false; // dump the KV cache contents for debugging purposes

     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 8cc20b422d0ce..e7de84bf2a951 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -113,6 +113,8 @@ int main(int argc, char ** argv) {
     // insert new requests as soon as the previous one is done
     const bool cont_batching = params.cont_batching;

+    const bool dump_kv_cache = params.dump_kv_cache;
+
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("parallel", "log"));
     LOG_TEE("Log start\n");
@@ -203,8 +205,10 @@ int main(int argc, char ** argv) {
     LOG_TEE("Processing requests ...\n\n");

     while (true) {
-        llama_kv_cache_view_update(ctx, &kvc_view);
-        dump_kv_cache_view_seqs(kvc_view, 40);
+        if (dump_kv_cache) {
+            llama_kv_cache_view_update(ctx, &kvc_view);
+            dump_kv_cache_view_seqs(kvc_view, 40);
+        }

         llama_batch_clear(batch);
diff --git a/llama.h b/llama.h
index 3208f158cf833..1a62058d1406b 100644
--- a/llama.h
+++ b/llama.h
@@ -400,13 +400,13 @@ extern "C" {
         llama_seq_id * cells_sequences;
     };

-    // Create an empty KV cache view.
+    // Create an empty KV cache view. (use only for debugging purposes)
     LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);

-    // Free a KV cache view.
+    // Free a KV cache view. (use only for debugging purposes)
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);

-    // Update the KV cache view structure with the current state of the KV cache.
+    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
     LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);

     // Returns the number of tokens in the KV cache (slow, use only for debug)
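
Note: the patch gates the per-iteration KV cache dump in examples/parallel behind the new flag, so the view is only refreshed and printed when -dkvc / --dump-kv-cache is passed. For reference, a minimal sketch of how another example could adopt the same pattern, using the debug-only view API declared in llama.h; the initialized llama_context * ctx, the parsed gpt_params params, the has_work() loop condition, and the n_seq_max sequence bound below are placeholders, not part of this patch:

    // a minimal sketch, assuming ctx and params are already set up;
    // n_seq_max is a hypothetical bound on sequences the view should track
    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_seq_max);

    while (has_work()) { // placeholder for the example's real loop condition
        if (params.dump_kv_cache) {
            llama_kv_cache_view_update(ctx, &kvc_view); // refresh the cache snapshot
            dump_kv_cache_view_seqs(kvc_view, 40);      // print 40 cells per row, as parallel.cpp does
        }

        // ... build and decode the next batch ...
    }

    llama_kv_cache_view_free(&kvc_view);

With the flag wired through gpt_params, enabling the dump from the command line would then look something like `./parallel -m <model> -np 4 -dkvc` (model path and client count illustrative).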