From 7b1b575fe8de8ece4122f7bd7a8242e63841ec70 Mon Sep 17 00:00:00 2001 From: Green Sky Date: Wed, 22 Mar 2023 12:56:42 +0100 Subject: [PATCH 1/2] preallocate a buffer of fitting size for tokenization (utils.cpp) --- utils.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/utils.cpp b/utils.cpp index 1679ae10a3af7..3909c974f1e1f 100644 --- a/utils.cpp +++ b/utils.cpp @@ -146,8 +146,10 @@ std::string gpt_random_prompt(std::mt19937 & rng) { // TODO: not great allocating this every time std::vector llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) { - std::vector res(8096); + // initialize to prompt number of chars, since n_tokens <= n_prompt_chars + std::vector res(text.size() + (int)add_bos); int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos); + assert(n >= 0); res.resize(n); return res; From 57fee166d2b5714ceae8266fc61c371737d55dea Mon Sep 17 00:00:00 2001 From: Green Sky Date: Wed, 22 Mar 2023 12:58:20 +0100 Subject: [PATCH 2/2] don't create a new std::string (especially here, where it's usually large) --- main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index c164c102dbf41..fbb43a8cca15b 100644 --- a/main.cpp +++ b/main.cpp @@ -85,7 +85,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) { // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Output: `perplexity: 13.5106 [114/114]` - auto tokens = ::llama_tokenize(ctx, params.prompt.c_str(), true); + auto tokens = ::llama_tokenize(ctx, params.prompt, true); int count = 0; double nll = 0.0;