From 107923cdd212c77738278541b6090110ee7c34eb Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Tue, 25 Jun 2024 17:33:56 +0200 Subject: [PATCH] Better leading space removal --- llama.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index 0391583704472..ab8620ec6db65 100644 --- a/llama.cpp +++ b/llama.cpp @@ -18505,12 +18505,12 @@ int32_t llama_detokenize( int32_t text_len_max, bool remove_special, bool unparse_special) { - // remove the leading space of the first non-control token - static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL; - bool remove_space = !unparse_special && model->vocab.tokenizer_add_space_prefix; int32_t avail = text_len_max; int32_t total = 0; + // remove the leading space + bool remove_space = model->vocab.tokenizer_add_space_prefix; + if (remove_special && model->vocab.tokenizer_add_bos) { if (n_tokens > 0 && tokens[0] == model->vocab.special_bos_id) { n_tokens--; @@ -18527,15 +18527,15 @@ int32_t llama_detokenize( for (int32_t i = 0; i < n_tokens; ++i) { GGML_ASSERT(avail >= 0); int32_t n_chars = llama_token_to_piece(model, tokens[i], text, avail, remove_space, unparse_special); - const llama_token_attr attr = llama_token_get_attr(model, tokens[i]); - remove_space = remove_space && (attr & attr_special); // until non-control token if (n_chars < 0) { avail = 0; total -= n_chars; + remove_space = false; } else if (n_chars > 0) { avail -= n_chars; text += n_chars; total += n_chars; + remove_space = false; } }