Skip to content

Commit

Permalink
Better leading space removal
Browse files: browse the repository at this point in the history
  • Loading branch information
jaime-m-p committed Jun 25, 2024
1 parent 9854a9c commit 107923c
Showing 1 changed file with 5 additions and 5 deletions.
10 changes: 5 additions & 5 deletions llama.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -18505,12 +18505,12 @@ int32_t llama_detokenize(
int32_t text_len_max,
bool remove_special,
bool unparse_special) {
// remove the leading space of the first non-control token
static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
bool remove_space = !unparse_special && model->vocab.tokenizer_add_space_prefix;
int32_t avail = text_len_max;
int32_t total = 0;

// remove the leading space
bool remove_space = model->vocab.tokenizer_add_space_prefix;

if (remove_special && model->vocab.tokenizer_add_bos) {
if (n_tokens > 0 && tokens[0] == model->vocab.special_bos_id) {
n_tokens--;
Expand All @@ -18527,15 +18527,15 @@ int32_t llama_detokenize(
for (int32_t i = 0; i < n_tokens; ++i) {
GGML_ASSERT(avail >= 0);
int32_t n_chars = llama_token_to_piece(model, tokens[i], text, avail, remove_space, unparse_special);
const llama_token_attr attr = llama_token_get_attr(model, tokens[i]);
remove_space = remove_space && (attr & attr_special); // until non-control token
if (n_chars < 0) {
avail = 0;
total -= n_chars;
remove_space = false;
} else if (n_chars > 0) {
avail -= n_chars;
text += n_chars;
total += n_chars;
remove_space = false;
}
}

Expand Down

0 comments on commit 107923c

Please sign in to comment.