From e6b1a5003e9475a4e415902a3607111cf723fdd9 Mon Sep 17 00:00:00 2001
From: goerch
Date: Sun, 23 Jul 2023 18:17:32 +0200
Subject: [PATCH] Fix for #2310

Waiting for the fallout ...
---
 examples/common.cpp                          |   2 +-
 examples/embedding/embedding.cpp             |   2 +-
 examples/main/main.cpp                       |  12 +-
 examples/save-load-state/save-load-state.cpp |   4 +-
 examples/simple/simple.cpp                   |   4 +-
 .../train-text-from-scratch.cpp              |  10 +-
 llama.cpp                                    | 107 ++++++++++++------
 llama.h                                      |  16 ++-
 tests/test-tokenizer-0.cpp                   |  46 ++------
 tests/test-tokenizer-1.cpp                   |  31 ++---
 10 files changed, 119 insertions(+), 115 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 09901959956f9..ada0e3e609642 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -564,7 +564,7 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 // TODO: not great allocating this every time
 std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
     // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
+    std::vector<llama_token> res(text.size() + (int) add_bos + 1);
     const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
     assert(n >= 0);
     res.resize(n);
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 5192d6df5c2f8..8788571cbf9d4 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
         }
         fprintf(stderr, "\n");
     }
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 656382f8161dd..4b4d0397f5c52 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -196,10 +196,6 @@ int main(int argc, char ** argv) {
 
     // tokenize the prompt
     std::vector<llama_token> embd_inp;
-
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         embd_inp = ::llama_tokenize(ctx, params.prompt, true);
     } else {
@@ -283,7 +279,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
         }
 
         if (ctx_guidance) {
@@ -291,14 +287,14 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
             fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
             for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
             }
         }
 
         if (params.n_keep > 0) {
             fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
+                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
             }
             fprintf(stderr, "'\n");
         }
@@ -636,7 +632,7 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo) {
             for (auto id : embd) {
-                printf("%s", llama_token_to_str(ctx, id));
+                printf("%s", llama_token_to_str(ctx, id).c_str());
             }
             fflush(stdout);
         }
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 4c868850317fe..d09c27dae2f3f 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -91,7 +91,7 @@ int main(int argc, char ** argv) {
         auto next_token_str = llama_token_to_str(ctx, next_token);
         last_n_tokens_data.push_back(next_token);
 
-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
         if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx);
@@ -151,7 +151,7 @@ int main(int argc, char ** argv) {
         auto next_token_str = llama_token_to_str(ctx2, next_token);
         last_n_tokens_data.push_back(next_token);
 
-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
         if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx2);
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index aa2c4352df294..886825d0f0551 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -102,7 +102,7 @@ int main(int argc, char ** argv)
 
     for( auto id : tokens_list )
     {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+        printf( "%s" , llama_token_to_str( ctx , id ).c_str() );
     }
 
     fflush(stdout);
@@ -162,7 +162,7 @@ int main(int argc, char ** argv)
         }
 
         // Print the new token :
-        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
+        printf( "%s" , llama_token_to_str( ctx , new_token_id ).c_str() );
         fflush( stdout );
 
         // Push this new token for next evaluation :
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 449b4e9ecdd54..6c6806e5e4fdf 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1959,7 +1959,7 @@ void print_matrix(struct ggml_tensor * probs) {
 
 
 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token));
+    printf("%s", llama_token_to_str(ctx, token).c_str());
 }
 
 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -2198,17 +2198,17 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
         const char * in  = buf.data();
         const char * end = buf.data() + buf.size();
         for (int i = 0; i < (int) out.size(); ++i) {
-            const char * s = llama_token_to_str(lctx, out[i]);
-            int len = strlen(s);
+            std::string s = llama_token_to_str(lctx, out[i]);
+            int len = s.length();
             if (in >= end) {
                 printf("%s: unexpected end of original text.\n", __func__);
                 break;
             }
-            const bool matches = (strncmp(in, s, len) == 0);
+            const bool matches = (strncmp(in, s.c_str(), len) == 0);
             if (matches) {
                 in += len;
             } else {
-                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s);
+                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str());
             }
         }
     }
diff --git a/llama.cpp b/llama.cpp
index 9aa29f4572ed7..e566465fa78f4 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -242,13 +242,6 @@ struct llama_kv_cache {
     }
 };
 
-struct llama_trie {
-    std::unordered_map<std::string, struct llama_trie> map;
-};
-
-void llama_trie_insert(struct llama_trie& trie, const std::string& text, size_t offs);
-size_t llama_trie_find(const struct llama_trie& trie, const std::string& text, size_t offs);
-
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
@@ -260,7 +253,6 @@ struct llama_vocab {
 
     std::unordered_map<token, id> token_to_id;
     std::vector<token_score> id_to_token;
-    struct llama_trie trie;
 };
 
 struct llama_model {
@@ -524,13 +516,12 @@ struct llama_file_loader {
             float score = 0.0f;
             file.read_raw(&score, sizeof(score));
 
+            assert(vocab.token_to_id.find(word) == vocab.token_to_id.end());
             vocab.token_to_id[word] = i;
 
             auto & tok_score = vocab.id_to_token[i];
             tok_score.tok = word;
             tok_score.score = score;
-
-            llama_trie_insert(vocab.trie, word, 0);
         }
     }
     void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
@@ -1804,26 +1795,37 @@ struct llama_sp_bigram {
     size_t size;
 };
 
-void llama_trie_insert(struct llama_trie& trie, const std::string& text, size_t offs) {
-    if (offs < text.size()) {
-        size_t char_len = utf8_len(text[offs]);
-        std::string key = text.substr(offs, char_len);
-        if (trie.map.find(key) == trie.map.end()) {
-            trie.map[key] = llama_trie();
+static std::string llama_escape_whitespace(const std::string& text) {
+    std::string result;
+    bool escaping = false;
+    result += char(0xe2);
+    result += char(0x96);
+    result += char(0x81);
+    for (size_t offs = 0; offs < text.length(); ++offs) {
+        if (text[offs] == ' ') {
+            if (!escaping) {
+                result += char(0xe2);
+                result += char(0x96);
+                result += char(0x81);
+                escaping = true;
+            }
+        }
+        else {
+            escaping = false;
+            result += text[offs];
         }
-        llama_trie_insert(trie.map.at(key), text, offs + char_len);
     }
+    return result;
 }
 
-size_t llama_trie_find(const struct llama_trie& trie, const std::string & text, size_t offs) {
-    if (offs < text.size()) {
-        size_t char_len = utf8_len(text[offs]);
-        std::string key = text.substr(offs, char_len);
-        if (trie.map.find(key) != trie.map.end()) {
-            return char_len + llama_trie_find(trie.map.at(key), text, offs + char_len);
-        }
-    }
-    return 0;
+static std::string llama_unescape_whitespace(const std::string& word) {
+    if (word.length() >= 3 &&
+        word[0] == char(0xe2) &&
+        word[1] == char(0x96) &&
+        word[2] == char(0x81)) {
+        return std::string(" ") + word.substr(3);
+    }
+    return word;
 }
 
 // original implementation:
@@ -1832,13 +1834,12 @@ struct llama_tokenizer {
     llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-        // split string into utf8 chars / token?
+        // split string into utf8 chars
         int index = 0;
         size_t offs = 0;
         while (offs < text.size()) {
             llama_sp_symbol sym;
             size_t len = utf8_len(text[offs]);
-            // size_t len = llama_trie_find(vocab_.trie, text, offs);
             if (len == 0) {
                 len = utf8_len(text[offs]);
             }
@@ -1908,7 +1909,7 @@ struct llama_tokenizer {
 
         if (p == rev_merge.end()) {
             // output any symbols that did not form tokens as bytes.
-            for (int j = 0; j < (int) symbol.n; ++j) {
+            for (int j = 0; j < (int)symbol.n; ++j) {
                 llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
                 output.push_back(token_id);
             }
@@ -1954,11 +1955,11 @@ struct llama_tokenizer {
     std::map<std::string, std::pair<int, int> > rev_merge;
 };
 
-static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
+static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
     llama_tokenizer tokenizer(vocab);
     std::vector<llama_vocab::id> output;
 
-    if (text.empty()) {
+    if (raw_text.empty()) {
         return output;
     }
 
@@ -1966,6 +1967,13 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
         output.push_back(llama_token_bos());
     }
 
+    std::string text;
+    if (escape) {
+        text = llama_escape_whitespace(raw_text);
+    } else {
+        text = raw_text;
+    }
+
     tokenizer.tokenize(text, output);
     return output;
 }
@@ -3620,7 +3628,7 @@ int llama_tokenize_with_model(
                  llama_token * tokens,
                          int   n_max_tokens,
                         bool   add_bos) {
-    auto res = llama_tokenize(model->vocab, text, add_bos);
+    auto res = llama_tokenize(model->vocab, text, add_bos, true);
 
     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3643,6 +3651,27 @@ int llama_tokenize(
     return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
 }
 
+int llama_tokenize_bpe(
+        struct llama_context * ctx,
+                  const char * text,
+                 llama_token * tokens,
+                         int   n_max_tokens,
+                        bool   add_bos) {
+    auto res = llama_tokenize(ctx->model.vocab, text, add_bos, false);
+
+    if (n_max_tokens < (int) res.size()) {
+        fprintf(stderr, "%s: too many tokens\n", __func__);
+        return -((int) res.size());
+    }
+
+    for (size_t i = 0; i < res.size(); i++) {
+        tokens[i] = res[i];
+    }
+
+    return res.size();
+}
+
+
 int llama_n_vocab_from_model(const struct llama_model * model) {
     return model->vocab.id_to_token.size();
 }
@@ -3696,18 +3725,26 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
+std::string llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
     if (token >= llama_n_vocab_from_model(model)) {
         return nullptr;
     }
 
-    return model->vocab.id_to_token[token].tok.c_str();
+    return llama_unescape_whitespace(model->vocab.id_to_token[token].tok);
 }
 
-const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     return llama_token_to_str_with_model(&ctx->model, token);
 }
 
+std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) {
+    if (token >= llama_n_vocab_from_model(&ctx->model)) {
+        return nullptr;
+    }
+
+    return ctx->model.vocab.id_to_token[token].tok;
+}
+
 llama_token llama_token_bos() {
     return 1;
 }
diff --git a/llama.h b/llama.h
index bbf28e68684cf..f6e574bb968b4 100644
--- a/llama.h
+++ b/llama.h
@@ -11,6 +11,7 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
+#include <string>
 
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
@@ -278,6 +279,13 @@ extern "C" {
                              int   n_max_tokens,
                             bool   add_bos);
 
+    LLAMA_API int llama_tokenize_bpe(
+            struct llama_context * ctx,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
     LLAMA_API int llama_tokenize_with_model(
         const struct llama_model * model,
                       const char * text,
@@ -319,11 +327,15 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
+    LLAMA_API std::string llama_token_to_str(
+            const struct llama_context * ctx,
+            llama_token token);
+
+    LLAMA_API std::string llama_token_to_str_bpe(
             const struct llama_context * ctx,
             llama_token token);
 
-    LLAMA_API const char * llama_token_to_str_with_model(
+    LLAMA_API std::string llama_token_to_str_with_model(
             const struct llama_model * model,
             llama_token token);
 
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index d738bf680f821..40c0c6b4e3ba2 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -5,44 +5,10 @@
 #include <map>
 #include <vector>
 
-static std::string escape_whitespace(const std::string& text) {
-    std::string result;
-    bool escaping = false;
-    result += char(0xe2);
-    result += char(0x96);
-    result += char(0x81);
-    for (size_t offs = 0; offs < text.length(); ++offs) {
-        if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
-            if (!escaping) {
-                result += char(0xe2);
-                result += char(0x96);
-                result += char(0x81);
-                escaping = true;
-            }
-        }
-        else {
-            escaping = false;
-            result += text[offs];
-        }
-    }
-    return result;
-}
-
-static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
-    const char* word = llama_token_to_str(ctx, token);
-    if (strlen(word) >= 3 &&
-        word[0] == char(0xe2) &&
-        word[1] == char(0x96) &&
-        word[2] == char(0x81)) {
-        return std::string(" ") + (word + 3);
-    }
-    return word;
-}
-
 static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
     std::string result;
     for (int i = 0; i < count; ++i) {
-        result += unescape_whitespace(ctx, tokens[i]);
+        result += llama_token_to_str(ctx, tokens[i]);
     }
     return result;
 }
@@ -50,6 +16,9 @@ static std::string unescape_whitespace(llama_context* ctx, const llama_token* to
 static const std::map<std::string, std::vector<llama_token>> & k_tests()
 {
     static std::map<std::string, std::vector<llama_token>> _k_tests = {
+        {" ", {1, 259,},},
+        { "\t", { 1, 29871, 12, }, },
+        { "\n", { 1, 29871, 13, }, },
         { "Hello world", { 1, 15043, 3186, }, },
         { " Hello world", { 1, 29871, 15043, 3186, }, },
         { "Hello World", { 1, 15043, 2787, }, },
@@ -58,7 +27,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests()
         {" this is 🦙.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
         {"w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
         {"нещо на Български", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, },
-    };
+        {"How are you?", { 1, 1128, 526, 366, 29973, }, },
+    };
 
     return _k_tests;
 };
@@ -109,8 +79,8 @@ int main(int argc, char **argv) {
     }
 
     for (const auto & test_kv : k_tests()) {
-        std::vector<llama_token> res(test_kv.first.size());
-        const int n = llama_tokenize(ctx, escape_whitespace(test_kv.first.c_str()).c_str(), res.data(), int(res.size()), true);
+        std::vector<llama_token> res(test_kv.first.size() + 2);
+        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
         fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
             __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res.data(), n).c_str());
         res.resize(n);
diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp
index 632e0525a9b15..cd105f0980e4e 100644
--- a/tests/test-tokenizer-1.cpp
+++ b/tests/test-tokenizer-1.cpp
@@ -15,7 +15,7 @@ static std::string escape_whitespace(const std::string& text) {
     result += char(0x96);
     result += char(0x81);
     for (size_t offs = 0; offs < text.length(); ++offs) {
-        if (text[offs] == ' ' || text[offs] == '\t' || text[offs] == '\n') {
+        if (text[offs] == ' ') {
            if (!escaping) {
                result += char(0xe2);
                result += char(0x96);
@@ -31,21 +31,10 @@ static std::string escape_whitespace(const std::string& text) {
     return result;
 }
 
-static std::string unescape_whitespace(llama_context* ctx, llama_token token) {
-    const char* word = llama_token_to_str(ctx, token);
-    if (strlen(word) >= 3 &&
-        word[0] == char(0xe2) &&
-        word[1] == char(0x96) &&
-        word[2] == char(0x81)) {
-        return std::string(" ") + (word + 3);
-    }
-    return word;
-}
-
 static std::string unescape_whitespace(llama_context* ctx, const llama_token* tokens, int count) {
     std::string result;
     for (int i = 0; i < count; ++i) {
-        result += unescape_whitespace(ctx, tokens[i]);
+        result += llama_token_to_str(ctx, tokens[i]);
     }
     return result;
 }
@@ -97,22 +86,22 @@ int main(int argc, char **argv) {
     }
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string forward = llama_token_to_str(ctx, i);
+        std::string forward = llama_token_to_str_bpe(ctx, i);
         std::vector<llama_token> tokens(forward.length());
-        int n = llama_tokenize(ctx, forward.c_str(), tokens.data(), forward.length(), false);
+        int n = llama_tokenize_bpe(ctx, forward.c_str(), tokens.data(), forward.length(), false);
         if (n == 1) {
             if (i != tokens[0]) {
-                std::string backward = unescape_whitespace(ctx, tokens[0]);
+                std::string backward = llama_token_to_str(ctx, tokens[0]);
                 fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns token %d %s\n",
-                    __func__, i, unescape_whitespace(ctx, i).c_str(), tokens[0], backward.c_str());
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
             }
         } else {
             if (i <= 258) {
                 fprintf(stderr, "%s : info: token %d is string %s and tokenize() returns tokens %s\n",
-                    __func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
             } else {
                 fprintf(stderr, "%s : error: token %d is string %s but tokenize() returns tokens %s\n",
-                    __func__, i, unescape_whitespace(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens.data(), n).c_str());
             }
         }
     }
@@ -121,8 +110,8 @@
 
     for (wchar_t ch = 0x0000; ch < 0xffff; ++ch) {
         std::wstring wstr(1, ch);
        std::string str = converter.to_bytes(wstr);
-        std::vector<llama_token> tokens(strlen(str.c_str()));
-        auto n = llama_tokenize(ctx, escape_whitespace(str).c_str(), tokens.data(), str.length(), false);
+        std::vector<llama_token> tokens(str.length() + 1);
+        auto n = llama_tokenize(ctx, escape_whitespace(str).c_str(), tokens.data(), str.length() + 1, false);
         if (n == 1) {
             fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
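
Editor's note, not part of the patch: this change makes llama_token_to_str (and the new _bpe/_with_model variants) return std::string instead of const char *, moves the sentencepiece-style whitespace escaping (U+2581, the byte sequence 0xe2 0x96 0x81) into llama.cpp itself, and requires callers to size their token buffers with extra slots for the BOS and leading escape tokens. A minimal caller-side sketch of the resulting API follows; the helper name print_tokenization and the already-initialized ctx are illustrative assumptions, not code from this patch.

// Minimal usage sketch for the post-patch API (hypothetical helper;
// assumes a valid struct llama_context * ctx obtained elsewhere).
#include <cstdio>
#include <string>
#include <vector>

#include "llama.h"

static void print_tokenization(struct llama_context * ctx, const std::string & prompt) {
    // +2 slots: one for the optional BOS token and one for the leading
    // whitespace-escape token the tokenizer may now emit (the same sizing
    // the updated test-tokenizer-0.cpp uses).
    std::vector<llama_token> tokens(prompt.size() + 2);
    const int n = llama_tokenize(ctx, prompt.c_str(), tokens.data(),
                                 (int) tokens.size(), /*add_bos=*/true);
    if (n < 0) {
        // llama_tokenize returns the negated token count when the buffer is too small.
        fprintf(stderr, "buffer too small, need %d tokens\n", -n);
        return;
    }
    tokens.resize(n);

    for (llama_token id : tokens) {
        // llama_token_to_str now returns std::string, so .c_str() is needed
        // when passing the result to printf-style functions.
        printf("%6d -> '%s'\n", id, llama_token_to_str(ctx, id).c_str());
    }
}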