From 685d236d8b99006dd96d313017241242e5f12bd6 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Sun, 2 Jul 2023 22:57:14 +0800
Subject: [PATCH 1/2] Add BPE dropout support, use it in training.

---
 examples/common.cpp                          |  2 +-
 examples/save-load-state/save-load-state.cpp |  2 +-
 .../train-text-from-scratch.cpp              |  2 +-
 llama.cpp                                    | 28 +++++++++++++++----
 llama.h                                      |  3 +-
 tests/test-tokenizer-0.cpp                   |  2 +-
 6 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 3278a064346b4..3e4ae2ba6d880 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -527,7 +527,7 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
     // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
     std::vector<llama_token> res(text.size() + (int) add_bos);
-    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
+    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos, 0.0);
     assert(n >= 0);
     res.resize(n);
 
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 4c868850317fe..e39de5a884388 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -45,7 +45,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
     auto tokens = std::vector<llama_token>(params.n_ctx);
-    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
+    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true, 0.0);
 
     if (n_prompt_tokens < 1) {
         fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index c50eeb343bcef..817dcdba3a2ff 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -2187,7 +2187,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
 
         out.resize(buf.size());
 
-        int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
+        int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false, 0.1f);
         if (n_tokens >= 0) {
             out.resize(n_tokens);
         }
diff --git a/llama.cpp b/llama.cpp
index a869bbac80304..4e0979b45a275 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -48,6 +48,7 @@
 #include
 #include
 #include
+#include <random>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -1717,7 +1718,7 @@ struct llama_sp_bigram {
 // original implementation:
 // https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
 struct llama_tokenizer {
-    llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
+    llama_tokenizer(const llama_vocab & vocab, float dropout): vocab_(vocab), dropout_(dropout) {}
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         // split string into utf8 chars
@@ -1759,6 +1760,9 @@ struct llama_tokenizer {
             right_sym.n = 0;
 
             //printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
+            if (skip_merge()) {
+                continue;
+            }
 
             // remove the right sym from the chain
             left_sym.next = right_sym.next;
@@ -1814,13 +1818,26 @@ struct llama_tokenizer {
             work_queue_.push(bigram);
     }
 
+    bool skip_merge()
+    {
+        std::uniform_real_distribution<> gen(0.0, 1.0);
+        if (dropout_ <= 0.0) {
+            return false;
+        }
+        if (dropout_ >= 1.0)
+            return true;
+        return gen(rng) < dropout_;
+    }
+
     const llama_vocab & vocab_;
     std::vector<llama_sp_symbol> symbols_;
     llama_sp_bigram::queue work_queue_;
+    float dropout_;
+    std::mt19937 rng;
 };
 
-static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
-    llama_tokenizer tokenizer(vocab);
+static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos, float dropout) {
+    llama_tokenizer tokenizer(vocab, dropout);
 
     std::vector<llama_vocab::id> output;
 
     if (text.empty()) {
@@ -3407,8 +3424,9 @@ int llama_tokenize(
                   const char * text,
                  llama_token * tokens,
                          int   n_max_tokens,
-                        bool   add_bos) {
-    auto res = llama_tokenize(ctx->vocab, text, add_bos);
+                        bool   add_bos,
+                       float   dropout) {
+    auto res = llama_tokenize(ctx->vocab, text, add_bos, dropout);
 
     if (n_max_tokens < (int) res.size()) {
         fprintf(stderr, "%s: too many tokens\n", __func__);
diff --git a/llama.h b/llama.h
index 5bb1964bd390d..c8a97b9f60887 100644
--- a/llama.h
+++ b/llama.h
@@ -252,7 +252,8 @@ extern "C" {
                      const char * text,
                     llama_token * tokens,
                             int   n_max_tokens,
-                           bool   add_bos);
+                           bool   add_bos,
+                          float   dropout);
 
     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index 20abe710018ee..198e0cb5d253e 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -64,7 +64,7 @@ int main(int argc, char **argv) {
 
     for (const auto & test_kv : k_tests()) {
         std::vector<llama_token> res(test_kv.first.size());
-        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
+        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true, 0.0);
         res.resize(n);
 
         bool correct = res.size() == test_kv.second.size();

From b48bef807408fb931e1010e18b7cef322d5757d7 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Mon, 3 Jul 2023 09:22:45 +0800
Subject: [PATCH 2/2] Fix style

---
 llama.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 4e0979b45a275..c3858dd1b8ebf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1824,8 +1824,9 @@ struct llama_tokenizer {
         if (dropout_ <= 0.0) {
             return false;
         }
-        if (dropout_ >= 1.0)
+        if (dropout_ >= 1.0) {
             return true;
+        }
         return gen(rng) < dropout_;
     }
 
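
Usage note: BPE dropout (Provilkov et al., 2020, "BPE-Dropout: Simple and Effective Subword Regularization") regularizes a subword model by randomly skipping merge steps during tokenization, so the same text can be segmented into finer-grained subword sequences instead of always the single canonical one. A dropout of 0.0 skips no merges and reproduces the old deterministic behavior, which is why every non-training call site above passes a literal 0.0 while train-text-from-scratch opts in with 0.1f. Below is a minimal caller-side sketch of the patched API; tokenize_with_dropout is an illustrative helper, not part of these patches, and model/context setup is omitted.

#include <cassert>
#include <string>
#include <vector>

#include "llama.h"

// Tokenize `text`, skipping each eligible BPE merge with probability `dropout`.
// A dropout of 0.0 keeps the canonical greedy-merge segmentation.
static std::vector<llama_token> tokenize_with_dropout(
        struct llama_context * ctx, const std::string & text, float dropout) {
    // n_tokens <= number of input bytes, plus one slot for the BOS token
    std::vector<llama_token> res(text.size() + 1);
    const int n = llama_tokenize(ctx, text.c_str(), res.data(),
                                 (int) res.size(), /*add_bos=*/true, dropout);
    assert(n >= 0); // negative would mean the buffer was too small
    res.resize(n);
    return res;
}

// Training: a small nonzero dropout splits some words into finer pieces,
// acting as subword regularization (train-text-from-scratch above uses 0.1f):
//     auto train_tokens = tokenize_with_dropout(ctx, line, 0.1f);
// Inference: pass 0.0 so token sequences stay canonical and comparable:
//     auto eval_tokens = tokenize_with_dropout(ctx, prompt, 0.0f);

Note that as written the tokenizer's std::mt19937 member is default-seeded and constructed anew on every llama_tokenize call, so a given text and dropout value still produce a repeatable segmentation; the randomness varies decisions across merge positions within one call, not across calls.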