From cc50e78fbe0928c9b2073503fe70174f143efede Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Tue, 30 Jul 2024 16:57:47 +0200 Subject: [PATCH 1/4] llama-vocab, llama : handle <|eom_id|> Llama-3.1 token --- src/llama-vocab.cpp | 7 ++++++- src/llama-vocab.h | 2 ++ src/llama.cpp | 17 +++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 133094904c2d2..9be076f6d7c52 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1444,7 +1444,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) { return token != -1 && ( token == llama_token_eos_impl(vocab) || - token == llama_token_eot_impl(vocab) + token == llama_token_eot_impl(vocab) || + token == llama_token_eom_impl(vocab) ); } @@ -1500,6 +1501,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) { return vocab.special_eot_id; } +llama_token llama_token_eom_impl(const struct llama_vocab & vocab) { + return vocab.special_eom_id; +} + int32_t llama_tokenize_impl( const struct llama_vocab & vocab, const char * text, diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 30b565d55dad5..7adfc16da3af3 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -45,6 +45,7 @@ struct llama_vocab { id special_suffix_id = -1; id special_middle_id = -1; id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token + id special_eom_id = -1; // tokenizer flags bool tokenizer_add_space_prefix = false; @@ -101,6 +102,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab); llama_token llama_token_middle_impl(const struct llama_vocab & vocab); llama_token llama_token_suffix_impl(const struct llama_vocab & vocab); llama_token llama_token_eot_impl (const struct llama_vocab & vocab); +llama_token llama_token_eom_impl (const struct llama_vocab & vocab); int32_t llama_tokenize_impl( const struct llama_vocab & vocab, diff --git a/src/llama.cpp b/src/llama.cpp index a207451f58507..e37bd5c14e433 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -362,6 +362,7 @@ enum llm_kv { LLM_KV_TOKENIZER_SUFFIX_ID, LLM_KV_TOKENIZER_MIDDLE_ID, LLM_KV_TOKENIZER_EOT_ID, + LLM_KV_TOKENIZER_EOM_ID, LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, @@ -459,6 +460,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, + { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, { LLM_KV_ADAPTER_TYPE, "adapter.type" }, { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, @@ -5585,6 +5587,7 @@ static void llm_load_vocab( { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id }, { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id }, { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id }, + { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id }, }; for (const auto & it : special_token_types) { @@ -5637,6 +5640,20 @@ static void llm_load_vocab( } } } + + // find EOM token: "<|eom_id|>" + // + // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID + // for now, we apply this workaround to find the EOM token based on its text + if (vocab.special_eom_id == -1) { + for (const auto & t : vocab.token_to_id) { + if (t.first == "<|eom_id|>") { + vocab.special_eom_id = t.second; + break; + } + } + } + } // build special tokens cache From f10b0e2c39e03ec3e4abead1573dcf46575f3e82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Sun, 4 Aug 2024 20:22:48 +0200 Subject: [PATCH 2/4] gguf-py : add constants and method related to <|eom_id|> token --- gguf-py/gguf/constants.py | 2 ++ gguf-py/gguf/gguf_writer.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e343c2ef1659a..59ffd92ea00cc 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -161,6 +161,7 @@ class Tokenizer: SUFFIX_ID = "tokenizer.ggml.suffix_token_id" MIDDLE_ID = "tokenizer.ggml.middle_token_id" EOT_ID = "tokenizer.ggml.eot_token_id" + EOM_ID = "tokenizer.ggml.eom_token_id" class Adapter: TYPE = "adapter.type" @@ -1327,3 +1328,4 @@ def get_type(val: Any) -> GGUFValueType: KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID +KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index ba6f53cda25a1..29b378f6d3ebf 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -826,6 +826,9 @@ def add_middle_token_id(self, id: int) -> None: def add_eot_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOT_ID, id) + def add_eom_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.EOM_ID, id) + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: pack_prefix = '' if not skip_pack_prefix: From 0b7211387eccc6e93f2ea5e66c2dc02ae155ab67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Sun, 4 Aug 2024 20:47:47 +0200 Subject: [PATCH 3/4] llama : Use token_to_id map find() method instead of iterating over all tokens. --- src/llama.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 132a6ea339e13..d5abe3f58259a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5647,11 +5647,9 @@ static void llm_load_vocab( // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID // for now, we apply this workaround to find the EOM token based on its text if (vocab.special_eom_id == -1) { - for (const auto & t : vocab.token_to_id) { - if (t.first == "<|eom_id|>") { - vocab.special_eom_id = t.second; - break; - } + const auto & t = vocab.token_to_id.find("<|eom_id|>"); + if (t != vocab.token_to_id.end()) { + vocab.special_eom_id = t->second; } } From 5efd8264264289198d19bc812b5ce4d8f42fe520 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= Date: Sun, 4 Aug 2024 21:14:43 +0200 Subject: [PATCH 4/4] llama : whitespace formatting --- src/llama.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index d5abe3f58259a..592954053ea00 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5652,7 +5652,6 @@ static void llm_load_vocab( vocab.special_eom_id = t->second; } } - } // build special tokens cache