From 7b575e70f5768e909643c8562cec132320e651dd Mon Sep 17 00:00:00 2001
From: RunningLeon
Date: Tue, 16 Jul 2024 17:03:29 +0800
Subject: [PATCH] fix func call tokens for internlm2

---
 convert_hf_to_gguf.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index c2aba909706d0..de56c05035a7a 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2213,7 +2213,7 @@ def set_vocab(self):
 
         chat_eos_token = '<|im_end|>'
         chat_eos_token_id = None
-
+        func_call_tokens = ('<|plugin|>', '<|interpreter|>', '<|action_end|>', '<|action_start|>')
         tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
         if tokenizer_config_file.is_file():
             with open(tokenizer_config_file, "r", encoding="utf-8") as f:
@@ -2230,7 +2230,7 @@ def set_vocab(self):
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
-                    if foken_data.get("special"):
+                    if foken_data.get("special") and not foken_data["content"] in func_call_tokens:
                         toktypes[token_id] = SentencePieceTokenTypes.CONTROL
 
         tokenizer_file = self.dir_model / 'tokenizer.json'
@@ -2249,7 +2249,7 @@ def set_vocab(self):
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
-                    if foken_data.get("special"):
+                    if foken_data.get("special") and not foken_data["content"] in func_call_tokens:
                         toktypes[token_id] = SentencePieceTokenTypes.CONTROL
 
         self.gguf_writer.add_tokenizer_model("llama")
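
Reviewer note: the change keeps InternLM2's function-calling markers
('<|plugin|>', '<|interpreter|>', '<|action_start|>', '<|action_end|>')
typed as USER_DEFINED instead of CONTROL, so they survive detokenization
as literal text that agent frameworks can parse. A minimal sketch of the
patched decision logic follows; classify() and FUNC_CALL_TOKENS are
illustrative names, not part of convert_hf_to_gguf.py:

    # Sketch only: mirrors the patched condition. The diff's
    # `not x in y` is equivalent to the idiomatic `x not in y` used here.
    FUNC_CALL_TOKENS = ('<|plugin|>', '<|interpreter|>', '<|action_end|>', '<|action_start|>')

    def classify(foken_data: dict) -> str:
        # Special tokens become CONTROL (typically hidden when
        # detokenizing) unless they are function-call markers, which stay
        # USER_DEFINED (emitted as literal text in model output).
        if foken_data.get("special") and foken_data["content"] not in FUNC_CALL_TOKENS:
            return "CONTROL"
        return "USER_DEFINED"

    assert classify({"content": "<|im_end|>", "special": True}) == "CONTROL"
    assert classify({"content": "<|plugin|>", "special": True}) == "USER_DEFINED"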