Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tokenizer SPM fixes for phi-3 and llama-spm #7375

Merged
merged 6 commits into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1776,6 +1776,38 @@ def set_vocab(self):
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
for token_id, foken_data in added_tokens_decoder.items():
token_id = int(token_id)
token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
assert(tokens[token_id] == token)
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
if foken_data.get("special"):
toktypes[token_id] = SentencePieceTokenTypes.CONTROL

tokenizer_file = self.dir_model / 'tokenizer.json'
if tokenizer_file.is_file():
with open(tokenizer_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
added_tokens = tokenizer_json.get("added_tokens", [])
for foken_data in added_tokens:
token_id = int(foken_data["id"])
token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
assert(tokens[token_id] == token)
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
if foken_data.get("special"):
toktypes[token_id] = SentencePieceTokenTypes.CONTROL

self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
Expand Down
10 changes: 5 additions & 5 deletions examples/server/tests/features/server.feature
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ Feature: llama.cpp server

Examples: Prompts
| prompt | n_predict | re_content | n_prompt | n_predicted | truncated |
| I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not |
| Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 46 | 64 | not |
| I believe the meaning of life is | 8 | (read\|going\|pretty)+ | 18 | 8 | not |
| Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 45 | 64 | not |

Scenario: Completion prompt truncated
Given a prompt:
Expand Down Expand Up @@ -67,8 +67,8 @@ Feature: llama.cpp server

Examples: Prompts
| model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated |
| llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled | |
| llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 76 | 8 | disabled | not |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|fireplace)+ | -1 | 64 | enabled | |


Scenario Outline: OAI Compatibility w/ response format
Expand All @@ -84,7 +84,7 @@ Feature: llama.cpp server
| response_format | n_predicted | re_content |
| {"type": "json_object", "schema": {"const": "42"}} | 5 | "42" |
| {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] |
| {"type": "json_object"} | 10 | \{ " Jacky. |
| {"type": "json_object"} | 10 | \{ " Saragine. |


Scenario: Tokenize / Detokenize
Expand Down
4 changes: 2 additions & 2 deletions examples/server/tests/features/slotsave.feature
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Feature: llama.cpp server slot management
# Since we have cache, this should only process the last tokens
Given a user prompt "What is the capital of Germany?"
And a completion request with no api error
Then 24 tokens are predicted matching (Thank|special)
Then 24 tokens are predicted matching (Thank|special|Lily)
And 7 prompt tokens are processed
# Loading the original cache into slot 0,
# we should only be processing 1 prompt token and get the same output
Expand All @@ -41,7 +41,7 @@ Feature: llama.cpp server slot management
Given a user prompt "What is the capital of Germany?"
And using slot id 1
And a completion request with no api error
Then 24 tokens are predicted matching (Thank|special)
Then 24 tokens are predicted matching (Thank|special|Lily)
And 1 prompt tokens are processed

Scenario: Erase Slot
Expand Down
31 changes: 27 additions & 4 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4582,7 +4582,8 @@ static void llm_load_vocab(
(t.first == "<|eot_id|>" ||
t.first == "<|im_end|>" ||
t.first == "<|end|>" ||
t.first == "<end_of_turn>"
t.first == "<end_of_turn>" ||
t.first == "<|endoftext|>"
)
) {
vocab.special_eot_id = t.second;
Expand Down Expand Up @@ -12800,6 +12801,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
output.push_back(vocab.special_bos_id);
}

static const bool rtrim = true; //TODO: as param
bool is_prev_special = false;
bool special_token_rtrim = false;

for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
// without adding this leading whitespace, we do not get the same results as the original tokenizer
Expand All @@ -12809,9 +12814,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
// and passing 'add space prefix' as bool argument
//
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
if (&fragment == &fragment_buffer.front()) {
if (vocab.add_space_prefix) {
raw_text = " " + raw_text; // prefix with space if the first token is not special

if (special_token_rtrim) {
size_t num_whitespaces = 0;
while (isspace(raw_text[num_whitespaces])) {
num_whitespaces++;
}
if (num_whitespaces == raw_text.size()) {
continue; // skip if all whitespaces
}
raw_text = raw_text.substr(num_whitespaces);
}

if (vocab.add_space_prefix) {
if (!output.size() || is_prev_special) { // prefix with space if first token
raw_text = " " + raw_text;
}
}

Expand All @@ -12823,6 +12840,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
tokenizer.tokenize(raw_text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
is_prev_special = true;
// phi-3 special tokens without rtrim, works fine for llama-spm too
special_token_rtrim = rtrim
&& fragment.token != vocab.special_bos_id
&& fragment.token != vocab.special_unk_id
&& fragment.token != vocab.special_eos_id;
}
}

Expand Down
35 changes: 32 additions & 3 deletions tests/test-tokenizer-random.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,23 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
'\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
'Cửa Việt', # llama-3, ignore_merges = true
'<s>a', # TODO: Phi-3 fail
'<s>a', # Phi-3 fail
'<unk><|endoftext|><s>' # Phi-3 fail
'a\na', # TODO: Bert fail
]


def generator_random_special_tokens(special_tokens:list[str], iterations=100) -> Iterator[str]:
special_tokens = set(special_tokens)
special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
special_tokens = list(sorted(special_tokens))
rand = random.Random()
for m in range(iterations):
rand.seed(m)
words = rand.choices(special_tokens, k=500)
yield "".join(words)


def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
"""Brute force check all vocab words"""
yield from vocab
Expand Down Expand Up @@ -289,14 +301,31 @@ def func_tokenize1(text: str):
vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer.all_special_tokens, 10_000))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 10_000))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
# test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_bytes(10_000)) # FAIL

model.free()


if __name__ == "__main__":
main()
# main()

path_tokenizers = "./models/tokenizers/"
path_vocab_format = "./models/ggml-vocab-%s.gguf"

# import os
# tokenizers = os.listdir(path_tokenizers)
tokenizers = [
"llama-spm", # SPM
"phi-3", # SPM
]

for tokenizer in tokenizers:
print("\n" + "=" * 50 + "\n" + tokenizer + "\n") # noqa
vocab_file = path_vocab_format % tokenizer
dir_tokenizer = path_tokenizers + "/" + tokenizer
main([vocab_file, dir_tokenizer, "--verbose"])
Loading