tests : use new tokenizer type API (#2692)
* Merge tokenizer fixes into the gguf branch.

* Add test vocabularies

* Adapt convert-new.py (and fix a clang-cl compiler error on windows)

* Improved tokenizer test

But does it work on MacOS?

* Improve token type support

- Added @klosax code to convert.py
- Improved token type support in vocabulary

* Exclude platform dependent tests

* More sentencepiece compatibility by eliminating magic numbers

* Restored accidentally removed comment

* Improve commentary

* Use token type API in test-tokenizer-1.cpp
goerch authored Aug 21, 2023
1 parent 0b53b8b commit 49c25cc
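
For context: the "token type API" used here assigns each vocabulary entry a sentencepiece-style category instead of relying on hard-coded ID ranges. Only UNKNOWN, CONTROL, and BYTE appear in the diffs below; the full enum is sketched here from memory of llama.h at the time, so the remaining names and exact numeric values should be treated as assumptions.

// Approximate sketch of the llama.h token type enum this commit relies on.
// UNKNOWN, CONTROL, and BYTE are confirmed by the diff below; the other
// entries and the exact numeric values are assumptions.
enum llama_token_type {
    LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
    LLAMA_TOKEN_TYPE_NORMAL       = 1,
    LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
    LLAMA_TOKEN_TYPE_CONTROL      = 3,
    LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
    LLAMA_TOKEN_TYPE_UNUSED       = 5,
    LLAMA_TOKEN_TYPE_BYTE         = 6,
};

These categories mirror sentencepiece's own token types, which is what "eliminating magic numbers" in the commit message refers to: the test previously hard-coded the ID range [0, 259) rather than asking each token for its type.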
Showing 2 changed files with 4 additions and 4 deletions.
convert.py — 4 changes: 2 additions & 2 deletions (mode change 100755 → 100644)
@@ -741,6 +741,8 @@ def add_meta_vocab(self, vocab: Vocab) -> None:
         tokens = []
         scores = []
         toktypes = []
+        # NOTE: `all_tokens` returns the base vocabulary and added tokens
+        # TODO: add special tokens?
         for text, score, toktype in vocab.all_tokens():
             tokens.append(text)
             scores.append(score)
@@ -751,8 +753,6 @@ def add_meta_vocab(self, vocab: Vocab) -> None:
         self.gguf.add_token_scores(scores)
         self.gguf.add_token_types(toktypes)
 
-    # TODO: added / special tokens
-
     def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
         n_elements = 1
         for dim in tensor.shape:
tests/test-tokenizer-1.cpp — 4 changes: 2 additions & 2 deletions
@@ -87,8 +87,8 @@ int main(int argc, char **argv) {
                 return 2;
             }
         } else {
-            // TODO: needs access to token types
-            if (0 <= i && i < 259) {
+            llama_token_type type = llama_token_get_type(ctx, i);
+            if (type == LLAMA_TOKEN_TYPE_UNKNOWN || type == LLAMA_TOKEN_TYPE_CONTROL || type == LLAMA_TOKEN_TYPE_BYTE) {
                 fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
                     __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
             } else {
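
A minimal usage sketch of the pattern this hunk adopts: classify vocabulary entries by type instead of by a fixed ID range. The helper below is hypothetical (not from the test); it assumes a valid llama_context, and uses only calls visible in the diff plus llama_n_vocab, which the library exposed with a context argument at the time.

#include <cstdio>
#include "llama.h"

// Hypothetical helper: report tokens that cannot round-trip through
// plain text, using the type API instead of the old hard-coded
// ID range check (0 <= i && i < 259).
static void report_special_tokens(llama_context * ctx) {
    const int n_vocab = llama_n_vocab(ctx);
    for (llama_token i = 0; i < n_vocab; ++i) {
        const llama_token_type type = llama_token_get_type(ctx, i);
        if (type == LLAMA_TOKEN_TYPE_UNKNOWN ||
            type == LLAMA_TOKEN_TYPE_CONTROL ||
            type == LLAMA_TOKEN_TYPE_BYTE) {
            fprintf(stderr, "token %d is unknown/control/byte\n", i);
        }
    }
}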
