Improve token type support #2668

Merged: 19 commits, Aug 21, 2023

Commits
bb89266
Merge tokenizer fixes into the gguf branch.
goerch Aug 8, 2023
98f5b1f
Merge branch 'gguf' of https://github.com/goerch/llama.cpp into gguf
goerch Aug 8, 2023
4566533
Add test vocabularies
goerch Aug 8, 2023
712b614
Merge branch 'gguf' into gguf
goerch Aug 14, 2023
fb591e1
Merge branch 'gguf' of https://github.com/ggerganov/llama.cpp into gguf
goerch Aug 14, 2023
7108448
Merge branch 'gguf' of https://github.com/goerch/llama.cpp into gguf
goerch Aug 14, 2023
cfb0e6f
Adapt convert-new.py (and fix a clang-cl compiler error on windows)
goerch Aug 14, 2023
c9c3b87
Merge branch 'gguf' of https://github.com/goerch/llama.cpp into gguf
goerch Aug 14, 2023
99e0e90
Improved tokenizer test
goerch Aug 15, 2023
c545d85
Merge branch 'gguf' of https://github.com/ggerganov/llama.cpp into gguf
goerch Aug 15, 2023
d864596
Merge branch 'gguf' of https://github.com/ggerganov/llama.cpp into gguf
goerch Aug 17, 2023
c16ea8e
Merge branch 'ggerganov:gguf' into gguf
goerch Aug 19, 2023
21d8864
Merge branch 'gguf' of https://github.com/ggerganov/llama.cpp into gguf
goerch Aug 19, 2023
370a95f
Improve token type support
goerch Aug 19, 2023
dc65fb3
Merge branch 'gguf' of https://github.com/goerch/llama.cpp into gguf
goerch Aug 19, 2023
da83740
Exclude platform dependent tests
goerch Aug 19, 2023
aea173f
More sentencepiece compatibility by eliminating magic numbers
goerch Aug 19, 2023
dea1e4c
Merge branch 'gguf' of https://github.com/ggerganov/llama.cpp into gguf
goerch Aug 20, 2023
6586487
Restored accidentally removed comment
goerch Aug 20, 2023
30 changes: 24 additions & 6 deletions convert.py
@@ -261,12 +261,12 @@ def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
for i, item in enumerate(tokenizer):
text: bytes = item.encode("utf-8")
score: float = -i
yield text, score
yield text, score, 4

def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score
yield text.encode("utf-8"), score, 4

def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
yield from self.bpe_tokens()
@@ -303,12 +303,28 @@ def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
piece = tokenizer.id_to_piece(i)
text: bytes = piece.encode("utf-8")
score: float = tokenizer.get_score(i)
yield text, score

toktype = 1 # default to normal token type
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3

# NOTE: I think added_tokens are user defined.
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = 4

if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6

yield text, score, toktype

def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score
yield text.encode("utf-8"), score, 4

def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
yield from self.sentencepiece_tokens()
@@ -720,14 +736,16 @@ def add_meta_arch(self, params: Params) -> None:
def add_meta_vocab(self, vocab: Vocab) -> None:
tokens = []
scores = []
for text, score in vocab.all_tokens():
toktypes = []
for text, score, toktype in vocab.all_tokens():
tokens.append(text)
scores.append(score)
toktypes.append(toktype)

self.gguf.add_tokenizer_model("llama")
self.gguf.add_token_list(tokens)
self.gguf.add_token_scores(scores)
#self.gguf.add_token_types(toktypes) # TODO: add this
self.gguf.add_token_types(toktypes)

# TODO: added / special tokens

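For reference, the integer token types emitted by convert.py above track the sentencepiece token categories. A minimal sketch of that mapping as a C++ enum (the name and constant spellings are illustrative, not from this PR; the diff writes the raw integers, and 0 is shown only as an assumed placeholder for "undefined"):

// Hypothetical enum mirroring the integer codes written by convert.py above.
enum llama_token_type_sketch {
    TOKEN_TYPE_UNDEFINED    = 0, // assumption: not emitted by the converter
    TOKEN_TYPE_NORMAL       = 1, // default for regular vocabulary pieces
    TOKEN_TYPE_UNKNOWN      = 2, // sentencepiece is_unknown()
    TOKEN_TYPE_CONTROL      = 3, // sentencepiece is_control(), e.g. BOS/EOS
    TOKEN_TYPE_USER_DEFINED = 4, // added tokens from added_tokens_list
    TOKEN_TYPE_UNUSED       = 5, // sentencepiece is_unused()
    TOKEN_TYPE_BYTE         = 6, // byte fallback pieces such as <0x0A>
};

The llama.cpp changes below compare vocab.id_to_token[id].toktype against these same integers in the llama_is_*_token helpers.
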
128 changes: 51 additions & 77 deletions llama.cpp
@@ -772,15 +772,16 @@ struct llama_vocab {
using id = int32_t;
using token = std::string;

struct token_score {
struct token_data {
token tok;
float score;
int toktype;
};

llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;

std::unordered_map<token, id> token_to_id;
std::vector<token_score> id_to_token;
std::vector<token_data> id_to_token;

// default LLaMA special tokens
id special_bos_id = 1;
@@ -1499,17 +1500,25 @@ static void llama_model_load_internal(

const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);

const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type");
if (toktype_idx == -1) {
throw std::runtime_error("cannot find token type list in GGUF file\n");
}

const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);

for (uint32_t i = 0; i < hparams.n_vocab; i++) {
std::string word = gguf_get_arr_str(ctx, token_idx, i);

vocab.token_to_id[word] = i;

auto & tok_score = vocab.id_to_token[i];
tok_score.tok = std::move(word);
tok_score.score = scores[i];
auto & token_data = vocab.id_to_token[i];
token_data.tok = std::move(word);
token_data.score = scores[i];
token_data.toktype = toktypes[i];

// determine the newline token: 0x0A == 10 == '\n'
if (tok_score.tok == "<0x0A>") {
if (token_data.tok == "<0x0A>") {
vocab.linefeed_id = i;
}
}
@@ -2337,92 +2346,57 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
return vocab.type;
}

static bool llama_is_normal_token(const llama_vocab & vocab, llama_token token) {
if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_SPM) {
return token >= 259;
}

if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_BPE) {
return token >= 95;
}

return false;
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 1;
}

static bool llama_is_bos_token(const llama_vocab & vocab, llama_token token) {
return token == vocab.special_bos_id;
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 2;
}

static bool llama_is_eos_token(const llama_vocab & vocab, llama_token token) {
return token == vocab.special_eos_id;
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 3;
}

static bool llama_is_control_token(const llama_vocab & vocab, llama_token token) {
if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_SPM) {
return token == llama_is_bos_token(vocab, token) || token == llama_is_eos_token(vocab, token);
}

// TODO: improve?
return false;
static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(llama_is_control_token(vocab, id));
return id == vocab.special_bos_id;
}

static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token token) {
if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_SPM) {
return token == 0;
}

// TODO: improve?
return false;
static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
GGML_ASSERT(llama_is_control_token(vocab, id));
return id == vocab.special_eos_id;
}

static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token token) {
GGML_UNUSED(vocab);
GGML_UNUSED(token);
// TODO: improve?
return false;
static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
return id == vocab.special_pad_id;
}

static bool llama_is_unused_token(const llama_vocab & vocab, llama_token token) {
GGML_UNUSED(vocab);
GGML_UNUSED(token);
// TODO: improve?
return false;
static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 4;
}

static bool llama_is_byte_token(const llama_vocab & vocab, llama_token token) {
if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_SPM) {
return 3 <= token && token < 259;
}

if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_BPE) {
return 1 <= token && token < 95;
}

return false;
static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 5;
}

static uint8_t llama_byte_to_char(const llama_vocab & vocab, uint8_t byte) {
if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_SPM) {
return byte - 3;
}

if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_BPE) {
return byte + 32;
}

return false;
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
return vocab.id_to_token[id].toktype == 6;
}

static uint8_t llama_char_to_byte(const llama_vocab & vocab, uint8_t ch) {
if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_SPM) {
return ch + 3;
}

if (llama_vocab_get_type(vocab) == LLAMA_VOCAB_TYPE_BPE) {
return ch - 32;
}
static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto& token_data = vocab.id_to_token.at(id);
auto buf = token_data.tok.substr(3, 2);
return strtol(buf.c_str(), NULL, 16);
}

return false;
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
char buf[7];
int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
GGML_ASSERT(0 <= result && result < 7);
return vocab.token_to_id.at(buf);
}

static std::string llama_escape_whitespace(const std::string& text) {
@@ -2561,7 +2535,7 @@ struct llama_tokenizer {
if (p == rev_merge.end()) {
// output any symbols that did not form tokens as bytes.
for (int j = 0; j < (int)symbol.n; ++j) {
llama_vocab::id token_id = llama_char_to_byte(vocab_, symbol.text[j]);
llama_vocab::id token_id = llama_byte_to_token(vocab_, symbol.text[j]);
output.push_back(token_id);
}
return;
@@ -2587,12 +2561,12 @@
return;
}

const auto &tok_score = vocab_.id_to_token[(*token).second];
const auto &tok_data = vocab_.id_to_token[(*token).second];

llama_sp_bigram bigram;
bigram.left = left;
bigram.right = right;
bigram.score = tok_score.score;
bigram.score = tok_data.score;
bigram.size = text.size();
work_queue_.push(bigram);

@@ -5101,7 +5075,7 @@ int llama_token_to_str_with_model(const struct llama_model * model, llama_token
if (length < 1) {
return -1;
}
buf[0] = llama_byte_to_char(model->vocab, token);
buf[0] = llama_token_to_byte(model->vocab, token);
return 1;
}
}
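The byte-token handling above replaces the old fixed offsets (byte - 3 for SPM, byte + 32 for BPE) with an explicit parse of the <0xXX> piece text and a reverse lookup by formatted piece name. A self-contained sketch of that round trip, using a plain std::map in place of the real llama_vocab (function and variable names here are illustrative, not the library API):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>

// Parse a byte-fallback piece such as "<0x0A>" back into its byte value,
// mirroring llama_token_to_byte: skip "<0x" (3 chars), read the 2 hex digits.
static uint8_t piece_to_byte(const std::string & piece) {
    const std::string hex = piece.substr(3, 2);
    return (uint8_t) strtol(hex.c_str(), NULL, 16);
}

// Format a byte as its piece text and look it up, mirroring llama_byte_to_token.
static int byte_to_token(const std::map<std::string, int> & token_to_id, uint8_t ch) {
    char buf[7];
    snprintf(buf, sizeof(buf), "<0x%02X>", ch);
    return token_to_id.at(buf);
}

int main() {
    // Toy vocab: only the linefeed byte piece, mapped to an arbitrary id.
    std::map<std::string, int> token_to_id = { { "<0x0A>", 13 } };
    assert(piece_to_byte("<0x0A>") == 0x0A);        // 0x0A == '\n'
    assert(byte_to_token(token_to_id, 0x0A) == 13);
    printf("byte token round trip ok\n");
    return 0;
}
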
Binary file modified models/ggml-vocab-llama.gguf
Binary file not shown.
34 changes: 19 additions & 15 deletions tests/test-tokenizer-1.cpp
@@ -10,10 +10,6 @@
#include <vector>
#include <locale>

static std::string vocab_type(llama_context * ctx) {
return llama_n_vocab(ctx) == 32000 ? "spm": "bpe";
}

static std::string escape_whitespace(const std::string& text) {
std::string result;
bool escaping = false;
@@ -91,8 +87,8 @@ int main(int argc, char **argv) {
return 2;
}
} else {
if ((vocab_type(ctx) == "spm" && i <= 258) ||
(vocab_type(ctx) == "bpe" && (i == 0 || i >= 100000))) {
// TODO: needs access to token types
if (0 <= i && i < 259) {
Review comment by @ggerganov (Owner), Aug 21, 2023:

We can expose token types through the C-style API

e.g.:

LLAMA_API bool llama_token_is_normal(llama_token id);
...

Would that be enough?

Reply by @goerch (Collaborator, Author), Aug 21, 2023:

Yes, that would be fine for me. Not sure if I manage to prepare a PR today, will try later.

Reply by @ggerganov (Owner), Aug 21, 2023:

Ok, I'll merge this and implement the token type API on the gguf branch. You can update the tests later.

fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
__func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
} else {
@@ -103,20 +99,28 @@
}
}

std::wstring_convert<typename std::codecvt_utf8<wchar_t>, wchar_t> converter;
for (wchar_t ch = 0x0000; ch < 0xffff; ++ch) {
std::wstring wstr(1, ch);
std::string str;
try {
str = converter.to_bytes(wstr);
} catch (std::exception & e) {
continue;
#ifdef _WIN32
std::wstring_convert<typename std::codecvt_utf8<char16_t>, char16_t> u16converter;
for (char16_t ch = 0x0000; ch < 0xffff; ++ch) {
std::u16string u16str(1, ch);
std::string str = u16converter.to_bytes(u16str);
std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
if (tokens.size() == 1) {
fprintf(stderr, "%s : info: %s tokenized to %d \n",
__func__, str.c_str(), tokens[0]);
}
std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str), false);
}

std::wstring_convert<typename std::codecvt_utf8<char32_t>, char32_t> u32converter;
for (char32_t ch = 0x0000; ch < 0x0010ffff; ++ch) {
std::u32string u32str(1, ch);
std::string str = u32converter.to_bytes(u32str);
std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
if (tokens.size() == 1) {
fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
}
}
#endif

llama_free_model(model);
llama_free(ctx);
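Following up on the review thread above about exposing token types through the C-style API, a hedged sketch of what such accessors could look like if they follow the existing llama.h conventions (the context argument and exact names are assumptions on top of the one-liner proposed in the thread, not part of this merged PR):

// Hypothetical C-style accessors; a context (or model) argument is assumed to
// be needed to reach the vocab, as with other llama.h entry points.
LLAMA_API bool llama_token_is_normal      (const struct llama_context * ctx, llama_token id);
LLAMA_API bool llama_token_is_unknown     (const struct llama_context * ctx, llama_token id);
LLAMA_API bool llama_token_is_control     (const struct llama_context * ctx, llama_token id);
LLAMA_API bool llama_token_is_user_defined(const struct llama_context * ctx, llama_token id);
LLAMA_API bool llama_token_is_unused      (const struct llama_context * ctx, llama_token id);
LLAMA_API bool llama_token_is_byte        (const struct llama_context * ctx, llama_token id);

With something like this available, the hard-coded "0 <= i && i < 259" range in test-tokenizer-1.cpp above could be replaced by token-type queries (e.g. treating byte, unknown and control tokens specially), which is what the "needs access to token types" TODO in the diff points at.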