llama : cache llama_token_to_piece #7587

Merged (4 commits, May 30, 2024)

Changes from 2 commits
llama.cpp: 63 changes (34 additions, 29 deletions)
@@ -2163,11 +2163,9 @@ struct llama_vocab {
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;
 
-    bool has_cache = false;
-
-    std::vector<id>               cache_special_tokens;
-    std::unordered_map<id, token> cache_token_to_piece;         // llama_token_to_piece(special = false);
-    std::unordered_map<id, token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
+    std::vector<id>    cache_special_tokens;
+    std::vector<token> cache_token_to_piece;         // llama_token_to_piece(special = false);
+    std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
 
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;

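The data-structure change above is the core of the caching scheme: token ids are dense integers in [0, n_vocab), so a std::vector indexed by id replaces the unordered_map lookups and the separate has_cache flag (an empty vector now means "no cache yet"). A minimal standalone sketch of the idea follows; the names toy_detokenize and build_cache are hypothetical and not part of llama.cpp.

#include <cstdio>
#include <string>
#include <vector>

// Hypothetical stand-in for the real per-token decoding work.
static std::string toy_detokenize(int id) {
    return "<piece:" + std::to_string(id) + ">";
}

// Build the cache once: the index is the token id, so lookups need no hashing.
static std::vector<std::string> build_cache(int n_vocab) {
    std::vector<std::string> cache(n_vocab);
    for (int id = 0; id < n_vocab; ++id) {
        cache[id] = toy_detokenize(id);
    }
    return cache;
}

int main() {
    const int n_vocab = 8;
    const std::vector<std::string> cache = build_cache(n_vocab);

    // An empty cache plays the role of the removed has_cache flag.
    if (!cache.empty()) {
        std::printf("token 3 -> %s\n", cache[3].c_str());
    }
    return 0;
}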
@@ -4597,20 +4595,14 @@ static void llm_load_vocab(
             vocab.special_cls_id  = 101;
             vocab.special_mask_id = 103;
             vocab.add_space_prefix = false;
-        } else {
-            if (tokenizer_model == "gpt2") {
-                vocab.type = LLAMA_VOCAB_TYPE_BPE;
+        } else if (tokenizer_model == "gpt2") {
+            vocab.type = LLAMA_VOCAB_TYPE_BPE;
 
-                const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-                if (add_space_prefix_keyidx != -1) {
-                    vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-                }
-            } else {
-                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
-                LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
-                vocab.type = LLAMA_VOCAB_TYPE_SPM;
-                return;
+            const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+            if (add_space_prefix_keyidx != -1) {
+                vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+            }
 
             // read bpe merges and populate bpe ranks
             const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
             if (merges_keyidx == -1) {
@@ -4644,6 +4636,8 @@ static void llm_load_vocab(
             vocab.special_pad_id  = -1;
             vocab.special_cls_id  = -1;
             vocab.special_mask_id = -1;
+        } else {
+            throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
         }
 
         // for now, only BPE models have pre-tokenizers
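Besides flattening the nested if into an else-if chain, the two hunks above change the failure mode: an unrecognized tokenizer_model used to emit a warning and silently fall back to the SPM tokenizer, and now throws instead. A small self-contained sketch of that behavioral difference, with toy names rather than the llama.cpp API:

#include <stdexcept>
#include <string>

enum class toy_vocab_type { SPM, BPE, WPM };

// Old behaviour: unknown tokenizer models were only warned about and
// silently fell back to SPM.
static toy_vocab_type select_lenient(const std::string & model) {
    if (model == "gpt2") { return toy_vocab_type::BPE; }
    if (model == "bert") { return toy_vocab_type::WPM; }
    return toy_vocab_type::SPM; // silent fallback
}

// New behaviour: an unknown tokenizer model is a hard error.
static toy_vocab_type select_strict(const std::string & model) {
    if (model == "llama") { return toy_vocab_type::SPM; }
    if (model == "gpt2")  { return toy_vocab_type::BPE; }
    if (model == "bert")  { return toy_vocab_type::WPM; }
    throw std::runtime_error("unknown tokenizer: '" + model + "'");
}

int main() {
    (void) select_lenient("totally-unknown"); // quietly degrades to SPM

    try {
        (void) select_strict("totally-unknown"); // refuses to guess
        return 1;
    } catch (const std::runtime_error &) {
        return 0; // expected: the unknown model is rejected up front
    }
}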
@@ -4852,12 +4846,18 @@ static void llm_load_vocab(
     }
 
     // build token to piece caches
-    for (llama_token id = 0; id < (llama_token) n_vocab; ++id) {
-        vocab.cache_token_to_piece[id]         = llama_token_to_piece(&model, id, false);
-        vocab.cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
-    }
+    {
+        std::vector<llama_vocab::token> cache_token_to_piece        (n_vocab);
+        std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
 
-    vocab.has_cache = true;
+        for (uint32_t id = 0; id < n_vocab; ++id) {
+            cache_token_to_piece[id]         = llama_token_to_piece(&model, id, false);
+            cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
+        }
+
+        std::swap(vocab.cache_token_to_piece,         cache_token_to_piece);
+        std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
+    }
 }
 
 static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
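The cache-building block above fills local vectors and only then std::swap()s them into the vocab, so the members jump from empty to fully populated in one step. A short sketch of that build-then-swap pattern, again with hypothetical toy names:

#include <string>
#include <utility>
#include <vector>

struct toy_vocab {
    // stays empty until the cache has been fully built
    std::vector<std::string> cache_token_to_piece;
};

// Fill a local vector first, then swap it in, so the member is never observed
// in a half-populated state.
static void build_token_to_piece_cache(toy_vocab & vocab, int n_vocab) {
    std::vector<std::string> cache(n_vocab);
    for (int id = 0; id < n_vocab; ++id) {
        cache[id] = "<piece:" + std::to_string(id) + ">";
    }
    std::swap(vocab.cache_token_to_piece, cache);
}

int main() {
    toy_vocab vocab;
    build_token_to_piece_cache(vocab, 4);
    return vocab.cache_token_to_piece.size() == 4 ? 0 : 1;
}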
@@ -14417,7 +14417,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
     candidates_decoded.reserve(candidates->size);
-    std::vector<llama_grammar_candidate> candidates_grammar;
+
+    std::vector<llama_grammar_candidate> candidates_grammar;
     candidates_grammar.reserve(candidates->size);
 
     for (size_t i = 0; i < candidates->size; ++i) {
@@ -18305,14 +18306,18 @@ static std::string llama_decode_text(const std::string & text) {
 
 // does not write null-terminator to buf
 int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
-    if (model->vocab.has_cache) {
+    // if we have a cache - use it
+    {
         const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
-        const auto & res = cache.at(token);
-        if (length < (int) res.size()) {
-            return -(int) res.size();
+
+        if (!cache.empty()) {
+            const auto & res = cache.at(token);
+            if (length < (int) res.size()) {
+                return -(int) res.size();
+            }
+            memcpy(buf, res.c_str(), res.size());
+            return res.size();
         }
-        memcpy(buf, res.c_str(), res.size());
-        return res.size();
     }
 
     if (0 <= token && token < llama_n_vocab(model)) {

Collaborator (review comment on this hunk):

nit: Maybe we could get away w/ a single cache (built w/ special=true) and early-exit in special case at the top of the function?

int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
    if (!special && llama_is_control_token(model->vocab, token)) {
        return 0;
    }
    // if we have a cache - use it
    if (!model->vocab.cache_token_to_piece.empty()) {
         ....
    }
    ...
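For reference, here is a self-contained toy rendering of the reviewer's single-cache idea (hypothetical names, not llama.cpp code and not what was merged): keep one cache built with special rendering, and let the non-special path return 0 for control tokens before ever touching the cache.

#include <cstring>
#include <string>
#include <unordered_set>
#include <vector>

struct single_cache_vocab {
    std::vector<std::string> cache_token_to_piece; // built once, with special rendering
    std::unordered_set<int>  control_tokens;       // render as empty text when special == false
};

// Mirrors the buffer convention above: returns the piece length, or the
// negative required length when buf is too small.
static int token_to_piece(const single_cache_vocab & vocab, int token, char * buf, int length, bool special) {
    if (!special && vocab.control_tokens.count(token)) {
        return 0; // early exit: control tokens produce no text in non-special mode
    }
    const std::string & res = vocab.cache_token_to_piece.at(token);
    if (length < (int) res.size()) {
        return -(int) res.size();
    }
    std::memcpy(buf, res.c_str(), res.size());
    return (int) res.size();
}

int main() {
    single_cache_vocab vocab;
    vocab.cache_token_to_piece = { "<s>", "hello", " world" };
    vocab.control_tokens       = { 0 };

    char buf[16];
    const int n = token_to_piece(vocab, 0, buf, (int) sizeof(buf), /*special =*/ false);
    return n == 0 ? 0 : 1; // the control token renders as empty text here
}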
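Finally, a caller-side sketch (not part of this PR) of how the buffer convention shown in the hunk is typically consumed, assuming a llama.h declaration that matches the signature above: a negative return value is the required buffer size, so resize and retry once.

#include <algorithm>
#include <string>
#include <vector>

#include "llama.h"

// Convert one token to a std::string using the negative-return convention:
// if the first call reports the buffer is too small, grow it and call again.
static std::string token_to_piece_str(const struct llama_model * model, llama_token token, bool special) {
    std::vector<char> buf(8);
    int n = llama_token_to_piece(model, token, buf.data(), (int) buf.size(), special);
    if (n < 0) {
        buf.resize((size_t) -n);
        n = llama_token_to_piece(model, token, buf.data(), (int) buf.size(), special);
    }
    return std::string(buf.data(), (size_t) std::max(n, 0));
}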