Fix for ggerganov#2310
Waiting for the fallout ...
goerch committed Jul 23, 2023
1 parent 0e74a72 commit e6b1a50
Showing 10 changed files with 119 additions and 115 deletions.
2 changes: 1 addition & 1 deletion examples/common.cpp
@@ -564,7 +564,7 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
// TODO: not great allocating this every time
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
// initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
std::vector<llama_token> res(text.size() + (int) add_bos);
std::vector<llama_token> res(text.size() + (int) add_bos + 1);
const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
assert(n >= 0);
res.resize(n);
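For context, a minimal usage sketch of the patched helper (not part of the commit; the loaded ctx and the prompt literal are placeholders). The + 1 plausibly gives headroom for one extra token, since the escaping introduced in llama.cpp now prepends a whitespace marker to the text, and because llama_token_to_str returns a std::string after this change, call sites append .c_str() when printing:

// Sketch, assuming a loaded llama_context named ctx:
std::vector<llama_token> toks = ::llama_tokenize(ctx, "Hello world", /*add_bos=*/true);
for (llama_token id : toks) {
    fprintf(stderr, "%6d -> '%s'\n", id, llama_token_to_str(ctx, id).c_str());
}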
2 changes: 1 addition & 1 deletion examples/embedding/embedding.cpp
@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
}
fprintf(stderr, "\n");
}
12 changes: 4 additions & 8 deletions examples/main/main.cpp
@@ -196,10 +196,6 @@ int main(int argc, char ** argv) {

// tokenize the prompt
std::vector<llama_token> embd_inp;

// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');

if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
embd_inp = ::llama_tokenize(ctx, params.prompt, true);
} else {
@@ -283,22 +279,22 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
}

if (ctx_guidance) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
}
}

if (params.n_keep > 0) {
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
}
fprintf(stderr, "'\n");
}
@@ -636,7 +632,7 @@ int main(int argc, char ** argv) {
// display text
if (input_echo) {
for (auto id : embd) {
printf("%s", llama_token_to_str(ctx, id));
printf("%s", llama_token_to_str(ctx, id).c_str());
}
fflush(stdout);
}
4 changes: 2 additions & 2 deletions examples/save-load-state/save-load-state.cpp
@@ -91,7 +91,7 @@ int main(int argc, char ** argv) {
auto next_token_str = llama_token_to_str(ctx, next_token);
last_n_tokens_data.push_back(next_token);

printf("%s", next_token_str);
printf("%s", next_token_str.c_str());
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx);
@@ -151,7 +151,7 @@ int main(int argc, char ** argv) {
auto next_token_str = llama_token_to_str(ctx2, next_token);
last_n_tokens_data.push_back(next_token);

printf("%s", next_token_str);
printf("%s", next_token_str.c_str());
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx2);
4 changes: 2 additions & 2 deletions examples/simple/simple.cpp
@@ -102,7 +102,7 @@ int main(int argc, char ** argv)

for( auto id : tokens_list )
{
printf( "%s" , llama_token_to_str( ctx , id ) );
printf( "%s" , llama_token_to_str( ctx , id ).c_str() );
}

fflush(stdout);
@@ -162,7 +162,7 @@ int main(int argc, char ** argv)
}

// Print the new token :
printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
printf( "%s" , llama_token_to_str( ctx , new_token_id ).c_str() );
fflush( stdout );

// Push this new token for next evaluation :
10 changes: 5 additions & 5 deletions examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1959,7 +1959,7 @@ void print_matrix(struct ggml_tensor * probs) {


void print_token(struct llama_context * ctx, llama_token token) {
printf("%s", llama_token_to_str(ctx, token));
printf("%s", llama_token_to_str(ctx, token).c_str());
}

void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -2198,17 +2198,17 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
const char * in = buf.data();
const char * end = buf.data() + buf.size();
for (int i = 0; i < (int) out.size(); ++i) {
const char * s = llama_token_to_str(lctx, out[i]);
int len = strlen(s);
std::string s = llama_token_to_str(lctx, out[i]);
int len = s.length();
if (in >= end) {
printf("%s: unexpected end of original text.\n", __func__);
break;
}
const bool matches = (strncmp(in, s, len) == 0);
const bool matches = (strncmp(in, s.c_str(), len) == 0);
if (matches) {
in += len;
} else {
printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s);
printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str());
}
}
}
107 changes: 72 additions & 35 deletions llama.cpp
@@ -242,13 +242,6 @@ struct llama_kv_cache {
}
};

struct llama_trie {
std::unordered_map<std::string, llama_trie> map;
};

void llama_trie_insert(struct llama_trie& trie, const std::string& text, size_t offs);
size_t llama_trie_find(const struct llama_trie& trie, const std::string& text, size_t offs);

struct llama_vocab {
using id = int32_t;
using token = std::string;
@@ -260,7 +253,6 @@ struct llama_vocab {

std::unordered_map<token, id> token_to_id;
std::vector<token_score> id_to_token;
struct llama_trie trie;
};

struct llama_model {
@@ -524,13 +516,12 @@ struct llama_file_loader {
float score = 0.0f;
file.read_raw(&score, sizeof(score));

assert(vocab.token_to_id.find(word) == vocab.token_to_id.end());
vocab.token_to_id[word] = i;

auto & tok_score = vocab.id_to_token[i];
tok_score.tok = word;
tok_score.score = score;

llama_trie_insert(vocab.trie, word, 0);
}
}
void read_tensor_metadata(llama_load_tensors_map & tensors_map) {
@@ -1804,26 +1795,37 @@ struct llama_sp_bigram {
size_t size;
};

void llama_trie_insert(struct llama_trie& trie, const std::string& text, size_t offs) {
    if (offs < text.size()) {
        size_t char_len = utf8_len(text[offs]);
        std::string key = text.substr(offs, char_len);
        if (trie.map.find(key) == trie.map.end()) {
            trie.map[key] = llama_trie();
        }
        llama_trie_insert(trie.map.at(key), text, offs + char_len);
    }
}

static std::string llama_escape_whitespace(const std::string& text) {
    std::string result;
    bool escaping = false;
    result += char(0xe2);
    result += char(0x96);
    result += char(0x81);
    for (size_t offs = 0; offs < text.length(); ++offs) {
        if (text[offs] == ' ') {
            if (!escaping) {
                result += char(0xe2);
                result += char(0x96);
                result += char(0x81);
                escaping = true;
            }
        }
        else {
            escaping = false;
            result += text[offs];
        }
    }
    return result;
}

size_t llama_trie_find(const struct llama_trie& trie, const std::string & text, size_t offs) {
    if (offs < text.size()) {
        size_t char_len = utf8_len(text[offs]);
        std::string key = text.substr(offs, char_len);
        if (trie.map.find(key) != trie.map.end()) {
            return char_len + llama_trie_find(trie.map.at(key), text, offs + char_len);
        }
    }
    return 0;
}

static std::string llama_unescape_whitespace(const std::string& word) {
    if (word.length() >= 3 &&
        word[0] == char(0xe2) &&
        word[1] == char(0x96) &&
        word[2] == char(0x81)) {
        return std::string(" ") + word.substr(3);
    }
    return word;
}
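These two helpers replace both the trie removed above and the manual leading-space insertion dropped from examples/main/main.cpp: llama_escape_whitespace prepends the SentencePiece whitespace marker U+2581 (bytes 0xE2 0x96 0x81) and substitutes one marker for each run of spaces, while llama_unescape_whitespace turns a single leading marker back into a space, one token at a time. A behavioral sketch (the string literals are illustrative assumptions, not from the commit):

// Assuming the two static helpers above are in scope:
std::string esc = llama_escape_whitespace("Hello world");
// esc == "\xE2\x96\x81Hello\xE2\x96\x81world", i.e. "▁Hello▁world"
std::string piece = llama_unescape_whitespace("\xE2\x96\x81world");
// piece == " world"; only a leading marker is unescaped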

// original implementation:
@@ -1832,13 +1834,12 @@ struct llama_tokenizer {
llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}

void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
// split string into utf8 chars / token?
// split string into utf8 chars
int index = 0;
size_t offs = 0;
while (offs < text.size()) {
llama_sp_symbol sym;
size_t len = utf8_len(text[offs]);
// size_t len = llama_trie_find(vocab_.trie, text, offs);
if (len == 0) {
len = utf8_len(text[offs]);
}
@@ -1908,7 +1909,7 @@

if (p == rev_merge.end()) {
// output any symbols that did not form tokens as bytes.
for (int j = 0; j < (int) symbol.n; ++j) {
for (int j = 0; j < (int)symbol.n; ++j) {
llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
output.push_back(token_id);
}
@@ -1954,18 +1955,25 @@
std::map<std::string, std::pair<int, int> > rev_merge;
};

static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
llama_tokenizer tokenizer(vocab);
std::vector<llama_vocab::id> output;

if (text.empty()) {
if (raw_text.empty()) {
return output;
}

if (bos) {
output.push_back(llama_token_bos());
}

std::string text;
if (escape) {
text = llama_escape_whitespace(raw_text);
} else {
text = raw_text;
}

tokenizer.tokenize(text, output);
return output;
}
@@ -3620,7 +3628,7 @@ int llama_tokenize_with_model(
llama_token * tokens,
int n_max_tokens,
bool add_bos) {
auto res = llama_tokenize(model->vocab, text, add_bos);
auto res = llama_tokenize(model->vocab, text, add_bos, true);

if (n_max_tokens < (int) res.size()) {
fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3643,6 +3651,27 @@ int llama_tokenize(
return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
}

int llama_tokenize_bpe(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos) {
auto res = llama_tokenize(ctx->model.vocab, text, add_bos, false);

if (n_max_tokens < (int) res.size()) {
fprintf(stderr, "%s: too many tokens\n", __func__);
return -((int) res.size());
}

for (size_t i = 0; i < res.size(); i++) {
tokens[i] = res[i];
}

return res.size();
}
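The new BPE entry point calls the internal tokenizer with escaping disabled. Like llama_tokenize, it signals an undersized buffer by returning the negative of the required token count, which suggests a grow-and-retry pattern at call sites. A hypothetical usage sketch (ctx and the text are placeholders):

// Assuming a loaded llama_context named ctx:
std::vector<llama_token> buf(16);
int n = llama_tokenize_bpe(ctx, "some text", buf.data(), (int) buf.size(), /*add_bos=*/true);
if (n < 0) {
    buf.resize(-n); // the negative return carries the required size
    n = llama_tokenize_bpe(ctx, "some text", buf.data(), (int) buf.size(), true);
}
buf.resize(n);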


int llama_n_vocab_from_model(const struct llama_model * model) {
return model->vocab.id_to_token.size();
}
@@ -3696,18 +3725,26 @@ float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data();
}

const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
std::string llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
if (token >= llama_n_vocab_from_model(model)) {
return nullptr;
}

return model->vocab.id_to_token[token].tok.c_str();
return llama_unescape_whitespace(model->vocab.id_to_token[token].tok);
}

const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
return llama_token_to_str_with_model(&ctx->model, token);
}

std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) {
if (token >= llama_n_vocab_from_model(&ctx->model)) {
return nullptr;
}

return ctx->model.vocab.id_to_token[token].tok;
}
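The two conversions differ only in whitespace handling: llama_token_to_str runs the stored token text through llama_unescape_whitespace, turning a leading U+2581 marker back into a space, while llama_token_to_str_bpe returns the stored text untouched. A sketch of the difference (the token value is an assumed example):

// Assuming id names a token stored as "\xE2\x96\x81Hello" ("▁Hello"):
std::string piece = llama_token_to_str(ctx, id);     // " Hello"
std::string raw   = llama_token_to_str_bpe(ctx, id); // "\xE2\x96\x81Hello"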

llama_token llama_token_bos() {
return 1;
}
16 changes: 14 additions & 2 deletions llama.h
@@ -11,6 +11,7 @@
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#include <string>

#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
@@ -278,6 +279,13 @@ extern "C" {
int n_max_tokens,
bool add_bos);

LLAMA_API int llama_tokenize_bpe(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);

LLAMA_API int llama_tokenize_with_model(
const struct llama_model * model,
const char * text,
@@ -319,11 +327,15 @@ extern "C" {
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

// Token Id -> String. Uses the vocabulary in the provided context
LLAMA_API const char * llama_token_to_str(
LLAMA_API std::string llama_token_to_str(
const struct llama_context * ctx,
llama_token token);

LLAMA_API std::string llama_token_to_str_bpe(
const struct llama_context * ctx,
llama_token token);

LLAMA_API const char * llama_token_to_str_with_model(
LLAMA_API std::string llama_token_to_str_with_model(
const struct llama_model * model,
llama_token token);
