Skip to content

Commit

Permalink
llama : improve infill support
Browse files Browse the repository at this point in the history
ggml-ci
  • Loading branch information
ggerganov committed Oct 8, 2024
1 parent 458367a commit 7b92156
Show file tree
Hide file tree
Showing 10 changed files with 524 additions and 376 deletions.
248 changes: 111 additions & 137 deletions common/arg.cpp

Large diffs are not rendered by default.

18 changes: 17 additions & 1 deletion common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include <algorithm>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <codecvt>
#include <cstdarg>
Expand All @@ -23,10 +24,10 @@
#include <regex>
#include <sstream>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <thread>

#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
Expand Down Expand Up @@ -400,6 +401,21 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
// String utils
//

std::string string_format(const char * fmt, ...) {
va_list ap;
va_list ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}

std::vector<std::string> string_split(std::string input, char separator) {
std::vector<std::string> parts;
size_t separator_pos = input.find(separator);
Expand Down
19 changes: 16 additions & 3 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -352,15 +352,28 @@ void gpt_init();

std::string gpt_params_get_system_info(const gpt_params & params);

bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
bool set_process_priority(enum ggml_sched_priority prio);

//
// String utils
//

#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
#endif

LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
std::string string_format(const char * fmt, ...);

std::vector<std::string> string_split(std::string input, char separator);

std::string string_strip(const std::string & str);
Expand Down
14 changes: 7 additions & 7 deletions examples/infill/infill.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,11 +205,11 @@ int main(int argc, char ** argv) {
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);

GGML_ASSERT(llama_token_prefix(model) >= 0);
GGML_ASSERT(llama_token_suffix(model) >= 0);
GGML_ASSERT(llama_token_fim_pre(model) >= 0);
GGML_ASSERT(llama_token_fim_suf(model) >= 0);

inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));

embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
Expand All @@ -218,7 +218,7 @@ int main(int argc, char ** argv) {
}
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

const llama_token middle_token = llama_token_middle(model);
const llama_token middle_token = llama_token_fim_mid(model);
if (middle_token >= 0) {
embd_inp.push_back(middle_token);
}
Expand Down Expand Up @@ -508,8 +508,8 @@ int main(int argc, char ** argv) {
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);

inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));

embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
Expand Down
2 changes: 1 addition & 1 deletion examples/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,7 @@ Takes a prefix and a suffix and returns the predicted completion as stream.
- `input_prefix`: Set the prefix of the code to infill.
- `input_suffix`: Set the suffix of the code to infill.

It also accepts all the options of `/completion` except `stream` and `prompt`.
It also accepts all the options of `/completion`.

### **GET** `/props`: Get server global properties.

Expand Down
79 changes: 43 additions & 36 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -753,12 +753,7 @@ struct server_context {
metrics.init();
}

std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
// TODO: currently, we tokenize using special tokens by default
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
// but it's better compared to completely ignoring ChatML and other chat templates
const bool TMP_FORCE_SPECIAL = true;

std::vector<llama_token> tokenize(const json & json_prompt, bool add_special, bool parse_special) const {
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
// or the first element of the json_prompt array is a string.
std::vector<llama_token> prompt_tokens;
Expand All @@ -771,10 +766,10 @@ struct server_context {

std::vector<llama_token> p;
if (first) {
p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
p = ::llama_tokenize(ctx, s, add_special, parse_special);
first = false;
} else {
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
p = ::llama_tokenize(ctx, s, false, parse_special);
}

prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
Expand All @@ -788,7 +783,7 @@ struct server_context {
}
} else {
auto s = json_prompt.template get<std::string>();
prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
prompt_tokens = ::llama_tokenize(ctx, s, add_special, parse_special);
}

return prompt_tokens;
Expand Down Expand Up @@ -1215,7 +1210,7 @@ struct server_context {
slot.params.n_predict, n_ctx_train);
}

SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str());
SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());

return slot.has_next_token; // continue
}
Expand Down Expand Up @@ -1483,9 +1478,8 @@ struct server_context {
if (prompt.is_string() || json_is_array_of_numbers(prompt)) {
data["index"] = 0;
create_task(data, false, nullptr);
}
// otherwise, it's a multiple-prompt task, we break it into smaller tasks
else if (prompt.is_array()) {
} else if (prompt.is_array()) {
// otherwise, it's a multiple-prompt task, we break it into smaller tasks
std::vector<json> prompts = prompt;
if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
// prompts[0] is the question
Expand All @@ -1510,9 +1504,8 @@ struct server_context {
}
}
}
}
// invalid case
else {
} else {
// invalid case
throw std::runtime_error(error_msg);
}

Expand Down Expand Up @@ -1973,31 +1966,23 @@ struct server_context {

if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_INFILL) {
const bool add_bos = llama_add_bos_token(model);
bool suff_rm_leading_spc = true;
if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}

auto prefix_tokens = tokenize(slot.params.input_prefix, false);
auto suffix_tokens = tokenize(slot.params.input_suffix, false);
auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);

const int space_token = 29871; // TODO: this should not be hardcoded
if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
suffix_tokens.erase(suffix_tokens.begin());
}

prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));

auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;

if (add_bos) {
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
}

embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

const llama_token middle_token = llama_token_middle(model);
const llama_token middle_token = llama_token_fim_mid(model);
if (middle_token >= 0) {
embd_inp.push_back(middle_token);
}
Expand All @@ -2016,25 +2001,30 @@ struct server_context {
prompt_tokens.clear();
prompt_tokens.push_back(llama_token_bos(model));
{
const auto part = tokenize(slot.prompt[0], false);
const auto part = tokenize(slot.prompt[0], false, false);
prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
}
prompt_tokens.push_back(llama_token_eos(model));
prompt_tokens.push_back(llama_token_sep(model));
{
const auto part = tokenize(slot.prompt[1], false);
const auto part = tokenize(slot.prompt[1], false, false);
prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
}
prompt_tokens.push_back(llama_token_eos(model));
} else {
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
prompt_tokens = tokenize(slot.prompt, system_prompt.empty(), true); // add BOS if there isn't system prompt
}

slot.n_past = 0;
slot.n_prompt_tokens = prompt_tokens.size();

SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);

// print prompt tokens:
for (int i = 0; i < (int) prompt_tokens.size(); i++) {
SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], llama_token_to_piece(ctx, prompt_tokens[i]).c_str());
}

// empty prompt passed -> release the slot and send empty response
if (prompt_tokens.empty()) {
SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
Expand Down Expand Up @@ -2924,7 +2914,23 @@ int main(int argc, char ** argv) {
return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res);
};

const auto handle_infill = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
std::string err;
if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
err += "prefix token is missing. ";
}
if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) {
err += "suffix token is missing. ";
}
if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) {
err += "middle token is missing. ";
}

if (!err.empty()) {
res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED));
return;
}

json data = json::parse(req.body);
return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res);
};
Expand Down Expand Up @@ -3010,7 +3016,8 @@ int main(int argc, char ** argv) {
if (body.count("content") != 0) {
const bool add_special = json_value(body, "add_special", false);
const bool with_pieces = json_value(body, "with_pieces", false);
std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);

std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special, true);

if (with_pieces) {
for (const auto& token : tokens) {
Expand Down
17 changes: 12 additions & 5 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -896,6 +896,7 @@ extern "C" {
// Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
Expand All @@ -904,11 +905,17 @@ extern "C" {
LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
LLAMA_API bool llama_add_eos_token(const struct llama_model * model);

// Codellama infill tokens
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
// infill tokens
DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");

LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);

//
// Tokenization
Expand Down
38 changes: 31 additions & 7 deletions src/llama-vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1663,6 +1663,14 @@ llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
return vocab.special_eos_id;
}

llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
return vocab.special_eot_id;
}

llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
return vocab.special_eom_id;
}

llama_token llama_token_cls_impl(const struct llama_vocab & vocab) {
return vocab.special_cls_id;
}
Expand All @@ -1688,23 +1696,39 @@ bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
}

llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) {
return vocab.special_prefix_id;
return vocab.special_fim_pre_id;
}

llama_token llama_token_middle_impl(const struct llama_vocab & vocab) {
return vocab.special_middle_id;
return vocab.special_fim_mid_id;
}

llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) {
return vocab.special_suffix_id;
return vocab.special_fim_suf_id;
}

llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
return vocab.special_eot_id;
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_pre_id;
}

llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
return vocab.special_eom_id;
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_suf_id;
}

llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_mid_id;
}

llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_pad_id;
}

llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_rep_id;
}

llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_sep_id;
}

int32_t llama_tokenize_impl(
Expand Down
Loading

0 comments on commit 7b92156

Please sign in to comment.