From 125835b25338402fc6f09fb75152527d2247e65b Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 23 Oct 2024 23:09:54 +0200
Subject: [PATCH 01/12] server : refactor slot input data, move tokenizer to
 HTTP thread

---
 examples/server/server.cpp | 653 +++++++++++++------------------------
 examples/server/utils.hpp  | 240 +++++++++++++-
 2 files changed, 448 insertions(+), 445 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3992108e7f383..13bea289b048e 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -100,6 +100,7 @@ struct server_task {
     int id        = -1; // to be filled by server_queue
     int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL
 
+    std::vector<llama_token> prompt_tokens;
     server_task_type type;
     json data;
 
@@ -161,18 +162,12 @@ struct server_slot {
     int32_t i_batch   = -1;
     int32_t n_predict = -1; // TODO: disambiguate from params.n_predict
 
+    // n_prompt_tokens may not be equal to prompt_tokens.size(), because the prompt may be truncated
     int32_t n_prompt_tokens           = 0;
     int32_t n_prompt_tokens_processed = 0;
 
-    json prompt; // can be either a string, array of strings or array of token ids
-
-    json input_prefix;
-    json input_suffix;
-    json input_extra;
-
-    // when a task is submitted, we first tokenize the prompt and store it here
+    // input prompt tokens
     std::vector<llama_token> prompt_tokens;
-    std::vector<llama_token> extra_tokens;
 
     size_t last_nl_pos = 0;
 
@@ -735,39 +730,7 @@ struct server_context {
    }
 
     std::vector<llama_token> tokenize(const json & json_prompt, bool add_special, bool parse_special) const {
-        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
-        // or the first element of the json_prompt array is a string.
-        std::vector<llama_token> prompt_tokens;
-
-        if (json_prompt.is_array()) {
-            bool first = true;
-            for (const auto & p : json_prompt) {
-                if (p.is_string()) {
-                    auto s = p.template get<std::string>();
-
-                    std::vector<llama_token> p;
-                    if (first) {
-                        p = common_tokenize(ctx, s, add_special, parse_special);
-                        first = false;
-                    } else {
-                        p = common_tokenize(ctx, s, false, parse_special);
-                    }
-
-                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
-                } else {
-                    if (first) {
-                        first = false;
-                    }
-
-                    prompt_tokens.push_back(p.template get<llama_token>());
-                }
-            }
-        } else {
-            auto s = json_prompt.template get<std::string>();
-            prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
-        }
-
-        return prompt_tokens;
+        return tokenize_mixed(ctx, json_prompt, add_special, parse_special);
     }
 
     server_slot * get_slot_by_id(int id) {
@@ -794,22 +757,16 @@ struct server_context {
                     continue;
                 }
 
-                // skip the slot if it does not contain prompt
-                if (!slot.prompt.is_string()) {
+                // skip the slot if it does not contain cached tokens
+                if (slot.prompt_tokens.empty()) {
                     continue;
                 }
 
-                // current slot's prompt
-                std::string slot_prompt = slot.prompt.get<std::string>();
-
-                // length of the current slot's prompt
-                int slot_prompt_len = slot_prompt.size();
-
                 // length of the Longest Common Prefix between the current slot's prompt and the input prompt
-                int lcp_len = longest_common_prefix(slot_prompt, prompt);
+                int lcp_len = longest_common_prefix(slot.cache_tokens, slot.prompt_tokens);
 
                 // fraction of the common substring length compared to the current slot's prompt length
-                similarity = static_cast<float>(lcp_len) / slot_prompt_len;
+                similarity = static_cast<float>(lcp_len) / static_cast<float>(slot.prompt_tokens.size());
 
                 // select the current slot if the criteria match
                 if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
@@ -914,57 +871,6 @@ struct server_context {
             SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict);
         }
 
-        // infill
-        slot.input_prefix = json_value(data, "input_prefix", json());
-        slot.input_suffix = json_value(data, "input_suffix", json());
-        slot.input_extra  = json_value(data, "input_extra",  json());
-
-        SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.input_extra.size());
-        for (const auto & chunk : slot.input_extra) {
-            // { "text": string, "filename": string }
-            if (!chunk.contains("text") || !chunk["text"].is_string()) {
-                send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST);
-                return false;
-            }
-
-            // filename is optional
-            if (chunk.contains("filename") && !chunk["filename"].is_string()) {
-                send_error(task, "extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST);
-                return false;
-            }
-
-            SLT_DBG(slot, "extra_context chunk in file '%s':\n%s\n", chunk.value("filename", "").c_str(), chunk.value("text", "").c_str());
-        }
-
-        // get prompt
-        {
-            const auto & prompt = data.find("prompt");
-            if (prompt == data.end()) {
-                send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
-                return false;
-            }
-
-            if ((prompt->is_string()) ||
-                (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) ||
-                (prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) {
-                slot.prompt = *prompt;
-            } else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_array()) {
-                slot.prompt = prompt->at(0);
-            } else if (prompt->is_array() && prompt->size() > 1) {
-                // array of strings
-                for (const auto & el : *prompt) {
-                    if (!el.is_string()) {
-                        send_error(task, "\"prompt\" must be a string, an array of strings or an array of integers", ERROR_TYPE_INVALID_REQUEST);
-                        return false;
-                    }
-                }
-                slot.prompt = *prompt;
-            } else {
-                send_error(task, "\"prompt\" must be a string, an array of strings or an array of integers", ERROR_TYPE_INVALID_REQUEST);
-                return false;
-            }
-        }
-
         {
             slot.sparams.logit_bias.clear();
 
@@ -1045,7 +951,6 @@ struct server_context {
         }
 
         slot.state = SLOT_STATE_PROCESSING_PROMPT;
-        slot.prompt_tokens.clear();
 
         SLT_INF(slot, "%s", "processing task\n");
 
@@ -1333,7 +1238,7 @@ struct server_context {
             {"tokens_predicted",    slot.n_decoded},
            {"tokens_evaluated",    slot.n_prompt_tokens},
             {"generation_settings", get_formated_generation(slot)},
-            {"prompt",              slot.prompt},
+            {"prompt",              common_detokenize(ctx, slot.prompt_tokens)},
             {"has_new_line",        slot.has_new_line},
             {"truncated",           slot.truncated},
             {"stopped_eos",         slot.stopped_eos},
@@ -1457,19 +1362,21 @@ struct server_context {
     // Functions to create new task(s) and receive result(s)
     //
 
+    // break the input "prompt" into multiple tasks if needed, then format and tokenize the input prompt(s)
     std::vector<server_task> create_tasks_cmpl(json data, server_task_cmpl_type cmpl_type) {
         std::vector<server_task> tasks;
-        auto create_task = [&](json & task_data, bool replace_prompt, json prompt) {
-            server_task task;
-            task.id        = queue_tasks.get_new_id();
-            task.cmpl_type = cmpl_type;
-            task.type      = SERVER_TASK_TYPE_COMPLETION;
-            if (replace_prompt) {
-                task.data  = task_data;
-                task.data["prompt"] = std::move(prompt);
-            } else {
-                task.data  = std::move(task_data);
+        auto create_task = [&](json & task_data, llama_tokens & prompt_tokens) {
+            if (prompt_tokens.empty()) {
+                // TODO @ngxson : should not throw an error
+                throw std::runtime_error("prompt must not be empty");
             }
+            SRV_DBG("create task, n_tokens = %d\n", (int) prompt_tokens.size());
+            server_task task;
+            task.id            = queue_tasks.get_new_id();
+            task.cmpl_type     = cmpl_type;
+            task.type          = SERVER_TASK_TYPE_COMPLETION;
+            task.data          = task_data;
+            task.prompt_tokens = std::move(prompt_tokens);
             tasks.push_back(std::move(task));
         };
 
@@ -1478,41 +1385,49 @@ struct server_context {
             throw std::runtime_error(error_msg);
         }
 
-        json prompt = data.at("prompt");
-
-        // if the prompt is a singleton (i.e. a string or a list of tokens), we only need to create single task
-        if (prompt.is_string() || json_is_array_of_numbers(prompt)) {
-            data["index"] = 0;
-            create_task(data, false, nullptr);
-        } else if (prompt.is_array()) {
-            // otherwise, it's a multiple-prompt task, we break it into smaller tasks
-            std::vector<json> prompts = prompt;
-            if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
-                // prompts[0] is the question
-                // the rest are the answers/documents
-                SRV_DBG("creating rerank tasks, n_prompts = %d\n", (int) prompts.size() - 1);
-                for (size_t i = 1; i < prompts.size(); i++) {
-                    json qd;
-                    qd.push_back(prompts[0]);
-                    qd.push_back(prompts[i]);
-                    data["index"] = i - 1;
-                    create_task(data, true, qd);
-                }
-            } else {
-                SRV_DBG("creating multi-prompt tasks, n_prompts = %d\n", (int) prompts.size());
-                for (size_t i = 0; i < prompts.size(); i++) {
-                    const auto & e = prompts[i];
-                    if (e.is_string() || json_is_array_of_numbers(e)) {
+        // because llama_tokenize api is thread-safe, we can tokenize the prompt from HTTP thread
+        bool add_special = cmpl_type != SERVER_TASK_CMPL_TYPE_RERANK && cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL;
+        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx, data.at("prompt"), add_special, true);
+        switch (cmpl_type) {
+            case SERVER_TASK_CMPL_TYPE_RERANK:
+                {
+                    // prompts[0] is the question
+                    // the rest are the answers/documents
+                    GGML_ASSERT(tokenized_prompts.size() > 1);
+                    SRV_DBG("creating rerank tasks, n_prompts = %d\n", (int) tokenized_prompts.size() - 1);
+                    for (size_t i = 1; i < tokenized_prompts.size(); i++) {
+                        data["index"] = i - 1;
+                        auto tokens = format_rerank(model, tokenized_prompts[0], tokenized_prompts[i]);
+                        create_task(data, tokens);
+                    }
+                } break;
+            case SERVER_TASK_CMPL_TYPE_INFILL:
+                {
+                    SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
+                    for (size_t i = 0; i < tokenized_prompts.size(); i++) {
                         data["index"] = i;
-                        create_task(data, true, e);
-                    } else {
-                        throw std::runtime_error(error_msg);
+                        auto tokens = format_infill(
+                            ctx,
+                            data.at("input_prefix"),
+                            data.at("input_suffix"),
+                            data.at("input_extra"),
+                            params.n_batch,
+                            params.n_predict,
+                            slots[0].n_ctx, // TODO: there should be a better way
+                            params.spm_infill,
+                            tokenized_prompts[i]
+                        );
+                        create_task(data, tokens);
+                    }
+                } break;
+            default:
+                {
+                    SRV_DBG("creating multi-prompt tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
+                    for (size_t i = 0; i < tokenized_prompts.size(); i++) {
+                        data["index"] = i;
+                        create_task(data, tokenized_prompts[i]);
+                    }
                 }
-            }
-        } else {
-            // invalid case
-            throw std::runtime_error(error_msg);
         }
 
         return tasks;
@@ -1623,9 +1538,10 @@ struct server_context {
 
                     slot->reset();
 
-                    slot->id_task   = task.id;
-                    slot->cmpl_type = task.cmpl_type;
-                    slot->index     = json_value(task.data, "index", 0);
+                    slot->id_task       = task.id;
+                    slot->cmpl_type     = task.cmpl_type;
+                    slot->index         = json_value(task.data, "index", 0);
+                    slot->prompt_tokens = std::move(task.prompt_tokens);
 
                     if (!launch_slot_with_task(*slot, task)) {
                         SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
@@ -1658,7 +1574,7 @@ struct server_context {
                        slot_data["id"]      = slot.id;
                        slot_data["id_task"] = slot.id_task;
                        slot_data["state"]   = slot.state;
-                        slot_data["prompt"]  = slot.prompt;
+                        slot_data["prompt"]  = common_detokenize(ctx, slot.prompt_tokens);
                        slot_data["next_token"] = {
                             {"has_next_token", slot.has_next_token},
                             {"has_new_line",   slot.has_new_line},
@@ -1785,9 +1701,6 @@ struct server_context {
                 }
                 slot->cache_tokens.resize(token_count);
 
-                // TODO: maybe detokenize the slot->cache_tokens instead?
-                slot->prompt = string_format("[restored %d tokens from file]", (int) token_count);
-
                 const int64_t t_end = ggml_time_us();
                 const double t_restore_ms = (t_end - t_start) / 1000.0;
 
@@ -1953,349 +1866,225 @@ struct server_context {
 
         // next, batch any pending prompts without exceeding n_batch
         if (params.cont_batching || batch.n_tokens == 0) {
             for (auto & slot : slots) {
+                if (!slot.is_processing()) {
+                    continue;
+                }
+
                 // this slot still has a prompt to be processed
-                if (slot.state == SLOT_STATE_PROCESSING_PROMPT) {
+                if (!slot.prompt_tokens.empty() && slot.state == SLOT_STATE_PROCESSING_PROMPT) {
                     auto & prompt_tokens = slot.prompt_tokens;
 
-                    // we haven't tokenized the prompt yet - do it now:
-                    if (prompt_tokens.empty()) {
-                        SLT_INF(slot, "tokenizing prompt, len = %d\n", (int) slot.prompt.size());
-
-                        slot.t_start_process_prompt = ggml_time_us();
-                        slot.t_start_generation = 0;
-
-                        switch (slot.cmpl_type) {
-                            case SERVER_TASK_CMPL_TYPE_NORMAL:
-                            case SERVER_TASK_CMPL_TYPE_EMBEDDING:
-                                {
-                                    prompt_tokens = tokenize(slot.prompt, llama_add_bos_token(model), true);
-                                } break;
-                            case SERVER_TASK_CMPL_TYPE_RERANK:
-                                {
-                                    // require slot.prompt to be array of 2 strings
-                                    if (!slot.prompt.is_array() || slot.prompt.size() != 2) {
-                                        SLT_ERR(slot, "%s", "invalid prompt for rerank task\n");
-                                        slot.release();
-                                        send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST);
-                                        continue;
-                                    }
-
-                                    // prompt: [BOS]query[EOS][SEP]doc[EOS]
-                                    prompt_tokens.clear();
-                                    prompt_tokens.push_back(llama_token_bos(model));
-                                    {
-                                        const auto part = tokenize(slot.prompt[0], false, false);
-                                        prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
-                                    }
-                                    prompt_tokens.push_back(llama_token_eos(model));
-                                    prompt_tokens.push_back(llama_token_sep(model));
-                                    {
-                                        const auto part = tokenize(slot.prompt[1], false, false);
-                                        prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
-                                    }
-                                    prompt_tokens.push_back(llama_token_eos(model));
-                                } break;
-                            case SERVER_TASK_CMPL_TYPE_INFILL:
-                                {
-                                    // TODO: optimize this block by reducing memory allocations and movement
-
-                                    // use FIM repo-level pattern:
-                                    // ref: https://arxiv.org/pdf/2409.12186
-                                    //
-                                    // [FIM_REP]myproject
-                                    // [FIM_SEP]filename0
-                                    // extra chunk 0
-                                    // [FIM_SEP]filename1
-                                    // extra chunk 1
-                                    // ...
-                                    // [FIM_SEP]filename
-                                    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
-                                    //
-                                    auto tokens_prefix = tokenize(slot.input_prefix, false, false);
-                                    auto tokens_suffix = tokenize(slot.input_suffix, false, false);
-                                    auto tokens_prompt = tokenize(slot.prompt,       false, false);
-
-                                    slot.extra_tokens.clear();
-                                    if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
-                                        static const auto k_fim_repo = tokenize("myproject\n", false, false);
-
-                                        slot.extra_tokens.push_back(llama_token_fim_rep(model));
-                                        slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
-                                    }
-
-                                    for (const auto & chunk : slot.input_extra) {
-                                        // { "text": string, "filename": string }
-                                        const std::string text     = chunk.value("text", "");
-                                        const std::string filename = chunk.value("filename", "tmp");
-
-                                        if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
-                                            const auto k_fim_file = tokenize(filename + "\n", false, false);
-
-                                            slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model));
-                                            slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
-                                        } else {
-                                            // chunk separator in binary form to avoid confusing the AI
-                                            static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
-                                            static const auto k_chunk_prefix_tokens = tokenize(k_chunk_prefix_str, false, false);
-
-                                            slot.extra_tokens.insert(slot.extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
-                                        }
-
-                                        const auto chunk_tokens = tokenize(text, false, false);
-                                        slot.extra_tokens.insert(slot.extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
-                                    }
-
-                                    if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
-                                        // TODO: current filename
-                                        static const auto k_fim_file = tokenize("filename\n", false, false);
-
-                                        slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model));
-                                        slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
-                                    }
-
-                                    // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
-                                    const int n_suffix_take = std::min<int>(tokens_suffix.size(),   (n_batch/4));
-                                    const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4) - 3);
-
-                                    // fill the rest of the context with extra chunks
-                                    const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size());
-
-                                    tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
-                                    tokens_suffix.resize(n_suffix_take);
-
-                                    tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
-                                    tokens_prefix.insert(tokens_prefix.end(),   tokens_prompt.begin(), tokens_prompt.end());
-                                    tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
-
-                                    auto embd_inp = params.spm_infill ? tokens_suffix : tokens_prefix;
-                                    auto embd_end = params.spm_infill ? tokens_prefix : tokens_suffix;
-
-                                    if (llama_add_bos_token(model)) {
-                                        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
-                                    }
-
-                                    SLT_DBG(slot, "extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", slot.n_ctx, n_extra_take, (int) slot.extra_tokens.size());
-
-                                    // put the extra context before the FIM prefix
-                                    embd_inp.insert(embd_inp.begin(), slot.extra_tokens.end() - n_extra_take, slot.extra_tokens.end());
-
-                                    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
-                                    embd_inp.push_back(llama_token_fim_mid(model));
-
-                                    prompt_tokens = std::move(embd_inp);
-                                } break;
-                        }
-
-                        slot.n_past = 0;
-                        slot.n_prompt_tokens = prompt_tokens.size();
-
-                        SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
-
-                        // print prompt tokens (for debugging)
-                        if (1) {
-                            // first 16 tokens (avoid flooding logs)
-                            for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++) {
-                                SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
-                            }
-                        } else {
-                            // all
-                            for (int i = 0; i < (int) prompt_tokens.size(); i++) {
-                                SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
-                            }
-                        }
-
-                        // empty prompt passed -> release the slot and send empty response
-                        if (prompt_tokens.empty()) {
-                            SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
-                            slot.release();
-                            slot.print_timings();
-                            send_final_response(slot);
-                            continue;
-                        }
-
-                        if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
-                            // this prompt is too large to process - discard it
-                            if (slot.n_prompt_tokens > n_ubatch) {
-                                slot.release();
-                                send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
-                                continue;
-                            }
-                        } else {
-                            if (!params.ctx_shift) {
-                                // if context shift is disabled, we make sure prompt size is smaller than KV size
-                                // TODO: there should be a separate parameter that controls prompt truncation
-                                // context shift should be applied only during the generation phase
-                                if (slot.n_prompt_tokens >= slot.n_ctx) {
-                                    slot.release();
-                                    send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
-                                    continue;
-                                }
-                            }
-                            if (slot.params.n_keep < 0) {
-                                slot.params.n_keep = slot.n_prompt_tokens;
-                            }
-                            slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
-
-                            // if input prompt is too big, truncate it
-                            if (slot.n_prompt_tokens >= slot.n_ctx) {
-                                const int n_left = slot.n_ctx - slot.params.n_keep;
-
-                                const int n_block_size = n_left / 2;
-                                const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
-
-                                std::vector<llama_token> new_tokens(
-                                        prompt_tokens.begin(),
-                                        prompt_tokens.begin() + slot.params.n_keep);
-
-                                new_tokens.insert(
-                                        new_tokens.end(),
-                                        prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
-                                        prompt_tokens.end());
-
-                                prompt_tokens = std::move(new_tokens);
-
-                                slot.truncated = true;
-                                slot.n_prompt_tokens = prompt_tokens.size();
-
-                                SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens);
-
-                                GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
-                            }
-
-                            common_sampler_reset(slot.smpl);
-
-                            if (slot.params.cache_prompt) {
-                                // reuse any previously computed tokens that are common with the new prompt
-                                slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
-
-                                // push the prompt into the sampling context (do not apply grammar)
-                                for (int i = 0; i < slot.n_past; ++i) {
-                                    common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
-                                }
-
-                                // reuse chunks from the cached prompt by shifting their KV cache in the new position
-                                if (params.n_cache_reuse > 0) {
-                                    size_t head_c = slot.n_past; // cache
-                                    size_t head_p = slot.n_past; // current prompt
-
-                                    SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past);
-
-                                    while (head_c < slot.cache_tokens.size() &&
-                                           head_p < prompt_tokens.size()) {
-
-                                        size_t n_match = 0;
-                                        while (head_c + n_match < slot.cache_tokens.size() &&
-                                               head_p + n_match < prompt_tokens.size() &&
-                                               slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
-
-                                            n_match++;
-                                        }
-
-                                        if (n_match >= (size_t) params.n_cache_reuse) {
-                                            SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
-                                            //for (size_t i = head_p; i < head_p + n_match; i++) {
-                                            //    SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
-                                            //}
-
-                                            const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
-
-                                            llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
-                                            llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift);
-
-                                            for (size_t i = 0; i < n_match; i++) {
-                                                slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
-
-                                                common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
-
-                                                slot.n_past++;
-                                            }
-
-                                            head_c += n_match;
-                                            head_p += n_match;
-                                        } else {
-                                            head_c += 1;
-                                        }
-                                    }
-
-                                    SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
-                                }
-                            }
-                        }
-
-                        if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
-                            // we have to evaluate at least 1 token to generate logits.
-                            SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
-
-                            slot.n_past--;
-                        }
-
-                        slot.n_prompt_tokens_processed = 0;
-                    }
-
-                    // non-causal tasks require to fit the entire prompt in the physical batch
-                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
-                        // cannot fit the prompt in the current batch - will try next iter
-                        if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
-                            continue;
-                        }
-                    }
-
-                    // check that we are in the right batch_type, if not defer the slot
-                    const bool slot_type =
-                        slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING ||
-                        slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK ? 1 : 0;
-
-                    if (batch_type == -1) {
-                        batch_type = slot_type;
-                    } else if (batch_type != slot_type) {
-                        continue;
-                    }
+                    slot.t_start_process_prompt = ggml_time_us();
+                    slot.t_start_generation = 0;
+                    slot.n_past = 0;
+                    slot.n_prompt_tokens = prompt_tokens.size();
+
+                    SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
+
+                    // print prompt tokens (for debugging)
+                    if (1) {
+                        // first 16 tokens (avoid flooding logs)
+                        for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++) {
+                            SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                        }
+                    } else {
+                        // all
+                        for (int i = 0; i < (int) prompt_tokens.size(); i++) {
+                            SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                        }
+                    }
+
+                    // empty prompt passed -> release the slot and send empty response
+                    if (prompt_tokens.empty()) {
+                        SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
+                        slot.release();
+                        slot.print_timings();
+                        send_final_response(slot);
+                        continue;
+                    }
+
+                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
+                        // this prompt is too large to process - discard it
+                        if (slot.n_prompt_tokens > n_ubatch) {
+                            slot.release();
+                            send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
+                            continue;
+                        }
+                    } else {
+                        if (!params.ctx_shift) {
+                            // if context shift is disabled, we make sure prompt size is smaller than KV size
+                            // TODO: there should be a separate parameter that controls prompt truncation
+                            // context shift should be applied only during the generation phase
+                            if (slot.n_prompt_tokens >= slot.n_ctx) {
+                                slot.release();
+                                send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
+                                continue;
+                            }
+                        }
+                        if (slot.params.n_keep < 0) {
+                            slot.params.n_keep = slot.n_prompt_tokens;
+                        }
+                        slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
+
+                        // if input prompt is too big, truncate it
+                        if (slot.n_prompt_tokens >= slot.n_ctx) {
+                            const int n_left = slot.n_ctx - slot.params.n_keep;
+
+                            const int n_block_size = n_left / 2;
+                            const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
+
+                            std::vector<llama_token> new_tokens(
+                                    prompt_tokens.begin(),
+                                    prompt_tokens.begin() + slot.params.n_keep);
+
+                            new_tokens.insert(
+                                    new_tokens.end(),
+                                    prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
+                                    prompt_tokens.end());
+
+                            prompt_tokens = std::move(new_tokens);
+
+                            slot.truncated = true;
+                            slot.n_prompt_tokens = prompt_tokens.size();
+
+                            SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens);
+
+                            GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
+                        }
+
+                        common_sampler_reset(slot.smpl);
+
+                        if (slot.params.cache_prompt) {
+                            // reuse any previously computed tokens that are common with the new prompt
+                            slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
+
+                            // push the prompt into the sampling context (do not apply grammar)
+                            for (int i = 0; i < slot.n_past; ++i) {
+                                common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
+                            }
+
+                            // reuse chunks from the cached prompt by shifting their KV cache in the new position
+                            if (params.n_cache_reuse > 0) {
+                                size_t head_c = slot.n_past; // cache
+                                size_t head_p = slot.n_past; // current prompt
+
+                                SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past);
+
+                                while (head_c < slot.cache_tokens.size() &&
+                                       head_p < prompt_tokens.size()) {
+
+                                    size_t n_match = 0;
+                                    while (head_c + n_match < slot.cache_tokens.size() &&
+                                           head_p + n_match < prompt_tokens.size() &&
+                                           slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
+
+                                        n_match++;
+                                    }
+
+                                    if (n_match >= (size_t) params.n_cache_reuse) {
+                                        SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
+                                        //for (size_t i = head_p; i < head_p + n_match; i++) {
+                                        //    SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                                        //}
+
+                                        const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
+
+                                        llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
+                                        llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift);
+
+                                        for (size_t i = 0; i < n_match; i++) {
+                                            slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
+
+                                            common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
+
+                                            slot.n_past++;
+                                        }
+
+                                        head_c += n_match;
+                                        head_p += n_match;
+                                    } else {
+                                        head_c += 1;
+                                    }
+                                }
+
+                                SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
+                            }
+                        }
+                    }
+
+                    if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
+                        // we have to evaluate at least 1 token to generate logits.
+                        SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
+
+                        slot.n_past--;
+                    }
+
+                    slot.n_prompt_tokens_processed = 0;
+                }
+
+                // non-causal tasks require to fit the entire prompt in the physical batch
+                if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
+                    // cannot fit the prompt in the current batch - will try next iter
+                    if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
+                        continue;
+                    }
+                }
+
+                // check that we are in the right batch_type, if not defer the slot
+                const bool slot_type =
+                    slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING ||
+                    slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK ? 1 : 0;
+
+                if (batch_type == -1) {
+                    batch_type = slot_type;
+                } else if (batch_type != slot_type) {
+                    continue;
+                }
+
+                // keep only the common part
+                if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
+                    // could not partially delete (likely using a non-Transformer model)
+                    llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
+
+                    // there is no common part left
+                    slot.n_past = 0;
+
+                    common_sampler_reset(slot.smpl);
+                }
+
+                SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
+
+                // remove the non-common part from the cache
+                slot.cache_tokens.resize(slot.n_past);
+
+                // add prompt tokens for processing in the current batch
+                while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
+                    common_batch_add(batch, slot.prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);
+
+                    if (slot.params.cache_prompt) {
+                        slot.cache_tokens.push_back(slot.prompt_tokens[slot.n_past]);
+                    }
+
+                    slot.n_prompt_tokens_processed++;
+                    slot.n_past++;
+                }
+
+                SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
+
+                // entire prompt has been processed
+                if (slot.n_past == slot.n_prompt_tokens) {
+                    slot.state = SLOT_STATE_DONE_PROMPT;
+
+                    GGML_ASSERT(batch.n_tokens > 0);
+
+                    // extract the logits only for the last token
+                    batch.logits[batch.n_tokens - 1] = true;
+
+                    slot.n_decoded = 0;
+                    slot.i_batch   = batch.n_tokens - 1;
+
+                    SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens);
                 }
 
                 if (batch.n_tokens >= n_batch) {
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 69519ef95b2d9..05513e5335cbd 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -24,6 +24,7 @@
 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
 
 using json = nlohmann::ordered_json;
+using llama_tokens = std::vector<llama_token>;
 
 // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
 enum error_type {
@@ -52,9 +53,234 @@ static T json_value(const json & body, const std::string & key, const T & default_value) {
 }
 
 //
-// chat template utils
+// tokenizer and input processing utils
 //
 
+static bool json_is_array_of_numbers(const json & data) {
+    if (data.is_array()) {
+        for (const auto & e : data) {
+            if (!e.is_number_integer()) {
+                return false;
+            }
+        }
+        return true;
+    }
+    return false;
+}
+
+// is array having BOTH numbers & strings?
+static bool json_is_array_of_mixed_numbers_strings(const json & data) {
+    bool seen_string = false;
+    bool seen_number = false;
+    if (data.is_array()) {
+        for (const auto & e : data) {
+            seen_string |= e.is_string();
+            seen_number |= e.is_number_integer();
+            if (seen_number && seen_string) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+/**
+ * this handles 2 cases:
+ * - only string, example: "string"
+ * - mixed string and tokens, example: [12, 34, "string", 56, 78]
+ */
+static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
+    // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+    // or the first element of the json_prompt array is a string.
+    llama_tokens prompt_tokens;
+
+    if (json_prompt.is_array()) {
+        bool first = true;
+        for (const auto & p : json_prompt) {
+            if (p.is_string()) {
+                auto s = p.template get<std::string>();
+
+                llama_tokens p;
+                if (first) {
+                    p = common_tokenize(ctx, s, add_special, parse_special);
+                    first = false;
+                } else {
+                    p = common_tokenize(ctx, s, false, parse_special);
+                }
+
+                prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+            } else {
+                if (first) {
+                    first = false;
+                }
+
+                prompt_tokens.push_back(p.template get<llama_token>());
+            }
+        }
+    } else {
+        auto s = json_prompt.template get<std::string>();
+        prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
+    }
+
+    return prompt_tokens;
+}
+
+/**
+ * break the input "prompt" object into multiple prompts if needed, then tokenize them
+ * this supports these cases:
+ * - "prompt": "string"
+ * - "prompt": [12, 34, 56]
+ * - "prompt": [12, 34, "string", 56, 78]
+ * and multiple prompts (multi-tasks):
+ * - "prompt": ["string1", "string2"]
+ * - "prompt": ["string1", [12, 34, 56]]
+ * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
+ */
+static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
+    std::vector<llama_tokens> result;
+    if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
+        // string or mixed
+        result.push_back(tokenize_mixed(ctx, json_prompt, add_special, parse_special));
+    } else if (json_is_array_of_numbers(json_prompt)) {
+        // array of tokens
+        result.push_back(json_prompt.get<llama_tokens>());
+    } else if (json_prompt.is_array()) {
+        // array of prompts
+        result.reserve(json_prompt.size());
+        for (const auto & p : json_prompt) {
+            if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
+                result.push_back(tokenize_mixed(ctx, p, add_special, parse_special));
+            } else if (json_is_array_of_numbers(p)) {
+                // array of tokens
+                result.push_back(p.get<llama_tokens>());
+            } else {
+                throw std::runtime_error("element of \"prompt\" must be a string, a list of tokens, or a list of mixed strings & tokens");
+            }
+        }
+    } else {
+        throw std::runtime_error("\"prompt\" must be a string, a list of tokens, a list of mixed strings & tokens, or a list of prompts");
+    }
+    return result;
+}
+
+//
+// template utils
+//
+
+// format rerank task: [BOS]query[EOS][SEP]doc[EOS]
+static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) {
+    llama_tokens result;
+    result.reserve(doc.size() + query.size() + 4);
+    result.push_back(llama_token_bos(model));
+    result.insert(result.end(), query.begin(), query.end());
+    result.push_back(llama_token_eos(model));
+    result.push_back(llama_token_sep(model));
+    result.insert(result.end(), doc.begin(), doc.end());
+    result.push_back(llama_token_eos(model));
+    return result;
+}
+
+// format infill task
+static llama_tokens format_infill(
+        const llama_context * ctx,
+        const json & input_prefix,
+        const json & input_suffix,
+        const json & input_extra,
+        const int n_batch,
+        const int n_predict,
+        const int n_ctx,
+        const bool spm_infill,
+        const llama_tokens & tokens_prompt
+    ) {
+    // TODO: optimize this block by reducing memory allocations and movement
+
+    // use FIM repo-level pattern:
+    // ref: https://arxiv.org/pdf/2409.12186
+    //
+    // [FIM_REP]myproject
+    // [FIM_SEP]filename0
+    // extra chunk 0
+    // [FIM_SEP]filename1
+    // extra chunk 1
+    // ...
+    // [FIM_SEP]filename
+    // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
+    //
+    llama_tokens extra_tokens;
+    extra_tokens.reserve(n_ctx);
+
+    auto model = llama_get_model(ctx);
+    auto tokens_prefix = tokenize_mixed(ctx, input_prefix, false, false);
+    auto tokens_suffix = tokenize_mixed(ctx, input_suffix, false, false);
+
+    if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
+        static const auto k_fim_repo = common_tokenize(ctx, "myproject\n", false, false);
+
+        extra_tokens.push_back(llama_token_fim_rep(model));
+        extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
+    }
+    for (const auto & chunk : input_extra) {
+        // { "text": string, "filename": string }
+        const std::string text     = chunk.value("text", "");
+        const std::string filename = chunk.value("filename", "tmp");
+
+        if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+            const auto k_fim_file = common_tokenize(ctx, filename + "\n", false, false);
+
+            extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
+            extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+        } else {
+            // chunk separator in binary form to avoid confusing the AI
+            static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
+            static const auto k_chunk_prefix_tokens = common_tokenize(ctx, k_chunk_prefix_str, false, false);
+
+            extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
+        }
+
+        const auto chunk_tokens = common_tokenize(ctx, text, false, false);
+        extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
+    }
+
+    if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+        // TODO: current filename
+        static const auto k_fim_file = common_tokenize(ctx, "filename\n", false, false);
+
+        extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
+        extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
+    }
+
+    // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+    const int n_suffix_take = std::min<int>(tokens_suffix.size(),   (n_batch/4));
+    const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4) - 3);
+
+    // fill the rest of the context with extra chunks
+    const int n_extra_take = std::min<int>(std::max<int>(0, n_ctx - (n_batch) - 2*n_predict), extra_tokens.size());
+
+    tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
+    tokens_suffix.resize(n_suffix_take);
+
+    tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
+    tokens_prefix.insert(tokens_prefix.end(),   tokens_prompt.begin(), tokens_prompt.end());
+    tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
+
+    auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
+    auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
+
+    if (llama_add_bos_token(model)) {
+        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+    }
+
+    LOG_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
+
+    // put the extra context before the FIM prefix
+    embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
+
+    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+    embd_inp.push_back(llama_token_fim_mid(model));
+
+    return embd_inp;
+}
+
 // Format given chat. If tmpl is empty, we take the template from model metadata
 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
     std::vector<common_chat_msg> chat;
@@ -229,18 +455,6 @@ static size_t find_partial_stop_string(const std::string &stop, const std::string &text) {
     return std::string::npos;
 }
 
-static bool json_is_array_of_numbers(const json & data) {
-    if (data.is_array()) {
-        for (const auto & e : data) {
-            if (!e.is_number()) {
-                return false;
-            }
-        }
-        return true;
-    }
-    return false;
-}
-
 // TODO: reuse llama_detokenize
 template <typename Iter>
 static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {

From 5c749bea009183582b1f2583fa3d198375844fb4 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 23 Oct 2024 23:39:48 +0200
Subject: [PATCH 02/12] move prompt_tokens.empty() check

---
 examples/server/server.cpp | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 13bea289b048e..e049927d0163c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1866,12 +1866,8 @@ struct server_context {
         // next, batch any pending prompts without exceeding n_batch
         if (params.cont_batching || batch.n_tokens == 0) {
             for (auto & slot : slots) {
-                if (!slot.is_processing()) {
-                    continue;
-                }
-
                 // this slot still has a prompt to be processed
-                if (!slot.prompt_tokens.empty() && slot.state == SLOT_STATE_PROCESSING_PROMPT) {
+                if (slot.state == SLOT_STATE_PROCESSING_PROMPT) {
                     auto & prompt_tokens = slot.prompt_tokens;
 
                     slot.t_start_process_prompt = ggml_time_us();
@@ -1879,6 +1875,16 @@ struct server_context {
                     slot.n_past = 0;
                     slot.n_prompt_tokens = prompt_tokens.size();
 
+                    // empty prompt passed -> release the slot and send empty response
+                    if (prompt_tokens.empty()) {
+                        SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
+
+                        slot.release();
+                        slot.print_timings();
+                        send_final_response(slot);
+                        continue;
+                    }
+
                     SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
 
                     // print prompt tokens (for debugging)
@@ -1894,16 +1900,6 @@ struct server_context {
                         }
                     }
 
-                    // empty prompt passed -> release the slot and send empty response
-                    if (prompt_tokens.empty()) {
-                        SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
-
-                        slot.release();
-                        slot.print_timings();
-                        send_final_response(slot);
-                        continue;
-                    }
-
                     if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
                         // this prompt is too large to process - discard it
                         if (slot.n_prompt_tokens > n_ubatch) {

From 60d4194bfed9de1108234fa9cdacba757f9c9557 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 23 Oct 2024 23:48:08 +0200
Subject: [PATCH 03/12] fix incorrect if branch

---
 examples/server/server.cpp | 100 ++++++++++++++++++-------------------
 1 file changed, 50 insertions(+), 50 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index fc382c68d97a0..734bde5c297ac 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2009,75 +2009,75 @@ struct server_context {
                     }
 
                     slot.n_prompt_tokens_processed = 0;
-                }
 
-                // non-causal tasks require to fit the entire prompt in the physical batch
-                if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
-                    // cannot fit the prompt in the current batch - will try next iter
-                    if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
-                        continue;
+                    // non-causal tasks require to fit the entire prompt in the physical batch
+                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
+                        // cannot fit the prompt in the current batch - will try next iter
+                        if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
+                            continue;
+                        }
                     }
-                }
 
-                // check that we are in the right batch_type, if not defer the slot
-                const bool slot_type =
-                    slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING ||
-                    slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK ? 1 : 0;
+                    // check that we are in the right batch_type, if not defer the slot
+                    const bool slot_type =
+                        slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING ||
+                        slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK ? 1 : 0;
 
-                if (batch_type == -1) {
-                    batch_type = slot_type;
-                } else if (batch_type != slot_type) {
-                    continue;
-                }
+                    if (batch_type == -1) {
+                        batch_type = slot_type;
+                    } else if (batch_type != slot_type) {
+                        continue;
+                    }
 
-                // keep only the common part
-                if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
-                    // could not partially delete (likely using a non-Transformer model)
-                    llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
+                    // keep only the common part
+                    if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
+                        // could not partially delete (likely using a non-Transformer model)
+                        llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
 
-                    // there is no common part left
-                    slot.n_past = 0;
+                        // there is no common part left
+                        slot.n_past = 0;
 
-                    common_sampler_reset(slot.smpl);
-                }
+                        common_sampler_reset(slot.smpl);
+                    }
 
-                SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
+                    SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
 
-                // remove the non-common part from the cache
-                slot.cache_tokens.resize(slot.n_past);
+                    // remove the non-common part from the cache
+                    slot.cache_tokens.resize(slot.n_past);
 
-                // add prompt tokens for processing in the current batch
-                while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
-                    common_batch_add(batch, slot.prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);
+                    // add prompt tokens for processing in the current batch
+                    while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
+                        common_batch_add(batch, slot.prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);
 
-                    if (slot.params.cache_prompt) {
-                        slot.cache_tokens.push_back(slot.prompt_tokens[slot.n_past]);
-                    }
+                        if (slot.params.cache_prompt) {
+                            slot.cache_tokens.push_back(slot.prompt_tokens[slot.n_past]);
+                        }
 
-                    slot.n_prompt_tokens_processed++;
-                    slot.n_past++;
-                }
+                        slot.n_prompt_tokens_processed++;
+                        slot.n_past++;
+                    }
 
-                SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
+                    SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
 
-                // entire prompt has been processed
-                if (slot.n_past == slot.n_prompt_tokens) {
-                    slot.state = SLOT_STATE_DONE_PROMPT;
+                    // entire prompt has been processed
+                    if (slot.n_past == slot.n_prompt_tokens) {
+                        slot.state = SLOT_STATE_DONE_PROMPT;
 
-                    GGML_ASSERT(batch.n_tokens > 0);
+                        GGML_ASSERT(batch.n_tokens > 0);
 
-                    // extract the logits only for the last token
-                    batch.logits[batch.n_tokens - 1] = true;
+                        // Process all prompt tokens through sampler system
+                        for (int i = 0; i < slot.n_prompt_tokens; ++i) {
+                            common_sampler_accept(slot.smpl, slot.prompt_tokens[i], false);
+                        }
 
-                    slot.n_decoded = 0;
-                    slot.i_batch   = batch.n_tokens - 1;
+                        // extract the logits only for the last token
+                        batch.logits[batch.n_tokens - 1] = true;
 
-                    SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens);
+                        slot.n_decoded = 0;
+                        slot.i_batch   = batch.n_tokens - 1;
+
+                        SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens);
+                    }
                 }
 
                 if (batch.n_tokens >= n_batch) {

From b550011be3e181690583cc20be4cfa44f1b7befb Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 24 Oct 2024 00:03:00 +0200
Subject: [PATCH 04/12] fix infinite generation loop

---
 examples/server/server.cpp | 222 +++++++++++++++++++------------------
 1 file changed, 113 insertions(+), 109 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 734bde5c297ac..cdb97419fc8af 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -68,6 +68,7 @@ enum stop_type {
 // state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
 enum slot_state {
     SLOT_STATE_IDLE,
+    SLOT_STATE_STARTED,
     SLOT_STATE_PROCESSING_PROMPT,
     SLOT_STATE_DONE_PROMPT,
     SLOT_STATE_GENERATING,
@@ -950,7 +951,7 @@ struct server_context {
             }
         }
 
-        slot.state = SLOT_STATE_PROCESSING_PROMPT;
+        slot.state = SLOT_STATE_STARTED;
 
         SLT_INF(slot, "%s", "processing task\n");
 
@@ -1867,148 +1868,151 @@ struct server_context {
         if (params.cont_batching || batch.n_tokens == 0) {
             for (auto & slot : slots) {
                 // this slot still has a prompt to be processed
-                if (slot.state == SLOT_STATE_PROCESSING_PROMPT) {
+                if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
                     auto & prompt_tokens = slot.prompt_tokens;
 
-                    slot.t_start_process_prompt = ggml_time_us();
-                    slot.t_start_generation = 0;
-                    slot.n_past = 0;
-                    slot.n_prompt_tokens = prompt_tokens.size();
+                    // TODO: maybe move branch to outside of this loop in the future
+                    if (slot.state == SLOT_STATE_STARTED) {
+                        slot.t_start_process_prompt = ggml_time_us();
+                        slot.t_start_generation = 0;
+                        slot.n_past = 0;
+                        slot.n_prompt_tokens = prompt_tokens.size();
+                        slot.state = SLOT_STATE_PROCESSING_PROMPT;
 
-                    // empty prompt passed -> release the slot and send empty response
-                    if (prompt_tokens.empty()) {
-                        SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
+                        SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
 
-                        slot.release();
-                        slot.print_timings();
-                        send_final_response(slot);
-                        continue;
-                    }
-
-                    SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
-
-                    // print prompt tokens (for debugging)
-                    if (1) {
-                        // first 16 tokens (avoid flooding logs)
-                        for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++) {
-                            SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
-                        }
-                    } else {
-                        // all
-                        for (int i = 0; i < (int) prompt_tokens.size(); i++) {
-                            SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                        // print prompt tokens (for debugging)
+                        if (1) {
+                            // first 16 tokens (avoid flooding logs)
+                            for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++) {
+                                SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                            }
+                        } else {
+                            // all
+                            for (int i = 0; i < (int) prompt_tokens.size(); i++) {
+                                SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                            }
                         }
-                    }
 
-                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
-                        // this prompt is too large to process - discard it
-                        if (slot.n_prompt_tokens > n_ubatch) {
+                        // empty prompt passed -> release the slot and send empty response
+                        if (prompt_tokens.empty()) {
+                            SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
+
                             slot.release();
-                            send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
+                            slot.print_timings();
+                            send_final_response(slot);
                             continue;
                         }
-                    } else {
-                        if (!params.ctx_shift) {
-                            // if context shift is disabled, we make sure prompt size is smaller than KV size
-                            // TODO: there should be a separate parameter that controls prompt truncation
-                            // context shift should be applied only during the generation phase
-                            if (slot.n_prompt_tokens >= slot.n_ctx) {
+
+                        if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
+                            // this prompt is too large to process - discard it
+                            if (slot.n_prompt_tokens > n_ubatch) {
                                 slot.release();
-                                send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
+                                send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
                                 continue;
                             }
-                        }
-                        if (slot.params.n_keep < 0) {
-                            slot.params.n_keep = slot.n_prompt_tokens;
-                        }
-                        slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
+                        } else {
+                            if (!params.ctx_shift) {
+                                // if context shift is disabled, we make sure prompt size is smaller than KV size
+                                // TODO: there should be a separate parameter that controls prompt truncation
+                                // context shift should be applied only during the generation phase
+                                if (slot.n_prompt_tokens >= slot.n_ctx) {
+                                    slot.release();
+                                    send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
+                                    continue;
+                                }
+                            }
+                            if (slot.params.n_keep < 0) {
+                                slot.params.n_keep = slot.n_prompt_tokens;
+                            }
+                            slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
 
-                        // if input prompt is too big, truncate it
-                        if (slot.n_prompt_tokens >= slot.n_ctx) {
-                            const int n_left = slot.n_ctx - slot.params.n_keep;
+                            // if input prompt is too big, truncate it
+                            if (slot.n_prompt_tokens >= slot.n_ctx) {
+                                const int n_left = slot.n_ctx - slot.params.n_keep;
 
-                            const int n_block_size = n_left / 2;
-                            const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
+                                const int n_block_size = n_left / 2;
+                                const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
 
-                            std::vector<llama_token> new_tokens(
-                                    prompt_tokens.begin(),
-                                    prompt_tokens.begin() + slot.params.n_keep);
+                                std::vector<llama_token> new_tokens(
+                                        prompt_tokens.begin(),
+                                        prompt_tokens.begin() + slot.params.n_keep);
 
-                            new_tokens.insert(
-                                    new_tokens.end(),
-                                    prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
-                                    prompt_tokens.end());
+                                new_tokens.insert(
+                                        new_tokens.end(),
+                                        prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
+                                        prompt_tokens.end());
 
-                            prompt_tokens = std::move(new_tokens);
+                                prompt_tokens = std::move(new_tokens);
 
-                            slot.truncated = true;
-                            slot.n_prompt_tokens = prompt_tokens.size();
+                                slot.truncated = true;
+                                slot.n_prompt_tokens = prompt_tokens.size();
 
-                            SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens);
+                                SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens);
 
-                            GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
-                        }
+                                GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
+                            }
 
-                        common_sampler_reset(slot.smpl);
+                            if (slot.params.cache_prompt) {
+                                // reuse any previously computed tokens that are common with the new prompt
+                                slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
 
-                        if (slot.params.cache_prompt) {
-                            // reuse any previously computed tokens that are common with the new prompt
-                            slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
+                                // reuse chunks from the cached prompt by shifting their KV cache in the new position
+                                if (params.n_cache_reuse > 0) {
+                                    size_t head_c = slot.n_past; // cache
+                                    size_t head_p = slot.n_past; // current prompt
 
-                            // push the prompt into the sampling context (do not apply grammar)
-                            for (int i = 0; i < slot.n_past; ++i) {
-                                common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
-                            }
+                                    SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past);
 
-                            // reuse chunks from the cached prompt by shifting their KV cache in the new position
-                            if (params.n_cache_reuse > 0) {
-                                size_t head_c = slot.n_past; // cache
-                                size_t head_p = slot.n_past; // current prompt
+                                    while (head_c < slot.cache_tokens.size() &&
+                                           head_p < prompt_tokens.size()) {
 
-                                SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past);
+                                        size_t n_match = 0;
+                                        while (head_c + n_match < slot.cache_tokens.size() &&
+                                               head_p + n_match < prompt_tokens.size() &&
+                                               slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
 
-                                while (head_c < slot.cache_tokens.size() &&
-                                       head_p < prompt_tokens.size()) {
+                                            n_match++;
+                                        }
 
-                                    size_t n_match = 0;
-                                    while (head_c + n_match < slot.cache_tokens.size() &&
-                                           head_p + n_match < prompt_tokens.size() &&
-                                           slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
+                                        if (n_match >= (size_t) params.n_cache_reuse) {
+                                            SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
+                                            //for (size_t i = head_p; i < head_p + n_match; i++) {
+                                            //    SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                                            //}
 
-                                        n_match++;
-                                    }
+                                            const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
 
-                                    if (n_match >= (size_t) params.n_cache_reuse) {
-                                        SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
-                                        //for (size_t i = head_p; i < head_p + n_match; i++) {
-                                        //    SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
-                                        //}
+                                            llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
+                                            llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift);
 
-                                        const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
+                                            for (size_t i = 0; i < n_match; i++) {
+                                                slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
 
-                                        llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
-                                        llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift);
+                                                slot.n_past++;
+                                            }
 
-                                        for (size_t i = 0; i < n_match; i++) {
-                                            slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
-
-                                            common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
-
-                                            slot.n_past++;
+                                            head_c += n_match;
+                                            head_p += n_match;
+                                        } else {
+                                            head_c += 1;
                                         }
-
-                                        head_c += n_match;
-                                        head_p += n_match;
-                                    } else {
-                                        head_c += 1;
                                     }
-                                }
 
-                                SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
+                                    SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
+                                }
                             }
                         }
-                    }
 
-                    if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
-                        // we have to evaluate at least 1 token to generate logits.
-                        SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
-
-                        slot.n_past--;
-                    }
-
-                    slot.n_prompt_tokens_processed = 0;
+                        if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
+                            // we have to evaluate at least 1 token to generate logits.
+ SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens); - slot.n_past--; - } + slot.n_past--; + } - slot.n_prompt_tokens_processed = 0; + slot.n_prompt_tokens_processed = 0; + } // non-causal tasks require to fit the entire prompt in the physical batch if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { @@ -2036,8 +2040,6 @@ struct server_context { // there is no common part left slot.n_past = 0; - - common_sampler_reset(slot.smpl); } SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past); @@ -2047,10 +2049,10 @@ struct server_context { // add prompt tokens for processing in the current batch while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { - common_batch_add(batch, slot.prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false); + common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false); if (slot.params.cache_prompt) { - slot.cache_tokens.push_back(slot.prompt_tokens[slot.n_past]); + slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); } slot.n_prompt_tokens_processed++; @@ -2065,9 +2067,11 @@ struct server_context { GGML_ASSERT(batch.n_tokens > 0); + common_sampler_reset(slot.smpl); + // Process all prompt tokens through sampler system for (int i = 0; i < slot.n_prompt_tokens; ++i) { - common_sampler_accept(slot.smpl, slot.prompt_tokens[i], false); + common_sampler_accept(slot.smpl, prompt_tokens[i], false); } // extract the logits only for the last token From cff97ad3f4044e7863857dfc3f0a728afc88cfc0 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 24 Oct 2024 13:20:44 +0200 Subject: [PATCH 05/12] bring back infill validation --- examples/server/server.cpp | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index cdb97419fc8af..df71ffc3ccd54 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -68,7 +68,7 @@ enum stop_type { // state diagram: https://github.com/ggerganov/llama.cpp/pull/9283 enum slot_state { SLOT_STATE_IDLE, - SLOT_STATE_STARTED, + SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future SLOT_STATE_PROCESSING_PROMPT, SLOT_STATE_DONE_PROMPT, SLOT_STATE_GENERATING, @@ -2761,6 +2761,7 @@ int main(int argc, char ** argv) { }; const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { + // check model compatibility std::string err; if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) { err += "prefix token is missing. "; @@ -2771,13 +2772,33 @@ int main(int argc, char ** argv) { if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) { err += "middle token is missing. 
"; } - if (!err.empty()) { res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED)); return; } json data = json::parse(req.body); + + // validate input + if (data.contains("input_extra") && !data.at("input_extra").is_array()) { + res_error(res, format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST)); + return; + } + json input_extra = json_value(data, "input_extra", json::array()); + + for (const auto & chunk : input_extra) { + // { "text": string, "filename": string } + if (!chunk.contains("text") || !chunk.at("text").is_string()) { + res_error(res, format_error_response("extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST)); + return; + } + // filename is optional + if (chunk.contains("filename") && !chunk.at("filename").is_string()) { + res_error(res, format_error_response("extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST)); + return; + } + } + return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res); }; From fea5ca45242fa5736ab2f8004165749b3f137d65 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 24 Oct 2024 15:56:35 +0200 Subject: [PATCH 06/12] add infill test --- examples/server/tests/features/infill.feature | 36 +++++++++++++++ examples/server/tests/features/steps/steps.py | 46 +++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 examples/server/tests/features/infill.feature diff --git a/examples/server/tests/features/infill.feature b/examples/server/tests/features/infill.feature new file mode 100644 index 0000000000000..28ef9b912ea5b --- /dev/null +++ b/examples/server/tests/features/infill.feature @@ -0,0 +1,36 @@ +@llama.cpp +@infill +Feature: llama.cpp server + + # The current model is made by adding FIM tokens to the existing stories260K + # We may want to use a better model in the future, maybe something like SmolLM 360M + + Background: Server startup + Given a server listening on localhost:8080 + And a model file tinyllamas/stories260K-infill.gguf from HF repo ggml-org/models + And a model file test-model-infill.gguf + And a model alias tinyllama-infill + And 42 as server seed + And 1024 as batch size + And 1024 as ubatch size + And 2048 KV cache size + And 64 max tokens to predict + And 0.0 temperature + Then the server is starting + Then the server is healthy + + Scenario: Infill without input_extra + Given a prompt "Complete this" + And an infill input extra none none + And an infill input prefix "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_" + And an infill input suffix "}\n" + And an infill request with no api error + Then 64 tokens are predicted matching Lily|was|so|excited + + Scenario: Infill with input_extra + Given a prompt "Complete this" + And an infill input extra "llama.h" "LLAMA_API int32_t llama_n_threads();\n" + And an infill input prefix "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_" + And an infill input suffix "}\n" + And an infill request with no api error + Then 64 tokens are predicted matching cuts|Jimmy|mom|came|into|the|room" diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 540a2ecd56374..2e418d8aa571b 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -80,6 +80,11 @@ def 
step_server_config(context, server_fqdn: str, server_port: str): context.lora_file = None context.disable_ctx_shift = False + # infill + context.infill_input_extra = None + context.infill_input_suffix = '' + context.infill_input_prefix = '' + context.tasks_result = [] context.concurrent_tasks = [] context.prompts = [] @@ -291,6 +296,28 @@ async def step_request_completion(context, api_error: Literal['raised'] | str): assert completion == api_error_code, f"completion must be an {api_error_code} status code: {completion}" +@step('an infill request with {api_error} api error') +@async_run_until_complete +async def step_request_completion(context, api_error: Literal['raised'] | str): + if api_error != 'no': + raise ValueError(f'api_error={api_error} is not yet implemented') + payload = { + "prompt": context.prompts[0], + "input_suffix": context.infill_input_suffix, + "input_prefix": context.infill_input_prefix, + "n_predict": context.n_predict, + "seed": context.seed, + "temperature": context.temperature, + } + if context.infill_input_extra is not None: + payload['input_extra'] = context.infill_input_extra + async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: + async with session.post(f'{context.base_url}/infill', + json=payload) as response: + assert response.status == 200 + context.tasks_result = [await response.json()] + + @step('{predicted_n:d} tokens are predicted matching {re_content}') def step_n_tokens_predicted_with_content(context, predicted_n, re_content): context.completion = context.tasks_result.pop() @@ -539,6 +566,25 @@ def step_a_prompt_prompt(context, prompt): context.n_prompts = len(context.prompts) +# TODO: allow this to be repeated +@step('an infill input extra {filename} {text}') +def step_infill_input_extra(context, filename, text): + if filename == 'none': + context.infill_input_extra = None + else: + context.infill_input_extra = [{'filename': filename, 'text': text}] + + +@step('an infill input suffix {text}') +def step_infill_input_suffix(context, text): + context.infill_input_suffix = text + + +@step('an infill input prefix {text}') +def step_infill_input_prefix(context, text): + context.infill_input_prefix = text + + @step('{num_prompts:d} prompts {prompt} with seed {seed:d}') def step_many_prompts(context, num_prompts, prompt, seed): if context.seed is None: From 07381f7d976a9672f6e390fc6ccfd12e950bd59a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 24 Oct 2024 15:56:49 +0200 Subject: [PATCH 07/12] try fixing format_infill --- examples/server/server.cpp | 25 +++++++++---------------- examples/server/utils.hpp | 22 +++++++++++++++++++--- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index df71ffc3ccd54..e611370f1450a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -43,21 +43,6 @@ #include #include -#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) -#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) -#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) -#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) - -#define SRV_INF(fmt, ...) 
LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) - -#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) - using json = nlohmann::ordered_json; enum stop_type { @@ -2780,12 +2765,19 @@ int main(int argc, char ** argv) { json data = json::parse(req.body); // validate input + if (!data.contains("input_prefix")) { + res_error(res, format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST)); + } + + if (!data.contains("input_suffix")) { + res_error(res, format_error_response("\"input_suffix\" is required", ERROR_TYPE_INVALID_REQUEST)); + } + if (data.contains("input_extra") && !data.at("input_extra").is_array()) { res_error(res, format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST)); return; } json input_extra = json_value(data, "input_extra", json::array()); - for (const auto & chunk : input_extra) { // { "text": string, "filename": string } if (!chunk.contains("text") || !chunk.at("text").is_string()) { @@ -2798,6 +2790,7 @@ int main(int argc, char ** argv) { return; } } + data["input_extra"] = input_extra; // default to empty array if it's not exist return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res); }; diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 05513e5335cbd..8112420624185 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -26,6 +26,21 @@ using json = nlohmann::ordered_json; using llama_tokens = std::vector; +#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) + +#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) + +#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_DBG(fmt, ...) 
LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) + // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 enum error_type { ERROR_TYPE_INVALID_REQUEST, @@ -214,6 +229,7 @@ static llama_tokens format_infill( auto tokens_suffix = tokenize_mixed(ctx, input_suffix, false, false); if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) { + // TODO: make project name an input static const auto k_fim_repo = common_tokenize(ctx, "myproject\n", false, false); extra_tokens.push_back(llama_token_fim_rep(model)); @@ -221,8 +237,8 @@ static llama_tokens format_infill( } for (const auto & chunk : input_extra) { // { "text": string, "filename": string } - const std::string text = chunk.value("text", ""); - const std::string filename = chunk.value("filename", "tmp"); + const std::string text = json_value(chunk, "text", std::string()); + const std::string filename = json_value(chunk, "filename", std::string("tmp")); if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { const auto k_fim_file = common_tokenize(ctx, filename + "\n", false, false); @@ -270,7 +286,7 @@ static llama_tokens format_infill( embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); } - LOG_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size()); + SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size()); // put the extra context before the FIM prefix embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end()); From c34ab08a167846b4d6d6d345c9e0c139ffa31a51 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 24 Oct 2024 15:59:49 +0200 Subject: [PATCH 08/12] fix test --- examples/server/tests/features/infill.feature | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/tests/features/infill.feature b/examples/server/tests/features/infill.feature index 28ef9b912ea5b..a0bbfef77707b 100644 --- a/examples/server/tests/features/infill.feature +++ b/examples/server/tests/features/infill.feature @@ -25,7 +25,7 @@ Feature: llama.cpp server And an infill input prefix "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_" And an infill input suffix "}\n" And an infill request with no api error - Then 64 tokens are predicted matching Lily|was|so|excited + Then 64 tokens are predicted matching One|day|she|saw|big|scary|bird Scenario: Infill with input_extra Given a prompt "Complete this" From 575b1332ab28c1892f726a6e92a815461bbc4240 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 24 Oct 2024 16:23:55 +0200 Subject: [PATCH 09/12] remove redundant code --- examples/server/server.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e611370f1450a..9ff3c5e70f452 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -715,10 +715,6 @@ struct server_context { metrics.init(); } - std::vector tokenize(const json & json_prompt, bool add_special, bool parse_special) const { - return tokenize_mixed(ctx, json_prompt, add_special, parse_special); - } - server_slot * get_slot_by_id(int id) { for (server_slot & slot : slots) { if (slot.id == id) { @@ -1352,10 +1348,6 @@ struct server_context { std::vector create_tasks_cmpl(json data, server_task_cmpl_type cmpl_type) { std::vector tasks; auto create_task = [&](json & task_data, llama_tokens & prompt_tokens) { - if (prompt_tokens.empty()) { - // TODO @ngxson : should not throw 
an error - throw std::runtime_error("prompt must not be empty"); - } SRV_DBG("create task, n_tokens = %d\n", (int) prompt_tokens.size()); server_task task; task.id = queue_tasks.get_new_id(); @@ -2877,7 +2869,7 @@ int main(int argc, char ** argv) { const bool add_special = json_value(body, "add_special", false); const bool with_pieces = json_value(body, "with_pieces", false); - std::vector tokens = ctx_server.tokenize(body.at("content"), add_special, true); + std::vector tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true); if (with_pieces) { for (const auto& token : tokens) { From 4a9f3e7628a3300f76d14c0a7cc7dbff000d1623 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 24 Oct 2024 16:29:38 +0200 Subject: [PATCH 10/12] rename completion to inference --- examples/server/server.cpp | 66 +++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9ff3c5e70f452..dce20ce96684e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -65,7 +65,7 @@ enum server_state { }; enum server_task_type { - SERVER_TASK_TYPE_COMPLETION, + SERVER_TASK_TYPE_INFERENCE, SERVER_TASK_TYPE_CANCEL, SERVER_TASK_TYPE_NEXT_RESPONSE, SERVER_TASK_TYPE_METRICS, @@ -75,11 +75,11 @@ enum server_task_type { SERVER_TASK_TYPE_SET_LORA, }; -enum server_task_cmpl_type { - SERVER_TASK_CMPL_TYPE_NORMAL, - SERVER_TASK_CMPL_TYPE_EMBEDDING, - SERVER_TASK_CMPL_TYPE_RERANK, - SERVER_TASK_CMPL_TYPE_INFILL, +enum server_task_inf_type { + SERVER_TASK_INF_TYPE_COMPLETION, + SERVER_TASK_INF_TYPE_EMBEDDING, + SERVER_TASK_INF_TYPE_RERANK, + SERVER_TASK_INF_TYPE_INFILL, }; struct server_task { @@ -90,7 +90,7 @@ struct server_task { server_task_type type; json data; - server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL; + server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; // utility function static std::unordered_set get_list_id(const std::vector & tasks) { @@ -161,7 +161,7 @@ struct server_slot { std::vector cache_tokens; std::vector generated_token_probs; - server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL; + server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; bool has_next_token = true; bool has_new_line = false; @@ -210,7 +210,7 @@ struct server_slot { n_past = 0; n_sent_text = 0; n_sent_token_probs = 0; - cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL; + inf_type = SERVER_TASK_INF_TYPE_COMPLETION; generated_token_probs.clear(); } @@ -1345,14 +1345,14 @@ struct server_context { // // break the input "prompt" into multiple tasks if needed, then format and tokenize the input prompt(s) - std::vector create_tasks_cmpl(json data, server_task_cmpl_type cmpl_type) { + std::vector create_tasks_inference(json data, server_task_inf_type inf_type) { std::vector tasks; auto create_task = [&](json & task_data, llama_tokens & prompt_tokens) { SRV_DBG("create task, n_tokens = %d\n", (int) prompt_tokens.size()); server_task task; task.id = queue_tasks.get_new_id(); - task.cmpl_type = cmpl_type; - task.type = SERVER_TASK_TYPE_COMPLETION; + task.inf_type = inf_type; + task.type = SERVER_TASK_TYPE_INFERENCE; task.data = task_data; task.prompt_tokens = std::move(prompt_tokens); tasks.push_back(std::move(task)); @@ -1364,10 +1364,10 @@ struct server_context { } // because llama_tokenize api is thread-safe, we can tokenize the prompt from HTTP thread - bool add_special = cmpl_type != SERVER_TASK_CMPL_TYPE_RERANK && cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL; + 
bool add_special = inf_type != SERVER_TASK_INF_TYPE_RERANK && inf_type != SERVER_TASK_INF_TYPE_INFILL; std::vector tokenized_prompts = tokenize_input_prompts(ctx, data.at("prompt"), add_special, true); - switch (cmpl_type) { - case SERVER_TASK_CMPL_TYPE_RERANK: + switch (inf_type) { + case SERVER_TASK_INF_TYPE_RERANK: { // prompts[0] is the question // the rest are the answers/documents @@ -1379,7 +1379,7 @@ struct server_context { create_task(data, tokens); } } break; - case SERVER_TASK_CMPL_TYPE_INFILL: + case SERVER_TASK_INF_TYPE_INFILL: { SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); for (size_t i = 0; i < tokenized_prompts.size(); i++) { @@ -1427,7 +1427,7 @@ struct server_context { queue_tasks.post(cancel_tasks, true); } - // receive the results from task(s) created by create_tasks_cmpl + // receive the results from task(s) created by create_tasks_inference void receive_cmpl_results( const std::unordered_set & id_tasks, const std::function&)> & result_handler, @@ -1451,7 +1451,7 @@ struct server_context { result_handler(results); } - // receive the results from task(s) created by create_tasks_cmpl, in stream mode + // receive the results from task(s) created by create_tasks_inference, in stream mode void receive_cmpl_results_stream( const std::unordered_set & id_tasks, const std::function & result_handler, const @@ -1484,7 +1484,7 @@ struct server_context { void process_single_task(const server_task & task) { switch (task.type) { - case SERVER_TASK_TYPE_COMPLETION: + case SERVER_TASK_TYPE_INFERENCE: { const int id_slot = json_value(task.data, "id_slot", -1); @@ -1517,7 +1517,7 @@ struct server_context { slot->reset(); slot->id_task = task.id; - slot->cmpl_type = task.cmpl_type; + slot->inf_type = task.inf_type; slot->index = json_value(task.data, "index", 0); slot->prompt_tokens = std::move(task.prompt_tokens); @@ -1881,7 +1881,7 @@ struct server_context { continue; } - if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { + if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) { // this prompt is too large to process - discard it if (slot.n_prompt_tokens > n_ubatch) { slot.release(); @@ -1992,7 +1992,7 @@ struct server_context { } // non-causal tasks require to fit the entire prompt in the physical batch - if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { + if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) { // cannot fit the prompt in the current batch - will try next iter if (batch.n_tokens + slot.n_prompt_tokens > n_batch) { continue; @@ -2001,8 +2001,8 @@ struct server_context { // check that we are in the right batch_type, if not defer the slot const bool slot_type = - slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || - slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK ? 1 : 0; + slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || + slot.inf_type == SERVER_TASK_INF_TYPE_RERANK ? 
1 : 0; if (batch_type == -1) { batch_type = slot_type; @@ -2120,7 +2120,7 @@ struct server_context { } if (slot.state == SLOT_STATE_DONE_PROMPT) { - if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) { + if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING) { // prompt evaluated for embedding send_embedding(slot, batch_view); slot.release(); @@ -2128,7 +2128,7 @@ struct server_context { continue; // continue loop of slots } - if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { + if (slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) { send_rerank(slot, batch_view); slot.release(); slot.i_batch = -1; @@ -2682,13 +2682,13 @@ int main(int argc, char ** argv) { res_ok(res, {{ "success", true }}); }; - const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_cmpl_type cmpl_type, json & data, httplib::Response & res) { + const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) { if (ctx_server.params.embedding || ctx_server.params.reranking) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); return; } - std::vector tasks = ctx_server.create_tasks_cmpl(data, cmpl_type); + std::vector tasks = ctx_server.create_tasks_inference(data, inf_type); ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -2734,7 +2734,7 @@ int main(int argc, char ** argv) { const auto handle_completions = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { json data = json::parse(req.body); - return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res); + return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res); }; const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { @@ -2784,7 +2784,7 @@ int main(int argc, char ** argv) { } data["input_extra"] = input_extra; // default to empty array if it's not exist - return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res); + return handle_completions_generic(SERVER_TASK_INF_TYPE_INFILL, data, res); }; // TODO: maybe merge this function with "handle_completions_generic" @@ -2796,7 +2796,7 @@ int main(int argc, char ** argv) { json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); - std::vector tasks = ctx_server.create_tasks_cmpl(data, SERVER_TASK_CMPL_TYPE_NORMAL); + std::vector tasks = ctx_server.create_tasks_inference(data, SERVER_TASK_INF_TYPE_COMPLETION); ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -2940,7 +2940,7 @@ int main(int argc, char ** argv) { json responses = json::array(); bool error = false; { - std::vector tasks = ctx_server.create_tasks_cmpl({{"prompt", prompt}}, SERVER_TASK_CMPL_TYPE_EMBEDDING); + std::vector tasks = ctx_server.create_tasks_inference({{"prompt", prompt}}, SERVER_TASK_INF_TYPE_EMBEDDING); ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -3017,7 +3017,7 @@ int main(int argc, char ** argv) { json responses = json::array(); bool error = false; { - std::vector tasks = ctx_server.create_tasks_cmpl({{"prompt", prompt}}, SERVER_TASK_CMPL_TYPE_RERANK); + std::vector tasks = ctx_server.create_tasks_inference({{"prompt", prompt}}, SERVER_TASK_INF_TYPE_RERANK); 
            ctx_server.queue_results.add_waiting_tasks(tasks);
            ctx_server.queue_tasks.post(tasks);

From 13ee779313f89983da665dd6a59d738a3505c8a3 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 24 Oct 2024 16:39:03 +0200
Subject: [PATCH 11/12] update docs

---
 examples/server/README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/examples/server/README.md b/examples/server/README.md
index 09f1aa249ab1f..8f00fcc793293 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -319,6 +319,18 @@ node index.js
   - The prompt is a string or an array with the first element given as a string
   - The model's `tokenizer.ggml.add_bos_token` metadata is `true`
 
+  These input shapes and data types are allowed for `prompt`:
+
+  - Single string: `"string"`
+  - Single sequence of tokens: `[12, 34, 56]`
+  - Mixed tokens and strings: `[12, 34, "string", 56, 78]`
+
+  Multiple prompts are also supported. In this case, the completion result will be an array.
+
+  - Only strings: `["string1", "string2"]`
+  - Strings and sequences of tokens: `["string1", [12, 34, 56]]`
+  - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string"]`
+
 `temperature`: Adjust the randomness of the generated text. Default: `0.8`
 
 `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled.

From 7f7acdbec56a23930a80c2ccb414728f815651cc Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Thu, 24 Oct 2024 16:53:38 +0200
Subject: [PATCH 12/12] use llama_tokens everywhere

---
 examples/server/server.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index dce20ce96684e..58f93694f6846 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -86,7 +86,7 @@ struct server_task {
     int id = -1; // to be filled by server_queue
     int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL
 
-    std::vector<llama_token> prompt_tokens;
+    llama_tokens prompt_tokens;
     server_task_type type;
     json data;
@@ -153,12 +153,12 @@ struct server_slot {
     int32_t n_prompt_tokens_processed = 0;
 
     // input prompt tokens
-    std::vector<llama_token> prompt_tokens;
+    llama_tokens prompt_tokens;
 
     size_t last_nl_pos = 0;
 
     std::string generated_text;
-    std::vector<llama_token> cache_tokens;
+    llama_tokens cache_tokens;
     std::vector<completion_token_output> generated_token_probs;
@@ -1184,7 +1184,7 @@ struct server_context {
             };
 
             if (slot.sparams.n_probs > 0) {
-                const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
+                const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
                 const size_t probs_pos      = std::min(slot.n_sent_token_probs,                       slot.generated_token_probs.size());
                 const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
@@ -1235,7 +1235,7 @@ struct server_context {
         if (slot.sparams.n_probs > 0) {
             std::vector<completion_token_output> probs;
             if (!slot.params.stream && slot.stopped_word) {
-                const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
+                const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
 
                 size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
                 probs = std::vector<completion_token_output>(
@@ -1911,7 +1911,7 @@ struct server_context {
                 const int n_block_size = n_left / 2;
                 const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / 
n_block_size; - std::vector new_tokens( + llama_tokens new_tokens( prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep); @@ -2869,7 +2869,7 @@ int main(int argc, char ** argv) { const bool add_special = json_value(body, "add_special", false); const bool with_pieces = json_value(body, "with_pieces", false); - std::vector tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true); + llama_tokens tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true); if (with_pieces) { for (const auto& token : tokens) { @@ -2906,7 +2906,7 @@ int main(int argc, char ** argv) { std::string content; if (body.count("tokens") != 0) { - const std::vector tokens = body.at("tokens"); + const llama_tokens tokens = body.at("tokens"); content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend()); }
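
The context-truncation path above (introduced in PATCH 01 and untouched by PATCH 12 apart from the `llama_tokens` alias) keeps the first `n_keep` tokens, erases whole blocks of `n_left/2` tokens from the middle, and always keeps the tail. A minimal standalone sketch of that arithmetic, assuming a plain `std::vector<int>` token buffer and a hypothetical `truncate_prompt` helper in place of the server's slot state:

#include <cstdio>
#include <vector>

// Keep the first n_keep tokens, drop erased_blocks blocks of n_left/2 tokens
// right after them, and keep everything else; mirrors the hunks above.
static std::vector<int> truncate_prompt(const std::vector<int> & prompt_tokens, int n_ctx, int n_keep) {
    const int n_prompt_tokens = (int) prompt_tokens.size();
    if (n_prompt_tokens < n_ctx) {
        return prompt_tokens; // already fits, nothing to do
    }

    const int n_left        = n_ctx - n_keep;
    const int n_block_size  = n_left / 2;
    const int erased_blocks = (n_prompt_tokens - n_keep - n_block_size) / n_block_size;

    std::vector<int> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + n_keep);
    new_tokens.insert(new_tokens.end(),
            prompt_tokens.begin() + n_keep + erased_blocks*n_block_size,
            prompt_tokens.end());
    return new_tokens;
}

int main() {
    std::vector<int> prompt(100);
    for (int i = 0; i < 100; i++) {
        prompt[i] = i;
    }

    // with n_ctx = 32 and n_keep = 4 this erases 5 blocks of 14 tokens,
    // leaving 4 + 26 = 30 tokens, which satisfies n_prompt_tokens < n_ctx
    const std::vector<int> truncated = truncate_prompt(prompt, 32, 4);
    printf("n_prompt_tokens: %zu -> %zu\n", prompt.size(), truncated.size());
}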
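
The prompt-cache reuse pass shown in the first hunks works in two steps: take the longest common prefix of the cached tokens and the new prompt, then, when `n_cache_reuse > 0`, re-use any later matching chunk of at least `n_cache_reuse` tokens by shifting it into place instead of re-evaluating it. A standalone sketch that only computes the resulting `n_past`; the actual `llama_kv_cache_seq_rm`/`llama_kv_cache_seq_add` calls are elided, and `compute_n_past` is a hypothetical name:

#include <cstdio>
#include <vector>

static size_t longest_common_prefix(const std::vector<int> & a, const std::vector<int> & b) {
    size_t i = 0;
    while (i < a.size() && i < b.size() && a[i] == b[i]) {
        i++;
    }
    return i;
}

static size_t compute_n_past(std::vector<int> & cache_tokens, const std::vector<int> & prompt_tokens, size_t n_cache_reuse) {
    size_t n_past = longest_common_prefix(cache_tokens, prompt_tokens);
    if (n_cache_reuse == 0) {
        return n_past;
    }

    size_t head_c = n_past; // cache
    size_t head_p = n_past; // current prompt

    while (head_c < cache_tokens.size() && head_p < prompt_tokens.size()) {
        // length of the match between cache[head_c..] and prompt[head_p..]
        size_t n_match = 0;
        while (head_c + n_match < cache_tokens.size() &&
               head_p + n_match < prompt_tokens.size() &&
               cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
            n_match++;
        }

        if (n_match >= n_cache_reuse) {
            // a real implementation would shift the KV cache cells
            // [head_c, head_c + n_match) to [head_p, head_p + n_match) here
            for (size_t i = 0; i < n_match; i++) {
                cache_tokens[head_p + i] = cache_tokens[head_c + i];
                n_past++;
            }
            head_c += n_match;
            head_p += n_match;
        } else {
            head_c += 1;
        }
    }

    return n_past;
}

int main() {
    // the new prompt skips tokens 3 and 4, so the chunk {5, 6, 7, 8} can be
    // shifted back by two positions and re-used
    std::vector<int>       cache  = {1, 2, 3, 4, 5, 6, 7, 8};
    const std::vector<int> prompt = {1, 2, 5, 6, 7, 8};

    printf("n_past = %zu\n", compute_n_past(cache, prompt, 2)); // expected: 6
}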
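
The prompt shapes documented in PATCH 11 can be exercised directly from client code. A sketch of how the corresponding request bodies could be built with nlohmann::json, the same library the server uses; the field values here are illustrative only:

#include <cstdio>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

int main() {
    // single string
    const json body1 = { {"prompt", "string"}, {"n_predict", 64} };

    // mixed tokens and strings in a single prompt
    const json body2 = { {"prompt", json::array({12, 34, "string", 56, 78})}, {"n_predict", 64} };

    // multiple prompts (a string and a token sequence); the completion
    // result for such a request is an array
    const json body3 = { {"prompt", json::array({"string1", json::array({12, 34, 56})})}, {"n_predict", 64} };

    printf("%s\n%s\n%s\n", body1.dump().c_str(), body2.dump().c_str(), body3.dump().c_str());
}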