Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

server : Smart selection of available slot using Longest Common Prefix #7728

Merged
merged 6 commits on Jun 8, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Previous commit
Rename argument
sasha0552 authored Jun 7, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit a8842fdf56dc725b69c19332d46dc8bbf612069e
8 changes: 4 additions & 4 deletions common/common.cpp
Original file line number Diff line number Diff line change
@@ -1460,12 +1460,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.chat_template = argv[i];
return true;
}
if (arg == "--lcp-similarity") {
if (arg == "--slot-prompt-similarity" || arg == "-sps") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.lcp_similarity = std::stof(argv[i]);
params.slot_prompt_similarity = std::stof(argv[i]);
return true;
}
if (arg == "-pps") {
@@ -1839,8 +1839,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"set custom jinja chat template (default: template taken from model's metadata)\n"
"only commonly used templates are accepted:\n"
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
options.push_back({ "server", " --lcp-similarity SIMILARITY",
"how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f)\n", params.lcp_similarity });
options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
"how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });

#ifndef LOG_DISABLE_LOGS
options.push_back({ "logging" });
2 changes: 1 addition & 1 deletion common/common.h
Original file line number Diff line number Diff line change
@@ -202,7 +202,7 @@ struct gpt_params {

std::string slot_save_path;

float lcp_similarity = 0.0f;
float slot_prompt_similarity = 0.5f;

// batched-bench params
bool is_pp_shared = false;
12 changes: 6 additions & 6 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
@@ -647,8 +647,8 @@ struct server_context {

server_metrics metrics;

// Longest Common Prefix similarity for slot selection
float lcp_similarity = 0.0f;
// Necessary similarity of prompt for slot selection
float slot_prompt_similarity = 0.0f;

~server_context() {
if (ctx) {
@@ -812,7 +812,7 @@ struct server_context {
server_slot * ret = nullptr;

// find the slot that has at least n% prompt similarity
if (ret == nullptr && lcp_similarity != 0.0f && !prompt.empty()) {
if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
int max_lcp_len = 0;
float similarity = 0;

@@ -840,7 +840,7 @@ struct server_context {
similarity = static_cast<float>(lcp_len) / slot_prompt_len;

// select the current slot if the criteria match
if (lcp_len > max_lcp_len && similarity > lcp_similarity) {
if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
max_lcp_len = lcp_len;
ret = &slot;
}
@@ -2568,8 +2568,8 @@ int main(int argc, char ** argv) {
log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded";
}

// Longest Common Prefix similarity for slot selection
ctx_server.lcp_similarity = params.lcp_similarity;
// Necessary similarity of prompt for slot selection
ctx_server.slot_prompt_similarity = params.slot_prompt_similarity;

// load the model
if (!ctx_server.load_model(params)) {