[BodhiApp] changes for bodhiapp.
[21-dec-24] use the prompt field from the chat completions request when provided, instead of formatting the messages field (see the request sketch after this commit message)
- added Python integration tests for the server changes
- added add_special as a request parameter passed to the upstream handler, so pre-formatted chat prompts are not templated again
- modified the workflow to download and cache the Llama-2-7B-Chat model used for integration testing
[17-jan-25] updated to the latest llama.cpp
- server.cpp changed to use ctx_server.vocab instead of the earlier ctx_server.ctx
[26-jan-25] merged examples/server/utils.hpp; llama.cpp is starting to support tools in the chat template; resolved the rebase conflict by merging our pre-formatted-prompt logic with the new feature
- added a GitHub workflow to build artifacts per platform and GPU architecture: macOS CPU+Metal, Ubuntu CPU+CUDA, Windows CPU+CUDA 12.4 and 11.6
- also added other Ubuntu and Windows CPU variants and GPU architectures such as Vulkan
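For illustration, a minimal request sketch of the behaviour described above: a pre-formatted Llama-2 prompt sent to the chat completions endpoint with add_special set to false, so the server neither re-applies its chat template nor inserts special tokens again. The field names and endpoint come from the diff and tests below; the base URL, the requests library, and the token limit are assumptions of the sketch, not part of the commit.

import requests

BASE_URL = "http://localhost:8080"  # assumed local llama-server address

# Prompt already rendered in Llama-2 chat format; with "add_special": False
# the server is asked not to prepend special tokens (e.g. BOS) a second time.
preformatted_prompt = """<s>[INST] <<SYS>>
You are a helpful assistant.
<</SYS>>
What day comes after Monday? [/INST]"""

resp = requests.post(
    f"{BASE_URL}/chat/completions",
    json={
        "model": "llama2",
        "max_tokens": 32,
        "prompt": preformatted_prompt,  # used verbatim instead of "messages"
        "add_special": False,
    },
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])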
anagri committed Jan 26, 2025
1 parent f35726c commit 9cec3af
Showing 7 changed files with 879 additions and 1 deletion.
707 changes: 707 additions & 0 deletions .github/workflows/llama-server.yml

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions .github/workflows/server.yml
@@ -71,6 +71,26 @@ jobs:
        with:
          python-version: '3.11'

      - name: envs
        shell: bash
        run: |
          echo "USER_HOME=$HOME" >> $GITHUB_ENV
      - name: Cache HuggingFace models
        uses: actions/cache@v4
        id: cache-hf
        with:
          path: ${{ env.USER_HOME }}/.cache/huggingface
          key: hf-cache-llama2-7b-chat
          enableCrossOsArchive: true

      - name: Check and Download Llama model
        if: steps.cache-hf.outputs.cache-hit != 'true'
        run: |
          python -m pip install -U pip
          python -m pip install -U "huggingface_hub[cli]"
          huggingface-cli download --revision 191239b3e26b2882fb562ffccdd1cf0f65402adb TheBloke/Llama-2-7B-Chat-GGUF llama-2-7b-chat.Q4_K_M.gguf
      - name: Tests dependencies
        id: test_dependencies
        run: |
@@ -199,6 +219,25 @@ jobs:
        run: |
          cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
      - name: Set environment variables
        shell: pwsh
        run: |
          echo "USER_HOME=${HOME}" >> $env:GITHUB_ENV
      - name: Cache HuggingFace models
        uses: actions/cache@v4
        id: cache-hf
        with:
          path: ${{ env.USER_HOME }}\.cache\huggingface
          key: hf-cache-Windows-llama2-7b-chat

      - name: Check and Download Llama model
        if: steps.cache-hf.outputs.cache-hit != 'true'
        run: |
          python -m pip install -U pip
          python -m pip install -U "huggingface_hub[cli]"
          huggingface-cli download --revision 191239b3e26b2882fb562ffccdd1cf0f65402adb TheBloke/Llama-2-7B-Chat-GGUF llama-2-7b-chat.Q4_K_M.gguf
      - name: Tests
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
5 changes: 4 additions & 1 deletion examples/server/server.cpp
@@ -3763,7 +3763,10 @@ int main(int argc, char ** argv) {
         std::vector<server_task> tasks;

         try {
-            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, data.at("prompt"), true, true);
+            const bool add_special = json_value(data, "add_special", true);
+            const bool with_pieces = json_value(data, "with_pieces", true);
+
+            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, data.at("prompt"), add_special, with_pieces);
             tasks.reserve(tokenized_prompts.size());
             for (size_t i = 0; i < tokenized_prompts.size(); i++) {
                 server_task task = server_task(type);
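Both flags above default to true via json_value, so existing clients are unaffected and a client opts out per request. A rough sketch of such a request against the stock /completion route follows; the server address, token budget, and response-field access are assumptions of the sketch, not taken from the commit.

import requests

payload = {
    # raw prompt that already carries its special tokens
    "prompt": "<s>[INST] What day comes after Monday? [/INST]",
    "n_predict": 16,
    "add_special": False,  # default true: json_value(data, "add_special", true)
    "with_pieces": True,   # default true: json_value(data, "with_pieces", true)
}

resp = requests.post("http://localhost:8080/completion", json=payload)
resp.raise_for_status()
print(resp.json().get("content", ""))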
84 changes: 84 additions & 0 deletions examples/server/tests/unit/test_preformatted_prompt.py
@@ -0,0 +1,84 @@
import pytest
from utils import *

server = ServerPreset.llama2()


@pytest.fixture(scope="module", autouse=True)
def create_server():
    global server
    server = ServerPreset.llama2()


@pytest.mark.parametrize(
    "model,data,max_tokens,re_content,n_prompt,n_predicted,finish_reason, prompt",
    [
        (
            "llama2",
            {
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": "What day comes after Monday?"},
                ]
            },
            16,
            "(Tuesday)+",
            56,
            8,
            "stop",
            """<s> <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
What day comes after Monday?<|im_end|>
<|im_start|>assistant
""",
        ),
        (
            "llama2",
            {
                "prompt": """<s>[INST] <<SYS>>
You are a helpful assistant.
<</SYS>>
What day comes after Monday? [/INST]""",
                "add_special": False,
            },
            1024,
            "(Tuesday)+",
            33,
            25,
            "stop",
            """<s> [INST] <<SYS>>
You are a helpful assistant.
<</SYS>>
What day comes after Monday? [/INST]""",
        ),
    ],
)
def test_chat_completion_without_preformatted_prompt(
    model, data, max_tokens, re_content, n_prompt, n_predicted, finish_reason, prompt
):
    global server
    server.start()
    res = server.make_request(
        "POST",
        "/chat/completions",
        data={
            "model": model,
            "max_tokens": max_tokens,
            **data,
        },
    )
    assert res.status_code == 200
    assert (
        "cmpl" in res.body["id"]
    )  # make sure the completion id has the expected format
    assert res.body["model"] == model
    # assert res.body["usage"]["prompt_tokens"] == n_prompt
    # assert res.body["usage"]["completion_tokens"] == n_predicted
    choice = res.body["choices"][0]
    assert "assistant" == choice["message"]["role"]
    assert match_regex(re_content, choice["message"]["content"])
    assert choice["finish_reason"] == finish_reason
    assert res.body["__verbose"]["prompt"] == prompt
23 changes: 23 additions & 0 deletions examples/server/tests/utils.py
@@ -342,6 +342,29 @@ def jina_reranker_tiny() -> ServerProcess:
        server.server_reranking = True
        return server

    @staticmethod
    def llama2() -> ServerProcess:
        server = ServerProcess()
        server.model_file = os.path.join(
            os.path.expanduser("~"),
            ".cache",
            "huggingface",
            "hub",
            "models--TheBloke--Llama-2-7B-Chat-GGUF",
            "snapshots",
            "191239b3e26b2882fb562ffccdd1cf0f65402adb",
            "llama-2-7b-chat.Q4_K_M.gguf",
        )
        server.debug = True
        server.model_hf_repo = None
        server.model_hf_file = None
        server.model_alias = "llama2"
        server.n_ctx = 2048
        server.n_batch = 32
        server.n_slots = 2
        server.n_predict = 2048
        server.seed = 42
        return server

def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]:
    """
6 changes: 6 additions & 0 deletions examples/server/utils.hpp
@@ -617,12 +617,18 @@ static json oaicompat_completion_params_parse(
         }
     }

+    std::string prompt = json_value(body, "prompt", std::string(""));
+    if (prompt != "") {
+        LOG_WRN("Using prompt from body");
+        llama_params["prompt"] = prompt;
+    } else {
     // Apply chat template to the list of messages
     if (use_jinja) {
         llama_params["prompt"] = tmpl.apply(body.at("messages"), tools, /* add_generation_prompt= */ true);
     } else {
         llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
     }
+    }

     // Handle "n" field
     int n_choices = json_value(body, "n", 1);
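In short, a non-empty prompt field short-circuits chat templating, even when messages is also present. A minimal Python mirror of that precedence rule, purely illustrative and not part of the commit:

def resolve_prompt(body: dict, apply_chat_template) -> str:
    """Mirror the branch above: a non-empty "prompt" is forwarded verbatim,
    otherwise the chat template is applied to "messages"."""
    prompt = body.get("prompt", "")
    if prompt != "":
        return prompt  # corresponds to LOG_WRN("Using prompt from body")
    return apply_chat_template(body["messages"])


# a pre-formatted prompt wins even if "messages" is also supplied
assert resolve_prompt(
    {"prompt": "<s>[INST] Hi [/INST]", "messages": []},
    apply_chat_template=lambda msgs: "templated",
) == "<s>[INST] Hi [/INST]"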
16 changes: 16 additions & 0 deletions rebase.md
@@ -0,0 +1,16 @@
set -x
git checkout master
git fetch gg
git pull --rebase
git rebase gg/master
git push --force origin master

git checkout bodhiserver_lastcommit
git pull --rebase
git checkout -b bodhiserver_newcommit
git rebase origin/master

cd examples/server/tests && pip install -r requirements.txt && pytest
git push -u origin bodhiserver_newcommit

git checkout master
