[BodhiApp] changes for bodhiapp.
[21-dec-24] use the prompt field from the chat completions request when provided, instead of formatting the messages field (see the request sketch after this commit message)
- added Python integration tests for the server changes
- added add_special as a request parameter passed to the upstream handler, so pre-formatted chat prompts are not templated again
- modified the workflow to download and cache the Llama-2-7B-Chat model used for integration testing
[17-jan-25] updated to the latest llama.cpp
- server.cpp changed to use ctx_server.vocab instead of the earlier ctx_server.ctx
[26-jan-25] merged examples/server/utils.hpp; llama.cpp is starting to support tools in the chat template; resolved the rebase conflict by merging our pre-formatted-prompt logic with the new feature
- added a GitHub workflow to build artifacts per platform and GPU architecture: macOS CPU+Metal, Ubuntu CPU+CUDA, Windows CPU+CUDA 12.4 and 11.6
- also added other Ubuntu and Windows CPU variants and GPU architectures such as Vulkan
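For illustration, a minimal request sketch of the behaviour described above: a pre-formatted Llama-2 prompt sent to the chat completions endpoint with add_special set to false, so the server neither re-applies its chat template nor inserts special tokens again. The field names and endpoint come from the diff and tests below; the base URL, the requests library, and the token limit are assumptions of the sketch, not part of the commit.

import requests

BASE_URL = "http://localhost:8080"  # assumed local llama-server address

# Prompt already rendered in Llama-2 chat format; with "add_special": False
# the server is asked not to prepend special tokens (e.g. BOS) a second time.
preformatted_prompt = """<s>[INST] <<SYS>>
You are a helpful assistant.
<</SYS>>
What day comes after Monday? [/INST]"""

resp = requests.post(
    f"{BASE_URL}/chat/completions",
    json={
        "model": "llama2",
        "max_tokens": 32,
        "prompt": preformatted_prompt,  # used verbatim instead of "messages"
        "add_special": False,
    },
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])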
anagri committed Jan 26, 2025
1 parent f35726c commit 9cec3af
Showing 7 changed files with 879 additions and 1 deletion.
707 changes: 707 additions & 0 deletions .github/workflows/llama-server.yml

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions .github/workflows/server.yml
@@ -71,6 +71,26 @@ jobs:
        with:
          python-version: '3.11'

      - name: envs
        shell: bash
        run: |
          echo "USER_HOME=$HOME" >> $GITHUB_ENV
      - name: Cache HuggingFace models
        uses: actions/cache@v4
        id: cache-hf
        with:
          path: ${{ env.USER_HOME }}/.cache/huggingface
          key: hf-cache-llama2-7b-chat
          enableCrossOsArchive: true

      - name: Check and Download Llama model
        if: steps.cache-hf.outputs.cache-hit != 'true'
        run: |
          python -m pip install -U pip
          python -m pip install -U "huggingface_hub[cli]"
          huggingface-cli download --revision 191239b3e26b2882fb562ffccdd1cf0f65402adb TheBloke/Llama-2-7B-Chat-GGUF llama-2-7b-chat.Q4_K_M.gguf
      - name: Tests dependencies
        id: test_dependencies
        run: |
@@ -199,6 +219,25 @@ jobs:
        run: |
          cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
      - name: Set environment variables
        shell: pwsh
        run: |
          echo "USER_HOME=${HOME}" >> $env:GITHUB_ENV
      - name: Cache HuggingFace models
        uses: actions/cache@v4
        id: cache-hf
        with:
          path: ${{ env.USER_HOME }}\.cache\huggingface
          key: hf-cache-Windows-llama2-7b-chat

      - name: Check and Download Llama model
        if: steps.cache-hf.outputs.cache-hit != 'true'
        run: |
          python -m pip install -U pip
          python -m pip install -U "huggingface_hub[cli]"
          huggingface-cli download --revision 191239b3e26b2882fb562ffccdd1cf0f65402adb TheBloke/Llama-2-7B-Chat-GGUF llama-2-7b-chat.Q4_K_M.gguf
      - name: Tests
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
5 changes: 4 additions & 1 deletion examples/server/server.cpp
@@ -3763,7 +3763,10 @@ int main(int argc, char ** argv) {
         std::vector<server_task> tasks;

         try {
-            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, data.at("prompt"), true, true);
+            const bool add_special = json_value(data, "add_special", true);
+            const bool with_pieces = json_value(data, "with_pieces", true);
+
+            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, data.at("prompt"), add_special, with_pieces);
             tasks.reserve(tokenized_prompts.size());
             for (size_t i = 0; i < tokenized_prompts.size(); i++) {
                 server_task task = server_task(type);
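Both flags above default to true via json_value, so existing clients are unaffected and a client opts out per request. A rough sketch of such a request against the stock /completion route follows; the server address, token budget, and response-field access are assumptions of the sketch, not taken from the commit.

import requests

payload = {
    # raw prompt that already carries its special tokens
    "prompt": "<s>[INST] What day comes after Monday? [/INST]",
    "n_predict": 16,
    "add_special": False,  # default true: json_value(data, "add_special", true)
    "with_pieces": True,   # default true: json_value(data, "with_pieces", true)
}

resp = requests.post("http://localhost:8080/completion", json=payload)
resp.raise_for_status()
print(resp.json().get("content", ""))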
84 changes: 84 additions & 0 deletions examples/server/tests/unit/test_preformatted_prompt.py
@@ -0,0 +1,84 @@
import pytest
from utils import *

server = ServerPreset.llama2()


@pytest.fixture(scope="module", autouse=True)
def create_server():
    global server
    server = ServerPreset.llama2()


@pytest.mark.parametrize(
    "model,data,max_tokens,re_content,n_prompt,n_predicted,finish_reason, prompt",
    [
        (
            "llama2",
            {
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": "What day comes after Monday?"},
                ]
            },
            16,
            "(Tuesday)+",
            56,
            8,
            "stop",
            """<s> <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
What day comes after Monday?<|im_end|>
<|im_start|>assistant
""",
        ),
        (
            "llama2",
            {
                "prompt": """<s>[INST] <<SYS>>
You are a helpful assistant.
<</SYS>>
What day comes after Monday? [/INST]""",
                "add_special": False,
            },
            1024,
            "(Tuesday)+",
            33,
            25,
            "stop",
            """<s> [INST] <<SYS>>
You are a helpful assistant.
<</SYS>>
What day comes after Monday? [/INST]""",
        ),
    ],
)
def test_chat_completion_without_preformatted_prompt(
    model, data, max_tokens, re_content, n_prompt, n_predicted, finish_reason, prompt
):
    global server
    server.start()
    res = server.make_request(
        "POST",
        "/chat/completions",
        data={
            "model": model,
            "max_tokens": max_tokens,
            **data,
        },
    )
    assert res.status_code == 200
    assert (
        "cmpl" in res.body["id"]
    )  # make sure the completion id has the expected format
    assert res.body["model"] == model
    # assert res.body["usage"]["prompt_tokens"] == n_prompt
    # assert res.body["usage"]["completion_tokens"] == n_predicted
    choice = res.body["choices"][0]
    assert "assistant" == choice["message"]["role"]
    assert match_regex(re_content, choice["message"]["content"])
    assert choice["finish_reason"] == finish_reason
    assert res.body["__verbose"]["prompt"] == prompt
23 changes: 23 additions & 0 deletions examples/server/tests/utils.py
@@ -342,6 +342,29 @@ def jina_reranker_tiny() -> ServerProcess:
        server.server_reranking = True
        return server

    @staticmethod
    def llama2() -> ServerProcess:
        server = ServerProcess()
        server.model_file = os.path.join(
            os.path.expanduser("~"),
            ".cache",
            "huggingface",
            "hub",
            "models--TheBloke--Llama-2-7B-Chat-GGUF",
            "snapshots",
            "191239b3e26b2882fb562ffccdd1cf0f65402adb",
            "llama-2-7b-chat.Q4_K_M.gguf",
        )
        server.debug = True
        server.model_hf_repo = None
        server.model_hf_file = None
        server.model_alias = "llama2"
        server.n_ctx = 2048
        server.n_batch = 32
        server.n_slots = 2
        server.n_predict = 2048
        server.seed = 42
        return server

def parallel_function_calls(function_list: List[Tuple[Callable[..., Any], Tuple[Any, ...]]]) -> List[Any]:
    """
6 changes: 6 additions & 0 deletions examples/server/utils.hpp
@@ -617,12 +617,18 @@ static json oaicompat_completion_params_parse(
         }
     }

+    std::string prompt = json_value(body, "prompt", std::string(""));
+    if (prompt != "") {
+        LOG_WRN("Using prompt from body");
+        llama_params["prompt"] = prompt;
+    } else {
     // Apply chat template to the list of messages
     if (use_jinja) {
         llama_params["prompt"] = tmpl.apply(body.at("messages"), tools, /* add_generation_prompt= */ true);
     } else {
         llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
     }
+    }

     // Handle "n" field
     int n_choices = json_value(body, "n", 1);
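In short, a non-empty prompt field short-circuits chat templating, even when messages is also present. A minimal Python mirror of that precedence rule, purely illustrative and not part of the commit:

def resolve_prompt(body: dict, apply_chat_template) -> str:
    """Mirror the branch above: a non-empty "prompt" is forwarded verbatim,
    otherwise the chat template is applied to "messages"."""
    prompt = body.get("prompt", "")
    if prompt != "":
        return prompt  # corresponds to LOG_WRN("Using prompt from body")
    return apply_chat_template(body["messages"])


# a pre-formatted prompt wins even if "messages" is also supplied
assert resolve_prompt(
    {"prompt": "<s>[INST] Hi [/INST]", "messages": []},
    apply_chat_template=lambda msgs: "templated",
) == "<s>[INST] Hi [/INST]"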
16 changes: 16 additions & 0 deletions rebase.md
@@ -0,0 +1,16 @@
set -x
git checkout master
git fetch gg
git pull --rebase
git rebase gg/master
git push --force origin master

git checkout bodhiserver_lastcommit
git pull --rebase
git checkout -b bodhiserver_newcommit
git rebase origin/master

cd examples/server/tests && pip install -r requirements.txt && pytest
git push -u origin bodhiserver_newcommit

git checkout master
