From 34095b7b0cb69295699dd9cd9bf5ac3f9fd9c60d Mon Sep 17 00:00:00 2001
From: Siddharth Venkatesan
Date: Tue, 28 Jan 2025 10:35:03 -0800
Subject: [PATCH] add deepseek-r1-distill integration test

---
 .../djl_python/chat_completions/vllm_chat_utils.py |  4 +++-
 tests/integration/llm/client.py                    | 12 +++++++++++-
 tests/integration/llm/prepare.py                   |  3 +++
 tests/integration/tests.py                         |  7 +++++++
 4 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py b/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py
index bb1b15872..dea00555d 100644
--- a/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py
+++ b/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py
@@ -51,6 +51,7 @@ def parse_chat_completions_request_vllm(
     exclude = {"messages"}
     param = chat_params.model_dump(exclude_none=True, exclude=exclude)
 
+    # TODO - figure out what we need to pass for given format
     content_format = resolve_chat_template_content_format(
         chat_template=None,
         given_format="auto",
@@ -58,7 +59,8 @@ def parse_chat_completions_request_vllm(
     )
 
     conversation, mm_data = parse_chat_messages(
-        chat_params.messages, rolling_batch.get_model_config(), tokenizer, content_format)
+        chat_params.messages, rolling_batch.get_model_config(), tokenizer,
+        content_format)
 
     prompt_data: Union[str, List[int]]
     if is_mistral_tokenizer:
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 821f55d12..55f0967d5 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -566,6 +566,11 @@ def get_model_name():
         "batch_size": [1, 4],
         "seq_length": [256],
     },
+    "deepseek-r1-llama": {
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+    },
 }
 
 vllm_neo_model_spec = {
@@ -597,7 +602,12 @@ def get_model_name():
         "batch_size": [1, 4],
         "seq_length": [256],
         "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16"
-    }
+    },
+    "deepseek-r1-llama": {
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+    },
 }
 
 lmi_dist_aiccl_model_spec = {
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index 1142a1b7f..861484b85 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -1056,6 +1056,9 @@
         "option.max_model_len": 8192,
         "option.max_rolling_batch_size": 16,
         "option.enforce_eager": True,
+    },
+    "deepseek-r1-llama": {
+        "option.model_id": "s3://djl-llm/deepseek-r1-distill-llama-8b/"
     }
 }
 
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index d25e3b46b..59f0d8558 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -596,6 +596,13 @@ def test_llama2_7b_chat(self):
             r.launch()
             client.run("vllm_chat llama2-7b-chat".split())
 
+    def test_deepseek_r1_distill_llama_8b(self):
+        with Runner('lmi', 'deepseek-r1-llama') as r:
+            prepare.build_vllm_model('deepseek-r1-llama')
+            r.launch()
+            client.run("vllm deepseek-r1-llama".split())
+            client.run("vllm_chat deepseek-r1-llama".split())
+
     @pytest.mark.skipif(not is_applicable_cuda_capability(89),
                         reason="Unsupported CUDA capability")
     def test_qwen2_7b_fp8(self):
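
Note (reviewer sketch, not part of the commit): the new test case follows the same Runner/prepare/client pattern as test_llama2_7b_chat above. The sketch below shows how the "deepseek-r1-llama" key added to prepare.py and client.py is consumed by the test; the import paths are assumptions, everything else reuses only names that appear in the patch.

# Standalone sketch of what the added integration test does; illustrative only.
# Import paths are assumptions -- tests.py already has its own Runner, prepare,
# and client imports, and the body below mirrors the added test method.
import llm.client as client      # assumed path; uses the new "deepseek-r1-llama" spec entries
import llm.prepare as prepare    # assumed path; uses the new option.model_id entry

from tests import Runner         # assumed path; context manager that starts/stops the LMI container

with Runner('lmi', 'deepseek-r1-llama') as r:
    # prepare.py entry: model configuration pointing at
    # option.model_id=s3://djl-llm/deepseek-r1-distill-llama-8b/
    prepare.build_vllm_model('deepseek-r1-llama')
    r.launch()
    # client.py entries drive the request/validation loop: batch sizes 1 and 4,
    # seq_length 256, tokenizer deepseek-ai/DeepSeek-R1-Distill-Llama-8B
    client.run("vllm deepseek-r1-llama".split())       # text generation checks
    client.run("vllm_chat deepseek-r1-llama".split())  # chat completions checks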