From 1d0528198ac3de227e53c8c36bbd12e538d00a07 Mon Sep 17 00:00:00 2001
From: Siddharth Venkatesan
Date: Sun, 2 Feb 2025 20:35:19 -0800
Subject: [PATCH] fix lmi/vllm virtual envs, update to vllm 0.7.1 (#2703)

---
 .../chat_completions/vllm_chat_utils.py       | 20 ++++----------------
 .../properties_manager/vllm_rb_properties.py  | 10 ++++++++++
 .../lmi-container-requirements-common.txt     |  9 ++++-----
 serving/docker/requirements-lmi.txt           |  1 +
 serving/docker/requirements-vllm.txt          |  3 ++-
 serving/docker/scripts/create_virtual_env.sh  |  8 +-------
 tests/integration/llm/client.py               |  5 +++++
 tests/integration/tests.py                    |  1 +
 8 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py b/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py
index ebe5d3863..68b646052 100644
--- a/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py
+++ b/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py
@@ -21,16 +21,11 @@
                                           resolve_chat_template_content_format)
 
 
-def is_chat_completions_request(inputs: Dict) -> bool:
-    return "messages" in inputs
-
-
 def parse_chat_completions_request_vllm(
         input_map: Dict,
         is_rolling_batch: bool,
         rolling_batch,
         tokenizer,
-        chat_template: Optional[str] = None,
         configs: Properties = None,
         is_mistral_tokenizer: bool = False,
 ):
@@ -41,12 +36,6 @@ def parse_chat_completions_request_vllm(
             "You must enable rolling batch to use the chat completions format."
         )
 
-    if not is_mistral_tokenizer and not hasattr(tokenizer,
-                                                "apply_chat_template"):
-        raise AttributeError(
-            f"Cannot provide chat completion for tokenizer: {tokenizer.__class__}, "
-            f"please ensure that your tokenizer supports chat templates.")
-
     tool_parser = rolling_batch.get_tool_parser()
     chat_params = ChatProperties(**input_map)
 
@@ -85,16 +74,15 @@ def parse_chat_completions_request_vllm(
     if is_mistral_tokenizer:
         text_inputs = apply_mistral_chat_template(
             tokenizer,
-            messages=chat_params.messages,
-            chat_template=chat_template,
-            add_generation_prompt=True,
+            chat_params.messages,
+            None,
             tools=tool_dicts,
         )
     else:
         text_inputs = apply_hf_chat_template(
             tokenizer,
-            conversation=conversation,
-            chat_template=chat_template,
+            conversation,
+            None,
             add_generation_prompt=True,
             tools=tool_dicts,
         )
diff --git a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
index 4309467ae..8fae3f5f6 100644
--- a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
+++ b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
@@ -66,6 +66,7 @@ class VllmRbProperties(Properties):
     # The following configs have different defaults, or additional processing in DJL compared to vLLM
     dtype: str = "auto"
     max_loras: int = 4
+    task: str = 'auto'
     # The following configs have broken processing in vllm via the FlexibleArgumentParser
     long_lora_scaling_factors: Optional[Tuple[float, ...]] = None
     use_v2_block_manager: bool = True
@@ -89,6 +90,14 @@ def validate_engine(cls, engine):
                 f"Need python engine to start vLLM RollingBatcher")
         return engine
 
+    @field_validator('task')
+    def validate_task(cls, task):
+        # TODO: conflicts between HF and VLLM tasks, need to separate these.
+        # for backwards compatibility, map text-generation to generate
+        if task == 'text-generation':
+            task = 'generate'
+        return task
+
     @field_validator('dtype')
     def validate_dtype(cls, val):
         if val not in DTYPE_MAPPER:
@@ -114,6 +123,7 @@ def validate_tool_call_parser(self):
             raise ValueError(
                 f"Invalid tool call parser: {self.tool_call_parser} "
                 f"(chose from {{ {','.join(valid_tool_parses)} }})")
+        return self
 
     @field_validator('override_neuron_config', mode="before")
     def validate_override_neuron_config(cls, val):
diff --git a/serving/docker/lmi-container-requirements-common.txt b/serving/docker/lmi-container-requirements-common.txt
index a351b5628..e0da26dbc 100644
--- a/serving/docker/lmi-container-requirements-common.txt
+++ b/serving/docker/lmi-container-requirements-common.txt
@@ -1,6 +1,6 @@
-peft==0.13.2
+peft
 protobuf==3.20.3
-transformers==4.45.2
+transformers>=4.45.2
 hf-transfer
 zstandard
 datasets==3.0.1
@@ -23,9 +23,8 @@ onnx
 sentence_transformers
 onnxruntime-gpu==1.20.0
 autoawq==0.2.5
-llmcompressor==0.3.1
-tokenizers==0.20.3
-pydantic==2.9.2
+tokenizers>=0.20.3
+pydantic>=2.9.2
 optimum==1.23.2
 torch==2.5.1
 torchvision==0.20.1
diff --git a/serving/docker/requirements-lmi.txt b/serving/docker/requirements-lmi.txt
index 87ec8c0ac..387a70b43 100644
--- a/serving/docker/requirements-lmi.txt
+++ b/serving/docker/requirements-lmi.txt
@@ -1,4 +1,5 @@
 -r requirements-common.txt
+llmcompressor
 # flash infer kernels for vllm/lmi-dist
 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp311-cp311-linux_x86_64.whl
 # vllm wheel built with pt2.5.1
diff --git a/serving/docker/requirements-vllm.txt b/serving/docker/requirements-vllm.txt
index 2c1fe83f5..7977615bd 100644
--- a/serving/docker/requirements-vllm.txt
+++ b/serving/docker/requirements-vllm.txt
@@ -1,2 +1,3 @@
 -r requirements-common.txt
-vllm==0.7.0
\ No newline at end of file
+llmcompressor
+vllm==0.7.1
\ No newline at end of file
diff --git a/serving/docker/scripts/create_virtual_env.sh b/serving/docker/scripts/create_virtual_env.sh
index d97bc31c6..04e7abf18 100755
--- a/serving/docker/scripts/create_virtual_env.sh
+++ b/serving/docker/scripts/create_virtual_env.sh
@@ -7,12 +7,6 @@ requirements_file=$2
 # This was copied over from the previous pip install defined in the lmi.Dockerfile, so it's specific to that Dockerfile
 python -m venv --system-site-packages $venv_directory
 venv_pip="${venv_directory}/bin/pip"
-$venv_pip install -r $requirements_file
+$venv_pip install -r $requirements_file || exit 1
 $venv_pip install https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl --no-deps
-git clone https://github.com/neuralmagic/AutoFP8.git
-cd AutoFP8
-git reset --hard 4b2092c
-$venv_pip install .
-cd ..
-rm -rf AutoFP8
 $venv_pip cache purge
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 7f705f516..b99111cfa 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -602,6 +602,11 @@ def get_model_name():
         "seq_length": [256],
         "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16"
     },
+    "mistral-7b": {
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16",
+    }
 }
 
 vllm_tool_model_spec = {
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index 58762f4de..e15e12aec 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -571,6 +571,7 @@ def test_mistral_7b(self):
             prepare.build_vllm_model("mistral-7b")
             r.launch()
             client.run("vllm mistral-7b".split())
+            client.run("vllm_chat mistral-7b".split())
 
     def test_phi2(self):
         with Runner('lmi', 'phi-2') as r:
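
Reviewer note (not part of the patch): the new `task` config added to VllmRbProperties maps the
legacy HF-style task name "text-generation" onto vLLM's "generate" task. Below is a minimal,
self-contained sketch of that mapping, assuming pydantic v2; `TaskProperties` and the assertions
are illustrative stand-ins, not code from this change.

    from pydantic import BaseModel, field_validator


    class TaskProperties(BaseModel):
        # illustrative stand-in for VllmRbProperties, modeling only the `task` config
        task: str = 'auto'

        @field_validator('task')
        @classmethod
        def validate_task(cls, task: str) -> str:
            # for backwards compatibility, map the HF-style task name to vLLM's
            if task == 'text-generation':
                task = 'generate'
            return task


    # expected behavior: "text-generation" is rewritten, other values pass through unchanged
    assert TaskProperties(task='text-generation').task == 'generate'
    assert TaskProperties(task='auto').task == 'auto'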