From 1d0528198ac3de227e53c8c36bbd12e538d00a07 Mon Sep 17 00:00:00 2001
From: Siddharth Venkatesan
Date: Sun, 2 Feb 2025 20:35:19 -0800
Subject: [PATCH] fix lmi/vllm virtual envs, update to vllm 0.7.1 (#2703)

---
 .../chat_completions/vllm_chat_utils.py       | 20 ++++----------------
 .../properties_manager/vllm_rb_properties.py  | 10 ++++++++++
 .../lmi-container-requirements-common.txt     |  9 ++++-----
 serving/docker/requirements-lmi.txt           |  1 +
 serving/docker/requirements-vllm.txt          |  3 ++-
 serving/docker/scripts/create_virtual_env.sh  |  8 +-------
 tests/integration/llm/client.py               |  5 +++++
 tests/integration/tests.py                    |  1 +
 8 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py b/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py
index ebe5d3863..68b646052 100644
--- a/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py
+++ b/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py
@@ -21,16 +21,11 @@
                                           resolve_chat_template_content_format)
 
 
-def is_chat_completions_request(inputs: Dict) -> bool:
-    return "messages" in inputs
-
-
 def parse_chat_completions_request_vllm(
         input_map: Dict,
         is_rolling_batch: bool,
         rolling_batch,
         tokenizer,
-        chat_template: Optional[str] = None,
         configs: Properties = None,
         is_mistral_tokenizer: bool = False,
 ):
@@ -41,12 +36,6 @@ def parse_chat_completions_request_vllm(
             "You must enable rolling batch to use the chat completions format."
         )
 
-    if not is_mistral_tokenizer and not hasattr(tokenizer,
-                                                "apply_chat_template"):
-        raise AttributeError(
-            f"Cannot provide chat completion for tokenizer: {tokenizer.__class__}, "
-            f"please ensure that your tokenizer supports chat templates.")
-
     tool_parser = rolling_batch.get_tool_parser()
     chat_params = ChatProperties(**input_map)
 
@@ -85,16 +74,15 @@ def parse_chat_completions_request_vllm(
     if is_mistral_tokenizer:
         text_inputs = apply_mistral_chat_template(
             tokenizer,
-            messages=chat_params.messages,
-            chat_template=chat_template,
-            add_generation_prompt=True,
+            chat_params.messages,
+            None,
             tools=tool_dicts,
         )
     else:
         text_inputs = apply_hf_chat_template(
             tokenizer,
-            conversation=conversation,
-            chat_template=chat_template,
+            conversation,
+            None,
             add_generation_prompt=True,
             tools=tool_dicts,
         )
diff --git a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
index 4309467ae..8fae3f5f6 100644
--- a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
+++ b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
@@ -66,6 +66,7 @@ class VllmRbProperties(Properties):
     # The following configs have different defaults, or additional processing in DJL compared to vLLM
     dtype: str = "auto"
     max_loras: int = 4
+    task: str = 'auto'
     # The following configs have broken processing in vllm via the FlexibleArgumentParser
     long_lora_scaling_factors: Optional[Tuple[float, ...]] = None
     use_v2_block_manager: bool = True
@@ -89,6 +90,14 @@ def validate_engine(cls, engine):
                 f"Need python engine to start vLLM RollingBatcher")
         return engine
 
+    @field_validator('task')
+    def validate_task(cls, task):
+        # TODO: conflicts between HF and VLLM tasks, need to separate these.
+        # for backwards compatibility, map text-generation to generate
+        if task == 'text-generation':
+            task = 'generate'
+        return task
+
     @field_validator('dtype')
     def validate_dtype(cls, val):
         if val not in DTYPE_MAPPER:
@@ -114,6 +123,7 @@ def validate_tool_call_parser(self):
             raise ValueError(
                 f"Invalid tool call parser: {self.tool_call_parser} "
                 f"(chose from {{ {','.join(valid_tool_parses)} }})")
+        return self
 
     @field_validator('override_neuron_config', mode="before")
     def validate_override_neuron_config(cls, val):
diff --git a/serving/docker/lmi-container-requirements-common.txt b/serving/docker/lmi-container-requirements-common.txt
index a351b5628..e0da26dbc 100644
--- a/serving/docker/lmi-container-requirements-common.txt
+++ b/serving/docker/lmi-container-requirements-common.txt
@@ -1,6 +1,6 @@
-peft==0.13.2
+peft
 protobuf==3.20.3
-transformers==4.45.2
+transformers>=4.45.2
 hf-transfer
 zstandard
 datasets==3.0.1
@@ -23,9 +23,8 @@ onnx
 sentence_transformers
 onnxruntime-gpu==1.20.0
 autoawq==0.2.5
-llmcompressor==0.3.1
-tokenizers==0.20.3
-pydantic==2.9.2
+tokenizers>=0.20.3
+pydantic>=2.9.2
 optimum==1.23.2
 torch==2.5.1
 torchvision==0.20.1
diff --git a/serving/docker/requirements-lmi.txt b/serving/docker/requirements-lmi.txt
index 87ec8c0ac..387a70b43 100644
--- a/serving/docker/requirements-lmi.txt
+++ b/serving/docker/requirements-lmi.txt
@@ -1,4 +1,5 @@
 -r requirements-common.txt
+llmcompressor
 # flash infer kernels for vllm/lmi-dist
 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp311-cp311-linux_x86_64.whl
 # vllm wheel built with pt2.5.1
diff --git a/serving/docker/requirements-vllm.txt b/serving/docker/requirements-vllm.txt
index 2c1fe83f5..7977615bd 100644
--- a/serving/docker/requirements-vllm.txt
+++ b/serving/docker/requirements-vllm.txt
@@ -1,2 +1,3 @@
 -r requirements-common.txt
-vllm==0.7.0
\ No newline at end of file
+llmcompressor
+vllm==0.7.1
\ No newline at end of file
diff --git a/serving/docker/scripts/create_virtual_env.sh b/serving/docker/scripts/create_virtual_env.sh
index d97bc31c6..04e7abf18 100755
--- a/serving/docker/scripts/create_virtual_env.sh
+++ b/serving/docker/scripts/create_virtual_env.sh
@@ -7,12 +7,6 @@ requirements_file=$2
 # This was copied over from the previous pip install defined in the lmi.Dockerfile, so it's specific to that Dockerfile
 python -m venv --system-site-packages $venv_directory
 venv_pip="${venv_directory}/bin/pip"
-$venv_pip install -r $requirements_file
+$venv_pip install -r $requirements_file || exit 1
 $venv_pip install https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl --no-deps
-git clone https://github.com/neuralmagic/AutoFP8.git
-cd AutoFP8
-git reset --hard 4b2092c
-$venv_pip install .
-cd ..
-rm -rf AutoFP8
 $venv_pip cache purge
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 7f705f516..b99111cfa 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -602,6 +602,11 @@ def get_model_name():
         "seq_length": [256],
         "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16"
     },
+    "mistral-7b": {
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16",
+    }
 }
 
 vllm_tool_model_spec = {
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index 58762f4de..e15e12aec 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -571,6 +571,7 @@ def test_mistral_7b(self):
             prepare.build_vllm_model("mistral-7b")
             r.launch()
             client.run("vllm mistral-7b".split())
+            client.run("vllm_chat mistral-7b".split())
 
     def test_phi2(self):
         with Runner('lmi', 'phi-2') as r:
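
Reviewer note (not part of the patch): the new `task` config added to VllmRbProperties maps the
legacy HF-style task name "text-generation" onto vLLM's "generate" task. Below is a minimal,
self-contained sketch of that mapping, assuming pydantic v2; `TaskProperties` and the assertions
are illustrative stand-ins, not code from this change.

    from pydantic import BaseModel, field_validator


    class TaskProperties(BaseModel):
        # illustrative stand-in for VllmRbProperties, modeling only the `task` config
        task: str = 'auto'

        @field_validator('task')
        @classmethod
        def validate_task(cls, task: str) -> str:
            # for backwards compatibility, map the HF-style task name to vLLM's
            if task == 'text-generation':
                task = 'generate'
            return task


    # expected behavior: "text-generation" is rewritten, other values pass through unchanged
    assert TaskProperties(task='text-generation').task == 'generate'
    assert TaskProperties(task='auto').task == 'auto'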