From dc08153b59538250e6b3e4068141b2a37bba75d4 Mon Sep 17 00:00:00 2001
From: Siddharth Venkatesan
Date: Fri, 31 Jan 2025 13:36:26 -0800
Subject: [PATCH] =?UTF-8?q?[docker]=20separate=20vllm=20and=20lmi-dist=20m?=
 =?UTF-8?q?odes=20into=20separate=20virtual=20envir=E2=80=A6=20(#2690)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../chat_completions/vllm_chat_utils.py       | 15 ++++++---
 .../rolling_batch/lmi_dist_rolling_batch.py   |  8 ++---
 .../lmi-container-requirements-common.txt     | 32 ++++++++++++++++++
 serving/docker/lmi.Dockerfile                 | 25 ++++++-------
 serving/docker/requirements-lmi.txt           | 33 +------------------
 serving/docker/requirements-vllm.txt          |  2 ++
 serving/docker/scripts/create_virtual_env.sh  | 18 ++++++++++
 .../djl/serving/wlm/LmiConfigRecommender.java | 16 +++++++++
 8 files changed, 92 insertions(+), 57 deletions(-)
 create mode 100644 serving/docker/lmi-container-requirements-common.txt
 create mode 100644 serving/docker/requirements-vllm.txt
 create mode 100755 serving/docker/scripts/create_virtual_env.sh

diff --git a/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py b/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py
index fddd85585..ebe5d3863 100644
--- a/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py
+++ b/engines/python/setup/djl_python/chat_completions/vllm_chat_utils.py
@@ -15,10 +15,10 @@
 from djl_python.chat_completions.vllm_chat_properties import ChatProperties
 from djl_python.properties_manager.properties import Properties
 from djl_python.rolling_batch.rolling_batch_vllm_utils import maybe_serialize_tool_calls
-from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
-                                         apply_hf_chat_template,
+from vllm.entrypoints.chat_utils import (apply_hf_chat_template,
                                          apply_mistral_chat_template,
-                                         parse_chat_messages)
+                                         parse_chat_messages,
+                                         resolve_chat_template_content_format)
 
 
 def is_chat_completions_request(inputs: Dict) -> bool:
@@ -70,9 +70,16 @@ def parse_chat_completions_request_vllm(
     tool_dicts = None if chat_params.tools is None else [
         tool.model_dump() for tool in chat_params.tools
     ]
+    # TODO - figure out what we need to pass for given format
+    content_format = resolve_chat_template_content_format(
+        chat_template=None,
+        given_format="auto",
+        tokenizer=tokenizer,
+    )
 
     conversation, mm_data = parse_chat_messages(
-        chat_params.messages, rolling_batch.get_model_config(), tokenizer)
+        chat_params.messages, rolling_batch.get_model_config(), tokenizer,
+        content_format)
 
     prompt_data: Union[str, List[int]]
     if is_mistral_tokenizer:
diff --git a/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py b/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py
index 773b3e3eb..1ef86d98a 100644
--- a/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py
+++ b/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py
@@ -132,12 +132,8 @@ def get_model_config(self):
         return self.engine.preprocessor.model_config if not self.is_t5_model else None
 
     def use_vllm_chat_completions(self):
-        return True
-
-    def get_huggingface_model_config(self):
-        # TODO: this is a hack right now to get the model config from the engine. We should expose this as
-        # an interface method and retrieve it from there after v12
-        return self.engine.preprocessor.model_config.hf_config if not self.is_t5_model else None
+        # vllm chat parsing requires 0.7.0 currently, lmi-dist is on 0.6.3.post1
+        return False
 
     def get_huggingface_model_config(self):
         # TODO: this is a hack right now to get the model config from the engine. We should expose this as
diff --git a/serving/docker/lmi-container-requirements-common.txt b/serving/docker/lmi-container-requirements-common.txt
new file mode 100644
index 000000000..0d4f69dde
--- /dev/null
+++ b/serving/docker/lmi-container-requirements-common.txt
@@ -0,0 +1,32 @@
+peft==0.13.2
+protobuf==3.20.3
+transformers==4.45.2
+hf-transfer
+zstandard
+datasets==3.0.1
+mpi4py
+sentencepiece
+tiktoken
+blobfile
+einops
+accelerate==1.0.1
+bitsandbytes==0.44.1
+auto-gptq==0.7.1
+pandas
+pyarrow
+jinja2
+retrying
+opencv-contrib-python-headless
+safetensors
+scipy
+onnx
+sentence_transformers
+onnxruntime-gpu==1.20.0
+autoawq==0.2.5
+tokenizers==0.20.3
+pydantic==2.9.2
+optimum==1.23.2
+torch==2.5.1
+torchvision==0.20.1
+# sequence scheduler wheel for hf accelerate rolling batch
+https://publish.djl.ai/seq_scheduler/seq_scheduler-0.1.0-py3-none-any.whl
\ No newline at end of file
diff --git a/serving/docker/lmi.Dockerfile b/serving/docker/lmi.Dockerfile
index 6abaa8a41..20e93b8da 100644
--- a/serving/docker/lmi.Dockerfile
+++ b/serving/docker/lmi.Dockerfile
@@ -73,9 +73,6 @@ COPY config.properties /opt/djl/conf/config.properties
 COPY partition /opt/djl/partition
 COPY scripts/telemetry.sh /opt/djl/bin
 
-COPY distribution[s]/ ./
-RUN mv *.deb djl-serving_all.deb || true
-
 RUN apt-get update && apt-get install -yq libaio-dev libopenmpi-dev g++ unzip cuda-compat-12-4 \
     && scripts/install_openssh.sh \
     && scripts/install_python.sh ${python_version} \
@@ -84,24 +81,22 @@ RUN apt-get update && apt-get install -yq libaio-dev libopenmpi-dev g++ unzip cu
     && apt-get clean -y \
     && rm -rf /var/lib/apt/lists/*
 
-COPY requirements-lmi.txt ./requirements.txt
-RUN pip3 install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu124 && pip3 cache purge
-RUN pip3 install -r requirements.txt \
-    && pip3 install ${djl_converter_wheel} --no-deps \
-    && git clone https://github.com/neuralmagic/AutoFP8.git \
-    && cd AutoFP8 \
-    && git reset --hard 4b2092c \
-    && pip3 install . \
-    && cd .. \
-    && rm -rf AutoFP8 \
-    && pip3 cache purge
-
 RUN scripts/patch_oss_dlc.sh python \
     && scripts/security_patch.sh lmi \
     && useradd -m -d /home/djl djl \
     && chown -R djl:djl /opt/djl \
     && apt-get clean -y && rm -rf /var/lib/apt/lists/*
 
+COPY lmi-container-requirements-common.txt ./requirements-common.txt
+COPY requirements-lmi.txt ./requirements-lmi.txt
+COPY requirements-vllm.txt ./requirements-vllm.txt
+RUN pip3 install -r requirements-common.txt \
+    && scripts/create_virtual_env.sh /opt/djl/vllm_venv requirements-vllm.txt \
+    && scripts/create_virtual_env.sh /opt/djl/lmi_dist_venv requirements-lmi.txt
+
+COPY distribution[s]/ ./
+RUN mv *.deb djl-serving_all.deb || true
+
 RUN scripts/install_djl_serving.sh $djl_version $djl_serving_version ${djl_torch_version} \
     && djl-serving -i ai.djl.onnxruntime:onnxruntime-engine:$djl_version \
     && djl-serving -i com.microsoft.onnxruntime:onnxruntime_gpu:$djl_onnx_version
diff --git a/serving/docker/requirements-lmi.txt b/serving/docker/requirements-lmi.txt
index 38f492378..87ec8c0ac 100644
--- a/serving/docker/requirements-lmi.txt
+++ b/serving/docker/requirements-lmi.txt
@@ -1,35 +1,4 @@
-peft==0.13.2
-protobuf==3.20.3
-transformers==4.45.2
-hf-transfer
-zstandard
-datasets==3.0.1
-mpi4py
-sentencepiece
-tiktoken
-blobfile
-einops
-accelerate==1.0.1
-bitsandbytes==0.44.1
-auto-gptq==0.7.1
-pandas
-pyarrow
-jinja2
-retrying
-opencv-contrib-python-headless
-safetensors
-scipy
-onnx
-sentence_transformers
-onnxruntime-gpu==1.20.0
-autoawq==0.2.5
-tokenizers==0.20.3
-pydantic==2.9.2
-optimum==1.23.2
-torch==2.5.1
-torchvision==0.20.1
-# sequence scheduler wheel for hf accelerate rolling batch
-https://publish.djl.ai/seq_scheduler/seq_scheduler-0.1.0-py3-none-any.whl
+-r requirements-common.txt
 # flash infer kernels for vllm/lmi-dist
 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp311-cp311-linux_x86_64.whl
 # vllm wheel built with pt2.5.1
diff --git a/serving/docker/requirements-vllm.txt b/serving/docker/requirements-vllm.txt
new file mode 100644
index 000000000..2c1fe83f5
--- /dev/null
+++ b/serving/docker/requirements-vllm.txt
@@ -0,0 +1,2 @@
+-r requirements-common.txt
+vllm==0.7.0
\ No newline at end of file
diff --git a/serving/docker/scripts/create_virtual_env.sh b/serving/docker/scripts/create_virtual_env.sh
new file mode 100755
index 000000000..d97bc31c6
--- /dev/null
+++ b/serving/docker/scripts/create_virtual_env.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+# used in the dockerfiles to create virtualenvs per engine
+# currently only intended for use in lmi.Dockerfile, need to refactor this to work for trtllm/neuron if needed
+venv_directory=$1
+requirements_file=$2
+
+# This was copied over from the previous pip install defined in the lmi.Dockerfile, so it's specific to that Dockerfile
+python -m venv --system-site-packages $venv_directory
+venv_pip="${venv_directory}/bin/pip"
+$venv_pip install -r $requirements_file
+$venv_pip install https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl --no-deps
+git clone https://github.com/neuralmagic/AutoFP8.git
+cd AutoFP8
+git reset --hard 4b2092c
+$venv_pip install .
+cd ..
+rm -rf AutoFP8 +$venv_pip cache purge diff --git a/wlm/src/main/java/ai/djl/serving/wlm/LmiConfigRecommender.java b/wlm/src/main/java/ai/djl/serving/wlm/LmiConfigRecommender.java index f44e7412d..ea8fe3532 100644 --- a/wlm/src/main/java/ai/djl/serving/wlm/LmiConfigRecommender.java +++ b/wlm/src/main/java/ai/djl/serving/wlm/LmiConfigRecommender.java @@ -43,6 +43,7 @@ static void configure(Properties lmiProperties, LmiUtils.HuggingFaceModelConfig setRollingBatchSize(lmiProperties); setIsPeftModel(lmiProperties, modelConfig); setPropertiesForLora(lmiProperties); + setPythonExecutable(lmiProperties); } private static void setRollingBatch( @@ -206,4 +207,19 @@ private static boolean isTextGenerationModel(LmiUtils.HuggingFaceModelConfig mod OPTIMIZED_TASK_ARCHITECTURES); return false; } + + private static void setPythonExecutable(Properties lmiProperties) { + if (lmiProperties.containsKey("option.pythonExecutable")) { + return; + } + String rollingBatch = lmiProperties.getProperty("option.rolling_batch"); + if ("vllm".equals(rollingBatch)) { + lmiProperties.setProperty("option.pythonExecutable", "/opt/djl/vllm_venv/bin/python"); + return; + } + if ("lmi-dist".equals(rollingBatch)) { + lmiProperties.setProperty( + "option.pythonExecutable", "/opt/djl/lmi_dist_venv/bin/python"); + } + } }
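
Runtime selection sketch (illustrative, not part of the diff above): setPythonExecutable() in LmiConfigRecommender only fills in option.pythonExecutable when the user has not set it, keying off option.rolling_batch. Assuming the venv paths baked into lmi.Dockerfile by this patch, and that these option.* keys live in a model's serving.properties as usual, the resolution looks roughly like this:

    # user-provided key                     ->  value the recommender adds
    option.rolling_batch=vllm               ->  option.pythonExecutable=/opt/djl/vllm_venv/bin/python
    option.rolling_batch=lmi-dist           ->  option.pythonExecutable=/opt/djl/lmi_dist_venv/bin/python
    option.pythonExecutable=<custom path>   ->  left untouched (an explicit value always wins)

Any other rolling_batch value falls through without setting pythonExecutable, so the container's default interpreter (which carries only the packages from lmi-container-requirements-common.txt) would be used.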