From 5c100ac105bc1984df13ebd129874d148b8a4b20 Mon Sep 17 00:00:00 2001
From: Shaojun Liu <61072813+liu-shaojun@users.noreply.github.com>
Date: Thu, 27 Feb 2025 17:33:58 +0800
Subject: [PATCH] Add ENTRYPOINT to Dockerfile to auto-start vllm service on
 container launch (for CVTE customer) (#12901)

* Add ENTRYPOINT to Dockerfile to auto-start service on container launch (for CVTE client)

* Update start-vllm-service.sh

* Update README.md

* Update README.md

* Update start-vllm-service.sh

* Update README.md
---
 docker/llm/serving/xpu/docker/Dockerfile      |  3 +-
 docker/llm/serving/xpu/docker/README.md       | 67 +++++++++++++++++--
 .../serving/xpu/docker/start-vllm-service.sh  | 19 ++++--
 3 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile
index 3455d4f36f6..599403d46f9 100644
--- a/docker/llm/serving/xpu/docker/Dockerfile
+++ b/docker/llm/serving/xpu/docker/Dockerfile
@@ -13,7 +13,7 @@ ARG PIP_NO_CACHE_DIR=false
 ENV TZ=Asia/Shanghai PYTHONUNBUFFERED=1
 
 # Copy patch file and benchmark scripts
-ADD ./ccl_torch.patch /tmp/
+COPY ./ccl_torch.patch /tmp/
 COPY ./vllm_online_benchmark.py ./vllm_offline_inference.py ./vllm_offline_inference_vision_language.py \
      ./payload-1024.lua ./start-vllm-service.sh ./benchmark_vllm_throughput.py ./benchmark_vllm_latency.py \
      ./start-pp_serving-service.sh /llm/
@@ -165,3 +165,4 @@ RUN set -eux && \
     pip install ray
 
 WORKDIR /llm/
+ENTRYPOINT ["bash", "/llm/start-vllm-service.sh"]
diff --git a/docker/llm/serving/xpu/docker/README.md b/docker/llm/serving/xpu/docker/README.md
index cf4797f73ad..c506dcad014 100644
--- a/docker/llm/serving/xpu/docker/README.md
+++ b/docker/llm/serving/xpu/docker/README.md
@@ -1,6 +1,7 @@
 # IPEX-LLM-serving XPU Image: Build and Usage Guide
 
 This document outlines the steps to build and use the `IPEX-LLM-serving-xpu` Docker image, including inference, serving, and benchmarking functionalities for XPU.
+
 ---
 
 ## 1. Build the Image
@@ -62,21 +63,73 @@ For detailed instructions on running inference with `IPEX-LLM` on XPU, refer to
 
 To run XPU serving, you need to map the XPU into the container by specifying `--device=/dev/dri` when booting the container.
 
+### 3.1 **Start the Container and Automatically Launch the Service**
+
+By default, the container is configured to automatically start the service when it is run. You can also specify the model path, model name, and tensor parallel size using environment variables (MODEL_PATH, SERVED_MODEL_NAME, and TENSOR_PARALLEL_SIZE). This allows the service to start with the specific model and tensor parallel configuration you want to use. Additionally, make sure to mount the model directory into the container using the `-v` option.
+
 ### Example:
 
 ```bash
 #/bin/bash
-export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:latest
+export DOCKER_IMAGE=intelanalytics/ipex-llm-xpu:2.2.0-SNAPSHOT
 
 sudo docker run -itd \
         --net=host \
         --device=/dev/dri \
+        --memory="32G" \
         --name=CONTAINER_NAME \
         --shm-size="16g" \
+        -e MODEL_PATH="/llm/models" \
+        -e SERVED_MODEL_NAME="my_model" \
+        -e TENSOR_PARALLEL_SIZE=4 \
+        -v /home/intel/LLM/:/llm/models/ \
         $DOCKER_IMAGE
 ```
 
-After the container starts, access it using `docker exec`.
+- This command will start the container and automatically launch the service with the specified model path (`/llm/models`), model name (`my_model`), and tensor parallel size (`4`).
+- The `-e TENSOR_PARALLEL_SIZE=4` option specifies the number of GPUs (or cards) on which the service will run. You can adjust this value based on your parallelism needs.
+- The `-v /home/intel/LLM/:/llm/models/` option mounts the model directory from the host (`/home/intel/LLM/`) to the container (`/llm/models/`).
+
+Once the container is running, the service will be launched automatically based on the provided model or the default settings.
+
+#### View Logs:
+
+To view the logs of the container and monitor the service startup, you can use the following command:
+
+```bash
+docker logs CONTAINER_NAME
+```
+
+This will display the logs generated by the service, allowing you to check if everything is running as expected.
+
+### 3.2 **Start the Container and Manually Launch the Service**
+
+If you prefer to manually start the service or need to troubleshoot, you can override the entrypoint with `/bin/bash` when starting the container. This allows you to enter the container and run commands interactively. Use the following command:
+
+### Example:
+
+```bash
+#/bin/bash
+export DOCKER_IMAGE=intelanalytics/ipex-llm-xpu:2.2.0-SNAPSHOT
+
+sudo docker run -itd \
+        --net=host \
+        --device=/dev/dri \
+        --memory="32G" \
+        --name=CONTAINER_NAME \
+        --shm-size="16g" \
+        --entrypoint /bin/bash \
+        -v /home/intel/LLM/:/llm/models/ \
+        $DOCKER_IMAGE
+```
+
+After running this command, the container will start and drop you into an interactive shell (`bash`). From there, you can manually start the service by running:
+
+```bash
+bash /llm/start-vllm-service.sh
+```
+
+This option provides more control over the container and allows you to start the service at your convenience.
 
 To verify that the device is correctly mapped, run:
 
@@ -88,9 +141,9 @@ The output will be similar to the example in the inference section above.
 
 Currently, the image supports two different serving engines: **FastChat** and **vLLM**.
 
-### Serving Engines
+### 3.3 Serving Engines
 
-#### 3.1 Lightweight Serving Engine
+#### 3.3.1 Lightweight Serving Engine
 
 For running lightweight serving on Intel GPUs using `IPEX-LLM` as the backend, refer to the [Lightweight-Serving README](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Lightweight-Serving).
 
@@ -100,7 +153,7 @@ We have included a script `/llm/start-lightweight_serving-service` in the image.
 pip install transformers==4.37.0
 ```
 
-#### 3.2 Pipeline Parallel Serving Engine
+#### 3.3.2 Pipeline Parallel Serving Engine
 
 To use the **Pipeline Parallel** serving engine with `IPEX-LLM` as the backend, refer to this [Pipeline-Parallel-FastAPI README](https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/Pipeline-Parallel-FastAPI).
 
@@ -110,7 +163,7 @@ A convenience script `/llm/start-pp_serving-service.sh` is included in the image.
 pip install transformers==4.37.0
 ```
 
-#### 3.3 vLLM Serving Engine
+#### 3.3.3 vLLM Serving Engine
 
 For running the **vLLM engine** with `IPEX-LLM` as the backend, refer to this [vLLM Docker Quickstart Guide](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/DockerGuides/vllm_docker_quickstart.md).
 
@@ -212,4 +265,4 @@ python3 /llm/benchmark_vllm_throughput.py \
   --gpu-memory-utilization 0.85
 ```
 
----
\ No newline at end of file
+---
diff --git a/docker/llm/serving/xpu/docker/start-vllm-service.sh b/docker/llm/serving/xpu/docker/start-vllm-service.sh
index 576c3b4a42a..139a088ce5e 100644
--- a/docker/llm/serving/xpu/docker/start-vllm-service.sh
+++ b/docker/llm/serving/xpu/docker/start-vllm-service.sh
@@ -1,6 +1,11 @@
 #!/bin/bash
-model="YOUR_MODEL_PATH"
-served_model_name="YOUR_MODEL_NAME"
+MODEL_PATH=${MODEL_PATH:-"default_model_path"}
+SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"default_model_name"}
+TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:-1}  # Default to 1 if not set
+
+echo "Starting service with model: $MODEL_PATH"
+echo "Served model name: $SERVED_MODEL_NAME"
+echo "Tensor parallel size: $TENSOR_PARALLEL_SIZE"
 
 export CCL_WORKER_COUNT=2
 export SYCL_CACHE_PERSISTENT=1
@@ -19,9 +24,9 @@ export CCL_BLOCKING_WAIT=0
 
 source /opt/intel/1ccl-wks/setvars.sh
 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
-  --served-model-name $served_model_name \
+  --served-model-name $SERVED_MODEL_NAME \
   --port 8000 \
-  --model $model \
+  --model $MODEL_PATH \
   --trust-remote-code \
   --block-size 8 \
   --gpu-memory-utilization 0.95 \
@@ -29,9 +34,9 @@ python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --dtype float16 \
   --enforce-eager \
   --load-in-low-bit fp8 \
-  --max-model-len 2048 \
-  --max-num-batched-tokens 4000 \
+  --max-model-len 2000 \
+  --max-num-batched-tokens 3000 \
   --max-num-seqs 256 \
-  --tensor-parallel-size 1 \
+  --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
   --disable-async-output-proc \
   --distributed-executor-backend ray
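
For anyone reviewing or deploying this change, the sketch below shows one way to smoke-test the new auto-start behavior end to end. It reuses the image tag and environment values from the README example in this patch; the container name is an arbitrary placeholder, and the `/v1/models` and `/v1/completions` routes are the standard endpoints of vLLM's OpenAI-compatible server (which `start-vllm-service.sh` exposes on port 8000), not something added by this patch.

```bash
#!/bin/bash
# Hypothetical smoke test for the auto-start ENTRYPOINT. The image tag, host model
# directory, and served model name come from the README example above; adjust them
# to match your environment.
export DOCKER_IMAGE=intelanalytics/ipex-llm-xpu:2.2.0-SNAPSHOT

# Start the container; the new ENTRYPOINT runs /llm/start-vllm-service.sh automatically.
sudo docker run -itd \
        --net=host \
        --device=/dev/dri \
        --memory="32G" \
        --name=vllm-autostart-test \
        --shm-size="16g" \
        -e MODEL_PATH="/llm/models" \
        -e SERVED_MODEL_NAME="my_model" \
        -e TENSOR_PARALLEL_SIZE=4 \
        -v /home/intel/LLM/:/llm/models/ \
        $DOCKER_IMAGE

# Wait until the OpenAI-compatible server answers on port 8000
# (the port hard-coded in start-vllm-service.sh).
until curl -sf http://localhost:8000/v1/models > /dev/null; do
    echo "Waiting for the vLLM service to come up..."
    sleep 10
done

# Send a test completion; "model" must match SERVED_MODEL_NAME.
curl -s http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "my_model", "prompt": "What is AI?", "max_tokens": 32}'
```

If the request fails, `docker logs vllm-autostart-test` (as described in section 3.1 of the updated README) is the first place to look for startup errors.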