[Hardware][Intel-Gaudi] Add Intel Gaudi (HPU) inference backend #6143
Merged

Commits (showing changes from 250 of 442 commits)
d2e2854 fix rotary embedding (jikunshang)
97bd0fd Avoiding torch.index_select for embedding LoRA–B (SanjuCSudhakaran)
ededdaf Remove special handling of no-LoRA case (SanjuCSudhakaran)
b507cc4 Update test (SanjuCSudhakaran)
016f343 Fix formatting (SanjuCSudhakaran)
d9fa7cf Dispersed dummy slots (#243) (madamczykhabana)
7488c58 Use PT_COMPILE_ONLY_MODE during warmup (#227) (mfylcek)
17447ed Do not pass warmup_mode to execute_model_kwargs (#229) (kzawora-intel)
b50aa14 Add error handling for PT_COMPILE_ONLY_MODE (#251) (kzawora-intel)
00f1333 Hardcode fastapi version due to pydantic error (#255) (hlahkar)
b764610 Mask based BGMV implementation for LoRA Embedding (#247) (vivekgoe)
73af823 Eliminate graph breaks for torch.compile mode (#202) (yuwenzho)
5cf8441 Port flat PA from habana_next to habana_main (#169) (dolszewska)
2fed15b Merge remote-tracking branch 'upstream/main' into HEAD (kzawora-intel)
f74fe23 Merge remote-tracking branch 'origin/habana_main' into HEAD (kzawora-intel)
e2c8b5a format.sh (kzawora-intel)
4194195 i did not drink my afternoon coffee and made an oopsie (kzawora-intel)
4052bdb Add disable_tensor_cache=True to HPUGraph capture (#252) (kzawora-intel)
c9bf908 do not build core ext on hpu (kzawora-intel)
69df1e7 Fix dispersed slots (#261) (madamczykhabana)
53f96b7 Skip compilation warnings during warmup phase (#262) (jkaniecki)
d436d38 fix tensor parallelism (kzawora-intel)
61b6fbb add missing functions (kzawora-intel)
2091161 Port PT Profiler to habana_main (#256) (adobrzyniewicz-habana)
c9bdcbe Merge remote-tracking branch 'origin/habana_main' into private/kzawor… (kzawora-intel)
8e41fb5 Merge remote-tracking branch 'upstream/main' into private/kzawora/vll… (kzawora-intel)
68e0f57 Reduce frequency of garbage collector (kwisniewski98)
b776d5e Fix LoRA test by handling mask creation inside the test (SanjuCSudhakaran)
c0ff22f Fix LoRA test by handling mask creation inside the test (#270) (vivekgoe)
f858d43 Attn MetaData dtype should be same as model dtype (#271) (hlahkar)
acf7d54 Support Mixtral quantization using INC (#267) (dudilester)
6a734f4 Fixed ALiBi (#254) (itaraban)
543bb6d Update gaudi-installation.rst (#279) (dolszewska)
c2c1e0f Move setting gc threshold to separate function (kwisniewski98)
6b3503c Fix mypy issues (kwisniewski98)
8535d53 Fix line too long (kwisniewski98)
27b618a Format files (kwisniewski98)
35a4a98 Remove hardcoded value from softmax in flat_pa (#280) (madamczykhabana)
046cb25 Fix yapf detected format issue (xuechendi)
aa4c59c some update to vision model (xuechendi)
181babf resolve conflicts (xuechendi)
88b06c2 Increase garbage collector's threshold (#281) (kwisniewski98)
54c1688 [Bugfix][Habana_main] fix guided_decode HPU failing issue (#236) (michalkuligowski)
8a92591 fix rotary embedding `rotary_dim` not equal `head_size` case (#245) (michalkuligowski)
ffa7174 [Bugfix][Habana_main] - dbrx model and arctic model codes fix to remo… (michalkuligowski)
f4ac1f9 Add Dockerfile.hpu (#200) (michalkuligowski)
1a35da2 fix ruff detected format error (xuechendi)
3b710a6 fix mypy format error (xuechendi)
5abe4d7 Move ALiBi to supported features in README_GAUDI.md (kwisniewski98)
4c1ca3a optimized topp/topk calculation (#195) (michalkuligowski)
1a712d5 Move ALiBi to supported features in gaudi-installation.rst (kwisniewski98)
44c4f93 [Bugfix][Habana_main] fix multi-modal model inference - tested with l… (michalkuligowski)
a9de5ba Add fake HPU mode to Habana components with dummy habana_frameworks m… (jmaksymczuk)
d39298c Update documentation on support of fp8 (#288) (michalkuligowski)
ed19acd Reduce default value of VLLM_GRAPH_RESERVED_MEM to 0.1 (kzawora-intel)
6a96d9b Removed vllm.hpu directory and changed relevant imports (#291) (tzielinski-habana)
47a89be Reduce default value of VLLM_GRAPH_RESERVED_MEM to 0.1 (#292) (michalkuligowski)
18d6339 fix minor logging issue (schoi-habana)
83b54e9 Fix minor logging issue in habana_model_runner.py (#294) (michalkuligowski)
b62fba8 Fix blocks number calculation for Flat PA (#269) (iboiko-habana)
347f9c7 Merge branch 'habana_main' into private/kwisniewski/alibi_readme_update (kwisniewski98)
cd7b1c1 Remove dummy seq group data creation from loop (#301) (iboiko-habana)
12d7033 optimize qwen2 model on Gaudi (#233) (czhu15)
bc39baa fix bug: device_str in initialize_ray_cluster requires uppercase stri… (hlin99)
b2653ab Fix Lora Rebase (#290) (hlahkar)
82960d8 Merge remote-tracking branch 'origin/habana_main' into HEAD (kzawora-intel)
f4d2097 Merge remote-tracking branch 'upstream/main' into HEAD (kzawora-intel)
9f8b8e7 add missing files (kzawora-intel)
346139d format.sh (kzawora-intel)
6d45443 more format.sh (kzawora-intel)
3a0ff3b gha update (kzawora-intel)
6502b91 Separate LoRA algorithms (kzawora-intel)
7057da5 yapf is being a headache (kzawora-intel)
43df762 oh come on now (kzawora-intel)
3134b8a fix fakehpu mode (kzawora-intel)
f92ffc1 Fix calculating slots for warmup (#310) (madamczykhabana)
63fae51 Removed padding block from a list of available blocks in allocators (… (tzielinski-habana)
aa507d4 Fix seq_len for padding sequences (#318) (madamczykhabana)
b70a8c2 Merge remote-tracking branch 'upstream/main' into HEAD (kzawora-intel)
a844837 Fix lora specific conditions in profile-run (SanjuCSudhakaran)
084db0f Fix lora specific conditions in profile-run (#317) (vivekgoe)
a9f94be TP fixes (kzawora-intel)
9bb65b7 Run with HPU graphs even when warmup was skipped (#320) (madamczykhabana)
2a499c7 mixtral api fixes (kzawora-intel)
9372734 revert debug prints (kzawora-intel)
c15ddd2 format.sh (kzawora-intel)
f5d254d Merge remote-tracking branch 'origin/habana_main' into HEAD (kzawora-intel)
e00ab5a Merge remote-tracking branch 'upstream/main' into HEAD (kzawora-intel)
3bb593a use ray for hpu distributed inference (kzawora-intel)
f9b222e vLLM 0.6.1 rebase (#311) (kzawora-intel)
2f23cb7 prune the easy parts (kzawora-intel)
28df6fd prune more easy parts (kzawora-intel)
c6d2d5a prune lora files (kzawora-intel)
97c398e prune unnecessary docs (kzawora-intel)
6a913b3 revert requirements-build.txt changes (kzawora-intel)
c64dc83 Move profilers to vllm-hpu-extension (#323) (kzawora-intel)
f56953f Merge remote-tracking branch 'origin/habana_main' into HEAD (kzawora-intel)
c562b02 Revert "Add fake HPU mode to Habana components with dummy habana_fram… (kzawora-intel)
cf3bbd2 fix revert (kzawora-intel)
09357b4 Revert "Initial commit" (kzawora-intel)
3713da8 cleanup (kzawora-intel)
bb6564a remove redundant import (kzawora-intel)
c968320 Restore upstream requirements-build.txt (#324) (kzawora-intel)
58d5cde Remove reminder_comment.yml workflow (#325) (kzawora-intel)
cf4c3e5 Don't throw "Failed to import from vllm._C" warning on HPU (#326) (kzawora-intel)
aa5edcc Merge remote-tracking branch 'origin/habana_main' into private/kzawor… (kzawora-intel)
f6ff4a7 restore reminder_comment.yml (kzawora-intel)
a000e62 Revert "[Doc][BugFix] Update setup instructions and reference links (… (kzawora-intel)
41217cf Fix doc build warnings (#330) (kzawora-intel)
4eb9809 fix qwen2 model issue (#329) (jikunshang)
c1232e9 Merge remote-tracking branch 'origin/habana_main' into private/kzawor… (kzawora-intel)
20c87dd update docs (kzawora-intel)
9be37a3 Remove vllm.utils.is_hpu() (#331) (kzawora-intel)
c90e153 Merge remote-trackng branch 'origin/habana_main' into private/kzawora… (kzawora-intel)
874f3d8 remove get_device (kzawora-intel)
e16918d Remove logger from layernorm (#332) (kzawora-intel)
18b0e98 Merge remote-tracking branch 'origin/habana_main' into private/kzawor… (kzawora-intel)
347380f Fix INC FP8 inference after rebase (kzawora-intel)
73f4b48 Fix INC FP8 inference after rebase (#333) (kzawora-intel)
fc1cf5e Merge remote-tracking branch 'origin/habana_main' into private/kzawor… (kzawora-intel)
e2f72e3 Merge remote-tracking branch 'upstream/main' into private/kzawora/pru… (kzawora-intel)
b582d77 Make weights_load_device not change EngineArgs.create_load_config() (kzawora-intel)
b90adac More robust load device autodetection (kzawora-intel)
d853eeb WA for none load device (kzawora-intel)
9111a80 Make weights_load_device not change EngineArgs.create_load_config() (… (kzawora-intel)
db8dbce device type (kzawora-intel)
c337e93 Revert "fix guided_decode HPU failing issue" (kzawora-intel)
e8e369f load device fix (kzawora-intel)
8c6dcae Refine INC shutdown code (#335) (kzawora-intel)
cef2f54 Setting enough cache_size_limit for torch.compile warmup (#238) (zehao-intel)
45ee586 Change default values for decode bucket flags (#316) (iboiko-habana)
29fb5ed Support loading checkpoints quantized using Autofp8 (#286) (Yantom1)
4c8a6c6 Fix torch.compile issue of dispatch key set mismatch (#299) (yuwenzho)
1c6bada Chunk prefill cache writes, remove div_i32 from insert_or_update_cach… (kzawora-intel)
fccaca0 Merge remote-tracking branch 'upstream/main' into HEAD (kzawora-intel)
5ffcfa3 Update cpu-test.yml (kzawora-intel)
c3577af Fix runtime errors reported when using long input sequence lengths wi… (vivekgoe)
f347a84 vLLM 0.6.2 rebase (#340) (kzawora-intel)
ed85058 Enable Async output process for HPU (#342) (zhouyu5)
b611e20 Port last_bucket change from v1.18.0 (#347) (iboiko-habana)
3010f8c Add setuptools_scm to requirements-hpu.txt (#349) (kzawora-intel)
44d8173 test_lora_manager fix (rsshaik1)
188bd3a Added both hpu and gpu specific changes confest (rsshaik1)
f59495a Added the changes to conftest to fix test_lora_manager (rsshaik1)
b0a9d02 Applied the format changes in conftest (rsshaik1)
70f544c Resolved format issues in conftest (rsshaik1)
ec34f88 Added changes of HPU flags (rsshaik1)
c7b1509 Fixed lora manager tests (#315) (vivekgoe)
cafff17 Merge remote-tracking branch 'upstream/main' into HEAD (kzawora-intel)
25f4ed9 Oct 01 rebase (#353) (kzawora-intel)
da03d8b Lora Mask based on lora index (#348) (hlahkar)
f848d27 Add rope_scaling support for LLama3.1 (#356) (kdamaszk)
d8ba780 [Core] Support Torch profiler in Habana Worker (#357) (mswiniarsk)
250487b [Refactor] Rename components *Habana* -> *HPU* (kzawora-intel)
eb095b3 oopsie (kzawora-intel)
65fa6f6 format.sh (kzawora-intel)
0576360 make yapf happy (kzawora-intel)
7f73cc9 Merge remote-tracking branch 'upstream/main' into private/kzawora/hab… (kzawora-intel)
b4e26d3 fix sampler metadata generation (kzawora-intel)
cfe231d [Refactor] Rename components *Habana* -> *HPU* (#359) (kzawora-intel)
38e60f4 Oct 04 rebase (#360) (kzawora-intel)
76cbbb5 Use BF16 on HPU by default (kzawora-intel)
95a7ece Merge remote-tracking branch 'origin/habana_main' into private/kzawor… (kzawora-intel)
d7d609f Revert "Support loading checkpoints quantized using Autofp8 (#286)" (kzawora-intel)
c07cbc6 remove lora test (kzawora-intel)
d90bbce revert FP8 changes (kzawora-intel)
84dc6c5 remove leftover fp8 code (kzawora-intel)
f7288de remove weights_load_device stuff (kzawora-intel)
6899c3f remove weights_load_device (kzawora-intel)
e5d640e fp8 leftovers (kzawora-intel)
25388e2 Update vllm/model_executor/layers/logits_processor.py (kzawora-intel)
b4f7ffa Rename HabanaAttention -> HPUAttention (kzawora-intel)
43959db oopsie (kzawora-intel)
b8404ad format.sh (kzawora-intel)
d38564f fix comment length (kzawora-intel)
eed1b05 Merge remote-tracking branch 'origin/private/kzawora/hpu_attn' into p… (kzawora-intel)
5c3e29c Merge remote-tracking branch 'origin/private/kzawora/hpu_bf16_default… (kzawora-intel)
33c1db0 fix comment (kzawora-intel)
05777e0 Lazily import HPU-dependent components (kzawora-intel)
1f6de5d Lazily import HPU-dependent components (#363) (kzawora-intel)
ad08dd4 [Refactor] Rename HabanaAttention -> HPUAttention (#362) (kzawora-intel)
e00750e Use BF16 on HPU by default (#361) (kzawora-intel)
db5aed6 Set vllm-hpu-extension to 36c7f9c (#365) (madamczykhabana)
902f575 Add AliBi to supported features in README_GAUDI.md (#287) (kzawora-intel)
27c05e1 Merge remote-tracking branch 'upstream/main' into habana_main (kzawora-intel)
bb4c23e format.sh (kzawora-intel)
563184a Fix hpu_set_env call in load_model in vllm (#364) (Yantom1)
0e46492 Update offline_inference_fakehpu.py (michalkuligowski)
6028354 Timeout adjusted in MLLMEngine (#368) (jczaja)
64369fd Add Jenkins test definitions (#369) (kzawora-intel)
69fb91c Merge remote-tracking branch 'origin/habana_main' into HEAD (kzawora-intel)
1ee20c5 Merge remote-tracking branch 'upstream/main' into HEAD (kzawora-intel)
388e500 Make workaround for SW-204785 broader (#374) (kzawora-intel)
8f79b6e Merge remote-tracking branch 'origin/habana_main' into HEAD (kzawora-intel)
ca98dae Fix LoRA tests by handling broken imports (SanjuCSudhakaran)
4030216 Fix LoRA tests by handling broken import (#376) (vivekgoe)
b70c1a5 [CI] Report test name, add properties to JUnitXML (#377) (kzawora-intel)
49444bc Disable performance counters if profiler is not enabled (#383) (kdamaszk)
d6bd375 Remove constraints for bucket creation during warmup in LoRA (SanjuCSudhakaran)
4f1787b Merge remote-tracking branch 'upstream/main' into HEAD (kzawora-intel)
6cd4694 Remove constraints for bucket creation during warmup in LoRA (#382) (vivekgoe)
d8f2aa7 seed_everything function doesn't handle HPU (#384) (SanjuCSudhakaran)
03b407b Fixed lora_manager tests with hpu_model_runner (#386) (rsshaik1)
ebd42c4 Reformat README_GAUDI.md (#389) (kzawora-intel)
2d2bf7a [CI] Prepare separate Jenkins tests for torch compile mode (#388) (anko-intel)
9df1d4a Remove workaround added to resolve multi-card stall issue (#387) (SanjuCSudhakaran)
9777c9f Update SynapseAI version in README & Dockerfile (#390) (kzawora-intel)
5ceda69 Merge remote-tracking branch 'origin/habana_main' into HEAD (kzawora-intel)
3e6a2d4 Merge remote-tracking branch 'upstream/main' into HEAD (kzawora-intel)
9ac52ab fix attention backend selector: (kzawora-intel)
57bc31d Oct 7 rebase (#367) (kzawora-intel)
55dd07e enable mixtral quantization using INC (#372) (dudilester)
401f5ae [CI] Temporarily increase test tolerances (#392) (kzawora-intel)
e598f3f Add quickstart section to READMEs (#391) (kzawora-intel)
f77435d Softmax: add weighted-sum normalization (#378) (madamczykhabana)
0783d18 Merge remote-tracking branch 'origin/habana_main' into HEAD (kzawora-intel)
2fa46cd remove jenkins files (kzawora-intel)
3683db6 restore README.md (kzawora-intel)
91af5da remove fakehpu (kzawora-intel)
d2ce468 use sentinel in model runner base WA (kzawora-intel)
b6428cd remove leftovers from habana_main (kzawora-intel)
5149278 remove leftovers from habana_main (kzawora-intel)
f4b356f remove HPUExecutorAsync import (kzawora-intel)
3eee00d remove hpu fused_moe (kzawora-intel)
a59fc7b Remove HPU changes from cache_engine.py (#400) (kzawora-intel)
c07951b Merge remote-tracking branch 'upstream/main' into HEAD (kzawora-intel)
398c5c3 Merge remote-tracking branch 'origin' into HEAD (kzawora-intel)
f79d454 Merge remote-tracking branch 'origin/habana_main' into HEAD (kzawora-intel)
8b6e30d remove hpuexecutor import (kzawora-intel)
05bcdf5 [bucketing overhaul 1/n] Add padding-aware scheduling and option to l… (kzawora-intel)
c11f23a Add forward_hpu to RotaryEmbedding, remove custom module (kzawora-intel)
78a816c add missing mark step in test (kzawora-intel)
640f0be Merge branch 'private/kzawora/rope_rework' into HEAD (kzawora-intel)
e894746 Merge branch 'private/kzawora/oct_16_rebase' into HEAD (kzawora-intel)
5bc3985 cleanup (kzawora-intel)
14f8af4 padding-aware scheduler cleanup (kzawora-intel)
65e34f6 fix sentinel usage in model runner base (kzawora-intel)
4757350 doc fixes (kzawora-intel)
ef6603c Update requirements-hpu.txt (kzawora-intel)
4c306cf Merge remote-tracking branch 'upstream/main' into HEAD (kzawora-intel)
f0d6c5c Merge remote-tracking branch 'origin/habana_upstream' into HEAD (kzawora-intel)
3043141 Merge branch 'main' into habana_upstream (kzawora-intel)
92e23fe Merge branch 'main' into habana_upstream (kzawora-intel)
397405b Merge branch 'main' into habana_upstream (kzawora-intel)
acec97b Merge branch 'main' into habana_upstream (kzawora-intel)
bc0bf43 Merge remote-tracking branch 'upstream/main' into HEAD (kzawora-intel)
01b190e Update docs/source/index.rst (kzawora-intel)
ede1280 Merge branch 'main' into habana_upstream (kzawora-intel)
bb512dd Merge remote-tracking branch 'upstream/main' into HEAD (kzawora-intel)
c9ce231 Conform to new worker/model_runner APIs (kzawora-intel)
Dockerfile.hpu (new file, +18 lines):

FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install -v -r requirements-hpu.txt

ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
requirements-hpu.txt (new file, +9 lines):

# Common dependencies
-r requirements-common.txt

# Dependencies for HPU code
ray == 2.32.0
triton
pandas
tabulate
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0a7adab
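For a bare-metal install outside Docker, the Dockerfile above implies the equivalent two steps; a sketch, assuming a working SynapseAI/PyTorch environment is already in place:

# Install HPU dependencies, then build vLLM targeting the HPU backend.
pip install -v -r requirements-hpu.txt
VLLM_TARGET_DEVICE=hpu python3 setup.py install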
Review discussion:

QQ: Does Gaudi support PyTorch 2.4 or later?

#8932

An image with PyTorch 2.4 is now available; 2.5 will also be supported shortly.