Merge branch 'vllm-project:main' into main

vllm-project · Jun 3, 2024 · 8f36146 · 8f36146
2 parents 5650b95 + 7a64d24
commit 8f36146
Show file tree

Hide file tree

Showing 113 changed files with 5,220 additions and 1,071 deletions.
diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
@@ -1,7 +1,7 @@
 import os
 import zipfile
 
-MAX_SIZE_MB = 150
+MAX_SIZE_MB = 200
 
 
 def print_top_10_largest_files(zip_file):

diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
@@ -5,6 +5,34 @@ set -ex
 echo "--- ROCm info"
 rocminfo
 
+# cleanup older docker images
+cleanup_docker() {
+  # Get Docker's root directory
+  docker_root=$(docker info -f '{{.DockerRootDir}}')
+  if [ -z "$docker_root" ]; then
+    echo "Failed to determine Docker root directory."
+    exit 1
+  fi
+  echo "Docker root directory: $docker_root"
+  # Check disk usage of the filesystem where Docker's root directory is located
+  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+  # Define the threshold
+  threshold=70
+  if [ "$disk_usage" -gt "$threshold" ]; then
+    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+    # Remove dangling images (those that are not tagged and not used by any container)
+    docker image prune -f
+    # Remove unused volumes
+    docker volume prune -f
+    echo "Docker images and volumes cleanup completed."
+  else
+    echo "Disk usage is below $threshold%. No cleanup needed."
+  fi
+}
+
+# Call the cleanup docker function
+cleanup_docker
+
 echo "--- Resetting GPUs"
 
 echo "reset" > /opt/amdgpu/etc/gpu_state

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -37,7 +37,6 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   commands:
-  - pytest -v -s distributed/test_pynccl_library.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py

diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
@@ -37,6 +37,7 @@ jobs:
         mypy vllm/distributed --config-file pyproject.toml
         mypy vllm/entrypoints --config-file pyproject.toml
         mypy vllm/executor --config-file pyproject.toml
+        mypy vllm/multimodal --config-file pyproject.toml
         mypy vllm/usage --config-file pyproject.toml
         mypy vllm/*.py --config-file pyproject.toml
         mypy vllm/transformers_utils --config-file pyproject.toml

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -177,7 +177,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   include(FetchContent)
   SET(CUTLASS_ENABLE_HEADERS_ONLY=ON)
   FetchContent_Declare(
-        cutlass 
+        cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
         # CUTLASS 3.5.0
         GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
@@ -200,11 +200,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The CUTLASS kernels for Hopper require sm90a to be enabled.
   # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
   # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
-  set_source_files_properties(
-      "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
-      PROPERTIES
-      COMPILE_FLAGS
-      "-gencode arch=compute_90a,code=sm_90a")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
+    set_source_files_properties(
+          "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
+          PROPERTIES
+          COMPILE_FLAGS
+          "-gencode arch=compute_90a,code=sm_90a")
+  endif()
 
 endif()
 
@@ -309,6 +311,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling C extension.")
   add_dependencies(default _C)
 
+  message(STATUS "Enabling moe extension.")
+  add_dependencies(default _moe_C)
+
   # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
   # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
   # there are supported target arches.
@@ -318,8 +323,3 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
     add_dependencies(default _punica_C)
   endif()
 endif()
-
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  message(STATUS "Enabling moe extension.")
-  add_dependencies(default _moe_C)
-endif()
diff --git a/Dockerfile b/Dockerfile
@@ -79,12 +79,6 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 RUN python3 check-wheel-size.py dist
 
-# the `vllm_nccl` package must be installed from source distribution
-# pip is too smart to store a wheel in the cache, and other CI jobs
-# will directly use the wheel from the cache, which is not what we want.
-# we need to remove it manually
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip cache remove vllm_nccl*
 #################### EXTENSION Build IMAGE ####################
 
 #################### vLLM installation IMAGE ####################

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
@@ -108,6 +108,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     && python3 setup.py install \
     && cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
     && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.cpython-39-x86_64-linux-gnu.so vllm/ \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.cpython-39-x86_64-linux-gnu.so vllm/ \
     && cd ..