[Hardware][CPU] Support chunked-prefill and prefix-caching on CPU #10355

Merged
13 commits merged on Nov 20, 2024
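This change enables chunked prefill and prefix caching in the vLLM CPU backend. As a rough, illustrative sketch of the user-facing effect (not part of the diff below), both features can be turned on through the usual engine arguments; the model name, dtype, and token budget here are placeholder values, not settings taken from this PR.

```python
from vllm import LLM, SamplingParams

# Hypothetical usage on the CPU backend after this change; the model and
# limits below are illustrative placeholders.
llm = LLM(
    model="facebook/opt-125m",
    dtype="bfloat16",
    enable_chunked_prefill=True,    # prefill long prompts in bounded chunks
    max_num_batched_tokens=2048,    # per-step token budget used for chunking
    enable_prefix_caching=True,     # reuse KV blocks for shared prompt prefixes
)

outputs = llm.generate(
    ["Summarize the benefits of chunked prefill on CPU."],
    SamplingParams(temperature=0.0, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```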
9 changes: 8 additions & 1 deletion .buildkite/run-cpu-test.sh
@@ -25,6 +25,7 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg

function cpu_tests() {
set -e
export NUMA_NODE=$2

# offline inference
docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
@@ -57,6 +58,12 @@ function cpu_tests() {
pytest -s -v \
tests/quantization/test_ipex_quant.py"

# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py"

# online inference
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
@@ -75,4 +82,4 @@ function cpu_tests() {

# All of the CPU tests are expected to finish in less than 25 mins.
export -f cpu_tests
timeout 25m bash -c "cpu_tests $CORE_RANGE"
timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
10 changes: 5 additions & 5 deletions docs/source/getting_started/cpu-installation.rst
@@ -5,11 +5,11 @@ Installation with CPU

vLLM initially supports basic model inference and serving on the x86 CPU platform, with data types FP32, FP16 and BF16. The vLLM CPU backend supports the following vLLM features:

- Tensor Parallel (``-tp = N``)
- Quantization (``INT8 W8A8, AWQ``)

.. note::
More advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon.
- Tensor Parallel
- Model Quantization (``INT8 W8A8, AWQ``)
- Chunked-prefill
- Prefix-caching
- FP8-E5M2 KV-Caching (TODO)

Table of contents:

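As a note on the chunked-prefill feature now listed in the installation docs above: chunked prefill caps how many prompt tokens are processed per scheduler step, so a long prompt is prefilled in several bounded pieces. The sketch below is a simplified illustration of that splitting, not vLLM's actual scheduler code; `chunk_schedule` is a hypothetical helper.

```python
# Simplified illustration (not vLLM internals): how a prompt is split into
# prefill chunks when each step is capped at max_num_batched_tokens.
def chunk_schedule(prompt_len: int, max_num_batched_tokens: int) -> list[int]:
    chunks = []
    remaining = prompt_len
    while remaining > 0:
        step = min(remaining, max_num_batched_tokens)
        chunks.append(step)
        remaining -= step
    return chunks

# A 70-token prompt with a 32-token budget prefills in three steps: 32 + 32 + 6.
assert chunk_schedule(70, 32) == [32, 32, 6]
```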
4 changes: 2 additions & 2 deletions docs/source/serving/compatibility_matrix.rst
@@ -344,15 +344,15 @@ Feature x Hardware
- ✅
- ✅
- ✅
-
-
- ✅
* - :ref:`APC <apc>`
- `<https://github.com/vllm-project/vllm/issues/3687>`__
- ✅
- ✅
- ✅
- ✅
-
-
- ✅
* - :ref:`LoRA <lora>`
- ✅
63 changes: 62 additions & 1 deletion tests/basic_correctness/test_chunked_prefill.py
@@ -12,6 +12,7 @@
import pytest

from tests.kernels.utils import override_backend_env_variable
from vllm.platforms import current_platform

from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test
@@ -206,12 +207,14 @@ def test_models_with_fp8_kv_cache(
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching(
vllm_runner,
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
tensor_parallel_size: int,
dtype: str,
) -> None:
"""
Checks exact match decode with and without prefix caching
@@ -233,7 +236,7 @@ def test_with_prefix_caching(
for enable in (True, False):
with vllm_runner(
model,
dtype="half",
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True,
enable_prefix_caching=enable,
@@ -260,3 +263,61 @@
name_0="w/o prefix caching",
name_1="with prefix caching",
)


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("attention_backend", ["TORCH_SDPA"])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_models_cpu(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
chunked_prefill_token_size: int,
enforce_eager: bool,
attention_backend: str,
monkeypatch,
) -> None:
test_models(
hf_runner,
vllm_runner,
example_prompts,
model,
dtype,
max_tokens,
chunked_prefill_token_size,
enforce_eager,
1,
attention_backend,
monkeypatch,
)


@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_with_prefix_caching_cpu(
vllm_runner,
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
dtype: str,
) -> None:
test_with_prefix_caching(
vllm_runner,
max_tokens,
enforce_eager,
chunk_size,
1,
dtype,
)
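The new `test_with_prefix_caching_cpu` delegates to `test_with_prefix_caching`, which greedily decodes the same prompts with and without prefix caching and requires identical outputs. A rough standalone equivalent of that check is sketched below; the model, prompts, and token limits are placeholders, and engine teardown between the two runs is only hinted at.

```python
from vllm import LLM, SamplingParams

# Placeholder prompts sharing a long common prefix so prefix caching has
# something to reuse; these values are illustrative, not taken from the test.
prefix = "You are a helpful school administrator. " * 20
prompts = [prefix + "Draft a welcome email.", prefix + "Plan a staff meeting."]
params = SamplingParams(temperature=0.0, max_tokens=16)  # greedy decode

results = {}
for enable in (True, False):
    llm = LLM(
        model="facebook/opt-125m",
        dtype="bfloat16",
        enable_chunked_prefill=True,
        max_num_batched_tokens=32,   # small budget forces multi-chunk prefill
        enable_prefix_caching=enable,
    )
    results[enable] = [o.outputs[0].text for o in llm.generate(prompts, params)]
    del llm  # in practice the engine should be fully torn down before the next run

# With greedy sampling, prefix caching must not change the decoded text.
assert results[True] == results[False]
```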