Commit 63f1fde

[Hardware][CPU] Support chunked-prefill and prefix-caching on CPU (#10355)
Signed-off-by: jiang1.li <[email protected]>
1 parent d5b2844 commit 63f1fde

8 files changed: +559, -369 lines

.buildkite/run-cpu-test.sh

Lines changed: 8 additions & 1 deletion
@@ -25,6 +25,7 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
 
 function cpu_tests() {
   set -e
+  export NUMA_NODE=$2
 
   # offline inference
   docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
@@ -57,6 +58,12 @@ function cpu_tests() {
     pytest -s -v \
       tests/quantization/test_ipex_quant.py"
 
+  # Run chunked-prefill and prefix-cache test
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v -k cpu_model \
+      tests/basic_correctness/test_chunked_prefill.py"
+
   # online inference
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
@@ -75,4 +82,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 25 mins.
 export -f cpu_tests
-timeout 25m bash -c "cpu_tests $CORE_RANGE"
+timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

docs/source/getting_started/cpu-installation.rst

Lines changed: 5 additions & 5 deletions
@@ -5,11 +5,11 @@ Installation with CPU
 
 vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
 
-- Tensor Parallel (``-tp = N``)
-- Quantization (``INT8 W8A8, AWQ``)
-
-.. note::
-    More advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon.
+- Tensor Parallel
+- Model Quantization (``INT8 W8A8, AWQ``)
+- Chunked-prefill
+- Prefix-caching
+- FP8-E5M2 KV-Caching (TODO)
 
 Table of contents:
 
docs/source/serving/compatibility_matrix.rst

Lines changed: 2 additions & 2 deletions
@@ -344,15 +344,15 @@ Feature x Hardware
      - ✅
      - ✅
      - ✅
-     - ✗
+     - ✅
      - ✅
    * - :ref:`APC <apc>`
      - `<https://github.com/vllm-project/vllm/issues/3687>`__
      - ✅
      - ✅
      - ✅
      - ✅
-     - ✗
+     - ✅
      - ✅
    * - :ref:`LoRA <lora>`
      - ✅

tests/basic_correctness/test_chunked_prefill.py

Lines changed: 62 additions & 1 deletion
@@ -12,6 +12,7 @@
 import pytest
 
 from tests.kernels.utils import override_backend_env_variable
+from vllm.platforms import current_platform
 
 from ..models.utils import check_logprobs_close, check_outputs_equal
 from ..utils import multi_gpu_test
@@ -206,12 +207,14 @@ def test_models_with_fp8_kv_cache(
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
 @pytest.mark.parametrize("tensor_parallel_size", [1])
+@pytest.mark.parametrize("dtype", ["half"])
 def test_with_prefix_caching(
     vllm_runner,
     max_tokens: int,
     enforce_eager: bool,
     chunk_size: int,
     tensor_parallel_size: int,
+    dtype: str,
 ) -> None:
     """
     Checks exact match decode with and without prefix caching
@@ -233,7 +236,7 @@ def test_with_prefix_caching(
     for enable in (True, False):
         with vllm_runner(
                 model,
-                dtype="half",
+                dtype=dtype,
                 max_num_batched_tokens=max_num_batched_tokens,
                 enable_chunked_prefill=True,
                 enable_prefix_caching=enable,
@@ -260,3 +263,61 @@ def test_with_prefix_caching(
         name_0="w/o prefix caching",
         name_1="with prefix caching",
     )
+
+
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
+@pytest.mark.parametrize("enforce_eager", [False])
+@pytest.mark.parametrize("attention_backend", ["TORCH_SDPA"])
+@pytest.mark.cpu_model
+@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
+def test_models_cpu(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    chunked_prefill_token_size: int,
+    enforce_eager: bool,
+    attention_backend: str,
+    monkeypatch,
+) -> None:
+    test_models(
+        hf_runner,
+        vllm_runner,
+        example_prompts,
+        model,
+        dtype,
+        max_tokens,
+        chunked_prefill_token_size,
+        enforce_eager,
+        1,
+        attention_backend,
+        monkeypatch,
+    )
+
+
+@pytest.mark.parametrize("max_tokens", [16])
+@pytest.mark.parametrize("enforce_eager", [False])
+@pytest.mark.parametrize("chunk_size", [30, 32])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.cpu_model
+@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
+def test_with_prefix_caching_cpu(
+    vllm_runner,
+    max_tokens: int,
+    enforce_eager: bool,
+    chunk_size: int,
+    dtype: str,
+) -> None:
+    test_with_prefix_caching(
+        vllm_runner,
+        max_tokens,
+        enforce_eager,
+        chunk_size,
+        1,
+        dtype,
+    )
