8 changes: 4 additions & 4 deletions .buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -66,10 +66,10 @@ function cpu_tests() {
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"

# Run AWQ test
-docker exec cpu-test-"$NUMA_NODE" bash -c "
-set -e
-VLLM_USE_V1=0 pytest -s -v \
-tests/quantization/test_ipex_quant.py"
+# docker exec cpu-test-"$NUMA_NODE" bash -c "
+# set -e
+# VLLM_USE_V1=0 pytest -s -v \
+# tests/quantization/test_ipex_quant.py"

# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$NUMA_NODE" bash -c "
2 changes: 0 additions & 2 deletions .buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -26,7 +26,5 @@ docker run \
--name "${container_name}" \
"${image_name}" \
sh -c '
-VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
-VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
'
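
The two VLLM_USE_V1=0 invocations are dropped, leaving only the V1 run of examples/offline_inference/basic/generate.py with --block-size 64 --enforce-eager. A minimal sketch of what that smoke test exercises through the offline API, assuming those flags map onto the block_size and enforce_eager engine arguments of vllm.LLM (illustrative only, not the CI script itself):

    import os
    os.environ["VLLM_USE_V1"] = "1"  # the CI run keeps only the V1 path

    from vllm import LLM, SamplingParams

    # Assumption: --block-size / --enforce-eager correspond to these kwargs.
    llm = LLM(model="facebook/opt-125m", block_size=64, enforce_eager=True)
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(temperature=0.8, max_tokens=32))
    for out in outputs:
        print(out.outputs[0].text)
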
2 changes: 1 addition & 1 deletion examples/online_serving/chart-helm/values.yaml
@@ -8,7 +8,7 @@ image:
# -- Image tag
tag: "latest"
# -- Container launch command
command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--enforce-eager", "--dtype", "bfloat16", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]

# -- Container port
containerPort: 8000
17 changes: 10 additions & 7 deletions tests/kernels/attention/test_attention_selector.py
@@ -36,7 +36,8 @@ def clear_cache():
DEVICE_MLA_BLOCK_SIZES = {
"cuda": [16, 64], # CUDA supports both standard and extended block sizes
"hip": [16, 1], # HIP requires special handling for block_size=1
"cpu": [16] # CPU uses fixed block size from test cases
# "cpu": [16] # CPU uses fixed block size from test cases
"cpu": [] # FIXME(woosuk): Temporarily disable CPU tests
}
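
The empty list for "cpu" works because the MLA matrix is built by pairing each device with its listed block sizes, so no block sizes means no CPU cases. A hypothetical sketch of that expansion, with the helper name and parametrize wiring assumed rather than taken from this file:

    import pytest

    # Same shape as the dict above; an empty list removes every CPU case.
    DEVICE_MLA_BLOCK_SIZES = {
        "cuda": [16, 64],
        "hip": [16, 1],
        "cpu": [],  # FIXME(woosuk): Temporarily disable CPU tests
    }

    def generate_mla_params():
        # Hypothetical helper: cartesian expansion of device x block_size.
        return [
            pytest.param(device, block_size, id=f"{device}_bs{block_size}")
            for device, sizes in DEVICE_MLA_BLOCK_SIZES.items()
            for block_size in sizes
        ]

    # With "cpu": [], only CUDA and HIP cases are generated, so the MLA tests
    # drop CPU without needing an explicit pytest.skip.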


@@ -81,14 +82,14 @@ def test_env(
m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")

if device == "cpu":
+if not use_v1:
+    pytest.skip("CPU backend only supports V1")
+
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16,
block_size, False)
-if use_v1:
-    assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
-else:
-    assert backend.get_name() == "TORCH_SDPA"
+assert backend.get_name() == "TORCH_SDPA_VLLM_V1"

elif device == "hip":
with patch("vllm.attention.selector.current_platform",
@@ -193,12 +194,14 @@ def test_fp32_fallback(
m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")

if device == "cpu":
+if not use_v1:
+    pytest.skip("CPU backend only supports V1")
+
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float32, torch.float32,
16, False)
-assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
-        if use_v1 else "TORCH_SDPA")
+assert backend.get_name() == "TORCH_SDPA_VLLM_V1"

elif device == "cuda":
with patch("vllm.attention.selector.current_platform",
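
With V0 gone, both test_env and test_fp32_fallback now skip when use_v1 is false and assert a single CPU backend name. A condensed, standalone sketch of the selection check they perform, using the calls visible in the hunks above (the import paths are assumptions):

    import os
    os.environ["VLLM_USE_V1"] = "1"  # CPU attention is V1-only after this change

    from unittest.mock import patch

    import torch
    from vllm.attention.selector import get_attn_backend  # assumed path
    from vllm.platforms.cpu import CpuPlatform  # assumed path

    with patch("vllm.attention.selector.current_platform", CpuPlatform()):
        # fp16 case (test_env): TORCH_SDPA_VLLM_V1 regardless of block size.
        backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
        assert backend.get_name() == "TORCH_SDPA_VLLM_V1"

        # fp32 case (test_fp32_fallback): no V0 TORCH_SDPA fallback anymore.
        backend = get_attn_backend(16, torch.float32, torch.float32, 16, False)
        assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
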
307 changes: 0 additions & 307 deletions vllm/attention/backends/cpu_mla.py

This file was deleted.
