
Commit 37e16cb

mgoin authored and minpeter committed
Speed up the kernels/quantization/ tests (vllm-project#18669)
Signed-off-by: mgoin <[email protected]>
Signed-off-by: minpeter <[email protected]>
1 parent 8033452 commit 37e16cb

File tree

3 files changed, +17 −25 lines changed


tests/kernels/quantization/test_block_fp8.py

Lines changed: 7 additions & 7 deletions
@@ -36,16 +36,16 @@

 # Test configurations
 DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]
-NUM_TOKENS = [7, 83, 2048]
+NUM_TOKENS = [7, 2050]
 D = [512, 4096, 5120, 13824]
-GROUP_SIZE = [64, 128, 256, 512]
-M = [1, 7, 8, 83, 84, 512, 2048, 4096]
-N = [128, 512, 1024, 4096, 7168, 7748, 13824]
-K = [256, 4096, 5120, 3884, 13824, 16384]
+GROUP_SIZE = [64, 128, 512]
+M = [1, 7, 8, 83, 84, 4096]
+N = [128, 512, 7168, 7748, 13824]
+K = [256, 3884, 4096, 13824, 16384]
 # Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
 # and its hidden size is 7168.
-M_moe = [1, 2, 7, 83, 128, 512, 2048]
-M_moe_dg = [128, 192, 512, 1335, 2048]
+M_moe = [1, 2, 7, 83, 128, 2048]
+M_moe_dg = [128, 192, 1335, 2048]
 N_moe = [128, 256, 1024, 4608] # [13824]
 K_moe = [256, 512, 7168] # [13824]
 BLOCK_SIZE = [[128, 128]]
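
Why trimming these lists helps: configuration lists like these are typically fed to stacked @pytest.mark.parametrize decorators, so the test matrix is a Cartesian product and every removed value shrinks it multiplicatively. A minimal sketch of that pattern, with an illustrative test name and body rather than this file's actual code:

import pytest
import torch

NUM_TOKENS = [7, 2050]  # was [7, 83, 2048]
D = [512, 4096, 5120, 13824]
DTYPES = [torch.bfloat16]
SEEDS = [0]


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
def test_example_quant_config(num_tokens, d, dtype, seed):
    # One case is generated per (num_tokens, d, dtype, seed) combination:
    # 2 * 4 * 1 * 1 = 8 cases now, versus 3 * 4 * 1 * 1 = 12 before this change.
    torch.manual_seed(seed)
    x = torch.rand(num_tokens, d, dtype=dtype)
    assert x.shape == (num_tokens, d)

Dropping single values from M, N, K, and GROUP_SIZE has an even larger effect on the matmul tests, since those lists multiply against each other as well as against the dtype and seed axes.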

tests/kernels/quantization/test_gguf.py

Lines changed: 2 additions & 2 deletions
@@ -35,11 +35,11 @@ def get_gguf_MoE_tensors(
     return GGUFReader(sample_file).tensors


-DTYPES = [torch.half, torch.bfloat16, torch.float32]
+DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]
 # Hidden_size for testing, must match the sample file in HF repo,
 # we have `hidden_size = 256, 1024` for test in HF repo currently.
 HIDDEN_SIZES = [256, 1024]
-NUM_TOKENS = [7, 83, 128, 2048] # Arbitrary values for testing
+NUM_TOKENS = [7, 2050] # Arbitrary values for testing
 SEEDS = [0]
 QUANT_TYPES = [
     # i-matrix
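
The same multiplicative argument applies here. As a back-of-the-envelope check, assuming each GGUF quant type runs once per (dtype, hidden_size, num_tokens) combination (the usual way these lists are crossed in the test), the per-quant-type case count drops from 24 to 4:

from itertools import product

import torch

OLD_DTYPES = [torch.half, torch.bfloat16, torch.float32]
NEW_DTYPES = [torch.bfloat16]
HIDDEN_SIZES = [256, 1024]
OLD_NUM_TOKENS = [7, 83, 128, 2048]
NEW_NUM_TOKENS = [7, 2050]

# Cases per quant type = |DTYPES| * |HIDDEN_SIZES| * |NUM_TOKENS|
old_cases = len(list(product(OLD_DTYPES, HIDDEN_SIZES, OLD_NUM_TOKENS)))
new_cases = len(list(product(NEW_DTYPES, HIDDEN_SIZES, NEW_NUM_TOKENS)))
print(old_cases, new_cases)  # 24 4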

tests/kernels/quantization/test_triton_scaled_mm.py

Lines changed: 8 additions & 16 deletions
@@ -13,8 +13,13 @@

 device = "cuda"

+triton_scaled_mm_module = importlib.import_module(
+    "vllm.model_executor.layers.quantization.compressed_tensors."
+    "triton_scaled_mm")
+triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm

-def scaled_mm_torch(a: torch.Tensor,
+
+def torch_scaled_mm(a: torch.Tensor,
                     b: torch.Tensor,
                     scale_a: torch.Tensor,
                     scale_b: torch.Tensor,
@@ -101,21 +106,8 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
     if use_bias:
         bias = torch.rand((N, ), device=device, dtype=out_dtype)

-    triton_scaled_mm_module = importlib.import_module(
-        "vllm.model_executor.layers.quantization.compressed_tensors."
-        "triton_scaled_mm")
-    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
-
     c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

-    a_cpu = a.cpu()
-    b_cpu = b.cpu()
-    scale_a_cpu = scale_a.cpu()
-    scale_b_cpu = scale_b.cpu()
-    bias_cpu = None if bias is None else bias.cpu()
-
-    c_actual = scaled_mm_torch(a_cpu, b_cpu, scale_a_cpu, scale_b_cpu,
-                               out_dtype, bias_cpu)
+    c_actual = torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

-    c_check_cpu = c_check.cpu()
-    torch.testing.assert_close(c_check_cpu, c_actual, rtol=1e-1, atol=1e-1)
+    torch.testing.assert_close(c_check, c_actual, rtol=1e-1, atol=1e-1)
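
Besides hoisting the importlib lookup to module scope, the second hunk drops the .cpu() round trips: the reference result is now computed on the same device as the Triton output. The body of torch_scaled_mm is not shown in this diff; a device-agnostic reference for a scaled matmul typically looks roughly like the sketch below (an assumption, not the file's actual implementation):

from typing import Optional

import torch


def torch_scaled_mm(a: torch.Tensor,
                    b: torch.Tensor,
                    scale_a: torch.Tensor,
                    scale_b: torch.Tensor,
                    out_dtype: torch.dtype,
                    bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Upcast, multiply, then apply the (scalar or per-row/per-column) scales.
    # Everything stays on the inputs' device, which is what lets the test
    # drop the a.cpu()/b.cpu() copies and compare c_check and c_actual directly.
    out = torch.mm(a.to(torch.float32), b.to(torch.float32))
    out = scale_a * out * scale_b.T
    if bias is not None:
        out = out + bias
    return out.to(out_dtype)

With both results on the same device, the final torch.testing.assert_close(c_check, c_actual, rtol=1e-1, atol=1e-1) no longer needs the extra c_check.cpu() copy.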
