diff --git a/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu b/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu
index 236d76ed5208..6c8f6309ef43 100644
--- a/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu
@@ -201,11 +201,10 @@ void run_blockwise_scaled_group_mm(
       reinterpret_cast<typename ScaleConfig::LayoutSFB*>(
           layout_sfb.data_ptr())};
 
-  cutlass::KernelHardwareInfo hw_info;
-  hw_info.device_id = a_ptrs.get_device();
-  hw_info.sm_count =
-      cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
-          hw_info.device_id);
+  int device_id = a_ptrs.device().index();
+  static const cutlass::KernelHardwareInfo hw_info{
+      device_id, cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+                     device_id)};
 
   // Epilogue Arguments
   typename GemmKernel::EpilogueArguments epilogue_args{
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index b4fdbfcc7f60..8d47a417f9cd 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -216,8 +216,8 @@ async def _cross_encoding_score(
             # cross_encoder models defaults to using pad_token.
             tokenized_prompts = await asyncio.gather(*(
                 tokenize_async(
-                    text=t1,  # type: ignore[arg-type]
-                    text_pair=t2,  # type: ignore[arg-type]
+                    text=t1,  # type: ignore[arg-type]
+                    text_pair=t2,  # type: ignore[arg-type]
                     **tokenization_kwargs) for t1, t2 in input_pairs))
         else:
             # `llm as reranker` models defaults to not using pad_token.
diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
index c8a8415baf23..836e0a64052a 100644
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -547,8 +547,10 @@ def cutlass_moe_fp4(a: torch.Tensor,
     return out.to(dtype=out_dtype)
 
 
-def _valid_cutlass_block_scaled_grouped_gemm(w1: torch.Tensor,
-                                             w2: torch.Tensor) -> bool:
+def _valid_cutlass_block_scaled_grouped_gemm(
+        w1: torch.Tensor, w2: torch.Tensor, inplace: bool, activation: str,
+        apply_router_weight_on_input: bool,
+        expert_map: Optional[torch.Tensor]) -> bool:
 
     def _valid_cutlass_block_scaled_grouped_gemm_shape(N: int, K: int):
         return N % 128 == 0 and K % 128 == 0
@@ -564,6 +566,29 @@ def _valid_cutlass_block_scaled_grouped_gemm_shape(N: int, K: int):
             "CutlassBlockScaledGroupedGemm disabled: invalid weight dtype(s).")
         return False
 
+    if expert_map is not None:
+        logger.debug(
+            "CutlassBlockScaledGroupedGemm disabled: expert_parallel is"
+            " not supported.")
+        return False
+
+    if activation != "silu":
+        logger.debug(
+            "CutlassBlockScaledGroupedGemm disabled: only activation silu is"
+            " supported.")
+        return False
+
+    if apply_router_weight_on_input:
+        logger.debug("CutlassBlockScaledGroupedGemm disabled:"
+                     " apply_router_weight_on_input is not supported.")
+        return False
+
+    if inplace:
+        logger.debug(
+            "CutlassBlockScaledGroupedGemm disabled: inplace is not supported."
+        )
+        return False
+
     return True
 
 
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 26eeed1cd07f..63496aac6290 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1190,8 +1190,9 @@ def fused_experts(
             apply_router_weight_on_input=apply_router_weight_on_input,
         )
     elif (allow_cutlass_block_scaled_grouped_gemm and use_fp8_w8a8
-          and _valid_cutlass_block_scaled_grouped_gemm(w1, w2)):
-        assert apply_router_weight_on_input is False
+          and _valid_cutlass_block_scaled_grouped_gemm(
+              w1, w2, inplace, activation, apply_router_weight_on_input,
+              expert_map)):
         return run_cutlass_block_scaled_fused_experts(
             a=hidden_states,
             w1=w1,