diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 4a6a3b95ec7f..ee3d041464d7 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -372,10 +372,18 @@ def apply(
         logical_replica_count: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         if enable_eplb:
-            assert expert_load_view is not None
-            assert logical_to_physical_map is not None
-            assert logical_replica_count is not None
-            assert isinstance(layer, FusedMoE)
+            if expert_load_view is None:
+                raise ValueError(
+                    "expert_load_view must be provided when enable_eplb is True")
+            if logical_to_physical_map is None:
+                raise ValueError(
+                    "logical_to_physical_map must be provided when enable_eplb is True")
+            if logical_replica_count is None:
+                raise ValueError(
+                    "logical_replica_count must be provided when enable_eplb is True")
+            if not isinstance(layer, FusedMoE):
+                raise TypeError(
+                    f"Expected layer to be FusedMoE, but got {type(layer)}")
 
         return self.forward(
             x=x,
@@ -421,6 +429,19 @@ def forward_cuda(
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        if enable_eplb:
+            if expert_load_view is None:
+                raise ValueError(
+                    "expert_load_view must be provided when enable_eplb is True")
+            if logical_to_physical_map is None:
+                raise ValueError(
+                    "logical_to_physical_map must be provided when enable_eplb is True")
+            if logical_replica_count is None:
+                raise ValueError(
+                    "logical_replica_count must be provided when enable_eplb is True")
+            if not isinstance(layer, FusedMoE):
+                raise TypeError(
+                    f"Expected layer to be FusedMoE, but got {type(layer)}")
 
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
@@ -1411,10 +1432,10 @@ def process_chunk(chunk_start, chunk_end, skip_result_store=False):
                     >= chunk_size)
             assert (self.batched_router_logits.size(0)  # type: ignore
                     >= chunk_size)
-            staged_hidden_states = self.batched_hidden_states[:
-                                                              chunk_size, :]  # type: ignore
-            staged_router_logits = self.batched_router_logits[:
-                                                              chunk_size, :]  # type: ignore
+            staged_hidden_states = self.batched_hidden_states[
+                :chunk_size, :]  # type: ignore
+            staged_router_logits = self.batched_router_logits[
+                :chunk_size, :]  # type: ignore
             staged_hidden_states.copy_(hidden_states, non_blocking=True)
             staged_router_logits.copy_(router_logits, non_blocking=True)
 
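
Note on the change above (not part of the patch): replacing `assert` statements with explicit `ValueError`/`TypeError` raises means the EPLB input checks still run when Python is started with `-O` (which strips asserts), and callers receive a well-defined exception type. The following is a minimal, self-contained sketch of the same validation pattern; `check_eplb_inputs` is a hypothetical helper introduced only for illustration and does not exist in vLLM.

# Hypothetical sketch of the validation pattern added in `apply` and
# `forward_cuda`; unlike `assert`, these checks are never stripped by -O.
from typing import Optional

import torch


def check_eplb_inputs(
    enable_eplb: bool,
    expert_load_view: Optional[torch.Tensor],
    logical_to_physical_map: Optional[torch.Tensor],
    logical_replica_count: Optional[torch.Tensor],
) -> None:
    """Validate that all EPLB inputs are present when EPLB is enabled."""
    if not enable_eplb:
        return
    if expert_load_view is None:
        raise ValueError(
            "expert_load_view must be provided when enable_eplb is True")
    if logical_to_physical_map is None:
        raise ValueError(
            "logical_to_physical_map must be provided when enable_eplb is True")
    if logical_replica_count is None:
        raise ValueError(
            "logical_replica_count must be provided when enable_eplb is True")


# Callers can now catch a specific exception type instead of AssertionError:
try:
    check_eplb_inputs(enable_eplb=True,
                      expert_load_view=None,
                      logical_to_physical_map=None,
                      logical_replica_count=None)
except ValueError as exc:
    print(f"EPLB misconfiguration: {exc}")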