vllm-project · tianmu-li · Oct 8, 2025 · Oct 8, 2025 · Oct 8, 2025 · Oct 9, 2025
@@ -2848,15 +2848,16 @@ def execute_model(
                 # If logits_indices is smaller than req_id,
                 # add the last token position
                 if logits_indices.shape[0] < len(req_id):
-                    if structured_output:
+                    if structured_output or self.use_async_scheduling:
                         logits_append = torch.tensor([torch.sum(prompt_len) - 1],
                                                      device=token_ids.device,
                                                      dtype=torch.int32)
                         logits_indices = torch.cat([logits_indices, logits_append])
-                    elif self.use_async_scheduling:
-                        # Discard partial prefill logits for async scheduling
+                    if self.use_async_scheduling:
+                        # Discard partial prefill logit for async scheduling
                         # Depends on 1 decode token/batch
-                        invalid_req_indices.append(num_decodes + idx)
+                        prefill_start_idx = num_decodes
+                        invalid_req_indices.append(prefill_start_idx + idx)
                 htorch.core.mark_step()
                 non_flattened_hidden_states, aux_hidden_states, \
                     sample_hidden_states, logits_device = \
@@ -3148,7 +3149,7 @@ def execute_model(
             return AsyncHPUModelRunnerOutput(
                 model_runner_output=model_runner_output,
                 sampled_token_ids=sampled_token_ids,
-                invalid_req_indices=[],
+                invalid_req_indices=invalid_req_indices,
                 async_output_copy_stream=self.async_output_copy_stream,
             )
         model_runner_output = ModelRunnerOutput(