From b67c5880709abfb92781c0349863c33e8446bf6b Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 9 Dec 2024 13:06:54 -0800
Subject: [PATCH 1/7] [V1] Use input_ids as input for text-only models

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu_model_runner.py | 60 +++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 17 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 7f95be06188e..aa48980c6e7a 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -61,6 +61,7 @@ def __init__(
         self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
             cache_config.cache_dtype]
 
+        self.is_multimodal_model = model_config.is_multimodal_model
         self.sliding_window = model_config.get_sliding_window()
         self.block_size = cache_config.block_size
         self.max_model_len = model_config.max_model_len
@@ -103,6 +104,11 @@ def __init__(
         # The batch sizes in the config are in descending order.
         self.cudagraph_batch_sizes = list(
             reversed(self.vllm_config.compilation_config.capture_sizes))
+
+        # Persistent buffers for CUDA graphs.
+        self.input_ids = torch.zeros(self.max_num_tokens,
+                                      dtype=torch.int64,
+                                      device=self.device)
         self.positions = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int64,
                                      device=self.device)
@@ -310,7 +316,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         seq_start_loc_np[0] = 0
         np.cumsum(seq_lens, out=seq_start_loc_np[1:])
 
-        input_ids = input_ids.to(self.device, non_blocking=True)
+        self.input_ids[:total_num_scheduled_tokens].copy_(input_ids,
+                                                           non_blocking=True)
         self.positions[:total_num_scheduled_tokens].copy_(positions,
                                                           non_blocking=True)
         query_start_loc = query_start_loc.to(self.device, non_blocking=True)
@@ -331,7 +338,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         # token from the partial request.
         # TODO: Support prompt logprobs.
         logits_indices = query_start_loc[1:] - 1
-        return input_ids, attn_metadata, logits_indices
+        return attn_metadata, logits_indices
 
     def _prepare_sampling(
         self,
@@ -429,10 +436,11 @@ def execute_model(
 
         # Run the encoder.
         self._execute_encoder(scheduler_output)
-        encoder_outputs = self._gather_encoder_outputs(scheduler_output)
+        encoder_outputs = (self._gather_encoder_outputs(scheduler_output)
+                           if self.is_multimodal_model else [])
 
         # Prepare the decoder inputs.
-        input_ids, attn_metadata, logits_indices = self._prepare_inputs(
+        attn_metadata, logits_indices = self._prepare_inputs(
             scheduler_output)
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         if (self.use_cuda_graph
@@ -445,26 +453,37 @@ def execute_model(
             # Eager mode.
             num_input_tokens = num_scheduled_tokens
 
-        # Get the inputs embeds.
-        if encoder_outputs:
-            inputs_embeds = self.model.get_input_embeddings(
-                input_ids, encoder_outputs)
+        if self.is_multimodal_model:
+            # NOTE(woosuk): To unify token ids and soft tokens (vision
+            # embeddings), we always use embeddings (rather than token ids)
+            # as input to the multimodal model, even when the input is text.
+            input_ids = self.input_ids[:num_scheduled_tokens]
+            if encoder_outputs:
+                inputs_embeds = self.model.get_input_embeddings(
+                    input_ids, encoder_outputs)
+            else:
+                inputs_embeds = self.model.get_input_embeddings(input_ids)
+            # TODO(woosuk): Avoid the copy. Optimize.
+            self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds)
+            inputs_embeds = self.inputs_embeds[:num_input_tokens]
+            input_ids = None
         else:
-            inputs_embeds = self.model.get_input_embeddings(input_ids)
-        # NOTE(woosuk): To unify token ids and soft tokens (vision embeddings),
-        # always use embeddings (rather than token ids) as input to the model.
-        # TODO(woosuk): Avoid the copy. Optimize.
-        self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds)
+            # For text-only models, we use token ids as input.
+            # While it is possible to use embeddings as input just like the
+            # multimodal models, it is not desirable for performance since
+            # them the embedding layer is included in the CUDA graph.
+            input_ids = self.input_ids[:num_input_tokens]
+            inputs_embeds = None
 
         # Run the decoder.
         # Use persistent buffers for CUDA graphs.
         with set_forward_context(attn_metadata, self.vllm_config):
             hidden_states = self.model(
-                input_ids=None,
+                input_ids=input_ids,
                 positions=self.positions[:num_input_tokens],
                 kv_caches=self.kv_caches,
                 attn_metadata=None,
-                inputs_embeds=self.inputs_embeds[:num_input_tokens],
+                inputs_embeds=inputs_embeds,
             )
         hidden_states = hidden_states[:num_scheduled_tokens]
         hidden_states = hidden_states[logits_indices]
@@ -534,13 +553,20 @@ def _dummy_run(
         num_tokens: int,
         kv_caches: List[torch.Tensor],
     ) -> torch.Tensor:
+        if self.is_multimodal_model:
+            input_ids = None
+            inputs_embeds = self.inputs_embeds[:num_tokens]
+        else:
+            input_ids = self.input_ids[:num_tokens]
+            inputs_embeds = None
         with set_forward_context(None, self.vllm_config):
             hidden_states = model(
-                input_ids=None,
+                input_ids=input_ids,
                 positions=self.positions[:num_tokens],
                 kv_caches=kv_caches,
                 attn_metadata=None,
-                inputs_embeds=self.inputs_embeds[:num_tokens])
+                inputs_embeds=inputs_embeds,
+            )
         return hidden_states
 
     def profile_run(self) -> None:

From bd87a1959a9129939f2dd35de1472b704b358a2b Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 9 Dec 2024 13:07:29 -0800
Subject: [PATCH 2/7] minor

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu_model_runner.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index aa48980c6e7a..7a1b280406f0 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -107,8 +107,8 @@ def __init__(
 
         # Persistent buffers for CUDA graphs.
         self.input_ids = torch.zeros(self.max_num_tokens,
-                                      dtype=torch.int64,
-                                      device=self.device)
+                                     dtype=torch.int64,
+                                     device=self.device)
         self.positions = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int64,
                                      device=self.device)
@@ -317,7 +317,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         np.cumsum(seq_lens, out=seq_start_loc_np[1:])
 
         self.input_ids[:total_num_scheduled_tokens].copy_(input_ids,
-                                                           non_blocking=True)
+                                                          non_blocking=True)
         self.positions[:total_num_scheduled_tokens].copy_(positions,
                                                           non_blocking=True)
         query_start_loc = query_start_loc.to(self.device, non_blocking=True)
@@ -440,8 +440,7 @@ def execute_model(
                            if self.is_multimodal_model else [])
 
         # Prepare the decoder inputs.
-        attn_metadata, logits_indices = self._prepare_inputs(
-            scheduler_output)
+        attn_metadata, logits_indices = self._prepare_inputs(scheduler_output)
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         if (self.use_cuda_graph
                 and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):

From 64347e9ea4a33f4eb8e0c6a25279bbc7c58ec434 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 9 Dec 2024 13:24:03 -0800
Subject: [PATCH 3/7] minor

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu_model_runner.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 7a1b280406f0..4dbeb4267d01 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -434,10 +434,12 @@ def execute_model(
     ) -> ModelRunnerOutput:
         self._update_states(scheduler_output)
 
-        # Run the encoder.
-        self._execute_encoder(scheduler_output)
-        encoder_outputs = (self._gather_encoder_outputs(scheduler_output)
-                           if self.is_multimodal_model else [])
+        if self.is_multimodal_model:
+            # Run the multimodal encoder if any.
+            self._execute_encoder(scheduler_output)
+            encoder_outputs = self._gather_encoder_outputs(scheduler_output)
+        else:
+            encoder_outputs = []
 
         # Prepare the decoder inputs.
         attn_metadata, logits_indices = self._prepare_inputs(scheduler_output)

From c72db1fad656ca248ffeefe3883d240851e1ab43 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 9 Dec 2024 13:25:46 -0800
Subject: [PATCH 4/7] minor

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 4dbeb4267d01..e0d1972a6c79 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -472,7 +472,7 @@ def execute_model(
             # For text-only models, we use token ids as input.
             # While it is possible to use embeddings as input just like the
             # multimodal models, it is not desirable for performance since
-            # them the embedding layer is included in the CUDA graph.
+            # then the embedding layer is not included in the CUDA graph.
             input_ids = self.input_ids[:num_input_tokens]
             inputs_embeds = None
 

From 6b976ec57686b24bad70ad427b6adfd4d3c8cde9 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 9 Dec 2024 13:26:41 -0800
Subject: [PATCH 5/7] int32

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index e0d1972a6c79..b55790e5788f 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -107,7 +107,7 @@ def __init__(
 
         # Persistent buffers for CUDA graphs.
         self.input_ids = torch.zeros(self.max_num_tokens,
-                                     dtype=torch.int64,
+                                     dtype=torch.int32,
                                      device=self.device)
         self.positions = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int64,

From e2baff844496858b6ab12e355cd4c1ef90dab853 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Tue, 10 Dec 2024 14:53:50 -0800
Subject: [PATCH 6/7] minor

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b5d3e8748f90..617bc6d88d11 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -454,7 +454,6 @@ def execute_model(
         else:
             # Eager mode.
             num_input_tokens = num_scheduled_tokens
-        attn_metadata.num_input_tokens = num_input_tokens
 
         if self.is_multimodal_model:
             # NOTE(woosuk): To unify token ids and soft tokens (vision
             # embeddings), we always use embeddings (rather than token ids)
@@ -479,6 +478,7 @@ def execute_model(
 
         # Run the decoder.
         # Use persistent buffers for CUDA graphs.
+        attn_metadata.num_input_tokens = num_input_tokens
         with set_forward_context(attn_metadata, self.vllm_config):
             hidden_states = self.model(
                 input_ids=input_ids,
                 positions=self.positions[:num_input_tokens],
                 kv_caches=self.kv_caches,
                 attn_metadata=None,

From 426d3f62b0f5a5215e8ecd6a750664dd87176d21 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Tue, 10 Dec 2024 14:56:15 -0800
Subject: [PATCH 7/7] minor

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 617bc6d88d11..2615404854bf 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -453,6 +453,7 @@ def execute_model(
         else:
             # Eager mode.
             num_input_tokens = num_scheduled_tokens
+        attn_metadata.num_input_tokens = num_input_tokens
 
         if self.is_multimodal_model:
             # NOTE(woosuk): To unify token ids and soft tokens (vision
@@ -478,7 +479,6 @@ def execute_model(
 
         # Run the decoder.
         # Use persistent buffers for CUDA graphs.
-        attn_metadata.num_input_tokens = num_input_tokens
         with set_forward_context(attn_metadata, self.vllm_config):
             hidden_states = self.model(
                 input_ids=input_ids,
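
Taken together, the seven patches land one dispatch rule in execute_model(): text-only models are fed token ids from a persistent int32 buffer, so the embedding lookup itself is captured inside the CUDA graph, while multimodal models are always fed embeddings, so text tokens and vision soft tokens share a single input path. The sketch below is a minimal, self-contained illustration of that pattern, not vLLM's actual code; the class name, constructor signature, and the model argument's get_input_embeddings() interface are hypothetical stand-ins for the GPUModelRunner internals shown above.

import torch

class InputDispatchSketch:
    """Illustrative sketch only: persistent buffers + text/multimodal dispatch."""

    def __init__(self, max_num_tokens: int, hidden_size: int,
                 is_multimodal_model: bool, device: torch.device):
        self.is_multimodal_model = is_multimodal_model
        # Persistent buffers: a captured CUDA graph replays fixed memory
        # addresses, so per-step inputs are copied into these tensors
        # instead of being reallocated each step.
        self.input_ids = torch.zeros(max_num_tokens,
                                     dtype=torch.int32,
                                     device=device)
        self.inputs_embeds = torch.zeros(max_num_tokens,
                                         hidden_size,
                                         device=device)

    def select_decoder_inputs(self, model, num_scheduled_tokens: int,
                              num_input_tokens: int, encoder_outputs: list):
        # num_input_tokens >= num_scheduled_tokens: the batch may be padded
        # up to the nearest captured CUDA graph size.
        if self.is_multimodal_model:
            # Multimodal path: always hand the model embeddings, so token
            # ids and vision embeddings ("soft tokens") share one code path.
            token_ids = self.input_ids[:num_scheduled_tokens]
            if encoder_outputs:
                embeds = model.get_input_embeddings(token_ids, encoder_outputs)
            else:
                embeds = model.get_input_embeddings(token_ids)
            self.inputs_embeds[:num_scheduled_tokens].copy_(embeds)
            return None, self.inputs_embeds[:num_input_tokens]
        # Text-only path: hand the model token ids, so the embedding layer
        # runs inside the model and is captured by the CUDA graph.
        return self.input_ids[:num_input_tokens], None

The runner would then call the model with exactly one of the pair non-None, as in the final state of execute_model() above; keeping the buffers persistent is what makes the extra copy_() on the multimodal path (the TODO in patch 1) an acceptable cost.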