@@ -487,7 +487,7 @@ def _deduce_max_tokens(request: GenerationRequest,
             lora_config=lora_config,
             prompt_tuning_config=prompt_tuning_config,
             multimodal_input=multimodal_input,
-            #NOTE: `multimodal_embedding` and `mrope_config` will be in MultimodalParams.multimodal_data. And this will be handled below by `py_multimodal_data`.
+            # NOTE: `multimodal_embedding` and `mrope_config` will be in MultimodalParams.multimodal_data. And this will be handled below by `py_multimodal_data`.
             multimodal_embedding=None,
             mrope_config=None,
             logits_post_processor_name=(
@@ -503,17 +503,8 @@ def _deduce_max_tokens(request: GenerationRequest,
 
         if self._is_pytorch_backend and request.multimodal_params is not None:
             if request.multimodal_params.multimodal_data is not None:
-                # Convert back to tensor, as opposite to `to_handle` in `llm.generate_async`
-                # for values with non-selected keys, it's no-op
-                request.multimodal_params.to_tensor(
-                    "multimodal_data", key="multimodal_embedding")
-                embedding = request.multimodal_params.multimodal_data.get(
-                    "multimodal_embedding")
-                if embedding is not None and embedding.is_cuda:
-                    # make sure the embedding resides on the local device
-                    request.multimodal_params.multimodal_data[
-                        "multimodal_embedding"] = embedding.to("cuda")
-
+                # NOTE: Deserialize SharedTensor handle to actual tensor
+                request.multimodal_params.to_tensor("multimodal_data")
                 executor_request.py_multimodal_data = request.multimodal_params.multimodal_data
 
         if self._is_pytorch_backend and request.sampling_params.logits_processor:
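For context, the `to_handle` / `to_tensor` pair referenced in the comments above implements a serialize/deserialize round-trip for multimodal tensors: tensors are turned into shareable handles before crossing process boundaries and materialized back on the worker. Below is a minimal, self-contained sketch of that pattern; the toy `MultimodalParams` class and its handle representation are assumptions for illustration only, not the real TensorRT-LLM API.

# Minimal sketch of the handle <-> tensor round-trip used for multimodal data.
# The class below is a simplified stand-in; the real MultimodalParams may differ.
from dataclasses import dataclass, field
from typing import Any, Dict

import torch


@dataclass
class MultimodalParams:
    multimodal_data: Dict[str, Any] = field(default_factory=dict)

    def to_handle(self, attr: str) -> None:
        # Sender side (e.g. llm.generate_async): replace tensors with
        # IPC-friendly handles. Simulated here with a tagged CPU copy.
        data = getattr(self, attr)
        for key, value in data.items():
            if isinstance(value, torch.Tensor):
                data[key] = ("shared_handle", value.cpu())

    def to_tensor(self, attr: str) -> None:
        # Receiver side (the worker in the diff above): materialize handles
        # back into tensors; entries that are not handles are left untouched.
        data = getattr(self, attr)
        for key, value in data.items():
            if isinstance(value, tuple) and value and value[0] == "shared_handle":
                data[key] = value[1]


params = MultimodalParams(
    multimodal_data={"multimodal_embedding": torch.randn(4, 8)})
params.to_handle("multimodal_data")   # serialize before crossing processes
params.to_tensor("multimodal_data")   # deserialize on the worker side
assert isinstance(params.multimodal_data["multimodal_embedding"], torch.Tensor)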