16 changes: 10 additions & 6 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -1227,11 +1227,13 @@ def _prepare_tp_inputs(
multimodal_params = MultimodalParams(
multimodal_data=request.py_multimodal_data,
multimodal_runtime=py_multimodal_runtime)
multimodal_params.to_device("multimodal_data",
"cuda",
pin_memory=True)

if multimodal_params.has_content():
multimodal_params.to_device("multimodal_data",
"cuda",
pin_memory=True)
# Re-assign the multimodal_data to the request after to_device for generation requests
request.py_multimodal_data = multimodal_params.multimodal_data
multimodal_params_list.append(multimodal_params)

request.py_batch_idx = request.py_seq_slot
@@ -1265,10 +1267,12 @@ def _prepare_tp_inputs(
multimodal_params = MultimodalParams(
multimodal_data=request.py_multimodal_data)
multimodal_params.strip_for_generation()
multimodal_params.to_device("multimodal_data",
"cuda",
pin_memory=True)
if multimodal_params.has_content():
multimodal_params.to_device("multimodal_data",
"cuda",
pin_memory=True)
# Re-assign the multimodal_data to the request after strip_for_generation for the next generation request
request.py_multimodal_data = multimodal_params.multimodal_data
multimodal_params_list.append(multimodal_params)
extend_requests += extend_dummy_requests

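Both hunks above apply the same pattern in `_prepare_tp_inputs`: only move `multimodal_data` to the GPU when the request actually carries multimodal content, and write the device-resident payload back to `request.py_multimodal_data` so later generation steps reuse it. Below is a minimal, self-contained sketch of that guard-then-reassign pattern; `SimpleMultimodalParams` and `FakeRequest` are illustrative stand-ins, not the real `tensorrt_llm` classes (the real `MultimodalParams` also supports pinned host memory, nested payloads, `strip_for_generation`, and runtime metadata).

```python
# Illustrative sketch only: SimpleMultimodalParams / FakeRequest are
# stand-ins for the real tensorrt_llm classes referenced in the diff.
from dataclasses import dataclass, field
from typing import Any, Dict, Optional

import torch


@dataclass
class SimpleMultimodalParams:
    multimodal_data: Dict[str, Any] = field(default_factory=dict)

    def has_content(self) -> bool:
        # An empty or missing payload means there is nothing to transfer.
        return bool(self.multimodal_data)

    def to_device(self, device: str) -> None:
        # Move only tensor values; leave non-tensor metadata untouched.
        self.multimodal_data = {
            key: value.to(device) if isinstance(value, torch.Tensor) else value
            for key, value in self.multimodal_data.items()
        }


@dataclass
class FakeRequest:
    py_multimodal_data: Optional[Dict[str, Any]] = None


def prepare_multimodal(request: FakeRequest,
                       device: str = "cuda") -> SimpleMultimodalParams:
    params = SimpleMultimodalParams(
        multimodal_data=request.py_multimodal_data or {})
    if params.has_content():
        params.to_device(device)
        # Re-assign so subsequent generation steps see the device tensors
        # instead of the original host copies.
        request.py_multimodal_data = params.multimodal_data
    return params
```

In the actual change the same guard wraps `to_device("multimodal_data", "cuda", pin_memory=True)` in both the context and generation paths; the point of the `has_content()` check is presumably to skip pinned-buffer allocation and host-to-device copies for text-only requests.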
15 changes: 3 additions & 12 deletions tensorrt_llm/executor/worker.py
@@ -486,7 +486,7 @@ def _deduce_max_tokens(request: GenerationRequest,
lora_config=lora_config,
prompt_tuning_config=prompt_tuning_config,
multimodal_input=multimodal_input,
#NOTE: `multimodal_embedding` and `mrope_config` will be in MultimodalParams.multimodal_data. And this will be handled below by `py_multimodal_data`.
# NOTE: `multimodal_embedding` and `mrope_config` will be in MultimodalParams.multimodal_data. And this will be handled below by `py_multimodal_data`.
multimodal_embedding=None,
mrope_config=None,
logits_post_processor_name=(
@@ -502,17 +502,8 @@ def _deduce_max_tokens(request: GenerationRequest,

if self._is_pytorch_backend and request.multimodal_params is not None:
if request.multimodal_params.multimodal_data is not None:
# Convert back to tensor, as opposite to `to_handle` in `llm.generate_async`
# for values with non-selected keys, it's no-op
request.multimodal_params.to_tensor(
"multimodal_data", key="multimodal_embedding")
embedding = request.multimodal_params.multimodal_data.get(
"multimodal_embedding")
if embedding is not None and embedding.is_cuda:
# make sure the embedding resides on the local device
request.multimodal_params.multimodal_data[
"multimodal_embedding"] = embedding.to("cuda")

# NOTE: Deserialize SharedTensor handle to actual tensor
request.multimodal_params.to_tensor("multimodal_data")
executor_request.py_multimodal_data = request.multimodal_params.multimodal_data

if self._is_pytorch_backend and request.sampling_params.logits_processor:
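The worker-side change replaces the key-specific handle conversion (which only touched `multimodal_embedding` and then special-cased CUDA placement) with a single `to_tensor("multimodal_data")` call that deserializes every shared-tensor handle in the payload before it is attached to the executor request. The sketch below shows the general shape of such a handle-to-tensor pass over a payload dict; `TensorHandle` and `deserialize_handle` are made-up stand-ins for illustration, not the actual SharedTensor API in tensorrt_llm.

```python
# Illustrative sketch: the real deserialization goes through
# MultimodalParams.to_tensor and the shared-tensor container in tensorrt_llm;
# TensorHandle and deserialize_handle here are hypothetical stand-ins.
from typing import Any, Dict

import torch


class TensorHandle:
    """Placeholder for a serialized (shared-memory / IPC) tensor handle."""

    def __init__(self, data: torch.Tensor):
        # For the sketch we just stash the tensor; a real handle would hold
        # shared-memory or CUDA IPC metadata instead of the tensor itself.
        self._data = data


def deserialize_handle(handle: TensorHandle) -> torch.Tensor:
    return handle._data


def to_tensor(payload: Dict[str, Any]) -> Dict[str, Any]:
    # Walk the whole payload (as the new to_tensor("multimodal_data") call
    # does for every key, not just "multimodal_embedding") and turn any
    # handle back into a tensor; everything else passes through unchanged.
    return {
        key: deserialize_handle(value) if isinstance(value, TensorHandle) else value
        for key, value in payload.items()
    }


# Usage: only handle-valued entries are converted.
payload = {"multimodal_embedding": TensorHandle(torch.zeros(2, 4)),
           "extra_metadata": "unchanged"}
payload = to_tensor(payload)
assert isinstance(payload["multimodal_embedding"], torch.Tensor)
```

Collapsing the per-key handling into one call also removes the special-case `embedding.is_cuda` device check that the old code carried for `multimodal_embedding`.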