@@ -487,7 +487,7 @@ def _deduce_max_tokens(request: GenerationRequest,
             lora_config=lora_config,
             prompt_tuning_config=prompt_tuning_config,
             multimodal_input=multimodal_input,
-            #NOTE: `multimodal_embedding` and `mrope_config` will be in MultimodalParams.multimodal_data. And this will be handled below by `py_multimodal_data`.
+            # NOTE: `multimodal_embedding` and `mrope_config` will be in MultimodalParams.multimodal_data. And this will be handled below by `py_multimodal_data`.
             multimodal_embedding=None,
             mrope_config=None,
             logits_post_processor_name=(
@@ -503,17 +503,8 @@ def _deduce_max_tokens(request: GenerationRequest,
 
         if self._is_pytorch_backend and request.multimodal_params is not None:
             if request.multimodal_params.multimodal_data is not None:
-                # Convert back to tensor, as opposite to `to_handle` in `llm.generate_async`
-                # for values with non-selected keys, it's no-op
-                request.multimodal_params.to_tensor(
-                    "multimodal_data", key="multimodal_embedding")
-                embedding = request.multimodal_params.multimodal_data.get(
-                    "multimodal_embedding")
-                if embedding is not None and embedding.is_cuda:
-                    # make sure the embedding resides on the local device
-                    request.multimodal_params.multimodal_data[
-                        "multimodal_embedding"] = embedding.to("cuda")
-
+                # NOTE: Deserialize SharedTensor handle to actual tensor
+                request.multimodal_params.to_tensor("multimodal_data")
                 executor_request.py_multimodal_data = request.multimodal_params.multimodal_data
 
         if self._is_pytorch_backend and request.sampling_params.logits_processor:
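For context on the simplification: the removed per-key code and the new single `to_tensor("multimodal_data")` call implement the same round trip. Tensors in `MultimodalParams.multimodal_data` are serialized into shared-tensor handles on the `llm.generate_async` side and deserialized back into tensors in the worker before being attached to `executor_request.py_multimodal_data`. The sketch below illustrates that pattern with a hypothetical `SimpleMultimodalParams` class and `_Handle` wrapper; it is not the actual TensorRT-LLM `MultimodalParams` API, and the CPU-backed handle only stands in for whatever shared-memory mechanism the real `SharedTensor` uses.

```python
# Minimal sketch of the handle <-> tensor round trip described in the diff
# comments. All names here (SimpleMultimodalParams, _Handle) are hypothetical
# illustrations, not the actual TensorRT-LLM implementation.
from dataclasses import dataclass, field
from typing import Any, Dict

import torch


@dataclass
class _Handle:
    """Stand-in for a shared-tensor/IPC handle; here it just wraps CPU storage."""
    payload: torch.Tensor


@dataclass
class SimpleMultimodalParams:
    multimodal_data: Dict[str, Any] = field(default_factory=dict)

    def to_handle(self, attr: str) -> None:
        # Serialize every tensor value into a handle (the sender side,
        # analogous to what happens around llm.generate_async).
        data = getattr(self, attr)
        for key, value in data.items():
            if isinstance(value, torch.Tensor):
                data[key] = _Handle(value.cpu())

    def to_tensor(self, attr: str) -> None:
        # Deserialize every handle back into a tensor (the worker side, before
        # the dict is attached to executor_request.py_multimodal_data).
        # Non-handle values such as mrope_config are left untouched, so the
        # call is a no-op for plain entries.
        data = getattr(self, attr)
        for key, value in data.items():
            if isinstance(value, _Handle):
                data[key] = value.payload


if __name__ == "__main__":
    params = SimpleMultimodalParams(
        multimodal_data={
            "multimodal_embedding": torch.randn(4, 8),
            "mrope_config": {"mrope_position_deltas": [0, 1, 2]},
        })
    params.to_handle("multimodal_data")  # sender side
    params.to_tensor("multimodal_data")  # receiver side
    assert isinstance(params.multimodal_data["multimodal_embedding"],
                      torch.Tensor)
```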