from tensorrt_llm.inputs import (ALL_SUPPORTED_MULTIMODAL_MODELS,
                                 default_multimodal_input_loader)

-example_images = [
-    "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png",
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
-    "https://huggingface.co/datasets/Sayali9141/traffic_signal_images/resolve/main/61.jpg",
-]
-example_image_prompts = [
-    "Describe the natural environment in the image.",
-    "Describe the object and the weather condition in the image.",
-    "Describe the traffic condition on the road in the image.",
-]
-example_videos = [
-    "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/OAI-sora-tokyo-walk.mp4",
-    "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/world.mp4",
-]
-example_video_prompts = [
-    "Tell me what you see in the video briefly.",
-    "Describe the scene in the video briefly.",
-]
+example_medias_and_prompts = {
+    "image": {
+        "media": [
+            "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png",
+            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
+            "https://huggingface.co/datasets/Sayali9141/traffic_signal_images/resolve/main/61.jpg",
+        ],
+        "prompt": [
+            "Describe the natural environment in the image.",
+            "Describe the object and the weather condition in the image.",
+            "Describe the traffic condition on the road in the image.",
+        ]
+    },
+    "video": {
+        "media": [
+            "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/OAI-sora-tokyo-walk.mp4",
+            "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/world.mp4",
+        ],
+        "prompt": [
+            "Tell me what you see in the video briefly.",
+            "Describe the scene in the video briefly.",
+        ]
+    },
+    "audio": {
+        "media": [
+            "https://huggingface.co/microsoft/Phi-4-multimodal-instruct/resolve/main/examples/what_is_the_traffic_sign_in_the_image.wav",
+            "https://huggingface.co/microsoft/Phi-4-multimodal-instruct/resolve/main/examples/what_is_shown_in_this_image.wav",
+        ],
+        "prompt": [
+            "Transcribe the audio clip into text, please don't add other text.",
+            "Transcribe the audio clip into text, please don't add other text.",
+        ]
+    },
+    "image_audio": {
+        "media": [
+            [
+                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
+                "https://huggingface.co/microsoft/Phi-4-multimodal-instruct/resolve/main/examples/what_is_shown_in_this_image.wav"
+            ],
+            [
+                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
+                "https://huggingface.co/microsoft/Phi-4-multimodal-instruct/resolve/main/examples/what_is_shown_in_this_image.wav"
+            ],
+        ],
+        "prompt": [
+            "Describe the scene in the image briefly.",
+            "",
+        ]
+    }
+}


def add_multimodal_args(parser):
@@ -34,7 +66,7 @@ def add_multimodal_args(parser):
                        help="Model type.")
    parser.add_argument("--modality",
                        type=str,
-                        choices=["image", "video"],
+                        choices=["image", "video", "audio", "image_audio"],
                        default="image",
                        help="Media type.")
    parser.add_argument("--media",
@@ -53,11 +85,24 @@ def add_multimodal_args(parser):
    return parser


+def add_lora_args(parser):
+    parser.add_argument("--load_lora",
+                        default=False,
+                        action='store_true',
+                        help="Whether to load the LoRA model.")
+    parser.add_argument("--auto_model_name",
+                        type=str,
+                        default=None,
+                        help="The auto model name in TRTLLM repo.")
+    return parser
+
+
def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Multimodal models with the PyTorch workflow.")
    parser = add_llm_args(parser)
    parser = add_multimodal_args(parser)
+    parser = add_lora_args(parser)
    args = parser.parse_args()

    args.disable_kv_cache_reuse = True  # kv cache reuse does not work for multimodal, force overwrite
@@ -71,11 +116,19 @@ def main():
    args = parse_arguments()
    # set prompts and media to example prompts and images if they are not provided
    if args.prompt is None:
-        args.prompt = example_image_prompts if args.modality == "image" else example_video_prompts
+        args.prompt = example_medias_and_prompts[args.modality]["prompt"]
    if args.media is None:
-        args.media = example_images if args.modality == "image" else example_videos
+        args.media = example_medias_and_prompts[args.modality]["media"]
+
+    lora_config = None
+    if args.load_lora:
+        assert args.auto_model_name is not None, "Please provide the auto model name to load LoRA config."
+        import importlib
+        models_module = importlib.import_module('tensorrt_llm._torch.models')
+        model_class = getattr(models_module, args.auto_model_name)
+        lora_config = model_class.lora_config(args.model_dir)

-    llm, sampling_params = setup_llm(args)
+    llm, sampling_params = setup_llm(args, lora_config=lora_config)

    image_format = args.image_format
    if args.model_type is not None:
@@ -96,7 +149,16 @@ def main():
                                             num_frames=args.num_frames,
                                             device=device)

-    outputs = llm.generate(inputs, sampling_params)
+    lora_request = None
+    if args.load_lora:
+        lora_request = model_class.lora_request(len(inputs), args.modality,
+                                                llm._hf_model_dir)
+
+    outputs = llm.generate(
+        inputs,
+        sampling_params,
+        lora_request=lora_request,
+    )

    for i, output in enumerate(outputs):
        prompt = args.prompt[i]
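
For reference, the new LoRA flags are exercised at runtime roughly as in the minimal sketch below. The sketch is not part of the commit: the class name Phi4MMForCausalLM and the checkpoint path are assumptions, standing in for whatever class exported from tensorrt_llm._torch.models provides the lora_config()/lora_request() helpers that main() now expects via --auto_model_name.

    import importlib

    # Resolve the auto model class named by --auto_model_name (assumed name here).
    models_module = importlib.import_module("tensorrt_llm._torch.models")
    model_class = getattr(models_module, "Phi4MMForCausalLM")  # assumption, not confirmed by the diff

    # Build the LoRA config from the HF checkpoint directory (placeholder path);
    # main() then forwards it through setup_llm(args, lora_config=lora_config)
    # and builds a per-input lora_request before llm.generate().
    lora_config = model_class.lora_config("/path/to/hf/checkpoint")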