
Commit 2a0ce00

[5464088][fix] Enhance LoRA support in PyTorch model configuration
- Added logging for dtype casting in LoraLayer to ensure compatibility with FP16/BF16.
- Updated model configuration to derive the number of LoRA adapters from the model label, improving flexibility in adapter management.

Signed-off-by: Venky Ganesh <[email protected]>
1 parent c4535e6 commit 2a0ce00

File tree

2 files changed: +21 -1 lines changed

tensorrt_llm/_torch/peft/lora/layer.py

Lines changed: 11 additions & 0 deletions
@@ -3,6 +3,8 @@
 
 import torch
 
+from tensorrt_llm._utils import logger
+
 
 class LoraModuleType(IntEnum):
     """Enum class representing different types of modules that can have LoRA adapters.
@@ -119,6 +121,15 @@ def forward(
         if len(active_lora_module_ids) == 0:
             return None
         else:
+            # Guard: LoRA custom op only supports FP16/BF16 activations.
+            # If upstream produced FP8 (e.g., FP8 SwiGLU), cast here to avoid runtime failure.
+            if x.dtype not in (torch.float16, torch.bfloat16):
+                target_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported(
+                ) else torch.float16
+                logger.debug(
+                    f"lora_grouped_gemm supports only FP16/BF16. Casting input from {x.dtype} to {target_dtype}."
+                )
+                x = x.to(target_dtype).contiguous()
             lora_outputs = torch.ops.trtllm.lora_grouped_gemm(
                 x,
                 lora_params['host_request_types'][:num_seqs],
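
For reference, the same dtype guard as a standalone sketch (a minimal illustration using only public PyTorch APIs; the helper name ensure_lora_activation_dtype is hypothetical and not part of this commit):

import torch

def ensure_lora_activation_dtype(x: torch.Tensor) -> torch.Tensor:
    # Hypothetical helper mirroring the guard above: the LoRA grouped GEMM
    # op accepts only FP16/BF16 activations, so anything else (e.g. FP8
    # coming out of an FP8 SwiGLU) is cast before the call.
    if x.dtype in (torch.float16, torch.bfloat16):
        return x
    # Prefer BF16 when a CUDA device supports it, otherwise fall back to FP16.
    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    target_dtype = torch.bfloat16 if use_bf16 else torch.float16
    return x.to(target_dtype).contiguous()

# An FP32 activation gets cast; an FP16 one passes through unchanged.
print(ensure_lora_activation_dtype(torch.randn(2, 4)).dtype)
print(ensure_lora_activation_dtype(torch.randn(2, 4, dtype=torch.float16)).dtype)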

tests/integration/defs/perf/pytorch_model_config.py

Lines changed: 10 additions & 1 deletion
@@ -181,10 +181,19 @@ def get_model_yaml_config(model_label: str,
 
     # lora-specific change for pytorch
     if 'pytorch' in model_label and 'loras' in model_label:
+        # Derive the requested number of adapters from model_label (segment like "loras:X")
+        lora_count = 1
+        for part in model_label.split('-'):
+            if part.startswith('loras:'):
+                lora_count = max(1, int(part.split(':', 1)[1]))
+                break
+
         lora_config = {
             'lora_config': {
                 'lora_dir': lora_dirs if lora_dirs is not None else [],
-                'max_lora_rank': 64
+                'max_lora_rank': 64,
+                'max_loras': lora_count,
+                'max_cpu_loras': lora_count,
             }
         }
     if 'phi_4_multimodal_instruct' in model_label:
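
As an illustration of the label parsing added above (the sample labels below are made up, not taken from the perf test list), the loop turns a "loras:X" segment of the dash-separated model label into the adapter count, defaulting to 1:

def lora_count_from_label(model_label: str) -> int:
    # Same logic as in get_model_yaml_config above: scan the dash-separated
    # label for a "loras:X" segment and clamp the count to at least one.
    lora_count = 1
    for part in model_label.split('-'):
        if part.startswith('loras:'):
            lora_count = max(1, int(part.split(':', 1)[1]))
            break
    return lora_count

# Hypothetical labels for illustration only:
print(lora_count_from_label('llama_v3-pytorch-bfloat16-loras:4'))  # -> 4
print(lora_count_from_label('llama_v3-pytorch-bfloat16'))          # -> 1 (default)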
