6 changes: 6 additions & 0 deletions cpp/tensorrt_llm/runtime/loraUtils.cpp
@@ -107,6 +107,12 @@ void loraValidateRequestTensors(std::optional<std::uint64_t> const& optTaskId,
TLLM_CHECK_WITH_INFO(it != loraModules.end(), "lora module " + moduleName + " not enabled for this model");
TLLM_CHECK_WITH_INFO(it->flattenedInOutSize(adapterSize, isDora) <= weights->getShape().d[2],
"lora_weights has to few values for " + moduleName);

auto expectedSize = it->flattenedInOutSize(adapterSize, isDora);
auto actualSize = weights->getShape().d[2];
TLLM_LOG_DEBUG("LoRA validation for %s - Expected: %d, Actual: %d, AdapterSize: %d, IsDora: %d",
moduleName.c_str(), expectedSize, actualSize, adapterSize, isDora);

Comment on lines 108 to +115

🛠️ Refactor suggestion

Logging after the check makes debugging impossible on failure; move the log before the check and reuse the computed sizes. Also fix the typo "to few" → "too few".

If the size check fails, the debug line never emits. Compute expected/actual once, log, then assert using the computed values. This also fixes the user-facing message grammar.

Apply this diff:

-            TLLM_CHECK_WITH_INFO(it->flattenedInOutSize(adapterSize, isDora) <= weights->getShape().d[2],
-                "lora_weights has to few values for " + moduleName);
-
-            auto expectedSize = it->flattenedInOutSize(adapterSize, isDora);
-            auto actualSize = weights->getShape().d[2];
-            TLLM_LOG_DEBUG("LoRA validation for %s - Expected: %d, Actual: %d, AdapterSize: %d, IsDora: %d",
-                moduleName.c_str(), expectedSize, actualSize, adapterSize, isDora);
+            auto const expectedSize = it->flattenedInOutSize(adapterSize, isDora);
+            auto const actualSize = weights->getShape().d[2];
+            TLLM_LOG_DEBUG(
+                "LoRA validation for %s - Expected: %d, Actual: %d, AdapterSize: %d, IsDora: %d",
+                moduleName.c_str(), expectedSize, actualSize, adapterSize, static_cast<int>(isDora));
+            TLLM_CHECK_WITH_INFO(expectedSize <= actualSize,
+                "lora_weights has too few values for " + moduleName);
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
TLLM_CHECK_WITH_INFO(it->flattenedInOutSize(adapterSize, isDora) <= weights->getShape().d[2],
"lora_weights has to few values for " + moduleName);
auto expectedSize = it->flattenedInOutSize(adapterSize, isDora);
auto actualSize = weights->getShape().d[2];
TLLM_LOG_DEBUG("LoRA validation for %s - Expected: %d, Actual: %d, AdapterSize: %d, IsDora: %d",
moduleName.c_str(), expectedSize, actualSize, adapterSize, isDora);
auto const expectedSize = it->flattenedInOutSize(adapterSize, isDora);
auto const actualSize = weights->getShape().d[2];
TLLM_LOG_DEBUG(
"LoRA validation for %s - Expected: %d, Actual: %d, AdapterSize: %d, IsDora: %d",
moduleName.c_str(), expectedSize, actualSize, adapterSize, static_cast<int>(isDora));
TLLM_CHECK_WITH_INFO(expectedSize <= actualSize,
"lora_weights has too few values for " + moduleName);
🤖 Prompt for AI Agents
In cpp/tensorrt_llm/runtime/loraUtils.cpp around lines 108 to 115, the debug log
is placed after the size check so it won't run on failure and the check message
contains a typo; compute expectedSize and actualSize once, move the
TLLM_LOG_DEBUG call to before TLLM_CHECK_WITH_INFO and pass the computed
expectedSize and actualSize (and adapterSize/isDora) to it, and change the
TLLM_CHECK_WITH_INFO message text from "to few" to "too few" while using the
already computed expectedSize/actualSize values in the check.

TLLM_CHECK_WITH_INFO(adapterSize <= maxAdapterSize,
"Invalid low_rank (" + std::to_string(adapterSize) + "). low_rank must be smaller than mMaxLowRank ("
+ std::to_string(maxAdapterSize) + ")");
63 changes: 59 additions & 4 deletions tensorrt_llm/_torch/model_config.py
@@ -441,21 +441,35 @@ def get_bindings_model_config(self,
model_config_cpp.set_num_kv_heads(num_kv_heads)

mlp_hidden_size = None
print(
f"DEBUG: Before if self.pretrained_config.intermediate_size is not None:"
)
if self.pretrained_config.intermediate_size is not None:
print(f"DEBUG: Intermediate size is not None")
mlp_hidden_size = self.pretrained_config.intermediate_size // self.mapping.tp_size
else:
print(f"DEBUG: Intermediate size is None")
# TODO: once tensorrt_llm._torch.AutoConfig is implemented, the following logic
# should be moved to tensorrt_llm._torch.AutoConfig of the relevant modeling_xxx file
if hasattr(self.pretrained_config, "architectures"
) and self.pretrained_config.architectures is not None:
architectures = self.pretrained_config.architectures
if len(architectures
) == 1 and architectures[0] == "DeciLMForCausalLM":
print(
f"DEBUG: Calling _infer_nemotron_ffn_mult for Nemotron model"
)
mlp_hidden_size = self._infer_nemotron_ffn_mult()
print(f"DEBUG: Final mlp_hidden_size: {mlp_hidden_size}")
print(f"DEBUG: TP size: {self.mapping.tp_size}")
print(
f"DEBUG: Expected mlp_hidden_size after TP: {mlp_hidden_size // self.mapping.tp_size}"
)
else:
raise ValueError(
f"Inferring mlp hidden size for model architecture: {architectures} isn't supported yet"
)
print(f"DEBUG: AFTER if mlp_hidden_size is None:")
if mlp_hidden_size is None:
raise ValueError(
f"Failed to infer mlp hidden size for model: {self.pretrained_config.model_type}"
@@ -474,6 +488,7 @@ def get_bindings_model_config(self,
head_size = hidden_size // num_heads

model_config_cpp.mlp_hidden_size = mlp_hidden_size
# model_config_cpp.coarse_mlp_hidden_size = self.coarse_mlp_hidden_size
model_config_cpp.size_per_head = head_size

# NOTE: this method is not robust, for Gemma3ForCausalLM only
@@ -488,18 +503,58 @@ def _infer_nemotron_ffn_mult(self):
# Nemotron-NAS has variable ffn_mult for each layer, we need to find the maximum
# so that we don't set a too small mlp_hidden_size. This solution leads to a memory
# consumption that is higher than required.
biggest_ffn_mult = max([
(x.ffn.ffn_mult if x.ffn.ffn_mult is not None else 0)
for x in self.pretrained_config.block_configs
])

print(
f"DEBUG: _infer_nemotron_ffn_mult - TP size: {self.mapping.tp_size}"
)
print(
f"DEBUG: _infer_nemotron_ffn_mult - Number of block_configs: {len(self.pretrained_config.block_configs)}"
)

ffn_mults = [(x.ffn.ffn_mult if x.ffn.ffn_mult is not None else 0)
for x in self.pretrained_config.block_configs]
print(f"DEBUG: _infer_nemotron_ffn_mult - All ffn_mults: {ffn_mults}")

biggest_ffn_mult = max(ffn_mults)
print(
f"DEBUG: _infer_nemotron_ffn_mult - Biggest ffn_mult: {biggest_ffn_mult}"
)

from tensorrt_llm._torch.models.modeling_nemotron_nas import \
_ffn_mult_to_intermediate_size
mlp_hidden_size = _ffn_mult_to_intermediate_size(
biggest_ffn_mult, self.pretrained_config.hidden_size)

print(
f"DEBUG: _infer_nemotron_ffn_mult - Calculated mlp_hidden_size: {mlp_hidden_size}"
)
print(
f"DEBUG: _infer_nemotron_ffn_mult - Hidden size: {self.pretrained_config.hidden_size}"
)

print(
f"DEBUG: _infer_nemotron_ffn_mult - Final TP-split mlp_hidden_size: {mlp_hidden_size}"
)
return mlp_hidden_size

@property
def coarse_mlp_hidden_size(self):
"""Get the MLP hidden size (TP-split) for LoRA padding calculations."""
if self.pretrained_config.intermediate_size is not None:
return self.pretrained_config.intermediate_size // self.mapping.tp_size
else:
# For Nemotron models, use the same logic as _infer_nemotron_ffn_mult
if (hasattr(self.pretrained_config, "architectures")
and self.pretrained_config.architectures is not None
and len(self.pretrained_config.architectures) == 1
and self.pretrained_config.architectures[0]
== "DeciLMForCausalLM"):
return self._infer_nemotron_ffn_mult()
else:
raise ValueError(
f"Failed to infer mlp hidden size for model: {self.pretrained_config.model_type}"
)

def get_layer_types(self) -> Optional[List[LayerTypeCpp]]:
"""
This method is a hack to support the effort to switch to KvCacheManagerCpp.
1 change: 1 addition & 0 deletions tensorrt_llm/_torch/pyexecutor/_util.py
@@ -467,6 +467,7 @@ def create_py_executor_instance(
# all layers have the same number of KV heads
num_kv_attention_heads = num_kv_attention_heads_per_layer[0]

# THEN UPDATE THE LoraModule.create_lora_modules CALL:
lora_modules = LoraModule.create_lora_modules(
lora_module_names=lora_config.lora_target_modules,
hidden_size=model_binding_config.hidden_size,
31 changes: 30 additions & 1 deletion tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -461,11 +461,40 @@ def runtime_draft_len(self):

def set_lora_model_config(self, lora_target_modules: list[str],
trtllm_modules_to_hf_modules: dict[str, str]):
coarse_mlp_hidden_size = None

# Debug: Check what type self.model.model_config is
print(
f"DEBUG: model_engine.py - self.model.model_config type: {type(self.model.model_config)}"
)
print(
f"DEBUG: model_engine.py - self.model.model_config dir: {dir(self.model.model_config)}"
)

if hasattr(self.model.model_config, 'coarse_mlp_hidden_size'):
coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
print(
f"DEBUG: model_engine.py - coarse_mlp_hidden_size: {coarse_mlp_hidden_size}"
)
else:
print(
f"DEBUG: model_engine.py - coarse_mlp_hidden_size property not found"
)
# Try direct access to see if it works
try:
coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
print(
f"DEBUG: model_engine.py - Direct access worked: {coarse_mlp_hidden_size}"
)
except AttributeError as e:
print(f"DEBUG: model_engine.py - Direct access failed: {e}")

Comment on lines +464 to +491

🛠️ Refactor suggestion

⚠️ Potential issue

Avoid hasattr on properties that may raise; replace prints with robust getattr + logger.debug.

coarse_mlp_hidden_size is a property that can raise ValueError for unsupported architectures. hasattr will evaluate the property and won’t catch ValueError, causing unexpected failures. Use getattr in a try/except, and use logger.debug instead of print.
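
For context, here is a minimal sketch (hypothetical class, not from this codebase) of the Python behavior being described: hasattr evaluates the property and only swallows AttributeError, so a ValueError escapes.

class FakeModelConfig:
    """Hypothetical stand-in for a config whose property may raise ValueError."""

    @property
    def coarse_mlp_hidden_size(self):
        # Mirrors the unsupported-architecture path described above.
        raise ValueError("unsupported architecture")

cfg = FakeModelConfig()

# hasattr() evaluates the property; it only catches AttributeError,
# so the ValueError propagates instead of returning False.
try:
    hasattr(cfg, "coarse_mlp_hidden_size")
except ValueError as e:
    print(f"hasattr did not protect us: {e}")

# Wrapping the access in try/except handles both failure modes.
try:
    value = cfg.coarse_mlp_hidden_size
except (AttributeError, ValueError):
    value = None
print(value)  # None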

Apply this diff:

-        coarse_mlp_hidden_size = None
-
-        # Debug: Check what type self.model.model_config is
-        print(
-            f"DEBUG: model_engine.py - self.model.model_config type: {type(self.model.model_config)}"
-        )
-        print(
-            f"DEBUG: model_engine.py - self.model.model_config dir: {dir(self.model.model_config)}"
-        )
-
-        if hasattr(self.model.model_config, 'coarse_mlp_hidden_size'):
-            coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
-            print(
-                f"DEBUG: model_engine.py - coarse_mlp_hidden_size: {coarse_mlp_hidden_size}"
-            )
-        else:
-            print(
-                f"DEBUG: model_engine.py - coarse_mlp_hidden_size property not found"
-            )
-            # Try direct access to see if it works
-            try:
-                coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
-                print(
-                    f"DEBUG: model_engine.py - Direct access worked: {coarse_mlp_hidden_size}"
-                )
-            except AttributeError as e:
-                print(f"DEBUG: model_engine.py - Direct access failed: {e}")
+        coarse_mlp_hidden_size = None
+        try:
+            coarse_mlp_hidden_size = getattr(self.model.model_config, 'coarse_mlp_hidden_size', None)
+            logger.debug("model_engine | coarse_mlp_hidden_size=%s", str(coarse_mlp_hidden_size))
+        except Exception as e:
+            # Property exists but inference failed; proceed without it.
+            logger.debug("model_engine | coarse_mlp_hidden_size unavailable: %s", e)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
coarse_mlp_hidden_size = None
# Debug: Check what type self.model.model_config is
print(
f"DEBUG: model_engine.py - self.model.model_config type: {type(self.model.model_config)}"
)
print(
f"DEBUG: model_engine.py - self.model.model_config dir: {dir(self.model.model_config)}"
)
if hasattr(self.model.model_config, 'coarse_mlp_hidden_size'):
coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
print(
f"DEBUG: model_engine.py - coarse_mlp_hidden_size: {coarse_mlp_hidden_size}"
)
else:
print(
f"DEBUG: model_engine.py - coarse_mlp_hidden_size property not found"
)
# Try direct access to see if it works
try:
coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
print(
f"DEBUG: model_engine.py - Direct access worked: {coarse_mlp_hidden_size}"
)
except AttributeError as e:
print(f"DEBUG: model_engine.py - Direct access failed: {e}")
coarse_mlp_hidden_size = None
try:
coarse_mlp_hidden_size = getattr(self.model.model_config, 'coarse_mlp_hidden_size', None)
logger.debug("model_engine | coarse_mlp_hidden_size=%s", str(coarse_mlp_hidden_size))
except Exception as e:
# Property exists but inference failed; proceed without it.
logger.debug("model_engine | coarse_mlp_hidden_size unavailable: %s", e)
🤖 Prompt for AI Agents
In tensorrt_llm/_torch/pyexecutor/model_engine.py around lines 464 to 491,
replace the current hasattr/print debug block with a single try/except that
reads coarse_mlp_hidden_size once via getattr (or direct attribute access) and
catches both ValueError and AttributeError so property access that raises is
handled; on success assign the value to coarse_mlp_hidden_size and call
logger.debug with the value, on exception set coarse_mlp_hidden_size = None and
log the failure with logger.debug (do not use print), ensuring you do not
evaluate the property twice and you handle both kinds of errors.

self.lora_model_config = LoraModelConfig(
lora_target_modules=lora_target_modules,
trtllm_modules_to_hf_modules=trtllm_modules_to_hf_modules,
hidden_size=self.model.config.hidden_size,
dtype=torch_dtype_to_str(self.model.config.torch_dtype))
dtype=torch_dtype_to_str(self.model.config.torch_dtype),
coarse_mlp_hidden_size=coarse_mlp_hidden_size)

@property
def use_mrope(self):
106 changes: 100 additions & 6 deletions tensorrt_llm/lora_manager.py
@@ -243,6 +243,7 @@ class LoraModelConfig:
trtllm_modules_to_hf_modules: dict[str, str]
hidden_size: int
dtype: str
coarse_mlp_hidden_size: Optional[int] = None


class HfLoraLoader:
@@ -1133,14 +1134,107 @@ def load_from_model_dir(uid, model_dir, hf_config):
)
)

max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
self._cpp_lora_weights[uid] = torch.stack(
[
torch.nn.functional.pad(w, (0, max_weight_size - w.size(0)))
for w in self._cpp_lora_weights[uid]
]
# Handle both ModelConfig and LoraModelConfig types
print(f"DEBUG: model_config type: {type(model_config)}")
print(
f"DEBUG: model_config has coarse_mlp_hidden_size: {hasattr(model_config, 'coarse_mlp_hidden_size')}"
)
print(
f"DEBUG: model_config.coarse_mlp_hidden_size value: {model_config.coarse_mlp_hidden_size}"
)
print(
f"DEBUG: model_config.coarse_mlp_hidden_size is None: {model_config.coarse_mlp_hidden_size is None}"
)

Comment on lines +1137 to +1148

💡 Verification agent

🧩 Analysis chain

Replace print-based debugging with logger.debug and trim verbosity.

Using print in library code is noisy and hard to control. Prefer logger.debug (already configured as logger = logging.getLogger(__name__)). Collapse multiple prints into a single succinct line.
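
For reference, a minimal sketch of the suggested pattern (module and function names are illustrative only); with %-style arguments the formatting cost is paid only when DEBUG is actually enabled:

import logging

# Module-level logger, as the comment above assumes is already configured.
logger = logging.getLogger(__name__)

def report_model_config(model_config):
    # One consolidated, lazily formatted debug line instead of several print() calls.
    logger.debug(
        "LoRA padding | model_config=%s | coarse=%s",
        type(model_config).__name__,
        getattr(model_config, "coarse_mlp_hidden_size", None),
    )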

Apply this diff:

-            print(f"DEBUG: model_config type: {type(model_config)}")
-            print(
-                f"DEBUG: model_config has coarse_mlp_hidden_size: {hasattr(model_config, 'coarse_mlp_hidden_size')}"
-            )
-            print(
-                f"DEBUG: model_config.coarse_mlp_hidden_size value: {model_config.coarse_mlp_hidden_size}"
-            )
-            print(
-                f"DEBUG: model_config.coarse_mlp_hidden_size is None: {model_config.coarse_mlp_hidden_size is None}"
-            )
+            logger.debug(
+                "LoRA padding | model_config=%s | has_coarse=%s | coarse=%s",
+                type(model_config),
+                hasattr(model_config, "coarse_mlp_hidden_size"),
+                getattr(model_config, "coarse_mlp_hidden_size", None),
+            )

Run to confirm no stray prints remain in lora_manager/model_engine:


🏁 Script executed:

#!/bin/bash
rg -nP '^\s*print\(' tensorrt_llm/lora_manager.py tensorrt_llm/_torch/pyexecutor/model_engine.py

Length of output: 3027


Mandate: Replace all print-based debugging with logger.debug across Lora Manager and Model Engine

Your grep shows numerous stray print(...) calls—both in tensorrt_llm/lora_manager.py and in tensorrt_llm/_torch/pyexecutor/model_engine.py. These should all be converted to controlled, leveled logging rather than uncontrolled stdout.

• Files and ranges needing refactor:

  • tensorrt_llm/lora_manager.py: lines ~1138–1154, 1162–1235
  • tensorrt_llm/_torch/pyexecutor/model_engine.py: lines ~467–490

• Primary change (example for LoraManager snippet at 1137–1148):

-            print(f"DEBUG: model_config type: {type(model_config)}")
-            print(
-                f"DEBUG: model_config has coarse_mlp_hidden_size: {hasattr(model_config, 'coarse_mlp_hidden_size')}"
-            )
-            print(
-                f"DEBUG: model_config.coarse_mlp_hidden_size value: {model_config.coarse_mlp_hidden_size}"
-            )
-            print(
-                f"DEBUG: model_config.coarse_mlp_hidden_size is None: {model_config.coarse_mlp_hidden_size is None}"
-            )
+            logger.debug(
+                "LoRA padding | model_config=%s | has_coarse=%s | coarse=%s",
+                type(model_config),
+                hasattr(model_config, "coarse_mlp_hidden_size"),
+                getattr(model_config, "coarse_mlp_hidden_size", None),
+            )

Apply the same pattern to all other debug prints:

  • Consolidate related prints into single logger.debug calls with structured arguments.
  • Ensure you import and use the module logger (logger = logging.getLogger(__name__)).
  • Remove any remaining print(...) debug statements.
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Handle both ModelConfig and LoraModelConfig types
print(f"DEBUG: model_config type: {type(model_config)}")
print(
f"DEBUG: model_config has coarse_mlp_hidden_size: {hasattr(model_config, 'coarse_mlp_hidden_size')}"
)
print(
f"DEBUG: model_config.coarse_mlp_hidden_size value: {model_config.coarse_mlp_hidden_size}"
)
print(
f"DEBUG: model_config.coarse_mlp_hidden_size is None: {model_config.coarse_mlp_hidden_size is None}"
)
# Handle both ModelConfig and LoraModelConfig types
logger.debug(
"LoRA padding | model_config=%s | has_coarse=%s | coarse=%s",
type(model_config),
hasattr(model_config, "coarse_mlp_hidden_size"),
getattr(model_config, "coarse_mlp_hidden_size", None),
)
🤖 Prompt for AI Agents
In tensorrt_llm/lora_manager.py around lines 1137 to 1148 (and extend the same
change across the other indicated ranges ~1162–1235) replace the ad-hoc
print(...) debug statements with structured logger.debug calls: add or ensure a
module-level logger = logging.getLogger(__name__) import logging at top if
missing, consolidate related prints into single logger.debug invocations with
clear messages and variables (e.g., logger.debug("model_config type=%s
coarse_mlp_hidden_size=%s is_none=%s", type(model_config), getattr(model_config,
"coarse_mlp_hidden_size", None), getattr(model_config, "coarse_mlp_hidden_size",
None) is None)), and remove all remaining print(...) calls; apply the identical
replacement pattern to tensorrt_llm/_torch/pyexecutor/model_engine.py lines
~467–490 so no print-based debugging remains.

# Handle both ModelConfig and LoraModelConfig types
if (
hasattr(model_config, "coarse_mlp_hidden_size")
and model_config.coarse_mlp_hidden_size is not None
):
print(
f"DEBUG: INSIDE load_from_hf: model_config.coarse_mlp_hidden_size: "
f"{model_config.coarse_mlp_hidden_size}"
)
M_coarse = model_config.coarse_mlp_hidden_size
H = model_config.hidden_size
rank = int(hf_config["r"])

print(f"DEBUG: load_from_hf - M_coarse: {M_coarse}")
print(f"DEBUG: load_from_hf - tp_size: {tp_size}")
print(f"DEBUG: load_from_hf - H (hidden_size): {H}")
print(f"DEBUG: load_from_hf - rank: {rank}")

M_coarse_tp = M_coarse * tp_size
max_weight_size = rank * M_coarse_tp + rank * H

Comment on lines +1167 to +1169

⚠️ Potential issue

TP double-counting in max_weight_size; coarse_mlp_hidden_size is already TP-split.

coarse_mlp_hidden_size returns the TP-split M value. Multiplying by tp_size again inflates padding by tp_size, wasting memory significantly (especially on 49B). Compute with the TP-split value directly.
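
To make the inflation concrete, a rough back-of-the-envelope sketch with purely hypothetical numbers (rank, hidden size, and TP-split intermediate size below are placeholders, not the real 49B values):

rank = 32          # LoRA rank (illustrative)
H = 8192           # hidden_size (illustrative)
M_tp = 16384       # coarse_mlp_hidden_size, already divided by tp_size (illustrative)
tp_size = 4

correct = rank * (M_tp + H)              # 786432 values per module
inflated = rank * (M_tp * tp_size + H)   # 2359296 values per module
print(inflated / correct)                # ~3x more padding than needed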

Apply this diff:

-                M_coarse_tp = M_coarse * tp_size
-                max_weight_size = rank * M_coarse_tp + rank * H
+                # coarse_mlp_hidden_size is already TP-split; do not multiply by tp_size again.
+                M_coarse_tp = M_coarse
+                # For MLP up/gate or down, flattened size per module is rank*(M_coarse_tp + H).
+                # This upper-bounds attention modules too since M_coarse_tp >= H in typical configs.
+                calc_max_weight_size = rank * (M_coarse_tp + H)

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In tensorrt_llm/lora_manager.py around lines 1167 to 1169, the code multiplies
the TP-split coarse_mlp_hidden_size (M_coarse) by tp_size (M_coarse_tp) and then
uses that to compute max_weight_size, which double-counts tensor-parallel
splitting and inflates padding; remove the extra multiplication and compute
max_weight_size using the TP-split value directly (drop M_coarse_tp and use
M_coarse in the expression so max_weight_size = rank * M_coarse + rank * H).

print(f"DEBUG: load_from_hf - M_coarse_tp: {M_coarse_tp}")
print(
f"DEBUG: load_from_hf - max_weight_size calculation: "
f"{rank} * {M_coarse_tp} + {rank} * {H} = {max_weight_size}"
)

# Debug actual weights before padding
print(
f"DEBUG: load_from_hf - Number of weight tensors: {len(self._cpp_lora_weights[uid])}"
)
for i, w in enumerate(self._cpp_lora_weights[uid]):
print(
f"DEBUG: load_from_hf - Weight {i} shape: {w.shape}, size(0): {w.size(0)}"
)

# Debug the actual maximum weight size
actual_max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
print(f"DEBUG: load_from_hf - Actual max weight size: {actual_max_weight_size}")
print(f"DEBUG: load_from_hf - Calculated max_weight_size: {max_weight_size}")
print(
f"DEBUG: load_from_hf - Difference: {max_weight_size - actual_max_weight_size}"
)
Comment on lines +1186 to +1191

⚠️ Potential issue

Guard against negative padding; ensure computed max covers actual max.

If calc_max_weight_size underestimates (e.g., DoRA adds a magnitude vector), padding_needed becomes negative and F.pad silently crops the weight tensor instead of padding it. Use the actual maximum size as a floor.
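
A standalone sketch (made-up sizes) of why the floor matters; note that in recent PyTorch versions a negative pad value crops rather than raising, so an underestimated maximum would silently truncate weights:

import torch
import torch.nn.functional as F

w = torch.arange(6.0)             # pretend flattened LoRA weights, 6 values
calc_max_weight_size = 4          # underestimated target size

padding_needed = calc_max_weight_size - w.size(0)   # -2
cropped = F.pad(w, (0, padding_needed))             # negative pad crops to 4 values
print(cropped.shape)                                # torch.Size([4])

# Flooring the target by the observed maximum keeps padding non-negative.
max_weight_size = max(calc_max_weight_size, w.size(0))
safe = F.pad(w, (0, max_weight_size - w.size(0)))
print(safe.shape)                                   # torch.Size([6])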

Apply this diff:

-                actual_max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
-                print(f"DEBUG: load_from_hf - Actual max weight size: {actual_max_weight_size}")
-                print(f"DEBUG: load_from_hf - Calculated max_weight_size: {max_weight_size}")
-                print(
-                    f"DEBUG: load_from_hf - Difference: {max_weight_size - actual_max_weight_size}"
-                )
+                actual_max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
+                # Account for possible DoRA magnitude vector (+M_coarse_tp) by taking the max with actual.
+                max_weight_size = max(calc_max_weight_size, actual_max_weight_size)
+                logger.debug(
+                    "LoRA padding | calc=%d actual=%d final=%d",
+                    calc_max_weight_size, actual_max_weight_size, max_weight_size
+                )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
actual_max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
print(f"DEBUG: load_from_hf - Actual max weight size: {actual_max_weight_size}")
print(f"DEBUG: load_from_hf - Calculated max_weight_size: {max_weight_size}")
print(
f"DEBUG: load_from_hf - Difference: {max_weight_size - actual_max_weight_size}"
)
actual_max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
# Account for possible DoRA magnitude vector (+M_coarse_tp) by taking the max with actual.
max_weight_size = max(calc_max_weight_size, actual_max_weight_size)
logger.debug(
"LoRA padding | calc=%d actual=%d final=%d",
calc_max_weight_size, actual_max_weight_size, max_weight_size
)
🤖 Prompt for AI Agents
In tensorrt_llm/lora_manager.py around lines 1186 to 1191, the computed
max_weight_size can be less than the actual_max_weight_size causing negative
padding_needed and F.pad to fail; before padding, ensure max_weight_size is
floored to the observed actual maximum (e.g., max_weight_size =
max(max_weight_size, actual_max_weight_size)) and/or compute padding_needed =
max(0, max_weight_size - w.size(0)) so padding_needed is never negative.


# Debug module-specific sizes
print(
f"DEBUG: load_from_hf - Number of modules: {len(self._cpp_lora_weights[uid])}"
)
print("DEBUG: load_from_hf - Module sizes by index:")
for i, w in enumerate(self._cpp_lora_weights[uid]):
print(f"DEBUG: load_from_hf - Module {i}: {w.size(0)}")

# Debug which modules are failing
print("DEBUG: load_from_hf - Checking which modules might fail validation:")
for i, w in enumerate(self._cpp_lora_weights[uid]):
if w.size(0) < max_weight_size:
print(
f"DEBUG: load_from_hf - Module {i} will be padded: {w.size(0)} -> {max_weight_size}"
)
else:
print(f"DEBUG: load_from_hf - Module {i} no padding needed: {w.size(0)}")

else:
# Final fallback: use the maximum size of actual weights
max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
print(f"DEBUG: load_from_hf - Using fallback max_weight_size: {max_weight_size}")

print(f"DEBUG: load_from_hf - Final max_weight_size: {max_weight_size}")

# Debug padding process
padded_weights = []
for i, w in enumerate(self._cpp_lora_weights[uid]):
padding_needed = max_weight_size - w.size(0)
print(
f"DEBUG: load_from_hf - Weight {i}: original size {w.size(0)}, padding {padding_needed}"
)
padded_w = torch.nn.functional.pad(w, (0, padding_needed))
print(f"DEBUG: load_from_hf - Weight {i}: padded size {padded_w.size(0)}")
padded_weights.append(padded_w)

Comment on lines +1219 to +1228

🛠️ Refactor suggestion

Safer padding: clamp to zero and avoid per-weight print loops.

Even with the fix above, clamping is a cheap safeguard against negative padding on rare edge cases. Replace the prints with a single debug log if needed.

Apply this diff:

-            padded_weights = []
-            for i, w in enumerate(self._cpp_lora_weights[uid]):
-                padding_needed = max_weight_size - w.size(0)
-                print(
-                    f"DEBUG: load_from_hf - Weight {i}: original size {w.size(0)}, padding {padding_needed}"
-                )
-                padded_w = torch.nn.functional.pad(w, (0, padding_needed))
-                print(f"DEBUG: load_from_hf - Weight {i}: padded size {padded_w.size(0)}")
-                padded_weights.append(padded_w)
+            padded_weights = []
+            for i, w in enumerate(self._cpp_lora_weights[uid]):
+                padding_needed = max_weight_size - w.size(0)
+                if padding_needed < 0:
+                    # Should not happen with the guard above; fall back defensively.
+                    logger.warning("LoRA padding | negative padding=%d at idx=%d; widening to actual size", padding_needed, i)
+                    padding_needed = 0
+                padded_weights.append(torch.nn.functional.pad(w, (0, padding_needed)))
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
padded_weights = []
for i, w in enumerate(self._cpp_lora_weights[uid]):
padding_needed = max_weight_size - w.size(0)
print(
f"DEBUG: load_from_hf - Weight {i}: original size {w.size(0)}, padding {padding_needed}"
)
padded_w = torch.nn.functional.pad(w, (0, padding_needed))
print(f"DEBUG: load_from_hf - Weight {i}: padded size {padded_w.size(0)}")
padded_weights.append(padded_w)
padded_weights = []
for i, w in enumerate(self._cpp_lora_weights[uid]):
padding_needed = max_weight_size - w.size(0)
if padding_needed < 0:
# Should not happen with the guard above; fall back defensively.
logger.warning(
"LoRA padding | negative padding=%d at idx=%d; widening to actual size",
padding_needed,
i,
)
padding_needed = 0
padded_weights.append(torch.nn.functional.pad(w, (0, padding_needed)))
🤖 Prompt for AI Agents
In tensorrt_llm/lora_manager.py around lines 1219 to 1228, the per-weight debug
prints and unguarded padding can cause noisy logs and crashes when
padding_needed is negative; change padding_needed to be clamped to zero (e.g.
use max(padding_needed, 0) or torch.clamp_min) before calling pad, remove the
per-weight print statements inside the loop, and instead emit a single debug
summarizing counts/sizes after the loop (or conditionally log once when any
padding occurred). Ensure padded_weights.append is still executed and behavior
unchanged when padding_needed is zero.

self._cpp_lora_weights[uid] = torch.stack(padded_weights)
print(
f"DEBUG: load_from_hf - Final stacked weights shape: {self._cpp_lora_weights[uid].shape}"
)

self._cpp_lora_config[uid] = torch.stack([c for c in self._cpp_lora_config[uid]])
print(
f"DEBUG: load_from_hf - Final stacked config shape: {self._cpp_lora_config[uid].shape}"
)

for uid, model_dir, hf_config in zip(new_uids, new_model_dirs, lora_hf_configs):
load_from_model_dir(uid, model_dir, hf_config)
74 changes: 74 additions & 0 deletions tests/integration/defs/examples/test_nemotron_nas.py
@@ -1,10 +1,16 @@
from pathlib import Path

import defs.ci_profiler
import pytest
from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
from defs.conftest import get_device_memory, get_sm_version
from defs.trt_test_alternative import check_call

from tensorrt_llm import LLM
from tensorrt_llm.executor.request import LoRARequest
from tensorrt_llm.lora_manager import LoraConfig
from tensorrt_llm.sampling_params import SamplingParams

# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
@@ -122,3 +128,71 @@ def test_nemotron_nas_summary_2gpu(nemotron_nas_example_root, llm_venv,
]

venv_mpi_check_call(llm_venv, mpi_cmd, summary_cmd)


@pytest.mark.skip_less_device(4)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("nemotron_nas_model_root", [
"Llama-3_3-Nemotron-Super-49B-v1",
],
indirect=True)
def test_nemotron_super_49b_real_lora_torch(nemotron_nas_example_root, llm_venv,
nemotron_nas_model_root,
llm_datasets_root, llm_rouge_root,
engine_dir, cmodel_dir):
"""Run Nemotron Super 49B with real LoRA adapters using LLM-API Torch backend."""

print("Testing Nemotron Super 49B with real LoRA adapters...")

lora_adapter_path = f"/code/tensorrt_llm/llama-3.3-nemotron-super-49b-v1/llama-3.3-nemotron-super-49b-v1_vlora-1a2cb80-v2"
print(f"Using real LoRA from: {lora_adapter_path}")

defs.ci_profiler.start("test_nemotron_real_lora_torch")

lora_config = LoraConfig(
lora_dir=[lora_adapter_path],
max_lora_rank=32, # From adapter_config.json: "r": 32
max_loras=1,
max_cpu_loras=1,
)

with LLM(model=nemotron_nas_model_root,
lora_config=lora_config,
tensor_parallel_size=4,
dtype="bfloat16",
max_batch_size=2,
max_input_len=512,
max_seq_len=1024,
max_beam_width=1) as llm:

prompts = [
"What is the capital of France?",
"Explain quantum computing in simple terms."
]

sampling_params = SamplingParams(max_tokens=50,
temperature=0.7,
top_p=0.9)

lora_request = [LoRARequest("nemotron-lora", 0, lora_adapter_path)]

print("Running inference with real LoRA adapter...")
outputs = llm.generate(prompts,
sampling_params,
lora_request=lora_request)

for i, output in enumerate(outputs):
print(f"Prompt {i+1}: {prompts[i]}")
print(f"Response {i+1}: {output.outputs[0].text}")
print("-" * 50)

assert len(outputs) == 2
assert len(outputs[0].outputs) > 0
assert len(outputs[1].outputs) > 0
assert len(outputs[0].outputs[0].text) > 0
assert len(outputs[1].outputs[0].text) > 0

defs.ci_profiler.stop("test_nemotron_real_lora_torch")
print(
f"test_nemotron_real_lora_torch: {defs.ci_profiler.elapsed_time_in_sec('test_nemotron_real_lora_torch')} sec"
)
Loading