diff --git a/cpp/tensorrt_llm/runtime/loraUtils.cpp b/cpp/tensorrt_llm/runtime/loraUtils.cpp
index d897a3195c2..3aff55c0c5e 100644
--- a/cpp/tensorrt_llm/runtime/loraUtils.cpp
+++ b/cpp/tensorrt_llm/runtime/loraUtils.cpp
@@ -107,6 +107,12 @@ void loraValidateRequestTensors(std::optional const& optTaskId,
         TLLM_CHECK_WITH_INFO(it != loraModules.end(), "lora module " + moduleName + " not enabled for this model");
         TLLM_CHECK_WITH_INFO(it->flattenedInOutSize(adapterSize, isDora) <= weights->getShape().d[2],
             "lora_weights has to few values for " + moduleName);
+
+        auto expectedSize = it->flattenedInOutSize(adapterSize, isDora);
+        auto actualSize = weights->getShape().d[2];
+        TLLM_LOG_DEBUG("LoRA validation for %s - expected size: %ld, actual size: %ld, adapterSize: %d, isDora: %d",
+            moduleName.c_str(), static_cast<long>(expectedSize), static_cast<long>(actualSize), adapterSize, static_cast<int>(isDora));
+
         TLLM_CHECK_WITH_INFO(adapterSize <= maxAdapterSize,
             "Invalid low_rank (" + std::to_string(adapterSize) + "). low_rank must be smaller than mMaxLowRank ("
                 + std::to_string(maxAdapterSize) + ")");
diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py
index 7e310f934ac..783b1b007b1 100644
--- a/tensorrt_llm/_torch/model_config.py
+++ b/tensorrt_llm/_torch/model_config.py
@@ -441,9 +441,14 @@ def get_bindings_model_config(self,
         model_config_cpp.set_num_kv_heads(num_kv_heads)
 
         mlp_hidden_size = None
+        print(
+            "DEBUG: Before 'pretrained_config.intermediate_size is not None' check"
+        )
         if self.pretrained_config.intermediate_size is not None:
+            print("DEBUG: Intermediate size is not None")
             mlp_hidden_size = self.pretrained_config.intermediate_size // self.mapping.tp_size
         else:
+            print("DEBUG: Intermediate size is None")
             # TODO: once tensorrt_llm._torch.AutoConfig is implemented, the following logic
             # should be moved to tensorrt_llm._torch.AutoConfig of the relevant modeling_xxx file
             if hasattr(self.pretrained_config, "architectures"
@@ -451,11 +456,20 @@ def get_bindings_model_config(self,
                 architectures = self.pretrained_config.architectures
                 if len(architectures
                        ) == 1 and architectures[0] == "DeciLMForCausalLM":
+                    print(
+                        "DEBUG: Calling _infer_nemotron_ffn_mult for Nemotron model"
+                    )
                     mlp_hidden_size = self._infer_nemotron_ffn_mult()
+                    print(f"DEBUG: Final mlp_hidden_size: {mlp_hidden_size}")
+                    print(f"DEBUG: TP size: {self.mapping.tp_size}")
+                    print(
+                        f"DEBUG: mlp_hidden_size // tp_size would be: {mlp_hidden_size // self.mapping.tp_size}"
+                    )
                 else:
                     raise ValueError(
                         f"Inferring mlp hidden size for model architecture: {architectures} isn't supported yet"
                     )
+            print("DEBUG: After architecture-based mlp_hidden_size inference")
         if mlp_hidden_size is None:
             raise ValueError(
                 f"Failed to infer mlp hidden size for model: {self.pretrained_config.model_type}"
@@ -474,6 +488,7 @@ def get_bindings_model_config(self,
             head_size = hidden_size // num_heads
 
         model_config_cpp.mlp_hidden_size = mlp_hidden_size
+        # model_config_cpp.coarse_mlp_hidden_size = self.coarse_mlp_hidden_size
         model_config_cpp.size_per_head = head_size
 
         # NOTE: this method is not robust, for Gemma3ForCausalLM only
@@ -488,18 +503,58 @@ def _infer_nemotron_ffn_mult(self):
         # Nemotron-NAS has variable ffn_mult for each layer, we need to find the maximum
         # so that we don't set a too small mlp_hidden_size. This solution leads to a memory
         # consumption that is higher than required.
-        biggest_ffn_mult = max([
-            (x.ffn.ffn_mult if x.ffn.ffn_mult is not None else 0)
-            for x in self.pretrained_config.block_configs
-        ])
+
+        print(
+            f"DEBUG: _infer_nemotron_ffn_mult - TP size: {self.mapping.tp_size}"
+        )
+        print(
+            f"DEBUG: _infer_nemotron_ffn_mult - Number of block_configs: {len(self.pretrained_config.block_configs)}"
+        )
+
+        ffn_mults = [(x.ffn.ffn_mult if x.ffn.ffn_mult is not None else 0)
+                     for x in self.pretrained_config.block_configs]
+        print(f"DEBUG: _infer_nemotron_ffn_mult - All ffn_mults: {ffn_mults}")
+
+        biggest_ffn_mult = max(ffn_mults)
+        print(
+            f"DEBUG: _infer_nemotron_ffn_mult - Biggest ffn_mult: {biggest_ffn_mult}"
+        )
 
         from tensorrt_llm._torch.models.modeling_nemotron_nas import \
             _ffn_mult_to_intermediate_size
 
         mlp_hidden_size = _ffn_mult_to_intermediate_size(
             biggest_ffn_mult, self.pretrained_config.hidden_size)
+        print(
+            f"DEBUG: _infer_nemotron_ffn_mult - Calculated mlp_hidden_size: {mlp_hidden_size}"
+        )
+        print(
+            f"DEBUG: _infer_nemotron_ffn_mult - Hidden size: {self.pretrained_config.hidden_size}"
+        )
+
+        print(
+            f"DEBUG: _infer_nemotron_ffn_mult - Returned mlp_hidden_size (not TP-split): {mlp_hidden_size}"
+        )
 
         return mlp_hidden_size
 
+    @property
+    def coarse_mlp_hidden_size(self):
+        """Return the TP-split MLP hidden size used for LoRA weight padding."""
+        if self.pretrained_config.intermediate_size is not None:
+            return self.pretrained_config.intermediate_size // self.mapping.tp_size
+        else:
+            # For Nemotron models, reuse _infer_nemotron_ffn_mult and split across TP ranks
+            if (hasattr(self.pretrained_config, "architectures")
+                    and self.pretrained_config.architectures is not None
+                    and len(self.pretrained_config.architectures) == 1
+                    and self.pretrained_config.architectures[0]
+                    == "DeciLMForCausalLM"):
+                return self._infer_nemotron_ffn_mult() // self.mapping.tp_size
+            else:
+                raise ValueError(
+                    f"Failed to infer mlp hidden size for model: {self.pretrained_config.model_type}"
+                )
+
     def get_layer_types(self) -> Optional[List[LayerTypeCpp]]:
         """
         This method is a hack to support the effort to switch to KvCacheManagerCpp.
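
For reference, the relationship the model_config.py changes above rely on can be sketched as follows. This is a minimal illustration only: it assumes a DeciLM-style rounding rule (roughly 2/3 * ffn_mult * hidden_size, rounded up to a multiple of 256), and the block-config values and sizes are made up. The authoritative implementation is `_ffn_mult_to_intermediate_size` in `tensorrt_llm/_torch/models/modeling_nemotron_nas.py`.

```python
import math


def ffn_mult_to_intermediate_size(ffn_mult: float, hidden_size: int, multiple_of: int = 256) -> int:
    # Assumed DeciLM-style rule: scale the hidden size by 2/3 * ffn_mult, then round
    # up to a hardware-friendly multiple. The real helper may differ in detail.
    raw = int(2 * ffn_mult * hidden_size / 3)
    return multiple_of * math.ceil(raw / multiple_of)


# Hypothetical Nemotron-NAS-style per-block multipliers; blocks without an FFN count as 0.
ffn_mults = [2.625, 5.25, 3.28125, 0.0]
hidden_size, tp_size = 8192, 4  # illustrative sizes only

full_mlp_hidden = ffn_mult_to_intermediate_size(max(ffn_mults), hidden_size)
per_rank_mlp_hidden = full_mlp_hidden // tp_size  # the TP-split value coarse_mlp_hidden_size aims to expose
print(full_mlp_hidden, per_rank_mlp_hidden)
```

Taking the maximum ffn_mult, as the comment in `_infer_nemotron_ffn_mult` explains, guarantees the padded size is large enough for every layer at the cost of some extra memory.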
diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py
index ca224f0b2cd..a78a4a6df03 100644
--- a/tensorrt_llm/_torch/pyexecutor/_util.py
+++ b/tensorrt_llm/_torch/pyexecutor/_util.py
@@ -467,6 +467,7 @@ def create_py_executor_instance(
         # all layers have the same number of KV heads
         num_kv_attention_heads = num_kv_attention_heads_per_layer[0]
 
+    # TODO: update the LoraModule.create_lora_modules call below to account for the coarse (TP-split) MLP hidden size.
     lora_modules = LoraModule.create_lora_modules(
         lora_module_names=lora_config.lora_target_modules,
         hidden_size=model_binding_config.hidden_size,
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 22a53c4666f..b52e38dfd66 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -461,11 +461,40 @@ def runtime_draft_len(self):
 
     def set_lora_model_config(self, lora_target_modules: list[str],
                               trtllm_modules_to_hf_modules: dict[str, str]):
+        coarse_mlp_hidden_size = None
+
+        # Debug: check which config type self.model.model_config actually is.
+        print(
+            f"DEBUG: model_engine.py - self.model.model_config type: {type(self.model.model_config)}"
+        )
+        print(
+            f"DEBUG: model_engine.py - self.model.model_config dir: {dir(self.model.model_config)}"
+        )
+
+        if hasattr(self.model.model_config, 'coarse_mlp_hidden_size'):
+            coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
+            print(
+                f"DEBUG: model_engine.py - coarse_mlp_hidden_size: {coarse_mlp_hidden_size}"
+            )
+        else:
+            print(
+                "DEBUG: model_engine.py - coarse_mlp_hidden_size property not found"
+            )
+            # Try direct access to see whether the attribute exists anyway.
+            try:
+                coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
+                print(
+                    f"DEBUG: model_engine.py - Direct access worked: {coarse_mlp_hidden_size}"
+                )
+            except AttributeError as e:
+                print(f"DEBUG: model_engine.py - Direct access failed: {e}")
+
         self.lora_model_config = LoraModelConfig(
             lora_target_modules=lora_target_modules,
             trtllm_modules_to_hf_modules=trtllm_modules_to_hf_modules,
             hidden_size=self.model.config.hidden_size,
-            dtype=torch_dtype_to_str(self.model.config.torch_dtype))
+            dtype=torch_dtype_to_str(self.model.config.torch_dtype),
+            coarse_mlp_hidden_size=coarse_mlp_hidden_size)
 
     @property
     def use_mrope(self):
diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
index 9a5b42166dc..1a4fb2adba4 100644
--- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py
+++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -1037,6 +1037,13 @@ def __init__(self,
                          world_config=world_config,
                          buffer_manager=buffer_manager)
         self._lora_config = lora_config
+        # if model_engine is not None and hasattr(model_engine, "lora_model_config"):
+        #     self._lora_model_config = model_engine.lora_model_config
+        # else:
+        #     self._lora_model_config = LoraModelConfig(
+        #         lora_config.lora_target_modules,
+        #         lora_config.trtllm_modules_to_hf_modules, model_config.hidden_size,
+        #         binding_to_str_dtype(model_config.data_type))
         self._lora_model_config = LoraModelConfig(
             lora_config.lora_target_modules,
             lora_config.trtllm_modules_to_hf_modules, model_config.hidden_size,
@@ -1052,6 +1059,9 @@ def add_request_peft(self, request: LlmRequest):
             # cached, we can safely remove both from the request.
             request.remove_lora_tensors()
         elif request.lora_weights is None and request.py_lora_path:
+            print(
+                f"DEBUG: INSIDE add_request_peft: request.py_lora_path: {request.py_lora_path}"
+            )
             self._lora_manager.load_from_ckpt(
                 [request.py_lora_path],
                 model_config=self._lora_model_config,
diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py
index 6d5ec9c1d78..0450b6eba82 100644
--- a/tensorrt_llm/executor/worker.py
+++ b/tensorrt_llm/executor/worker.py
@@ -356,11 +356,31 @@ def start(self):
     def _load_lora_adapter(self, lora_request: LoRARequest) -> bool:
         """Returns True if the adapter was loaded by this call, False if it was already loaded"""
         adapter_id = str(lora_request.adapter_id)
+
+        # Create runtime_mapping from executor_config.mapping
+        from tensorrt_llm.mapping import Mapping
+        if hasattr(self._executor_config,
+                   "mapping") and self._executor_config.mapping is not None:
+            mapping = self._executor_config.mapping
+            # Calculate world_size to satisfy the constraint: world_size = tp_size * pp_size * cp_size
+            world_size = mapping.tp_size * mapping.pp_size * mapping.cp_size
+
+            runtime_mapping = Mapping(
+                world_size=world_size,  # Mapping requires world_size == tp_size * pp_size * cp_size
+                tp_size=mapping.tp_size,
+                pp_size=mapping.pp_size,
+                cp_size=mapping.cp_size,
+                rank=mapping.rank,
+                gpus_per_node=mapping.gpus_per_node)
+        else:
+            # Fall back to the default (single-rank) mapping
+            runtime_mapping = Mapping()
+
         newly_loaded_uids = self._lora_manager.load_from_ckpt(
             [lora_request.path],
             model_config=self._runtime_model_config if
             self._runtime_model_config is not None else self._lora_model_config,
-            runtime_mapping=None,
+            runtime_mapping=runtime_mapping,  # pass the reconstructed mapping instead of None
             uids=[adapter_id],
             ckpt_source=lora_request.ckpt_source)
         return adapter_id in newly_loaded_uids
diff --git a/tensorrt_llm/lora_manager.py b/tensorrt_llm/lora_manager.py
index 7440715474c..a5ba4ebc185 100644
--- a/tensorrt_llm/lora_manager.py
+++ b/tensorrt_llm/lora_manager.py
@@ -243,6 +243,7 @@ class LoraModelConfig:
     trtllm_modules_to_hf_modules: dict[str, str]
     hidden_size: int
     dtype: str
+    coarse_mlp_hidden_size: Optional[int] = None
 
 
 class HfLoraLoader:
@@ -1133,14 +1134,107 @@ def load_from_model_dir(uid, model_dir, hf_config):
                 )
             )
 
-        max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
-        self._cpp_lora_weights[uid] = torch.stack(
-            [
-                torch.nn.functional.pad(w, (0, max_weight_size - w.size(0)))
-                for w in self._cpp_lora_weights[uid]
-            ]
+        # Handle both ModelConfig and LoraModelConfig types
+        print(f"DEBUG: model_config type: {type(model_config)}")
+        print(
+            f"DEBUG: model_config has coarse_mlp_hidden_size: {hasattr(model_config, 'coarse_mlp_hidden_size')}"
+        )
+        print(
+            f"DEBUG: model_config.coarse_mlp_hidden_size value: {getattr(model_config, 'coarse_mlp_hidden_size', None)}"
         )
+        print(
+            f"DEBUG: model_config.coarse_mlp_hidden_size is None: {getattr(model_config, 'coarse_mlp_hidden_size', None) is None}"
+        )
+
+        # Compute the padded size from the coarse (TP-split) MLP hidden size when it is available.
+        if (
+            hasattr(model_config, "coarse_mlp_hidden_size")
+            and model_config.coarse_mlp_hidden_size is not None
+        ):
+            print(
+                f"DEBUG: INSIDE load_from_hf: model_config.coarse_mlp_hidden_size: "
+                f"{model_config.coarse_mlp_hidden_size}"
+            )
+            M_coarse = model_config.coarse_mlp_hidden_size
+            H = model_config.hidden_size
+            rank = int(hf_config["r"])
+
+            print(f"DEBUG: load_from_hf - M_coarse: {M_coarse}")
+            print(f"DEBUG: load_from_hf - tp_size: {tp_size}")
+            print(f"DEBUG: load_from_hf - H (hidden_size): {H}")
+            print(f"DEBUG: load_from_hf - rank: {rank}")
+
+            M_coarse_tp = M_coarse * tp_size
+            max_weight_size = rank * M_coarse_tp + rank * H
+
+            print(f"DEBUG: load_from_hf - M_coarse_tp: {M_coarse_tp}")
+            print(
+                f"DEBUG: load_from_hf - max_weight_size calculation: "
+                f"{rank} * {M_coarse_tp} + {rank} * {H} = {max_weight_size}"
+            )
+
+            # Debug actual weights before padding
+            print(
+                f"DEBUG: load_from_hf - Number of weight tensors: {len(self._cpp_lora_weights[uid])}"
+            )
+            for i, w in enumerate(self._cpp_lora_weights[uid]):
+                print(
+                    f"DEBUG: load_from_hf - Weight {i} shape: {w.shape}, size(0): {w.size(0)}"
+                )
+
+            # Debug the actual maximum weight size
+            actual_max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
+            print(f"DEBUG: load_from_hf - Actual max weight size: {actual_max_weight_size}")
+            print(f"DEBUG: load_from_hf - Calculated max_weight_size: {max_weight_size}")
+            print(
+                f"DEBUG: load_from_hf - Difference: {max_weight_size - actual_max_weight_size}"
+            )
+
+            # Debug module-specific sizes
+            print(
+                f"DEBUG: load_from_hf - Number of modules: {len(self._cpp_lora_weights[uid])}"
+            )
+            print("DEBUG: load_from_hf - Module sizes by index:")
+            for i, w in enumerate(self._cpp_lora_weights[uid]):
+                print(f"DEBUG: load_from_hf - Module {i}: {w.size(0)}")
+
+            # Debug which modules need padding
+            print("DEBUG: load_from_hf - Checking which modules might fail validation:")
+            for i, w in enumerate(self._cpp_lora_weights[uid]):
+                if w.size(0) < max_weight_size:
+                    print(
+                        f"DEBUG: load_from_hf - Module {i} will be padded: {w.size(0)} -> {max_weight_size}"
+                    )
+                else:
+                    print(f"DEBUG: load_from_hf - Module {i} no padding needed: {w.size(0)}")
+
+        else:
+            # Fallback: use the maximum size of the actual weights
+            max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
+            print(f"DEBUG: load_from_hf - Using fallback max_weight_size: {max_weight_size}")
+
+        print(f"DEBUG: load_from_hf - Final max_weight_size: {max_weight_size}")
+
+        # Debug padding process
+        padded_weights = []
+        for i, w in enumerate(self._cpp_lora_weights[uid]):
+            padding_needed = max_weight_size - w.size(0)
+            print(
+                f"DEBUG: load_from_hf - Weight {i}: original size {w.size(0)}, padding {padding_needed}"
+            )
+            padded_w = torch.nn.functional.pad(w, (0, padding_needed))
+            print(f"DEBUG: load_from_hf - Weight {i}: padded size {padded_w.size(0)}")
+            padded_weights.append(padded_w)
+
+        self._cpp_lora_weights[uid] = torch.stack(padded_weights)
+        print(
+            f"DEBUG: load_from_hf - Final stacked weights shape: {self._cpp_lora_weights[uid].shape}"
+        )
         self._cpp_lora_config[uid] = torch.stack([c for c in self._cpp_lora_config[uid]])
+        print(
+            f"DEBUG: load_from_hf - Final stacked config shape: {self._cpp_lora_config[uid].shape}"
+        )
 
     for uid, model_dir, hf_config in zip(new_uids, new_model_dirs, lora_hf_configs):
         load_from_model_dir(uid, model_dir, hf_config)
diff --git a/tests/integration/defs/examples/test_nemotron_nas.py b/tests/integration/defs/examples/test_nemotron_nas.py
index d1663eab672..a9470a0d375 100644
--- a/tests/integration/defs/examples/test_nemotron_nas.py
+++ b/tests/integration/defs/examples/test_nemotron_nas.py
@@ -1,10 +1,16 @@
 from pathlib import Path
 
+import defs.ci_profiler
 import pytest
 from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
 from defs.conftest import get_device_memory, get_sm_version
 from defs.trt_test_alternative import check_call
 
+from tensorrt_llm import LLM
+from tensorrt_llm.executor.request import LoRARequest
+from tensorrt_llm.lora_manager import LoraConfig
+from tensorrt_llm.sampling_params import SamplingParams
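
To make the padding arithmetic in the lora_manager.py hunk above easier to follow, here is a small, self-contained sketch. The sizes below (rank, hidden size, TP-split MLP size) are hypothetical placeholders, but the formula mirrors the code: the flattened size of one LoRA module is rank * in_dim for the A matrix plus rank * out_dim for the B matrix, and every module's flattened weights are right-padded with zeros to the largest such size before being stacked.

```python
import torch
import torch.nn.functional as F

# Hypothetical sizes for illustration only.
rank = 32                    # LoRA rank ("r" in adapter_config.json)
hidden_size = 8192           # model hidden size H
mlp_hidden_per_rank = 7168   # TP-split MLP hidden size (the "coarse" value)
tp_size = 4

# Largest module: an MLP projection whose out dim is the full (un-split) MLP hidden size.
mlp_hidden_full = mlp_hidden_per_rank * tp_size
max_weight_size = rank * mlp_hidden_full + rank * hidden_size

# A smaller module, e.g. an attention projection with in_dim == out_dim == hidden_size.
attn_flat = torch.zeros(rank * hidden_size + rank * hidden_size)

# Right-pad with zeros so all rows of the stacked per-request tensor share one length,
# which is what the C++ flattenedInOutSize check compares against lora_weights.shape[2].
padded = F.pad(attn_flat, (0, max_weight_size - attn_flat.numel()))
stacked = torch.stack([padded])  # shape: [num_modules, max_weight_size]
print(max_weight_size, stacked.shape)
```

Padding to the size of the largest module wastes some host memory for the smaller modules, which is the trade-off the coarse (maximum-based) estimate accepts.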
+
 # skip trt flow cases on post-Blackwell-Ultra
 if get_sm_version() >= 103:
     pytest.skip(
@@ -122,3 +128,71 @@ def test_nemotron_nas_summary_2gpu(nemotron_nas_example_root, llm_venv,
     ]
 
     venv_mpi_check_call(llm_venv, mpi_cmd, summary_cmd)
+
+
+@pytest.mark.skip_less_device(4)
+@pytest.mark.skip_less_device_memory(80000)
+@pytest.mark.parametrize("nemotron_nas_model_root", [
+    "Llama-3_3-Nemotron-Super-49B-v1",
+],
+                         indirect=True)
+def test_nemotron_super_49b_real_lora_torch(nemotron_nas_example_root, llm_venv,
+                                            nemotron_nas_model_root,
+                                            llm_datasets_root, llm_rouge_root,
+                                            engine_dir, cmodel_dir):
+    """Run Nemotron Super 49B with real LoRA adapters using the LLM-API Torch backend."""
+
+    print("Testing Nemotron Super 49B with real LoRA adapters...")
+
+    lora_adapter_path = "/code/tensorrt_llm/llama-3.3-nemotron-super-49b-v1/llama-3.3-nemotron-super-49b-v1_vlora-1a2cb80-v2"
+    print(f"Using real LoRA from: {lora_adapter_path}")
+
+    defs.ci_profiler.start("test_nemotron_real_lora_torch")
+
+    lora_config = LoraConfig(
+        lora_dir=[lora_adapter_path],
+        max_lora_rank=32,  # From adapter_config.json: "r": 32
+        max_loras=1,
+        max_cpu_loras=1,
+    )
+
+    with LLM(model=nemotron_nas_model_root,
+             lora_config=lora_config,
+             tensor_parallel_size=4,
+             dtype="bfloat16",
+             max_batch_size=2,
+             max_input_len=512,
+             max_seq_len=1024,
+             max_beam_width=1) as llm:
+
+        prompts = [
+            "What is the capital of France?",
+            "Explain quantum computing in simple terms."
+        ]
+
+        sampling_params = SamplingParams(max_tokens=50,
+                                         temperature=0.7,
+                                         top_p=0.9)
+
+        lora_request = [LoRARequest("nemotron-lora", 0, lora_adapter_path)]
+
+        print("Running inference with real LoRA adapter...")
+        outputs = llm.generate(prompts,
+                               sampling_params,
+                               lora_request=lora_request)
+
+        for i, output in enumerate(outputs):
+            print(f"Prompt {i+1}: {prompts[i]}")
+            print(f"Response {i+1}: {output.outputs[0].text}")
+            print("-" * 50)
+
+        assert len(outputs) == 2
+        assert len(outputs[0].outputs) > 0
+        assert len(outputs[1].outputs) > 0
+        assert len(outputs[0].outputs[0].text) > 0
+        assert len(outputs[1].outputs[0].text) > 0
+
+    defs.ci_profiler.stop("test_nemotron_real_lora_torch")
+    print(
+        f"test_nemotron_real_lora_torch: {defs.ci_profiler.elapsed_time_in_sec('test_nemotron_real_lora_torch')} sec"
+    )
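
A final note on the worker.py change above: `tensorrt_llm.mapping.Mapping` enforces `world_size == tp_size * pp_size * cp_size`, which is why the code derives `world_size` from the executor config's mapping rather than relying on defaults. A minimal sketch of that invariant, using a hypothetical 4-GPU, tensor-parallel-only layout:

```python
from tensorrt_llm.mapping import Mapping

# Hypothetical 4-GPU tensor-parallel-only layout; rank 0 shown.
tp_size, pp_size, cp_size = 4, 1, 1
mapping = Mapping(world_size=tp_size * pp_size * cp_size,
                  rank=0,
                  gpus_per_node=4,
                  tp_size=tp_size,
                  pp_size=pp_size,
                  cp_size=cp_size)

# The constraint the worker.py comment refers to.
assert mapping.world_size == mapping.tp_size * mapping.pp_size * mapping.cp_size
```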