6 changes: 6 additions & 0 deletions cpp/tensorrt_llm/runtime/loraUtils.cpp
@@ -107,6 +107,12 @@ void loraValidateRequestTensors(std::optional<std::uint64_t> const& optTaskId,
TLLM_CHECK_WITH_INFO(it != loraModules.end(), "lora module " + moduleName + " not enabled for this model");
TLLM_CHECK_WITH_INFO(it->flattenedInOutSize(adapterSize, isDora) <= weights->getShape().d[2],
"lora_weights has to few values for " + moduleName);

auto expectedSize = it->flattenedInOutSize(adapterSize, isDora);
auto actualSize = weights->getShape().d[2];
TLLM_LOG_DEBUG("LoRA validation for %s - Expected: %d, Actual: %d, AdapterSize: %d, IsDora: %d",
moduleName.c_str(), expectedSize, actualSize, adapterSize, isDora);

Comment on lines 108 to +115
🛠️ Refactor suggestion

Logging after the check makes debugging impossible on failure; move the log before the check and reuse the computed sizes. Also fix the typo "to few" → "too few".

If the size check fails, the debug line is never emitted. Compute expected/actual once, log, then assert using the computed values. This also fixes the grammar of the user-facing message.

Apply this diff:

-            TLLM_CHECK_WITH_INFO(it->flattenedInOutSize(adapterSize, isDora) <= weights->getShape().d[2],
-                "lora_weights has to few values for " + moduleName);
-
-            auto expectedSize = it->flattenedInOutSize(adapterSize, isDora);
-            auto actualSize = weights->getShape().d[2];
-            TLLM_LOG_DEBUG("LoRA validation for %s - Expected: %d, Actual: %d, AdapterSize: %d, IsDora: %d",
-                moduleName.c_str(), expectedSize, actualSize, adapterSize, isDora);
+            auto const expectedSize = it->flattenedInOutSize(adapterSize, isDora);
+            auto const actualSize = weights->getShape().d[2];
+            TLLM_LOG_DEBUG(
+                "LoRA validation for %s - Expected: %d, Actual: %d, AdapterSize: %d, IsDora: %d",
+                moduleName.c_str(), expectedSize, actualSize, adapterSize, static_cast<int>(isDora));
+            TLLM_CHECK_WITH_INFO(expectedSize <= actualSize,
+                "lora_weights has too few values for " + moduleName);
🤖 Prompt for AI Agents
In cpp/tensorrt_llm/runtime/loraUtils.cpp around lines 108 to 115, the debug log
is placed after the size check so it won't run on failure and the check message
contains a typo; compute expectedSize and actualSize once, move the
TLLM_LOG_DEBUG call to before TLLM_CHECK_WITH_INFO and pass the computed
expectedSize and actualSize (and adapterSize/isDora) to it, and change the
TLLM_CHECK_WITH_INFO message text from "to few" to "too few" while using the
already computed expectedSize/actualSize values in the check.

TLLM_CHECK_WITH_INFO(adapterSize <= maxAdapterSize,
"Invalid low_rank (" + std::to_string(adapterSize) + "). low_rank must be smaller than mMaxLowRank ("
+ std::to_string(maxAdapterSize) + ")");
63 changes: 59 additions & 4 deletions tensorrt_llm/_torch/model_config.py
@@ -441,21 +441,35 @@ def get_bindings_model_config(self,
model_config_cpp.set_num_kv_heads(num_kv_heads)

mlp_hidden_size = None
print(
f"DEBUG: Before if self.pretrained_config.intermediate_size is not None:"
)
if self.pretrained_config.intermediate_size is not None:
print(f"DEBUG: Intermediate size is not None")
mlp_hidden_size = self.pretrained_config.intermediate_size // self.mapping.tp_size
else:
print(f"DEBUG: Intermediate size is None")
# TODO: once tensorrt_llm._torch.AutoConfig is implemented, the following logic
# should be moved to tensorrt_llm._torch.AutoConfig of the relevant modeling_xxx file
if hasattr(self.pretrained_config, "architectures"
) and self.pretrained_config.architectures is not None:
architectures = self.pretrained_config.architectures
if len(architectures
) == 1 and architectures[0] == "DeciLMForCausalLM":
print(
f"DEBUG: Calling _infer_nemotron_ffn_mult for Nemotron model"
)
mlp_hidden_size = self._infer_nemotron_ffn_mult()
print(f"DEBUG: Final mlp_hidden_size: {mlp_hidden_size}")
print(f"DEBUG: TP size: {self.mapping.tp_size}")
print(
f"DEBUG: Expected mlp_hidden_size after TP: {mlp_hidden_size // self.mapping.tp_size}"
)
else:
raise ValueError(
f"Inferring mlp hidden size for model architecture: {architectures} isn't supported yet"
)
print(f"DEBUG: AFTER if mlp_hidden_size is None:")
if mlp_hidden_size is None:
raise ValueError(
f"Failed to infer mlp hidden size for model: {self.pretrained_config.model_type}"
@@ -474,6 +488,7 @@ def get_bindings_model_config(self,
head_size = hidden_size // num_heads

model_config_cpp.mlp_hidden_size = mlp_hidden_size
# model_config_cpp.coarse_mlp_hidden_size = self.coarse_mlp_hidden_size
model_config_cpp.size_per_head = head_size

# NOTE: this method is not robust, for Gemma3ForCausalLM only
@@ -488,18 +503,58 @@ def _infer_nemotron_ffn_mult(self):
# Nemotron-NAS has variable ffn_mult for each layer, we need to find the maximum
# so that we don't set a too small mlp_hidden_size. This solution leads to a memory
# consumption that is higher than required.
biggest_ffn_mult = max([
(x.ffn.ffn_mult if x.ffn.ffn_mult is not None else 0)
for x in self.pretrained_config.block_configs
])

print(
f"DEBUG: _infer_nemotron_ffn_mult - TP size: {self.mapping.tp_size}"
)
print(
f"DEBUG: _infer_nemotron_ffn_mult - Number of block_configs: {len(self.pretrained_config.block_configs)}"
)

ffn_mults = [(x.ffn.ffn_mult if x.ffn.ffn_mult is not None else 0)
for x in self.pretrained_config.block_configs]
print(f"DEBUG: _infer_nemotron_ffn_mult - All ffn_mults: {ffn_mults}")

biggest_ffn_mult = max(ffn_mults)
print(
f"DEBUG: _infer_nemotron_ffn_mult - Biggest ffn_mult: {biggest_ffn_mult}"
)

from tensorrt_llm._torch.models.modeling_nemotron_nas import \
_ffn_mult_to_intermediate_size
mlp_hidden_size = _ffn_mult_to_intermediate_size(
biggest_ffn_mult, self.pretrained_config.hidden_size)

print(
f"DEBUG: _infer_nemotron_ffn_mult - Calculated mlp_hidden_size: {mlp_hidden_size}"
)
print(
f"DEBUG: _infer_nemotron_ffn_mult - Hidden size: {self.pretrained_config.hidden_size}"
)

print(
f"DEBUG: _infer_nemotron_ffn_mult - Final TP-split mlp_hidden_size: {mlp_hidden_size}"
)
return mlp_hidden_size

@property
def coarse_mlp_hidden_size(self):
"""Get the MLP hidden size (TP-split) for LoRA padding calculations."""
if self.pretrained_config.intermediate_size is not None:
return self.pretrained_config.intermediate_size // self.mapping.tp_size
else:
# For Nemotron models, use the same logic as _infer_nemotron_ffn_mult
if (hasattr(self.pretrained_config, "architectures")
and self.pretrained_config.architectures is not None
and len(self.pretrained_config.architectures) == 1
and self.pretrained_config.architectures[0]
== "DeciLMForCausalLM"):
return self._infer_nemotron_ffn_mult()
else:
raise ValueError(
f"Failed to infer mlp hidden size for model: {self.pretrained_config.model_type}"
)

def get_layer_types(self) -> Optional[List[LayerTypeCpp]]:
"""
This method is a hack to support the effort to switch to KvCacheManagerCpp.
1 change: 1 addition & 0 deletions tensorrt_llm/_torch/pyexecutor/_util.py
@@ -467,6 +467,7 @@ def create_py_executor_instance(
# all layers have the same number of KV heads
num_kv_attention_heads = num_kv_attention_heads_per_layer[0]

# THEN UPDATE THE LoraModule.create_lora_modules CALL:
lora_modules = LoraModule.create_lora_modules(
lora_module_names=lora_config.lora_target_modules,
hidden_size=model_binding_config.hidden_size,
31 changes: 30 additions & 1 deletion tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -461,11 +461,40 @@ def runtime_draft_len(self):

def set_lora_model_config(self, lora_target_modules: list[str],
trtllm_modules_to_hf_modules: dict[str, str]):
coarse_mlp_hidden_size = None

# Debug: Check what type self.model.model_config is
print(
f"DEBUG: model_engine.py - self.model.model_config type: {type(self.model.model_config)}"
)
print(
f"DEBUG: model_engine.py - self.model.model_config dir: {dir(self.model.model_config)}"
)

if hasattr(self.model.model_config, 'coarse_mlp_hidden_size'):
coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
print(
f"DEBUG: model_engine.py - coarse_mlp_hidden_size: {coarse_mlp_hidden_size}"
)
else:
print(
f"DEBUG: model_engine.py - coarse_mlp_hidden_size property not found"
)
# Try direct access to see if it works
try:
coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
print(
f"DEBUG: model_engine.py - Direct access worked: {coarse_mlp_hidden_size}"
)
except AttributeError as e:
print(f"DEBUG: model_engine.py - Direct access failed: {e}")

Comment on lines +464 to +491
🛠️ Refactor suggestion

⚠️ Potential issue

Avoid hasattr on properties that may raise; replace prints with robust getattr + logger.debug.

coarse_mlp_hidden_size is a property that can raise ValueError for unsupported architectures. hasattr will evaluate the property and won’t catch ValueError, causing unexpected failures. Use getattr in a try/except, and use logger.debug instead of print.

Apply this diff:

-        coarse_mlp_hidden_size = None
-
-        # Debug: Check what type self.model.model_config is
-        print(
-            f"DEBUG: model_engine.py - self.model.model_config type: {type(self.model.model_config)}"
-        )
-        print(
-            f"DEBUG: model_engine.py - self.model.model_config dir: {dir(self.model.model_config)}"
-        )
-
-        if hasattr(self.model.model_config, 'coarse_mlp_hidden_size'):
-            coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
-            print(
-                f"DEBUG: model_engine.py - coarse_mlp_hidden_size: {coarse_mlp_hidden_size}"
-            )
-        else:
-            print(
-                f"DEBUG: model_engine.py - coarse_mlp_hidden_size property not found"
-            )
-            # Try direct access to see if it works
-            try:
-                coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
-                print(
-                    f"DEBUG: model_engine.py - Direct access worked: {coarse_mlp_hidden_size}"
-                )
-            except AttributeError as e:
-                print(f"DEBUG: model_engine.py - Direct access failed: {e}")
+        coarse_mlp_hidden_size = None
+        try:
+            coarse_mlp_hidden_size = getattr(self.model.model_config, 'coarse_mlp_hidden_size', None)
+            logger.debug("model_engine | coarse_mlp_hidden_size=%s", str(coarse_mlp_hidden_size))
+        except Exception as e:
+            # Property exists but inference failed; proceed without it.
+            logger.debug("model_engine | coarse_mlp_hidden_size unavailable: %s", e)
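
To illustrate the hasattr pitfall outside the PR's code, here is a minimal standalone sketch (the Config class below is hypothetical, not part of TensorRT-LLM): hasattr evaluates the property and only converts AttributeError into False, so any other exception escapes.

class Config:
    @property
    def coarse_mlp_hidden_size(self):
        # Stand-in for the real property, which raises for unsupported architectures.
        raise ValueError("unsupported architecture")

cfg = Config()

try:
    hasattr(cfg, "coarse_mlp_hidden_size")  # does not return False; the ValueError propagates
except ValueError as exc:
    print("hasattr raised:", exc)

# getattr with a default behaves the same way for non-AttributeError exceptions,
# which is why the suggestion wraps the access in try/except.
try:
    value = cfg.coarse_mlp_hidden_size
except (AttributeError, ValueError):
    value = None
print("value:", value)  # value: None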
🤖 Prompt for AI Agents
In tensorrt_llm/_torch/pyexecutor/model_engine.py around lines 464 to 491,
replace the current hasattr/print debug block with a single try/except that
reads coarse_mlp_hidden_size once via getattr (or direct attribute access) and
catches both ValueError and AttributeError so property access that raises is
handled; on success assign the value to coarse_mlp_hidden_size and call
logger.debug with the value, on exception set coarse_mlp_hidden_size = None and
log the failure with logger.debug (do not use print), ensuring you do not
evaluate the property twice and you handle both kinds of errors.

self.lora_model_config = LoraModelConfig(
lora_target_modules=lora_target_modules,
trtllm_modules_to_hf_modules=trtllm_modules_to_hf_modules,
hidden_size=self.model.config.hidden_size,
dtype=torch_dtype_to_str(self.model.config.torch_dtype))
dtype=torch_dtype_to_str(self.model.config.torch_dtype),
coarse_mlp_hidden_size=coarse_mlp_hidden_size)

@property
def use_mrope(self):
10 changes: 10 additions & 0 deletions tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -1037,6 +1037,13 @@ def __init__(self,
world_config=world_config,
buffer_manager=buffer_manager)
self._lora_config = lora_config
# if model_engine is not None and hasattr(model_engine, "lora_model_config"):
# self._lora_model_config = model_engine.lora_model_config
# else:
# self._lora_model_config = LoraModelConfig(
# lora_config.lora_target_modules,
# lora_config.trtllm_modules_to_hf_modules, model_config.hidden_size,
# binding_to_str_dtype(model_config.data_type))
self._lora_model_config = LoraModelConfig(
lora_config.lora_target_modules,
lora_config.trtllm_modules_to_hf_modules, model_config.hidden_size,
@@ -1052,6 +1059,9 @@ def add_request_peft(self, request: LlmRequest):
# cached, we can safely remove both from the request.
request.remove_lora_tensors()
elif request.lora_weights is None and request.py_lora_path:
print(
f"DEBUG: INSIDE add_request_peft: request.py_lora_path: {request.py_lora_path}"
)
self._lora_manager.load_from_ckpt(
[request.py_lora_path],
model_config=self._lora_model_config,
22 changes: 21 additions & 1 deletion tensorrt_llm/executor/worker.py
@@ -356,11 +356,31 @@ def start(self):
def _load_lora_adapter(self, lora_request: LoRARequest) -> bool:
"""Returns True if the adapter was loaded by this call, False if it was already loaded"""
adapter_id = str(lora_request.adapter_id)

# Create runtime_mapping from executor_config.mapping
from tensorrt_llm.mapping import Mapping
if hasattr(self._executor_config,
"mapping") and self._executor_config.mapping is not None:
mapping = self._executor_config.mapping
# Calculate world_size to satisfy the constraint: world_size = tp_size * pp_size * cp_size
world_size = mapping.tp_size * mapping.pp_size * mapping.cp_size

runtime_mapping = Mapping(
world_size=world_size, # ← Add world_size
tp_size=mapping.tp_size,
pp_size=mapping.pp_size,
cp_size=mapping.cp_size,
rank=mapping.rank,
gpus_per_node=mapping.gpus_per_node)
else:
# Fallback to default mapping
runtime_mapping = Mapping()
Comment on lines +362 to +377
🛠️ Refactor suggestion

⚠️ Potential issue

World size calculation likely incorrect and brittle; cp_size may be None, dp_size ignored, and several attrs may be absent.

  • Using world_size = tp_size * pp_size * cp_size will fail if cp_size is None (it is optional elsewhere in this file) and likely undercounts when dp_size exists. Many runtimes define world_size = tp * pp * dp and treat CP orthogonally; include CP only if it's part of the global process group.
  • Accessing mapping.rank and mapping.gpus_per_node without fallbacks can raise if those fields are missing.
  • Falling back to Mapping() with no args risks a runtime break if the constructor now requires world_size (or other) or if its defaults differ from the LoRA manager’s expectations.

Refactor to:

  • Prefer exec_map.world_size if present; otherwise compute with safe defaults, including dp_size.
  • Treat cp_size as 1 unless you can prove it participates in the global PG (introduce a boolean gate if available).
  • Provide robust fallbacks for rank and gpus_per_node.
  • When no mapping is available, pass None so the downstream loader can apply its own defaults.

Suggested patch:

-        from tensorrt_llm.mapping import Mapping
-        if hasattr(self._executor_config,
-                   "mapping") and self._executor_config.mapping is not None:
-            mapping = self._executor_config.mapping
-            # Calculate world_size to satisfy the constraint: world_size = tp_size * pp_size * cp_size
-            world_size = mapping.tp_size * mapping.pp_size * mapping.cp_size
-
-            runtime_mapping = Mapping(
-                world_size=world_size,  # ← Add world_size
-                tp_size=mapping.tp_size,
-                pp_size=mapping.pp_size,
-                cp_size=mapping.cp_size,
-                rank=mapping.rank,
-                gpus_per_node=mapping.gpus_per_node)
-        else:
-            # Fallback to default mapping
-            runtime_mapping = Mapping()
+        from tensorrt_llm.mapping import Mapping as RuntimeMapping
+        exec_map = getattr(self._executor_config, "mapping", None)
+        if exec_map is not None:
+            # Prefer explicit world_size if available on the mapping.
+            world_size = getattr(exec_map, "world_size", None)
+            if world_size is None:
+                tp = getattr(exec_map, "tp_size", 1) or 1
+                pp = getattr(exec_map, "pp_size", 1) or 1
+                dp = getattr(exec_map, "dp_size", 1) or 1
+                cp = getattr(exec_map, "cp_size", 1) or 1
+                # Include CP in world_size only if it participates in the global PG.
+                include_cp = getattr(exec_map, "cp_participates_in_global_pg", False)
+                world_size = tp * pp * dp * (cp if include_cp else 1)
+
+            runtime_mapping = RuntimeMapping(
+                world_size=world_size,
+                tp_size=getattr(exec_map, "tp_size", 1),
+                pp_size=getattr(exec_map, "pp_size", 1),
+                cp_size=getattr(exec_map, "cp_size", 1),
+                rank=getattr(exec_map, "rank", self.global_rank),
+                gpus_per_node=getattr(exec_map, "gpus_per_node", torch.cuda.device_count()),
+            )
+        else:
+            # Let downstream choose sane defaults when no executor mapping is provided.
+            runtime_mapping = None

Committable suggestion skipped: line range outside the PR's diff.
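
As a purely illustrative check of the world-size arithmetic discussed above (the parallelism sizes below are hypothetical, not taken from this PR):

# Hypothetical parallelism sizes: TP=4, PP=2, DP=2, CP=1.
tp, pp, dp, cp = 4, 2, 2, 1
include_cp = False  # count CP only if it participates in the global process group

world_size = tp * pp * dp * (cp if include_cp else 1)
print(world_size)       # 16
print(tp * pp * cp)     # 8, the original formula silently drops the DP factor

# And if cp were None, the original expression would not even evaluate:
# tp * pp * None -> TypeError: unsupported operand type(s) for *: 'int' and 'NoneType'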


newly_loaded_uids = self._lora_manager.load_from_ckpt(
[lora_request.path],
model_config=self._runtime_model_config if
self._runtime_model_config is not None else self._lora_model_config,
runtime_mapping=None,
runtime_mapping=runtime_mapping, # ← Pass the correct runtime_mapping
uids=[adapter_id],
ckpt_source=lora_request.ckpt_source)
return adapter_id in newly_loaded_uids
106 changes: 100 additions & 6 deletions tensorrt_llm/lora_manager.py
@@ -243,6 +243,7 @@ class LoraModelConfig:
trtllm_modules_to_hf_modules: dict[str, str]
hidden_size: int
dtype: str
coarse_mlp_hidden_size: Optional[int] = None


class HfLoraLoader:
@@ -1133,14 +1134,107 @@ def load_from_model_dir(uid, model_dir, hf_config):
)
)

max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
self._cpp_lora_weights[uid] = torch.stack(
[
torch.nn.functional.pad(w, (0, max_weight_size - w.size(0)))
for w in self._cpp_lora_weights[uid]
]
# Handle both ModelConfig and LoraModelConfig types
print(f"DEBUG: model_config type: {type(model_config)}")
print(
f"DEBUG: model_config has coarse_mlp_hidden_size: {hasattr(model_config, 'coarse_mlp_hidden_size')}"
)
print(
f"DEBUG: model_config.coarse_mlp_hidden_size value: {model_config.coarse_mlp_hidden_size}"
)
print(
f"DEBUG: model_config.coarse_mlp_hidden_size is None: {model_config.coarse_mlp_hidden_size is None}"
)

Comment on lines +1137 to +1148
💡 Verification agent

🧩 Analysis chain

Replace print-based debugging with logger.debug and trim verbosity.

Using print in library code is noisy and hard to control. Prefer logger.debug (already configured as logger = logging.getLogger(__name__)). Collapse multiple prints into a single succinct line.

Apply this diff:

-            print(f"DEBUG: model_config type: {type(model_config)}")
-            print(
-                f"DEBUG: model_config has coarse_mlp_hidden_size: {hasattr(model_config, 'coarse_mlp_hidden_size')}"
-            )
-            print(
-                f"DEBUG: model_config.coarse_mlp_hidden_size value: {model_config.coarse_mlp_hidden_size}"
-            )
-            print(
-                f"DEBUG: model_config.coarse_mlp_hidden_size is None: {model_config.coarse_mlp_hidden_size is None}"
-            )
+            logger.debug(
+                "LoRA padding | model_config=%s | has_coarse=%s | coarse=%s",
+                type(model_config),
+                hasattr(model_config, "coarse_mlp_hidden_size"),
+                getattr(model_config, "coarse_mlp_hidden_size", None),
+            )

Run to confirm no stray prints remain in lora_manager/model_engine:


🏁 Script executed:

#!/bin/bash
rg -nP '^\s*print\(' tensorrt_llm/lora_manager.py tensorrt_llm/_torch/pyexecutor/model_engine.py

Length of output: 3027


Mandate: Replace all print-based debugging with logger.debug across Lora Manager and Model Engine

Your grep shows numerous stray print(...) calls—both in tensorrt_llm/lora_manager.py and in tensorrt_llm/_torch/pyexecutor/model_engine.py. These should all be converted to controlled, leveled logging rather than uncontrolled stdout.

• Files and ranges needing refactor:

  • tensorrt_llm/lora_manager.py: lines ~1138–1154, 1162–1235
  • tensorrt_llm/_torch/pyexecutor/model_engine.py: lines ~467–490

• Primary change (example for LoraManager snippet at 1137–1148):

-            print(f"DEBUG: model_config type: {type(model_config)}")
-            print(
-                f"DEBUG: model_config has coarse_mlp_hidden_size: {hasattr(model_config, 'coarse_mlp_hidden_size')}"
-            )
-            print(
-                f"DEBUG: model_config.coarse_mlp_hidden_size value: {model_config.coarse_mlp_hidden_size}"
-            )
-            print(
-                f"DEBUG: model_config.coarse_mlp_hidden_size is None: {model_config.coarse_mlp_hidden_size is None}"
-            )
+            logger.debug(
+                "LoRA padding | model_config=%s | has_coarse=%s | coarse=%s",
+                type(model_config),
+                hasattr(model_config, "coarse_mlp_hidden_size"),
+                getattr(model_config, "coarse_mlp_hidden_size", None),
+            )

Apply the same pattern to all other debug prints:

  • Consolidate related prints into single logger.debug calls with structured arguments.
  • Ensure you import and use the module logger (logger = logging.getLogger(__name__)).
  • Remove any remaining print(...) debug statements.
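
A minimal sketch of the module-logger pattern these bullets assume (standard-library logging; the helper name below is illustrative):

import logging

logger = logging.getLogger(__name__)  # module-level logger, defined once near the imports


def _log_lora_padding(model_config):
    # One consolidated, leveled debug line instead of several print() calls.
    logger.debug(
        "LoRA padding | model_config=%s | coarse=%s",
        type(model_config).__name__,
        getattr(model_config, "coarse_mlp_hidden_size", None),
    )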
🤖 Prompt for AI Agents
In tensorrt_llm/lora_manager.py around lines 1137 to 1148 (and extend the same
change across the other indicated ranges ~1162–1235) replace the ad-hoc
print(...) debug statements with structured logger.debug calls: add or ensure a
module-level logger (logger = logging.getLogger(__name__), importing logging at the top if
missing), consolidate related prints into single logger.debug invocations with
clear messages and variables (e.g., logger.debug("model_config type=%s
coarse_mlp_hidden_size=%s is_none=%s", type(model_config), getattr(model_config,
"coarse_mlp_hidden_size", None), getattr(model_config, "coarse_mlp_hidden_size",
None) is None)), and remove all remaining print(...) calls; apply the identical
replacement pattern to tensorrt_llm/_torch/pyexecutor/model_engine.py lines
~467–490 so no print-based debugging remains.

# Handle both ModelConfig and LoraModelConfig types
if (
hasattr(model_config, "coarse_mlp_hidden_size")
and model_config.coarse_mlp_hidden_size is not None
):
print(
f"DEBUG: INSIDE load_from_hf: model_config.coarse_mlp_hidden_size: "
f"{model_config.coarse_mlp_hidden_size}"
)
M_coarse = model_config.coarse_mlp_hidden_size
H = model_config.hidden_size
rank = int(hf_config["r"])

print(f"DEBUG: load_from_hf - M_coarse: {M_coarse}")
print(f"DEBUG: load_from_hf - tp_size: {tp_size}")
print(f"DEBUG: load_from_hf - H (hidden_size): {H}")
print(f"DEBUG: load_from_hf - rank: {rank}")

M_coarse_tp = M_coarse * tp_size
max_weight_size = rank * M_coarse_tp + rank * H

Comment on lines +1167 to +1169
⚠️ Potential issue

TP double-counting in max_weight_size; coarse_mlp_hidden_size is already TP-split.

coarse_mlp_hidden_size returns the TP-split M value. Multiplying by tp_size again inflates the padding by a factor of tp_size, wasting memory significantly (especially on the 49B model). Compute with the TP-split value directly.

Apply this diff:

-                M_coarse_tp = M_coarse * tp_size
-                max_weight_size = rank * M_coarse_tp + rank * H
+                # coarse_mlp_hidden_size is already TP-split; do not multiply by tp_size again.
+                M_coarse_tp = M_coarse
+                # For MLP up/gate or down, flattened size per module is rank*(M_coarse_tp + H).
+                # This upper-bounds attention modules too since M_coarse_tp >= H in typical configs.
+                calc_max_weight_size = rank * (M_coarse_tp + H)

Committable suggestion skipped: line range outside the PR's diff.
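
As a purely illustrative calculation of the inflation (hypothetical sizes, not taken from this PR or any specific model):

# Hypothetical sizes only.
rank, H, tp_size = 32, 8192, 4
M_full = 28672                 # un-split MLP hidden size
M_coarse = M_full // tp_size   # coarse_mlp_hidden_size is already TP-split: 7168

needed = rank * (M_coarse + H)                # what the padding must cover: 491520
inflated = rank * (M_coarse * tp_size + H)    # what the current code computes: 1179648
print(inflated - needed)                      # 688128 == rank * M_coarse * (tp_size - 1)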

🤖 Prompt for AI Agents
In tensorrt_llm/lora_manager.py around lines 1167 to 1169, the code multiplies
the TP-split coarse_mlp_hidden_size (M_coarse) by tp_size (M_coarse_tp) and then
uses that to compute max_weight_size, which double-counts tensor-parallel
splitting and inflates padding; remove the extra multiplication and compute
max_weight_size using the TP-split value directly (drop M_coarse_tp and use
M_coarse in the expression so max_weight_size = rank * M_coarse + rank * H).

print(f"DEBUG: load_from_hf - M_coarse_tp: {M_coarse_tp}")
print(
f"DEBUG: load_from_hf - max_weight_size calculation: "
f"{rank} * {M_coarse_tp} + {rank} * {H} = {max_weight_size}"
)

# Debug actual weights before padding
print(
f"DEBUG: load_from_hf - Number of weight tensors: {len(self._cpp_lora_weights[uid])}"
)
for i, w in enumerate(self._cpp_lora_weights[uid]):
print(
f"DEBUG: load_from_hf - Weight {i} shape: {w.shape}, size(0): {w.size(0)}"
)

# Debug the actual maximum weight size
actual_max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
print(f"DEBUG: load_from_hf - Actual max weight size: {actual_max_weight_size}")
print(f"DEBUG: load_from_hf - Calculated max_weight_size: {max_weight_size}")
print(
f"DEBUG: load_from_hf - Difference: {max_weight_size - actual_max_weight_size}"
)
Comment on lines +1186 to +1191
⚠️ Potential issue

Guard against negative padding; ensure computed max covers actual max.

If calc_max_weight_size underestimates (e.g., DoRA adds a magnitude vector), padding_needed becomes negative and F.pad will error. Use the actual maximum size as a floor.

Apply this diff:

-                actual_max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
-                print(f"DEBUG: load_from_hf - Actual max weight size: {actual_max_weight_size}")
-                print(f"DEBUG: load_from_hf - Calculated max_weight_size: {max_weight_size}")
-                print(
-                    f"DEBUG: load_from_hf - Difference: {max_weight_size - actual_max_weight_size}"
-                )
+                actual_max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
+                # Account for possible DoRA magnitude vector (+M_coarse_tp) by taking the max with actual.
+                max_weight_size = max(calc_max_weight_size, actual_max_weight_size)
+                logger.debug(
+                    "LoRA padding | calc=%d actual=%d final=%d",
+                    calc_max_weight_size, actual_max_weight_size, max_weight_size
+                )
🤖 Prompt for AI Agents
In tensorrt_llm/lora_manager.py around lines 1186 to 1191, the computed
max_weight_size can be less than the actual_max_weight_size causing negative
padding_needed and F.pad to fail; before padding, ensure max_weight_size is
floored to the observed actual maximum (e.g., max_weight_size =
max(max_weight_size, actual_max_weight_size)) and/or compute padding_needed =
max(0, max_weight_size - w.size(0)) so padding_needed is never negative.


# Debug module-specific sizes
print(
f"DEBUG: load_from_hf - Number of modules: {len(self._cpp_lora_weights[uid])}"
)
print("DEBUG: load_from_hf - Module sizes by index:")
for i, w in enumerate(self._cpp_lora_weights[uid]):
print(f"DEBUG: load_from_hf - Module {i}: {w.size(0)}")

# Debug which modules are failing
print("DEBUG: load_from_hf - Checking which modules might fail validation:")
for i, w in enumerate(self._cpp_lora_weights[uid]):
if w.size(0) < max_weight_size:
print(
f"DEBUG: load_from_hf - Module {i} will be padded: {w.size(0)} -> {max_weight_size}"
)
else:
print(f"DEBUG: load_from_hf - Module {i} no padding needed: {w.size(0)}")

else:
# Final fallback: use the maximum size of actual weights
max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
print(f"DEBUG: load_from_hf - Using fallback max_weight_size: {max_weight_size}")

print(f"DEBUG: load_from_hf - Final max_weight_size: {max_weight_size}")

# Debug padding process
padded_weights = []
for i, w in enumerate(self._cpp_lora_weights[uid]):
padding_needed = max_weight_size - w.size(0)
print(
f"DEBUG: load_from_hf - Weight {i}: original size {w.size(0)}, padding {padding_needed}"
)
padded_w = torch.nn.functional.pad(w, (0, padding_needed))
print(f"DEBUG: load_from_hf - Weight {i}: padded size {padded_w.size(0)}")
padded_weights.append(padded_w)

Comment on lines +1219 to +1228
🛠️ Refactor suggestion

Safer padding: clamp to zero and avoid per-weight print loops.

Even with the fix above, clamping is a safeguard that avoids crashes on rare edge cases. Replace the prints with a single debug log if needed.

Apply this diff:

-            padded_weights = []
-            for i, w in enumerate(self._cpp_lora_weights[uid]):
-                padding_needed = max_weight_size - w.size(0)
-                print(
-                    f"DEBUG: load_from_hf - Weight {i}: original size {w.size(0)}, padding {padding_needed}"
-                )
-                padded_w = torch.nn.functional.pad(w, (0, padding_needed))
-                print(f"DEBUG: load_from_hf - Weight {i}: padded size {padded_w.size(0)}")
-                padded_weights.append(padded_w)
+            padded_weights = []
+            for i, w in enumerate(self._cpp_lora_weights[uid]):
+                padding_needed = max_weight_size - w.size(0)
+                if padding_needed < 0:
+                    # Should not happen with the guard above; fall back defensively.
+                    logger.warning("LoRA padding | negative padding=%d at idx=%d; widening to actual size", padding_needed, i)
+                    padding_needed = 0
+                padded_weights.append(torch.nn.functional.pad(w, (0, padding_needed)))
🤖 Prompt for AI Agents
In tensorrt_llm/lora_manager.py around lines 1219 to 1228, the per-weight debug
prints and unguarded padding can cause noisy logs and crashes when
padding_needed is negative; change padding_needed to be clamped to zero (e.g.
use max(padding_needed, 0) or torch.clamp_min) before calling pad, remove the
per-weight print statements inside the loop, and instead emit a single debug
summarizing counts/sizes after the loop (or conditionally log once when any
padding occurred). Ensure padded_weights.append is still executed and behavior
unchanged when padding_needed is zero.

self._cpp_lora_weights[uid] = torch.stack(padded_weights)
print(
f"DEBUG: load_from_hf - Final stacked weights shape: {self._cpp_lora_weights[uid].shape}"
)

self._cpp_lora_config[uid] = torch.stack([c for c in self._cpp_lora_config[uid]])
print(
f"DEBUG: load_from_hf - Final stacked config shape: {self._cpp_lora_config[uid].shape}"
)

for uid, model_dir, hf_config in zip(new_uids, new_model_dirs, lora_hf_configs):
load_from_model_dir(uid, model_dir, hf_config)