6 changes: 6 additions & 0 deletions cpp/tensorrt_llm/runtime/loraUtils.cpp
@@ -107,6 +107,12 @@ void loraValidateRequestTensors(std::optional<std::uint64_t> const& optTaskId,
TLLM_CHECK_WITH_INFO(it != loraModules.end(), "lora module " + moduleName + " not enabled for this model");
TLLM_CHECK_WITH_INFO(it->flattenedInOutSize(adapterSize, isDora) <= weights->getShape().d[2],
"lora_weights has to few values for " + moduleName);

auto expectedSize = it->flattenedInOutSize(adapterSize, isDora);
auto actualSize = weights->getShape().d[2];
TLLM_LOG_DEBUG("LoRA validation for %s - Expected: %d, Actual: %d, AdapterSize: %d, IsDora: %d",
moduleName.c_str(), expectedSize, actualSize, adapterSize, isDora);

Comment on lines 108 to +115

🛠️ Refactor suggestion

Logging after the check makes debugging impossible on failure; move the log before the check and reuse the computed sizes. Also fix the typo "to few" → "too few".

If the size check fails, the debug line never emits. Compute expected/actual once, log, then assert using the computed values. This also fixes the user-facing message grammar.

Apply this diff:

-            TLLM_CHECK_WITH_INFO(it->flattenedInOutSize(adapterSize, isDora) <= weights->getShape().d[2],
-                "lora_weights has to few values for " + moduleName);
-
-            auto expectedSize = it->flattenedInOutSize(adapterSize, isDora);
-            auto actualSize = weights->getShape().d[2];
-            TLLM_LOG_DEBUG("LoRA validation for %s - Expected: %d, Actual: %d, AdapterSize: %d, IsDora: %d",
-                moduleName.c_str(), expectedSize, actualSize, adapterSize, isDora);
+            auto const expectedSize = it->flattenedInOutSize(adapterSize, isDora);
+            auto const actualSize = weights->getShape().d[2];
+            TLLM_LOG_DEBUG(
+                "LoRA validation for %s - Expected: %d, Actual: %d, AdapterSize: %d, IsDora: %d",
+                moduleName.c_str(), expectedSize, actualSize, adapterSize, static_cast<int>(isDora));
+            TLLM_CHECK_WITH_INFO(expectedSize <= actualSize,
+                "lora_weights has too few values for " + moduleName);
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
TLLM_CHECK_WITH_INFO(it->flattenedInOutSize(adapterSize, isDora) <= weights->getShape().d[2],
"lora_weights has to few values for " + moduleName);
auto expectedSize = it->flattenedInOutSize(adapterSize, isDora);
auto actualSize = weights->getShape().d[2];
TLLM_LOG_DEBUG("LoRA validation for %s - Expected: %d, Actual: %d, AdapterSize: %d, IsDora: %d",
moduleName.c_str(), expectedSize, actualSize, adapterSize, isDora);
auto const expectedSize = it->flattenedInOutSize(adapterSize, isDora);
auto const actualSize = weights->getShape().d[2];
TLLM_LOG_DEBUG(
"LoRA validation for %s - Expected: %d, Actual: %d, AdapterSize: %d, IsDora: %d",
moduleName.c_str(), expectedSize, actualSize, adapterSize, static_cast<int>(isDora));
TLLM_CHECK_WITH_INFO(expectedSize <= actualSize,
"lora_weights has too few values for " + moduleName);
🤖 Prompt for AI Agents
In cpp/tensorrt_llm/runtime/loraUtils.cpp around lines 108 to 115, the debug log
is placed after the size check so it won't run on failure and the check message
contains a typo; compute expectedSize and actualSize once, move the
TLLM_LOG_DEBUG call to before TLLM_CHECK_WITH_INFO and pass the computed
expectedSize and actualSize (and adapterSize/isDora) to it, and change the
TLLM_CHECK_WITH_INFO message text from "to few" to "too few" while using the
already computed expectedSize/actualSize values in the check.

TLLM_CHECK_WITH_INFO(adapterSize <= maxAdapterSize,
"Invalid low_rank (" + std::to_string(adapterSize) + "). low_rank must be smaller than mMaxLowRank ("
+ std::to_string(maxAdapterSize) + ")");
63 changes: 59 additions & 4 deletions tensorrt_llm/_torch/model_config.py
@@ -441,21 +441,35 @@ def get_bindings_model_config(self,
model_config_cpp.set_num_kv_heads(num_kv_heads)

mlp_hidden_size = None
print(
f"DEBUG: Before if self.pretrained_config.intermediate_size is not None:"
)
if self.pretrained_config.intermediate_size is not None:
print(f"DEBUG: Intermediate size is not None")
mlp_hidden_size = self.pretrained_config.intermediate_size // self.mapping.tp_size
else:
print(f"DEBUG: Intermediate size is None")
# TODO: once tensorrt_llm._torch.AutoConfig is implemented, the following logic
# should be moved to tensorrt_llm._torch.AutoConfig of the relevant modeling_xxx file
if hasattr(self.pretrained_config, "architectures"
) and self.pretrained_config.architectures is not None:
architectures = self.pretrained_config.architectures
if len(architectures
) == 1 and architectures[0] == "DeciLMForCausalLM":
print(
f"DEBUG: Calling _infer_nemotron_ffn_mult for Nemotron model"
)
mlp_hidden_size = self._infer_nemotron_ffn_mult()
print(f"DEBUG: Final mlp_hidden_size: {mlp_hidden_size}")
print(f"DEBUG: TP size: {self.mapping.tp_size}")
print(
f"DEBUG: Expected mlp_hidden_size after TP: {mlp_hidden_size // self.mapping.tp_size}"
)
else:
raise ValueError(
f"Inferring mlp hidden size for model architecture: {architectures} isn't supported yet"
)
print(f"DEBUG: AFTER if mlp_hidden_size is None:")
if mlp_hidden_size is None:
raise ValueError(
f"Failed to infer mlp hidden size for model: {self.pretrained_config.model_type}"
@@ -474,6 +488,7 @@ def get_bindings_model_config(self,
head_size = hidden_size // num_heads

model_config_cpp.mlp_hidden_size = mlp_hidden_size
# model_config_cpp.coarse_mlp_hidden_size = self.coarse_mlp_hidden_size
model_config_cpp.size_per_head = head_size

# NOTE: this method is not robust, for Gemma3ForCausalLM only
@@ -488,18 +503,58 @@ def _infer_nemotron_ffn_mult(self):
# Nemotron-NAS has variable ffn_mult for each layer, we need to find the maximum
# so that we don't set a too small mlp_hidden_size. This solution leads to a memory
# consumption that is higher than required.
biggest_ffn_mult = max([
(x.ffn.ffn_mult if x.ffn.ffn_mult is not None else 0)
for x in self.pretrained_config.block_configs
])

print(
f"DEBUG: _infer_nemotron_ffn_mult - TP size: {self.mapping.tp_size}"
)
print(
f"DEBUG: _infer_nemotron_ffn_mult - Number of block_configs: {len(self.pretrained_config.block_configs)}"
)

ffn_mults = [(x.ffn.ffn_mult if x.ffn.ffn_mult is not None else 0)
for x in self.pretrained_config.block_configs]
print(f"DEBUG: _infer_nemotron_ffn_mult - All ffn_mults: {ffn_mults}")

biggest_ffn_mult = max(ffn_mults)
print(
f"DEBUG: _infer_nemotron_ffn_mult - Biggest ffn_mult: {biggest_ffn_mult}"
)

from tensorrt_llm._torch.models.modeling_nemotron_nas import \
_ffn_mult_to_intermediate_size
mlp_hidden_size = _ffn_mult_to_intermediate_size(
biggest_ffn_mult, self.pretrained_config.hidden_size)

print(
f"DEBUG: _infer_nemotron_ffn_mult - Calculated mlp_hidden_size: {mlp_hidden_size}"
)
print(
f"DEBUG: _infer_nemotron_ffn_mult - Hidden size: {self.pretrained_config.hidden_size}"
)

print(
f"DEBUG: _infer_nemotron_ffn_mult - Final TP-split mlp_hidden_size: {mlp_hidden_size}"
)
return mlp_hidden_size

@property
def coarse_mlp_hidden_size(self):
"""Get the MLP hidden size (TP-split) for LoRA padding calculations."""
if self.pretrained_config.intermediate_size is not None:
return self.pretrained_config.intermediate_size // self.mapping.tp_size
else:
# For Nemotron models, use the same logic as _infer_nemotron_ffn_mult
if (hasattr(self.pretrained_config, "architectures")
and self.pretrained_config.architectures is not None
and len(self.pretrained_config.architectures) == 1
and self.pretrained_config.architectures[0]
== "DeciLMForCausalLM"):
return self._infer_nemotron_ffn_mult()
else:
raise ValueError(
f"Failed to infer mlp hidden size for model: {self.pretrained_config.model_type}"
)

def get_layer_types(self) -> Optional[List[LayerTypeCpp]]:
"""
This method is a hack to support the effort to switch to KvCacheManagerCpp.
1 change: 1 addition & 0 deletions tensorrt_llm/_torch/pyexecutor/_util.py
@@ -467,6 +467,7 @@ def create_py_executor_instance(
# all layers have the same number of KV heads
num_kv_attention_heads = num_kv_attention_heads_per_layer[0]

# THEN UPDATE THE LoraModule.create_lora_modules CALL:
lora_modules = LoraModule.create_lora_modules(
lora_module_names=lora_config.lora_target_modules,
hidden_size=model_binding_config.hidden_size,
31 changes: 30 additions & 1 deletion tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -461,11 +461,40 @@ def runtime_draft_len(self):

def set_lora_model_config(self, lora_target_modules: list[str],
trtllm_modules_to_hf_modules: dict[str, str]):
coarse_mlp_hidden_size = None

# Debug: Check what type self.model.model_config is
print(
f"DEBUG: model_engine.py - self.model.model_config type: {type(self.model.model_config)}"
)
print(
f"DEBUG: model_engine.py - self.model.model_config dir: {dir(self.model.model_config)}"
)

if hasattr(self.model.model_config, 'coarse_mlp_hidden_size'):
coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
print(
f"DEBUG: model_engine.py - coarse_mlp_hidden_size: {coarse_mlp_hidden_size}"
)
else:
print(
f"DEBUG: model_engine.py - coarse_mlp_hidden_size property not found"
)
# Try direct access to see if it works
try:
coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
print(
f"DEBUG: model_engine.py - Direct access worked: {coarse_mlp_hidden_size}"
)
except AttributeError as e:
print(f"DEBUG: model_engine.py - Direct access failed: {e}")

Comment on lines +464 to +491

🛠️ Refactor suggestion

⚠️ Potential issue

Avoid hasattr on properties that may raise; replace prints with robust getattr + logger.debug.

coarse_mlp_hidden_size is a property that can raise ValueError for unsupported architectures. hasattr will evaluate the property and won’t catch ValueError, causing unexpected failures. Use getattr in a try/except, and use logger.debug instead of print.
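
For context, here is a minimal sketch (hypothetical class, not from this codebase) of the Python behavior being described: hasattr evaluates the property and only swallows AttributeError, so a ValueError escapes.

class FakeModelConfig:
    """Hypothetical stand-in for a config whose property may raise ValueError."""

    @property
    def coarse_mlp_hidden_size(self):
        # Mirrors the unsupported-architecture path described above.
        raise ValueError("unsupported architecture")

cfg = FakeModelConfig()

# hasattr() evaluates the property; it only catches AttributeError,
# so the ValueError propagates instead of returning False.
try:
    hasattr(cfg, "coarse_mlp_hidden_size")
except ValueError as e:
    print(f"hasattr did not protect us: {e}")

# Wrapping the access in try/except handles both failure modes.
try:
    value = cfg.coarse_mlp_hidden_size
except (AttributeError, ValueError):
    value = None
print(value)  # None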

Apply this diff:

-        coarse_mlp_hidden_size = None
-
-        # Debug: Check what type self.model.model_config is
-        print(
-            f"DEBUG: model_engine.py - self.model.model_config type: {type(self.model.model_config)}"
-        )
-        print(
-            f"DEBUG: model_engine.py - self.model.model_config dir: {dir(self.model.model_config)}"
-        )
-
-        if hasattr(self.model.model_config, 'coarse_mlp_hidden_size'):
-            coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
-            print(
-                f"DEBUG: model_engine.py - coarse_mlp_hidden_size: {coarse_mlp_hidden_size}"
-            )
-        else:
-            print(
-                f"DEBUG: model_engine.py - coarse_mlp_hidden_size property not found"
-            )
-            # Try direct access to see if it works
-            try:
-                coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
-                print(
-                    f"DEBUG: model_engine.py - Direct access worked: {coarse_mlp_hidden_size}"
-                )
-            except AttributeError as e:
-                print(f"DEBUG: model_engine.py - Direct access failed: {e}")
+        coarse_mlp_hidden_size = None
+        try:
+            coarse_mlp_hidden_size = getattr(self.model.model_config, 'coarse_mlp_hidden_size', None)
+            logger.debug("model_engine | coarse_mlp_hidden_size=%s", str(coarse_mlp_hidden_size))
+        except Exception as e:
+            # Property exists but inference failed; proceed without it.
+            logger.debug("model_engine | coarse_mlp_hidden_size unavailable: %s", e)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
coarse_mlp_hidden_size = None
# Debug: Check what type self.model.model_config is
print(
f"DEBUG: model_engine.py - self.model.model_config type: {type(self.model.model_config)}"
)
print(
f"DEBUG: model_engine.py - self.model.model_config dir: {dir(self.model.model_config)}"
)
if hasattr(self.model.model_config, 'coarse_mlp_hidden_size'):
coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
print(
f"DEBUG: model_engine.py - coarse_mlp_hidden_size: {coarse_mlp_hidden_size}"
)
else:
print(
f"DEBUG: model_engine.py - coarse_mlp_hidden_size property not found"
)
# Try direct access to see if it works
try:
coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
print(
f"DEBUG: model_engine.py - Direct access worked: {coarse_mlp_hidden_size}"
)
except AttributeError as e:
print(f"DEBUG: model_engine.py - Direct access failed: {e}")
coarse_mlp_hidden_size = None
try:
coarse_mlp_hidden_size = getattr(self.model.model_config, 'coarse_mlp_hidden_size', None)
logger.debug("model_engine | coarse_mlp_hidden_size=%s", str(coarse_mlp_hidden_size))
except Exception as e:
# Property exists but inference failed; proceed without it.
logger.debug("model_engine | coarse_mlp_hidden_size unavailable: %s", e)
🤖 Prompt for AI Agents
In tensorrt_llm/_torch/pyexecutor/model_engine.py around lines 464 to 491,
replace the current hasattr/print debug block with a single try/except that
reads coarse_mlp_hidden_size once via getattr (or direct attribute access) and
catches both ValueError and AttributeError so property access that raises is
handled; on success assign the value to coarse_mlp_hidden_size and call
logger.debug with the value, on exception set coarse_mlp_hidden_size = None and
log the failure with logger.debug (do not use print), ensuring you do not
evaluate the property twice and you handle both kinds of errors.

self.lora_model_config = LoraModelConfig(
lora_target_modules=lora_target_modules,
trtllm_modules_to_hf_modules=trtllm_modules_to_hf_modules,
hidden_size=self.model.config.hidden_size,
dtype=torch_dtype_to_str(self.model.config.torch_dtype))
dtype=torch_dtype_to_str(self.model.config.torch_dtype),
coarse_mlp_hidden_size=coarse_mlp_hidden_size)

@property
def use_mrope(self):
106 changes: 100 additions & 6 deletions tensorrt_llm/lora_manager.py
@@ -243,6 +243,7 @@ class LoraModelConfig:
trtllm_modules_to_hf_modules: dict[str, str]
hidden_size: int
dtype: str
coarse_mlp_hidden_size: Optional[int] = None


class HfLoraLoader:
@@ -1133,14 +1134,107 @@ def load_from_model_dir(uid, model_dir, hf_config):
)
)

max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
self._cpp_lora_weights[uid] = torch.stack(
[
torch.nn.functional.pad(w, (0, max_weight_size - w.size(0)))
for w in self._cpp_lora_weights[uid]
]
# Handle both ModelConfig and LoraModelConfig types
print(f"DEBUG: model_config type: {type(model_config)}")
print(
f"DEBUG: model_config has coarse_mlp_hidden_size: {hasattr(model_config, 'coarse_mlp_hidden_size')}"
)
print(
f"DEBUG: model_config.coarse_mlp_hidden_size value: {model_config.coarse_mlp_hidden_size}"
)
print(
f"DEBUG: model_config.coarse_mlp_hidden_size is None: {model_config.coarse_mlp_hidden_size is None}"
)

Comment on lines +1137 to +1148

💡 Verification agent

🧩 Analysis chain

Replace print-based debugging with logger.debug and trim verbosity.

Using print in library code is noisy and hard to control. Prefer logger.debug (already configured as logger = logging.getLogger(__name__)). Collapse multiple prints into a single succinct line.
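
For reference, a minimal sketch of the suggested pattern (module and function names are illustrative only); with %-style arguments the formatting cost is paid only when DEBUG is actually enabled:

import logging

# Module-level logger, as the comment above assumes is already configured.
logger = logging.getLogger(__name__)

def report_model_config(model_config):
    # One consolidated, lazily formatted debug line instead of several print() calls.
    logger.debug(
        "LoRA padding | model_config=%s | coarse=%s",
        type(model_config).__name__,
        getattr(model_config, "coarse_mlp_hidden_size", None),
    )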

Apply this diff:

-            print(f"DEBUG: model_config type: {type(model_config)}")
-            print(
-                f"DEBUG: model_config has coarse_mlp_hidden_size: {hasattr(model_config, 'coarse_mlp_hidden_size')}"
-            )
-            print(
-                f"DEBUG: model_config.coarse_mlp_hidden_size value: {model_config.coarse_mlp_hidden_size}"
-            )
-            print(
-                f"DEBUG: model_config.coarse_mlp_hidden_size is None: {model_config.coarse_mlp_hidden_size is None}"
-            )
+            logger.debug(
+                "LoRA padding | model_config=%s | has_coarse=%s | coarse=%s",
+                type(model_config),
+                hasattr(model_config, "coarse_mlp_hidden_size"),
+                getattr(model_config, "coarse_mlp_hidden_size", None),
+            )

Run to confirm no stray prints remain in lora_manager/model_engine:


🏁 Script executed:

#!/bin/bash
rg -nP '^\s*print\(' tensorrt_llm/lora_manager.py tensorrt_llm/_torch/pyexecutor/model_engine.py

Length of output: 3027


Mandate: Replace all print-based debugging with logger.debug across Lora Manager and Model Engine

Your grep shows numerous stray print(...) calls—both in tensorrt_llm/lora_manager.py and in tensorrt_llm/_torch/pyexecutor/model_engine.py. These should all be converted to controlled, leveled logging rather than uncontrolled stdout.

• Files and ranges needing refactor:

  • tensorrt_llm/lora_manager.py: lines ~1138–1154, 1162–1235
  • tensorrt_llm/_torch/pyexecutor/model_engine.py: lines ~467–490

• Primary change (example for LoraManager snippet at 1137–1148):

-            print(f"DEBUG: model_config type: {type(model_config)}")
-            print(
-                f"DEBUG: model_config has coarse_mlp_hidden_size: {hasattr(model_config, 'coarse_mlp_hidden_size')}"
-            )
-            print(
-                f"DEBUG: model_config.coarse_mlp_hidden_size value: {model_config.coarse_mlp_hidden_size}"
-            )
-            print(
-                f"DEBUG: model_config.coarse_mlp_hidden_size is None: {model_config.coarse_mlp_hidden_size is None}"
-            )
+            logger.debug(
+                "LoRA padding | model_config=%s | has_coarse=%s | coarse=%s",
+                type(model_config),
+                hasattr(model_config, "coarse_mlp_hidden_size"),
+                getattr(model_config, "coarse_mlp_hidden_size", None),
+            )

Apply the same pattern to all other debug prints:

  • Consolidate related prints into single logger.debug calls with structured arguments.
  • Ensure you import and use the module logger (logger = logging.getLogger(__name__)).
  • Remove any remaining print(...) debug statements.
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Handle both ModelConfig and LoraModelConfig types
print(f"DEBUG: model_config type: {type(model_config)}")
print(
f"DEBUG: model_config has coarse_mlp_hidden_size: {hasattr(model_config, 'coarse_mlp_hidden_size')}"
)
print(
f"DEBUG: model_config.coarse_mlp_hidden_size value: {model_config.coarse_mlp_hidden_size}"
)
print(
f"DEBUG: model_config.coarse_mlp_hidden_size is None: {model_config.coarse_mlp_hidden_size is None}"
)
# Handle both ModelConfig and LoraModelConfig types
logger.debug(
"LoRA padding | model_config=%s | has_coarse=%s | coarse=%s",
type(model_config),
hasattr(model_config, "coarse_mlp_hidden_size"),
getattr(model_config, "coarse_mlp_hidden_size", None),
)
🤖 Prompt for AI Agents
In tensorrt_llm/lora_manager.py around lines 1137 to 1148 (and extend the same
change across the other indicated ranges ~1162–1235) replace the ad-hoc
print(...) debug statements with structured logger.debug calls: add or ensure a
module-level logger = logging.getLogger(__name__) import logging at top if
missing, consolidate related prints into single logger.debug invocations with
clear messages and variables (e.g., logger.debug("model_config type=%s
coarse_mlp_hidden_size=%s is_none=%s", type(model_config), getattr(model_config,
"coarse_mlp_hidden_size", None), getattr(model_config, "coarse_mlp_hidden_size",
None) is None)), and remove all remaining print(...) calls; apply the identical
replacement pattern to tensorrt_llm/_torch/pyexecutor/model_engine.py lines
~467–490 so no print-based debugging remains.

# Handle both ModelConfig and LoraModelConfig types
if (
hasattr(model_config, "coarse_mlp_hidden_size")
and model_config.coarse_mlp_hidden_size is not None
):
print(
f"DEBUG: INSIDE load_from_hf: model_config.coarse_mlp_hidden_size: "
f"{model_config.coarse_mlp_hidden_size}"
)
M_coarse = model_config.coarse_mlp_hidden_size
H = model_config.hidden_size
rank = int(hf_config["r"])

print(f"DEBUG: load_from_hf - M_coarse: {M_coarse}")
print(f"DEBUG: load_from_hf - tp_size: {tp_size}")
print(f"DEBUG: load_from_hf - H (hidden_size): {H}")
print(f"DEBUG: load_from_hf - rank: {rank}")

M_coarse_tp = M_coarse * tp_size
max_weight_size = rank * M_coarse_tp + rank * H

Comment on lines +1167 to +1169

⚠️ Potential issue

TP double-counting in max_weight_size; coarse_mlp_hidden_size is already TP-split.

coarse_mlp_hidden_size returns the TP-split M value. Multiplying by tp_size again inflates padding by tp_size, wasting memory significantly (especially on 49B). Compute with the TP-split value directly.
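
To make the inflation concrete, a rough back-of-the-envelope sketch with purely hypothetical numbers (rank, hidden size, and TP-split intermediate size below are placeholders, not the real 49B values):

rank = 32          # LoRA rank (illustrative)
H = 8192           # hidden_size (illustrative)
M_tp = 16384       # coarse_mlp_hidden_size, already divided by tp_size (illustrative)
tp_size = 4

correct = rank * (M_tp + H)              # 786432 values per module
inflated = rank * (M_tp * tp_size + H)   # 2359296 values per module
print(inflated / correct)                # ~3x more padding than needed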

Apply this diff:

-                M_coarse_tp = M_coarse * tp_size
-                max_weight_size = rank * M_coarse_tp + rank * H
+                # coarse_mlp_hidden_size is already TP-split; do not multiply by tp_size again.
+                M_coarse_tp = M_coarse
+                # For MLP up/gate or down, flattened size per module is rank*(M_coarse_tp + H).
+                # This upper-bounds attention modules too since M_coarse_tp >= H in typical configs.
+                calc_max_weight_size = rank * (M_coarse_tp + H)

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In tensorrt_llm/lora_manager.py around lines 1167 to 1169, the code multiplies
the TP-split coarse_mlp_hidden_size (M_coarse) by tp_size (M_coarse_tp) and then
uses that to compute max_weight_size, which double-counts tensor-parallel
splitting and inflates padding; remove the extra multiplication and compute
max_weight_size using the TP-split value directly (drop M_coarse_tp and use
M_coarse in the expression so max_weight_size = rank * M_coarse + rank * H).

print(f"DEBUG: load_from_hf - M_coarse_tp: {M_coarse_tp}")
print(
f"DEBUG: load_from_hf - max_weight_size calculation: "
f"{rank} * {M_coarse_tp} + {rank} * {H} = {max_weight_size}"
)

# Debug actual weights before padding
print(
f"DEBUG: load_from_hf - Number of weight tensors: {len(self._cpp_lora_weights[uid])}"
)
for i, w in enumerate(self._cpp_lora_weights[uid]):
print(
f"DEBUG: load_from_hf - Weight {i} shape: {w.shape}, size(0): {w.size(0)}"
)

# Debug the actual maximum weight size
actual_max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
print(f"DEBUG: load_from_hf - Actual max weight size: {actual_max_weight_size}")
print(f"DEBUG: load_from_hf - Calculated max_weight_size: {max_weight_size}")
print(
f"DEBUG: load_from_hf - Difference: {max_weight_size - actual_max_weight_size}"
)
Comment on lines +1186 to +1191

⚠️ Potential issue

Guard against negative padding; ensure computed max covers actual max.

If calc_max_weight_size underestimates (e.g., DoRA adds a magnitude vector), padding_needed becomes negative and F.pad silently crops the weight tensor instead of padding it. Use the actual maximum size as a floor.
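
A standalone sketch (made-up sizes) of why the floor matters; note that in recent PyTorch versions a negative pad value crops rather than raising, so an underestimated maximum would silently truncate weights:

import torch
import torch.nn.functional as F

w = torch.arange(6.0)             # pretend flattened LoRA weights, 6 values
calc_max_weight_size = 4          # underestimated target size

padding_needed = calc_max_weight_size - w.size(0)   # -2
cropped = F.pad(w, (0, padding_needed))             # negative pad crops to 4 values
print(cropped.shape)                                # torch.Size([4])

# Flooring the target by the observed maximum keeps padding non-negative.
max_weight_size = max(calc_max_weight_size, w.size(0))
safe = F.pad(w, (0, max_weight_size - w.size(0)))
print(safe.shape)                                   # torch.Size([6])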

Apply this diff:

-                actual_max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
-                print(f"DEBUG: load_from_hf - Actual max weight size: {actual_max_weight_size}")
-                print(f"DEBUG: load_from_hf - Calculated max_weight_size: {max_weight_size}")
-                print(
-                    f"DEBUG: load_from_hf - Difference: {max_weight_size - actual_max_weight_size}"
-                )
+                actual_max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
+                # Account for possible DoRA magnitude vector (+M_coarse_tp) by taking the max with actual.
+                max_weight_size = max(calc_max_weight_size, actual_max_weight_size)
+                logger.debug(
+                    "LoRA padding | calc=%d actual=%d final=%d",
+                    calc_max_weight_size, actual_max_weight_size, max_weight_size
+                )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
actual_max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
print(f"DEBUG: load_from_hf - Actual max weight size: {actual_max_weight_size}")
print(f"DEBUG: load_from_hf - Calculated max_weight_size: {max_weight_size}")
print(
f"DEBUG: load_from_hf - Difference: {max_weight_size - actual_max_weight_size}"
)
actual_max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
# Account for possible DoRA magnitude vector (+M_coarse_tp) by taking the max with actual.
max_weight_size = max(calc_max_weight_size, actual_max_weight_size)
logger.debug(
"LoRA padding | calc=%d actual=%d final=%d",
calc_max_weight_size, actual_max_weight_size, max_weight_size
)
🤖 Prompt for AI Agents
In tensorrt_llm/lora_manager.py around lines 1186 to 1191, the computed
max_weight_size can be less than the actual_max_weight_size causing negative
padding_needed and F.pad to fail; before padding, ensure max_weight_size is
floored to the observed actual maximum (e.g., max_weight_size =
max(max_weight_size, actual_max_weight_size)) and/or compute padding_needed =
max(0, max_weight_size - w.size(0)) so padding_needed is never negative.


# Debug module-specific sizes
print(
f"DEBUG: load_from_hf - Number of modules: {len(self._cpp_lora_weights[uid])}"
)
print("DEBUG: load_from_hf - Module sizes by index:")
for i, w in enumerate(self._cpp_lora_weights[uid]):
print(f"DEBUG: load_from_hf - Module {i}: {w.size(0)}")

# Debug which modules are failing
print("DEBUG: load_from_hf - Checking which modules might fail validation:")
for i, w in enumerate(self._cpp_lora_weights[uid]):
if w.size(0) < max_weight_size:
print(
f"DEBUG: load_from_hf - Module {i} will be padded: {w.size(0)} -> {max_weight_size}"
)
else:
print(f"DEBUG: load_from_hf - Module {i} no padding needed: {w.size(0)}")

else:
# Final fallback: use the maximum size of actual weights
max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
print(f"DEBUG: load_from_hf - Using fallback max_weight_size: {max_weight_size}")

print(f"DEBUG: load_from_hf - Final max_weight_size: {max_weight_size}")

# Debug padding process
padded_weights = []
for i, w in enumerate(self._cpp_lora_weights[uid]):
padding_needed = max_weight_size - w.size(0)
print(
f"DEBUG: load_from_hf - Weight {i}: original size {w.size(0)}, padding {padding_needed}"
)
padded_w = torch.nn.functional.pad(w, (0, padding_needed))
print(f"DEBUG: load_from_hf - Weight {i}: padded size {padded_w.size(0)}")
padded_weights.append(padded_w)

Comment on lines +1219 to +1228

🛠️ Refactor suggestion

Safer padding: clamp to zero and avoid per-weight print loops.

Even with the fix above, clamping is a cheap safeguard against negative padding on rare edge cases. Replace the prints with a single debug log if needed.

Apply this diff:

-            padded_weights = []
-            for i, w in enumerate(self._cpp_lora_weights[uid]):
-                padding_needed = max_weight_size - w.size(0)
-                print(
-                    f"DEBUG: load_from_hf - Weight {i}: original size {w.size(0)}, padding {padding_needed}"
-                )
-                padded_w = torch.nn.functional.pad(w, (0, padding_needed))
-                print(f"DEBUG: load_from_hf - Weight {i}: padded size {padded_w.size(0)}")
-                padded_weights.append(padded_w)
+            padded_weights = []
+            for i, w in enumerate(self._cpp_lora_weights[uid]):
+                padding_needed = max_weight_size - w.size(0)
+                if padding_needed < 0:
+                    # Should not happen with the guard above; fall back defensively.
+                    logger.warning("LoRA padding | negative padding=%d at idx=%d; widening to actual size", padding_needed, i)
+                    padding_needed = 0
+                padded_weights.append(torch.nn.functional.pad(w, (0, padding_needed)))
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
padded_weights = []
for i, w in enumerate(self._cpp_lora_weights[uid]):
padding_needed = max_weight_size - w.size(0)
print(
f"DEBUG: load_from_hf - Weight {i}: original size {w.size(0)}, padding {padding_needed}"
)
padded_w = torch.nn.functional.pad(w, (0, padding_needed))
print(f"DEBUG: load_from_hf - Weight {i}: padded size {padded_w.size(0)}")
padded_weights.append(padded_w)
padded_weights = []
for i, w in enumerate(self._cpp_lora_weights[uid]):
padding_needed = max_weight_size - w.size(0)
if padding_needed < 0:
# Should not happen with the guard above; fall back defensively.
logger.warning(
"LoRA padding | negative padding=%d at idx=%d; widening to actual size",
padding_needed,
i,
)
padding_needed = 0
padded_weights.append(torch.nn.functional.pad(w, (0, padding_needed)))
🤖 Prompt for AI Agents
In tensorrt_llm/lora_manager.py around lines 1219 to 1228, the per-weight debug
prints and unguarded padding can cause noisy logs and crashes when
padding_needed is negative; change padding_needed to be clamped to zero (e.g.
use max(padding_needed, 0) or torch.clamp_min) before calling pad, remove the
per-weight print statements inside the loop, and instead emit a single debug
summarizing counts/sizes after the loop (or conditionally log once when any
padding occurred). Ensure padded_weights.append is still executed and behavior
unchanged when padding_needed is zero.

self._cpp_lora_weights[uid] = torch.stack(padded_weights)
print(
f"DEBUG: load_from_hf - Final stacked weights shape: {self._cpp_lora_weights[uid].shape}"
)

self._cpp_lora_config[uid] = torch.stack([c for c in self._cpp_lora_config[uid]])
print(
f"DEBUG: load_from_hf - Final stacked config shape: {self._cpp_lora_config[uid].shape}"
)

for uid, model_dir, hf_config in zip(new_uids, new_model_dirs, lora_hf_configs):
load_from_model_dir(uid, model_dir, hf_config)
74 changes: 74 additions & 0 deletions tests/integration/defs/examples/test_nemotron_nas.py
@@ -1,10 +1,16 @@
from pathlib import Path

import defs.ci_profiler
import pytest
from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
from defs.conftest import get_device_memory, get_sm_version
from defs.trt_test_alternative import check_call

from tensorrt_llm import LLM
from tensorrt_llm.executor.request import LoRARequest
from tensorrt_llm.lora_manager import LoraConfig
from tensorrt_llm.sampling_params import SamplingParams

# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
@@ -122,3 +128,71 @@ def test_nemotron_nas_summary_2gpu(nemotron_nas_example_root, llm_venv,
]

venv_mpi_check_call(llm_venv, mpi_cmd, summary_cmd)


@pytest.mark.skip_less_device(4)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("nemotron_nas_model_root", [
"Llama-3_3-Nemotron-Super-49B-v1",
],
indirect=True)
def test_nemotron_super_49b_real_lora_torch(nemotron_nas_example_root, llm_venv,
nemotron_nas_model_root,
llm_datasets_root, llm_rouge_root,
engine_dir, cmodel_dir):
"""Run Nemotron Super 49B with real LoRA adapters using LLM-API Torch backend."""

print("Testing Nemotron Super 49B with real LoRA adapters...")

lora_adapter_path = f"/code/tensorrt_llm/llama-3.3-nemotron-super-49b-v1/llama-3.3-nemotron-super-49b-v1_vlora-1a2cb80-v2"
print(f"Using real LoRA from: {lora_adapter_path}")

defs.ci_profiler.start("test_nemotron_real_lora_torch")

lora_config = LoraConfig(
lora_dir=[lora_adapter_path],
max_lora_rank=32, # From adapter_config.json: "r": 32
max_loras=1,
max_cpu_loras=1,
)

with LLM(model=nemotron_nas_model_root,
lora_config=lora_config,
tensor_parallel_size=4,
dtype="bfloat16",
max_batch_size=2,
max_input_len=512,
max_seq_len=1024,
max_beam_width=1) as llm:

prompts = [
"What is the capital of France?",
"Explain quantum computing in simple terms."
]

sampling_params = SamplingParams(max_tokens=50,
temperature=0.7,
top_p=0.9)

lora_request = [LoRARequest("nemotron-lora", 0, lora_adapter_path)]

print("Running inference with real LoRA adapter...")
outputs = llm.generate(prompts,
sampling_params,
lora_request=lora_request)

for i, output in enumerate(outputs):
print(f"Prompt {i+1}: {prompts[i]}")
print(f"Response {i+1}: {output.outputs[0].text}")
print("-" * 50)

assert len(outputs) == 2
assert len(outputs[0].outputs) > 0
assert len(outputs[1].outputs) > 0
assert len(outputs[0].outputs[0].text) > 0
assert len(outputs[1].outputs[0].text) > 0

defs.ci_profiler.stop("test_nemotron_real_lora_torch")
print(
f"test_nemotron_real_lora_torch: {defs.ci_profiler.elapsed_time_in_sec('test_nemotron_real_lora_torch')} sec"
)
Loading