diff --git a/cpp/tensorrt_llm/runtime/loraUtils.cpp b/cpp/tensorrt_llm/runtime/loraUtils.cpp
index d897a3195c2..3aff55c0c5e 100644
--- a/cpp/tensorrt_llm/runtime/loraUtils.cpp
+++ b/cpp/tensorrt_llm/runtime/loraUtils.cpp
@@ -107,6 +107,12 @@ void loraValidateRequestTensors(std::optional const& optTaskId,
         TLLM_CHECK_WITH_INFO(it != loraModules.end(), "lora module " + moduleName + " not enabled for this model");
         TLLM_CHECK_WITH_INFO(it->flattenedInOutSize(adapterSize, isDora) <= weights->getShape().d[2],
             "lora_weights has to few values for " + moduleName);
+
+        auto expectedSize = it->flattenedInOutSize(adapterSize, isDora);
+        auto actualSize = weights->getShape().d[2];
+        TLLM_LOG_DEBUG("LoRA validation for %s - expected size: %ld, actual size: %ld, adapterSize: %d, isDora: %d",
+            moduleName.c_str(), static_cast<long>(expectedSize), static_cast<long>(actualSize), adapterSize, static_cast<int>(isDora));
+
         TLLM_CHECK_WITH_INFO(adapterSize <= maxAdapterSize,
             "Invalid low_rank (" + std::to_string(adapterSize) + "). low_rank must be smaller than mMaxLowRank ("
                 + std::to_string(maxAdapterSize) + ")");
diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py
index 7e310f934ac..783b1b007b1 100644
--- a/tensorrt_llm/_torch/model_config.py
+++ b/tensorrt_llm/_torch/model_config.py
@@ -441,9 +441,14 @@ def get_bindings_model_config(self,
         model_config_cpp.set_num_kv_heads(num_kv_heads)
 
         mlp_hidden_size = None
+        print(
+            "DEBUG: Before 'pretrained_config.intermediate_size is not None' check"
+        )
         if self.pretrained_config.intermediate_size is not None:
+            print("DEBUG: Intermediate size is not None")
             mlp_hidden_size = self.pretrained_config.intermediate_size // self.mapping.tp_size
         else:
+            print("DEBUG: Intermediate size is None")
             # TODO: once tensorrt_llm._torch.AutoConfig is implemented, the following logic
             # should be moved to tensorrt_llm._torch.AutoConfig of the relevant modeling_xxx file
             if hasattr(self.pretrained_config, "architectures"
@@ -451,11 +456,20 @@ def get_bindings_model_config(self,
                 architectures = self.pretrained_config.architectures
                 if len(architectures
                        ) == 1 and architectures[0] == "DeciLMForCausalLM":
+                    print(
+                        "DEBUG: Calling _infer_nemotron_ffn_mult for Nemotron model"
+                    )
                     mlp_hidden_size = self._infer_nemotron_ffn_mult()
+                    print(f"DEBUG: Final mlp_hidden_size: {mlp_hidden_size}")
+                    print(f"DEBUG: TP size: {self.mapping.tp_size}")
+                    print(
+                        f"DEBUG: mlp_hidden_size // tp_size would be: {mlp_hidden_size // self.mapping.tp_size}"
+                    )
                 else:
                     raise ValueError(
                         f"Inferring mlp hidden size for model architecture: {architectures} isn't supported yet"
                     )
+            print("DEBUG: After architecture-based mlp_hidden_size inference")
         if mlp_hidden_size is None:
             raise ValueError(
                 f"Failed to infer mlp hidden size for model: {self.pretrained_config.model_type}"
@@ -474,6 +488,7 @@ def get_bindings_model_config(self,
             head_size = hidden_size // num_heads
 
         model_config_cpp.mlp_hidden_size = mlp_hidden_size
+        # model_config_cpp.coarse_mlp_hidden_size = self.coarse_mlp_hidden_size
         model_config_cpp.size_per_head = head_size
 
         # NOTE: this method is not robust, for Gemma3ForCausalLM only
@@ -488,18 +503,58 @@ def _infer_nemotron_ffn_mult(self):
         # Nemotron-NAS has variable ffn_mult for each layer, we need to find the maximum
         # so that we don't set a too small mlp_hidden_size. This solution leads to a memory
         # consumption that is higher than required.
-        biggest_ffn_mult = max([
-            (x.ffn.ffn_mult if x.ffn.ffn_mult is not None else 0)
-            for x in self.pretrained_config.block_configs
-        ])
+
+        print(
+            f"DEBUG: _infer_nemotron_ffn_mult - TP size: {self.mapping.tp_size}"
+        )
+        print(
+            f"DEBUG: _infer_nemotron_ffn_mult - Number of block_configs: {len(self.pretrained_config.block_configs)}"
+        )
+
+        ffn_mults = [(x.ffn.ffn_mult if x.ffn.ffn_mult is not None else 0)
+                     for x in self.pretrained_config.block_configs]
+        print(f"DEBUG: _infer_nemotron_ffn_mult - All ffn_mults: {ffn_mults}")
+
+        biggest_ffn_mult = max(ffn_mults)
+        print(
+            f"DEBUG: _infer_nemotron_ffn_mult - Biggest ffn_mult: {biggest_ffn_mult}"
+        )
 
         from tensorrt_llm._torch.models.modeling_nemotron_nas import \
             _ffn_mult_to_intermediate_size
 
         mlp_hidden_size = _ffn_mult_to_intermediate_size(
             biggest_ffn_mult, self.pretrained_config.hidden_size)
+        print(
+            f"DEBUG: _infer_nemotron_ffn_mult - Calculated mlp_hidden_size: {mlp_hidden_size}"
+        )
+        print(
+            f"DEBUG: _infer_nemotron_ffn_mult - Hidden size: {self.pretrained_config.hidden_size}"
+        )
+
+        print(
+            f"DEBUG: _infer_nemotron_ffn_mult - Returned mlp_hidden_size (not TP-split): {mlp_hidden_size}"
+        )
 
         return mlp_hidden_size
 
+    @property
+    def coarse_mlp_hidden_size(self):
+        """Return the TP-split MLP hidden size used for LoRA weight padding."""
+        if self.pretrained_config.intermediate_size is not None:
+            return self.pretrained_config.intermediate_size // self.mapping.tp_size
+        else:
+            # For Nemotron models, reuse _infer_nemotron_ffn_mult and split across TP ranks
+            if (hasattr(self.pretrained_config, "architectures")
+                    and self.pretrained_config.architectures is not None
+                    and len(self.pretrained_config.architectures) == 1
+                    and self.pretrained_config.architectures[0]
+                    == "DeciLMForCausalLM"):
+                return self._infer_nemotron_ffn_mult() // self.mapping.tp_size
+            else:
+                raise ValueError(
+                    f"Failed to infer mlp hidden size for model: {self.pretrained_config.model_type}"
+                )
+
     def get_layer_types(self) -> Optional[List[LayerTypeCpp]]:
         """
         This method is a hack to support the effort to switch to KvCacheManagerCpp.
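
For reference, the relationship the model_config.py changes above rely on can be sketched as follows. This is a minimal illustration only: it assumes a DeciLM-style rounding rule (roughly 2/3 * ffn_mult * hidden_size, rounded up to a multiple of 256), and the block-config values and sizes are made up. The authoritative implementation is `_ffn_mult_to_intermediate_size` in `tensorrt_llm/_torch/models/modeling_nemotron_nas.py`.

```python
import math


def ffn_mult_to_intermediate_size(ffn_mult: float, hidden_size: int, multiple_of: int = 256) -> int:
    # Assumed DeciLM-style rule: scale the hidden size by 2/3 * ffn_mult, then round
    # up to a hardware-friendly multiple. The real helper may differ in detail.
    raw = int(2 * ffn_mult * hidden_size / 3)
    return multiple_of * math.ceil(raw / multiple_of)


# Hypothetical Nemotron-NAS-style per-block multipliers; blocks without an FFN count as 0.
ffn_mults = [2.625, 5.25, 3.28125, 0.0]
hidden_size, tp_size = 8192, 4  # illustrative sizes only

full_mlp_hidden = ffn_mult_to_intermediate_size(max(ffn_mults), hidden_size)
per_rank_mlp_hidden = full_mlp_hidden // tp_size  # the TP-split value coarse_mlp_hidden_size aims to expose
print(full_mlp_hidden, per_rank_mlp_hidden)
```

Taking the maximum ffn_mult, as the comment in `_infer_nemotron_ffn_mult` explains, guarantees the padded size is large enough for every layer at the cost of some extra memory.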
diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py
index ca224f0b2cd..a78a4a6df03 100644
--- a/tensorrt_llm/_torch/pyexecutor/_util.py
+++ b/tensorrt_llm/_torch/pyexecutor/_util.py
@@ -467,6 +467,7 @@ def create_py_executor_instance(
         # all layers have the same number of KV heads
         num_kv_attention_heads = num_kv_attention_heads_per_layer[0]
 
+    # TODO: update the LoraModule.create_lora_modules call below to account for the coarse (TP-split) MLP hidden size.
     lora_modules = LoraModule.create_lora_modules(
         lora_module_names=lora_config.lora_target_modules,
         hidden_size=model_binding_config.hidden_size,
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 22a53c4666f..b52e38dfd66 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -461,11 +461,40 @@ def runtime_draft_len(self):
 
     def set_lora_model_config(self, lora_target_modules: list[str],
                               trtllm_modules_to_hf_modules: dict[str, str]):
+        coarse_mlp_hidden_size = None
+
+        # Debug: check which config type self.model.model_config actually is.
+        print(
+            f"DEBUG: model_engine.py - self.model.model_config type: {type(self.model.model_config)}"
+        )
+        print(
+            f"DEBUG: model_engine.py - self.model.model_config dir: {dir(self.model.model_config)}"
+        )
+
+        if hasattr(self.model.model_config, 'coarse_mlp_hidden_size'):
+            coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
+            print(
+                f"DEBUG: model_engine.py - coarse_mlp_hidden_size: {coarse_mlp_hidden_size}"
+            )
+        else:
+            print(
+                "DEBUG: model_engine.py - coarse_mlp_hidden_size property not found"
+            )
+            # Try direct access to see whether the attribute exists anyway.
+            try:
+                coarse_mlp_hidden_size = self.model.model_config.coarse_mlp_hidden_size
+                print(
+                    f"DEBUG: model_engine.py - Direct access worked: {coarse_mlp_hidden_size}"
+                )
+            except AttributeError as e:
+                print(f"DEBUG: model_engine.py - Direct access failed: {e}")
+
         self.lora_model_config = LoraModelConfig(
             lora_target_modules=lora_target_modules,
             trtllm_modules_to_hf_modules=trtllm_modules_to_hf_modules,
             hidden_size=self.model.config.hidden_size,
-            dtype=torch_dtype_to_str(self.model.config.torch_dtype))
+            dtype=torch_dtype_to_str(self.model.config.torch_dtype),
+            coarse_mlp_hidden_size=coarse_mlp_hidden_size)
 
     @property
     def use_mrope(self):
diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
index 9a5b42166dc..1a4fb2adba4 100644
--- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py
+++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -1037,6 +1037,13 @@ def __init__(self,
                          world_config=world_config,
                          buffer_manager=buffer_manager)
         self._lora_config = lora_config
+        # if model_engine is not None and hasattr(model_engine, "lora_model_config"):
+        #     self._lora_model_config = model_engine.lora_model_config
+        # else:
+        #     self._lora_model_config = LoraModelConfig(
+        #         lora_config.lora_target_modules,
+        #         lora_config.trtllm_modules_to_hf_modules, model_config.hidden_size,
+        #         binding_to_str_dtype(model_config.data_type))
         self._lora_model_config = LoraModelConfig(
             lora_config.lora_target_modules,
             lora_config.trtllm_modules_to_hf_modules, model_config.hidden_size,
@@ -1052,6 +1059,9 @@ def add_request_peft(self, request: LlmRequest):
             # cached, we can safely remove both from the request.
             request.remove_lora_tensors()
         elif request.lora_weights is None and request.py_lora_path:
+            print(
+                f"DEBUG: INSIDE add_request_peft: request.py_lora_path: {request.py_lora_path}"
+            )
             self._lora_manager.load_from_ckpt(
                 [request.py_lora_path],
                 model_config=self._lora_model_config,
diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py
index 6d5ec9c1d78..0450b6eba82 100644
--- a/tensorrt_llm/executor/worker.py
+++ b/tensorrt_llm/executor/worker.py
@@ -356,11 +356,31 @@ def start(self):
     def _load_lora_adapter(self, lora_request: LoRARequest) -> bool:
         """Returns True if the adapter was loaded by this call, False if it was already loaded"""
         adapter_id = str(lora_request.adapter_id)
+
+        # Create runtime_mapping from executor_config.mapping
+        from tensorrt_llm.mapping import Mapping
+        if hasattr(self._executor_config,
+                   "mapping") and self._executor_config.mapping is not None:
+            mapping = self._executor_config.mapping
+            # Calculate world_size to satisfy the constraint: world_size = tp_size * pp_size * cp_size
+            world_size = mapping.tp_size * mapping.pp_size * mapping.cp_size
+
+            runtime_mapping = Mapping(
+                world_size=world_size,  # Mapping requires world_size == tp_size * pp_size * cp_size
+                tp_size=mapping.tp_size,
+                pp_size=mapping.pp_size,
+                cp_size=mapping.cp_size,
+                rank=mapping.rank,
+                gpus_per_node=mapping.gpus_per_node)
+        else:
+            # Fall back to the default (single-rank) mapping
+            runtime_mapping = Mapping()
+
         newly_loaded_uids = self._lora_manager.load_from_ckpt(
             [lora_request.path],
             model_config=self._runtime_model_config if
             self._runtime_model_config is not None else self._lora_model_config,
-            runtime_mapping=None,
+            runtime_mapping=runtime_mapping,  # pass the reconstructed mapping instead of None
             uids=[adapter_id],
             ckpt_source=lora_request.ckpt_source)
         return adapter_id in newly_loaded_uids
diff --git a/tensorrt_llm/lora_manager.py b/tensorrt_llm/lora_manager.py
index 7440715474c..a5ba4ebc185 100644
--- a/tensorrt_llm/lora_manager.py
+++ b/tensorrt_llm/lora_manager.py
@@ -243,6 +243,7 @@ class LoraModelConfig:
     trtllm_modules_to_hf_modules: dict[str, str]
     hidden_size: int
     dtype: str
+    coarse_mlp_hidden_size: Optional[int] = None
 
 
 class HfLoraLoader:
@@ -1133,14 +1134,107 @@ def load_from_model_dir(uid, model_dir, hf_config):
                 )
             )
 
-        max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
-        self._cpp_lora_weights[uid] = torch.stack(
-            [
-                torch.nn.functional.pad(w, (0, max_weight_size - w.size(0)))
-                for w in self._cpp_lora_weights[uid]
-            ]
+        # Handle both ModelConfig and LoraModelConfig types
+        print(f"DEBUG: model_config type: {type(model_config)}")
+        print(
+            f"DEBUG: model_config has coarse_mlp_hidden_size: {hasattr(model_config, 'coarse_mlp_hidden_size')}"
+        )
+        print(
+            f"DEBUG: model_config.coarse_mlp_hidden_size value: {getattr(model_config, 'coarse_mlp_hidden_size', None)}"
         )
+        print(
+            f"DEBUG: model_config.coarse_mlp_hidden_size is None: {getattr(model_config, 'coarse_mlp_hidden_size', None) is None}"
+        )
+
+        # Compute the padded size from the coarse (TP-split) MLP hidden size when it is available.
+        if (
+            hasattr(model_config, "coarse_mlp_hidden_size")
+            and model_config.coarse_mlp_hidden_size is not None
+        ):
+            print(
+                f"DEBUG: INSIDE load_from_hf: model_config.coarse_mlp_hidden_size: "
+                f"{model_config.coarse_mlp_hidden_size}"
+            )
+            M_coarse = model_config.coarse_mlp_hidden_size
+            H = model_config.hidden_size
+            rank = int(hf_config["r"])
+
+            print(f"DEBUG: load_from_hf - M_coarse: {M_coarse}")
+            print(f"DEBUG: load_from_hf - tp_size: {tp_size}")
+            print(f"DEBUG: load_from_hf - H (hidden_size): {H}")
+            print(f"DEBUG: load_from_hf - rank: {rank}")
+
+            M_coarse_tp = M_coarse * tp_size
+            max_weight_size = rank * M_coarse_tp + rank * H
+
+            print(f"DEBUG: load_from_hf - M_coarse_tp: {M_coarse_tp}")
+            print(
+                f"DEBUG: load_from_hf - max_weight_size calculation: "
+                f"{rank} * {M_coarse_tp} + {rank} * {H} = {max_weight_size}"
+            )
+
+            # Debug actual weights before padding
+            print(
+                f"DEBUG: load_from_hf - Number of weight tensors: {len(self._cpp_lora_weights[uid])}"
+            )
+            for i, w in enumerate(self._cpp_lora_weights[uid]):
+                print(
+                    f"DEBUG: load_from_hf - Weight {i} shape: {w.shape}, size(0): {w.size(0)}"
+                )
+
+            # Debug the actual maximum weight size
+            actual_max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
+            print(f"DEBUG: load_from_hf - Actual max weight size: {actual_max_weight_size}")
+            print(f"DEBUG: load_from_hf - Calculated max_weight_size: {max_weight_size}")
+            print(
+                f"DEBUG: load_from_hf - Difference: {max_weight_size - actual_max_weight_size}"
+            )
+
+            # Debug module-specific sizes
+            print(
+                f"DEBUG: load_from_hf - Number of modules: {len(self._cpp_lora_weights[uid])}"
+            )
+            print("DEBUG: load_from_hf - Module sizes by index:")
+            for i, w in enumerate(self._cpp_lora_weights[uid]):
+                print(f"DEBUG: load_from_hf - Module {i}: {w.size(0)}")
+
+            # Debug which modules need padding
+            print("DEBUG: load_from_hf - Checking which modules might fail validation:")
+            for i, w in enumerate(self._cpp_lora_weights[uid]):
+                if w.size(0) < max_weight_size:
+                    print(
+                        f"DEBUG: load_from_hf - Module {i} will be padded: {w.size(0)} -> {max_weight_size}"
+                    )
+                else:
+                    print(f"DEBUG: load_from_hf - Module {i} no padding needed: {w.size(0)}")
+
+        else:
+            # Fallback: use the maximum size of the actual weights
+            max_weight_size = max(w.size(0) for w in self._cpp_lora_weights[uid])
+            print(f"DEBUG: load_from_hf - Using fallback max_weight_size: {max_weight_size}")
+
+        print(f"DEBUG: load_from_hf - Final max_weight_size: {max_weight_size}")
+
+        # Debug padding process
+        padded_weights = []
+        for i, w in enumerate(self._cpp_lora_weights[uid]):
+            padding_needed = max_weight_size - w.size(0)
+            print(
+                f"DEBUG: load_from_hf - Weight {i}: original size {w.size(0)}, padding {padding_needed}"
+            )
+            padded_w = torch.nn.functional.pad(w, (0, padding_needed))
+            print(f"DEBUG: load_from_hf - Weight {i}: padded size {padded_w.size(0)}")
+            padded_weights.append(padded_w)
+
+        self._cpp_lora_weights[uid] = torch.stack(padded_weights)
+        print(
+            f"DEBUG: load_from_hf - Final stacked weights shape: {self._cpp_lora_weights[uid].shape}"
+        )
         self._cpp_lora_config[uid] = torch.stack([c for c in self._cpp_lora_config[uid]])
+        print(
+            f"DEBUG: load_from_hf - Final stacked config shape: {self._cpp_lora_config[uid].shape}"
+        )
 
     for uid, model_dir, hf_config in zip(new_uids, new_model_dirs, lora_hf_configs):
         load_from_model_dir(uid, model_dir, hf_config)
diff --git a/tests/integration/defs/examples/test_nemotron_nas.py b/tests/integration/defs/examples/test_nemotron_nas.py
index d1663eab672..a9470a0d375 100644
--- a/tests/integration/defs/examples/test_nemotron_nas.py
+++ b/tests/integration/defs/examples/test_nemotron_nas.py
@@ -1,10 +1,16 @@
 from pathlib import Path
 
+import defs.ci_profiler
 import pytest
 from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
 from defs.conftest import get_device_memory, get_sm_version
 from defs.trt_test_alternative import check_call
 
+from tensorrt_llm import LLM
+from tensorrt_llm.executor.request import LoRARequest
+from tensorrt_llm.lora_manager import LoraConfig
+from tensorrt_llm.sampling_params import SamplingParams
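
To make the padding arithmetic in the lora_manager.py hunk above easier to follow, here is a small, self-contained sketch. The sizes below (rank, hidden size, TP-split MLP size) are hypothetical placeholders, but the formula mirrors the code: the flattened size of one LoRA module is rank * in_dim for the A matrix plus rank * out_dim for the B matrix, and every module's flattened weights are right-padded with zeros to the largest such size before being stacked.

```python
import torch
import torch.nn.functional as F

# Hypothetical sizes for illustration only.
rank = 32                    # LoRA rank ("r" in adapter_config.json)
hidden_size = 8192           # model hidden size H
mlp_hidden_per_rank = 7168   # TP-split MLP hidden size (the "coarse" value)
tp_size = 4

# Largest module: an MLP projection whose out dim is the full (un-split) MLP hidden size.
mlp_hidden_full = mlp_hidden_per_rank * tp_size
max_weight_size = rank * mlp_hidden_full + rank * hidden_size

# A smaller module, e.g. an attention projection with in_dim == out_dim == hidden_size.
attn_flat = torch.zeros(rank * hidden_size + rank * hidden_size)

# Right-pad with zeros so all rows of the stacked per-request tensor share one length,
# which is what the C++ flattenedInOutSize check compares against lora_weights.shape[2].
padded = F.pad(attn_flat, (0, max_weight_size - attn_flat.numel()))
stacked = torch.stack([padded])  # shape: [num_modules, max_weight_size]
print(max_weight_size, stacked.shape)
```

Padding to the size of the largest module wastes some host memory for the smaller modules, which is the trade-off the coarse (maximum-based) estimate accepts.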
+
 # skip trt flow cases on post-Blackwell-Ultra
 if get_sm_version() >= 103:
     pytest.skip(
@@ -122,3 +128,71 @@ def test_nemotron_nas_summary_2gpu(nemotron_nas_example_root, llm_venv,
     ]
 
     venv_mpi_check_call(llm_venv, mpi_cmd, summary_cmd)
+
+
+@pytest.mark.skip_less_device(4)
+@pytest.mark.skip_less_device_memory(80000)
+@pytest.mark.parametrize("nemotron_nas_model_root", [
+    "Llama-3_3-Nemotron-Super-49B-v1",
+],
+                         indirect=True)
+def test_nemotron_super_49b_real_lora_torch(nemotron_nas_example_root, llm_venv,
+                                            nemotron_nas_model_root,
+                                            llm_datasets_root, llm_rouge_root,
+                                            engine_dir, cmodel_dir):
+    """Run Nemotron Super 49B with real LoRA adapters using the LLM-API Torch backend."""
+
+    print("Testing Nemotron Super 49B with real LoRA adapters...")
+
+    lora_adapter_path = "/code/tensorrt_llm/llama-3.3-nemotron-super-49b-v1/llama-3.3-nemotron-super-49b-v1_vlora-1a2cb80-v2"
+    print(f"Using real LoRA from: {lora_adapter_path}")
+
+    defs.ci_profiler.start("test_nemotron_real_lora_torch")
+
+    lora_config = LoraConfig(
+        lora_dir=[lora_adapter_path],
+        max_lora_rank=32,  # From adapter_config.json: "r": 32
+        max_loras=1,
+        max_cpu_loras=1,
+    )
+
+    with LLM(model=nemotron_nas_model_root,
+             lora_config=lora_config,
+             tensor_parallel_size=4,
+             dtype="bfloat16",
+             max_batch_size=2,
+             max_input_len=512,
+             max_seq_len=1024,
+             max_beam_width=1) as llm:
+
+        prompts = [
+            "What is the capital of France?",
+            "Explain quantum computing in simple terms."
+        ]
+
+        sampling_params = SamplingParams(max_tokens=50,
+                                         temperature=0.7,
+                                         top_p=0.9)
+
+        lora_request = [LoRARequest("nemotron-lora", 0, lora_adapter_path)]
+
+        print("Running inference with real LoRA adapter...")
+        outputs = llm.generate(prompts,
+                               sampling_params,
+                               lora_request=lora_request)
+
+        for i, output in enumerate(outputs):
+            print(f"Prompt {i+1}: {prompts[i]}")
+            print(f"Response {i+1}: {output.outputs[0].text}")
+            print("-" * 50)
+
+        assert len(outputs) == 2
+        assert len(outputs[0].outputs) > 0
+        assert len(outputs[1].outputs) > 0
+        assert len(outputs[0].outputs[0].text) > 0
+        assert len(outputs[1].outputs[0].text) > 0
+
+    defs.ci_profiler.stop("test_nemotron_real_lora_torch")
+    print(
+        f"test_nemotron_real_lora_torch: {defs.ci_profiler.elapsed_time_in_sec('test_nemotron_real_lora_torch')} sec"
+    )
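
A final note on the worker.py change above: `tensorrt_llm.mapping.Mapping` enforces `world_size == tp_size * pp_size * cp_size`, which is why the code derives `world_size` from the executor config's mapping rather than relying on defaults. A minimal sketch of that invariant, using a hypothetical 4-GPU, tensor-parallel-only layout:

```python
from tensorrt_llm.mapping import Mapping

# Hypothetical 4-GPU tensor-parallel-only layout; rank 0 shown.
tp_size, pp_size, cp_size = 4, 1, 1
mapping = Mapping(world_size=tp_size * pp_size * cp_size,
                  rank=0,
                  gpus_per_node=4,
                  tp_size=tp_size,
                  pp_size=pp_size,
                  cp_size=cp_size)

# The constraint the worker.py comment refers to.
assert mapping.world_size == mapping.tp_size * mapping.pp_size * mapping.cp_size
```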