| 
1 | 1 | from pathlib import Path  | 
2 | 2 | 
 
  | 
 | 3 | +import defs.ci_profiler  | 
3 | 4 | import pytest  | 
4 | 5 | from defs.common import convert_weights, venv_check_call, venv_mpi_check_call  | 
5 | 6 | from defs.conftest import get_device_memory, get_sm_version  | 
6 | 7 | from defs.trt_test_alternative import check_call  | 
7 | 8 | 
 
  | 
 | 9 | +from tensorrt_llm import LLM  | 
 | 10 | +from tensorrt_llm.executor.request import LoRARequest  | 
 | 11 | +from tensorrt_llm.lora_manager import LoraConfig  | 
 | 12 | +from tensorrt_llm.sampling_params import SamplingParams  | 
 | 13 | + | 
8 | 14 | # skip trt flow cases on post-Blackwell-Ultra  | 
9 | 15 | if get_sm_version() >= 103:  | 
10 | 16 |     pytest.skip(  | 
@@ -122,3 +128,71 @@ def test_nemotron_nas_summary_2gpu(nemotron_nas_example_root, llm_venv,  | 
122 | 128 |     ]  | 
123 | 129 | 
 
  | 
124 | 130 |     venv_mpi_check_call(llm_venv, mpi_cmd, summary_cmd)  | 
 | 131 | + | 
 | 132 | + | 
 | 133 | +@pytest.mark.skip_less_device(4)  | 
 | 134 | +@pytest.mark.skip_less_device_memory(80000)  | 
 | 135 | +@pytest.mark.parametrize("nemotron_nas_model_root", [  | 
 | 136 | +    "Llama-3_3-Nemotron-Super-49B-v1",  | 
 | 137 | +],  | 
 | 138 | +                         indirect=True)  | 
 | 139 | +def test_nemotron_super_49b_real_lora_torch(nemotron_nas_example_root, llm_venv,  | 
 | 140 | +                                            nemotron_nas_model_root,  | 
 | 141 | +                                            llm_datasets_root, llm_rouge_root,  | 
 | 142 | +                                            engine_dir, cmodel_dir):  | 
 | 143 | +    """Run Nemotron Super 49B with real LoRA adapters using LLM-API Torch backend."""  | 
 | 144 | + | 
 | 145 | +    print("Testing Nemotron Super 49B with real LoRA adapters...")  | 
 | 146 | + | 
 | 147 | +    lora_adapter_path = f"/code/tensorrt_llm/llama-3.3-nemotron-super-49b-v1/llama-3.3-nemotron-super-49b-v1_vlora-1a2cb80-v2"  | 
 | 148 | +    print(f"Using real LoRA from: {lora_adapter_path}")  | 
 | 149 | + | 
 | 150 | +    defs.ci_profiler.start("test_nemotron_real_lora_torch")  | 
 | 151 | + | 
 | 152 | +    lora_config = LoraConfig(  | 
 | 153 | +        lora_dir=[lora_adapter_path],  | 
 | 154 | +        max_lora_rank=32,  # From adapter_config.json: "r": 32  | 
 | 155 | +        max_loras=1,  | 
 | 156 | +        max_cpu_loras=1,  | 
 | 157 | +    )  | 
 | 158 | + | 
 | 159 | +    with LLM(model=nemotron_nas_model_root,  | 
 | 160 | +             lora_config=lora_config,  | 
 | 161 | +             tensor_parallel_size=4,  | 
 | 162 | +             dtype="bfloat16",  | 
 | 163 | +             max_batch_size=2,  | 
 | 164 | +             max_input_len=512,  | 
 | 165 | +             max_seq_len=1024,  | 
 | 166 | +             max_beam_width=1) as llm:  | 
 | 167 | + | 
 | 168 | +        prompts = [  | 
 | 169 | +            "What is the capital of France?",  | 
 | 170 | +            "Explain quantum computing in simple terms."  | 
 | 171 | +        ]  | 
 | 172 | + | 
 | 173 | +        sampling_params = SamplingParams(max_tokens=50,  | 
 | 174 | +                                         temperature=0.7,  | 
 | 175 | +                                         top_p=0.9)  | 
 | 176 | + | 
 | 177 | +        lora_request = [LoRARequest("nemotron-lora", 0, lora_adapter_path)]  | 
 | 178 | + | 
 | 179 | +        print("Running inference with real LoRA adapter...")  | 
 | 180 | +        outputs = llm.generate(prompts,  | 
 | 181 | +                               sampling_params,  | 
 | 182 | +                               lora_request=lora_request)  | 
 | 183 | + | 
 | 184 | +        for i, output in enumerate(outputs):  | 
 | 185 | +            print(f"Prompt {i+1}: {prompts[i]}")  | 
 | 186 | +            print(f"Response {i+1}: {output.outputs[0].text}")  | 
 | 187 | +            print("-" * 50)  | 
 | 188 | + | 
 | 189 | +        assert len(outputs) == 2  | 
 | 190 | +        assert len(outputs[0].outputs) > 0  | 
 | 191 | +        assert len(outputs[1].outputs) > 0  | 
 | 192 | +        assert len(outputs[0].outputs[0].text) > 0  | 
 | 193 | +        assert len(outputs[1].outputs[0].text) > 0  | 
 | 194 | + | 
 | 195 | +    defs.ci_profiler.stop("test_nemotron_real_lora_torch")  | 
 | 196 | +    print(  | 
 | 197 | +        f"test_nemotron_real_lora_torch: {defs.ci_profiler.elapsed_time_in_sec('test_nemotron_real_lora_torch')} sec"  | 
 | 198 | +    )  | 
0 commit comments