@@ -1,16 +1,10 @@
 from pathlib import Path
 
 import pytest
-from defs import ci_profiler
 from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
-from defs.conftest import get_device_memory, get_sm_version, llm_models_root
+from defs.conftest import get_device_memory, get_sm_version
 from defs.trt_test_alternative import check_call
 
-from tensorrt_llm import LLM
-from tensorrt_llm.executor.request import LoRARequest
-from tensorrt_llm.lora_manager import LoraConfig
-from tensorrt_llm.sampling_params import SamplingParams
-
 # skip trt flow cases on post-Blackwell-Ultra
 if get_sm_version() >= 103:
     pytest.skip(
@@ -128,81 +122,3 @@ def test_nemotron_nas_summary_2gpu(nemotron_nas_example_root, llm_venv, |
     ]
 
     venv_mpi_check_call(llm_venv, mpi_cmd, summary_cmd)
-
-
-@pytest.mark.skip_less_device(4)
-@pytest.mark.skip_less_device_memory(80000)
-@pytest.mark.parametrize(
-    "nemotron_nas_model_root",
-    [
-        # "Llama-3_3-Nemotron-Super-49B-v1",
-        f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1"
-    ],
-    indirect=True)
-def test_nemotron_super_49b_real_lora_torch(nemotron_nas_example_root, llm_venv,
-                                            nemotron_nas_model_root,
-                                            llm_datasets_root, llm_rouge_root,
-                                            engine_dir, cmodel_dir):
-    """Run Nemotron Super 49B with real LoRA adapters using LLM-API Torch backend."""
-
-    print("Testing Nemotron Super 49B with real LoRA adapters...")
-
-    lora_adapter_path = f"/home/gvenkatarama/scratch_new/Bugs/TRTLLM/5463720/llama-3.3-nemotron-super-49b-v1_vlora-1a2cb80-v2"
-    print(f"Using real LoRA from: {lora_adapter_path}")
-
-    ci_profiler.start("test_nemotron_real_lora_torch")
-
-    lora_config = LoraConfig(
-        lora_dir=[lora_adapter_path],
-        max_lora_rank=32,  # From adapter_config.json: "r": 32
-        max_loras=1,
-        max_cpu_loras=1,
-    )
-
-    with LLM(
-            model=nemotron_nas_model_root,
-            lora_config=lora_config,
-            tensor_parallel_size=4,
-            dtype="bfloat16",
-            max_batch_size=2,
-            max_input_len=512,
-            max_seq_len=1024,
-            # load_format="dummy",
-            max_beam_width=1) as llm:
-
-        prompts = [
-            "What is the capital of France?",
-            "Explain quantum computing in simple terms."
-        ]
-
-        sampling_params = SamplingParams(max_tokens=50,
-                                         temperature=0.7,
-                                         top_p=0.9)
-
-        lora_request = LoRARequest("nemotron-lora", 0, lora_adapter_path)
-
-        print("Running inference with real LoRA adapter...")
-        outputs_with_lora = llm.generate(
-            prompts, sampling_params, lora_request=[lora_request, lora_request])
-
-        outputs_without_lora = llm.generate(prompts, sampling_params)
-
-        for i, (output_lora, output_no_lora) in enumerate(
-                zip(outputs_with_lora, outputs_without_lora)):
-            print(f"Prompt {i+1}: {prompts[i]}")
-            print(f"Response with LoRA {i+1}: {output_lora.outputs[0].text}")
-            print(
-                f"Response without LoRA {i+1}: {output_no_lora.outputs[0].text}"
-            )
-            print("-" * 50)
-
-        assert len(outputs_with_lora) == 2
-        assert len(outputs_with_lora[0].outputs) > 0
-        assert len(outputs_with_lora[1].outputs) > 0
-        assert len(outputs_with_lora[0].outputs[0].text) > 0
-        assert len(outputs_with_lora[1].outputs[0].text) > 0
-
-    ci_profiler.stop("test_nemotron_real_lora_torch")
-    print(
-        f"test_nemotron_real_lora_torch: {ci_profiler.elapsed_time_in_sec('test_nemotron_real_lora_torch')} sec"
-    )