From b029c0eb0ff565e6520984d780991b4a2e24d20e Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:50 +0000 Subject: [PATCH 01/32] Fixed BindCapacityScheduler to pass peft_cache_manager to the CPP binding, fixed PeftCacheManager.free_resources to call mark_request_done, fixed TRT lora config argument passing, added test_llama_7b_multi_lora_eviction test to test_llm.py and test_llm_pytorch.py Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/_util.py | 2 + .../_torch/pyexecutor/resource_manager.py | 2 +- tensorrt_llm/_torch/pyexecutor/scheduler.py | 5 +- tensorrt_llm/llmapi/llm.py | 4 +- tests/unittest/llmapi/test_llm.py | 81 ++++++++++++++++++- tests/unittest/llmapi/test_llm_pytorch.py | 67 ++++++++++++++- tests/unittest/utils/util.py | 24 ++++++ 7 files changed, 179 insertions(+), 6 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py index 88e046eb056..6bd24880cfa 100644 --- a/tensorrt_llm/_torch/pyexecutor/_util.py +++ b/tensorrt_llm/_torch/pyexecutor/_util.py @@ -438,6 +438,7 @@ def create_py_executor_instance( f"Cannot overwrite existing resource manager {key}.") resources[key] = value + peft_cache_manager = None if lora_config is not None: from tensorrt_llm.bindings import LoraModule @@ -513,6 +514,7 @@ def create_py_executor_instance( capacity_scheduler = BindCapacityScheduler( max_num_sequences, kv_cache_manager.impl if kv_cache_manager is not None else None, + peft_cache_manager.impl if peft_cache_manager is not None else None, executor_config.scheduler_config.capacity_scheduler_policy, two_step_lookahead=mapping.has_pp()) mb_scheduler = BindMicroBatchScheduler(executor_config.max_batch_size, diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index c5a9f264b01..4d04763776c 100644 --- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -1224,7 +1224,7 @@ def update_resources(self, scheduled_batch: ScheduledRequests): pass def free_resources(self, request: LlmRequest): - pass + self.impl.mark_request_done(request) def shutdown(self): pass diff --git a/tensorrt_llm/_torch/pyexecutor/scheduler.py b/tensorrt_llm/_torch/pyexecutor/scheduler.py index 26df44874a0..833b96019bc 100644 --- a/tensorrt_llm/_torch/pyexecutor/scheduler.py +++ b/tensorrt_llm/_torch/pyexecutor/scheduler.py @@ -73,12 +73,14 @@ def __init__( self, max_num_requests: int, kv_cache_manager, + peft_cache_manager, scheduler_policy: tb_executor.CapacitySchedulerPolicy = tb_executor. 
CapacitySchedulerPolicy.GUARANTEED_NO_EVICT, two_step_lookahead: bool = False, ): super(BindCapacityScheduler, self).__init__() self.kv_cache_manager = kv_cache_manager + self.peft_cache_manager = peft_cache_manager self.impl = tb_internal.algorithms.CapacityScheduler( max_num_requests=max_num_requests, @@ -91,7 +93,8 @@ def __init__( def schedule_request( self, active_requests: RequestList ) -> tuple[list[LlmRequest], list[LlmRequest], list[LlmRequest]]: - return self.impl(active_requests, self.kv_cache_manager) + return self.impl(active_requests, self.kv_cache_manager, + self.peft_cache_manager) class GuaranteedNoEvictScheduler(CapacityScheduler): diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 1afe97d3ce4..95426677c8c 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -813,9 +813,9 @@ def _build_model(self): len(lora_config.lora_target_modules + lora_config.missing_qkv_modules) self._executor_config.peft_cache_config = tllm.PeftCacheConfig( num_device_module_layer=max_lora_rank * num_lora_modules * - self.args.max_loras, + lora_config.max_loras, num_host_module_layer=max_lora_rank * num_lora_modules * - self.args.max_cpu_loras, + lora_config.max_cpu_loras, ) if self.args.decoding_config is not None: self._executor_config.decoding_config = self.args.decoding_config diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index ef644849f25..9f31c26db98 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -18,12 +18,13 @@ import shutil import sys import tempfile -from typing import List, Optional, Union +from typing import List, Optional, OrderedDict, Union import datasets import pytest import torch import transformers +from utils.util import duplicate_list_to_length, flatten_list, skip_single_gpu from tensorrt_llm import LLM as LLM_torch from tensorrt_llm._tensorrt_engine import LLM @@ -1406,6 +1407,10 @@ def llama_7b_multi_lora_from_request_test_harness(**llm_kwargs): ] lora_req1 = LoRARequest("luotuo", 1, hf_lora_dir1) lora_req2 = LoRARequest("Japanese", 2, hf_lora_dir2) + lora_req3 = LoRARequest("luotuo", 3, hf_lora_dir1) + lora_req4 = LoRARequest("Japanese", 4, hf_lora_dir2) + lora_req5 = LoRARequest("luotuo", 5, hf_lora_dir1) + lora_req6 = LoRARequest("Japanese", 6, hf_lora_dir2) sampling_params = SamplingParams(max_tokens=20) outputs = llm.generate( prompts, @@ -1414,6 +1419,80 @@ def llama_7b_multi_lora_from_request_test_harness(**llm_kwargs): for output, ref, key_word in zip(outputs, references, key_words): assert similar(output.outputs[0].text, ref) or key_word in output.outputs[0].txt + outputs = llm.generate( + prompts, + sampling_params, + lora_request=[None, lora_req3, lora_req4, None, lora_req5, lora_req6]) + for output, ref, key_word in zip(outputs, references, key_words): + assert similar(output.outputs[0].text, + ref) or key_word in output.outputs[0].txt + + +@pytest.mark.parametrize( + "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats", [ + ([ + 5, + ], 4, 4, 2), + ]) +@skip_gpu_memory_less_than_40gb +def test_llama_7b_multi_lora_eviction(lora_adapter_count_per_call: list[int], + max_loras: int, max_cpu_loras: int, + repeats: int): + print(f"{lora_adapter_count_per_call=}, {max_loras=}, {max_cpu_loras=}") + total_lora_adapters = sum(lora_adapter_count_per_call) + + hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" + hf_lora_dirs = [ + f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1", + 
f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" + ] + + build_config = BuildConfig(lora_config=LoraConfig( + lora_target_modules=['attn_q', 'attn_k', 'attn_v'], + max_lora_rank=8, + max_loras=max_loras, + max_cpu_loras=max_cpu_loras)) + llm = LLM(hf_model_dir, + enable_lora=True, + build_config=build_config, + fast_build=True) + + # Each prompt should have a reference for every LoRA adapter dir (in the same order as in hf_lora_dirs) + prompt_to_references = OrderedDict({ + "美国的首都在哪里? \n答案:": [ + "美国的首都是华盛顿。\n\n美国的", + "纽约\n\n### カンファレンスの", + ], + "アメリカ合衆国の首都はどこですか? \n答え:": [ + "华盛顿。\n\n英国の首都是什", + "ワシントン\nQ1. アメリカ合衆国", + ], + }) + + prompts_to_generate = duplicate_list_to_length( + flatten_list([[prompt] * len(hf_lora_dirs) + for prompt in prompt_to_references.keys()]), + total_lora_adapters) + references = duplicate_list_to_length( + flatten_list(list(prompt_to_references.values())), total_lora_adapters) + lora_requests = [ + LoRARequest(str(i), i, hf_lora_dirs[i % len(hf_lora_dirs)]) + for i in range(total_lora_adapters) + ] + + # Perform repeats of the same requests to test reuse and reload of adapters previously unloaded from cache + for i in range(repeats): + last_idx = 0 + for adapter_count in lora_adapter_count_per_call: + sampling_params = SamplingParams(max_tokens=20) + outputs = llm.generate( + prompts_to_generate[last_idx:last_idx + adapter_count], + sampling_params, + lora_request=lora_requests[last_idx:last_idx + adapter_count]) + for output, ref in zip( + outputs, references[last_idx:last_idx + adapter_count]): + assert similar(output.outputs[0].text, ref) + last_idx += adapter_count @skip_gpu_memory_less_than_40gb diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index fbf97c88117..945a05a26aa 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -1,3 +1,5 @@ +from collections import OrderedDict + import pytest from tensorrt_llm import LLM @@ -10,7 +12,7 @@ llm_get_stats_async_test_harness, llm_get_stats_test_harness, prompts, run_llm_abort_request, run_llm_with_postprocess_parallel_and_result_handler, tinyllama_logits_processor_test_harness, _test_llm_capture_request_error) -from utils.util import force_ampere, similar, skip_gpu_memory_less_than_40gb, skip_gpu_memory_less_than_80gb, skip_gpu_memory_less_than_138gb +from utils.util import duplicate_list_to_length, flatten_list, force_ampere, similar, skip_gpu_memory_less_than_40gb, skip_gpu_memory_less_than_80gb, skip_gpu_memory_less_than_138gb from utils.llm_data import llm_models_root from tensorrt_llm.lora_manager import LoraConfig from tensorrt_llm.executor.request import LoRARequest @@ -252,6 +254,69 @@ def test_llama_7b_multi_lora(): llama_7b_multi_lora_from_request_test_harness() +@pytest.mark.parametrize( + "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats", [ + ([ + 5, + ], 4, 4, 2), + ]) +@skip_gpu_memory_less_than_40gb +def test_llama_7b_multi_lora_eviction(lora_adapter_count_per_call: list[int], + max_loras: int, max_cpu_loras: int, + repeats: int): + print(f"{lora_adapter_count_per_call=}, {max_loras=}, {max_cpu_loras=}") + total_lora_adapters = sum(lora_adapter_count_per_call) + + hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" + hf_lora_dirs = [ + f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1", + f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" + ] + + lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'], + max_lora_rank=8, 
+ max_loras=max_loras, + max_cpu_loras=max_cpu_loras) + llm = LLM(hf_model_dir, lora_config=lora_config) + + # Each prompt should have a reference for every LoRA adapter dir (in the same order as in hf_lora_dirs) + prompt_to_references = OrderedDict({ + "美国的首都在哪里? \n答案:": [ + "美国的首都是华盛顿。\n\n美国的", + "纽约\n\n### カンファレンスの", + ], + "アメリカ合衆国の首都はどこですか? \n答え:": [ + "华盛顿。\n\n英国の首都是什", + "ワシントン\nQ1. アメリカ合衆国", + ], + }) + + prompts_to_generate = duplicate_list_to_length( + flatten_list([[prompt] * len(hf_lora_dirs) + for prompt in prompt_to_references.keys()]), + total_lora_adapters) + references = duplicate_list_to_length( + flatten_list(list(prompt_to_references.values())), total_lora_adapters) + lora_requests = [ + LoRARequest(str(i), i, hf_lora_dirs[i % len(hf_lora_dirs)]) + for i in range(total_lora_adapters) + ] + + # Perform repeats of the same requests to test reuse and reload of adapters previously unloaded from cache + for i in range(repeats): + last_idx = 0 + for adapter_count in lora_adapter_count_per_call: + sampling_params = SamplingParams(max_tokens=20) + outputs = llm.generate( + prompts_to_generate[last_idx:last_idx + adapter_count], + sampling_params, + lora_request=lora_requests[last_idx:last_idx + adapter_count]) + for output, ref in zip( + outputs, references[last_idx:last_idx + adapter_count]): + assert similar(output.outputs[0].text, ref) + last_idx += adapter_count + + # TODO smor: currently Nemotron-Super-49B-v1 with LoRA memory consumption is overly high # https://jirasw.nvidia.com/browse/TRTLLM-5045 @skip_gpu_memory_less_than_138gb diff --git a/tests/unittest/utils/util.py b/tests/unittest/utils/util.py index 72f205dc517..cbb483b6087 100644 --- a/tests/unittest/utils/util.py +++ b/tests/unittest/utils/util.py @@ -3,6 +3,7 @@ from contextlib import contextmanager from difflib import SequenceMatcher from pathlib import Path +from typing import Any, Generator import pynvml import pytest @@ -397,3 +398,26 @@ def woq_groupwise_gt_matmul(mat1, ref_torch_weights, bias=None): if bias is not None: ref += bias return ref + + +def flatten_list_generator( + nested_list: list[Any]) -> Generator[Any, None, None]: + if not isinstance(nested_list, list): + yield nested_list + else: + for item in nested_list: + yield from flatten_list_generator(item) + + +def flatten_list(nested_list: list[Any]) -> list[Any]: + return list(flatten_list_generator(nested_list)) + + +def duplicate_list_to_length(list: list[Any], target_length: int) -> list[Any]: + if target_length < len(list): + return list[:target_length] + duplicated_list = list * (target_length // len(list)) + remain = target_length % len(list) + if remain != 0: + duplicated_list += list[:remain] + return duplicated_list From c79b271ca4199f8730d5b290af1a47d148acba10 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:51 +0000 Subject: [PATCH 02/32] Removed unnecessary changes in test_llm.py Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/test_llm.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 9f31c26db98..20c0c674472 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -1407,10 +1407,6 @@ def llama_7b_multi_lora_from_request_test_harness(**llm_kwargs): ] lora_req1 = LoRARequest("luotuo", 1, hf_lora_dir1) lora_req2 = LoRARequest("Japanese", 2, hf_lora_dir2) - lora_req3 = LoRARequest("luotuo", 3, 
hf_lora_dir1) - lora_req4 = LoRARequest("Japanese", 4, hf_lora_dir2) - lora_req5 = LoRARequest("luotuo", 5, hf_lora_dir1) - lora_req6 = LoRARequest("Japanese", 6, hf_lora_dir2) sampling_params = SamplingParams(max_tokens=20) outputs = llm.generate( prompts, @@ -1419,13 +1415,6 @@ def llama_7b_multi_lora_from_request_test_harness(**llm_kwargs): for output, ref, key_word in zip(outputs, references, key_words): assert similar(output.outputs[0].text, ref) or key_word in output.outputs[0].txt - outputs = llm.generate( - prompts, - sampling_params, - lora_request=[None, lora_req3, lora_req4, None, lora_req5, lora_req6]) - for output, ref, key_word in zip(outputs, references, key_words): - assert similar(output.outputs[0].text, - ref) or key_word in output.outputs[0].txt @pytest.mark.parametrize( From d193d302ba3c1a4af9d648aee6e9a9fc586f8a10 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:51 +0000 Subject: [PATCH 03/32] Refactored LoRA eviction tests Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/test_llm.py | 62 +++++++++++++--- tests/unittest/llmapi/test_llm_pytorch.py | 67 +++++++++++++---- tests/unittest/utils/util.py | 89 ++++++++++++++++++++++- 3 files changed, 192 insertions(+), 26 deletions(-) diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 20c0c674472..9a747930d58 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -24,7 +24,9 @@ import pytest import torch import transformers -from utils.util import duplicate_list_to_length, flatten_list, skip_single_gpu +from utils.util import (EnvVarsContextManager, duplicate_list_to_length, + flatten_list, run_function_in_sub_process, + skip_single_gpu) from tensorrt_llm import LLM as LLM_torch from tensorrt_llm._tensorrt_engine import LLM @@ -1417,17 +1419,8 @@ def llama_7b_multi_lora_from_request_test_harness(**llm_kwargs): ref) or key_word in output.outputs[0].txt -@pytest.mark.parametrize( - "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats", [ - ([ - 5, - ], 4, 4, 2), - ]) -@skip_gpu_memory_less_than_40gb -def test_llama_7b_multi_lora_eviction(lora_adapter_count_per_call: list[int], - max_loras: int, max_cpu_loras: int, - repeats: int): - print(f"{lora_adapter_count_per_call=}, {max_loras=}, {max_cpu_loras=}") +def llama_7b_multi_unique_lora_adapters_from_request(lora_adapter_count_per_call: list[int], max_loras: int, + max_cpu_loras: int, repeats: int, **llm_kwargs): total_lora_adapters = sum(lora_adapter_count_per_call) hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" @@ -1436,6 +1429,9 @@ def test_llama_7b_multi_lora_eviction(lora_adapter_count_per_call: list[int], f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" ] + # For LoRA checkpoints without finetuned embedding and lm_head, we can either: + # (1) specify lora_target_modules, or + # (2) provide a lora_dir to infer the lora_target_modules. 
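As a minimal sketch of option (2) mentioned in the comment above (illustrative only, and assuming the adapter directories in hf_lora_dirs hold standard HF LoRA checkpoints), the target modules could instead be inferred from a lora_dir:

    # Hypothetical alternative to the explicit lora_target_modules list used below:
    # point LoraConfig at one of the adapter checkpoints and let the modules be inferred.
    lora_config_inferred = LoraConfig(
        lora_dir=[hf_lora_dirs[0]],   # e.g. the luotuo-lora-7b-0.1 checkpoint
        max_lora_rank=8,
        max_loras=max_loras,
        max_cpu_loras=max_cpu_loras)

The tests in this series keep the explicit module list of option (1), as shown next.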
build_config = BuildConfig(lora_config=LoraConfig( lora_target_modules=['attn_q', 'attn_k', 'attn_v'], max_lora_rank=8, @@ -1484,6 +1480,48 @@ def test_llama_7b_multi_lora_eviction(lora_adapter_count_per_call: list[int], last_idx += adapter_count +@pytest.mark.parametrize( + "lora_adapter_count_per_call, max_loras, max_cpu_loras", [ + ([5], 2, 2), + ([2, 2, 2], 1, 3), + ]) +@skip_gpu_memory_less_than_40gb +def test_llama_7b_multi_lora_evict_load_new_adapters( + lora_adapter_count_per_call: list[int], max_loras: int, + max_cpu_loras: int): + llama_7b_multi_unique_lora_adapters_from_request(lora_adapter_count_per_call, + max_loras, + max_cpu_loras, + repeats=1) + + +@pytest.mark.parametrize( + "lora_adapter_count_per_call, max_loras, max_cpu_loras", [ + ([1, 1], 1, 1), + ]) +@skip_gpu_memory_less_than_40gb +def test_llama_7b_multi_lora_load_previously_cpu_cache_evicted_adapter_fails( + lora_adapter_count_per_call: list[int], max_loras: int, + max_cpu_loras: int): + """Tests that trying to load a LoRA adapter after it was evicted from CPU cache fails with the expected + message, as this feature is currently not supported in favor of the performance improvement of not + sending the LoRA weights with every request after the first time. + """ # noqa: D205 + + def _check_contains_expected_message(stdout: str, stderr: str): + return "not found in cache" in stderr + + repeats = 2 + with EnvVarsContextManager({"TLLM_WORKER_USE_SINGLE_PROCESS": "1"}): + child_stdout, child_stderr = run_function_in_sub_process( + target=llama_7b_multi_lora, + args=(lora_adapter_count_per_call, max_loras, max_cpu_loras, + repeats), + kwargs={}, + stop_waiting_criteria=_check_contains_expected_message) + assert _check_contains_expected_message(child_stdout, child_stderr) + + @skip_gpu_memory_less_than_40gb def test_llama_v2_13b_lora(): llama_v2_13b_lora_from_dir_test_harness() diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 945a05a26aa..07f0b0dd6dd 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -12,7 +12,7 @@ llm_get_stats_async_test_harness, llm_get_stats_test_harness, prompts, run_llm_abort_request, run_llm_with_postprocess_parallel_and_result_handler, tinyllama_logits_processor_test_harness, _test_llm_capture_request_error) -from utils.util import duplicate_list_to_length, flatten_list, force_ampere, similar, skip_gpu_memory_less_than_40gb, skip_gpu_memory_less_than_80gb, skip_gpu_memory_less_than_138gb +from utils.util import EnvVarsContextManager, duplicate_list_to_length, flatten_list, force_ampere, run_function_in_sub_process, similar, skip_gpu_memory_less_than_40gb, skip_gpu_memory_less_than_80gb, skip_gpu_memory_less_than_138gb from utils.llm_data import llm_models_root from tensorrt_llm.lora_manager import LoraConfig from tensorrt_llm.executor.request import LoRARequest @@ -254,17 +254,8 @@ def test_llama_7b_multi_lora(): llama_7b_multi_lora_from_request_test_harness() -@pytest.mark.parametrize( - "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats", [ - ([ - 5, - ], 4, 4, 2), - ]) -@skip_gpu_memory_less_than_40gb -def test_llama_7b_multi_lora_eviction(lora_adapter_count_per_call: list[int], - max_loras: int, max_cpu_loras: int, - repeats: int): - print(f"{lora_adapter_count_per_call=}, {max_loras=}, {max_cpu_loras=}") +def llama_7b_multi_unique_lora_adapters_from_request(lora_adapter_count_per_call: list[int], max_loras: int, + max_cpu_loras: int, repeats: int, **llm_kwargs): 
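    """Exercise LoRA cache eviction by generating with many unique LoRA adapters.

    Each entry in lora_adapter_count_per_call is the number of distinct adapters sent in a
    single llm.generate call; the full sequence of calls is repeated `repeats` times to test
    reuse and reload of adapters previously evicted from the LoRA caches, whose sizes are
    bounded by max_loras (GPU cache) and max_cpu_loras (CPU cache).
    """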
total_lora_adapters = sum(lora_adapter_count_per_call) hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" @@ -273,11 +264,14 @@ def test_llama_7b_multi_lora_eviction(lora_adapter_count_per_call: list[int], f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" ] + # For LoRA checkpoints without finetuned embedding and lm_head, we can either: + # (1) specify lora_target_modules, or + # (2) provide a lora_dir to infer the lora_target_modules. lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'], max_lora_rank=8, max_loras=max_loras, max_cpu_loras=max_cpu_loras) - llm = LLM(hf_model_dir, lora_config=lora_config) + llm = LLM(hf_model_dir, lora_config=lora_config, **llm_kwargs) # Each prompt should have a reference for every LoRA adapter dir (in the same order as in hf_lora_dirs) prompt_to_references = OrderedDict({ @@ -317,6 +311,53 @@ def test_llama_7b_multi_lora_eviction(lora_adapter_count_per_call: list[int], last_idx += adapter_count +@pytest.mark.parametrize( + "lora_adapter_count_per_call, max_loras, max_cpu_loras", [ + ([ + 5, + ], 2, 2), + ([2, 2, 2], 1, 3), + ]) +@skip_gpu_memory_less_than_40gb +def test_llama_7b_multi_lora_evict_load_new_adapters( + lora_adapter_count_per_call: list[int], max_loras: int, + max_cpu_loras: int): + llama_7b_multi_unique_lora_adapters_from_request(lora_adapter_count_per_call, + max_loras, + max_cpu_loras, + repeats=1) + + +@pytest.mark.parametrize( + "lora_adapter_count_per_call, max_loras, max_cpu_loras", + [ + ([1, 1], 1, 1), + #([5,], 2, 2), + #([2, 2], 1, 3), + ]) +@skip_gpu_memory_less_than_40gb +def test_llama_7b_multi_lora_load_previously_cpu_cache_evicted_adapter_fails( + lora_adapter_count_per_call: list[int], max_loras: int, + max_cpu_loras: int): + """Tests that trying to load a LoRA adapter after it was evicted from CPU cache fails with the expected + message, as this feature is currently not supported in favor of the performance improvement of not + sending the LoRA weights with every request after the first time. 
+ """ # noqa: D205 + + def _check_contains_expected_message(stdout: str, stderr: str): + return "not found in cache" in stderr + + repeats = 2 + with EnvVarsContextManager({"TLLM_WORKER_USE_SINGLE_PROCESS": "1"}): + child_stdout, child_stderr = run_function_in_sub_process( + target=llama_7b_multi_unique_lora_adapters_from_request, + args=(lora_adapter_count_per_call, max_loras, max_cpu_loras, + repeats), + kwargs={}, + stop_waiting_criteria=_check_contains_expected_message) + assert _check_contains_expected_message(child_stdout, child_stderr) + + # TODO smor: currently Nemotron-Super-49B-v1 with LoRA memory consumption is overly high # https://jirasw.nvidia.com/browse/TRTLLM-5045 @skip_gpu_memory_less_than_138gb diff --git a/tests/unittest/utils/util.py b/tests/unittest/utils/util.py index cbb483b6087..361bd0beb72 100644 --- a/tests/unittest/utils/util.py +++ b/tests/unittest/utils/util.py @@ -1,9 +1,13 @@ +import multiprocessing import os +import sys +import time import unittest from contextlib import contextmanager from difflib import SequenceMatcher +from multiprocessing.connection import Connection from pathlib import Path -from typing import Any, Generator +from typing import Any, Callable, Generator, Mapping, Tuple import pynvml import pytest @@ -421,3 +425,86 @@ def duplicate_list_to_length(list: list[Any], target_length: int) -> list[Any]: if remain != 0: duplicated_list += list[:remain] return duplicated_list + + +def _target_wrapper(target: Callable, stdout_pipe: Connection, + stderr_pipe: Connection, *args, **kwargs) -> None: + + class PipeWriter: + + def __init__(self, conn: Connection): + self.conn = conn + + def write(self, s: str): + self.conn.send_bytes(s.encode("UTF8")) + + def flush(self): + pass + + sys.stdout = PipeWriter(stdout_pipe) + sys.stderr = PipeWriter(stderr_pipe) + target(*args, **kwargs) + + +def run_function_in_sub_process(target: Callable, + args: tuple, + kwargs: Mapping[str, Any], + stop_waiting_criteria: Callable, + poll_interval_seconds: int = 5, + timeout_seconds: int = 240) -> Tuple[str, str]: + multiprocessing.set_start_method("spawn", force=True) + parent_stdout_pipe, child_stdout_pipe = multiprocessing.Pipe() + parent_stderr_pipe, child_stderr_pipe = multiprocessing.Pipe() + child_process = multiprocessing.Process( + target=_target_wrapper, + args=[target, child_stdout_pipe, child_stderr_pipe] + list(args), + kwargs=kwargs) + child_process.start() + child_stdout_pipe.close() + child_stderr_pipe.close() + + def _read_from_pipe(pipe: Connection): + out = "" + while pipe.poll(timeout=0.1): + try: + out += pipe.recv_bytes().decode("UTF8") + except Exception: + break + return out + + child_stdout = "" + child_stderr = "" + total_waiting_seconds = 0 + while child_process.is_alive() and total_waiting_seconds < timeout_seconds: + child_stdout += _read_from_pipe(parent_stdout_pipe) + child_stderr += _read_from_pipe(parent_stderr_pipe) + if stop_waiting_criteria(child_stdout, child_stderr): + break + time.sleep(poll_interval_seconds) + total_waiting_seconds += poll_interval_seconds + + if child_process.is_alive(): + child_process.terminate() + + assert total_waiting_seconds < timeout_seconds, "Reached timeout while waiting for target" + return child_stdout, child_stderr + + +class EnvVarsContextManager: + + def __init__(self, new_env_vars: dict[str, str]): + self._env_vars = new_env_vars + self._original_value = None + + def __enter__(self): + self._original_vars = { + var_name: os.environ[var_name] + for var_name in self._env_vars.keys() if var_name in 
os.environ + } + os.environ.update(self._env_vars) + + def __exit__(self, type, value, traceback): + os.environ.update(self._original_vars) + for var_name in self._env_vars.keys(): + if var_name not in self._original_vars: + os.environ.pop(var_name) From 28d86b14f590882490976a8109637c826648bcfd Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:52 +0000 Subject: [PATCH 04/32] Added type hint to peft_cache_manager Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/scheduler.py b/tensorrt_llm/_torch/pyexecutor/scheduler.py index 833b96019bc..e428f3727ae 100644 --- a/tensorrt_llm/_torch/pyexecutor/scheduler.py +++ b/tensorrt_llm/_torch/pyexecutor/scheduler.py @@ -73,7 +73,7 @@ def __init__( self, max_num_requests: int, kv_cache_manager, - peft_cache_manager, + peft_cache_manager: tb_internal.batch_manager.PeftCacheManager, scheduler_policy: tb_executor.CapacitySchedulerPolicy = tb_executor. CapacitySchedulerPolicy.GUARANTEED_NO_EVICT, two_step_lookahead: bool = False, From b87d58a97eb987dd7de7fbdfb86310fac492f397 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:52 +0000 Subject: [PATCH 05/32] Add forgotten llm_args in test_llm.py, fix formatting in test_llm_pytorch.py and in test_llm.py Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/test_llm.py | 14 +++++++------- tests/unittest/llmapi/test_llm_pytorch.py | 11 +++++------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 9a747930d58..625a584d600 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -1419,8 +1419,9 @@ def llama_7b_multi_lora_from_request_test_harness(**llm_kwargs): ref) or key_word in output.outputs[0].txt -def llama_7b_multi_unique_lora_adapters_from_request(lora_adapter_count_per_call: list[int], max_loras: int, - max_cpu_loras: int, repeats: int, **llm_kwargs): +def llama_7b_multi_unique_lora_adapters_from_request( + lora_adapter_count_per_call: list[int], max_loras: int, + max_cpu_loras: int, repeats: int, **llm_kwargs): total_lora_adapters = sum(lora_adapter_count_per_call) hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" @@ -1440,7 +1441,8 @@ def llama_7b_multi_unique_lora_adapters_from_request(lora_adapter_count_per_call llm = LLM(hf_model_dir, enable_lora=True, build_config=build_config, - fast_build=True) + fast_build=True, + **llm_kwargs) # Each prompt should have a reference for every LoRA adapter dir (in the same order as in hf_lora_dirs) prompt_to_references = OrderedDict({ @@ -1489,10 +1491,8 @@ def llama_7b_multi_unique_lora_adapters_from_request(lora_adapter_count_per_call def test_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call: list[int], max_loras: int, max_cpu_loras: int): - llama_7b_multi_unique_lora_adapters_from_request(lora_adapter_count_per_call, - max_loras, - max_cpu_loras, - repeats=1) + llama_7b_multi_unique_lora_adapters_from_request( + lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats=1) @pytest.mark.parametrize( diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 07f0b0dd6dd..70166f9cf60 100644 --- 
a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -254,8 +254,9 @@ def test_llama_7b_multi_lora(): llama_7b_multi_lora_from_request_test_harness() -def llama_7b_multi_unique_lora_adapters_from_request(lora_adapter_count_per_call: list[int], max_loras: int, - max_cpu_loras: int, repeats: int, **llm_kwargs): +def llama_7b_multi_unique_lora_adapters_from_request( + lora_adapter_count_per_call: list[int], max_loras: int, + max_cpu_loras: int, repeats: int, **llm_kwargs): total_lora_adapters = sum(lora_adapter_count_per_call) hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" @@ -322,10 +323,8 @@ def llama_7b_multi_unique_lora_adapters_from_request(lora_adapter_count_per_call def test_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call: list[int], max_loras: int, max_cpu_loras: int): - llama_7b_multi_unique_lora_adapters_from_request(lora_adapter_count_per_call, - max_loras, - max_cpu_loras, - repeats=1) + llama_7b_multi_unique_lora_adapters_from_request( + lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats=1) @pytest.mark.parametrize( From a7d6ea588e80af905b43579aaf45c796b8aec2a8 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:53 +0000 Subject: [PATCH 06/32] Pass peft_cache_manager=None to BindCapacityScheduler in create_autodeploy_executor function in ad_executor.py Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py | 4 +++- tensorrt_llm/_torch/pyexecutor/scheduler.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py index c1a0fb151d4..fc9f071a9f4 100644 --- a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py +++ b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py @@ -286,7 +286,9 @@ def create_autodeploy_executor(executor_config: ExecutorConfig, checkpoint_dir: resource_manager.resource_managers.move_to_end(ResourceManagerType.KV_CACHE_MANAGER, last=True) # scheduling - capacitor_scheduler = BindCapacityScheduler(ad_config.max_batch_size, kv_cache_manager.impl) + capacitor_scheduler = BindCapacityScheduler( + ad_config.max_batch_size, kv_cache_manager.impl, peft_cache_manager=None + ) mb_scheduler = BindMicroBatchScheduler( ad_config.max_batch_size, engine.cache_seq_interface.info.max_num_tokens ) diff --git a/tensorrt_llm/_torch/pyexecutor/scheduler.py b/tensorrt_llm/_torch/pyexecutor/scheduler.py index e428f3727ae..d7a9249dd36 100644 --- a/tensorrt_llm/_torch/pyexecutor/scheduler.py +++ b/tensorrt_llm/_torch/pyexecutor/scheduler.py @@ -73,7 +73,7 @@ def __init__( self, max_num_requests: int, kv_cache_manager, - peft_cache_manager: tb_internal.batch_manager.PeftCacheManager, + peft_cache_manager: tb_internal.batch_manager.PeftCacheManager | None, scheduler_policy: tb_executor.CapacitySchedulerPolicy = tb_executor. 
CapacitySchedulerPolicy.GUARANTEED_NO_EVICT, two_step_lookahead: bool = False, From 0ebe6aa565acf90402fdab6e066a1a3a6a6c7191 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:53 +0000 Subject: [PATCH 07/32] Fix target name in test Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/test_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 625a584d600..a048be26beb 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -1514,7 +1514,7 @@ def _check_contains_expected_message(stdout: str, stderr: str): repeats = 2 with EnvVarsContextManager({"TLLM_WORKER_USE_SINGLE_PROCESS": "1"}): child_stdout, child_stderr = run_function_in_sub_process( - target=llama_7b_multi_lora, + target=llama_7b_multi_unique_lora_adapters_from_request, args=(lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats), kwargs={}, From 728b32f6088fbf8db95d10906cf4ddb30127e72b Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:54 +0000 Subject: [PATCH 08/32] Changed GuaranteedNoEvictScheduler to try call peftCacheManager->determineNumPages as best effort, so its failure handling would be outside the scheduler when the request would be handled, updated the error message in PeftCacheManager when lora request has no weights and its adapter id was not found in CPU cache, updated & documented parameter sets in relevant LoRA tests, removed test_llama_7b_multi_lora_load_previously_cpu_cache_evicted_adapter_fails because capturing its stderr didn't work Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- .../batch_manager/capacityScheduler.cpp | 25 +++++++++- .../batch_manager/peftCacheManager.cpp | 4 +- tests/unittest/llmapi/test_llm.py | 50 ++++++------------- tests/unittest/llmapi/test_llm_pytorch.py | 40 ++++++++++----- 4 files changed, 69 insertions(+), 50 deletions(-) diff --git a/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp b/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp index 9c9c56ba9d6..eea8e0d11a0 100644 --- a/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp +++ b/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp @@ -31,6 +31,27 @@ using kv_cache_manager::BlockKeyHasher; namespace { +SizeType32 tryDetermineNumPages( + OptionalRef peftCacheManager, std::shared_ptr const& llmRequest) +{ + if (peftCacheManager) + { + try + { + return peftCacheManager->determineNumPages(llmRequest); + } + catch (std::runtime_error const& e) + { + // Catch the exception so handling its consequences won't be in the scheduler + TLLM_LOG_WARNING( + "GuaranteedNoEvictScheduler caught exception raised from peftCacheManager->determineNumPages, assuming " + "it requires 0 pages. Exception: %s", + e.what()); + } + } + return 0; +} + std::tuple, std::unordered_set> prefillWithChunkedContextsAlreadyExecuting(RequestList const& activeRequests, kv_cache_manager::BaseKVCacheManager const& kvCacheManager, @@ -257,7 +278,7 @@ std::tuple GuaranteedNoEvictScheduler::impl( bool const isNewTask = reqHasLora && !uniqTaskIds.count(req->getLoraTaskId().value()); if (isNewTask) { - claimedPeftPages += peftCacheManager ? 
peftCacheManager->determineNumPages(req) : 0; + claimedPeftPages += tryDetermineNumPages(peftCacheManager, req); uniqTaskIds.insert(req->getLoraTaskId().value()); } } @@ -303,7 +324,7 @@ std::tuple GuaranteedNoEvictScheduler::impl( = reservedCrossBlocks ? reservedCrossBlocks->enoughAvailableBlocks(*req) : true; bool reqHasLora = req->getLoraTaskId().has_value(); bool isNewTask = reqHasLora && !uniqTaskIds.count(req->getLoraTaskId().value()); - auto neededPeftPages = isNewTask && peftCacheManager ? peftCacheManager->determineNumPages(req) : 0; + auto neededPeftPages = isNewTask ? tryDetermineNumPages(peftCacheManager, req) : 0; if (enoughBlocks && enoughCrossBlocks && neededPeftPages <= availablePeftPages) { diff --git a/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp index 8eeca23df35..1fae4cfc36e 100644 --- a/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp @@ -277,7 +277,9 @@ void PeftCacheManager::addRequestPeft(std::shared_ptr llmRequest, bo if (!isTaskCached(taskId)) { std::string errMsg - = "LoRA task " + std::to_string(taskId) + " not found in cache. Please send LoRA weights with request"; + = "LoRA task " + std::to_string(taskId) + " not found in cache. Please send LoRA weights with request." + " Note that currently a request with LoRA task that was already loaded is sent without its LoRA weights to save its serialization, copy and deserialization," + " so if this LoRA task was evicted from LoRA CPU cache, then its reuse is currently not supported."; throw PeftTaskNotCachedException(errMsg); } } diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index a048be26beb..f88dbaa1787 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -24,9 +24,7 @@ import pytest import torch import transformers -from utils.util import (EnvVarsContextManager, duplicate_list_to_length, - flatten_list, run_function_in_sub_process, - skip_single_gpu) +from utils.util import duplicate_list_to_length, flatten_list, skip_single_gpu from tensorrt_llm import LLM as LLM_torch from tensorrt_llm._tensorrt_engine import LLM @@ -1483,43 +1481,25 @@ def llama_7b_multi_unique_lora_adapters_from_request( @pytest.mark.parametrize( - "lora_adapter_count_per_call, max_loras, max_cpu_loras", [ - ([5], 2, 2), - ([2, 2, 2], 1, 3), + "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats", + [ + # Test eviction and loading of new adapters in the evicted space, within a single llm.generate call + ([ + 5, + ], 2, 2, 1), + # Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single llm.generate call + ([ + 2, + ], 1, 2, 2), + # Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU cache size < LoRA CPU cache size + ([2, 2, 2], 1, 3, 1), ]) @skip_gpu_memory_less_than_40gb def test_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call: list[int], max_loras: int, - max_cpu_loras: int): + max_cpu_loras: int, repeats: int): llama_7b_multi_unique_lora_adapters_from_request( - lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats=1) - - -@pytest.mark.parametrize( - "lora_adapter_count_per_call, max_loras, max_cpu_loras", [ - ([1, 1], 1, 1), - ]) -@skip_gpu_memory_less_than_40gb -def test_llama_7b_multi_lora_load_previously_cpu_cache_evicted_adapter_fails( - lora_adapter_count_per_call: list[int], max_loras: int, - 
max_cpu_loras: int): - """Tests that trying to load a LoRA adapter after it was evicted from CPU cache fails with the expected - message, as this feature is currently not supported in favor of the performance improvement of not - sending the LoRA weights with every request after the first time. - """ # noqa: D205 - - def _check_contains_expected_message(stdout: str, stderr: str): - return "not found in cache" in stderr - - repeats = 2 - with EnvVarsContextManager({"TLLM_WORKER_USE_SINGLE_PROCESS": "1"}): - child_stdout, child_stderr = run_function_in_sub_process( - target=llama_7b_multi_unique_lora_adapters_from_request, - args=(lora_adapter_count_per_call, max_loras, max_cpu_loras, - repeats), - kwargs={}, - stop_waiting_criteria=_check_contains_expected_message) - assert _check_contains_expected_message(child_stdout, child_stderr) + lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats) @skip_gpu_memory_less_than_40gb diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 70166f9cf60..f93319ccfab 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -313,40 +313,52 @@ def llama_7b_multi_unique_lora_adapters_from_request( @pytest.mark.parametrize( - "lora_adapter_count_per_call, max_loras, max_cpu_loras", [ + "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats", + [ + # Test eviction and loading of new adapters in the evicted space, within a single llm.generate call ([ 5, - ], 2, 2), - ([2, 2, 2], 1, 3), + ], 2, 2, 1), + # Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache + ([ + 2, + ], 1, 2, 2), + # Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU cache size < LoRA CPU cache size + ([2, 2, 2], 1, 3, 1), ]) @skip_gpu_memory_less_than_40gb def test_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call: list[int], max_loras: int, - max_cpu_loras: int): + max_cpu_loras: int, repeats: int): llama_7b_multi_unique_lora_adapters_from_request( - lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats=1) + lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats) @pytest.mark.parametrize( - "lora_adapter_count_per_call, max_loras, max_cpu_loras", + "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats", [ - ([1, 1], 1, 1), - #([5,], 2, 2), - #([2, 2], 1, 3), + # Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU cache over more than a single llm.generate call + ([1, 1], 1, 1, 2), + # Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU cache over a single llm.generate call + ([ + 5, + ], 2, 2, 2), ]) @skip_gpu_memory_less_than_40gb def test_llama_7b_multi_lora_load_previously_cpu_cache_evicted_adapter_fails( lora_adapter_count_per_call: list[int], max_loras: int, - max_cpu_loras: int): + max_cpu_loras: int, repeats: int): """Tests that trying to load a LoRA adapter after it was evicted from CPU cache fails with the expected message, as this feature is currently not supported in favor of the performance improvement of not sending the LoRA weights with every request after the first time. 
""" # noqa: D205 def _check_contains_expected_message(stdout: str, stderr: str): - return "not found in cache" in stderr + note_in_message = "Note that currently a request with LoRA task that was already loaded is sent" \ + " without its LoRA weights to save its serialization, copy and deserialization, so if this" \ + " LoRA task was evicted from LoRA CPU cache, then its reuse is currently not supported." + return note_in_message in stderr - repeats = 2 with EnvVarsContextManager({"TLLM_WORKER_USE_SINGLE_PROCESS": "1"}): child_stdout, child_stderr = run_function_in_sub_process( target=llama_7b_multi_unique_lora_adapters_from_request, @@ -354,6 +366,10 @@ def _check_contains_expected_message(stdout: str, stderr: str): repeats), kwargs={}, stop_waiting_criteria=_check_contains_expected_message) + print("STDOUT:") + print(child_stdout) + print("STDERR:") + print(child_stderr) assert _check_contains_expected_message(child_stdout, child_stderr) From 26923b6cb5fa670237bacde8a8d42385df0b1f61 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:54 +0000 Subject: [PATCH 09/32] Format comments in tests Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/test_llm.py | 6 ++++-- tests/unittest/llmapi/test_llm_pytorch.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index f88dbaa1787..5b1f5f193f7 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -1487,11 +1487,13 @@ def llama_7b_multi_unique_lora_adapters_from_request( ([ 5, ], 2, 2, 1), - # Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single llm.generate call + # Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single + # llm.generate call ([ 2, ], 1, 2, 2), - # Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU cache size < LoRA CPU cache size + # Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU + # cache size < LoRA CPU cache size ([2, 2, 2], 1, 3, 1), ]) @skip_gpu_memory_less_than_40gb diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index f93319ccfab..707adb6fd99 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -319,11 +319,13 @@ def llama_7b_multi_unique_lora_adapters_from_request( ([ 5, ], 2, 2, 1), - # Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache + # Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single + # llm.generate call ([ 2, ], 1, 2, 2), - # Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU cache size < LoRA CPU cache size + # Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU + # cache size < LoRA CPU cache size ([2, 2, 2], 1, 3, 1), ]) @skip_gpu_memory_less_than_40gb @@ -337,9 +339,11 @@ def test_llama_7b_multi_lora_evict_load_new_adapters( @pytest.mark.parametrize( "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats", [ - # Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU cache over more than a single 
llm.generate call + # Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU + # cache over more than a single llm.generate call ([1, 1], 1, 1, 2), - # Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU cache over a single llm.generate call + # Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU + # cache over a single llm.generate call ([ 5, ], 2, 2, 2), From f4875fae96cf50966fa3213ddf07f6173c8467cc Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:55 +0000 Subject: [PATCH 10/32] Remove debug prints from test Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/test_llm_pytorch.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 707adb6fd99..47fa0382227 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -370,10 +370,7 @@ def _check_contains_expected_message(stdout: str, stderr: str): repeats), kwargs={}, stop_waiting_criteria=_check_contains_expected_message) - print("STDOUT:") - print(child_stdout) - print("STDERR:") - print(child_stderr) + assert _check_contains_expected_message(child_stdout, child_stderr) From f5ca3b41f98863095cc04d56e993c15d83a7fe15 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:55 +0000 Subject: [PATCH 11/32] Update missingPeftTask CPP test to expect the error message starts with the original error message Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- cpp/tests/executor/executorTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/executor/executorTest.cpp b/cpp/tests/executor/executorTest.cpp index 6b8c8d7eb9e..acd4d0f7d4d 100644 --- a/cpp/tests/executor/executorTest.cpp +++ b/cpp/tests/executor/executorTest.cpp @@ -190,7 +190,7 @@ TEST_F(GptExecutorTest, missingPeftTask) if (response.hasError()) { auto err = response.getErrorMsg(); - EXPECT_EQ(err, std::string("LoRA task 10 not found in cache. Please send LoRA weights with request")); + EXPECT_EQ(0, err.find("LoRA task 10 not found in cache. 
Please send LoRA weights with request")); done = true; } else From 82e18f9c755dc98a2c741ef037b69cce9d20e9ec Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:56 +0000 Subject: [PATCH 12/32] Refactored shared lora test logic into lora_test_utils.py Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/lora_test_utils.py | 130 +++++++++++++++++++ tests/unittest/llmapi/test_llm.py | 150 +++++---------------- tests/unittest/llmapi/test_llm_pytorch.py | 151 ++++++---------------- 3 files changed, 196 insertions(+), 235 deletions(-) create mode 100644 tests/unittest/llmapi/lora_test_utils.py diff --git a/tests/unittest/llmapi/lora_test_utils.py b/tests/unittest/llmapi/lora_test_utils.py new file mode 100644 index 00000000000..762b7bb49ad --- /dev/null +++ b/tests/unittest/llmapi/lora_test_utils.py @@ -0,0 +1,130 @@ +from typing import OrderedDict + +from utils.llm_data import llm_models_root +from utils.util import duplicate_list_to_length, flatten_list, similar + +from tensorrt_llm import SamplingParams +from tensorrt_llm.executor.request import LoRARequest +from tensorrt_llm.llmapi.llm import BaseLLM, _TorchLLM, _TrtLLM +from tensorrt_llm.llmapi.llm_utils import BuildConfig +from tensorrt_llm.lora_manager import LoraConfig + + +def check_multi_unique_lora_adapters_from_request( + llm: BaseLLM, hf_lora_dirs: list[str], + lora_adapter_count_per_call: list[int], repeats: int): + total_lora_adapters = sum(lora_adapter_count_per_call) + + # Each prompt should have a reference for every LoRA adapter dir (in the same order as in hf_lora_dirs) + prompt_to_references = OrderedDict({ + "美国的首都在哪里? \n答案:": [ + "美国的首都是华盛顿。\n\n美国的", + "纽约\n\n### カンファレンスの", + ], + "アメリカ合衆国の首都はどこですか? \n答え:": [ + "华盛顿。\n\n英国の首都是什", + "ワシントン\nQ1. アメリカ合衆国", + ], + }) + + prompts_to_generate = duplicate_list_to_length( + flatten_list([[prompt] * len(hf_lora_dirs) + for prompt in prompt_to_references.keys()]), + total_lora_adapters) + references = duplicate_list_to_length( + flatten_list(list(prompt_to_references.values())), total_lora_adapters) + lora_requests = [ + LoRARequest(str(i), i, hf_lora_dirs[i % len(hf_lora_dirs)]) + for i in range(total_lora_adapters) + ] + + # Perform repeats of the same requests to test reuse and reload of adapters previously unloaded from cache + try: + for _ in range(repeats): + last_idx = 0 + for adapter_count in lora_adapter_count_per_call: + sampling_params = SamplingParams(max_tokens=20) + outputs = llm.generate( + prompts_to_generate[last_idx:last_idx + adapter_count], + sampling_params, + lora_request=lora_requests[last_idx:last_idx + adapter_count]) + for output, ref in zip( + outputs, references[last_idx:last_idx + adapter_count]): + assert similar(output.outputs[0].text, ref) + last_idx += adapter_count + finally: + llm.shutdown() + + +def check_pytorch_llama_7b_multi_lora_from_request_test_harness( + max_lora_rank: int = 8, **llm_kwargs) -> None: + hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" + + # For LoRA checkpoints without finetuned embedding and lm_head, we can either: + # (1) specify lora_target_modules, or + # (2) provide a lora_dir to infer the lora_target_modules. 
+ lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'], + max_lora_rank=max_lora_rank) + + llm = _TorchLLM(hf_model_dir, lora_config=lora_config, **llm_kwargs) + _check_llama_7b_multi_lora_from_request_test_harness(llm) + + +def check_trt_python_llama_7b_multi_lora_from_request_test_harness( + max_lora_rank: int = 8, **llm_kwargs): + hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" + + # For LoRA checkpoints without finetuned embedding and lm_head, we can either: + # (1) specify lora_target_modules, or + # (2) provide a lora_dir to infer the lora_target_modules. + build_config = BuildConfig(lora_config=LoraConfig( + lora_target_modules=['attn_q', 'attn_k', 'attn_v'])) + llm = _TrtLLM(hf_model_dir, + enable_lora=True, + max_lora_rank=max_lora_rank, + build_config=build_config, + fast_build=True, + **llm_kwargs) + _check_llama_7b_multi_lora_from_request_test_harness(llm) + + +def _check_llama_7b_multi_lora_from_request_test_harness(llm: BaseLLM) -> None: + hf_lora_dir1 = f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1" + hf_lora_dir2 = f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" + prompts = [ + "美国的首都在哪里? \n答案:", + "美国的首都在哪里? \n答案:", + "美国的首都在哪里? \n答案:", + "アメリカ合衆国の首都はどこですか? \n答え:", + "アメリカ合衆国の首都はどこですか? \n答え:", + "アメリカ合衆国の首都はどこですか? \n答え:", + ] + references = [ + "沃尔玛\n\n## 新闻\n\n* ", + "美国的首都是华盛顿。\n\n美国的", + "纽约\n\n### カンファレンスの", + "Washington, D.C.\nWashington, D.C. is the capital of the United", + "华盛顿。\n\n英国の首都是什", + "ワシントン\nQ1. アメリカ合衆国", + ] + key_words = [ + "沃尔玛", + "华盛顿", + "纽约", + "Washington", + "华盛顿", + "ワシントン", + ] + lora_req1 = LoRARequest("luotuo", 1, hf_lora_dir1) + lora_req2 = LoRARequest("Japanese", 2, hf_lora_dir2) + sampling_params = SamplingParams(max_tokens=20) + try: + outputs = llm.generate( + prompts, + sampling_params, + lora_request=[None, lora_req1, lora_req2, None, lora_req1, lora_req2]) + finally: + llm.shutdown() + for output, ref, key_word in zip(outputs, references, key_words): + assert similar(output.outputs[0].text, + ref) or key_word in output.outputs[0].txt diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 5b1f5f193f7..04b80dba49e 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -18,13 +18,13 @@ import shutil import sys import tempfile -from typing import List, Optional, OrderedDict, Union +from typing import List, Optional, Union import datasets import pytest import torch import transformers -from utils.util import duplicate_list_to_length, flatten_list, skip_single_gpu +from utils.util import skip_single_gpu from tensorrt_llm import LLM as LLM_torch from tensorrt_llm._tensorrt_engine import LLM @@ -47,6 +47,10 @@ from tensorrt_llm.sampling_params import (BatchedLogitsProcessor, LogitsProcessor, SamplingParams) +from .lora_test_utils import ( + check_multi_unique_lora_adapters_from_request, + check_trt_python_llama_7b_multi_lora_from_request_test_harness) + # isort: off sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/..") from gc_utils import assert_resource_freed @@ -1364,122 +1368,6 @@ def llama_v2_13b_lora_from_dir_test_harness(**llm_kwargs): assert similar(output.outputs[0].text, ref) -def llama_7b_multi_lora_from_request_test_harness(**llm_kwargs): - hf_model_dir = get_model_path("llama-models/llama-7b-hf") - hf_lora_dir1 = get_model_path("llama-models/luotuo-lora-7b-0.1") - hf_lora_dir2 = get_model_path("llama-models/Japanese-Alpaca-LoRA-7b-v0") - - # For LoRA checkpoints without finetuned 
embedding and lm_head, we can either: - # (1) specify lora_target_modules, or - # (2) provide a lora_dir to infer the lora_target_modules. - build_config = BuildConfig(lora_config=LoraConfig( - lora_target_modules=['attn_q', 'attn_k', 'attn_v'])) - llm = LLM(hf_model_dir, - enable_lora=True, - max_lora_rank=8, - build_config=build_config, - fast_build=True, - **llm_kwargs) - - prompts = [ - "美国的首都在哪里? \n答案:", - "美国的首都在哪里? \n答案:", - "美国的首都在哪里? \n答案:", - "アメリカ合衆国の首都はどこですか? \n答え:", - "アメリカ合衆国の首都はどこですか? \n答え:", - "アメリカ合衆国の首都はどこですか? \n答え:", - ] - references = [ - "沃尔玛\n\n## 新闻\n\n* ", - "美国的首都是华盛顿。\n\n美国的", - "纽约\n\n### カンファレンスの", - "Washington, D.C.\nWashington, D.C. is the capital of the United", - "华盛顿。\n\n英国の首都是什", - "ワシントン\nQ1. アメリカ合衆国", - ] - key_words = [ - "沃尔玛", - "华盛顿", - "纽约", - "Washington", - "华盛顿", - "ワシントン", - ] - lora_req1 = LoRARequest("luotuo", 1, hf_lora_dir1) - lora_req2 = LoRARequest("Japanese", 2, hf_lora_dir2) - sampling_params = SamplingParams(max_tokens=20) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=[None, lora_req1, lora_req2, None, lora_req1, lora_req2]) - for output, ref, key_word in zip(outputs, references, key_words): - assert similar(output.outputs[0].text, - ref) or key_word in output.outputs[0].txt - - -def llama_7b_multi_unique_lora_adapters_from_request( - lora_adapter_count_per_call: list[int], max_loras: int, - max_cpu_loras: int, repeats: int, **llm_kwargs): - total_lora_adapters = sum(lora_adapter_count_per_call) - - hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" - hf_lora_dirs = [ - f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1", - f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" - ] - - # For LoRA checkpoints without finetuned embedding and lm_head, we can either: - # (1) specify lora_target_modules, or - # (2) provide a lora_dir to infer the lora_target_modules. - build_config = BuildConfig(lora_config=LoraConfig( - lora_target_modules=['attn_q', 'attn_k', 'attn_v'], - max_lora_rank=8, - max_loras=max_loras, - max_cpu_loras=max_cpu_loras)) - llm = LLM(hf_model_dir, - enable_lora=True, - build_config=build_config, - fast_build=True, - **llm_kwargs) - - # Each prompt should have a reference for every LoRA adapter dir (in the same order as in hf_lora_dirs) - prompt_to_references = OrderedDict({ - "美国的首都在哪里? \n答案:": [ - "美国的首都是华盛顿。\n\n美国的", - "纽约\n\n### カンファレンスの", - ], - "アメリカ合衆国の首都はどこですか? \n答え:": [ - "华盛顿。\n\n英国の首都是什", - "ワシントン\nQ1. 
アメリカ合衆国", - ], - }) - - prompts_to_generate = duplicate_list_to_length( - flatten_list([[prompt] * len(hf_lora_dirs) - for prompt in prompt_to_references.keys()]), - total_lora_adapters) - references = duplicate_list_to_length( - flatten_list(list(prompt_to_references.values())), total_lora_adapters) - lora_requests = [ - LoRARequest(str(i), i, hf_lora_dirs[i % len(hf_lora_dirs)]) - for i in range(total_lora_adapters) - ] - - # Perform repeats of the same requests to test reuse and reload of adapters previously unloaded from cache - for i in range(repeats): - last_idx = 0 - for adapter_count in lora_adapter_count_per_call: - sampling_params = SamplingParams(max_tokens=20) - outputs = llm.generate( - prompts_to_generate[last_idx:last_idx + adapter_count], - sampling_params, - lora_request=lora_requests[last_idx:last_idx + adapter_count]) - for output, ref in zip( - outputs, references[last_idx:last_idx + adapter_count]): - assert similar(output.outputs[0].text, ref) - last_idx += adapter_count - - @pytest.mark.parametrize( "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats", [ @@ -1500,8 +1388,27 @@ def llama_7b_multi_unique_lora_adapters_from_request( def test_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call: list[int], max_loras: int, max_cpu_loras: int, repeats: int): - llama_7b_multi_unique_lora_adapters_from_request( - lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats) + hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" + hf_lora_dirs = [ + f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1", + f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" + ] + + # For LoRA checkpoints without finetuned embedding and lm_head, we can either: + # (1) specify lora_target_modules, or + # (2) provide a lora_dir to infer the lora_target_modules. 
+ build_config = BuildConfig(lora_config=LoraConfig( + lora_target_modules=['attn_q', 'attn_k', 'attn_v'], + max_lora_rank=8, + max_loras=max_loras, + max_cpu_loras=max_cpu_loras)) + llm = LLM(hf_model_dir, + enable_lora=True, + build_config=build_config, + fast_build=True) + check_multi_unique_lora_adapters_from_request(llm, hf_lora_dirs, + lora_adapter_count_per_call, + repeats) @skip_gpu_memory_less_than_40gb @@ -1511,7 +1418,8 @@ def test_llama_v2_13b_lora(): @skip_gpu_memory_less_than_40gb def test_llama_7b_multi_lora(): - llama_7b_multi_lora_from_request_test_harness(max_loras=1, max_cpu_loras=8) + check_trt_python_llama_7b_multi_lora_from_request_test_harness( + max_loras=1, max_cpu_loras=8) def llama_v2_7b_prompt_adapter_test_harness(**llm_kwargs): diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 47fa0382227..1ec1cdce342 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - import pytest from tensorrt_llm import LLM @@ -7,12 +5,15 @@ from tensorrt_llm.sampling_params import SamplingParams # isort: off +from .lora_test_utils import ( + check_multi_unique_lora_adapters_from_request, + check_pytorch_llama_7b_multi_lora_from_request_test_harness) from .test_llm import ( get_model_path, global_kvcache_config, llama_model_path, llm_get_stats_async_test_harness, llm_get_stats_test_harness, prompts, run_llm_abort_request, run_llm_with_postprocess_parallel_and_result_handler, tinyllama_logits_processor_test_harness, _test_llm_capture_request_error) -from utils.util import EnvVarsContextManager, duplicate_list_to_length, flatten_list, force_ampere, run_function_in_sub_process, similar, skip_gpu_memory_less_than_40gb, skip_gpu_memory_less_than_80gb, skip_gpu_memory_less_than_138gb +from utils.util import EnvVarsContextManager, force_ampere, run_function_in_sub_process, similar, skip_gpu_memory_less_than_40gb, skip_gpu_memory_less_than_80gb, skip_gpu_memory_less_than_138gb from utils.llm_data import llm_models_root from tensorrt_llm.lora_manager import LoraConfig from tensorrt_llm.executor.request import LoRARequest @@ -163,55 +164,6 @@ def llama_7b_lora_from_dir_test_harness(**llm_kwargs) -> None: llm.shutdown() -def llama_7b_multi_lora_from_request_test_harness(**llm_kwargs) -> None: - hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" - hf_lora_dir1 = f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1" - hf_lora_dir2 = f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" - - # For LoRA checkpoints without finetuned embedding and lm_head, we can either: - # (1) specify lora_target_modules, or - # (2) provide a lora_dir to infer the lora_target_modules. - lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'], - max_lora_rank=8) - # Disable CUDA graph - # TODO: remove this once we have a proper fix for CUDA graph in LoRA - llm = LLM(hf_model_dir, - lora_config=lora_config, - cuda_graph_config=None, - **llm_kwargs) - - try: - prompts = [ - "美国的首都在哪里? \n答案:", - "美国的首都在哪里? \n答案:", - "美国的首都在哪里? \n答案:", - "アメリカ合衆国の首都はどこですか? \n答え:", - "アメリカ合衆国の首都はどこですか? \n答え:", - "アメリカ合衆国の首都はどこですか? \n答え:", - ] - references = [ - "沃尔玛\n\n## 新闻\n\n* ", - "美国的首都是华盛顿。\n\n美国的", - "纽约\n\n### カンファレンスの", - "Washington, D.C.\nWashington, D.C. is the capital of the United", - "华盛顿。\n\n英国の首都是什", - "ワシントン\nQ1. 
アメリカ合衆国", - ] - lora_req1 = LoRARequest("luotuo", 1, hf_lora_dir1) - lora_req2 = LoRARequest("Japanese", 2, hf_lora_dir2) - sampling_params = SamplingParams(max_tokens=20) - outputs = llm.generate(prompts, - sampling_params, - lora_request=[ - None, lora_req1, lora_req2, None, lora_req1, - lora_req2 - ]) - for output, ref in zip(outputs, references): - assert similar(output.outputs[0].text, ref) - finally: - llm.shutdown() - - @skip_gpu_memory_less_than_40gb def test_llama_7b_lora(): llama_7b_lora_from_dir_test_harness() @@ -251,65 +203,7 @@ def test_llama_7b_lora_default_modules() -> None: @skip_gpu_memory_less_than_40gb def test_llama_7b_multi_lora(): - llama_7b_multi_lora_from_request_test_harness() - - -def llama_7b_multi_unique_lora_adapters_from_request( - lora_adapter_count_per_call: list[int], max_loras: int, - max_cpu_loras: int, repeats: int, **llm_kwargs): - total_lora_adapters = sum(lora_adapter_count_per_call) - - hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" - hf_lora_dirs = [ - f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1", - f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" - ] - - # For LoRA checkpoints without finetuned embedding and lm_head, we can either: - # (1) specify lora_target_modules, or - # (2) provide a lora_dir to infer the lora_target_modules. - lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'], - max_lora_rank=8, - max_loras=max_loras, - max_cpu_loras=max_cpu_loras) - llm = LLM(hf_model_dir, lora_config=lora_config, **llm_kwargs) - - # Each prompt should have a reference for every LoRA adapter dir (in the same order as in hf_lora_dirs) - prompt_to_references = OrderedDict({ - "美国的首都在哪里? \n答案:": [ - "美国的首都是华盛顿。\n\n美国的", - "纽约\n\n### カンファレンスの", - ], - "アメリカ合衆国の首都はどこですか? \n答え:": [ - "华盛顿。\n\n英国の首都是什", - "ワシントン\nQ1. 
アメリカ合衆国", - ], - }) - - prompts_to_generate = duplicate_list_to_length( - flatten_list([[prompt] * len(hf_lora_dirs) - for prompt in prompt_to_references.keys()]), - total_lora_adapters) - references = duplicate_list_to_length( - flatten_list(list(prompt_to_references.values())), total_lora_adapters) - lora_requests = [ - LoRARequest(str(i), i, hf_lora_dirs[i % len(hf_lora_dirs)]) - for i in range(total_lora_adapters) - ] - - # Perform repeats of the same requests to test reuse and reload of adapters previously unloaded from cache - for i in range(repeats): - last_idx = 0 - for adapter_count in lora_adapter_count_per_call: - sampling_params = SamplingParams(max_tokens=20) - outputs = llm.generate( - prompts_to_generate[last_idx:last_idx + adapter_count], - sampling_params, - lora_request=lora_requests[last_idx:last_idx + adapter_count]) - for output, ref in zip( - outputs, references[last_idx:last_idx + adapter_count]): - assert similar(output.outputs[0].text, ref) - last_idx += adapter_count + check_pytorch_llama_7b_multi_lora_from_request_test_harness() @pytest.mark.parametrize( @@ -332,10 +226,37 @@ def llama_7b_multi_unique_lora_adapters_from_request( def test_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call: list[int], max_loras: int, max_cpu_loras: int, repeats: int): - llama_7b_multi_unique_lora_adapters_from_request( + check_llama_7b_multi_unique_lora_adapters_from_request( lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats) +def check_llama_7b_multi_unique_lora_adapters_from_request( + lora_adapter_count_per_call: list[int], max_loras: int, + max_cpu_loras: int, repeats: int): + hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" + hf_lora_dirs = [ + f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1", + f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" + ] + + # For LoRA checkpoints without finetuned embedding and lm_head, we can either: + # (1) specify lora_target_modules, or + # (2) provide a lora_dir to infer the lora_target_modules. 
+ lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'], + max_lora_rank=8, + max_loras=max_loras, + max_cpu_loras=max_cpu_loras) + + llm = LLM(hf_model_dir, + lora_config=lora_config, + # Disable CUDA graph + # TODO: remove this once we have a proper fix for CUDA graph in LoRA + cuda_graph_config=None) + check_multi_unique_lora_adapters_from_request(llm, hf_lora_dirs, + lora_adapter_count_per_call, + repeats) + + @pytest.mark.parametrize( "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats", [ @@ -365,10 +286,12 @@ def _check_contains_expected_message(stdout: str, stderr: str): with EnvVarsContextManager({"TLLM_WORKER_USE_SINGLE_PROCESS": "1"}): child_stdout, child_stderr = run_function_in_sub_process( - target=llama_7b_multi_unique_lora_adapters_from_request, + target=check_llama_7b_multi_unique_lora_adapters_from_request, args=(lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats), - kwargs={}, + # Disable CUDA graph + # TODO: remove this once we have a proper fix for CUDA graph in LoRA + kwargs={"cuda_graph_config": None}, stop_waiting_criteria=_check_contains_expected_message) assert _check_contains_expected_message(child_stdout, child_stderr) From 9cc4cc4543263a928d5f34d4c7972d7ce7682328 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:56 +0000 Subject: [PATCH 13/32] PeftCacheManager::determineNumPages throws exception with 'not supported' note when called with a requests that's not cached and has no lora weights with it, reverted changes in capacityScheduler.cpp Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- .../batch_manager/capacityScheduler.cpp | 25 ++----------------- .../batch_manager/peftCacheManager.cpp | 12 ++++++--- 2 files changed, 11 insertions(+), 26 deletions(-) diff --git a/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp b/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp index eea8e0d11a0..9c9c56ba9d6 100644 --- a/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp +++ b/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp @@ -31,27 +31,6 @@ using kv_cache_manager::BlockKeyHasher; namespace { -SizeType32 tryDetermineNumPages( - OptionalRef peftCacheManager, std::shared_ptr const& llmRequest) -{ - if (peftCacheManager) - { - try - { - return peftCacheManager->determineNumPages(llmRequest); - } - catch (std::runtime_error const& e) - { - // Catch the exception so handling its consequences won't be in the scheduler - TLLM_LOG_WARNING( - "GuaranteedNoEvictScheduler caught exception raised from peftCacheManager->determineNumPages, assuming " - "it requires 0 pages. Exception: %s", - e.what()); - } - } - return 0; -} - std::tuple, std::unordered_set> prefillWithChunkedContextsAlreadyExecuting(RequestList const& activeRequests, kv_cache_manager::BaseKVCacheManager const& kvCacheManager, @@ -278,7 +257,7 @@ std::tuple GuaranteedNoEvictScheduler::impl( bool const isNewTask = reqHasLora && !uniqTaskIds.count(req->getLoraTaskId().value()); if (isNewTask) { - claimedPeftPages += tryDetermineNumPages(peftCacheManager, req); + claimedPeftPages += peftCacheManager ? peftCacheManager->determineNumPages(req) : 0; uniqTaskIds.insert(req->getLoraTaskId().value()); } } @@ -324,7 +303,7 @@ std::tuple GuaranteedNoEvictScheduler::impl( = reservedCrossBlocks ? 
reservedCrossBlocks->enoughAvailableBlocks(*req) : true; bool reqHasLora = req->getLoraTaskId().has_value(); bool isNewTask = reqHasLora && !uniqTaskIds.count(req->getLoraTaskId().value()); - auto neededPeftPages = isNewTask ? tryDetermineNumPages(peftCacheManager, req) : 0; + auto neededPeftPages = isNewTask && peftCacheManager ? peftCacheManager->determineNumPages(req) : 0; if (enoughBlocks && enoughCrossBlocks && neededPeftPages <= availablePeftPages) { diff --git a/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp index 1fae4cfc36e..05c59080655 100644 --- a/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp @@ -593,9 +593,10 @@ SizeType32 PeftCacheManager::determineNumPages(std::shared_ptr llmRe TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); if (llmRequest->getLoraTaskId().has_value()) { + auto taskId = llmRequest->getLoraTaskId().value(); try { - return mHostLoraCache->determineNumPages(llmRequest->getLoraTaskId().value()); + return mHostLoraCache->determineNumPages(taskId); } catch (std::runtime_error& e) { @@ -603,10 +604,15 @@ SizeType32 PeftCacheManager::determineNumPages(std::shared_ptr llmRe { return mHostLoraCache->determineNumPages(llmRequest->getLoraConfig().value()); } - else + if (!llmRequest->getLoraWeights().has_value()) { - throw; + std::string errMsg + = "LoRA task " + std::to_string(taskId) + " has no LoRA weights and not found in cache." + " Note that currently a request with LoRA task that was already loaded is sent without its LoRA weights to save its serialization, copy and deserialization," + " so if this LoRA task was evicted from LoRA CPU cache, then its reuse is currently not supported."; + throw PeftTaskNotCachedException(errMsg); } + throw; } } TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); From e16aae7f6638f42236786f96f462869d4a6c6f1f Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:57 +0000 Subject: [PATCH 14/32] Add docstring to check_multi_unique_lora_adapters_from_request Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/lora_test_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/unittest/llmapi/lora_test_utils.py b/tests/unittest/llmapi/lora_test_utils.py index 762b7bb49ad..09fceb15bc7 100644 --- a/tests/unittest/llmapi/lora_test_utils.py +++ b/tests/unittest/llmapi/lora_test_utils.py @@ -13,6 +13,11 @@ def check_multi_unique_lora_adapters_from_request( llm: BaseLLM, hf_lora_dirs: list[str], lora_adapter_count_per_call: list[int], repeats: int): + """Calls llm.generate s.t. for each c in lora_adapter_count_per_call, llm.generate is called with c requests. + All requests sent to llm.generate over all calls (in a single repeats iteration) are configured to each use a unique + LoRA adapter. This entire process is done in a loop (with the same requests) 'repeats' times with the same requests. + Asserts the output of each llm.generate call is similar to the expected. 
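    For example (illustrative): with lora_adapter_count_per_call=[2, 3] and repeats=2, llm.generate is first
    called with 2 requests and then with 3 requests, using 5 unique adapters overall, and the same two calls
    are then issued a second time to exercise reuse and reload of adapters that were evicted from the caches.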
+ """ # noqa: D205 total_lora_adapters = sum(lora_adapter_count_per_call) # Each prompt should have a reference for every LoRA adapter dir (in the same order as in hf_lora_dirs) From 8758975761a5673c14abd4694718a7d9c97dba3f Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:58 +0000 Subject: [PATCH 15/32] Fix imports of test_llm.py Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/test_llm.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 04b80dba49e..591001e6f09 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -24,7 +24,6 @@ import pytest import torch import transformers -from utils.util import skip_single_gpu from tensorrt_llm import LLM as LLM_torch from tensorrt_llm._tensorrt_engine import LLM @@ -47,16 +46,14 @@ from tensorrt_llm.sampling_params import (BatchedLogitsProcessor, LogitsProcessor, SamplingParams) -from .lora_test_utils import ( - check_multi_unique_lora_adapters_from_request, - check_trt_python_llama_7b_multi_lora_from_request_test_harness) - # isort: off sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/..") from gc_utils import assert_resource_freed -from utils.util import skip_single_gpu +from llmapi.lora_test_utils import ( + check_multi_unique_lora_adapters_from_request, + check_trt_python_llama_7b_multi_lora_from_request_test_harness) from utils.llm_data import llm_models_root -from utils.util import force_ampere, similar, skip_gpu_memory_less_than_40gb, skip_pre_hopper +from utils.util import force_ampere, similar, skip_gpu_memory_less_than_40gb, skip_pre_hopper, skip_single_gpu # isort: on # The unittests are based on the tiny-llama, which is fast to build and run. From bdfa780ad7315cad2601fa4ea2095addbb18c38b Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:58 +0000 Subject: [PATCH 16/32] Improved check_multi_unique_lora_adapters_from_request docstring Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/lora_test_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unittest/llmapi/lora_test_utils.py b/tests/unittest/llmapi/lora_test_utils.py index 09fceb15bc7..50da4e043e2 100644 --- a/tests/unittest/llmapi/lora_test_utils.py +++ b/tests/unittest/llmapi/lora_test_utils.py @@ -13,9 +13,9 @@ def check_multi_unique_lora_adapters_from_request( llm: BaseLLM, hf_lora_dirs: list[str], lora_adapter_count_per_call: list[int], repeats: int): - """Calls llm.generate s.t. for each c in lora_adapter_count_per_call, llm.generate is called with c requests. - All requests sent to llm.generate over all calls (in a single repeats iteration) are configured to each use a unique - LoRA adapter. This entire process is done in a loop (with the same requests) 'repeats' times with the same requests. + """Calls llm.generate s.t. for each C in lora_adapter_count_per_call, llm.generate is called with C requests, + where each request is configured with a unique LoRA adapter ID. This entire process is done in a loop 'repeats' + times with the same requests. Asserts the output of each llm.generate call is similar to the expected. 
""" # noqa: D205 total_lora_adapters = sum(lora_adapter_count_per_call) From 2c3b771118fea9d7e17ccbfa0a3e8ace76e6d1ad Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:59 +0000 Subject: [PATCH 17/32] Fix imports in test_llm_multi_gpu.py and in test_llm_multi_gpu_pytorch.py Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/test_llm_multi_gpu.py | 6 +++--- tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/unittest/llmapi/test_llm_multi_gpu.py b/tests/unittest/llmapi/test_llm_multi_gpu.py index ad87411c219..fe6e8ced4bc 100644 --- a/tests/unittest/llmapi/test_llm_multi_gpu.py +++ b/tests/unittest/llmapi/test_llm_multi_gpu.py @@ -17,12 +17,12 @@ from tensorrt_llm.models.llama.model import LLaMAForCausalLM # isort: off +from .lora_test_utils import check_trt_python_llama_7b_multi_lora_from_request_test_harness from .test_llm import ( DummyError, DummyExecutorWorker3, _test_llm_capture_request_error, _test_llm_generate_async, check_llm_return_context_logits, check_llm_return_generation_logits, llm_return_logprobs_test_harness, - default_model_name, get_model_path, - llama_7b_multi_lora_from_request_test_harness, llama_model_path, + default_model_name, get_model_path, llama_model_path, llama_v2_7b_prompt_adapter_test_harness, llama_v2_13b_lora_from_dir_test_harness, llm_check_output, llm_get_stats_async_test_harness, llm_get_stats_test_harness, @@ -261,7 +261,7 @@ def test_llama_v2_13b_lora_tp2(): @pytest.mark.gpu2 @pytest.mark.part3 def test_llama_7b_multi_lora_tp2(): - llama_7b_multi_lora_from_request_test_harness( + check_trt_python_llama_7b_multi_lora_from_request_test_harness( tensor_parallel_size=2, max_loras=1, max_cpu_loras=8, diff --git a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py index 16053fd227f..62026be2959 100644 --- a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py +++ b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py @@ -3,8 +3,8 @@ # isort: off from .test_llm import tinyllama_logits_processor_test_harness from tensorrt_llm.llmapi import KvCacheConfig -from .test_llm_pytorch import (llama_7b_lora_from_dir_test_harness, - llama_7b_multi_lora_from_request_test_harness) +from .lora_test_utils import check_pytorch_llama_7b_multi_lora_from_request_test_harness +from .test_llm_pytorch import llama_7b_lora_from_dir_test_harness from .test_llm import _test_llm_capture_request_error # isort: on @@ -40,5 +40,5 @@ def test_llama_7b_lora_tp2(): @pytest.mark.gpu2 def test_llama_7b_multi_lora_tp2(): - llama_7b_multi_lora_from_request_test_harness( + check_pytorch_llama_7b_multi_lora_from_request_test_harness( tensor_parallel_size=2, kv_cache_config=global_kv_cache_config) From 66256827164c17f4ba7a19093df64507f8ce974d Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:04:59 +0000 Subject: [PATCH 18/32] Revert changes in _TrtLLM._build_model, move LLM creation to test so llm_kwargs are passed Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tensorrt_llm/llmapi/llm.py | 4 +- tests/unittest/llmapi/lora_test_utils.py | 54 ++++++------------- tests/unittest/llmapi/test_llm.py | 41 ++++++++------ tests/unittest/llmapi/test_llm_multi_gpu.py | 6 ++- .../llmapi/test_llm_multi_gpu_pytorch.py | 10 ++-- tests/unittest/llmapi/test_llm_pytorch.py | 53 
+++++++++--------- 6 files changed, 77 insertions(+), 91 deletions(-) diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 95426677c8c..1afe97d3ce4 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -813,9 +813,9 @@ def _build_model(self): len(lora_config.lora_target_modules + lora_config.missing_qkv_modules) self._executor_config.peft_cache_config = tllm.PeftCacheConfig( num_device_module_layer=max_lora_rank * num_lora_modules * - lora_config.max_loras, + self.args.max_loras, num_host_module_layer=max_lora_rank * num_lora_modules * - lora_config.max_cpu_loras, + self.args.max_cpu_loras, ) if self.args.decoding_config is not None: self._executor_config.decoding_config = self.args.decoding_config diff --git a/tests/unittest/llmapi/lora_test_utils.py b/tests/unittest/llmapi/lora_test_utils.py index 50da4e043e2..5f6cdf66730 100644 --- a/tests/unittest/llmapi/lora_test_utils.py +++ b/tests/unittest/llmapi/lora_test_utils.py @@ -1,18 +1,16 @@ -from typing import OrderedDict +from typing import OrderedDict, Type from utils.llm_data import llm_models_root from utils.util import duplicate_list_to_length, flatten_list, similar from tensorrt_llm import SamplingParams from tensorrt_llm.executor.request import LoRARequest -from tensorrt_llm.llmapi.llm import BaseLLM, _TorchLLM, _TrtLLM -from tensorrt_llm.llmapi.llm_utils import BuildConfig -from tensorrt_llm.lora_manager import LoraConfig +from tensorrt_llm.llmapi.llm import BaseLLM -def check_multi_unique_lora_adapters_from_request( - llm: BaseLLM, hf_lora_dirs: list[str], - lora_adapter_count_per_call: list[int], repeats: int): +def check_llama_7b_multi_unique_lora_adapters_from_request( + lora_adapter_count_per_call: list[int], repeats: int, + llm_class: Type[BaseLLM], **llm_kwargs): """Calls llm.generate s.t. for each C in lora_adapter_count_per_call, llm.generate is called with C requests, where each request is configured with a unique LoRA adapter ID. This entire process is done in a loop 'repeats' times with the same requests. @@ -20,6 +18,11 @@ def check_multi_unique_lora_adapters_from_request( """ # noqa: D205 total_lora_adapters = sum(lora_adapter_count_per_call) + hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" + hf_lora_dirs = [ + f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1", + f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" + ] # Each prompt should have a reference for every LoRA adapter dir (in the same order as in hf_lora_dirs) prompt_to_references = OrderedDict({ "美国的首都在哪里? \n答案:": [ @@ -42,6 +45,7 @@ def check_multi_unique_lora_adapters_from_request( LoRARequest(str(i), i, hf_lora_dirs[i % len(hf_lora_dirs)]) for i in range(total_lora_adapters) ] + llm = llm_class(hf_model_dir, **llm_kwargs) # Perform repeats of the same requests to test reuse and reload of adapters previously unloaded from cache try: @@ -61,39 +65,9 @@ def check_multi_unique_lora_adapters_from_request( llm.shutdown() -def check_pytorch_llama_7b_multi_lora_from_request_test_harness( - max_lora_rank: int = 8, **llm_kwargs) -> None: +def check_llama_7b_multi_lora_from_request_test_harness( + llm_class: Type[BaseLLM], **llm_kwargs) -> None: hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" - - # For LoRA checkpoints without finetuned embedding and lm_head, we can either: - # (1) specify lora_target_modules, or - # (2) provide a lora_dir to infer the lora_target_modules. 
- lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'], - max_lora_rank=max_lora_rank) - - llm = _TorchLLM(hf_model_dir, lora_config=lora_config, **llm_kwargs) - _check_llama_7b_multi_lora_from_request_test_harness(llm) - - -def check_trt_python_llama_7b_multi_lora_from_request_test_harness( - max_lora_rank: int = 8, **llm_kwargs): - hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" - - # For LoRA checkpoints without finetuned embedding and lm_head, we can either: - # (1) specify lora_target_modules, or - # (2) provide a lora_dir to infer the lora_target_modules. - build_config = BuildConfig(lora_config=LoraConfig( - lora_target_modules=['attn_q', 'attn_k', 'attn_v'])) - llm = _TrtLLM(hf_model_dir, - enable_lora=True, - max_lora_rank=max_lora_rank, - build_config=build_config, - fast_build=True, - **llm_kwargs) - _check_llama_7b_multi_lora_from_request_test_harness(llm) - - -def _check_llama_7b_multi_lora_from_request_test_harness(llm: BaseLLM) -> None: hf_lora_dir1 = f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1" hf_lora_dir2 = f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" prompts = [ @@ -123,6 +97,8 @@ def _check_llama_7b_multi_lora_from_request_test_harness(llm: BaseLLM) -> None: lora_req1 = LoRARequest("luotuo", 1, hf_lora_dir1) lora_req2 = LoRARequest("Japanese", 2, hf_lora_dir2) sampling_params = SamplingParams(max_tokens=20) + + llm = llm_class(hf_model_dir, **llm_kwargs) try: outputs = llm.generate( prompts, diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 591001e6f09..0cf0e02539a 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -50,8 +50,8 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/..") from gc_utils import assert_resource_freed from llmapi.lora_test_utils import ( - check_multi_unique_lora_adapters_from_request, - check_trt_python_llama_7b_multi_lora_from_request_test_harness) + check_llama_7b_multi_lora_from_request_test_harness, + check_llama_7b_multi_unique_lora_adapters_from_request) from utils.llm_data import llm_models_root from utils.util import force_ampere, similar, skip_gpu_memory_less_than_40gb, skip_pre_hopper, skip_single_gpu # isort: on @@ -1385,12 +1385,6 @@ def llama_v2_13b_lora_from_dir_test_harness(**llm_kwargs): def test_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call: list[int], max_loras: int, max_cpu_loras: int, repeats: int): - hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" - hf_lora_dirs = [ - f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1", - f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" - ] - # For LoRA checkpoints without finetuned embedding and lm_head, we can either: # (1) specify lora_target_modules, or # (2) provide a lora_dir to infer the lora_target_modules. 
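    # For example (an illustrative sketch of option (2) only): a config such as
    # LoraConfig(lora_dir=['<path-to-hf-lora-checkpoint>'], max_lora_rank=8) would let the
    # lora_target_modules be inferred from the adapter checkpoint instead of being listed explicitly.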
@@ -1399,13 +1393,16 @@ def test_llama_7b_multi_lora_evict_load_new_adapters( max_lora_rank=8, max_loras=max_loras, max_cpu_loras=max_cpu_loras)) - llm = LLM(hf_model_dir, - enable_lora=True, - build_config=build_config, - fast_build=True) - check_multi_unique_lora_adapters_from_request(llm, hf_lora_dirs, - lora_adapter_count_per_call, - repeats) + check_llama_7b_multi_unique_lora_adapters_from_request( + lora_adapter_count_per_call, + repeats, + LLM, + enable_lora=True, + build_config=build_config, + fast_build=True, + max_lora_rank=8, + max_loras=max_loras, + max_cpu_loras=max_cpu_loras) @skip_gpu_memory_less_than_40gb @@ -1415,8 +1412,18 @@ def test_llama_v2_13b_lora(): @skip_gpu_memory_less_than_40gb def test_llama_7b_multi_lora(): - check_trt_python_llama_7b_multi_lora_from_request_test_harness( - max_loras=1, max_cpu_loras=8) + # For LoRA checkpoints without finetuned embedding and lm_head, we can either: + # (1) specify lora_target_modules, or + # (2) provide a lora_dir to infer the lora_target_modules. + build_config = BuildConfig(lora_config=LoraConfig( + lora_target_modules=['attn_q', 'attn_k', 'attn_v'])) + check_llama_7b_multi_lora_from_request_test_harness( + LLM, + enable_lora=True, + build_config=build_config, + fast_build=True, + max_loras=1, + max_cpu_loras=8) def llama_v2_7b_prompt_adapter_test_harness(**llm_kwargs): diff --git a/tests/unittest/llmapi/test_llm_multi_gpu.py b/tests/unittest/llmapi/test_llm_multi_gpu.py index fe6e8ced4bc..261cc344777 100644 --- a/tests/unittest/llmapi/test_llm_multi_gpu.py +++ b/tests/unittest/llmapi/test_llm_multi_gpu.py @@ -17,7 +17,7 @@ from tensorrt_llm.models.llama.model import LLaMAForCausalLM # isort: off -from .lora_test_utils import check_trt_python_llama_7b_multi_lora_from_request_test_harness +from .lora_test_utils import check_llama_7b_multi_lora_from_request_test_harness from .test_llm import ( DummyError, DummyExecutorWorker3, _test_llm_capture_request_error, _test_llm_generate_async, check_llm_return_context_logits, @@ -261,8 +261,10 @@ def test_llama_v2_13b_lora_tp2(): @pytest.mark.gpu2 @pytest.mark.part3 def test_llama_7b_multi_lora_tp2(): - check_trt_python_llama_7b_multi_lora_from_request_test_harness( + check_llama_7b_multi_lora_from_request_test_harness( + LLM, tensor_parallel_size=2, + max_lora_rank=8, max_loras=1, max_cpu_loras=8, kv_cache_config=global_kv_cache_config) diff --git a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py index 62026be2959..c16de2698c0 100644 --- a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py +++ b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py @@ -2,8 +2,9 @@ # isort: off from .test_llm import tinyllama_logits_processor_test_harness +from tensorrt_llm import LLM from tensorrt_llm.llmapi import KvCacheConfig -from .lora_test_utils import check_pytorch_llama_7b_multi_lora_from_request_test_harness +from .lora_test_utils import check_llama_7b_multi_lora_from_request_test_harness from .test_llm_pytorch import llama_7b_lora_from_dir_test_harness from .test_llm import _test_llm_capture_request_error # isort: on @@ -40,5 +41,8 @@ def test_llama_7b_lora_tp2(): @pytest.mark.gpu2 def test_llama_7b_multi_lora_tp2(): - check_pytorch_llama_7b_multi_lora_from_request_test_harness( - tensor_parallel_size=2, kv_cache_config=global_kv_cache_config) + check_llama_7b_multi_lora_from_request_test_harness( + LLM, + max_lora_rank=8, + tensor_parallel_size=2, + kv_cache_config=global_kv_cache_config) diff --git 
a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 1ec1cdce342..7a7b4e3c9fe 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -6,8 +6,8 @@ # isort: off from .lora_test_utils import ( - check_multi_unique_lora_adapters_from_request, - check_pytorch_llama_7b_multi_lora_from_request_test_harness) + check_llama_7b_multi_lora_from_request_test_harness, + check_llama_7b_multi_unique_lora_adapters_from_request) from .test_llm import ( get_model_path, global_kvcache_config, llama_model_path, llm_get_stats_async_test_harness, llm_get_stats_test_harness, prompts, @@ -203,7 +203,15 @@ def test_llama_7b_lora_default_modules() -> None: @skip_gpu_memory_less_than_40gb def test_llama_7b_multi_lora(): - check_pytorch_llama_7b_multi_lora_from_request_test_harness() + # For LoRA checkpoints without finetuned embedding and lm_head, we can either: + # (1) specify lora_target_modules, or + # (2) provide a lora_dir to infer the lora_target_modules. + lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'], + max_lora_rank=8, + max_loras=1, + max_cpu_loras=8) + check_llama_7b_multi_lora_from_request_test_harness(LLM, + lora_config=lora_config) @pytest.mark.parametrize( @@ -226,19 +234,6 @@ def test_llama_7b_multi_lora(): def test_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call: list[int], max_loras: int, max_cpu_loras: int, repeats: int): - check_llama_7b_multi_unique_lora_adapters_from_request( - lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats) - - -def check_llama_7b_multi_unique_lora_adapters_from_request( - lora_adapter_count_per_call: list[int], max_loras: int, - max_cpu_loras: int, repeats: int): - hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" - hf_lora_dirs = [ - f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1", - f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0" - ] - # For LoRA checkpoints without finetuned embedding and lm_head, we can either: # (1) specify lora_target_modules, or # (2) provide a lora_dir to infer the lora_target_modules. @@ -246,15 +241,14 @@ def check_llama_7b_multi_unique_lora_adapters_from_request( max_lora_rank=8, max_loras=max_loras, max_cpu_loras=max_cpu_loras) - - llm = LLM(hf_model_dir, - lora_config=lora_config, - # Disable CUDA graph - # TODO: remove this once we have a proper fix for CUDA graph in LoRA - cuda_graph_config=None) - check_multi_unique_lora_adapters_from_request(llm, hf_lora_dirs, - lora_adapter_count_per_call, - repeats) + check_llama_7b_multi_unique_lora_adapters_from_request( + lora_adapter_count_per_call, + repeats, + LLM, + lora_config=lora_config, + # Disable CUDA graph + # TODO: remove this once we have a proper fix for CUDA graph in LoRA + cuda_graph_config=None) @pytest.mark.parametrize( @@ -284,14 +278,17 @@ def _check_contains_expected_message(stdout: str, stderr: str): " LoRA task was evicted from LoRA CPU cache, then its reuse is currently not supported." 
return note_in_message in stderr + lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'], + max_lora_rank=8, + max_loras=max_loras, + max_cpu_loras=max_cpu_loras) with EnvVarsContextManager({"TLLM_WORKER_USE_SINGLE_PROCESS": "1"}): child_stdout, child_stderr = run_function_in_sub_process( target=check_llama_7b_multi_unique_lora_adapters_from_request, - args=(lora_adapter_count_per_call, max_loras, max_cpu_loras, - repeats), + args=(lora_adapter_count_per_call, repeats, LLM), # Disable CUDA graph # TODO: remove this once we have a proper fix for CUDA graph in LoRA - kwargs={"cuda_graph_config": None}, + kwargs={"cuda_graph_config": None"lora_config": lora_config}, stop_waiting_criteria=_check_contains_expected_message) assert _check_contains_expected_message(child_stdout, child_stderr) From ed68f4963ca2efd42e89a20b1e4f07a09c9ce44f Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:05:00 +0000 Subject: [PATCH 19/32] Change the 'should include adapter weights with request' to be based on presence in LoRA CPU cache instead of LoraManager's python cache and remove this optimization from TRT-python flow. Update tests to cover a previously failed case with repeats in the same llm.generate call, remove test_llama_7b_multi_lora as its test case is now covered by test_llama_7b_multi_lora_evict_load_new_adapters with repeats_per_call>1 Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tensorrt_llm/executor/worker.py | 21 +++++--- tensorrt_llm/lora_manager.py | 17 ++++++- tests/unittest/llmapi/lora_test_utils.py | 22 +++++---- tests/unittest/llmapi/test_llm.py | 37 +++----------- tests/unittest/llmapi/test_llm_pytorch.py | 59 +++++++++-------------- 5 files changed, 75 insertions(+), 81 deletions(-) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index a82d0d71e5f..b3b1094eb22 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -150,13 +150,21 @@ def _create_engine(): self._runtime_model_config = _engine_config_to_model_config( engine_config) if engine_config.build_config.plugin_config.lora_plugin: - self._lora_manager = LoraManager() + # PeftCacheManager is at Executor->ExecutorImpl->TrtGptModel->mPeftCacheManager + # that is hard to access, therefore, for now the python optimization that needs + # the peft cache manager is not available in TRT-python flow. 
+ self._lora_manager = LoraManager(cpp_peft_cache_manager=None) if engine_config.build_config.max_prompt_embedding_table_size > 0: self._prompt_adapter_manager = PromptAdapterManager() if getattr(executor_config, "backend", "") == "pytorch" and lora_config is not None: - self._lora_manager = LoraManager() + from tensorrt_llm._torch.pyexecutor.resource_manager import \ + ResourceManagerType + peft_cache_manager = self.engine.resource_manager.resource_managers.get( + ResourceManagerType.PEFT_CACHE_MANAGER) + self._lora_manager = LoraManager( + cpp_peft_cache_manager=peft_cache_manager.impl) lora_model_config = self.engine.model_engine.lora_model_config assert lora_model_config is not None self._lora_model_config = lora_model_config @@ -362,15 +370,16 @@ def _load_prompt_adapter(self, def _enqueue_request(self, request: GenerationRequest) -> int: assert request.id is not None if self._lora_manager is not None and request.lora_request is not None: - loaded_new_lora_adapter = self._load_lora_adapter( - request.lora_request) + adapter_in_cache = self._lora_manager.is_adapter_in_cpu_cache( + request.lora_request.adapter_id) + self._load_lora_adapter(request.lora_request) uid = str(request.lora_request.adapter_id) lora_config = tllm.LoraConfig( task_id=request.lora_request.adapter_id, weights=self._lora_manager.cpp_lora_weights[uid] - if loaded_new_lora_adapter else None, + if not adapter_in_cache else None, config=self._lora_manager.cpp_lora_config[uid] - if loaded_new_lora_adapter else None) + if not adapter_in_cache else None) else: lora_config = None diff --git a/tensorrt_llm/lora_manager.py b/tensorrt_llm/lora_manager.py index 3c40917a194..fd648824a3c 100644 --- a/tensorrt_llm/lora_manager.py +++ b/tensorrt_llm/lora_manager.py @@ -11,6 +11,8 @@ import torch import yaml +from tensorrt_llm.bindings import internal as tb_internal + from ._utils import DictConversion, pad_vocab_size, release_gc, str_dtype_to_torch, torch_to_numpy from .layers.linear import ColumnLinear from .mapping import Mapping @@ -436,7 +438,7 @@ class LoraManager(object): "mlp_gate_up": 18, } - def __init__(self): + def __init__(self, cpp_peft_cache_manager: tb_internal.batch_manager.PeftCacheManager | None): """Constructor.""" # _lora_uid_to_low_ranks: dict[str -> dict[int -> dict[str -> int]]] # { @@ -473,6 +475,19 @@ def __init__(self): self._cpp_lora_weights: Dict[str, torch.Tensor] = {} # on cpu self._cpp_lora_config: Dict[str, torch.Tensor] = {} # on cpu self.lora_target_modules: List[str] = [] + self._cpp_peft_cache_manager = cpp_peft_cache_manager + + def is_adapter_in_cpu_cache(self, adapter_uid: int) -> bool: + """Best effort to check if a LoRA adapter is in the LoRA CPU cache. + + If no peft_cache_manager instance wasn't given at the construction of this LoraManager instance, + then False is returned. + """ + return ( + self._cpp_peft_cache_manager.is_task_cached(adapter_uid) + if self._cpp_peft_cache_manager + else False + ) @staticmethod def get_missing_qkv_modules(lora_target_modules): diff --git a/tests/unittest/llmapi/lora_test_utils.py b/tests/unittest/llmapi/lora_test_utils.py index 5f6cdf66730..05d9d15da50 100644 --- a/tests/unittest/llmapi/lora_test_utils.py +++ b/tests/unittest/llmapi/lora_test_utils.py @@ -9,15 +9,14 @@ def check_llama_7b_multi_unique_lora_adapters_from_request( - lora_adapter_count_per_call: list[int], repeats: int, - llm_class: Type[BaseLLM], **llm_kwargs): - """Calls llm.generate s.t. 
for each C in lora_adapter_count_per_call, llm.generate is called with C requests, - where each request is configured with a unique LoRA adapter ID. This entire process is done in a loop 'repeats' - times with the same requests. + lora_adapter_count_per_call: list[int], repeat_calls: int, + repeats_per_call: int, llm_class: Type[BaseLLM], **llm_kwargs): + """Calls llm.generate s.t. for each C in lora_adapter_count_per_call, llm.generate is called with C requests + repeated repeats_per_call times, where each request is configured with a unique LoRA adapter ID. + This entire process is done in a loop 'repeats_per_call' times with the same requests. Asserts the output of each llm.generate call is similar to the expected. """ # noqa: D205 total_lora_adapters = sum(lora_adapter_count_per_call) - hf_model_dir = f"{llm_models_root()}/llama-models/llama-7b-hf" hf_lora_dirs = [ f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1", @@ -49,16 +48,19 @@ def check_llama_7b_multi_unique_lora_adapters_from_request( # Perform repeats of the same requests to test reuse and reload of adapters previously unloaded from cache try: - for _ in range(repeats): + for _ in range(repeat_calls): last_idx = 0 for adapter_count in lora_adapter_count_per_call: sampling_params = SamplingParams(max_tokens=20) outputs = llm.generate( - prompts_to_generate[last_idx:last_idx + adapter_count], + prompts_to_generate[last_idx:last_idx + adapter_count] * + repeats_per_call, sampling_params, - lora_request=lora_requests[last_idx:last_idx + adapter_count]) + lora_request=lora_requests[last_idx:last_idx + adapter_count] * + repeats_per_call) for output, ref in zip( - outputs, references[last_idx:last_idx + adapter_count]): + outputs, references[last_idx:last_idx + adapter_count] * + repeats_per_call): assert similar(output.outputs[0].text, ref) last_idx += adapter_count finally: diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 0cf0e02539a..bda6fdf3fed 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -49,9 +49,7 @@ # isort: off sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/..") from gc_utils import assert_resource_freed -from llmapi.lora_test_utils import ( - check_llama_7b_multi_lora_from_request_test_harness, - check_llama_7b_multi_unique_lora_adapters_from_request) +from llmapi.lora_test_utils import check_llama_7b_multi_unique_lora_adapters_from_request from utils.llm_data import llm_models_root from utils.util import force_ampere, similar, skip_gpu_memory_less_than_40gb, skip_pre_hopper, skip_single_gpu # isort: on @@ -1366,25 +1364,21 @@ def llama_v2_13b_lora_from_dir_test_harness(**llm_kwargs): @pytest.mark.parametrize( - "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats", + "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeat_calls, repeats_per_call", [ - # Test eviction and loading of new adapters in the evicted space, within a single llm.generate call - ([ - 5, - ], 2, 2, 1), # Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single - # llm.generate call + # llm.generate call, that's repeated twice. 
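        # Reading the first case (illustrative): ([2], 1, 2, 2, 3) means 2 unique adapters in the single
        # llm.generate call, a GPU cache of 1 adapter, a CPU cache of 2 adapters, the whole call sequence
        # repeated twice (repeat_calls=2), and every request sent 3 times within each call (repeats_per_call=3).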
([ 2, - ], 1, 2, 2), + ], 1, 2, 2, 3), # Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU # cache size < LoRA CPU cache size - ([2, 2, 2], 1, 3, 1), + ([2, 2, 2], 1, 3, 1, 1), ]) @skip_gpu_memory_less_than_40gb def test_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call: list[int], max_loras: int, - max_cpu_loras: int, repeats: int): + max_cpu_loras: int, repeat_calls: int, repeats_per_call: int): # For LoRA checkpoints without finetuned embedding and lm_head, we can either: # (1) specify lora_target_modules, or # (2) provide a lora_dir to infer the lora_target_modules. @@ -1395,7 +1389,8 @@ def test_llama_7b_multi_lora_evict_load_new_adapters( max_cpu_loras=max_cpu_loras)) check_llama_7b_multi_unique_lora_adapters_from_request( lora_adapter_count_per_call, - repeats, + repeat_calls, + repeats_per_call, LLM, enable_lora=True, build_config=build_config, @@ -1410,22 +1405,6 @@ def test_llama_v2_13b_lora(): llama_v2_13b_lora_from_dir_test_harness() -@skip_gpu_memory_less_than_40gb -def test_llama_7b_multi_lora(): - # For LoRA checkpoints without finetuned embedding and lm_head, we can either: - # (1) specify lora_target_modules, or - # (2) provide a lora_dir to infer the lora_target_modules. - build_config = BuildConfig(lora_config=LoraConfig( - lora_target_modules=['attn_q', 'attn_k', 'attn_v'])) - check_llama_7b_multi_lora_from_request_test_harness( - LLM, - enable_lora=True, - build_config=build_config, - fast_build=True, - max_loras=1, - max_cpu_loras=8) - - def llama_v2_7b_prompt_adapter_test_harness(**llm_kwargs): hf_model_dir = get_model_path("llama-models-v2/llama-v2-7b-hf") hf_prompt_adapter_dir = get_model_path("llama-models-v2/llama_tweet_ptune") diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 7a7b4e3c9fe..27e3220a33a 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -5,9 +5,7 @@ from tensorrt_llm.sampling_params import SamplingParams # isort: off -from .lora_test_utils import ( - check_llama_7b_multi_lora_from_request_test_harness, - check_llama_7b_multi_unique_lora_adapters_from_request) +from .lora_test_utils import check_llama_7b_multi_unique_lora_adapters_from_request from .test_llm import ( get_model_path, global_kvcache_config, llama_model_path, llm_get_stats_async_test_harness, llm_get_stats_test_harness, prompts, @@ -201,39 +199,22 @@ def test_llama_7b_lora_default_modules() -> None: llm.shutdown() -@skip_gpu_memory_less_than_40gb -def test_llama_7b_multi_lora(): - # For LoRA checkpoints without finetuned embedding and lm_head, we can either: - # (1) specify lora_target_modules, or - # (2) provide a lora_dir to infer the lora_target_modules. - lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'], - max_lora_rank=8, - max_loras=1, - max_cpu_loras=8) - check_llama_7b_multi_lora_from_request_test_harness(LLM, - lora_config=lora_config) - - @pytest.mark.parametrize( - "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats", + "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeat_calls, repeats_per_call", [ - # Test eviction and loading of new adapters in the evicted space, within a single llm.generate call - ([ - 5, - ], 2, 2, 1), # Test eviction and re-loading a previously evicted adapter from the LoRA GPU cache, within a single - # llm.generate call + # llm.generate call, that's repeated twice. 
([ 2, - ], 1, 2, 2), + ], 1, 2, 2, 3), # Test eviction and loading of new adapters in the evicted space, over several llm.generate calls, with LoRA GPU # cache size < LoRA CPU cache size - ([2, 2, 2], 1, 3, 1), + ([2, 2, 2], 1, 3, 1, 1), ]) @skip_gpu_memory_less_than_40gb def test_llama_7b_multi_lora_evict_load_new_adapters( lora_adapter_count_per_call: list[int], max_loras: int, - max_cpu_loras: int, repeats: int): + max_cpu_loras: int, repeat_calls: int, repeats_per_call: int): # For LoRA checkpoints without finetuned embedding and lm_head, we can either: # (1) specify lora_target_modules, or # (2) provide a lora_dir to infer the lora_target_modules. @@ -243,7 +224,8 @@ def test_llama_7b_multi_lora_evict_load_new_adapters( max_cpu_loras=max_cpu_loras) check_llama_7b_multi_unique_lora_adapters_from_request( lora_adapter_count_per_call, - repeats, + repeat_calls, + repeats_per_call, LLM, lora_config=lora_config, # Disable CUDA graph @@ -252,24 +234,30 @@ def test_llama_7b_multi_lora_evict_load_new_adapters( @pytest.mark.parametrize( - "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeats", + "lora_adapter_count_per_call, max_loras, max_cpu_loras, repeat_calls, repeats_per_call", [ # Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU - # cache over more than a single llm.generate call - ([1, 1], 1, 1, 2), - # Test eviction, reloading new adapters and reloading previously evicted adapters from the LoRA CPU cache & GPU - # cache over a single llm.generate call + # cache over multiple llm.generate call repeated twice (two calls with the same requests): + # At the end of the 1st llm.generate call: + # The LoRA caches should contain adapters 1, 2 and shouldn't contain adapter 0 (it should have been evicted). + # So in the 2nd call, the worker should: + # - Send req0 with adapter 0 weights (because it was previously evicted) + # - Send the other two requests without their adapter weights as they're already in LoRA CPU cache + # Then, handling of req0 that has weights but not in the cache should evict one of the other two adapters from + # the cache, causing its request to fail because its weights aren't with the request and aren't in LoRA cache. ([ - 5, - ], 2, 2, 2), + 3, + ], 2, 2, 2, 1), ]) @skip_gpu_memory_less_than_40gb def test_llama_7b_multi_lora_load_previously_cpu_cache_evicted_adapter_fails( lora_adapter_count_per_call: list[int], max_loras: int, - max_cpu_loras: int, repeats: int): + max_cpu_loras: int, repeat_calls: int, repeats_per_call: int): """Tests that trying to load a LoRA adapter after it was evicted from CPU cache fails with the expected message, as this feature is currently not supported in favor of the performance improvement of not sending the LoRA weights with every request after the first time. + NOTE: This test assumes the requests are handled in the order they're sent, if that's not true, then this test + may not get the error it expects (and get no error) which would cause it to fail. 
""" # noqa: D205 def _check_contains_expected_message(stdout: str, stderr: str): @@ -285,7 +273,8 @@ def _check_contains_expected_message(stdout: str, stderr: str): with EnvVarsContextManager({"TLLM_WORKER_USE_SINGLE_PROCESS": "1"}): child_stdout, child_stderr = run_function_in_sub_process( target=check_llama_7b_multi_unique_lora_adapters_from_request, - args=(lora_adapter_count_per_call, repeats, LLM), + args=(lora_adapter_count_per_call, repeat_calls, repeats_per_call, + LLM), # Disable CUDA graph # TODO: remove this once we have a proper fix for CUDA graph in LoRA kwargs={"cuda_graph_config": None"lora_config": lora_config}, From b1d0bf65b7e85dff53039c178538d635a2a87ad3 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:05:00 +0000 Subject: [PATCH 20/32] test_llm_pytorch.py - Minor docstring fix, readability improvement Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/test_llm_pytorch.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 27e3220a33a..df2da896971 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -257,7 +257,7 @@ def test_llama_7b_multi_lora_load_previously_cpu_cache_evicted_adapter_fails( message, as this feature is currently not supported in favor of the performance improvement of not sending the LoRA weights with every request after the first time. NOTE: This test assumes the requests are handled in the order they're sent, if that's not true, then this test - may not get the error it expects (and get no error) which would cause it to fail. + may not get any error at all, which would cause it to fail. 
""" # noqa: D205 def _check_contains_expected_message(stdout: str, stderr: str): @@ -275,9 +275,12 @@ def _check_contains_expected_message(stdout: str, stderr: str): target=check_llama_7b_multi_unique_lora_adapters_from_request, args=(lora_adapter_count_per_call, repeat_calls, repeats_per_call, LLM), - # Disable CUDA graph - # TODO: remove this once we have a proper fix for CUDA graph in LoRA - kwargs={"cuda_graph_config": None"lora_config": lora_config}, + kwargs={ + "lora_config": lora_config, + # Disable CUDA graph + # TODO: remove this once we have a proper fix for CUDA graph in LoRA + "cuda_graph_config": None + }, stop_waiting_criteria=_check_contains_expected_message) assert _check_contains_expected_message(child_stdout, child_stderr) From b0f91f2c9b16312d1cbf279e42c5a7740a0a7796 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:05:01 +0000 Subject: [PATCH 21/32] Update test_llm_multi_gpu_pytorch.py to also disable cuda_graph until it's fixed Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py index c16de2698c0..852849881af 100644 --- a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py +++ b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py @@ -45,4 +45,7 @@ def test_llama_7b_multi_lora_tp2(): LLM, max_lora_rank=8, tensor_parallel_size=2, - kv_cache_config=global_kv_cache_config) + kv_cache_config=global_kv_cache_config, + # Disable CUDA graph + # TODO: remove this once we have a proper fix for CUDA graph in LoRA + cuda_graph_config=None) From 10149b7c86204fa5c68a4fde72fbd35dbc227336 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:05:01 +0000 Subject: [PATCH 22/32] Fix formatting of lora_test_utils.py Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/lora_test_utils.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/unittest/llmapi/lora_test_utils.py b/tests/unittest/llmapi/lora_test_utils.py index 05d9d15da50..f90441f9af5 100644 --- a/tests/unittest/llmapi/lora_test_utils.py +++ b/tests/unittest/llmapi/lora_test_utils.py @@ -12,7 +12,7 @@ def check_llama_7b_multi_unique_lora_adapters_from_request( lora_adapter_count_per_call: list[int], repeat_calls: int, repeats_per_call: int, llm_class: Type[BaseLLM], **llm_kwargs): """Calls llm.generate s.t. for each C in lora_adapter_count_per_call, llm.generate is called with C requests - repeated repeats_per_call times, where each request is configured with a unique LoRA adapter ID. + repeated 'repeats_per_call' times, where each request is configured with a unique LoRA adapter ID. This entire process is done in a loop 'repeats_per_call' times with the same requests. Asserts the output of each llm.generate call is similar to the expected. 
""" # noqa: D205 @@ -56,7 +56,8 @@ def check_llama_7b_multi_unique_lora_adapters_from_request( prompts_to_generate[last_idx:last_idx + adapter_count] * repeats_per_call, sampling_params, - lora_request=lora_requests[last_idx:last_idx + adapter_count] * + lora_request=lora_requests[last_idx:last_idx + + adapter_count] * repeats_per_call) for output, ref in zip( outputs, references[last_idx:last_idx + adapter_count] * @@ -102,10 +103,12 @@ def check_llama_7b_multi_lora_from_request_test_harness( llm = llm_class(hf_model_dir, **llm_kwargs) try: - outputs = llm.generate( - prompts, - sampling_params, - lora_request=[None, lora_req1, lora_req2, None, lora_req1, lora_req2]) + outputs = llm.generate(prompts, + sampling_params, + lora_request=[ + None, lora_req1, lora_req2, None, lora_req1, + lora_req2 + ]) finally: llm.shutdown() for output, ref, key_word in zip(outputs, references, key_words): From 9e9e02ebdc625abecc74eb924842d545c8d6a02a Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:05:02 +0000 Subject: [PATCH 23/32] Improve test case documentation Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/test_llm_pytorch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index df2da896971..c9962f8bbb4 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -244,7 +244,8 @@ def test_llama_7b_multi_lora_evict_load_new_adapters( # - Send req0 with adapter 0 weights (because it was previously evicted) # - Send the other two requests without their adapter weights as they're already in LoRA CPU cache # Then, handling of req0 that has weights but not in the cache should evict one of the other two adapters from - # the cache, causing its request to fail because its weights aren't with the request and aren't in LoRA cache. + # the cache, causing that evicted adapter's request to fail because its weights aren't with the request and + # aren't in LoRA cache. ([ 3, ], 2, 2, 2, 1), From f3c330cab92cd14b42724e2de4445a91e77a14f8 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 09:05:02 +0000 Subject: [PATCH 24/32] Fix docstring of is_adapter_in_cpu_cache Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tensorrt_llm/lora_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/lora_manager.py b/tensorrt_llm/lora_manager.py index fd648824a3c..df1790f7646 100644 --- a/tensorrt_llm/lora_manager.py +++ b/tensorrt_llm/lora_manager.py @@ -480,7 +480,7 @@ def __init__(self, cpp_peft_cache_manager: tb_internal.batch_manager.PeftCacheMa def is_adapter_in_cpu_cache(self, adapter_uid: int) -> bool: """Best effort to check if a LoRA adapter is in the LoRA CPU cache. - If no peft_cache_manager instance wasn't given at the construction of this LoraManager instance, + If no peft_cache_manager instance was given at the construction of this LoraManager instance, then False is returned. 
""" return ( From abad5c43df5a630bf147a1fb3e3174d1a8872414 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 11:29:38 +0000 Subject: [PATCH 25/32] Add 'is_task_cached' method binding to CPP PeftCacheManager class Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp index e31269d1fd9..255b0f8efa3 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp @@ -469,7 +469,8 @@ void tb::BasePeftCacheManagerBindings::initBindings(py::module_& m) py::classh(m, "PeftCacheManager") .def(py::init(), - py::arg("config"), py::arg("model_config"), py::arg("world_config"), py::arg("buffer_manager")); + py::arg("config"), py::arg("model_config"), py::arg("world_config"), py::arg("buffer_manager")) + .def("is_task_cached", &tb::PeftCacheManager::isTaskCached, py::arg("taskId")); py::classh(m, "NoOpPeftCacheManager").def(py::init()); } From b6e99e98598a794eefed113435d6ad00912dd7a7 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 12:45:10 +0000 Subject: [PATCH 26/32] Improve comment over not supporting LoRA optimization in TRT-python flow Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tensorrt_llm/executor/worker.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index b3b1094eb22..3a8675fdab3 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -150,9 +150,11 @@ def _create_engine(): self._runtime_model_config = _engine_config_to_model_config( engine_config) if engine_config.build_config.plugin_config.lora_plugin: - # PeftCacheManager is at Executor->ExecutorImpl->TrtGptModel->mPeftCacheManager - # that is hard to access, therefore, for now the python optimization that needs - # the peft cache manager is not available in TRT-python flow. + # TODO(azuker): Passing peft cache manager to LoraManager is used for LoRA optimization + # of not sending adapter weights with LLM request when adapter is already loaded in LoRA + # CPU cache. Getting the peft cache manager from this point in the TRT flow is currently + # not supported (it's at the CPP Executor->ExecutorImpl->TrtGptModel->mPeftCacheManager) + # therefore for now this LoRA optimization is not available in TRT-python flow. 
self._lora_manager = LoraManager(cpp_peft_cache_manager=None) if engine_config.build_config.max_prompt_embedding_table_size > 0: self._prompt_adapter_manager = PromptAdapterManager() From b9d6c9ee72d11d39f4231249dc0149e554b79b16 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Tue, 15 Jul 2025 13:53:42 +0000 Subject: [PATCH 27/32] Change cpp_peft_cache_manager argument in LoraManager constructor to be optional, improve docstring Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tensorrt_llm/executor/worker.py | 8 ++++---- tensorrt_llm/lora_manager.py | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 3a8675fdab3..5f25bf619ed 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -151,10 +151,10 @@ def _create_engine(): engine_config) if engine_config.build_config.plugin_config.lora_plugin: # TODO(azuker): Passing peft cache manager to LoraManager is used for LoRA optimization - # of not sending adapter weights with LLM request when adapter is already loaded in LoRA - # CPU cache. Getting the peft cache manager from this point in the TRT flow is currently - # not supported (it's at the CPP Executor->ExecutorImpl->TrtGptModel->mPeftCacheManager) - # therefore for now this LoRA optimization is not available in TRT-python flow. + # (see LoraManager constructor docstring). Getting the peft cache manager from this + # point in the TRT flow is currently not supported (it's at the CPP + # Executor->ExecutorImpl->TrtGptModel->mPeftCacheManager) therefore for now this LoRA + # optimization is not available in TRT-python flow. self._lora_manager = LoraManager(cpp_peft_cache_manager=None) if engine_config.build_config.max_prompt_embedding_table_size > 0: self._prompt_adapter_manager = PromptAdapterManager() diff --git a/tensorrt_llm/lora_manager.py b/tensorrt_llm/lora_manager.py index df1790f7646..3f87286024b 100644 --- a/tensorrt_llm/lora_manager.py +++ b/tensorrt_llm/lora_manager.py @@ -438,8 +438,16 @@ class LoraManager(object): "mlp_gate_up": 18, } - def __init__(self, cpp_peft_cache_manager: tb_internal.batch_manager.PeftCacheManager | None): - """Constructor.""" + def __init__( + self, cpp_peft_cache_manager: tb_internal.batch_manager.PeftCacheManager | None = None + ): + """Constructor. + + Args: + cpp_peft_cache_manager (PeftCacheManager, optional): used by is_adapter_in_cpu_cache method, that's used for + a performance optimization with LoRA of not sending the LoRA adapter weights with every LLM request when + the adapter is already loaded in the LoRA CPU cache. + """ # _lora_uid_to_low_ranks: dict[str -> dict[int -> dict[str -> int]]] # { # uid: { @@ -480,8 +488,8 @@ def __init__(self, cpp_peft_cache_manager: tb_internal.batch_manager.PeftCacheMa def is_adapter_in_cpu_cache(self, adapter_uid: int) -> bool: """Best effort to check if a LoRA adapter is in the LoRA CPU cache. - If no peft_cache_manager instance was given at the construction of this LoraManager instance, - then False is returned. + If no cpp_peft_cache_manager instance was given at the construction of this LoraManager instance, then False is + returned. 
""" return ( self._cpp_peft_cache_manager.is_task_cached(adapter_uid) From 6189c473c71d7ffe1817341854c70926bcbbe853 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Wed, 16 Jul 2025 07:38:18 +0000 Subject: [PATCH 28/32] Fix typo in lora test Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/lora_test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unittest/llmapi/lora_test_utils.py b/tests/unittest/llmapi/lora_test_utils.py index f90441f9af5..1b2323804fa 100644 --- a/tests/unittest/llmapi/lora_test_utils.py +++ b/tests/unittest/llmapi/lora_test_utils.py @@ -113,4 +113,4 @@ def check_llama_7b_multi_lora_from_request_test_harness( llm.shutdown() for output, ref, key_word in zip(outputs, references, key_words): assert similar(output.outputs[0].text, - ref) or key_word in output.outputs[0].txt + ref) or key_word in output.outputs[0].text From e4ff01ab7586d37623f406f2dea537f4f97071dd Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Wed, 16 Jul 2025 08:11:21 +0000 Subject: [PATCH 29/32] Revert added note in exception message in TRT flow, as the LoRA optimization is no longer active in TRT-python flow, revert the change in its test as well. Improve exception message seen in torch flow Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp | 8 ++++---- cpp/tests/executor/executorTest.cpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp index 05c59080655..f513f2a3a10 100644 --- a/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp @@ -277,9 +277,7 @@ void PeftCacheManager::addRequestPeft(std::shared_ptr llmRequest, bo if (!isTaskCached(taskId)) { std::string errMsg - = "LoRA task " + std::to_string(taskId) + " not found in cache. Please send LoRA weights with request." - " Note that currently a request with LoRA task that was already loaded is sent without its LoRA weights to save its serialization, copy and deserialization," - " so if this LoRA task was evicted from LoRA CPU cache, then its reuse is currently not supported."; + = "LoRA task " + std::to_string(taskId) + " not found in cache. Please send LoRA weights with request"; throw PeftTaskNotCachedException(errMsg); } } @@ -606,8 +604,10 @@ SizeType32 PeftCacheManager::determineNumPages(std::shared_ptr llmRe } if (!llmRequest->getLoraWeights().has_value()) { + auto const reqId = llmRequest->mRequestId; std::string errMsg - = "LoRA task " + std::to_string(taskId) + " has no LoRA weights and not found in cache." + = "Request ID " + std::to_string(reqId) + " has no LoRA adapter weights while configured with LoRA task " + + std::to_string(taskId) + " that's not found in LoRA CPU cache." 
" Note that currently a request with LoRA task that was already loaded is sent without its LoRA weights to save its serialization, copy and deserialization," " so if this LoRA task was evicted from LoRA CPU cache, then its reuse is currently not supported."; throw PeftTaskNotCachedException(errMsg); diff --git a/cpp/tests/executor/executorTest.cpp b/cpp/tests/executor/executorTest.cpp index acd4d0f7d4d..6b8c8d7eb9e 100644 --- a/cpp/tests/executor/executorTest.cpp +++ b/cpp/tests/executor/executorTest.cpp @@ -190,7 +190,7 @@ TEST_F(GptExecutorTest, missingPeftTask) if (response.hasError()) { auto err = response.getErrorMsg(); - EXPECT_EQ(0, err.find("LoRA task 10 not found in cache. Please send LoRA weights with request")); + EXPECT_EQ(err, std::string("LoRA task 10 not found in cache. Please send LoRA weights with request")); done = true; } else From 6e0b8727fb8cfcad90fa5f3bb1c13571ee2d0cff Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Thu, 17 Jul 2025 08:49:32 +0000 Subject: [PATCH 30/32] Fix LLM args in multi GPU LoRA tests Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/test_llm_multi_gpu.py | 18 ++++++++++++++---- .../llmapi/test_llm_multi_gpu_pytorch.py | 10 +++++++++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/tests/unittest/llmapi/test_llm_multi_gpu.py b/tests/unittest/llmapi/test_llm_multi_gpu.py index 261cc344777..40e657e7894 100644 --- a/tests/unittest/llmapi/test_llm_multi_gpu.py +++ b/tests/unittest/llmapi/test_llm_multi_gpu.py @@ -12,6 +12,7 @@ from tensorrt_llm.executor import GenerationExecutorProxy from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer +from tensorrt_llm.lora_manager import LoraConfig from tensorrt_llm.mapping import Mapping from tensorrt_llm.models import PretrainedConfig from tensorrt_llm.models.llama.model import LLaMAForCausalLM @@ -261,12 +262,21 @@ def test_llama_v2_13b_lora_tp2(): @pytest.mark.gpu2 @pytest.mark.part3 def test_llama_7b_multi_lora_tp2(): + # For LoRA checkpoints without finetuned embedding and lm_head, we can either: + # (1) specify lora_target_modules, or + # (2) provide a lora_dir to infer the lora_target_modules. 
+ lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'], + max_lora_rank=8, + max_loras=1, + max_cpu_loras=8) check_llama_7b_multi_lora_from_request_test_harness( LLM, - tensor_parallel_size=2, - max_lora_rank=8, - max_loras=1, - max_cpu_loras=8, + enable_lora=True, + build_config=BuildConfig(lora_config=lora_config), + fast_build=True, + max_lora_rank=lora_config.max_lora_rank, + max_loras=lora_config.max_loras, + max_cpu_loras=lora_config.max_cpu_loras, kv_cache_config=global_kv_cache_config) diff --git a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py index 852849881af..cb8dbf03c07 100644 --- a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py +++ b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py @@ -4,6 +4,7 @@ from .test_llm import tinyllama_logits_processor_test_harness from tensorrt_llm import LLM from tensorrt_llm.llmapi import KvCacheConfig +from tensorrt_llm.lora_manager import LoraConfig from .lora_test_utils import check_llama_7b_multi_lora_from_request_test_harness from .test_llm_pytorch import llama_7b_lora_from_dir_test_harness from .test_llm import _test_llm_capture_request_error @@ -41,9 +42,16 @@ def test_llama_7b_lora_tp2(): @pytest.mark.gpu2 def test_llama_7b_multi_lora_tp2(): + # For LoRA checkpoints without finetuned embedding and lm_head, we can either: + # (1) specify lora_target_modules, or + # (2) provide a lora_dir to infer the lora_target_modules. + lora_config = LoraConfig(lora_target_modules=['attn_q', 'attn_k', 'attn_v'], + max_lora_rank=8, + max_loras=1, + max_cpu_loras=8) check_llama_7b_multi_lora_from_request_test_harness( LLM, - max_lora_rank=8, + lora_config=lora_config, tensor_parallel_size=2, kv_cache_config=global_kv_cache_config, # Disable CUDA graph From 10856707a57926a80a2d53bdf7f6470e03a1888b Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Thu, 17 Jul 2025 09:10:04 +0000 Subject: [PATCH 31/32] Improve resource release in test util function run_function_in_sub_process Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/utils/util.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/tests/unittest/utils/util.py b/tests/unittest/utils/util.py index 361bd0beb72..7d5c90833a1 100644 --- a/tests/unittest/utils/util.py +++ b/tests/unittest/utils/util.py @@ -474,17 +474,21 @@ def _read_from_pipe(pipe: Connection): child_stdout = "" child_stderr = "" - total_waiting_seconds = 0 - while child_process.is_alive() and total_waiting_seconds < timeout_seconds: - child_stdout += _read_from_pipe(parent_stdout_pipe) - child_stderr += _read_from_pipe(parent_stderr_pipe) - if stop_waiting_criteria(child_stdout, child_stderr): - break - time.sleep(poll_interval_seconds) - total_waiting_seconds += poll_interval_seconds - - if child_process.is_alive(): - child_process.terminate() + try: + total_waiting_seconds = 0 + while child_process.is_alive( + ) and total_waiting_seconds < timeout_seconds: + child_stdout += _read_from_pipe(parent_stdout_pipe) + child_stderr += _read_from_pipe(parent_stderr_pipe) + if stop_waiting_criteria(child_stdout, child_stderr): + break + time.sleep(poll_interval_seconds) + total_waiting_seconds += poll_interval_seconds + finally: + parent_stdout_pipe.close() + parent_stderr_pipe.close() + if child_process.is_alive(): + child_process.terminate() assert total_waiting_seconds < timeout_seconds, "Reached timeout while waiting for 
target" return child_stdout, child_stderr From 352f4294c5629c2d5eb034f9b838e56aff433771 Mon Sep 17 00:00:00 2001 From: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> Date: Thu, 17 Jul 2025 09:17:56 +0000 Subject: [PATCH 32/32] Improve formatting - split long import line Signed-off-by: Amit Zuker <203509407+amitz-nv@users.noreply.github.com> --- tests/unittest/llmapi/test_llm_pytorch.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index c9962f8bbb4..49d6d1ee042 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -11,7 +11,11 @@ llm_get_stats_async_test_harness, llm_get_stats_test_harness, prompts, run_llm_abort_request, run_llm_with_postprocess_parallel_and_result_handler, tinyllama_logits_processor_test_harness, _test_llm_capture_request_error) -from utils.util import EnvVarsContextManager, force_ampere, run_function_in_sub_process, similar, skip_gpu_memory_less_than_40gb, skip_gpu_memory_less_than_80gb, skip_gpu_memory_less_than_138gb +from utils.util import (EnvVarsContextManager, force_ampere, + run_function_in_sub_process, similar, + skip_gpu_memory_less_than_40gb, + skip_gpu_memory_less_than_80gb, + skip_gpu_memory_less_than_138gb) from utils.llm_data import llm_models_root from tensorrt_llm.lora_manager import LoraConfig from tensorrt_llm.executor.request import LoRARequest