2 changes: 1 addition & 1 deletion .devcontainer/docker-compose.yml
@@ -23,7 +23,7 @@ services:

volumes:
- ${SOURCE_DIR}:/workspaces/tensorrt_llm
- ${LOCAL_HF_HOME}:/huggingface # HF cache
#- ${LOCAL_HF_HOME}:/huggingface # HF cache

environment:
- CCACHE_DIR=/workspaces/tensorrt_llm/cpp/.ccache
5 changes: 5 additions & 0 deletions tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
@@ -34,6 +34,7 @@ def __init__(
attn_metadata: AttentionMetadata,
spec_metadata: Optional[SpecMetadata] = None,
use_mrope: bool = False,
lora_params: Optional[dict] = None,
) -> None:
"""
Stores a CUDA graph and its associated input buffers.
@@ -68,6 +69,7 @@ def __init__(

self.attn_metadata = attn_metadata
self.spec_metadata = spec_metadata
self.lora_params = lora_params
self._output = None
self._graph = None
self.optional_extra_model_inputs = ["mrope_position_deltas"]
@@ -90,6 +92,9 @@ def capture(
"mrope_position_deltas": self.mrope_position_deltas,
}

if self.lora_params is not None:
inputs["lora_params"] = self.lora_params

# We have to do warm up runs to initialize PyTorch's
# internal states according to the docs:
# https://pytorch.org/docs/stable/notes/cuda.html#cuda-graph-semantics
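
For orientation, here is a minimal self-contained sketch of the pattern this hunk adds (not the TensorRT-LLM class itself; TinyGraphRunner, toy_model, and the "scale" entry are invented for illustration): the runner keeps an optional lora_params dict alive next to its other static inputs and folds it into the input set at capture time, so the same LoRA buffers are reused on every graph replay.

import torch


class TinyGraphRunner:
    """Minimal stand-in for a CUDA-graph runner that owns static input buffers."""

    def __init__(self, model, x, lora_params=None):
        self.model = model
        self.x = x                      # static input tensor, reused on every replay
        self.lora_params = lora_params  # optional extra inputs kept alive with the graph
        self._graph = None
        self._output = None

    def capture(self):
        inputs = {"x": self.x}
        if self.lora_params is not None:          # same pattern as the hunk above
            inputs["lora_params"] = self.lora_params

        # Warm-up on a side stream, as the PyTorch CUDA-graph notes recommend.
        side = torch.cuda.Stream()
        side.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(side):
            self.model(**inputs)
        torch.cuda.current_stream().wait_stream(side)

        self._graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(self._graph):
            self._output = self.model(**inputs)

    def run(self, new_x):
        self.x.copy_(new_x)  # refresh the static buffer in place, then replay
        self._graph.replay()
        return self._output


if torch.cuda.is_available():
    def toy_model(x, lora_params=None):
        scale = lora_params["scale"] if lora_params is not None else 1.0
        return x * scale

    x = torch.zeros(4, device="cuda")
    runner = TinyGraphRunner(toy_model, x, lora_params={"scale": 2.0})
    runner.capture()
    print(runner.run(torch.ones(4, device="cuda")))  # tensor([2., 2., 2., 2.])
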
169 changes: 147 additions & 22 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -211,6 +211,8 @@ def __init__(self,
self.micro_batches: List[BatchStatePP
| None] = [None] * self.num_micro_batches
self.send_handles = [None] * self.num_micro_batches
self.model_engine.set_lora_manager(self.resource_manager)
self.model_engine.prefetch_lora_dirs()

self.inflight_req_ids = ReqIdsSet()

@@ -274,6 +276,9 @@ def _event_loop_wrapper(self):
finally:
self._executor_loop_cleanup()

def get_lora_manager(self):
return self.model_engine.lora_manager

def start_worker(self):
self.worker_lock.acquire()
try:
52 changes: 37 additions & 15 deletions tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -386,20 +386,21 @@ def prepare_resources(self, scheduled_batch: ScheduledRequests):
self.impl.add_token(req.py_request_id)

def add_dummy_requests(
self,
request_ids: List[int],
# Note that token_nums should be past_kv_len + input_len (without
# spec decoding). The draft tokens will be added in this function,
# so we don't need to take care of it in the caller. When preparing
# token_nums, we should not take the draft tokens into account, so
# don't use the kv_cache_manager.max_seq_len, which includes both
# extra tokens and draft tokens.
token_nums: Optional[List[int]] = None,
is_gen: bool = False,
prepare_resource: bool = True,
max_num_draft_tokens: int = 0,
use_mrope: bool = False,
max_beam_width: int = 1,
self,
request_ids: List[int],
# Note that token_nums should be past_kv_len + input_len (without
# spec decoding). The draft tokens will be added in this function,
# so we don't need to take care of it in the caller. When preparing
# token_nums, we should not take the draft tokens into account, so
# don't use the kv_cache_manager.max_seq_len, which includes both
# extra tokens and draft tokens.
token_nums: Optional[List[int]] = None,
is_gen: bool = False,
prepare_resource: bool = True,
max_num_draft_tokens: int = 0,
use_mrope: bool = False,
max_beam_width: int = 1,
lora_request: Optional[List] = None, # TODO smor fill type hint
):
beam_width = max_beam_width
requests = []
@@ -419,14 +420,31 @@ def add_dummy_requests(
# Using 1 instead of 0 prevents NaN during warmup in e.g. Deepseek
mrope_position_deltas = torch.zeros(
1, device="cuda", dtype=torch.int32) if use_mrope else None

lora_task_id = None
lora_weights = None
lora_config = None

if lora_request is not None and i < len(lora_request):
# TODO smor currently work with single adapter only, not sure how this should work with request ids
# print("SMOR, resource manager, handling lora_request parameter in add_dummy_requests, how it works with multiple requests?")
# from IPython import embed
# embed()
lora_task_id = lora_request[i].task_id
lora_weights = lora_request[i].weights
lora_config = lora_request[i].config

req = LlmRequest(request_id=req_id,
max_new_tokens=1,
input_tokens=[1] * token_num,
sampling_config=SamplingConfig(
sampling_params._get_sampling_config()),
is_streaming=False,
mrope_position_deltas=mrope_position_deltas,
encoder_input_tokens=encoder_input_tokens)
encoder_input_tokens=encoder_input_tokens,
lora_task_id=lora_task_id,
lora_weights=lora_weights,
lora_config=lora_config)
req.is_dummy_request = True
req.paged_kv_block_ids = []
if prepare_resource:
@@ -1241,6 +1259,10 @@ def ensure_batch(self,
return self.impl.ensure_batch(context_batch, generation_batch,
reset_gpu_cache)

def get_lora_manager(self):
assert self._lora_manager is not None, "Lora manager not initialized"
return self._lora_manager

def get_max_resource_count(self) -> int:
return 0

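
To make the token_nums comment in add_dummy_requests above concrete, here is a small illustrative calculation (the variable values are hypothetical; only the rule comes from the comment): the caller passes past_kv_len + input_len and leaves any draft-token allowance to the function.

# Illustrative only: sizing token_nums for add_dummy_requests.
past_kv_len = 96           # tokens already in the KV cache for this request
input_len = 32             # new input tokens in this step
max_num_draft_tokens = 4   # added inside add_dummy_requests, not by the caller

token_nums = [past_kv_len + input_len]   # [128]; no draft or extra-token padding
# Do not use kv_cache_manager.max_seq_len here: it already folds in the
# extra-token and draft-token allowances that the function accounts for itself.
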
7 changes: 1 addition & 6 deletions tensorrt_llm/executor/worker.py
@@ -161,12 +161,7 @@ def _create_engine():

if getattr(executor_config, "backend",
"") == "pytorch" and lora_config is not None:
from tensorrt_llm._torch.pyexecutor.resource_manager import \
ResourceManagerType
peft_cache_manager = self.engine.resource_manager.resource_managers.get(
ResourceManagerType.PEFT_CACHE_MANAGER)
self._lora_manager = LoraManager(
cpp_peft_cache_manager=peft_cache_manager.impl)
self._lora_manager = self.engine.get_lora_manager()
lora_model_config = self.engine.model_engine.lora_model_config
assert lora_model_config is not None
self._lora_model_config = lora_model_config
8 changes: 7 additions & 1 deletion tensorrt_llm/lora_manager.py
@@ -8,7 +8,7 @@
from dataclasses import dataclass, field
from functools import lru_cache
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union

import numpy as np
import torch
@@ -241,6 +241,7 @@ class LoraConfig(DictConversion):
trtllm_modules_to_hf_modules: Dict[str, str] = field(default_factory=dict)
max_loras: int | None = None
max_cpu_loras: int | None = None
lora_request: Optional[List[Any]] = None # TODO smor fix

def __post_init__(self):
assert self.lora_ckpt_source in ["hf", "nemo"], (
@@ -747,6 +748,11 @@ def __init__(
self._cpp_lora_weights: Dict[str, torch.Tensor] = {} # on cpu
self._cpp_lora_config: Dict[str, torch.Tensor] = {} # on cpu
self.lora_target_modules: List[str] = []
self._cpp_peft_cache_manager: Optional[tb_internal.batch_manager.PeftCacheManager] = None

def set_cpp_peft_cache_manager(
self, cpp_peft_cache_manager: tb_internal.batch_manager.PeftCacheManager
):
self._cpp_peft_cache_manager = cpp_peft_cache_manager

def is_adapter_in_cpu_cache(self, adapter_uid: int) -> bool:
101 changes: 100 additions & 1 deletion tests/unittest/llmapi/test_llm_pytorch.py
@@ -4,7 +4,7 @@

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.llmapi.llm_args import PeftCacheConfig
from tensorrt_llm.llmapi.llm_args import CudaGraphConfig, PeftCacheConfig
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
from tensorrt_llm.sampling_params import SamplingParams

@@ -783,3 +783,102 @@ def test_gqa_nemo_lora(tmp_path):
f"got: {base_outputs[0].outputs[0].text}"
finally:
llm.shutdown()


@pytest.mark.parametrize("cuda_graph_config",
[None, CudaGraphConfig(max_batch_size=1)])
def test_lora_dir_with_graph(cuda_graph_config):
lora_req = LoRARequest(
"task-0", 0, f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1")

lora_config = LoraConfig(
lora_dir=[f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"],
max_lora_rank=8,
lora_request=[lora_req])

llm = LLM(model=f"{llm_models_root()}/llama-models/llama-7b-hf",
lora_config=lora_config,
cuda_graph_config=cuda_graph_config)

prompts = [
"美国的首都在哪里? \n答案:",
]
references = [
"美国的首都是华盛顿。\n\n美国的",
]
sampling_params = SamplingParams(max_tokens=20)
lora_request = [lora_req]

outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)

assert similar(outputs[0].outputs[0].text, references[0])
print(f"lora output: {outputs[0].outputs[0].text}")
print(f"ref output: {references[0]}")


def test_lora_graph_single_request():
lora_req = LoRARequest(
"task-0", 0, f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1")

lora_config = LoraConfig(
lora_dir=[f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"],
max_lora_rank=8,
lora_request=[lora_req])

llm = LLM(model=f"{llm_models_root()}/llama-models/llama-7b-hf",
lora_config=lora_config,
cuda_graph_config=CudaGraphConfig(max_batch_size=1))

prompts = [
"美国的首都在哪里? \n答案:",
]
references = [
"美国的首都是华盛顿。\n\n美国的",
]
sampling_params = SamplingParams(max_tokens=20)
lora_request = [lora_req]

outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)

assert similar(outputs[0].outputs[0].text, references[0])
print(f"lora output: {outputs[0].outputs[0].text}")
print(f"ref output: {references[0]}")

Comment on lines +819 to +846

🛠️ Refactor suggestion

Ensure resource cleanup and add GPU guard (single-request CUDA graph test)

Mirror the cleanup and gating done in the parametrized test.

+@skip_gpu_memory_less_than_40gb
 def test_lora_graph_single_request():
@@
-    llm = LLM(model=f"{llm_models_root()}/llama-models/llama-7b-hf",
-              lora_config=lora_config,
-              cuda_graph_config=CudaGraphConfig(max_batch_size=1))
+    llm = LLM(model=f"{llm_models_root()}/llama-models/llama-7b-hf",
+              lora_config=lora_config,
+              cuda_graph_config=CudaGraphConfig(max_batch_size=1))
@@
-    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)
-
-    assert similar(outputs[0].outputs[0].text, references[0])
-    print(f"lora output: {outputs[0].outputs[0].text}")
-    print(f"ref  output: {references[0]}")
+    try:
+        outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)
+        assert similar(outputs[0].outputs[0].text, references[0])
+        print(f"lora output: {outputs[0].outputs[0].text}")
+        print(f"ref  output: {references[0]}")
+    finally:
+        llm.shutdown()
🤖 Prompt for AI Agents
In tests/unittest/llmapi/test_llm_pytorch.py around lines 819 to 846, the
single-request CUDA-graph LoRA test lacks the GPU availability guard and
resource cleanup used by the parametrized test; wrap the test logic so it only
runs when CUDA is available (skip or return when not), and at the end mirror the
cleanup: call the LLM shutdown/close method, synchronize the GPU, and free GPU
memory (e.g., torch.cuda.synchronize() and torch.cuda.empty_cache()) so CUDA
graphs and model resources are released after the test.
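
As a hedged sketch of what that prompt asks for (FakeLLM, the test name, and the skipif guard are stand-ins, not code from this repo), the guard-plus-cleanup shape would be roughly:

import pytest
import torch


@pytest.mark.skipif(not torch.cuda.is_available(),
                    reason="CUDA graph tests need a GPU")
def test_cuda_graph_cleanup_pattern():
    class FakeLLM:                     # stand-in for the real LLM object
        def __init__(self):
            self.closed = False

        def shutdown(self):
            self.closed = True

    llm = FakeLLM()
    try:
        pass                           # generate() and assertions would go here
    finally:
        # Always release resources, then make sure captured CUDA graphs and
        # cached allocations are actually freed before the next test runs.
        llm.shutdown()
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

    assert llm.closed
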


def test_lora_graph_multiple_requests():
lora_req = LoRARequest(
"task-0", 0, f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1")
lora_req2 = LoRARequest(
"task-1", 1,
f"{llm_models_root()}/llama-models/Japanese-Alpaca-LoRA-7b-v0")

lora_requests = [lora_req, lora_req2]
lora_config = LoraConfig(
lora_dir=[f"{llm_models_root()}/llama-models/luotuo-lora-7b-0.1"],
max_lora_rank=8,
lora_request=lora_requests)

llm = LLM(
model=f"{llm_models_root()}/llama-models/llama-7b-hf",
lora_config=lora_config,
# cuda_graph_config=None)
cuda_graph_config=CudaGraphConfig(max_batch_size=2))

prompts = [
"美国的首都在哪里? \n答案:",
"美国的首都在哪里? \n答案:",
]
references = [
"美国的首都是华盛顿。\n\n美国的",
"纽约\n\n### カンファレンスの",
]
sampling_params = SamplingParams(max_tokens=20)

outputs = llm.generate(prompts, sampling_params, lora_request=lora_requests)

print(f"lora output 0: {outputs[0].outputs[0].text}")
print(f"ref output 0: {references[0]}")
print(f"lora output 1: {outputs[1].outputs[0].text}")
print(f"ref output 1: {references[1]}")
assert similar(outputs[0].outputs[0].text, references[0])
assert similar(outputs[1].outputs[0].text, references[1])