Merged
45 commits
450522e
[https://nvbugs/5451028][fix] Constrain NemotronSuper test parameters…
Naveassaf Aug 19, 2025
ed70d06
[https://nvbugs/5448579][fix] EXAONE-4.0 accuracy test bugfix (#6888)
yechank-nvidia Aug 19, 2025
06c64a7
[None][chore] Waive E2E GB200 tests for Gemma3 27B (#6916)
brb-nv Aug 19, 2025
a60af95
[https://nvbugs/5451296][bug] Fix a thread leak in test_llm_args.py (…
Tabrizian Aug 19, 2025
dfbde64
[None][infra] Waive failed tests for release branch (#7036)
EmmaQiaoCh Aug 19, 2025
6ca71f5
[None][doc] add status labels to LLM class's api reference (#6899)
Superjomn Aug 20, 2025
aa6603c
[https://nvbugs/5448437][fix] fix some nixl tests (#6940)
bo-nv Aug 20, 2025
b33b27a
[https://nvbugs/5427801][fix] Torch compile support for Llama4 and Ea…
liji-nv Aug 20, 2025
c1e6126
[https://nvbugs/5394392][fix] Enlarge scheduler capacity under disagg…
yifeizhang-c Aug 20, 2025
ede27da
[TRTLLM-7263][fix] Prevent recreation of cublas handles in lora_group…
amitz-nv Aug 20, 2025
835192d
[None][doc] update v1.0 doc for trtllm-serve (#7056)
hchings Aug 20, 2025
e7bc4a6
[https://nvbugs/5440241][fix] Fix 70B GSM8K Accuracy drop (#7075)
chenfeiz0326 Aug 20, 2025
f8a37bb
[https://nvbugs/5451296][fix] zmq nonblock bug with retry (#7019)
Superjomn Aug 21, 2025
5ff7b61
[https://nvbugs/5383702][fix] test_llm_api_pytorch.py::TestLlama3_1_8…
Superjomn Aug 21, 2025
990a786
[https://nvbugs/5392414] [fix] For release 1.0 cherry pick. Add custo…
ChristinaZ Aug 21, 2025
30f30f7
[https://nvbugs/5464088] [fix] dequantize fp8 activation input to lor…
venkywonka Aug 21, 2025
4b978c8
[None][infra] Skip failed tests for release branch (#7130)
EmmaQiaoCh Aug 21, 2025
d49e304
[https://nvbugs/5448442][fix] Skip trtllm moe backend for sm120 (#7010)
pamelap-nvidia Aug 21, 2025
37823b9
[https://nvbugs/5449032][fix] Add more llm-args to llm_mgmn_trtllm_be…
brb-nv Aug 22, 2025
f4a8c04
[https://nvbugs/5410391][bug] Support to share device buffers in atte…
HuiGao-NV Aug 22, 2025
c2fecf3
[https://nvbugs/5467062][fix] pass logitsPostProcessorBatched by refe…
milesial Aug 22, 2025
c128437
[https://nvbugs/5450074][fix] Reduce the device memory requirements f…
Shixiaowei02 Aug 22, 2025
5a42ddc
[https://nvbugs/5433545][fix] TestPhi4MiniInstruct::test_auto_dtype -…
moraxu Aug 22, 2025
b20ea82
[https://nvbugs/5448426][fix] Fix illegal memory access in cuda graph…
peaceh-nv Aug 25, 2025
633a4d5
[None][fix] Switch llm api quickstart example location per workflow. …
nv-guomingz Aug 25, 2025
2a3e17f
[https://nvbugs/5467232][fix] Fix load_torch_hf_lora to override lora…
Wanli-Jiang Aug 25, 2025
26dbb32
[None][doc] fix tensorrt legacy quickstart page (#7190)
Superjomn Aug 25, 2025
26db89f
[https://nvbugs/5470840][fix] Disaggregated unit test MPI Init handli…
pcastonguay Aug 25, 2025
3854ef1
[None][test] add kv cache size in bench metric and fix failed cases (…
ruodil Aug 26, 2025
ce80090
[https://nvbugs/5409416][fix] test_openai_multi_chat_example (#7174)
Linda-Stadter Aug 26, 2025
3af7b1a
[https://nvbugs/5473789][bug] install cuda-toolkit to fix sanity chec…
HuiGao-NV Aug 26, 2025
7b65fd4
[https://nvbugs/5473789][bug] install cuda-toolkit to fix sanity chec…
dominicshanshan Sep 8, 2025
642f622
[None][fix] fix log_once usage (#7210)
yuxianq Aug 26, 2025
9165e67
[None][infra] Waive failed cases for release/1.0 (#7258)
EmmaQiaoCh Aug 26, 2025
31aaed5
[https://nvbugs/5451342][fix] Use runtime max_batch_size when cuda_gr…
jiaganc Aug 26, 2025
1cfb4af
[None][feat] Skip prefetching consolidated safetensors when appropria…
2ez4bz Aug 26, 2025
70197dd
[https://nvbugs/5430125][ci] Unwaive test case for mistral 3.1 small …
2ez4bz Aug 26, 2025
e1d8811
[https://nvbugs/5478151][fix] Add missing spec for Llama-3.3 70B (#7267)
brb-nv Aug 27, 2025
413776a
[https://nvbugs/5451426][fix] Avoid torch compile on full eagle3 work…
liji-nv Aug 27, 2025
7ff6f44
[https://nvbugs/5463720][fix] tp-split the inferred `mlp_hidden_size`…
venkywonka Aug 27, 2025
cc71861
[https://nvbugs/5480550][fix] Increase timeout for Gemma3 27B test (#…
brb-nv Aug 27, 2025
8797444
[https://nvbugs/5434320][bug] Fix disagg pp bug (#7099)
Tabrizian Aug 27, 2025
d279d29
[https://nvbugs/5480415][fix] Fix phi4mm multi-gpu test (#7275)
Wanli-Jiang Aug 28, 2025
2083332
[TRTLLM-7346][fix] Improve performance of PyTorchModelEngine._get_lor…
amitz-nv Aug 28, 2025
74fc47c
[https://nvbugs/5461712] [fix] Disable deep_gemm for Qwen3 due to acc…
DomBrown Aug 23, 2025
@@ -47,7 +47,7 @@ class LogitsPostProcessor : Algorithm

bool operator()(DecoderInputBuffers& inputBuffers, bool replicateLogitsPostProcessor,
runtime::WorldConfig const& worldConfig, CudaStreamPtr const& stream,
std::optional<LogitsPostProcessorBatched> logitsPostProcessorBatched = std::nullopt) const;
std::optional<LogitsPostProcessorBatched> const& logitsPostProcessorBatched = std::nullopt) const;
};

} // namespace tensorrt_llm::batch_manager
2 changes: 1 addition & 1 deletion cpp/tensorrt_llm/batch_manager/logitsPostProcessor.cpp
@@ -34,7 +34,7 @@ using SizeType32 = tensorrt_llm::runtime::SizeType32;

bool LogitsPostProcessor::operator()(DecoderInputBuffers& inputBuffers, bool replicateLogitsPostProcessor,
tr::WorldConfig const& worldConfig, CudaStreamPtr const& stream,
std::optional<LogitsPostProcessorBatched> logitsPostProcessorBatched) const
std::optional<LogitsPostProcessorBatched> const& logitsPostProcessorBatched) const
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
NVTX3_SCOPED_RANGE(LogitsPostProcessor);
64 changes: 34 additions & 30 deletions docs/source/commands/trtllm-serve/trtllm-serve.rst
@@ -201,56 +201,60 @@ Metrics Endpoint

.. note::

This endpoint is beta maturity.
The metrics endpoint for the default PyTorch backend are in beta and are not as comprehensive as those for the TensorRT backend.

The statistics for the PyTorch backend are beta and not as comprehensive as those for the TensorRT backend.
Some fields, such as CPU memory usage, are not yet available for the PyTorch backend.

Some fields, such as CPU memory usage, are not available for the PyTorch backend.
Enabling ``enable_iter_perf_stats`` in the PyTorch backend can slightly impact performance, depending on the serving configuration.

Enabling ``enable_iter_perf_stats`` in the PyTorch backend can impact performance slightly, depending on the serving configuration.
The ``/metrics`` endpoint provides runtime iteration statistics such as GPU memory usage and KV cache details.

The ``/metrics`` endpoint provides runtime-iteration statistics such as GPU memory use and inflight-batching details.
For the TensorRT backend, these statistics are enabled by default.
However, for the PyTorch backend, you must explicitly enable iteration statistics logging by setting the `enable_iter_perf_stats` field in a YAML configuration file as shown in the following example:
For the default PyTorch backend, iteration statistics logging is enabled by setting the ``enable_iter_perf_stats`` field in a YAML file:

.. code-block:: yaml

# extra-llm-api-config.yml
pytorch_backend_config:
enable_iter_perf_stats: true
# extra_llm_config.yaml
enable_iter_perf_stats: true

Then start the server and specify the ``--extra_llm_api_options`` argument with the path to the YAML file as shown in the following example:
Start the server and specify the ``--extra_llm_api_options`` argument with the path to the YAML file:

.. code-block:: bash

trtllm-serve <model> \
--extra_llm_api_options <path-to-extra-llm-api-config.yml> \
[--tp_size <tp> --pp_size <pp> --ep_size <ep> --host <host> --port <port>]
trtllm-serve "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --extra_llm_api_options extra_llm_config.yaml

After at least one inference request is sent to the server, you can fetch the runtime-iteration statistics by polling the `/metrics` endpoint:
After sending at least one inference request to the server, you can fetch runtime iteration statistics by polling the ``/metrics`` endpoint.
Since the statistics are stored in an internal queue and removed once retrieved, it's recommended to poll the endpoint shortly after each request and store the results if needed.

.. code-block:: bash

curl -X GET http://<host>:<port>/metrics
curl -X GET http://localhost:8000/metrics

*Example Output*
Example output:

.. code-block:: json

[
{
"gpuMemUsage": 56401920000,
"inflightBatchingStats": {
[
{
"gpuMemUsage": 76665782272,
"iter": 154,
"iterLatencyMS": 7.00688362121582,
"kvCacheStats": {
"allocNewBlocks": 3126,
"allocTotalBlocks": 3126,
"cacheHitRate": 0.00128,
"freeNumBlocks": 101253,
"maxNumBlocks": 101256,
"missedBlocks": 3121,
"reusedBlocks": 4,
"tokensPerBlock": 32,
"usedNumBlocks": 3
},
"numActiveRequests": 1
...
},
"iter": 1,
"iterLatencyMS": 16.505143404006958,
"kvCacheStats": {
...
},
"newActiveRequestsQueueLatencyMS": 0.0007503032684326172
}
]
}
]



Syntax
------
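The revised doc above notes that iteration statistics are held in an internal queue and removed once retrieved, so a client should poll shortly after each request and keep what it needs. A minimal polling sketch under that reading (the host, port, and use of the requests package are illustrative assumptions, not part of this PR):

    # poll_metrics.py - illustrative sketch, not shipped with TensorRT-LLM.
    import requests

    def fetch_metrics(base_url: str = "http://localhost:8000"):
        # The server drains its stats queue on each read, so call this right
        # after a request completes and store whatever you need.
        resp = requests.get(f"{base_url}/metrics", timeout=5)
        resp.raise_for_status()
        return resp.json()

    if __name__ == "__main__":
        for it in fetch_metrics():
            print(it.get("iter"), it.get("iterLatencyMS"), it.get("gpuMemUsage"))

The printed keys mirror the example output shown above; any field missing from a given iteration simply prints as None.
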
2 changes: 1 addition & 1 deletion docs/source/legacy/tensorrt_quickstart.md
@@ -1,7 +1,7 @@
# LLM API with TensorRT Engine
A simple inference example with TinyLlama using the LLM API:

```{literalinclude} ../../examples/llm-api/_tensorrt_engine/quickstart_example.py
```{literalinclude} ../../../examples/llm-api/_tensorrt_engine/quickstart_example.py
:language: python
:linenos:
```
10 changes: 8 additions & 2 deletions examples/llm-api/_tensorrt_engine/quickstart_example.py
@@ -1,11 +1,17 @@
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm import BuildConfig, SamplingParams
from tensorrt_llm._tensorrt_engine import LLM # NOTE the change


def main():

build_config = BuildConfig()
build_config.max_batch_size = 256
build_config.max_num_tokens = 1024

# Model could accept HF model name, a path to local HF model,
# or TensorRT Model Optimizer's quantized checkpoints like nvidia/Llama-3.1-8B-Instruct-FP8 on HF.
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
build_config=build_config)

# Sample prompts.
prompts = [
1 change: 1 addition & 0 deletions examples/llm-api/llm_mgmn_trtllm_bench.sh
@@ -76,6 +76,7 @@ srun -l \

# This is optional
cat > /tmp/pytorch_extra_args.txt << EOF
cuda_graph_config: null
print_iter_log: true
enable_attention_dp: false
EOF
3 changes: 2 additions & 1 deletion tensorrt_llm/_torch/attention_backend/flashinfer.py
@@ -170,7 +170,8 @@ def __post_init__(self) -> None:
def create_cuda_graph_metadata(self,
max_batch_size: int,
sub_cross_metadata: bool = False,
max_draft_tokens: int = 0) -> Self:
max_draft_tokens: int = 0,
buffers=None) -> Self:
metadata = super().create_cuda_graph_metadata(max_batch_size,
sub_cross_metadata,
max_draft_tokens)
5 changes: 4 additions & 1 deletion tensorrt_llm/_torch/attention_backend/interface.py
@@ -140,6 +140,7 @@ class AttentionMetadata:

# This buffer is currently only used for TrtllmAttentionMetadata.
cache_indirection: Optional[torch.Tensor] = None
cuda_graph_buffers: dict[str, list[torch.Tensor]] = None

_saved_tensors: Dict[str, torch.Tensor] = field(init=False,
default_factory=dict)
@@ -288,7 +289,8 @@ def prepare(self):
def create_cuda_graph_metadata(self,
max_batch_size: int,
sub_cross_metadata: bool = False,
max_draft_tokens: int = 0) -> Self:
max_draft_tokens: int = 0,
buffers=None) -> Self:
"""
Creates metadata for CUDA graph execution.
CUDA graphs require to use pre-allocated buffers for all tensors in fields.
@@ -300,6 +302,7 @@ def create_cuda_graph_metadata(self,

cuda_graph_metadata = copy.copy(self)
cuda_graph_metadata.is_cuda_graph = True
cuda_graph_metadata.cuda_graph_buffers = buffers
if self.has_cross_sub_metadata:
cuda_graph_metadata.cross = cuda_graph_metadata.cross.create_cuda_graph_metadata(
max_batch_size, True)
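The new buffers argument threads a shared dictionary into the metadata copy made for CUDA-graph capture, so graph-sized tensors can be reused rather than reallocated per metadata object. A stripped-down sketch of that calling convention (the Meta class below is hypothetical, not the TRT-LLM AttentionMetadata):

    import copy
    from dataclasses import dataclass
    from typing import Optional

    import torch

    @dataclass
    class Meta:
        cuda_graph_buffers: Optional[dict] = None
        is_cuda_graph: bool = False

        def create_cuda_graph_metadata(self, max_batch_size: int, buffers=None) -> "Meta":
            # max_batch_size is kept only to mirror the real signature.
            graph_meta = copy.copy(self)
            graph_meta.is_cuda_graph = True
            graph_meta.cuda_graph_buffers = buffers  # shared reference, not a copy
            return graph_meta

    shared: dict[str, list[torch.Tensor]] = {}
    graph_meta = Meta().create_cuda_graph_metadata(max_batch_size=8, buffers=shared)
    assert graph_meta.cuda_graph_buffers is shared
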
87 changes: 71 additions & 16 deletions tensorrt_llm/_torch/attention_backend/trtllm.py
@@ -600,21 +600,76 @@ def host_kv_cache_pool_mapping(self) -> Optional[torch.Tensor]:

def __post_init__(self) -> None:
super().__post_init__()
self._post_init_with_buffers(self.cuda_graph_buffers)

def _post_init_with_buffers(self, buffers) -> None:

# Set a default value, as max_num_sequences is not always set.
if self.max_num_sequences is None:
self.max_num_sequences = self.max_num_requests

self.prompt_lens_cuda = torch.empty(
def get_empty(tensor_shape: list[int], dtype: torch.dtype,
cache_name: str) -> torch.Tensor:
"""
Finds a compatible, reusable buffer from a cache or creates a new one.

This function searches for a pre-allocated tensor (buffer) that can be
reused for an operation involving a tensor with the shape of `tensor_shape`.

The compatibility rules are: The buffer's total elements must be >= tensor_shape's.

If a compatible buffer is found, it's returned immediately. Otherwise, a new
buffer is allocated on the 'cuda' device with the give properties of 'tensor_shape' and 'dtype'.

Args:
tensor_shape: The required shape.
dtype: The required dtype.
cache_name: The key for the specific list of buffers to search in.

Returns:
An existing compatible buffer or a newly created one.
"""
if buffers is not None:
# Safely get the list of candidates. Defaults to an empty list if key is missing.
candidate_buffers = buffers.get(cache_name, [])
numel_like = math.prod(tensor_shape)

for buffer in candidate_buffers:
numel_buffer = buffer.numel()

# buffer just needs to be large enough.
if numel_buffer >= numel_like:
return buffer[0:numel_like].view(
tensor_shape) # Found a fit, return immediately.

# If we get here, no suitable buffer was found in the cache. Create a new one.
new_buffer = torch.zeros(tensor_shape, device='cuda', dtype=dtype)
if buffers is not None:
buffers.setdefault(cache_name, []).append(new_buffer)
return new_buffer

def get_empty_like(like_tensor: torch.Tensor,
cache_name: str) -> torch.Tensor:
return get_empty(
like_tensor.shape,
cache_name=cache_name,
dtype=like_tensor.dtype,
)

self.prompt_lens_cuda = get_empty(
(self.max_num_sequences, ),
device='cuda',
cache_name="prompt_lens_cuda",
dtype=torch.int,
)
self.prompt_lens_cpu = torch.empty_like(
self.prompt_lens_cuda,
device='cpu',
pin_memory=True,
)
self.kv_lens_cuda = torch.empty_like(self.prompt_lens_cuda)
self.kv_lens_cuda = get_empty_like(
self.prompt_lens_cuda,
cache_name="kv_lens_cuda",
)
self.kv_lens = torch.empty_like(self.kv_lens_cuda,
device='cpu',
pin_memory=True)
@@ -629,13 +684,13 @@ def __post_init__(self) -> None:
dtype=torch.int8,
)
if self.kv_cache_manager is not None:
self.kv_cache_block_offsets = torch.empty(
self.kv_cache_block_offsets = get_empty(
[
self.kv_cache_manager.num_pools, self.max_num_sequences, 2,
self.kv_cache_manager.max_blocks_per_seq
],
cache_name="kv_cache_block_offsets",
dtype=torch.int32,
device='cuda',
)
self.host_kv_cache_block_offsets = torch.empty_like(
self.kv_cache_block_offsets,
@@ -645,37 +700,37 @@ def __post_init__(self) -> None:
self.block_ids_per_seq = None
self.kv_block_ids_per_seq = None
if self.enable_flash_mla:
self.block_ids_per_seq = torch.zeros(
self.block_ids_per_seq = get_empty(
[
self.kv_cache_manager.max_batch_size,
self.kv_cache_manager.max_blocks_per_seq
],
cache_name="block_ids_per_seq",
dtype=torch.int32,
device='cuda',
)
self.kv_block_ids_per_seq = torch.zeros(
self.kv_block_ids_per_seq = get_empty(
[
self.kv_cache_manager.max_batch_size,
self.kv_cache_manager.max_blocks_per_seq
],
cache_name="kv_block_ids_per_seq",
dtype=torch.int32,
device='cuda',
)
if self.enable_context_mla_with_cached_kv:
# for kv cache reuse/chunked context in MLA
self.ctx_cached_token_indptr = torch.zeros(
self.ctx_cached_token_indptr = get_empty(
(self.max_num_requests + 1, ),
device='cuda',
cache_name="ctx_cached_token_indptr",
dtype=torch.int64,
)
self.host_ctx_cached_token_indptr = torch.zeros_like(
self.ctx_cached_token_indptr,
device='cpu',
pin_memory=True,
)
self.ctx_uncached_token_indptr = torch.zeros(
self.ctx_uncached_token_indptr = get_empty(
(self.max_num_requests + 1, ),
device='cuda',
cache_name="ctx_uncached_token_indptr",
dtype=torch.int64,
)
self.host_ctx_uncached_token_indptr = torch.zeros_like(
@@ -684,9 +739,9 @@ def __post_init__(self) -> None:
pin_memory=True,
)
# context full seqlens include cached tokens and uncached tokens
self.ctx_kv_indptr = torch.zeros(
self.ctx_kv_indptr = get_empty(
(self.max_num_requests + 1, ),
device='cuda',
cache_name="ctx_kv_indptr",
dtype=torch.int64,
)
self.host_ctx_kv_indptr = torch.zeros_like(
@@ -1165,7 +1220,7 @@ def forward(
host_kv_cache_pool_pointers=metadata.host_kv_cache_pool_pointers,
host_kv_cache_pool_mapping=metadata.host_kv_cache_pool_mapping,
block_ids_per_seq=metadata.block_ids_per_seq,
workspace=metadata.workspace,
workspace=None,
cache_indirection=metadata.cache_indirection,
kv_scale_orig_quant=self.kv_scale_orig_quant,
kv_scale_quant_orig=self.kv_scale_quant_orig,
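A condensed, standalone version of the get_empty pattern above (names mirror the diff, but this is a simplified sketch: the cache dict is required here, whereas the PR also tolerates a missing cache), presumably aimed at letting several CUDA-graph metadata objects share one oversized allocation per buffer name:

    # Requires a CUDA device; not the TRT-LLM implementation itself.
    import math

    import torch

    def get_empty(buffers, tensor_shape, dtype, cache_name):
        """Return a cached buffer viewed as tensor_shape, or allocate a new one.

        A cached buffer is compatible when it holds at least as many elements
        as the requested shape; the first fit is sliced and reshaped.
        """
        needed = math.prod(tensor_shape)
        for buf in buffers.get(cache_name, []):
            if buf.numel() >= needed:
                return buf[:needed].view(tensor_shape)
        new_buf = torch.zeros(tensor_shape, device="cuda", dtype=dtype)
        buffers.setdefault(cache_name, []).append(new_buf)
        return new_buf

    buffers: dict[str, list[torch.Tensor]] = {}
    a = get_empty(buffers, (16,), torch.int32, "prompt_lens_cuda")
    b = get_empty(buffers, (8,), torch.int32, "prompt_lens_cuda")  # reuses a's storage
    assert b.data_ptr() == a.data_ptr()

The first-fit check on element count keeps the helper cheap, and returning a view over a larger buffer is enough because callers only touch the leading elements of each shape.
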
2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/autotuner.py
@@ -371,7 +371,7 @@ def choose_one(
if not is_cache_hit:
logger.warning_once(
f"[AutoTunner] Using the fallback tactic, due to cache miss on input shapes={input_shapes}",
key=(custom_op))
key=custom_op)

return (best_runner, best_tactic)

6 changes: 0 additions & 6 deletions tensorrt_llm/_torch/compilation/piecewise_optimizer.py
@@ -210,15 +210,9 @@ def __call__(self, *args):
runtime_input_addresses = [
i.data_ptr() for i in args if isinstance(i, torch.Tensor)
]
runtime_output_addresses = [
i.data_ptr() for i in output if isinstance(i, torch.Tensor)
]

assert (entry.input_addresses == runtime_input_addresses
), f"{entry.input_addresses} vs\n {runtime_input_addresses}"
assert (
entry.output_addresses == runtime_output_addresses
), f"{entry.output_addresses} vs\n {runtime_output_addresses}"

entry.cuda_graph.replay()

3 changes: 2 additions & 1 deletion tensorrt_llm/_torch/model_config.py
@@ -494,7 +494,8 @@ def get_bindings_model_config(self,
architectures = self.pretrained_config.architectures
if len(architectures
) == 1 and architectures[0] == "DeciLMForCausalLM":
mlp_hidden_size = self._infer_nemotron_ffn_mult()
mlp_hidden_size = self._infer_nemotron_ffn_mult(
) // self.mapping.tp_size
else:
raise ValueError(
f"Inferring mlp hidden size for model architecture: {architectures} isn't supported yet"
3 changes: 2 additions & 1 deletion tensorrt_llm/_torch/models/modeling_llava_next.py
@@ -302,7 +302,8 @@ def pack_image_features(self,
logger.warning_once(
"Image feature shape does not line up with the provided patch size. "
"You may be using the `default` vision_feature_select_strategy with a"
" visual encoder that does not have CLS.")
" visual encoder that does not have CLS.",
key="llava_next_vision_model_pack_image_features")

image_feature = image_feature.view(num_patch_height,
num_patch_width, height,