From c412556b0ef8aa29551b7b2324b4f4a07b6a465a Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 24 Sep 2025 15:10:23 -0400 Subject: [PATCH 1/3] Fix: Separate prefill and decode logic for TRTLLM attention auto-detection - Remove inappropriate 256 token limit for prefill sequences - Keep existing 256 token limit for decode batches - Add context-specific logging to distinguish prefill vs decode - Fixes issue where prefill sequences > 256 tokens incorrectly fell back to FlashInfer Signed-off-by: jasonlizhengjian --- vllm/utils/flashinfer.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index ab0cf2051f87..af27204c5d74 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -283,11 +283,18 @@ def use_trtllm_attention( if force_use_trtllm is None: # Environment variable not set - use auto-detection - use_trtllm = ( - num_tokens <= 256 and max_seq_len <= 131072 and kv_cache_dtype == "auto" - ) + if is_prefill: + # Prefill auto-detection + use_trtllm = (max_seq_len <= 131072 and kv_cache_dtype == "auto") + else: + # Decode auto-detection + use_trtllm = (num_tokens <= 256 and max_seq_len <= 131072 + and kv_cache_dtype == "auto") + if use_trtllm: - logger.warning_once("Using TRTLLM attention (auto-detected).") + context = "prefill" if is_prefill else "decode" + logger.warning_once( + f"Using TRTLLM {context} attention (auto-detected).") return use_trtllm # Environment variable is set to 1 - respect it From 8ee4ae5d388243e46dbad1bd5f002f0f8bbd5a73 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 1 Oct 2025 20:04:25 +0000 Subject: [PATCH 2/3] Fix TRTLLM attention logging to show both prefill and decode messages Signed-off-by: jasonlizhengjian --- vllm/utils/flashinfer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index af27204c5d74..f61aaca488b9 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -286,15 +286,16 @@ def use_trtllm_attention( if is_prefill: # Prefill auto-detection use_trtllm = (max_seq_len <= 131072 and kv_cache_dtype == "auto") + if use_trtllm: + logger.warning_once( + "Using TRTLLM prefill attention (auto-detected).") else: # Decode auto-detection use_trtllm = (num_tokens <= 256 and max_seq_len <= 131072 and kv_cache_dtype == "auto") - - if use_trtllm: - context = "prefill" if is_prefill else "decode" - logger.warning_once( - f"Using TRTLLM {context} attention (auto-detected).") + if use_trtllm: + logger.warning_once( + "Using TRTLLM decode attention (auto-detected).") return use_trtllm # Environment variable is set to 1 - respect it From a2ff9549528ac04e248c4c79c738c42c4dd6bbd8 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Sun, 5 Oct 2025 17:57:09 +0000 Subject: [PATCH 3/3] Apply ruff-format to flashinfer.py after rebase Signed-off-by: Jason Li Signed-off-by: jasonlizhengjian --- vllm/utils/flashinfer.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index f61aaca488b9..1d707d56daba 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -285,17 +285,16 @@ def use_trtllm_attention( # Environment variable not set - use auto-detection if is_prefill: # Prefill auto-detection - use_trtllm = (max_seq_len <= 131072 and kv_cache_dtype == "auto") + use_trtllm = max_seq_len <= 131072 and kv_cache_dtype == "auto" if use_trtllm: - logger.warning_once( - "Using TRTLLM prefill attention (auto-detected).") + logger.warning_once("Using TRTLLM prefill attention (auto-detected).") else: # Decode auto-detection - use_trtllm = (num_tokens <= 256 and max_seq_len <= 131072 - and kv_cache_dtype == "auto") + use_trtllm = ( + num_tokens <= 256 and max_seq_len <= 131072 and kv_cache_dtype == "auto" + ) if use_trtllm: - logger.warning_once( - "Using TRTLLM decode attention (auto-detected).") + logger.warning_once("Using TRTLLM decode attention (auto-detected).") return use_trtllm # Environment variable is set to 1 - respect it