From c412556b0ef8aa29551b7b2324b4f4a07b6a465a Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jason.li@centml.ai>
Date: Wed, 24 Sep 2025 15:10:23 -0400
Subject: [PATCH 1/3] Fix: Separate prefill and decode logic for TRTLLM
 attention auto-detection

- Remove the inappropriate 256-token limit for prefill sequences
- Keep the existing 256-token limit for decode batches
- Add context-specific logging to distinguish prefill from decode
- Fixes an issue where prefill sequences longer than 256 tokens incorrectly fell back to FlashInfer

Signed-off-by: jasonlizhengjian <jason.li@centml.ai>
---
 vllm/utils/flashinfer.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index ab0cf2051f87..af27204c5d74 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -283,11 +283,18 @@ def use_trtllm_attention(
 
     if force_use_trtllm is None:
         # Environment variable not set - use auto-detection
-        use_trtllm = (
-            num_tokens <= 256 and max_seq_len <= 131072 and kv_cache_dtype == "auto"
-        )
+        if is_prefill:
+            # Prefill auto-detection
+            use_trtllm = (max_seq_len <= 131072 and kv_cache_dtype == "auto")
+        else:
+            # Decode auto-detection
+            use_trtllm = (num_tokens <= 256 and max_seq_len <= 131072
+                          and kv_cache_dtype == "auto")
+
         if use_trtllm:
-            logger.warning_once("Using TRTLLM attention (auto-detected).")
+            context = "prefill" if is_prefill else "decode"
+            logger.warning_once(
+                f"Using TRTLLM {context} attention (auto-detected).")
         return use_trtllm
 
     # Environment variable is set to 1 - respect it

From 8ee4ae5d388243e46dbad1bd5f002f0f8bbd5a73 Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Wed, 1 Oct 2025 20:04:25 +0000
Subject: [PATCH 2/3] Fix TRTLLM attention logging to show both prefill and
 decode messages

Signed-off-by: jasonlizhengjian <jasonlizhengjian@gmail.com>
---
 vllm/utils/flashinfer.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index af27204c5d74..f61aaca488b9 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -286,15 +286,16 @@ def use_trtllm_attention(
         if is_prefill:
             # Prefill auto-detection
             use_trtllm = (max_seq_len <= 131072 and kv_cache_dtype == "auto")
+            if use_trtllm:
+                logger.warning_once(
+                    "Using TRTLLM prefill attention (auto-detected).")
         else:
             # Decode auto-detection
             use_trtllm = (num_tokens <= 256 and max_seq_len <= 131072
                           and kv_cache_dtype == "auto")
-
-        if use_trtllm:
-            context = "prefill" if is_prefill else "decode"
-            logger.warning_once(
-                f"Using TRTLLM {context} attention (auto-detected).")
+            if use_trtllm:
+                logger.warning_once(
+                    "Using TRTLLM decode attention (auto-detected).")
         return use_trtllm
 
     # Environment variable is set to 1 - respect it

From a2ff9549528ac04e248c4c79c738c42c4dd6bbd8 Mon Sep 17 00:00:00 2001
From: jasonlizhengjian <jasonlizhengjian@gmail.com>
Date: Sun, 5 Oct 2025 17:57:09 +0000
Subject: [PATCH 3/3] Apply ruff-format to flashinfer.py after rebase

Signed-off-by: Jason Li <jasonlizhengjian@gmail.com>

Signed-off-by: jasonlizhengjian <jasonlizhengjian@gmail.com>
---
 vllm/utils/flashinfer.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index f61aaca488b9..1d707d56daba 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -285,17 +285,16 @@ def use_trtllm_attention(
         # Environment variable not set - use auto-detection
         if is_prefill:
             # Prefill auto-detection
-            use_trtllm = (max_seq_len <= 131072 and kv_cache_dtype == "auto")
+            use_trtllm = max_seq_len <= 131072 and kv_cache_dtype == "auto"
             if use_trtllm:
-                logger.warning_once(
-                    "Using TRTLLM prefill attention (auto-detected).")
+                logger.warning_once("Using TRTLLM prefill attention (auto-detected).")
         else:
             # Decode auto-detection
-            use_trtllm = (num_tokens <= 256 and max_seq_len <= 131072
-                          and kv_cache_dtype == "auto")
+            use_trtllm = (
+                num_tokens <= 256 and max_seq_len <= 131072 and kv_cache_dtype == "auto"
+            )
             if use_trtllm:
-                logger.warning_once(
-                    "Using TRTLLM decode attention (auto-detected).")
+                logger.warning_once("Using TRTLLM decode attention (auto-detected).")
         return use_trtllm
 
     # Environment variable is set to 1 - respect it