From 83e9817381ae28ef35349a6581ada7ca99fc5e16 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Thu, 15 Jan 2026 18:41:26 -0800 Subject: [PATCH 1/2] [BugFix] Fix is_strictly_contiguous assertion for decode_query in TRTLLM path Fix decode_query strict contiguity assertion failure in FlashInfer TRTLLM decode path on Blackwell GPUs. The issue: decode_query may have non-contiguous memory layout, and calling .contiguous() alone doesn't always produce canonical strides required by TRTLLM kernels. The fix: Use .contiguous().reshape(shape) to ensure strictly contiguous layout with canonical strides. Signed-off-by: Lu Fang --- vllm/v1/attention/backends/flashinfer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 9892c360d3d6..e019a96a134c 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -416,7 +416,7 @@ class TRTLLMPrefill: max_q_len: int """ - The maximum query length *among prefill requests*. + The maximum query length *among prefill requests*. 
""" max_seq_len: int @@ -1497,7 +1497,7 @@ def forward( else: # decode_query may be non-contiguous assert isinstance(attn_metadata.decode, TRTLLMDecode) - decode_query = decode_query.contiguous() + decode_query = decode_query.contiguous().reshape(decode_query.shape) workspace_buffer = _get_trtllm_gen_workspace_buffer() block_tables_decode = attn_metadata.decode.block_tables seq_lens_decode = attn_metadata.decode.seq_lens From 10073381c195adca2082ec628449b4664cc59323 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Thu, 15 Jan 2026 22:40:44 -0800 Subject: [PATCH 2/2] Apply the same canonical-stride fix to prefill_query in the TRTLLM prefill path Signed-off-by: Lu Fang --- vllm/v1/attention/backends/flashinfer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index e019a96a134c..1391dc5999a6 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -1386,7 +1386,9 @@ def forward( else: assert isinstance(attn_metadata.prefill, TRTLLMPrefill) # prefill_query may be non-contiguous - prefill_query = prefill_query.contiguous() + # Use .reshape() to guarantee canonical strides for + # is_strictly_contiguous assertion + prefill_query = prefill_query.contiguous().reshape(prefill_query.shape) workspace_buffer = _get_trtllm_gen_workspace_buffer() block_tables_prefill = attn_metadata.prefill.block_tables seq_lens_prefill = attn_metadata.prefill.seq_lens