diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 9892c360d3d6..1391dc5999a6 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -416,7 +416,7 @@ class TRTLLMPrefill: max_q_len: int """ - The maximum query length *among prefill requests*. + The maximum query length *among prefill requests*. """ max_seq_len: int @@ -1386,7 +1386,9 @@ def forward( else: assert isinstance(attn_metadata.prefill, TRTLLMPrefill) # prefill_query may be non-contiguous - prefill_query = prefill_query.contiguous() + # Use .reshape() to guarantee canonical strides for + # is_strictly_contiguous assertion + prefill_query = prefill_query.contiguous().reshape(prefill_query.shape) workspace_buffer = _get_trtllm_gen_workspace_buffer() block_tables_prefill = attn_metadata.prefill.block_tables seq_lens_prefill = attn_metadata.prefill.seq_lens @@ -1497,7 +1499,7 @@ def forward( else: # decode_query may be non-contiguous assert isinstance(attn_metadata.decode, TRTLLMDecode) - decode_query = decode_query.contiguous() + decode_query = decode_query.contiguous().reshape(decode_query.shape) workspace_buffer = _get_trtllm_gen_workspace_buffer() block_tables_decode = attn_metadata.decode.block_tables seq_lens_decode = attn_metadata.decode.seq_lens