Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions vllm/v1/attention/backends/flashinfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ class TRTLLMPrefill:

max_q_len: int
"""
The maximum query length *among prefill requests*.
The maximum query length *among prefill requests*.
"""

max_seq_len: int
Expand Down Expand Up @@ -1386,7 +1386,9 @@ def forward(
else:
assert isinstance(attn_metadata.prefill, TRTLLMPrefill)
# prefill_query may be non-contiguous
prefill_query = prefill_query.contiguous()
# Reshape to the same shape after .contiguous() to guarantee canonical
# strides, as required by the is_strictly_contiguous assertion
prefill_query = prefill_query.contiguous().reshape(prefill_query.shape)
workspace_buffer = _get_trtllm_gen_workspace_buffer()
block_tables_prefill = attn_metadata.prefill.block_tables
seq_lens_prefill = attn_metadata.prefill.seq_lens
Expand Down Expand Up @@ -1497,7 +1499,7 @@ def forward(
else:
# decode_query may be non-contiguous
assert isinstance(attn_metadata.decode, TRTLLMDecode)
decode_query = decode_query.contiguous()
decode_query = decode_query.contiguous().reshape(decode_query.shape)
workspace_buffer = _get_trtllm_gen_workspace_buffer()
block_tables_decode = attn_metadata.decode.block_tables
seq_lens_decode = attn_metadata.decode.seq_lens
Expand Down