python/sglang/srt/layers/attention/xpu_backend.py: 24 changes (12 additions, 12 deletions)
@@ -20,7 +20,7 @@
from sglang.srt.model_executor.model_runner import ModelRunner

from sgl_kernel import merge_state_v2
-from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
+from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache, flash_attn_decode, flash_attn_extend


class XPUAttentionBackend(AttentionBackend):
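Note on this hunk: the import list gains the dedicated `flash_attn_extend` and `flash_attn_decode` entry points that the call sites below switch to, while `flash_attn_with_kvcache` stays imported. A minimal sketch of the extend/decode split, with the kernel signatures assumed to match the keyword arguments used at the call sites in this file:

```python
# Sketch only: illustrates the dispatch this diff introduces, not the full backend.
# Assumption: flash_attn_extend / flash_attn_decode accept the same keyword
# arguments that flash_attn_with_kvcache previously received at these call sites.
from sgl_kernel.flash_attn import flash_attn_decode, flash_attn_extend


def run_kv_cache_attention(q, key_cache, value_cache, *, is_decode, **attn_kwargs):
    """Route extend (prefill) batches and single-token decode steps to their kernels."""
    kernel = flash_attn_decode if is_decode else flash_attn_extend
    return kernel(q=q, k_cache=key_cache, v_cache=value_cache, **attn_kwargs)
```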
@@ -496,8 +496,8 @@ def forward_extend(
cache_seqlens = metadata.encoder_lens_int32
cu_seqlens_k = metadata.encoder_cu_seqlens_k
window_size = (-1, -1)

-result = flash_attn_with_kvcache(
+result = flash_attn_extend(
q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
k_cache=key_cache,
v_cache=value_cache,
@@ -518,7 +518,7 @@ def forward_extend(

if use_cascade_attn:
o, softmax_lse, *rest = result
-o_expand, softmax_lse_expand, *rest_expand = flash_attn_with_kvcache(
+o_expand, softmax_lse_expand, *rest_expand = flash_attn_extend(
q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
k_cache=key_cache,
v_cache=value_cache,
@@ -619,7 +619,7 @@ def forward_extend(
q_nope = q_all[:, :, : layer.v_head_dim]
q_rope = q_all[:, :, layer.v_head_dim :]

-result = flash_attn_with_kvcache(
+result = flash_attn_extend(
q=q_rope,
k_cache=k_rope_cache,
v_cache=c_kv_cache,
@@ -639,7 +639,7 @@
if use_cascade_attn:
o, softmax_lse, *rest = result
o_expand, softmax_lse_expand, *rest_expand = (
-flash_attn_with_kvcache(
+flash_attn_extend(
q=q_rope,
k_cache=k_rope_cache,
v_cache=c_kv_cache,
@@ -758,7 +758,7 @@ def forward_decode(

if layer.is_cross_attention:
# Always use non-chunked logic for cross-attention
-o = flash_attn_with_kvcache(
+o = flash_attn_decode(
q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
k_cache=key_cache,
v_cache=value_cache,
@@ -777,7 +777,7 @@
)
elif use_local_attn:
# Use chunked (local) attention batching for self-attention
-o = flash_attn_with_kvcache(
+o = flash_attn_decode(
q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
k_cache=key_cache,
v_cache=value_cache,
@@ -804,7 +804,7 @@
)

# Default: single-token self-attention
-result = flash_attn_with_kvcache(
+result = flash_attn_decode(
q=q_reshaped,
k_cache=key_cache,
v_cache=value_cache,
@@ -825,7 +825,7 @@
if use_cascade_attn:
o, softmax_lse, *rest = result
o_expand, softmax_lse_expand, *rest_expand = (
-flash_attn_with_kvcache(
+flash_attn_decode(
q=q_reshaped,
k_cache=key_cache,
v_cache=value_cache,
@@ -880,7 +880,7 @@ def forward_decode(
q_rope = q_all[:, :, layer.v_head_dim :]
max_seqlen_q = metadata.max_seq_len_q

-result = flash_attn_with_kvcache(
+result = flash_attn_decode(
q=q_rope,
k_cache=k_rope_cache,
v_cache=c_kv_cache,
@@ -899,7 +899,7 @@
)
if use_cascade_attn:
o, softmax_lse, *rest = result
-o_expand, softmax_lse_expand, *rest_expand = flash_attn_with_kvcache(
+o_expand, softmax_lse_expand, *rest_expand = flash_attn_decode(
q=q_rope,
k_cache=k_rope_cache,
v_cache=c_kv_cache,
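For reviewers unfamiliar with the cascade path touched above: every `use_cascade_attn` branch now calls the new kernel twice, once for the shared prefix (`o`, `softmax_lse`) and once for the per-request expand portion (`o_expand`, `softmax_lse_expand`). The merge of the two partial results sits in the collapsed context, so the sketch below is an assumption based on the `merge_state_v2` import at the top of the file rather than on the hidden lines themselves:

```python
# Rough sketch of merging two cascade-attention partial results.
# Assumption: merge_state_v2 takes (out_a, lse_a, out_b, lse_b) and returns the
# merged output and log-sum-exp; the actual call lives in the collapsed diff context.
from sgl_kernel import merge_state_v2


def merge_cascade_outputs(o, softmax_lse, o_expand, softmax_lse_expand):
    """Combine the shared-prefix result with the per-request expand result."""
    o_merged, lse_merged = merge_state_v2(o, softmax_lse, o_expand, softmax_lse_expand)
    return o_merged, lse_merged
```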