From 7e7453aa5bd97f6712df9951fb3d1502b60ecdc9 Mon Sep 17 00:00:00 2001
From: hebiao064
Date: Thu, 1 May 2025 07:19:10 +0000
Subject: [PATCH 1/2] Optimize FA3 Decoding Speed

---
 python/sglang/srt/layers/attention/flashattention_backend.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/attention/flashattention_backend.py b/python/sglang/srt/layers/attention/flashattention_backend.py
index 4e8543213a1..491f0beb256 100644
--- a/python/sglang/srt/layers/attention/flashattention_backend.py
+++ b/python/sglang/srt/layers/attention/flashattention_backend.py
@@ -1587,8 +1587,9 @@ def init_forward_metadata_replay_cuda_graph(
             metadata.max_seq_len_k = max_len
 
             metadata.cache_seqlens_int32 = seq_lens.to(torch.int32)
-            metadata.cu_seqlens_k = torch.nn.functional.pad(
-                torch.cumsum(seq_lens, dim=0, dtype=torch.int32), (1, 0)
+            # Optimized cumulative sequence length calculation
+            metadata.cu_seqlens_k[1:].copy_(
+                torch.cumsum(seq_lens, dim=0, dtype=torch.int32)
             )
 
             max_seq_pages = (

From 48717a29bf16d10bde833921e17ab6ce39f60c70 Mon Sep 17 00:00:00 2001
From: Yineng Zhang
Date: Thu, 1 May 2025 01:25:49 -0700
Subject: [PATCH 2/2] trigger ci

---
 python/sglang/srt/layers/attention/flashattention_backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/attention/flashattention_backend.py b/python/sglang/srt/layers/attention/flashattention_backend.py
index 491f0beb256..9579b19f25b 100644
--- a/python/sglang/srt/layers/attention/flashattention_backend.py
+++ b/python/sglang/srt/layers/attention/flashattention_backend.py
@@ -1587,7 +1587,7 @@ def init_forward_metadata_replay_cuda_graph(
             metadata.max_seq_len_k = max_len
 
             metadata.cache_seqlens_int32 = seq_lens.to(torch.int32)
-            # Optimized cumulative sequence length calculation
+            # Optimize cumulative sequence length calculation
             metadata.cu_seqlens_k[1:].copy_(
                 torch.cumsum(seq_lens, dim=0, dtype=torch.int32)
             )
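
Editor's note on the change in patch 1/2: the old code reassigned
metadata.cu_seqlens_k to a freshly padded tensor on every CUDA graph
replay step, while the new code writes the cumulative sums in place
into an already-allocated buffer. The idea is that the in-place copy_
avoids a per-step output allocation (and the extra pad kernel) and
keeps the buffer's address stable, which matters when a captured CUDA
graph references that buffer. Below is a minimal standalone sketch of
the two approaches; max_bs, the device fallback, and the example
seq_lens values are illustrative assumptions, not taken from the patch.

    import torch

    # Illustrative sizes; the real backend derives these from the
    # CUDA graph capture batch size (assumption for this sketch).
    max_bs = 8
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Buffer allocated once (e.g. at graph capture time); slot 0 stays 0
    # so cu_seqlens_k[1:] holds the running sum of key lengths.
    cu_seqlens_k = torch.zeros(max_bs + 1, dtype=torch.int32, device=device)

    seq_lens = torch.tensor(
        [3, 5, 2, 7, 1, 4, 6, 2], dtype=torch.int32, device=device
    )

    # Old approach: builds a brand-new padded tensor on every step.
    cu_old = torch.nn.functional.pad(
        torch.cumsum(seq_lens, dim=0, dtype=torch.int32), (1, 0)
    )

    # New approach: writes into the preallocated buffer in place, so
    # its address never changes and no per-step output tensor is made.
    cu_seqlens_k[1:].copy_(torch.cumsum(seq_lens, dim=0, dtype=torch.int32))

    # Both approaches produce the same cumulative sequence lengths.
    assert torch.equal(cu_old, cu_seqlens_k)

Patch 2/2 only rewords the comment ("Optimized" -> "Optimize") to
retrigger CI; it does not change behavior.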