Merged
428 commits
3cc1c8b
add dsv4 flash coherence sanity tests
hnyls2002 Apr 30, 2026
623a314
add server sanity kit + dsv4 flash sanity tests
hnyls2002 Apr 30, 2026
df00677
nextn subclass owns post_load_weights is_nextn
hnyls2002 Apr 30, 2026
7d38986
remove deprecated environ
hnyls2002 Apr 30, 2026
ed98c6f
fix lint
hnyls2002 Apr 30, 2026
e055a5b
reduce one duplicate
hnyls2002 Apr 30, 2026
626c862
revert noisy log prefix in _try_load_model_cls
hnyls2002 Apr 30, 2026
c68c649
restore _build_hisparse_decode_batch docstring
hnyls2002 Apr 30, 2026
ce2ff84
Add manual AIME25 tests for DeepSeek-V4 cookbook launch configs (#24104)
Fridge003 Apr 30, 2026
5cb0a57
inline Compressor.compress_fused into Compressor.forward (single caller)
fzyzcjy Apr 30, 2026
f3e040b
remove redundant self.rotary_emb in MQALayer/Compressor/C4Indexer
fzyzcjy Apr 30, 2026
ba7ef7b
remove unused Compressor.overlap_transform / overlap_transform_decode
fzyzcjy Apr 30, 2026
567b6a1
remove duplicate 'from sglang.srt.environ import envs' import
fzyzcjy May 1, 2026
45bd371
remove unused rms_normalize function in deepseek_v4.py
fzyzcjy May 1, 2026
354c102
remove unused Compressor.compute_state_len{,_indices} static helpers
fzyzcjy May 1, 2026
5f9247d
remove unused freqs_cis param from MQALayer prepare/compute helpers
fzyzcjy May 1, 2026
3a63091
remove unused debug_return_kv param from MQALayer.forward
fzyzcjy May 1, 2026
e8158c5
drop unused pp_proxy_tensors from inner DeepseekV4Model.forward
fzyzcjy May 1, 2026
d1907d3
drop unused 'as err' in encoding_dsv4 tool_call exception
fzyzcjy May 1, 2026
2dba457
remove unused parse_message_from_completion_text in encoding_dsv4
fzyzcjy May 1, 2026
f10765d
remove unused expand_seq_lens helper in paged_prefill
fzyzcjy May 1, 2026
8df1c9f
remove unused make_swa_ring_buffer_indices helper in paged_prefill
fzyzcjy May 1, 2026
8aa52bc
delete unused paged_prefill module (prepare_swa_ring_buffer_cache)
fzyzcjy May 1, 2026
6d83cd2
remove unused RaggedCoreMetadata and RaggedIndexerMetadata dataclasses
fzyzcjy May 1, 2026
371f94b
remove unused init_c4_metadata / init_c128_metadata wrappers
fzyzcjy May 1, 2026
642752b
drop dead seq_lens_sum=bs in IDLE replay branch
fzyzcjy May 1, 2026
7d35c91
drop unused _is_cuda/_is_cpu/_is_cpu_amx_available/_use_aiter in deep…
fzyzcjy May 1, 2026
86ca10d
remove unused fused_norm_rope_inplace wrapper in jit_kernel/deepseek_v4
fzyzcjy May 1, 2026
e4eb8ac
remove unused HiSparseCoordinator.get_front_topk_tokens
fzyzcjy May 1, 2026
0353cb9
remove MOE/ATTN/COMPRESSOR_BIT_WISE_EQUAL_MODE flags
fzyzcjy May 1, 2026
612aa66
Revert "remove unused parse_message_from_completion_text in encoding_…
fzyzcjy May 1, 2026
00370f2
Revert "drop unused 'as err' in encoding_dsv4 tool_call exception"
fzyzcjy May 1, 2026
c8a561d
Revert "drop dead seq_lens_sum=bs in IDLE replay branch"
fzyzcjy May 1, 2026
83b48df
unify cuda graph metadata dicts via _GraphBucket enum in deepseek_v4_…
fzyzcjy May 1, 2026
c8f263a
remove unused yarn_get_mscale helper in deepseek_v4
fzyzcjy May 1, 2026
1bf0585
remove unused is_layer_sparse/_is_layer_sparse on DeepseekV4DecoderLayer
fzyzcjy May 1, 2026
0ea8709
drop dead BumpAllocator setup in DeepseekV4Model.forward
fzyzcjy May 1, 2026
37d499a
remove dead num_fused_shared_experts > 0 branches in DSv4
fzyzcjy May 1, 2026
ec693f8
remove unused self.padding_id in DeepseekV4ModelNextN
fzyzcjy May 1, 2026
da1e4bc
remove unused self.layers_to_capture in DeepseekV4ModelNextN
fzyzcjy May 1, 2026
6382a13
remove unused self.enable_a2a_moe in DeepseekV4ModelNextN
fzyzcjy May 1, 2026
2250c7b
Revert "remove dead num_fused_shared_experts > 0 branches in DSv4"
fzyzcjy May 1, 2026
145be01
remove unused PagedCoreMetadata class
fzyzcjy May 1, 2026
89b4fd1
remove unused DeepseekV4Metadata class
fzyzcjy May 1, 2026
1cb30cd
remove unused CoreMetadata.init_swa_slice method
fzyzcjy May 1, 2026
267ca2a
drop unread c4_positions field on DSV4AttnMetadataRadix
fzyzcjy May 1, 2026
6b8cea5
drop unread c128_positions field on DSV4AttnMetadataRadix
fzyzcjy May 1, 2026
4736050
drop unused real_metadata field on DSV4MetadataRawVerify/RawDecode
fzyzcjy May 1, 2026
a374819
remove unused apply_rotary_emb in deepseek_v4_rope
fzyzcjy May 1, 2026
410ec93
remove unused tilelang_make_swa_prefill_indices and helper kernel
fzyzcjy May 1, 2026
2c72be8
remove unused CompressorPrefillPlan.copy_ method
fzyzcjy May 1, 2026
8631d6b
remove unused CompressorDecodePlan.copy_ method
fzyzcjy May 1, 2026
0b8ac0c
Revert "remove unused CompressorDecodePlan.copy_ method"
fzyzcjy May 1, 2026
c4b9411
Revert "remove unused CompressorPrefillPlan.copy_ method"
fzyzcjy May 1, 2026
17fd7c8
drop unused start_event field on HiSparseAct namedtuple
fzyzcjy May 1, 2026
83f41d0
Revert "drop unused start_event field on HiSparseAct namedtuple"
fzyzcjy May 1, 2026
9f849e3
fix: re-add is_hip import in deepseek_v4_topk
fzyzcjy May 1, 2026
fc84e1d
fix lint after dead-code cleanup
fzyzcjy May 1, 2026
b503ba6
rename is_deepseek_compressed; move dsv4 fp4 autodetect out of model_…
hnyls2002 May 1, 2026
c344aa8
fix dsv4 dataclass import-order crash; move auto-detect helper to wei…
hnyls2002 May 1, 2026
0d291a3
Revert "fix dsv4 dataclass import-order crash; move auto-detect helpe…
hnyls2002 May 1, 2026
1045918
Revert "rename is_deepseek_compressed; move dsv4 fp4 autodetect out o…
hnyls2002 May 1, 2026
cda4bea
fix DSv4 config dataclass ordering with transformers 5.6 PretrainedCo…
hnyls2002 May 1, 2026
3175d0b
fix v4 nextn post_load_weights signature accept is_nextn kwarg
hnyls2002 May 1, 2026
6a90273
stop on newline in determinism probe to avoid drift on continuation
hnyls2002 May 1, 2026
178dcdf
simplify determinism probe comments
hnyls2002 May 1, 2026
c4d2056
rename is_deepseek_compressed; move dsv4 fp4 autodetect out of model_…
hnyls2002 May 1, 2026
d7f7a72
fix dsv4 dataclass import-order crash; move auto-detect helper to wei…
hnyls2002 May 1, 2026
955eb87
move dsv4 fp4 autodetect from weight_utils into configs/deepseek_v4
hnyls2002 May 1, 2026
02a6950
add swa unit tests
ispobock May 1, 2026
1a48196
Replace SGLANG_DSV4_FP4_EXPERTS env with ModelConfig.is_fp4_experts a…
hnyls2002 May 1, 2026
335a2b3
add marlin to MOE_RUNNER_BACKEND_CHOICES (CLI choices list was out of…
fzyzcjy May 1, 2026
2a30899
Revert "Replace SGLANG_DSV4_FP4_EXPERTS env with ModelConfig.is_fp4_e…
hnyls2002 May 1, 2026
98d35bc
remove SGLANG_OPT_DEEPGEMM_SCALE_CONVERT_AT_INIT (default True), inli…
fzyzcjy May 1, 2026
c0d402e
fix(support_triton): restore "ascend" backend (lost in rebase merge)
fzyzcjy May 1, 2026
8fe4670
remove duplicate get_libnuma/numa_bind_to_node from utils/common.py (…
fzyzcjy May 1, 2026
a5f4cc7
restore top-level 'import gc' in utils/common.py to match main
fzyzcjy May 1, 2026
9d5a84e
MoEGate: gate prefill_cp F.linear shortcut on not is_deepseek_v4 (dro…
fzyzcjy May 1, 2026
84cfee1
MoEGate: gate linear_bf16_fp32 fallback on is_deepseek_v4 (restore F.…
fzyzcjy May 1, 2026
9144676
DeepseekV2MoE: drop incorrect 'assert hasattr(self, shared_experts)' …
fzyzcjy May 1, 2026
7bd615b
remove SGLANG_OPT_ALLOW_SHARED_EXPERT_DUAL_STREAM (default True), inl…
fzyzcjy May 1, 2026
27cb98c
topk: drop duplicate '_is_xpu = is_xpu()' assignment
fzyzcjy May 1, 2026
1f7708d
Revert "topk: drop duplicate '_is_xpu = is_xpu()' assignment"
fzyzcjy May 1, 2026
c2bae49
DSV4 fp4 experts: env user override + try-detect → ModelConfig.is_fp4…
hnyls2002 May 1, 2026
c76ea69
cleanup swa fix env and test
ispobock May 1, 2026
73d128e
Merge remote-tracking branch 'origin' into dsv4-rebase
hnyls2002 May 1, 2026
e9b8d3e
group all DSV4 envs into one section with sub-group headers
hnyls2002 May 1, 2026
a49605d
is_deepseek_nsa/v4: accept dict-or-object via shared _hf_arch/_hf_att…
hnyls2002 May 1, 2026
229c0fe
[Dep] Add tilelang to pyproject.toml (#24178)
Fridge003 May 1, 2026
111dba6
drop SGLANG_DISABLE_REQUEST_LOGGING doc entry; env has no reader
hnyls2002 May 1, 2026
a18ceb5
drop dead branches in DeepSeekV4SingleKVPool
hnyls2002 May 1, 2026
708f215
add is_swa_like_pool helper
hnyls2002 May 1, 2026
b5e4991
add BaseSWAKVPool ABC
hnyls2002 May 1, 2026
4523bc7
drop is_swa_like_pool; use BaseSWAKVPool isinstance
hnyls2002 May 1, 2026
a18f1c1
all req pools reserve slot 0 as padding
hnyls2002 May 1, 2026
66572a6
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 1, 2026
008e7b8
rename expected_free back to req_total_size to match main
hnyls2002 May 1, 2026
850281a
restore SWAChunkCache assert; allow HiSparse allocator
hnyls2002 May 2, 2026
8fe0104
restore session_held_mamba_slots chain
hnyls2002 May 2, 2026
a8149dd
fix mamba slot leak in release_session
hnyls2002 May 2, 2026
d96d731
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 2, 2026
2c4693b
align with main's dependencies
hnyls2002 May 2, 2026
3203982
drop CoreMetadata + IndexerMetadata dead code
hnyls2002 May 2, 2026
cf3d985
rename maybe_torch_compile -> compile_in_capture_mode; move to cuda_g…
hnyls2002 May 2, 2026
778ec36
align custom_all_reduce_v2 to main
hnyls2002 May 2, 2026
6de214b
drop SGLANG_DISABLE_REQUEST_LOGGING doc; env has no reader
hnyls2002 May 2, 2026
31ef73a
resolve hisparse conflict
xiezhq-hermann May 2, 2026
5e68a6f
hisparse cleaning
xiezhq-hermann May 2, 2026
b3175f6
Merge origin/main into dsv4-rebase
hnyls2002 May 2, 2026
4d711cb
drop redundant f-prefix
hnyls2002 May 2, 2026
53e6703
rename topk
ispobock May 2, 2026
0f2db17
restore docs
ispobock May 2, 2026
38902df
fix rebase swa evict
ispobock May 2, 2026
9c3e572
update
ispobock May 2, 2026
022e293
add ut for leaf split
ispobock May 2, 2026
5a65728
Fix weight checker float dtype detection
yueming-yuan May 2, 2026
1dc3397
Align weight checker reset handling
yueming-yuan May 2, 2026
d6d1240
Handle DeepSeek KV cache scales in weight checker
yueming-yuan May 3, 2026
60cb388
force DSV4 topk_group=n_group so router takes ungrouped sqrtsoftplus …
hnyls2002 May 3, 2026
591a42f
register deepseek_v4 via DeepseekV3Config subclass alias
hnyls2002 May 3, 2026
87d385c
move allreduce v2 env flag to match main location
hnyls2002 May 3, 2026
bed994e
restore HIP buf_numel_per_page assertion
hnyls2002 May 3, 2026
0f4d53a
restore HIP page_size assertion
hnyls2002 May 3, 2026
57e1b98
restore blank line in combine_a
hnyls2002 May 3, 2026
3f9037a
extract mega-moe to layers/moe/mega_moe.py (#24301)
hnyls2002 May 3, 2026
d6418f4
restore registry log message
hnyls2002 May 3, 2026
0cd8b6e
extract MxFP4 fused RSF+shared_add helper
hnyls2002 May 3, 2026
698797e
revert HIP page_size to main
hnyls2002 May 3, 2026
6bdb5e1
Merge origin/main into dsv4-rebase
hnyls2002 May 3, 2026
62cbbb8
extract dsv4 server-args hooks to arg_groups (#24326)
hnyls2002 May 3, 2026
d564605
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 3, 2026
66fa6ac
sync utils/common.py with main; revert NPU bugfix revert
hnyls2002 May 3, 2026
6025a50
extract hisparse hook to arg_groups
hnyls2002 May 3, 2026
61b26dc
drop SGLANG_ENABLE_THINKING force on HF path
hnyls2002 May 3, 2026
e107dd7
rename SGLANG_ENABLE_THINKING to SGLANG_DEFAULT_THINKING
hnyls2002 May 4, 2026
bc4fab4
rename deepseekv4_memory_pool to deepseek_v4_memory_pool
hnyls2002 May 4, 2026
8e88bb8
rename compress_state to deepseek_v4_compress_state
hnyls2002 May 4, 2026
bfa804b
rebase main with 91fa2340ed
hnyls2002 May 4, 2026
b56d72e
drop SGLANG_OPT_V4_DRAFT_EXTEND_CUDA_GRAPH
hnyls2002 May 4, 2026
ce278b1
rename DeepseekV4BackendRadix to DeepseekV4AttnBackend; drop Radix su…
hnyls2002 May 4, 2026
581c44d
lock deps to cu129 baseline; main adaptation pending
hnyls2002 May 4, 2026
6ea6048
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 4, 2026
009bbc7
move v4 pd pp=1 check to server args
hnyls2002 May 4, 2026
b22221f
revert rocm artifacts from dsv4-rebase (cuda-only scope) (#24339)
hnyls2002 May 4, 2026
df16061
rename mxfp4_deepseek to mxfp4_flashinfer_trtllm_moe
hnyls2002 May 4, 2026
7ce156b
reconcile v4 nixl with main; add v4 dispatch branch
hnyls2002 May 4, 2026
16afced
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 4, 2026
d88c329
move dsv4 metadata init kernel to compressed/
hnyls2002 May 4, 2026
77b5774
rename on_after_cuda_graph_warmup; tidy comments
hnyls2002 May 4, 2026
eecf0fb
rename attention/compressed to compression
hnyls2002 May 4, 2026
70d5e29
drop SGLANG_FIX_MTP_HC_HIDDEN; default-on
hnyls2002 May 4, 2026
e46b1e3
minor: use main
DarkSharpness May 4, 2026
9267502
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 4, 2026
6617c26
rename compressed attn backend to dsv4; deprecate alias
hnyls2002 May 4, 2026
89411ab
rename is_swa_with_compressed_attention -> is_deepseek_v4_arch
hnyls2002 May 4, 2026
be95d76
rename init_compressed_metadata -> init_compression_metadata
hnyls2002 May 4, 2026
d1d3e57
rename SGLANG_REASONING_EFFORT -> SGLANG_DSV4_REASONING_EFFORT
hnyls2002 May 4, 2026
8996c5a
fix SGLANG_DSV4_ISOLATE type: EnvInt -> EnvBool
hnyls2002 May 4, 2026
d39be6d
describe head_dim/block_size asserts in fp8_paged_mqa_logits_torch
hnyls2002 May 4, 2026
8a2a47f
minor: clean up
DarkSharpness May 4, 2026
1127c78
drop DeepseekRefRMSNorm; use standard RMSNorm in Compressor
hnyls2002 May 4, 2026
99e3141
drop unused methods on v4 compress state pool and attn backend
hnyls2002 May 4, 2026
2008b54
drop unused _DSV4_RAW_TYPES constant
hnyls2002 May 4, 2026
dc0c318
move Compressor/C4Indexer nn.Modules to layers/attention/compression
hnyls2002 May 4, 2026
470a380
rename Compressor/C4IndexerBackend to *BackendMixin
hnyls2002 May 4, 2026
48f7b07
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 4, 2026
a96fd95
import ReplicatedLinear from layers.linear instead of models.dbrx
hnyls2002 May 4, 2026
f325ce6
describe magic-number asserts in dsv4 attn / compressor / pool
hnyls2002 May 4, 2026
618317a
drop unused create_flashmla_metadata helper
hnyls2002 May 4, 2026
23eab17
rename layers/attention/compression to layers/attention/dsv4
hnyls2002 May 4, 2026
8f36a94
unify Dsv4/DSv4 mixed-case to DSV4
hnyls2002 May 4, 2026
6ce24a8
rename is_v4_model to is_dsv4_model
hnyls2002 May 4, 2026
88c5754
consolidate copy_metadata: drop duplicate _copy_metadata in v4 backend
hnyls2002 May 4, 2026
fcda6fc
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 4, 2026
884eaeb
use _skip_weight_check marker
yueming-yuan May 4, 2026
3f55938
upd
Fridge003 May 4, 2026
0f27c21
upd
Fridge003 May 4, 2026
3bf02ce
align c4 hisparse translate_loc names with NSA
hnyls2002 May 4, 2026
38827bf
DeepGemm fixes for v4 rebasing (#24399)
Fridge003 May 5, 2026
17591f3
sync attention/utils.py with main
hnyls2002 May 5, 2026
62969b3
fix glm4-moe-lite missing is_hash attr
hnyls2002 May 5, 2026
ed35a74
minor: remove flag
DarkSharpness May 5, 2026
40f07b0
minor: remove seemingly dangerous flags
DarkSharpness May 5, 2026
85c63af
restore hisparse comments to align with main
hnyls2002 May 5, 2026
024911f
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 5, 2026
c398860
drop duplicate maybe_collect_indexer_topk left by merge
hnyls2002 May 5, 2026
66b9453
drop duplicate indexer_topk kwarg left by merge
hnyls2002 May 5, 2026
d688b21
isort dsv4/indexer.py imports
hnyls2002 May 5, 2026
e11e351
tiny recover comments
hnyls2002 May 6, 2026
25a9fab
rename state_type v4 -> dsv4
hnyls2002 May 6, 2026
a9ef99b
guard self.is_hash with getattr; drop glm4 patch
hnyls2002 May 6, 2026
7609096
move nsa _v4 artifacts to attention/dsv4
hnyls2002 May 6, 2026
fa3e030
extract dsv4 paged_mqa_logits to attention/dsv4/tilelang_kernel
hnyls2002 May 6, 2026
1b38b68
drop SGLANG_OPT_DG_PAGED_MQA_LOGITS_CHUNK_SIZE
hnyls2002 May 6, 2026
bcca716
Merge remote-tracking branch 'origin/main' into dsv4-rebase
hnyls2002 May 6, 2026
4e0b431
drop SGLANG_DSV4_ISOLATE
hnyls2002 May 6, 2026
8fa2b96
fixme: hisparse negative pool counter clamp
hnyls2002 May 6, 2026
584c86f
Remove dsv4 compress_state dead code (#24472)
hnyls2002 May 6, 2026
3593e22
restore non-dsv4 silu_and_mul fallback; gate dsv4 by swiglu_limit
hnyls2002 May 6, 2026
976a43d
remove requirement on fast hadamard for sm103
Fridge003 May 6, 2026
b0df768
tiny
hnyls2002 May 6, 2026
28dae6f
remove dead non-triton k cache quant
hnyls2002 May 6, 2026
c3d8770
drop dead swa_page_size and unused fields on CompressStatePool
hnyls2002 May 6, 2026
3039f4f
Merge branch 'main' into dsv4-rebase
hnyls2002 May 6, 2026
b69a724
drop duplicate TestDeepSeekV4Detector from rebase
hnyls2002 May 6, 2026
a3b4935
minor: remove selector
DarkSharpness May 6, 2026
72679d7
fix has_attention_sinks unset for non-hybrid-swa
hnyls2002 May 6, 2026
973f098
split hisparse mla swap-in by buffer layout
hnyls2002 May 6, 2026
087d83b
small upgrade sglang-kernel version
Fridge003 May 6, 2026
3ffc34d
fix non-dsv4 cuda graph replay typeerror
hnyls2002 May 6, 2026
a17de73
try revert
hnyls2002 May 6, 2026
a5fff9b
guard on_after_cuda_graph_warmup for non-dsv4 draft backends
hnyls2002 May 6, 2026
5760cc1
Merge branch 'main' into dsv4-rebase
hnyls2002 May 6, 2026
c918888
fix swa eviction test mock req
hnyls2002 May 6, 2026
dba0adc
restore admit_request_direct on HiSparseCoordinator
hnyls2002 May 6, 2026
7298f63
Merge branch 'main' into dsv4-rebase
hnyls2002 May 6, 2026
4acbcca
gate masked deep_gemm V4 path on swiglu_limit
hnyls2002 May 6, 2026
fce5ad6
fix _postprocess_tensors test calls
hnyls2002 May 6, 2026
c822e91
Merge branch 'main' into dsv4-rebase
hnyls2002 May 6, 2026
23e62ce
fix _make_req mock missing seqlen
hnyls2002 May 6, 2026
c39f6d9
restore naive_load_topk on HiSparseCoordinator
hnyls2002 May 7, 2026
1875af4
fix: NSA prefill context parallel crash (dsv4-rebase) (#24560)
yhyang201 May 7, 2026
900eac3
Merge branch 'main' into dsv4-rebase
hnyls2002 May 7, 2026
cfdce82
Merge remote-tracking branch 'origin/dsv4-rebase' into dsv4-rebase
hnyls2002 May 7, 2026
1623ce0
Merge remote-tracking branch 'origin' into dsv4-rebase
hnyls2002 May 7, 2026
6efeee8
Port MXFP4 Marlin MoE support to JIT kernel path (#24490)
yhyang201 May 7, 2026
0826876
gate jit mask_topk_ids; default off
hnyls2002 May 7, 2026
f490e24
Merge remote-tracking branch 'origin' into dsv4-rebase
hnyls2002 May 7, 2026
cbcd3a3
Inline SGLANG_OPT_MXFP4_FUSE_RSF_SHARED_ADD (default True)
fzyzcjy May 7, 2026
2267007
Inline SGLANG_OPT_MXFP4_STATIC_SCALE_ONES (default True)
fzyzcjy May 7, 2026
5d8779f
Inline SGLANG_OPT_MXFP4_SKIP_DISPATCHER_MAPPING (default True)
fzyzcjy May 7, 2026
f64d12f
Remove unused SGLANG_FIX_ATTN_BACKEND_IDLE env var
fzyzcjy May 7, 2026
1bc545e
Inline SGLANG_FIX_PD_IDLE (default True)
fzyzcjy May 7, 2026
fe60306
Drop unused envs import in mxfp4_flashinfer_trtllm_moe.py
fzyzcjy May 7, 2026
0d7379f
Revert "gate jit mask_topk_ids; default off"
hnyls2002 May 7, 2026
3a469e5
fix lint
hnyls2002 May 7, 2026
a32a55c
Merge remote-tracking branch 'upstream/dsv4-rebase' into dsv4-rebase
fzyzcjy May 7, 2026
4c3689b
fix error when model has no swiglu limit
fzyzcjy May 7, 2026
47d6c2b
fix AMD build: guard enable_cluster with USE_ROCM
fzyzcjy May 7, 2026
9e45b2d
add comments
fzyzcjy May 7, 2026
7fde762
fix _mask_topk_ids_padded_region
fzyzcjy May 7, 2026
37f4a77
fmt code
fzyzcjy May 7, 2026
f54b6ee
fix: cast kv_compressed to bf16 before rotate_activation in compressor
yhyang201 May 7, 2026
dfaf8a9
add B200 CI tests for DSV4 Flash FP4 (per-commit + nightly)
yhyang201 May 7, 2026
f81a88a
add H200 CI test for DSV4 Flash FP4 Marlin (per-commit sanity + GSM8K)
yhyang201 May 7, 2026
97d22df
fix glm5 grouped topk; route by model not n_group>topk_group
hnyls2002 May 7, 2026
ab450c6
topk init: explicit main default + dsv4 override
hnyls2002 May 7, 2026
b481bf0
minor adjust for hisparse
xiezhq-hermann May 7, 2026
644dee6
[CI] Add flash_mla installation script for dsv4 ci tests (#24634)
Fridge003 May 7, 2026
7978aa7
Merge branch 'main' into dsv4-rebase
Fridge003 May 7, 2026
rebase main with 658a281
hnyls2002 committed Apr 29, 2026
commit 5fc88fc9bc758a165f1e6fa689a8818d76e82222
266 changes: 0 additions & 266 deletions python/sglang/srt/layers/attention/nsa/tilelang_kernel.py
@@ -45,23 +45,6 @@ def fast_round_scale(amax, fp8_max_inv):
return fast_pow2(fast_log2_ceil(amax * fp8_max_inv))


@lru_cache(maxsize=8)
def _pick_inner_iter(seq: int, ni: int, cu: int, block_per_cu: int) -> int:
"""
Pick the largest valid inner_iter (power-of-two divisor of ni) that keeps
enough work per CU (seq * ni / inner_iter / cu >= block_per_cu), so we avoid
under-utilization while minimizing the number of partial groups.
"""

max_it = int(seq * ni / (cu * block_per_cu))
it = ni
while it >= 2:
if it <= max_it and ni % it == 0:
return it
it //= 2
return 1
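
For intuition about the heuristic above, two worked cases, assuming `_pick_inner_iter` as defined in this hunk (the CU count and block budget below are hypothetical, not measured values):

# Plenty of work: max_it = int(4096 * 8 / (304 * 4)) = 26, and it=8
# satisfies it <= 26 and ni % it == 0, so the whole ni forms one group.
assert _pick_inner_iter(seq=4096, ni=8, cu=304, block_per_cu=4) == 8

# Tiny sequence: max_it = int(16 * 8 / (304 * 4)) = 0, so no power-of-two
# candidate fits -> inner_iter = 1, i.e. the maximum number of groups,
# trading partial-result combines for CU occupancy.
assert _pick_inner_iter(seq=16, ni=8, cu=304, block_per_cu=4) == 1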


@tilelang.jit(pass_configs=pass_configs)
def act_quant_kernel(
N, in_dtype=BF16, out_dtype=FP8, scale_dtype=FP32, round_scale=False
@@ -1024,255 +1007,6 @@ def main(
return main


@tilelang.jit(out_idx=[-2, -1], pass_configs=pass_configs)
def sparse_mla_fwd_decode_partial_fp8(
num_heads: int,
d_v: int,
d_tail: int,
topk: int,
*,
sm_scale=None,
block_I=64,
inner_iter=1,
threads=256,
):
assert d_v == 512, f"only support d_v=512"
assert (
topk % block_I == 0
), "otherwise will load some index=0 thus causing wrong kv to be loaded"

# Softmax scores are in [0, 1]. We scale by fp8_max_val before FP8 cast
# to better utilize FP8 dynamic range, then apply the inverse scale after GEMM.
# This is numerically safe because softmax output is bounded by 1.
fp8_dtype = "float8_e4m3fnuz" if _is_fp8_fnuz else "float8_e4m3fn"
fp8_max_val = 240.0 if _is_fp8_fnuz else 448.0
s_inv_scale_const = fp8_max_val
s_scale_const = 1.0 / fp8_max_val

BI = block_I
group_size = 128
dim_quant_fp8 = d_v + d_tail
rope_offset_fp8 = d_v
n_groups = topk // (BI * inner_iter)

if sm_scale is None:
sm_scale = (1.0 / (d_v + d_tail)) ** 0.5 * 1.44269504
else:
sm_scale = sm_scale * 1.44269504

h_per_block = 16
# Match bf16 partial behavior: keep fixed 16-head tiles and use
# sliced T.copy on H0:H1 for tail handling.
assert (
num_heads <= h_per_block or num_heads % h_per_block == 0
), "num_heads must be <=16 or divisible by 16"
head_blocks_per_seq = (num_heads + h_per_block - 1) // h_per_block

batch = 1
kv_group = 1
seq_len = T.symbolic("seq_len")
num_pages = T.symbolic("num_pages")

q_fp8_shape = [batch, seq_len, num_heads, d_v + d_tail]
kv_fp8_shape = [batch, num_pages, kv_group, dim_quant_fp8]
idx_shape = [batch, seq_len, kv_group, topk]
partial_o_shape = [batch, seq_len, n_groups, num_heads, d_v]
partial_lse_shape = [batch, seq_len, n_groups, num_heads]

accum_dtype = T.float32
dtype_bf16 = T.bfloat16

@T.prim_func
def main(
q_fp8: T.Tensor(q_fp8_shape, fp8_dtype),
kv_fp8: T.Tensor(kv_fp8_shape, fp8_dtype),
indices: T.Tensor(idx_shape, T.int32),
partial_o: T.Tensor(partial_o_shape, dtype_bf16),
partial_lse: T.Tensor(partial_lse_shape, accum_dtype),
):
with T.Kernel(seq_len * head_blocks_per_seq, n_groups, threads=threads) as (
bx,
by,
):
b_i, g_i = 0, 0
s_i = bx // head_blocks_per_seq
group_i = by
H0 = (bx % head_blocks_per_seq) * h_per_block
H1 = H0 + h_per_block

# We intentionally split the K=512 GEMM into 4x128 tiles.
# Although this adds extra intermediate memory traffic,
# it shortens the MFMA accumulation dependency chain and improves performance.
q_tile0 = T.alloc_shared([h_per_block, group_size], fp8_dtype)
q_tile1 = T.alloc_shared([h_per_block, group_size], fp8_dtype)
q_tile2 = T.alloc_shared([h_per_block, group_size], fp8_dtype)
q_tile3 = T.alloc_shared([h_per_block, group_size], fp8_dtype)
kv_tile0 = T.alloc_shared([BI, group_size], fp8_dtype)
kv_tile1 = T.alloc_shared([BI, group_size], fp8_dtype)
kv_tile2 = T.alloc_shared([BI, group_size], fp8_dtype)
kv_tile3 = T.alloc_shared([BI, group_size], fp8_dtype)
q_tail_buf = T.alloc_shared([h_per_block, d_tail], fp8_dtype)
k_tail_shared = T.alloc_shared([BI, d_tail], fp8_dtype)
s_fp8_shared = T.alloc_shared([h_per_block, BI], fp8_dtype)
page_idx_shared = T.alloc_shared([BI], T.int32)

mask = T.alloc_fragment([BI], T.bool)
acc_s = T.alloc_fragment([h_per_block, BI], accum_dtype)
acc_tile = T.alloc_fragment([h_per_block, BI], accum_dtype)
sv_tile = T.alloc_fragment([h_per_block, group_size], accum_dtype)
sumexp = T.alloc_fragment([h_per_block], accum_dtype)
sumexp_i = T.alloc_fragment([h_per_block], accum_dtype)
alpha = T.alloc_fragment([h_per_block], accum_dtype)
m_i = T.alloc_fragment([h_per_block], accum_dtype)
m_i_prev = T.alloc_fragment([h_per_block], accum_dtype)
inv_denom = T.alloc_fragment([h_per_block], accum_dtype)

acc_o_tile0 = T.alloc_fragment([h_per_block, group_size], accum_dtype)
acc_o_tile1 = T.alloc_fragment([h_per_block, group_size], accum_dtype)
acc_o_tile2 = T.alloc_fragment([h_per_block, group_size], accum_dtype)
acc_o_tile3 = T.alloc_fragment([h_per_block, group_size], accum_dtype)

T.fill(acc_o_tile0, 0)
T.fill(acc_o_tile1, 0)
T.fill(acc_o_tile2, 0)
T.fill(acc_o_tile3, 0)
T.fill(sumexp, 0)
T.fill(m_i, -(2**30))

T.copy(q_fp8[b_i, s_i, H0:H1, d_v:], q_tail_buf)
T.copy(q_fp8[b_i, s_i, H0:H1, 0 * group_size : 1 * group_size], q_tile0)
T.copy(q_fp8[b_i, s_i, H0:H1, 1 * group_size : 2 * group_size], q_tile1)
T.copy(q_fp8[b_i, s_i, H0:H1, 2 * group_size : 3 * group_size], q_tile2)
T.copy(q_fp8[b_i, s_i, H0:H1, 3 * group_size : 4 * group_size], q_tile3)

for k_i in T.serial(inner_iter):
topk_block_i = group_i * inner_iter + k_i

for bi_i in T.Parallel(BI):
idx = indices[b_i, s_i, g_i, topk_block_i * BI + bi_i]
valid = idx >= 0
page_idx_shared[bi_i] = T.if_then_else(valid, idx, 0)
mask[bi_i] = valid

for bi_i, j in T.Parallel(BI, group_size):
page = page_idx_shared[bi_i]
kv_tile0[bi_i, j] = kv_fp8[b_i, page, g_i, 0 * group_size + j]
kv_tile1[bi_i, j] = kv_fp8[b_i, page, g_i, 1 * group_size + j]
kv_tile2[bi_i, j] = kv_fp8[b_i, page, g_i, 2 * group_size + j]
kv_tile3[bi_i, j] = kv_fp8[b_i, page, g_i, 3 * group_size + j]

for bi_i, j in T.Parallel(BI, d_tail):
page = page_idx_shared[bi_i]
k_tail_shared[bi_i, j] = kv_fp8[b_i, page, g_i, rope_offset_fp8 + j]

for h_i, bi_i in T.Parallel(h_per_block, BI):
acc_s[h_i, bi_i] = T.if_then_else(
mask[bi_i], 0, -T.infinity(acc_s.dtype)
)

T.gemm(q_tile0, kv_tile0, acc_s, transpose_B=True, clear_accum=False)
T.gemm(q_tile1, kv_tile1, acc_tile, transpose_B=True, clear_accum=True)
for h_i, bi_i in T.Parallel(h_per_block, BI):
acc_s[h_i, bi_i] += acc_tile[h_i, bi_i]
T.gemm(q_tile2, kv_tile2, acc_tile, transpose_B=True, clear_accum=True)
for h_i, bi_i in T.Parallel(h_per_block, BI):
acc_s[h_i, bi_i] += acc_tile[h_i, bi_i]
T.gemm(q_tile3, kv_tile3, acc_tile, transpose_B=True, clear_accum=True)
for h_i, bi_i in T.Parallel(h_per_block, BI):
acc_s[h_i, bi_i] += acc_tile[h_i, bi_i]
T.gemm(
q_tail_buf,
k_tail_shared,
acc_s,
transpose_B=True,
policy=T.GemmWarpPolicy.FullCol,
)

T.copy(m_i, m_i_prev)
T.reduce_max(acc_s, m_i, dim=1, clear=False)
for h_i in T.Parallel(h_per_block):
alpha[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale)
for h_i, bi_i in T.Parallel(h_per_block, BI):
acc_s[h_i, bi_i] = T.exp2(
acc_s[h_i, bi_i] * sm_scale - m_i[h_i] * sm_scale
)
T.reduce_sum(acc_s, sumexp_i, dim=1)
for h_i in T.Parallel(h_per_block):
sumexp[h_i] = sumexp[h_i] * alpha[h_i] + sumexp_i[h_i]
for h_i, j in T.Parallel(h_per_block, group_size):
acc_o_tile0[h_i, j] = acc_o_tile0[h_i, j] * alpha[h_i]
acc_o_tile1[h_i, j] = acc_o_tile1[h_i, j] * alpha[h_i]
acc_o_tile2[h_i, j] = acc_o_tile2[h_i, j] * alpha[h_i]
acc_o_tile3[h_i, j] = acc_o_tile3[h_i, j] * alpha[h_i]

for h_i, bi_i in T.Parallel(h_per_block, BI):
s_fp8_shared[h_i, bi_i] = T.clamp(
acc_s[h_i, bi_i] * s_inv_scale_const,
-fp8_max_val,
fp8_max_val,
)
T.gemm(s_fp8_shared, kv_tile0, sv_tile, clear_accum=True)
for h_i, j in T.Parallel(h_per_block, group_size):
acc_o_tile0[h_i, j] = (
acc_o_tile0[h_i, j] + sv_tile[h_i, j] * s_scale_const
)

T.gemm(s_fp8_shared, kv_tile1, sv_tile, clear_accum=True)
for h_i, j in T.Parallel(h_per_block, group_size):
acc_o_tile1[h_i, j] = (
acc_o_tile1[h_i, j] + sv_tile[h_i, j] * s_scale_const
)

T.gemm(s_fp8_shared, kv_tile2, sv_tile, clear_accum=True)
for h_i, j in T.Parallel(h_per_block, group_size):
acc_o_tile2[h_i, j] = (
acc_o_tile2[h_i, j] + sv_tile[h_i, j] * s_scale_const
)

T.gemm(s_fp8_shared, kv_tile3, sv_tile, clear_accum=True)
for h_i, j in T.Parallel(h_per_block, group_size):
acc_o_tile3[h_i, j] = (
acc_o_tile3[h_i, j] + sv_tile[h_i, j] * s_scale_const
)

for h_i in T.Parallel(h_per_block):
denom = T.if_then_else(sumexp[h_i] == 0.0, 1.0, sumexp[h_i])
inv_denom[h_i] = 1.0 / denom
for h_i, j in T.Parallel(h_per_block, group_size):
acc_o_tile0[h_i, j] = acc_o_tile0[h_i, j] * inv_denom[h_i]
acc_o_tile1[h_i, j] = acc_o_tile1[h_i, j] * inv_denom[h_i]
acc_o_tile2[h_i, j] = acc_o_tile2[h_i, j] * inv_denom[h_i]
acc_o_tile3[h_i, j] = acc_o_tile3[h_i, j] * inv_denom[h_i]

for h_i in T.Parallel(h_per_block):
sumexp[h_i] = T.if_then_else(
sumexp[h_i] == 0.0,
-(2**30),
T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale,
)

T.copy(
acc_o_tile0,
partial_o[b_i, s_i, group_i, H0:H1, 0 * group_size : 1 * group_size],
)
T.copy(
acc_o_tile1,
partial_o[b_i, s_i, group_i, H0:H1, 1 * group_size : 2 * group_size],
)
T.copy(
acc_o_tile2,
partial_o[b_i, s_i, group_i, H0:H1, 2 * group_size : 3 * group_size],
)
T.copy(
acc_o_tile3,
partial_o[b_i, s_i, group_i, H0:H1, 3 * group_size : 4 * group_size],
)

T.copy(sumexp, partial_lse[b_i, s_i, group_i, H0:H1])

return main
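
The scale-before-cast trick described in the comment near the top of this deleted kernel can be checked outside tilelang. A minimal PyTorch sketch (PyTorch >= 2.1 for float8 dtypes; the shapes are illustrative, not the kernel's tile sizes):

import torch

fp8_max = 448.0  # float8_e4m3fn; the kernel uses 240.0 on fnuz (HIP) hardware
h, k, d = 16, 64, 128

scores = torch.softmax(torch.randn(h, k), dim=-1)  # bounded by 1, as the comment notes
v = torch.randn(k, d)

# Scale into the FP8 range before the cast, invert after the GEMM.
s_fp8 = (scores * fp8_max).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
out = (s_fp8.to(torch.float32) @ v) / fp8_max

# Without the scale, small probabilities fall into e4m3's coarse low range.
ref = scores @ v
print((out - ref).abs().max())  # small: the cast now uses the full dynamic range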


def tilelang_sparse_fwd(
q: torch.Tensor,
kv: torch.Tensor,
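Not shown in this hunk is how callers merge the per-group `partial_o` / `partial_lse` the deleted kernel emits. Assuming the standard log2-domain split-K combine (which matches the kernel's `log2(sumexp) + m_i * sm_scale` encoding and its `-(2**30)` sentinel for empty groups), a plausible reduction looks like:

import torch

def combine_partials(partial_o: torch.Tensor, partial_lse: torch.Tensor) -> torch.Tensor:
    """partial_o: [n_groups, heads, d_v], each group already normalized by its
    own sumexp; partial_lse: [n_groups, heads] in the log2 domain.
    Assumes at least one non-empty group per head (the kernel guards the
    zero-mass case separately via its sumexp == 0 branch)."""
    lse_max = partial_lse.max(dim=0, keepdim=True).values  # [1, heads]
    w = torch.exp2(partial_lse - lse_max)                  # empty groups underflow to 0
    o = (partial_o.float() * w.unsqueeze(-1)).sum(dim=0)   # [heads, d_v]
    return o / w.sum(dim=0).unsqueeze(-1)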
49 changes: 1 addition & 48 deletions python/sglang/srt/managers/hisparse_coordinator.py
@@ -50,7 +50,7 @@ def __init__(
self.mem_pool_host.kv_cache_total_dim * self.mem_pool_host.dtype.itemsize
)

max_num_reqs = req_to_token_pool.req_to_token.shape[0]
max_num_reqs = req_to_token_pool.size
max_context_len = req_to_token_pool.max_context_len

self.padded_buffer_size = (
@@ -163,53 +163,6 @@ def admit_request_into_staging(self, req: Req) -> None:

self.ack_staging_queue.append(HiSparseAct(start_event, finish_event, req))

def admit_request_direct(self, req: Req) -> None:
"""Direct-to-host path: KV data already resides in host pool via RDMA.

Skips staging DMA entirely. Only allocates a small device buffer
(4KB) for decode-time swap-in, then marks the request as ready.
Host indices were already written to req_to_host_pool.

Metadata fixups after alloc_device_buffer():
- alloc_device_buffer() sets device_buffer_tokens = [0, 1, ..., buf_size-1],
which tells the swap-in kernel that those tokens are cached in the device
buffer. In the staging path this is correct (prefill filled the buffer),
but here the buffer is empty.
"""
self.alloc_device_buffer(req)

if req.kv_allocated_len <= self.device_buffer_size:
# Short sequences (seq_len <= device_buffer_size): the kernel fast path
# returns device_buffer_locs directly without any host loading, so we
# must preload all tokens from host pool into the device buffer
# TODO(hzh0425): Optimize this.
self._preload_to_device_buffer(req)
else:
# Long sequence: reset device_buffer_tokens to -1 so the kernel
# sees all slots as empty → every top-k lookup is a miss → host load.
self.req_device_buffer_tokens[
:, req.req_pool_idx, : self.device_buffer_size
] = -1

req.staging = False
self._skip_first_backup[req.req_pool_idx] = True
logger.debug("HiSparse: admitting request %s directly", req.rid)

def _preload_to_device_buffer(self, req: Req) -> None:
"""Preload all tokens from host pool into the device buffer."""
n = req.kv_allocated_len
host_indices = self.req_to_host_pool[req.req_pool_idx, :n]
device_locs = self.req_to_device_buffer[req.req_pool_idx, :n]

for layer_id in range(self.mem_pool_device.layer_num):
self.mem_pool_host.load_to_device_per_layer(
self.mem_pool_device,
host_indices,
device_locs,
layer_id,
io_backend="kernel",
)

def alloc_device_buffer(self, req: Req) -> None:
prefill_len = len(req.fill_ids)
compressed_logical_indices = (
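The `device_buffer_tokens` contract described in the deleted docstring can be sketched outside the kernel. This is an assumption about the swap-in kernel's lookup semantics (the kernel itself is not reproduced here); the function name is hypothetical:

import torch

def device_buffer_hits(topk_tokens: torch.Tensor, device_buffer_tokens: torch.Tensor):
    """Hit mask over top-k token positions: True where the token already
    resides in the device buffer. -1 slots never match, so resetting the
    buffer to -1 (the long-sequence direct path above) forces every lookup
    to miss and therefore a host load."""
    resident = device_buffer_tokens[device_buffer_tokens >= 0]
    return (topk_tokens.unsqueeze(-1) == resident).any(dim=-1)

buf = torch.full((8,), -1, dtype=torch.long)          # buffer after the -1 reset
print(device_buffer_hits(torch.tensor([3, 5]), buf))  # tensor([False, False])

buf[:4] = torch.arange(4)                             # staging path: tokens 0..3 cached
print(device_buffer_hits(torch.tensor([3, 5]), buf))  # tensor([ True, False])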
1 change: 1 addition & 0 deletions python/sglang/srt/managers/schedule_batch.py
@@ -822,6 +822,7 @@ def __init__(
# For diffusion LLM
self.init_diffusion_llm(dllm_config)

# For hisparse
self.hisparse_staging = False

@property
23 changes: 0 additions & 23 deletions python/sglang/srt/mem_cache/hisparse_memory_pool.py
@@ -208,29 +208,6 @@ def alloc(self, need_size: int):
"Page size = 1 is not supported in HiSparse allocator"
)

def alloc_logical_only(
self,
prefix_lens: torch.Tensor,
prefix_lens_cpu: torch.Tensor,
seq_lens: torch.Tensor,
seq_lens_cpu: torch.Tensor,
last_loc: torch.Tensor,
extend_num_tokens: int,
):
"""Allocate only logical indices without hisparse device indices.

Used in the direct-to-host transfer path where KV data is written
directly to host memory by the prefill node, skipping GPU staging.
"""
return self.logical_attn_allocator.alloc_extend(
prefix_lens,
prefix_lens_cpu,
seq_lens,
seq_lens_cpu,
last_loc,
extend_num_tokens,
)

def alloc_device_buffer(self, allocated_indices, need_size: int):
assert need_size % self.page_size == 0
hisparse_indices = self.full_to_hisparse_device_index_mapping[allocated_indices]
You are viewing a condensed version of this merge commit.