diff --git a/python/sglang/jit_kernel/csrc/deepseek_v4/topk_v2.cuh b/python/sglang/jit_kernel/csrc/deepseek_v4/topk_v2.cuh
index 8c4a526575ea..24ca26fd1b21 100644
--- a/python/sglang/jit_kernel/csrc/deepseek_v4/topk_v2.cuh
+++ b/python/sglang/jit_kernel/csrc/deepseek_v4/topk_v2.cuh
@@ -465,7 +465,17 @@ struct CombinedTopKKernel {
           .enable_pdl(true)(kernel, params);
     } else {
       // Some items may be large -- launch stage-1 + main
-      if (batch_size <= kNumClusters) {
+      // SM120 (consumer Blackwell, CC 12.0) has only ~99KB shared memory per block.
+      // kStage1SMEM (~144KB) exceeds this limit, so skip the cluster path on SM120
+      // and use only the Medium/Small (stage-2) paths which fit in ~84KB.
+      int device_cc_major = 0;
+      {
+        cudaDeviceProp prop;
+        cudaGetDeviceProperties(&prop, 0);
+        device_cc_major = prop.major;
+      }
+      const bool is_sm120 = (device_cc_major == 12);
+      if (!is_sm120 && batch_size <= kNumClusters) {
         // can fuse into 1 stage
         constexpr auto kernel = topk_fused_transform;
         constexpr auto kSMEM = std::max(kStage1SMEM, kStage2SMEM);
@@ -473,7 +483,7 @@ struct CombinedTopKKernel {
         LaunchKernel({batch_size, kClusterSize}, kBlockSize, device, kSMEM)
             .enable_cluster({1, kClusterSize})
             .enable_pdl(true)(kernel, params);
-      } else {
+      } else if (!is_sm120) {
         // stage 1 + stage 2
         constexpr auto kernel_stage_1 = topk_combine_preprocess;
         setup_kernel_smem_once<kernel_stage_1, kStage1SMEM>();
@@ -485,6 +495,12 @@ struct CombinedTopKKernel {
         setup_kernel_smem_once<kernel_stage_2, kStage2SMEM>();
         LaunchKernel(batch_size, kBlockSize, device, kStage2SMEM)  //
             .enable_pdl(true)(kernel_stage_2, params);
+      } else {
+        // SM120 fallback: use only stage-2 (Small/Medium) path which fits in ~84KB
+        constexpr auto kernel = topk_short_transform;
+        setup_kernel_smem_once<kernel, kStage2SMEM>();
+        LaunchKernel(batch_size, kBlockSize, device, kStage2SMEM)
+            .enable_pdl(true)(kernel, params);
       }
     }
   }
diff --git a/python/sglang/jit_kernel/deepseek_v4.py b/python/sglang/jit_kernel/deepseek_v4.py
index 0878f45e9c56..df79156df5f9 100644
--- a/python/sglang/jit_kernel/deepseek_v4.py
+++ b/python/sglang/jit_kernel/deepseek_v4.py
@@ -418,6 +418,9 @@ def hash_topk(
         (num_tokens, topk_fused), dtype=torch.float32, device=router_logits.device
     )
     module = _jit_hash_topk_module()
+    # tvm_ffi hash_topk kernel expects input_ids as int64
+    if input_ids.dtype != torch.int64:
+        input_ids = input_ids.to(torch.int64)
     module.hash_topk(
         router_logits,
         input_ids,
diff --git a/python/sglang/jit_kernel/utils.py b/python/sglang/jit_kernel/utils.py
index a073f0493231..71f5378d32f3 100644
--- a/python/sglang/jit_kernel/utils.py
+++ b/python/sglang/jit_kernel/utils.py
@@ -169,7 +169,13 @@ def wrapper(*args, **kwargs):
 
 @cache_once
 def is_arch_support_pdl() -> bool:
+    """PDL (Programmatic Dependent Launch) is available on SM90+ GPUs.
+
+    Available on all architectures with compute capability >= 9.0, including
+    Hopper (SM90), Blackwell datacenter (SM100), and Blackwell consumer (SM120).
+    """
     import torch
 
     device = torch.cuda.current_device()
-    return torch.cuda.get_device_capability(device)[0] >= 9
+    major = torch.cuda.get_device_capability(device)[0]
+    return major >= 9
diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py
index 949af5c320ac..bca57a1dfd1a 100644
--- a/python/sglang/srt/environ.py
+++ b/python/sglang/srt/environ.py
@@ -541,6 +541,10 @@ class Envs:
     SGLANG_TOPK_TRANSFORM_512_TORCH = EnvBool(False)
     SGLANG_FP8_PAGED_MQA_LOGITS_TORCH = EnvBool(False)
 
+    # SM120-optimized Triton kernels (auto-enabled on SM120, set to 0 to disable)
+    SGLANG_SM120_TRITON_MOE = EnvBool(True)
+    SGLANG_SM120_TRITON_FLASHMLA = EnvBool(True)
+
     # Symmetric Memory
     SGLANG_SYMM_MEM_PREALLOC_GB_SIZE = EnvInt(-1)
 
diff --git a/python/sglang/srt/layers/attention/compressed/indexer.py b/python/sglang/srt/layers/attention/compressed/indexer.py
index 27d7678eea9e..59fa8cf74d57 100644
--- a/python/sglang/srt/layers/attention/compressed/indexer.py
+++ b/python/sglang/srt/layers/attention/compressed/indexer.py
@@ -45,44 +45,64 @@ def fp8_paged_mqa_logits_torch(
     max_seq_len: int,
     clean_logits: bool = True,
 ) -> torch.Tensor:
+    """CUDA-graph compatible FP8 paged MQA logits.
+
+    Retains the original per-batch loop structure for correctness, but replaces
+    .item() calls with GPU-only ops. For decode (bs=1), the loop runs once.
+    """
     _ = deep_gemm_metadata
     batch_size, _, num_heads, head_dim = q_fp8.shape
-    block_size = kvcache_fp8.shape[1]
-
-    assert head_dim == 128, "TODO"
-    assert block_size == 64, "TODO"
-    assert q_fp8.shape == (batch_size, 1, num_heads, head_dim)
-    assert kvcache_fp8.shape[1:] == (block_size, 1, head_dim + 4)
-    assert weight.shape == (batch_size, num_heads)
-    assert seq_lens.shape == (batch_size,)
-    assert page_table.shape[0] == batch_size
-    assert clean_logits == False
-
-    logits = page_table.new_empty((batch_size, max_seq_len), dtype=torch.float32)
+    block_size = kvcache_fp8.shape[1]  # 64
+
+    if seq_lens.dim() == 2 and seq_lens.shape[1] == 1:
+        seq_lens = seq_lens.squeeze(-1)
+
+    logits = page_table.new_zeros((batch_size, max_seq_len), dtype=torch.float32)
+    SCALE_OFFSET = block_size * head_dim
+
+    # Use max_pages from page_table columns
+    max_pages_in_table = page_table.shape[1]
+    # Pre-compute arange for valid token masking (avoid re-creation per batch)
+    token_arange = torch.arange(max_seq_len, device=seq_lens.device)
+
+    kvcache_flat = kvcache_fp8.reshape(-1, block_size * (head_dim + 4))
+    num_total_pages = kvcache_flat.shape[0]
+
     for i in range(batch_size):
-        q = q_fp8[i, 0]
-        q = q.to(torch.float32)
-        q_scale = weight[i]
-        seq_len = int(seq_lens[i].item())
-        assert seq_len <= max_seq_len
-        num_pages = (seq_len + block_size - 1) // block_size
-        padded_seq_len = num_pages * block_size
-        pages = page_table[i, :num_pages]
-        kvcache_fp8 = kvcache_fp8.view(-1, block_size * (head_dim + 4))
-        kvcache = kvcache_fp8[pages]
-        SCALE_OFFSET = block_size * head_dim
-        kvcache_value = kvcache[..., :SCALE_OFFSET].view(dtype=FP8_DTYPE)
-        kvcache_scale = kvcache[..., SCALE_OFFSET:].view(dtype=torch.float32)
-        kvcache_value = kvcache_value.to(torch.float32)
-        kvcache_scale = kvcache_scale.contiguous()
-        kvcache_value = kvcache_value.view(padded_seq_len, head_dim)
-        kvcache_scale = kvcache_scale.view(padded_seq_len)
-        score = F.linear(kvcache_value, q)
-        score = F.relu(score)
-        score *= q_scale[None, :]
-        score = score.sum(dim=1)
-        score *= kvcache_scale
-        logits[i, :seq_len] = score[:seq_len]
+        q = q_fp8[i, 0].to(torch.float32)  # (num_heads, head_dim)
+        q_scale = weight[i]  # (num_heads,)
+
+        # Gather pages for this batch item (GPU-only, no .item())
+        pages = page_table[i].clamp(0, num_total_pages - 1)  # (max_pages,)
+        kvcache = kvcache_flat[pages]  # (max_pages, block_size * (head_dim + 4))
+
+        # Split value and scale
+        kvcache_value = kvcache[..., :SCALE_OFFSET].reshape(-1, head_dim)
+        kvcache_value = kvcache_value.view(FP8_DTYPE).to(torch.float32)
+        kvcache_scale = (
+            kvcache[..., SCALE_OFFSET:].contiguous().view(torch.float32).reshape(-1)
+        )
+
+        padded_len = kvcache_value.shape[0]
+
+        # Score: F.linear(Q_flat, K) -> (padded_len, num_heads)
+        score = F.linear(kvcache_value, q)  # (padded_len, num_heads)
+        score = torch.relu(score)
+        score = score * q_scale.unsqueeze(0)  # broadcast per-head weight
+        score = score.sum(dim=1)  # (padded_len,)
+        score = score * kvcache_scale  # apply KV scale
+
+        # Mask invalid tokens using GPU comparison (no .item())
+        # seq_len is on GPU, arange is on GPU — all GPU ops
+        valid_len = seq_lens[i]  # GPU scalar tensor, no sync
+        valid_mask = token_arange[:padded_len] < valid_len
+        # Truncate to max_seq_len if needed
+        store_len = min(padded_len, max_seq_len)
+        score_valid = score[:store_len]
+        mask_valid = valid_mask[:store_len]
+        logits[i, :store_len] = torch.where(
+            mask_valid, score_valid, torch.zeros_like(score_valid)
+        )
 
     return logits
 
@@ -136,27 +156,27 @@ def topk_transform_512_pytorch_vectorized(
         pad_mask = torch.arange(TOPK, device=device).unsqueeze(0) >= actual_k
         valid_topk = valid_topk & ~pad_mask
 
+    # CUDA graph compatible: compute sequential path unconditionally,
+    # select with torch.where (no .any() GPU->CPU sync)
     needs_sequential = seq_lens <= TOPK
-    if needs_sequential.any():
-        sequential_indices = (
-            torch.arange(TOPK, device=device, dtype=torch.int32)
-            .unsqueeze(0)
-            .expand(batch_size, -1)
-        )
-        sequential_valid = sequential_indices < seq_lens.unsqueeze(1)
-
-        raw_indices = torch.where(
-            needs_sequential.unsqueeze(1).expand(-1, TOPK),
-            torch.where(
-                sequential_valid,
-                sequential_indices,
-                torch.tensor(-1, device=device, dtype=torch.int32),
-            ),
-            raw_indices,
-        )
-        valid_topk = torch.where(
-            needs_sequential.unsqueeze(1).expand(-1, TOPK), sequential_valid, valid_topk
-        )
+    sequential_indices = (
+        torch.arange(TOPK, device=device, dtype=torch.int32)
+        .unsqueeze(0)
+        .expand(batch_size, -1)
+    )
+    sequential_valid = sequential_indices < seq_lens.unsqueeze(1)
+
+    needs_seq_expand = needs_sequential.unsqueeze(1).expand(-1, TOPK)
+    raw_indices = torch.where(
+        needs_seq_expand,
+        torch.where(
+            sequential_valid,
+            sequential_indices,
+            torch.tensor(-1, device=device, dtype=torch.int32),
+        ),
+        raw_indices,
+    )
+    valid_topk = torch.where(needs_seq_expand, sequential_valid, valid_topk)
 
     page_idx = raw_indices >> page_bits
     offset_in_page = raw_indices & page_mask
@@ -379,12 +399,16 @@ def forward_c4_indexer(
         elif envs.SGLANG_FP8_PAGED_MQA_LOGITS_TORCH.get():
             fn = fp8_paged_mqa_logits_torch
         else:
-            if envs.SGLANG_OPT_DG_PAGED_MQA_LOGITS_CHUNK_SIZE.get() != -1:
-                from sglang.srt.layers.deep_gemm_wrapper.paged_mqa_logits import (
-                    fp8_paged_mqa_logits_chunked as fn,
-                )
-            else:
-                from deep_gemm import fp8_paged_mqa_logits as fn
+            try:
+                if envs.SGLANG_OPT_DG_PAGED_MQA_LOGITS_CHUNK_SIZE.get() != -1:
+                    from sglang.srt.layers.deep_gemm_wrapper.paged_mqa_logits import (
+                        fp8_paged_mqa_logits_chunked as fn,
+                    )
+                else:
+                    from deep_gemm import fp8_paged_mqa_logits as fn
+            except (ImportError, RuntimeError, FileNotFoundError):
+                # DeepGEMM not available or SM120 unsupported, use PyTorch fallback
+                fn = fp8_paged_mqa_logits_torch
 
         _c4sl = indexer_metadata.c4_seq_lens
         if _c4sl.dim() == 1:
diff --git a/python/sglang/srt/layers/attention/compressed/metadata.py b/python/sglang/srt/layers/attention/compressed/metadata.py
index 9b866259e462..d42e10ceed57 100644
--- a/python/sglang/srt/layers/attention/compressed/metadata.py
+++ b/python/sglang/srt/layers/attention/compressed/metadata.py
@@ -6,6 +6,7 @@
 import torch
 
 from sglang.srt.environ import envs
+from sglang.srt.layers.deep_gemm_wrapper.configurer import ENABLE_JIT_DEEPGEMM
 from sglang.srt.utils import is_hip
 
 if TYPE_CHECKING:
@@ -122,7 +123,7 @@ class PagedIndexerMetadata(IndexerMetadata):
     topk_metadata: torch.Tensor = field(init=False, repr=False)
 
     def __post_init__(self):
-        if envs.SGLANG_FP8_PAGED_MQA_LOGITS_TORCH.get():
+        if envs.SGLANG_FP8_PAGED_MQA_LOGITS_TORCH.get() or not ENABLE_JIT_DEEPGEMM:
             self.deep_gemm_metadata = None
         else:
             import deep_gemm
diff --git a/python/sglang/srt/layers/attention/debug_flash_mla_adapter.py b/python/sglang/srt/layers/attention/debug_flash_mla_adapter.py
index 10ae2c3ba38e..bf43369fd90a 100644
--- a/python/sglang/srt/layers/attention/debug_flash_mla_adapter.py
+++ b/python/sglang/srt/layers/attention/debug_flash_mla_adapter.py
@@ -1,5 +1,253 @@
+"""Flash MLA adapter with SM120 PyTorch fallback for decode attention.
+
+SM120 (RTX PRO 6000 Blackwell, CC 12.0) does not support flash_mla CUDA kernels
+(sm_90a/sm_100f cubins are incompatible). This module provides a pure PyTorch
+fallback for the sparse decode path.
+
+Optimizations applied:
+  - Zero-copy raw_buf view when k_cache is already contiguous
+  - Batched gather (single-pass for all NoPE/RoPE/scale bytes)
+  - bf16 tensor core matmul for Q@K^T and attn@V
+
+SWA KV cache layout per page (page_size=256 tokens, dtype=float8_e4m3fn/uint8):
+  Two-section layout:
+    Section A: NoPE FP8 (448 bytes) + RoPE BF16 (128 bytes) per token at t*576
+    Section B: UE8M0 scales (7 bytes + 1 pad) per token at page_size*576 + t*8
+"""
+
+import importlib.util
+import os
+
+import torch
+
+from sglang.srt.environ import envs
+from sglang.srt.utils import is_sm120_supported
+
+_use_triton_gather = None
+_use_triton_flashmla = None
+
+
+def _should_use_triton_gather():
+    """Check if Triton fused gather is available for SM120."""
+    global _use_triton_gather
+    if _use_triton_gather is None:
+        _use_triton_gather = (
+            importlib.util.find_spec(
+                "sglang.srt.layers.attention.fused_kv_gather_triton"
+            )
+            is not None
+        )
+    return _use_triton_gather
+
+
+def _should_use_triton_flashmla():
+    """Check if Triton tiled FlashMLA kernel should be used on SM120.
+
+    Auto-enabled on SM120. Set SGLANG_SM120_TRITON_FLASHMLA=0 to disable.
+    """
+    global _use_triton_flashmla
+    if _use_triton_flashmla is None:
+        _use_triton_flashmla = (
+            is_sm120_supported()
+            and envs.SGLANG_SM120_TRITON_FLASHMLA.get()
+        )
+    return _use_triton_flashmla
+
+
+# SWA KV cache format constants
+DIM_NOPE = 448
+DIM_ROPE = 64
+TILE_SIZE = 64
+NUM_TILES = DIM_NOPE // TILE_SIZE  # 7
+SCALE_PAD = 1
+BYTES_NOPE_ROPE = DIM_NOPE + DIM_ROPE * 2  # 576
+BYTES_SCALE = NUM_TILES + SCALE_PAD  # 8
+
+
+def _is_sm120():
+    return is_sm120_supported()
+
+
+def _gather_and_dequant_kv_vectorized(k_cache, indices, topk_length=None):
+    """Memory-efficient gather + dequantize KV tokens from paged cache.
+
+    Processes tokens in chunks to limit peak memory usage.
+    """
+    num_pages = k_cache.shape[0]
+    page_size = k_cache.shape[1]
+    kv_dim = k_cache.shape[3]
+
+    batch = indices.shape[0]
+    topk = indices.shape[-1]
+
+    idx_flat = indices.reshape(batch, -1)
+    total_tokens = num_pages * page_size
+    valid_mask = (idx_flat >= 0) & (idx_flat < total_tokens)
+    idx_safe = idx_flat.clamp(0, total_tokens - 1)
+
+    page_idx = idx_safe // page_size
+    token_in_page = idx_safe % page_size
+
+    raw_buf = k_cache.view(torch.uint8).reshape(-1)
+    buf_len = raw_buf.shape[0]
+    bytes_per_page = page_size * kv_dim
+    page_offsets = page_idx * bytes_per_page
+
+    # Pre-allocate output directly as bf16 to avoid intermediate fp32
+    result = torch.zeros(batch, topk, DIM_NOPE + DIM_ROPE, dtype=torch.bfloat16, device=k_cache.device)
+
+    # Compute all offsets once
+    nope_starts = page_offsets + token_in_page * BYTES_NOPE_ROPE
+    scale_section_offsets = page_offsets + page_size * BYTES_NOPE_ROPE
+    scale_starts = scale_section_offsets + token_in_page * BYTES_SCALE
+
+    # Gather NoPE FP8 + dequant in-place
+    nope_offsets = nope_starts.unsqueeze(-1) + torch.arange(DIM_NOPE, device=raw_buf.device)
+    nope_offsets_clamped = nope_offsets.clamp(0, buf_len - 1)
+    nope_bytes = raw_buf[nope_offsets_clamped.reshape(-1)].reshape(batch, topk, DIM_NOPE)
+
+    # Gather scales
+    scale_offsets = scale_starts.unsqueeze(-1) + torch.arange(NUM_TILES, device=raw_buf.device)
+    scale_offsets_clamped = scale_offsets.clamp(0, buf_len - 1)
+    scale_bytes = raw_buf[scale_offsets_clamped.reshape(-1)].reshape(batch, topk, NUM_TILES)
+
+    # Dequant: directly to bf16 to save memory (avoid large fp32 intermediate)
+    nope_fp8 = nope_bytes.view(torch.float8_e4m3fn)
+    scale_fp32 = torch.pow(2.0, scale_bytes.float() - 127.0)
+    # Dequant in tiled fashion and store directly to output
+    nope_fp32 = nope_fp8.reshape(batch, topk, NUM_TILES, TILE_SIZE).float() * scale_fp32.unsqueeze(-1)
+    result[:, :, :DIM_NOPE] = nope_fp32.reshape(batch, topk, DIM_NOPE).to(torch.bfloat16)
+
+    # Free large intermediates
+    del nope_bytes, nope_fp8, scale_bytes, scale_fp32, nope_fp32
+    del nope_offsets, nope_offsets_clamped, scale_offsets, scale_offsets_clamped
+
+    # Gather RoPE BF16 - much smaller (128 bytes per token)
+    rope_starts = nope_starts + DIM_NOPE
+    rope_byte_dim = DIM_ROPE * 2
+    rope_offsets = rope_starts.unsqueeze(-1) + torch.arange(rope_byte_dim, device=raw_buf.device)
+    rope_offsets_clamped = rope_offsets.clamp(0, buf_len - 1)
+    rope_bytes = raw_buf[rope_offsets_clamped.reshape(-1)].reshape(batch, topk, rope_byte_dim)
+    result[:, :, DIM_NOPE:] = rope_bytes.view(torch.bfloat16)
+
+    # Zero invalid entries
+    result = torch.where(valid_mask.unsqueeze(-1), result, torch.zeros_like(result))
+
+    return result
+
+
+def _flash_mla_with_kvcache_torch(
+    q,
+    k_cache,
+    head_dim_v,
+    tile_scheduler_metadata,
+    softmax_scale=None,
+    is_fp8_kvcache=True,
+    indices=None,
+    topk_length=None,
+    attn_sink=None,
+    extra_k_cache=None,
+    extra_indices_in_kvcache=None,
+    extra_topk_length=None,
+    **kwargs,
+):
+    """Pure PyTorch fallback for flash_mla sparse decode on SM120.
+
+    When SGLANG_SM120_TRITON_FLASHMLA=1, uses the tiled Triton kernel
+    which fuses gather + dequant + QK + softmax + V into a single kernel.
+    Falls back to the vectorized PyTorch path otherwise.
+    """
+    if softmax_scale is None:
+        softmax_scale = q.shape[-1] ** (-0.5)
+
+    batch, seq_q, num_heads_q, head_dim = q.shape
+    device = q.device
+
+    if indices is None:
+        raise NotImplementedError(
+            "Dense decode path not implemented for SM120 fallback"
+        )
+
+    # Fast path: Triton tiled sparse decode (fuses all ops into one kernel)
+    if _should_use_triton_flashmla():
+        from sglang.srt.layers.attention.flash_mla_sm120_triton import (
+            flash_mla_sparse_decode_triton,
+        )
+
+        return flash_mla_sparse_decode_triton(
+            q=q,
+            k_cache=k_cache,
+            indices=indices,
+            topk_length=topk_length,
+            attn_sink=attn_sink,
+            head_dim_v=head_dim_v,
+            softmax_scale=softmax_scale,
+            extra_k_cache=extra_k_cache,
+            extra_indices=extra_indices_in_kvcache,
+            extra_topk_length=extra_topk_length,
+        )
+
+    # Slow path: vectorized PyTorch fallback
+    if _is_sm120() and _should_use_triton_gather():
+        from sglang.srt.layers.attention.fused_kv_gather_triton import (
+            fused_gather_dequant,
+        )
+
+        k_all = fused_gather_dequant(k_cache, indices)
+    else:
+        k_all = _gather_and_dequant_kv_vectorized(k_cache, indices, topk_length)
+
+    if extra_k_cache is not None and extra_indices_in_kvcache is not None:
+        extra_k = _gather_and_dequant_kv_vectorized(
+            extra_k_cache, extra_indices_in_kvcache, extra_topk_length
+        )
+        k_all = torch.cat([k_all, extra_k], dim=1)
+
+    num_kv_tokens = k_all.shape[1]
+
+    v = k_all  # (batch, num_kv, 512)
+
+    q_full = q.squeeze(1)  # (batch, num_heads, 512)
+    scores = torch.einsum("bhd,bkd->bhk", q_full, k_all) * softmax_scale
+    scores_4d = scores.unsqueeze(1)  # (batch, 1, num_heads, num_kv)
+
+    if topk_length is not None:
+        total_kv = num_kv_tokens
+        primary_topk = indices.shape[-1]
+        arange = torch.arange(total_kv, device=device)
+
+        primary_valid = arange[:primary_topk].unsqueeze(0) < topk_length.unsqueeze(1)
+        if extra_k_cache is not None and extra_topk_length is not None:
+            extra_valid = (arange[primary_topk:] - primary_topk).unsqueeze(
+                0
+            ) < extra_topk_length.unsqueeze(1)
+            valid_mask = torch.cat([primary_valid, extra_valid], dim=1)
+        else:
+            valid_mask = primary_valid
+
+        scores_4d = scores_4d.masked_fill(
+            ~valid_mask.unsqueeze(1).unsqueeze(2), float("-inf")
+        )
+
+    attn_weights = torch.softmax(scores_4d.float(), dim=-1)
+    attn_weights = attn_weights.nan_to_num(0.0)
+
+    attn_3d = attn_weights.squeeze(1).reshape(batch, num_heads_q, num_kv_tokens)
+
+    o = torch.einsum("bhk,bkd->bhd", attn_3d, v.to(attn_3d.dtype))
+    o = o.unsqueeze(1)  # (batch, 1, num_heads, 512)
+
+    lse_out = torch.logsumexp(scores_4d.float(), dim=-1).transpose(-1, -2)
+
+    return (o.to(torch.bfloat16), lse_out)
+
+
 def flash_mla_with_kvcache_entrypoint(backend: str, **kwargs):
-    assert backend == "kernel", f"unsupported backend {backend!r}"
-    import flash_mla
+    if backend == "kernel":
+        if _is_sm120():
+            return _flash_mla_with_kvcache_torch(**kwargs)
+        else:
+            import flash_mla
 
-    return flash_mla.flash_mla_with_kvcache(**kwargs)
+            return flash_mla.flash_mla_with_kvcache(**kwargs)
+    raise ValueError(f"unsupported backend {backend!r}")
diff --git a/python/sglang/srt/layers/attention/flash_mla_sm120_triton.py b/python/sglang/srt/layers/attention/flash_mla_sm120_triton.py
new file mode 100644
index 000000000000..d430f72913b3
--- /dev/null
+++ b/python/sglang/srt/layers/attention/flash_mla_sm120_triton.py
@@ -0,0 +1,337 @@
+"""SM120-optimized Triton FlashMLA sparse decode kernel — Tiled V2.
+
+Replaces V1's serial token loop with a tiled vectorized approach:
+  1. BLOCK_T tokens loaded simultaneously via 2D gather (vs 1-at-a-time)
+  2. All BLOCK_T QK scores computed at once via vectorized mul-reduce
+  3. V accumulation via vectorized weighted sum across BLOCK_T tokens
+  4. Online softmax operates on tile-level maxima (fewer rescales)
+
+Three typed views of the same paged buffer handle FP8/uint8/BF16 regions:
+- float8_e4m3fn view -> nope FP8 values (direct load + dequant)
+- uint8 view -> UE8M0 scale bytes (raw integer -> exp2 conversion)
+- bfloat16 view -> rope BF16 values (direct load)
+
+DSv4 page layout (per token, 576 bytes data + 8 bytes scales):
+  Data section: [0:448] FP8 nope | [448:576] BF16 rope (64 values = 128 bytes)
+  Scale section: [page_size*576 + offset*8 : +7] UE8M0 scales (7 groups of 64)
+
+Target: RTX PRO 6000 (SM120, 188 SMs, 99KB SMEM, ~1.5 TB/s GDDR7, 96MB L2)
+
+Ported from PR #24047 (AliceChenyy's SM120 support).
+"""
+
+import logging
+import os
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+logger = logging.getLogger(__name__)
+
+LOG2E = tl.constexpr(1.4426950408889634)
+
+# DSv4 KV cache layout constants
+_NOPE_DIM = 448
+_ROPE_DIM = 64
+_D = _NOPE_DIM + _ROPE_DIM  # 512
+_TOKEN_DATA_STRIDE = 576  # bytes per token in data section
+_SCALE_STRIDE = 8  # bytes per token in scale section
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_T": 16}, num_warps=4, num_stages=2),
+        triton.Config({"BLOCK_T": 16}, num_warps=8, num_stages=2),
+        triton.Config({"BLOCK_T": 32}, num_warps=8, num_stages=2),
+    ],
+    key=["topk_rounded"],
+)
+@triton.jit
+def _tiled_sparse_decode_kernel(
+    # Q: [B, H, D] bf16
+    Q_ptr,
+    # Paged KV cache — three typed views of same underlying memory
+    cache_fp8_ptr,    # float8_e4m3fn flat (1 byte/elem) — for nope
+    cache_uint8_ptr,  # uint8 flat (1 byte/elem) — for scales
+    cache_bf16_ptr,   # bfloat16 flat (2 bytes/elem) — for rope
+    # Indices: [B, topk] int32
+    indices_ptr,
+    # Valid lengths: [B] int32
+    topk_len_ptr,
+    # Output: [B, H, D] bf16 and LSE: [B, H] float32
+    O_ptr,
+    LSE_ptr,
+    # Scalars
+    sm_scale: tl.float32,
+    page_size: tl.int32,
+    page_bytes: tl.int64,
+    scale_section_off: tl.int64,  # page_size * 576
+    H: tl.int32,
+    topk: tl.int32,
+    topk_rounded: tl.int32,      # for autotune key
+    has_topk_len: tl.constexpr,
+    # Strides
+    stride_qb: tl.int32,
+    stride_qh: tl.int32,
+    stride_ob: tl.int32,
+    stride_oh: tl.int32,
+    stride_ib: tl.int32,  # indices batch stride
+    # Constexprs
+    NOPE_PAD: tl.constexpr,   # 512 (padded from 448)
+    ROPE_DIM: tl.constexpr,   # 64
+    NOPE_DIM_RT: tl.int32,    # 448 (runtime, for masking)
+    BLOCK_T: tl.constexpr,    # tokens per tile (16 or 32)
+):
+    """Tiled sparse decode: vectorized gather + QK + softmax + V accumulation.
+
+    Grid: (B, H) — one block per (batch, head) pair.
+    Each block processes all topk tokens in tiles of BLOCK_T.
+    """
+    bid = tl.program_id(0)
+    hid = tl.program_id(1)
+
+    # ---- Load Q for this (batch, head) ----
+    q_base = bid * stride_qb + hid * stride_qh
+    nope_offs = tl.arange(0, NOPE_PAD)    # [512]
+    nope_mask = nope_offs < NOPE_DIM_RT    # [512], True for [0:448]
+    rope_offs = tl.arange(0, ROPE_DIM)    # [64]
+
+    q_nope = tl.load(Q_ptr + q_base + nope_offs, mask=nope_mask, other=0.0)
+    q_nope = q_nope.to(tl.float32) * sm_scale
+    q_rope = tl.load(Q_ptr + q_base + NOPE_DIM_RT + rope_offs)
+    q_rope = q_rope.to(tl.float32) * sm_scale
+
+    # ---- Valid token count ----
+    valid_topk = topk
+    if has_topk_len:
+        valid_topk = tl.load(topk_len_ptr + bid).to(tl.int32)
+        valid_topk = tl.minimum(valid_topk, topk)
+
+    # ---- Online softmax state (base-2 math for SM120 efficiency) ----
+    m_i: tl.float32 = -1e30
+    l_i: tl.float32 = 0.0
+    acc_nope = tl.zeros([NOPE_PAD], dtype=tl.float32)
+    acc_rope = tl.zeros([ROPE_DIM], dtype=tl.float32)
+
+    # ---- Precompute constant index vectors ----
+    group_ids = (nope_offs // 64).to(tl.int64)   # [NOPE_PAD], scale group for each dim
+    t_offs = tl.arange(0, BLOCK_T)               # [BLOCK_T], token offsets within tile
+
+    # ---- Process tokens in tiles of BLOCK_T ----
+    for tile_start in range(0, topk, BLOCK_T):
+        t_idx = tile_start + t_offs                    # [BLOCK_T], global token indices
+        t_in_bounds = t_idx < topk                     # bounds for index load
+        t_valid = t_idx < valid_topk                   # bounds for actual processing
+
+        # Load indices for this tile: [BLOCK_T]
+        raw_indices = tl.load(
+            indices_ptr + bid * stride_ib + t_idx,
+            mask=t_in_bounds, other=-1,
+        )
+        idx_valid = t_valid & (raw_indices >= 0)       # [BLOCK_T] mask
+
+        # Page addressing: [BLOCK_T] (clamp for safe addressing of invalid tokens)
+        safe_indices = tl.where(idx_valid, raw_indices, tl.zeros_like(raw_indices))
+        page_ids = (safe_indices // page_size).to(tl.int64)
+        page_offs_t = (safe_indices % page_size).to(tl.int64)
+        token_data_bases = page_ids * page_bytes + page_offs_t * 576  # [BLOCK_T] int64
+
+        # ---- Vectorized NOPE FP8 gather: [BLOCK_T, NOPE_PAD] ----
+        nope_addrs = token_data_bases[:, None] + nope_offs[None, :].to(tl.int64)
+        nope_2d_mask = idx_valid[:, None] & nope_mask[None, :]
+        kv_nope_fp8 = tl.load(
+            cache_fp8_ptr + nope_addrs,
+            mask=nope_2d_mask, other=0.0,
+        )
+
+        # ---- Vectorized scale gather + dequant: [BLOCK_T, NOPE_PAD] ----
+        scale_bases = page_ids * page_bytes + scale_section_off + page_offs_t * 8
+        scale_addrs = scale_bases[:, None] + group_ids[None, :]
+        scale_raw = tl.load(
+            cache_uint8_ptr + scale_addrs,
+            mask=nope_2d_mask, other=127,
+        )
+        scale_f32 = tl.math.exp2(scale_raw.to(tl.float32) - 127.0)
+        kv_nope = tl.where(nope_2d_mask, kv_nope_fp8.to(tl.float32) * scale_f32, 0.0)
+
+        # ---- Vectorized ROPE BF16 gather: [BLOCK_T, ROPE_DIM] ----
+        rope_byte_bases = token_data_bases + 448
+        rope_elem_bases = (rope_byte_bases // 2).to(tl.int64)
+        rope_addrs = rope_elem_bases[:, None] + rope_offs[None, :].to(tl.int64)
+        kv_rope = tl.load(
+            cache_bf16_ptr + rope_addrs,
+            mask=idx_valid[:, None], other=0.0,
+        ).to(tl.float32)
+
+        # ---- Vectorized QK scores: [BLOCK_T] ----
+        # scores[t] = dot(q_nope, kv_nope[t]) + dot(q_rope, kv_rope[t])
+        scores = (
+            tl.sum(q_nope[None, :] * kv_nope, axis=1)
+            + tl.sum(q_rope[None, :] * kv_rope, axis=1)
+        )
+        scores = tl.where(idx_valid, scores, -1e30)
+
+        # ---- Online softmax update (base-2, tile-level) ----
+        scores_log2 = scores * LOG2E                   # [BLOCK_T]
+        tile_max = tl.max(scores_log2)                 # scalar
+        m_new = tl.maximum(m_i, tile_max)
+
+        alpha = tl.math.exp2(m_i - m_new)              # rescale factor
+        p = tl.math.exp2(scores_log2 - m_new)          # [BLOCK_T] attention weights
+        p = tl.where(idx_valid, p, 0.0)                # zero out invalid
+
+        l_i = l_i * alpha + tl.sum(p)
+
+        # ---- Vectorized V accumulation (K=V in MLA) ----
+        # acc += sum_t(p[t] * kv[t, :]) for both nope and rope
+        acc_nope = acc_nope * alpha + tl.sum(p[:, None] * kv_nope, axis=0)
+        acc_rope = acc_rope * alpha + tl.sum(p[:, None] * kv_rope, axis=0)
+        m_i = m_new
+
+    # ---- Normalize output ----
+    safe_l = tl.where(l_i > 0.0, l_i, 1.0)
+    acc_nope = acc_nope / safe_l
+    acc_rope = acc_rope / safe_l
+
+    # LSE: convert from log2 back to natural log
+    lse = tl.where(l_i > 0.0, m_i / LOG2E + tl.math.log(safe_l), float("-inf"))
+
+    # ---- Store output ----
+    o_base = bid * stride_ob + hid * stride_oh
+    tl.store(O_ptr + o_base + nope_offs, acc_nope.to(tl.bfloat16), mask=nope_mask)
+    tl.store(O_ptr + o_base + NOPE_DIM_RT + rope_offs, acc_rope.to(tl.bfloat16))
+    tl.store(LSE_ptr + bid * H + hid, lse)
+
+
+def _run_triton_sparse_decode(
+    q: torch.Tensor,           # [B, 1, H, D] bf16
+    k_cache: torch.Tensor,     # [num_pages, page_size, 1, bpt] float8
+    indices: torch.Tensor,     # [B, ...] int32
+    topk_length: Optional[torch.Tensor],
+    softmax_scale: float,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Run the tiled Triton sparse decode kernel on one paged KV cache."""
+    B, _, H, D = q.shape
+    num_pages = k_cache.shape[0]
+    page_size = k_cache.shape[1]
+    page_bytes = k_cache.stride(0)  # elements = bytes for float8
+
+    # Flatten indices to [B, topk]
+    flat_indices = indices.reshape(B, -1).contiguous()
+    topk = flat_indices.shape[1]
+
+    # Create three typed views of the flat cache memory
+    total_elems = num_pages * page_bytes
+    raw_fp8 = k_cache.as_strided((total_elems,), (1,))
+    raw_uint8 = raw_fp8.view(torch.uint8)
+    raw_bf16 = raw_uint8.view(torch.bfloat16)
+
+    # Squeeze Q: [B, H, D]
+    q3 = q.squeeze(1)
+    if not q3.is_contiguous():
+        q3 = q3.contiguous()
+
+    out = torch.zeros(B, H, D, dtype=torch.bfloat16, device=q.device)
+    lse = torch.full((B, H), float("-inf"), dtype=torch.float32, device=q.device)
+
+    # Round topk for autotune key stability
+    topk_rounded = triton.next_power_of_2(topk)
+
+    grid = (B, H)
+    _tiled_sparse_decode_kernel[grid](
+        q3,
+        raw_fp8, raw_uint8, raw_bf16,
+        flat_indices,
+        topk_length if topk_length is not None else torch.empty(
+            0, device=q.device, dtype=torch.int32
+        ),
+        out, lse,
+        softmax_scale,
+        page_size,
+        int(page_bytes),                      # page_bytes (int64)
+        int(page_size * _TOKEN_DATA_STRIDE),  # scale_section_off (int64)
+        H, topk, topk_rounded,
+        topk_length is not None,
+        q3.stride(0), q3.stride(1),
+        out.stride(0), out.stride(1),
+        flat_indices.stride(0),
+        NOPE_PAD=512,
+        ROPE_DIM=_ROPE_DIM,
+        NOPE_DIM_RT=_NOPE_DIM,
+    )
+
+    # Return [B, 1, H, D] and [B, 1, H]
+    return out.unsqueeze(1), lse.unsqueeze(1)
+
+
+def _merge_partial_attn(
+    out1: torch.Tensor, lse1: torch.Tensor,
+    out2: torch.Tensor, lse2: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Merge two attention outputs using LSE-weighted combination."""
+    max_lse = torch.maximum(lse1, lse2)
+    w1 = torch.where(lse1 > -1e20, torch.exp(lse1 - max_lse), torch.zeros_like(lse1))
+    w2 = torch.where(lse2 > -1e20, torch.exp(lse2 - max_lse), torch.zeros_like(lse2))
+    total = (w1 + w2).clamp(min=1e-20)
+    merged = (
+        w1.unsqueeze(-1) * out1.float() + w2.unsqueeze(-1) * out2.float()
+    ) / total.unsqueeze(-1)
+    merged_lse = max_lse + torch.log(total)
+    return merged.to(torch.bfloat16), merged_lse
+
+
+def _apply_attn_sink(
+    out: torch.Tensor, lse: torch.Tensor,
+    attn_sink: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Apply attention sink normalization."""
+    sink_lse = attn_sink.view(1, 1, -1).expand_as(lse)
+    combined_lse = torch.logaddexp(lse, sink_lse)
+    w = torch.where(
+        lse > -1e20,
+        torch.exp(lse - combined_lse),
+        torch.zeros_like(lse),
+    )
+    return (out.float() * w.unsqueeze(-1)).to(torch.bfloat16), combined_lse
+
+
+def flash_mla_sparse_decode_triton(
+    q: torch.Tensor,
+    k_cache: torch.Tensor,
+    indices: torch.Tensor,
+    topk_length: Optional[torch.Tensor],
+    attn_sink: Optional[torch.Tensor],
+    head_dim_v: int,
+    softmax_scale: float,
+    extra_k_cache: Optional[torch.Tensor] = None,
+    extra_indices: Optional[torch.Tensor] = None,
+    extra_topk_length: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """SM120-optimized sparse MLA decode using tiled Triton kernel.
+
+    Processes SWA and extra (c4/c128) caches separately via the same
+    Triton kernel, then merges results using LSE-weighted combination.
+    """
+    if softmax_scale is None:
+        softmax_scale = q.shape[-1] ** (-0.5)
+
+    # Process main cache (SWA)
+    out, lse = _run_triton_sparse_decode(
+        q, k_cache, indices, topk_length, softmax_scale,
+    )
+
+    # Process extra cache (c4 / c128) if present
+    if extra_k_cache is not None and extra_indices is not None:
+        out_extra, lse_extra = _run_triton_sparse_decode(
+            q, extra_k_cache, extra_indices, extra_topk_length, softmax_scale,
+        )
+        out, lse = _merge_partial_attn(out, lse, out_extra, lse_extra)
+
+    # Apply attention sink
+    if attn_sink is not None:
+        out, lse = _apply_attn_sink(out, lse, attn_sink)
+
+    # Return format matching PyTorch fallback: (out, lse.permute(0,2,1))
+    return out, lse.permute(0, 2, 1)
diff --git a/python/sglang/srt/layers/attention/fused_kv_gather_triton.py b/python/sglang/srt/layers/attention/fused_kv_gather_triton.py
new file mode 100644
index 000000000000..04e00f9df44e
--- /dev/null
+++ b/python/sglang/srt/layers/attention/fused_kv_gather_triton.py
@@ -0,0 +1,144 @@
+"""Fused KV cache gather + dequantize Triton kernel for SM120 MLA decode.
+
+Single kernel launch replaces ~16 PyTorch ops for the NoPE gather+dequant phase.
+Each program handles one (batch, token) pair.
+"""
+
+import torch
+import triton
+import triton.language as tl
+
+# Layout constants
+DIM_NOPE = 448
+DIM_ROPE = 64
+TILE_SIZE = 64
+NUM_TILES = DIM_NOPE // TILE_SIZE  # 7
+BYTES_NOPE_ROPE = DIM_NOPE + DIM_ROPE * 2  # 576
+BYTES_SCALE = NUM_TILES + 1  # 8
+HEAD_DIM = DIM_NOPE + DIM_ROPE  # 512
+
+
+@triton.jit
+def _fused_nope_dequant_kernel(
+    raw_fp8_ptr,  # float8_e4m3fn view (same memory as raw_buf)
+    raw_u8_ptr,  # uint8 view for scale bytes
+    indices_ptr,  # (batch, topk) flat token indices
+    output_ptr,  # (batch, topk, 448) bf16 output (NoPE only)
+    page_size: tl.constexpr,
+    kv_dim: tl.constexpr,
+    max_topk: tl.constexpr,
+    DIM_NOPE: tl.constexpr,
+    NUM_TILES: tl.constexpr,
+    TILE_SZ: tl.constexpr,
+    BYTES_NR: tl.constexpr,
+    BYTES_SC: tl.constexpr,
+):
+    bid = tl.program_id(0)
+    tid = tl.program_id(1)
+
+    if tid >= max_topk:
+        return
+
+    flat_idx = tl.load(indices_ptr + bid * max_topk + tid)
+
+    page_idx = flat_idx // page_size
+    tok_in_page = flat_idx % page_size
+
+    bytes_per_page = page_size * kv_dim
+    page_offset = page_idx * bytes_per_page
+    nope_base = page_offset + tok_in_page * BYTES_NR
+    scale_section_base = page_offset + page_size * BYTES_NR
+    scale_base = scale_section_base + tok_in_page * BYTES_SC
+
+    out_base = bid * max_topk * DIM_NOPE + tid * DIM_NOPE
+
+    # Dequantize NoPE tile-by-tile
+    for tile in range(NUM_TILES):
+        ts = tile * TILE_SZ
+
+        # Load FP8 values (1 byte each, same layout as uint8)
+        fp8_offs = nope_base + ts + tl.arange(0, TILE_SZ)
+        fp8_raw = tl.load(raw_fp8_ptr + fp8_offs)
+        # Triton loads as fp8 element type, .to(float32) does hardware conversion
+        fp8_vals = fp8_raw.to(tl.float32)
+
+        # Load UE8M0 scale byte
+        sc_off = scale_base + tile
+        sc_byte = tl.load(raw_u8_ptr + sc_off)
+        scale_val = tl.exp2(sc_byte.to(tl.float32) - 127.0)
+
+        # Dequantize and store
+        dequant = (fp8_vals * scale_val).to(tl.bfloat16)
+        out_offs = out_base + ts + tl.arange(0, TILE_SZ)
+        tl.store(output_ptr + out_offs, dequant)
+
+
+def fused_gather_dequant(k_cache, indices):
+    """Fused KV gather+dequant using Triton + minimal PyTorch.
+
+    Returns (batch, topk, 512) bf16 tensor.
+    """
+    num_pages = k_cache.shape[0]
+    page_size = k_cache.shape[1]
+    kv_dim = k_cache.shape[3]
+
+    batch = indices.shape[0]
+    max_topk = indices.shape[-1]
+    device = k_cache.device
+
+    # Assume contiguous for CUDA graph compatibility (always true in decode)
+    raw_buf_u8 = k_cache.view(torch.uint8).reshape(-1)
+    raw_buf_fp8 = k_cache.reshape(-1)  # float8 view (same memory)
+
+    total_tokens = num_pages * page_size
+    idx_safe = indices.reshape(batch, -1).clamp(0, total_tokens - 1)
+
+    # Allocate NoPE-only output for Triton kernel (contiguous)
+    nope_out = torch.zeros(
+        batch, max_topk, DIM_NOPE, dtype=torch.bfloat16, device=device
+    )
+
+    # Phase 1: Triton fused NoPE dequant (replaces ~10 PyTorch ops with 1 kernel)
+    grid = (batch, max_topk)
+    _fused_nope_dequant_kernel[grid](
+        raw_buf_fp8,
+        raw_buf_u8,
+        idx_safe,
+        nope_out,
+        page_size=page_size,
+        kv_dim=kv_dim,
+        max_topk=max_topk,
+        DIM_NOPE=DIM_NOPE,
+        NUM_TILES=NUM_TILES,
+        TILE_SZ=TILE_SIZE,
+        BYTES_NR=BYTES_NOPE_ROPE,
+        BYTES_SC=BYTES_SCALE,
+    )
+
+    # Phase 2: RoPE gather via PyTorch (small: 128 bytes per token, 1 gather)
+    bytes_per_page = page_size * kv_dim
+    page_idx = idx_safe // page_size
+    tok_in_page = idx_safe % page_size
+    page_offsets = page_idx * bytes_per_page
+    nope_starts = page_offsets + tok_in_page * BYTES_NOPE_ROPE
+
+    rope_byte_dim = DIM_ROPE * 2
+    rope_offs = (nope_starts + DIM_NOPE).unsqueeze(-1) + torch.arange(
+        rope_byte_dim, device=device
+    )
+    rope_offs_c = rope_offs.clamp(0, raw_buf_u8.shape[0] - 1)
+    rope_bytes = raw_buf_u8[rope_offs_c.reshape(-1)].reshape(
+        batch, max_topk, rope_byte_dim
+    )
+    rope_bf16 = rope_bytes.view(torch.bfloat16)
+
+    # Concatenate
+    result = torch.cat([nope_out, rope_bf16], dim=-1)
+
+    # Zero invalid entries using torch.where (CUDA graph compatible — no .all() sync)
+    valid_mask = (indices.reshape(batch, -1) >= 0) & (
+        indices.reshape(batch, -1) < total_tokens
+    )
+    result = torch.where(valid_mask.unsqueeze(-1), result, torch.zeros_like(result))
+
+    return result
diff --git a/python/sglang/srt/layers/attention/nsa/nsa_indexer.py b/python/sglang/srt/layers/attention/nsa/nsa_indexer.py
index 51ba1bcab7fc..e3db2017af91 100644
--- a/python/sglang/srt/layers/attention/nsa/nsa_indexer.py
+++ b/python/sglang/srt/layers/attention/nsa/nsa_indexer.py
@@ -17,20 +17,31 @@
     is_cuda,
     is_hip,
     is_npu,
+    is_sm120_supported,
 )
 
 global _use_multi_stream
 _is_cuda = is_cuda()
 _is_hip = is_hip()
 _is_sm103 = _is_cuda and get_device_sm() == 103
+_is_sm120 = _is_cuda and is_sm120_supported()
 _is_npu = is_npu()
 _is_fp8_fnuz = is_fp8_fnuz()
 if _is_cuda:
     try:
+        if _is_sm120:
+            raise ImportError("DeepGEMM unsupported on SM120")
         import deep_gemm
     except ImportError as e:
         deep_gemm = e
 
+if _is_sm120:
+    from sglang.srt.layers.attention.nsa.sm120_mqa_fallback import (
+        compute_paged_mqa_schedule_metadata as _sm120_compute_paged_mqa_schedule_metadata,
+        sm120_fp8_paged_mqa_logits as _sm120_fp8_paged_mqa_logits,
+        sm120_fp8_mqa_logits as _sm120_fp8_mqa_logits,
+    )
+
 if _is_npu:
     import custom_ops  # noqa: F401
     import torch_npu
@@ -173,7 +184,13 @@ def __init__(
         else:
             self.cp_size = None
             self.cp_rank = None
-        if _is_cuda:
+        if _is_sm120:
+            props = torch.cuda.get_device_properties(torch.cuda.current_device())
+            self.sm_count = props.multi_processor_count
+            self.half_device_sm_count = ceil_align(self.sm_count // 2, 8)
+            pp_size = get_global_server_args().pp_size
+            self.logits_with_pp_recv = pp_size > 1 and not get_pp_group().is_last_rank
+        elif _is_cuda and not isinstance(deep_gemm, ImportError):
             self.sm_count = deep_gemm.get_num_sms()
             self.half_device_sm_count = ceil_align(self.sm_count // 2, 8)
             pp_size = get_global_server_args().pp_size
@@ -223,7 +240,7 @@ def _with_real_sm_count(self):
         # request to receive the PP proxy tensor or output from the previous stage, occupying one SM resource.
         # Model execution runs in parallel with the recv operation, so the SMs available to the indexer must be reduced
         # by 1. Currently, the last rank starts the send result + recv request only after waiting for execution results.
-        if self.logits_with_pp_recv:
+        if self.logits_with_pp_recv and not _is_sm120:
             pp_recv_sm_count = 1
             with deep_gemm_wrapper.configure_deep_gemm_num_sms(
                 self.sm_count - pp_recv_sm_count
@@ -380,7 +397,14 @@ def _get_topk_paged(
         # Reuse pre-computed schedule metadata if available (from init_forward_metadata),
         # otherwise fall back to computing it here.
         schedule_metadata = getattr(metadata, "paged_mqa_schedule_metadata", None)
-        if _is_cuda:
+        if _is_sm120:
+            if schedule_metadata is None:
+                schedule_metadata = _sm120_compute_paged_mqa_schedule_metadata(
+                    seqlens_32.unsqueeze(-1) if seqlens_32.dim() == 1 else seqlens_32,
+                    blocksize,
+                    self.sm_count,
+                )
+        elif _is_cuda:
             if schedule_metadata is None:
                 schedule_metadata = deep_gemm.get_paged_mqa_logits_metadata(
                     seqlens_32.unsqueeze(-1) if seqlens_32.dim() == 1 else seqlens_32,
@@ -430,7 +454,19 @@ def _get_topk_paged(
                 WavePerEU=5,
             )
         else:
-            logits = deep_gemm.fp8_paged_mqa_logits(
+            if _is_sm120:
+                logits = _sm120_fp8_paged_mqa_logits(
+                    q_fp8,
+                    kv_cache_fp8,
+                    weights,
+                    seqlens_32.unsqueeze(-1) if seqlens_32.dim() == 1 else seqlens_32,
+                    block_tables,
+                    schedule_metadata,
+                    max_seq_len,
+                    clean_logits=False,
+                )
+            else:
+                logits = deep_gemm.fp8_paged_mqa_logits(
                 q_fp8,
                 kv_cache_fp8,
                 weights,
@@ -545,6 +581,15 @@ def _get_topk_ragged(
                     logits = fp8_mqa_logits(
                         q_fp8[:q_offset], kv, scale, weights[:q_offset], ks, ke
                     )
+                elif _is_sm120:
+                    logits = _sm120_fp8_mqa_logits(
+                        q_fp8[:q_offset],
+                        kv_fp8,
+                        weights[:q_offset],
+                        ks,
+                        ke,
+                        clean_logits=False,
+                    )
                 else:
                     logits = deep_gemm.fp8_mqa_logits(
                         q_fp8[:q_offset],
@@ -595,6 +640,15 @@ def _get_topk_ragged(
                         ks[start:end],
                         ke[start:end],
                     )
+                elif _is_sm120:
+                    logits_chunk = _sm120_fp8_mqa_logits(
+                        q_fp8[start:end],
+                        kv_fp8,
+                        weights[start:end],
+                        ks[start:end],
+                        ke[start:end],
+                        clean_logits=False,
+                    )
                 else:
                     logits_chunk = deep_gemm.fp8_mqa_logits(
                         q_fp8[start:end],
@@ -760,14 +814,24 @@ def _get_topk_ragged_with_cp(
             ke = ks + ke_offset
             actual_seq_q = torch.cat(actual_seq_q_list, dim=0)
             with self._with_real_sm_count():
-                logits = deep_gemm.fp8_mqa_logits(
-                    q_fp8,
-                    kv_fp8,
-                    weights,
-                    ks,
-                    ke,
-                    clean_logits=False,
-                )
+                if _is_sm120:
+                    logits = _sm120_fp8_mqa_logits(
+                        q_fp8,
+                        kv_fp8,
+                        weights,
+                        ks,
+                        ke,
+                        clean_logits=False,
+                    )
+                else:
+                    logits = deep_gemm.fp8_mqa_logits(
+                        q_fp8,
+                        kv_fp8,
+                        weights,
+                        ks,
+                        ke,
+                        clean_logits=False,
+                    )
             topk_result = metadata.topk_transform(
                 logits,
                 self.index_topk,
@@ -806,14 +870,24 @@ def _get_topk_ragged_with_cp(
             ke = ks + ke_offset
 
             with self._with_real_sm_count():
-                logits = deep_gemm.fp8_mqa_logits(
-                    q_fp8,
-                    kv_fp8,
-                    weights,
-                    ks,
-                    ke,
-                    clean_logits=False,
-                )
+                if _is_sm120:
+                    logits = _sm120_fp8_mqa_logits(
+                        q_fp8,
+                        kv_fp8,
+                        weights,
+                        ks,
+                        ke,
+                        clean_logits=False,
+                    )
+                else:
+                    logits = deep_gemm.fp8_mqa_logits(
+                        q_fp8,
+                        kv_fp8,
+                        weights,
+                        ks,
+                        ke,
+                        clean_logits=False,
+                    )
             actual_seq_q = torch.tensor([actual_seq_q], dtype=torch.int32).to(
                 device="cuda", non_blocking=True
             )
diff --git a/python/sglang/srt/layers/attention/nsa/sm120_mqa_fallback.py b/python/sglang/srt/layers/attention/nsa/sm120_mqa_fallback.py
new file mode 100644
index 000000000000..8ed9c51eb901
--- /dev/null
+++ b/python/sglang/srt/layers/attention/nsa/sm120_mqa_fallback.py
@@ -0,0 +1,145 @@
+"""SM120 fallback kernels for DeepGEMM FP8 MQA logits operations.
+
+On SM120 (RTX 5090, RTX PRO 6000, DGX Spark), DeepGEMM's fp8_paged_mqa_logits
+and fp8_mqa_logits crash with 'Unsupported architecture'. This module provides
+PyTorch-native fallback implementations with wq precompute optimization.
+
+Key optimization: logit[s] = sum_h(w[h] * dot(q[h], kv[s]))
+                        = dot(sum_h(w[h] * q[h]), kv[s])
+                        = dot(wq, kv[s])
+This reduces per-position work from O(n_heads) to O(1).
+
+Reference: SGLang PR #24047 (AliceChenyy's SM120 support)
+"""
+from __future__ import annotations
+
+import logging
+from typing import Optional, Tuple
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def compute_paged_mqa_schedule_metadata(
+    seqlens: torch.Tensor,
+    block_size: int,
+    num_sms: int,
+) -> None:
+    return None
+
+
+def _dequant_fp8_with_scale_suffix(
+    data_fp8: torch.Tensor, head_dim_qk: int,
+) -> torch.Tensor:
+    data_bytes = data_fp8[..., :head_dim_qk]
+    scale_bytes = data_fp8[..., head_dim_qk:]
+    scale = scale_bytes.contiguous().view(torch.float32)
+    return data_bytes.to(torch.float32) * scale
+
+
+def sm120_fp8_paged_mqa_logits(
+    q_fp8: torch.Tensor,
+    kv_cache_fp8: torch.Tensor,
+    weights: torch.Tensor,
+    seqlens: torch.Tensor,
+    block_tables: torch.Tensor,
+    schedule_metadata,
+    max_seq_len: int,
+    clean_logits: bool = False,
+) -> torch.Tensor:
+    batch, next_n, n_heads, hd_with_sf = q_fp8.shape
+    hd = hd_with_sf - 4
+    block_kv = kv_cache_fp8.shape[1]
+    device = q_fp8.device
+
+    seqlens_flat = seqlens.view(-1).to(torch.int64)
+
+    # Dequant Q: [batch, next_n, n_heads, hd]
+    q_f32 = _dequant_fp8_with_scale_suffix(q_fp8, hd)
+
+    # Precompute wq = sum_h(w[b,h] * q[b,t,h,:]) -> [batch, next_n, hd]
+    w = weights.view(batch, 1, n_heads, 1)
+    wq = (q_f32 * w).sum(dim=2)
+
+    # Batch-dequant all KV blocks: [num_blocks_total, block_kv, hd]
+    kv_data = kv_cache_fp8[..., :hd].squeeze(2)
+    kv_scale_raw = kv_cache_fp8[..., hd:].squeeze(2)
+    kv_scale = kv_scale_raw.contiguous().view(torch.float32)
+    kv_f32 = kv_data.float() * kv_scale
+
+    # Vectorized batch gather (no per-batch loop, no .item())
+    max_blocks = (max_seq_len + block_kv - 1) // block_kv
+    block_ids = block_tables[:, :max_blocks]
+    kv_batched = kv_f32[block_ids]
+    max_padded = max_blocks * block_kv
+    kv_flat = kv_batched.reshape(batch, max_padded, hd)
+
+    # Batched matmul: [batch, next_n, hd] @ [batch, hd, max_padded]
+    logits_batched = torch.bmm(wq, kv_flat.transpose(1, 2))
+
+    # Validity mask
+    positions = torch.arange(max_padded, device=device)
+    valid = positions.unsqueeze(0) < seqlens_flat.unsqueeze(1)
+    logits_batched = logits_batched.masked_fill(~valid.unsqueeze(1), float("-inf"))
+
+    # Write to output: [batch * next_n, max_seq_len]
+    out_width = min(max_padded, max_seq_len)
+    out = torch.full(
+        (batch * next_n, max_seq_len),
+        float("-inf"),
+        device=device,
+        dtype=torch.float32,
+    )
+    out[:, :out_width] = logits_batched[:, :, :out_width].reshape(
+        batch * next_n, out_width
+    )
+    return out
+
+
+def sm120_fp8_mqa_logits(
+    q_fp8: torch.Tensor,
+    kv_fp8: Tuple[torch.Tensor, torch.Tensor],
+    weights: torch.Tensor,
+    ks: torch.Tensor,
+    ke: torch.Tensor,
+    clean_logits: bool = False,
+) -> torch.Tensor:
+    num_q, n_heads, hd_with_sf = q_fp8.shape
+    hd = hd_with_sf - 4
+    device = q_fp8.device
+
+    k_data, k_scale = kv_fp8
+    num_k = k_data.shape[0]
+
+    out_width = num_k
+    out = torch.full(
+        (num_q, out_width), float("-inf"), device=device, dtype=torch.float32,
+    )
+    if num_q == 0 or num_k == 0:
+        return out
+
+    # Dequant Q and precompute weighted query
+    q_f32 = _dequant_fp8_with_scale_suffix(q_fp8, hd)
+    w = weights.unsqueeze(-1)
+    wq = (q_f32 * w).sum(dim=1)  # [num_q, hd]
+
+    # Dequant KV
+    if k_data.shape[-1] == hd_with_sf:
+        k_f32 = _dequant_fp8_with_scale_suffix(k_data.unsqueeze(-2), hd).squeeze(-2)
+    else:
+        k_f32 = k_data.float()
+        if k_scale.dim() == 1:
+            k_f32 = k_f32 * k_scale.unsqueeze(-1)
+        else:
+            k_f32 = k_f32 * k_scale
+
+    # Single matmul: [num_q, hd] @ [hd, num_k] -> [num_q, num_k]
+    logits_all = torch.mm(wq, k_f32.t())
+
+    # Mask to [ks, ke) ranges
+    k_indices = torch.arange(out_width, device=device).unsqueeze(0)
+    mask = (k_indices >= ks.unsqueeze(1)) & (k_indices < ke.unsqueeze(1))
+    out[:, :num_k] = torch.where(mask[:, :num_k], logits_all, out[:, :num_k])
+
+    return out
diff --git a/python/sglang/srt/layers/attention/nsa/tilelang_kernel.py b/python/sglang/srt/layers/attention/nsa/tilelang_kernel.py
index 8af101aa334a..21211bda9fd4 100644
--- a/python/sglang/srt/layers/attention/nsa/tilelang_kernel.py
+++ b/python/sglang/srt/layers/attention/nsa/tilelang_kernel.py
@@ -1,14 +1,17 @@
 import functools
+import logging
 from typing import Any, Optional, Tuple
 
 import tilelang
 import tilelang.language as T
 import torch
 
-from sglang.srt.utils import is_hip
+from sglang.srt.utils import is_hip, is_sm120_supported
 
 tilelang.set_log_level("WARNING")
 
+logger = logging.getLogger(__name__)
+
 pass_configs = {
     tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
     tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
@@ -787,8 +790,13 @@ def tilelang_sparse_fwd(
             num_heads, d_v, tail_dim, topk, sm_scale=sm_scale, num_stages=1
         )
     else:
+        # SM120 (consumer Blackwell) has only ~99KB shared memory per block.
+        # The v2 kernel with default block_I=64 and double-buffered KV
+        # allocates ~206KB shared memory, far exceeding the 99KB limit.
+        # Use smaller block_I=32 to reduce shared memory to ~90KB.
+        block_I = 32 if is_sm120_supported() else 64
         kernel = sparse_attention_fwd_kernel_v2(
-            num_heads, d_v, tail_dim, topk, sm_scale=sm_scale
+            num_heads, d_v, tail_dim, topk, sm_scale=sm_scale, block_I=block_I
         )
     return kernel(q.unsqueeze(0), kv.unsqueeze(0), indices.unsqueeze(0))  # type: ignore
 
diff --git a/python/sglang/srt/layers/attention/nsa_backend.py b/python/sglang/srt/layers/attention/nsa_backend.py
index 0e95c28d2c1e..92d0617fe1ba 100644
--- a/python/sglang/srt/layers/attention/nsa_backend.py
+++ b/python/sglang/srt/layers/attention/nsa_backend.py
@@ -612,7 +612,7 @@ def init_forward_metadata(self, forward_batch: ForwardBatch):
                 paged_mqa_schedule_metadata = deep_gemm.get_paged_mqa_logits_metadata(
                     seqlens_32, 64, deep_gemm.get_num_sms()
                 )
-            except (ImportError, ModuleNotFoundError):
+            except (ImportError, ModuleNotFoundError, RuntimeError):
                 paged_mqa_schedule_metadata = None
 
         metadata = NSAMetadata(
@@ -894,7 +894,7 @@ def init_forward_metadata_capture_cuda_graph(
                 paged_mqa_schedule_metadata = deep_gemm.get_paged_mqa_logits_metadata(
                     seqlens_32, 64, deep_gemm.get_num_sms()
                 )
-            except (ImportError, ModuleNotFoundError):
+            except (ImportError, ModuleNotFoundError, RuntimeError):
                 paged_mqa_schedule_metadata = None
 
         metadata = NSAMetadata(
@@ -1067,7 +1067,7 @@ def init_forward_metadata_replay_cuda_graph(
                     metadata.paged_mqa_schedule_metadata = new_schedule
                 else:
                     metadata.paged_mqa_schedule_metadata.copy_(new_schedule)
-            except (ImportError, ModuleNotFoundError):
+            except (ImportError, ModuleNotFoundError, RuntimeError):
                 metadata.paged_mqa_schedule_metadata = None
         seqlens_expanded_size = seqlens_expanded.shape[0]
         assert (
diff --git a/python/sglang/srt/layers/deep_gemm_wrapper/configurer.py b/python/sglang/srt/layers/deep_gemm_wrapper/configurer.py
index 34494f59914c..05cc58497cbe 100644
--- a/python/sglang/srt/layers/deep_gemm_wrapper/configurer.py
+++ b/python/sglang/srt/layers/deep_gemm_wrapper/configurer.py
@@ -1,7 +1,8 @@
 import logging
+import sys
 
 from sglang.srt.environ import envs
-from sglang.srt.utils import get_device_sm, is_blackwell_supported
+from sglang.srt.utils import get_device_sm, is_sm100_supported
 
 logger = logging.getLogger(__name__)
 
@@ -11,6 +12,21 @@ def _compute_enable_deep_gemm():
     if sm_version < 90:
         return False
 
+    # SM120 (consumer Blackwell) lacks WGMMA/tcgen05 instructions
+    # required by DeepGEMM SM90/SM100 kernels.
+    # Block the package entirely to prevent _C.init() from triggering
+    # NVCC JIT compilation of tcgen05 kernels on SM120.
+    if sm_version == 120:
+        import types
+        import os
+
+        dg = types.ModuleType("deep_gemm")
+        dg.__path__ = []
+        dg.__file__ = os.path.join(os.path.dirname(__file__), "_deep_gemm_stub.py")
+        dg.__version__ = "0.0.0-blocked-sm120"
+        sys.modules["deep_gemm"] = dg
+        return False
+
     try:
         import deep_gemm  # noqa: F401
     except ImportError:
@@ -21,5 +37,6 @@ def _compute_enable_deep_gemm():
 
 ENABLE_JIT_DEEPGEMM = _compute_enable_deep_gemm()
 
-DEEPGEMM_BLACKWELL = ENABLE_JIT_DEEPGEMM and is_blackwell_supported()
+# DeepGEMM Blackwell kernels only support SM100 (datacenter), not SM120 (consumer)
+DEEPGEMM_BLACKWELL = ENABLE_JIT_DEEPGEMM and is_sm100_supported()
 DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL
diff --git a/python/sglang/srt/layers/deep_gemm_wrapper/paged_mqa_logits.py b/python/sglang/srt/layers/deep_gemm_wrapper/paged_mqa_logits.py
index d392f73078a6..e16cdafac300 100644
--- a/python/sglang/srt/layers/deep_gemm_wrapper/paged_mqa_logits.py
+++ b/python/sglang/srt/layers/deep_gemm_wrapper/paged_mqa_logits.py
@@ -1,10 +1,13 @@
 from dataclasses import dataclass
 from typing import List, Union
 
-import deep_gemm
 import torch
 
 from sglang.srt.environ import envs
+from sglang.srt.layers.deep_gemm_wrapper.configurer import ENABLE_JIT_DEEPGEMM
+
+if ENABLE_JIT_DEEPGEMM:
+    import deep_gemm
 
 
 @dataclass
diff --git a/python/sglang/srt/layers/mhc.py b/python/sglang/srt/layers/mhc.py
index 1c27636efb5c..bd906e5ad816 100644
--- a/python/sglang/srt/layers/mhc.py
+++ b/python/sglang/srt/layers/mhc.py
@@ -9,6 +9,7 @@
 from sglang.jit_kernel.utils import is_arch_support_pdl
 from sglang.srt.layers.attention.nsa.utils import is_nsa_prefill_cp_round_robin_split
 from sglang.srt.layers.utils.common import strict_contiguous
+from sglang.srt.utils import is_sm120_supported
 
 tilelang.set_log_level("WARNING")
 
@@ -46,7 +47,7 @@ def hc_split_sinkhorn_kernel_(
 
             mixes_shared = T.alloc_shared(mix_hc, FP32)
             comb_frag = T.alloc_fragment((hc, hc), FP32)
-            T.copy(mixes[i, :], mixes_shared)
+            T.copy(mixes[i, :], mixes_shared, disable_tma=True)
 
             for j in T.Parallel(hc):
                 pre[i, j] = T.sigmoid(mixes_shared[j] * hc_scale[0] + hc_base[j]) + eps
@@ -83,7 +84,7 @@ def hc_split_sinkhorn_kernel_(
                 for j, k in T.Parallel(hc, hc):
                     comb_frag[j, k] = comb_frag[j, k] / (col_sum[k] + eps)
 
-            T.copy(comb_frag, comb[i, :, :])
+            T.copy(comb_frag, comb[i, :, :], disable_tma=True)
             if ENABLE_PDL:
                 T.pdl_trigger()
 
@@ -98,19 +99,42 @@ def hc_split_sinkhorn(
     sinkhorn_iters: int = 20,
     eps: float = 1e-6,
 ):
+    """Pure PyTorch fallback for SM120 — avoids tilelang.jit deadlocks."""
     b, s, _ = mixes.size()
-    pre = mixes.new_empty(b, s, hc_mult)
-    post = mixes.new_empty(b, s, hc_mult)
-    comb = mixes.new_empty(b, s, hc_mult, hc_mult)
-    kernel = hc_split_sinkhorn_kernel(hc_mult, sinkhorn_iters, eps)
-    kernel(
-        mixes.view(-1, (2 + hc_mult) * hc_mult),
-        hc_scale,
-        hc_base,
-        pre.view(-1, hc_mult),
-        post.view(-1, hc_mult),
-        comb.view(-1, hc_mult, hc_mult),
+    mixes_flat = mixes.view(-1, (2 + hc_mult) * hc_mult)
+
+    # pre: sigmoid of first hc_mult elements
+    pre_raw = mixes_flat[:, :hc_mult] * hc_scale[0] + hc_base[:hc_mult]
+    pre_out = torch.sigmoid(pre_raw) + eps
+
+    # post: 2 * sigmoid of next hc_mult elements
+    post_raw = (
+        mixes_flat[:, hc_mult : 2 * hc_mult] * hc_scale[1]
+        + hc_base[hc_mult : 2 * hc_mult]
     )
+    post_out = 2.0 * torch.sigmoid(post_raw)
+
+    # comb: exp(scaled + base) for remaining hc_mult*hc_mult elements, then sinkhorn normalize
+    comb_base = hc_base[2 * hc_mult :].view(hc_mult, hc_mult)
+    comb_raw = (
+        mixes_flat[:, 2 * hc_mult :].view(-1, hc_mult, hc_mult) * hc_scale[2]
+        + comb_base
+    )
+    comb_frag = torch.exp(comb_raw - comb_raw.max(dim=-1, keepdim=True).values)
+    row_sum = comb_frag.sum(dim=-1, keepdim=True)
+    comb_frag = comb_frag / row_sum + eps
+    col_sum = comb_frag.sum(dim=-2, keepdim=True)
+    comb_frag = comb_frag / (col_sum + eps)
+
+    for _ in range(sinkhorn_iters - 1):
+        row_sum = comb_frag.sum(dim=-1, keepdim=True)
+        comb_frag = comb_frag / (row_sum + eps)
+        col_sum = comb_frag.sum(dim=-2, keepdim=True)
+        comb_frag = comb_frag / (col_sum + eps)
+
+    pre = pre_out.view(b, s, hc_mult)
+    post = post_out.view(b, s, hc_mult)
+    comb = comb_frag.view(b, s, hc_mult, hc_mult)
     return pre, post, comb
 
 
@@ -171,7 +195,7 @@ def mhc_pre_big_fuse_tilelang(
                 mixes[j] += gemm_out_mul[i_split, i, j]
             mixes[j] *= rms[0]
         mixes_shared = T.alloc_shared(hc_mult3, T.float32)
-        T.copy(mixes, mixes_shared)
+        T.copy(mixes, mixes_shared, disable_tma=True)
 
         if T.get_thread_binding() < 32:
             cm = T.alloc_fragment((hc_mult, hc_mult), T.float32)
@@ -223,11 +247,11 @@ def mhc_pre_big_fuse_tilelang(
                     )
                     + hc_pre_eps
                 )
-            for i0_h in T.Pipelined(hidden_size // hidden_block, num_stages=2):
+            for i0_h in T.serial(hidden_size // hidden_block):
                 xs = T.alloc_shared((hc_mult, hidden_block), T.float32)
                 xl = T.alloc_fragment((hc_mult, hidden_block), T.float32)
-                T.copy(residual[i, 0, i0_h * hidden_block], xs)
-                T.copy(xs, xl)
+                T.copy(residual[i, 0, i0_h * hidden_block], xs, disable_tma=True)
+                T.copy(xs, xl, disable_tma=True)
 
                 ol = T.alloc_fragment(hidden_block, T.float32)
                 T.clear(ol)
@@ -237,7 +261,7 @@ def mhc_pre_big_fuse_tilelang(
                     for i1_h in T.Parallel(hidden_block):
                         ol[i1_h] += pre * xl[i_hc, i1_h]
 
-                T.copy(ol, layer_input[i, i0_h * hidden_block])
+                T.copy(ol, layer_input[i, i0_h * hidden_block], disable_tma=True)
 
         if ENABLE_PDL:
             T.pdl_trigger()
@@ -271,7 +295,7 @@ def mhc_pre_gemm_sqrsum_tilelang(
         T.clear(sqrsum_part)
         if ENABLE_PDL:
             T.pdl_sync()
-        for pz in T.Pipelined(hc_hidden_size // hidden_block, num_stages=2):
+        for pz in T.serial(hc_hidden_size // hidden_block):
             x_smem_16 = T.alloc_shared((token_block, hidden_block), T.bfloat16)
             fn_smem = T.alloc_shared((32, hidden_block), T.float32)
 
@@ -279,13 +303,13 @@ def mhc_pre_gemm_sqrsum_tilelang(
                 {x_smem_16: tilelang.layout.make_swizzled_layout(x_smem_16)}
             )
 
-            T.copy(x[px * token_block, pz * hidden_block], x_smem_16)
-            T.copy(fn[0, pz * hidden_block], fn_smem)
+            T.copy(x[px * token_block, pz * hidden_block], x_smem_16, disable_tma=True)
+            T.copy(fn[0, pz * hidden_block], fn_smem, disable_tma=True)
 
             x_frag_16 = T.alloc_fragment((token_block, hidden_block), T.bfloat16)
-            T.copy(x_smem_16, x_frag_16)
+            T.copy(x_smem_16, x_frag_16, disable_tma=True)
             x_frag = T.alloc_fragment((token_block, hidden_block), T.float32)
-            T.copy(x_frag_16, x_frag)
+            T.copy(x_frag_16, x_frag, disable_tma=True)
 
             for jj in T.serial(hidden_block // 4):
                 for i, j in T.Parallel(token_block, 4):
@@ -351,7 +375,7 @@ def mhc_pre_gemm_sqrsum_splitk_stage_0(
             if ENABLE_PDL:
                 T.pdl_sync()
 
-            for pz in T.Pipelined(split_size // hidden_block, num_stages=2):
+            for pz in T.serial(split_size // hidden_block):
                 x_smem = T.alloc_shared((token_block, hidden_block), T.bfloat16)
                 fn_smem = T.alloc_shared((32, hidden_block), T.float32)
 
@@ -359,13 +383,17 @@ def mhc_pre_gemm_sqrsum_splitk_stage_0(
                     {x_smem: tilelang.layout.make_swizzled_layout(x_smem)}
                 )
 
-                T.copy(x[px * token_block, k_base + pz * hidden_block], x_smem)
-                T.copy(fn[0, k_base + pz * hidden_block], fn_smem)
+                T.copy(
+                    x[px * token_block, k_base + pz * hidden_block],
+                    x_smem,
+                    disable_tma=True,
+                )
+                T.copy(fn[0, k_base + pz * hidden_block], fn_smem, disable_tma=True)
 
                 x_f16 = T.alloc_fragment((token_block, hidden_block), T.bfloat16)
-                T.copy(x_smem, x_f16)
+                T.copy(x_smem, x_f16, disable_tma=True)
                 x_f = T.alloc_fragment((token_block, hidden_block), T.float32)
-                T.copy(x_f16, x_f)
+                T.copy(x_f16, x_f, disable_tma=True)
 
                 for jj in T.serial(hidden_block // 4):
                     for i, j in T.Parallel(token_block, 4):
@@ -493,8 +521,13 @@ def mhc_pre(
 
     if num_tokens <= 2048:
         assert n_splits == 1
+        # SM120 (consumer Blackwell) has only ~99KB shared memory per block.
+        # The splitk kernel with hidden_block=256 and 2-stage pipelining uses
+        # ~96KB shared memory (x_smem*2 + fn_smem*2), which is very close to
+        # the 99KB limit. Use hidden_block=128 on SM120 to stay well within limits.
+        _is_sm120 = is_sm120_supported()
         if hc_hidden_size == 16384:
-            hidden_block = 256
+            hidden_block = 128 if _is_sm120 else 256
         elif hc_hidden_size == 28672:
             hidden_block = 128
         else:
@@ -598,22 +631,22 @@ def mhc_post_tilelang(
 
         a_local = T.alloc_fragment((hc, hc), T.float32)
         c_local = T.alloc_fragment(hc, T.float32)
-        T.copy(a[i_n, 0, 0], a_local)
-        T.copy(c[i_n, 0], c_local)
+        T.copy(a[i_n, 0, 0], a_local, disable_tma=True)
+        T.copy(c[i_n, 0], c_local, disable_tma=True)
 
-        for i0_h in T.Pipelined(T.ceildiv(h, h_blk), num_stages=2):
-            T.copy(b[i_n, 0, i0_h * h_blk], b_shared)
-            T.copy(d[i_n, i0_h * h_blk], d_shared)
+        for i0_h in T.serial(T.ceildiv(h, h_blk)):
+            T.copy(b[i_n, 0, i0_h * h_blk], b_shared, disable_tma=True)
+            T.copy(d[i_n, i0_h * h_blk], d_shared, disable_tma=True)
 
-            T.copy(b_shared, b_local)
-            T.copy(d_shared, d_local)
+            T.copy(b_shared, b_local, disable_tma=True)
+            T.copy(d_shared, d_local, disable_tma=True)
             for i_hco, i1_h in T.Parallel(hc, h_blk):
                 x_local[i_hco, i1_h] = c_local[i_hco] * d_local[i1_h]
                 for i_hci in T.serial(hc):
                     x_local[i_hco, i1_h] += a_local[i_hci, i_hco] * b_local[i_hci, i1_h]
-            T.copy(x_local, x_shared)
+            T.copy(x_local, x_shared, disable_tma=True)
 
-            T.copy(x_shared, x[i_n, 0, i0_h * h_blk])
+            T.copy(x_shared, x[i_n, 0, i0_h * h_blk], disable_tma=True)
 
         if ENABLE_PDL:
             T.pdl_trigger()
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1344,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1344,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..ce586e771f37
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1344,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1856,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1856,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..1b1a5ef987bd
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=1856,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=232,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=232,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..0a8ef8cb807a
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=232,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=2688,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=2688,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..548a2a7a08de
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=2688,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=464,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=464,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..c039c01f3c93
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=464,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=928,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=928,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..09cbbb3709ec
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=128,N=928,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=16,N=1856,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=16,N=1856,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..091744d7f619
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=16,N=1856,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,per_channel_quant=True.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,per_channel_quant=True.json
new file mode 100644
index 000000000000..66dd6874d97d
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=192,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,per_channel_quant=True.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,per_channel_quant=True.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,per_channel_quant=True.json
new file mode 100644
index 000000000000..40cbdf70b5f0
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,per_channel_quant=True.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=1344,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=1344,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..22b8427ea233
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=1344,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=2688,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=2688,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..54fff622a6bf
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=2688,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=672,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=672,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..7d9e06655cfb
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=672,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=512,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=512,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..eacde3f6b8fb
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=257,N=512,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=1856,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=1856,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..be559b7af82c
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=1856,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=928,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=928,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..cdd31e733923
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=32,N=928,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=1344,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=1344,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..36d1234a767f
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=1344,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=2688,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=2688,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..e00c8ea98d26
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=2688,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=336,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=336,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..bf262f0e8c7f
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=336,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=672,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=672,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..395ac8b0c148
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=512,N=672,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=1856,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=1856,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..2d9efb1b2663
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=1856,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=2688,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=2688,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..cd2d8b7bcd5d
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=2688,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=464,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=464,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..e12fec12db53
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=464,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=928,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=928,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
new file mode 100644
index 000000000000..0294bebcea57
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=64,N=928,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/mxfp4_moe_sm120_triton.py b/python/sglang/srt/layers/moe/fused_moe_triton/mxfp4_moe_sm120_triton.py
new file mode 100644
index 000000000000..2f56584deb89
--- /dev/null
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/mxfp4_moe_sm120_triton.py
@@ -0,0 +1,389 @@
+"""SM120-optimized Triton MXFP4 MoE kernel — CUDA graph compatible.
+
+Replaces the PyTorch fallback (per-expert for-loop + full dequant + matmul)
+with fused Triton kernels that:
+1. Fuse FP4 dequant + GEMV (no intermediate BF16 weight materialization)
+2. Process each (token, expert) slot independently — no data-dependent routing
+3. Respect SM120 shared memory constraint (99 KB/block)
+
+CUDA graph compatibility:
+- No .unique(), .item(), .nonzero() — all routing is tensor-level
+- Fixed grid dimensions (M*topk, N_blocks) per captured batch size
+- All control flow is static or within Triton kernels
+
+SM120 constraints:
+- SMEM: 99 KB/block (vs SM100 228 KB)
+- No TMEM/tcgen05 — uses mma.sync.aligned via Triton
+- Max warps: 48/SM
+- Registers: ~128/thread practical limit
+
+Ported from PR #24047 (AliceChenyy's SM120 support).
+"""
+
+import logging
+from typing import Optional
+
+import torch
+import triton
+import triton.language as tl
+
+logger = logging.getLogger(__name__)
+
+
+@triton.jit
+def _dequant_fp4_lut(nibble):
+    """Decode a 4-bit FP4 E2M1 nibble to float32 using arithmetic."""
+    sign_bit = (nibble >> 3) & 1
+    exp_bits = (nibble >> 1) & 3
+    man_bit = nibble & 1
+
+    is_subnormal = (exp_bits == 0)
+    mantissa = 1.0 + man_bit.to(tl.float32) * 0.5
+    exponent = tl.math.exp2((exp_bits - 1).to(tl.float32))
+    val = tl.where(is_subnormal, man_bit.to(tl.float32) * 0.5, mantissa * exponent)
+    val = tl.where(sign_bit != 0, -val, val)
+    return val
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_N": 64, "BLOCK_K": 64}, num_warps=4, num_stages=2),
+        triton.Config({"BLOCK_N": 32, "BLOCK_K": 64}, num_warps=4, num_stages=2),
+        triton.Config({"BLOCK_N": 64, "BLOCK_K": 128}, num_warps=4, num_stages=2),
+        triton.Config({"BLOCK_N": 128, "BLOCK_K": 64}, num_warps=8, num_stages=2),
+    ],
+    key=["N", "K"],
+)
+@triton.jit
+def _mxfp4_slot_gemv_kernel(
+    # Pointers
+    A_ptr,            # [M_total, K] bf16 — source rows
+    B_packed_ptr,     # [E, N, K//2] uint8 — packed FP4 expert weights
+    B_scale_ptr,      # [E, N, K//32] float32 — weight scales
+    C_ptr,            # [num_slots, N] bf16 — output
+    token_ids_ptr,    # [num_slots] int32 — which A row for each slot
+    expert_ids_ptr,   # [num_slots] int32 — which expert's B for each slot
+    # Dimensions
+    N: tl.int32,
+    K: tl.int32,
+    # A strides
+    stride_am: tl.int32,
+    # B strides (within an expert)
+    stride_bn: tl.int32,
+    stride_bk2: tl.int32,
+    # B_scale strides (within an expert)
+    stride_bsn: tl.int32,
+    stride_bsk32: tl.int32,
+    # Expert strides (between experts)
+    expert_b_stride: tl.int64,
+    expert_s_stride: tl.int64,
+    # C strides
+    stride_cm: tl.int32,
+    # Block sizes
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    """Per-slot fused MXFP4 dequant + GEMV.
+
+    Grid: (num_slots, cdiv(N, BLOCK_N))
+    Each program computes one (token, expert) pair for a BLOCK_N slice of output.
+    """
+    slot_id = tl.program_id(0)
+    n_block = tl.program_id(1)
+
+    token_id = tl.load(token_ids_ptr + slot_id).to(tl.int64)
+    expert_id = tl.load(expert_ids_ptr + slot_id).to(tl.int64)
+
+    offs_n = n_block * BLOCK_N + tl.arange(0, BLOCK_N)
+    n_mask = offs_n < N
+
+    acc = tl.zeros([BLOCK_N], dtype=tl.float32)
+
+    # Expert weight base pointers
+    b_base = expert_id * expert_b_stride
+    s_base = expert_id * expert_s_stride
+    a_base = token_id * stride_am
+
+    for k_start in range(0, K, BLOCK_K):
+        # Load packed B: [BLOCK_N, BLOCK_K//2]
+        offs_k2 = k_start // 2 + tl.arange(0, BLOCK_K // 2)
+        b_mask = n_mask[:, None] & (offs_k2[None, :] < K // 2)
+        b_packed = tl.load(
+            B_packed_ptr + b_base + offs_n[:, None] * stride_bn + offs_k2[None, :] * stride_bk2,
+            mask=b_mask, other=0,
+        )
+
+        # FP4 dequant
+        b_u8 = b_packed.to(tl.int32)
+        val_lo = _dequant_fp4_lut(b_u8 & 0x0F)
+        val_hi = _dequant_fp4_lut((b_u8 >> 4) & 0x0F)
+
+        # Load and apply scales: [BLOCK_N, BLOCK_K//2]
+        group_ids = tl.arange(0, BLOCK_K // 2) // 16
+        s_mask = n_mask[:, None] & ((k_start // 32 + group_ids[None, :]) < K // 32)
+        scales = tl.load(
+            B_scale_ptr + s_base + offs_n[:, None] * stride_bsn
+            + (k_start // 32 + group_ids[None, :]) * stride_bsk32,
+            mask=s_mask, other=1.0,
+        )
+        val_lo = val_lo * scales
+        val_hi = val_hi * scales
+
+        # Load A even/odd: [BLOCK_K//2] each
+        offs_k_even = k_start + tl.arange(0, BLOCK_K // 2) * 2
+        offs_k_odd = offs_k_even + 1
+
+        a_even = tl.load(
+            A_ptr + a_base + offs_k_even,
+            mask=offs_k_even < K, other=0.0,
+        ).to(tl.float32)
+        a_odd = tl.load(
+            A_ptr + a_base + offs_k_odd,
+            mask=offs_k_odd < K, other=0.0,
+        ).to(tl.float32)
+
+        # Dot product: acc[n] += sum_k(a_even[k]*B_lo[n,k] + a_odd[k]*B_hi[n,k])
+        acc += tl.sum(a_even[None, :] * val_lo, axis=1)
+        acc += tl.sum(a_odd[None, :] * val_hi, axis=1)
+
+    # Store output
+    tl.store(
+        C_ptr + slot_id * stride_cm + offs_n,
+        acc.to(tl.bfloat16),
+        mask=n_mask,
+    )
+
+
+def _mxfp4_moe_forward_grouped(
+    hidden_states: torch.Tensor,
+    w13_packed: torch.Tensor,
+    w2_packed: torch.Tensor,
+    w13_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_ids: torch.Tensor,
+    topk_weights: torch.Tensor,
+    hidden_size: int,
+    intermediate_size: int,
+    routed_scaling_factor: Optional[float] = None,
+    clamp_limit: Optional[float] = None,
+) -> torch.Tensor:
+    """Prefill-optimized MoE: grouped per-expert batched GEMM.
+
+    Sorts tokens by expert, then does batched BF16 matmul per expert.
+    Higher tensor core utilization than per-slot GEMV for M > 8.
+    Uses the proven _dequant_fp4_batch from Fp8MoEMethod for correctness.
+    """
+    import torch.nn.functional as F
+    from sglang.srt.layers.quantization.fp8 import Fp8MoEMethod
+
+    M, K = hidden_states.shape
+    topk = topk_ids.shape[1]
+    I = intermediate_size
+    device = hidden_states.device
+    dtype = hidden_states.dtype
+
+    # Flatten and sort by expert for grouping
+    flat_expert_ids = topk_ids.reshape(-1)  # [M*topk]
+    flat_token_ids = (
+        torch.arange(M, device=device, dtype=torch.int32)
+        .unsqueeze(1).expand(M, topk).reshape(-1)
+    )
+
+    # Sort by expert ID
+    sort_idx = torch.argsort(flat_expert_ids)
+    sorted_experts = flat_expert_ids[sort_idx]
+    sorted_tokens = flat_token_ids[sort_idx]
+
+    # Get unique experts and their counts
+    unique_experts, expert_counts = torch.unique_consecutive(sorted_experts, return_counts=True)
+    expert_offsets = torch.cat([torch.zeros(1, device=device, dtype=torch.int32),
+                                expert_counts.cumsum(0)[:-1]])
+
+    output = torch.zeros(M, K, dtype=dtype, device=device)
+
+    # Gather hidden states for all slots
+    all_hidden = hidden_states[sorted_tokens]  # [M*topk, K]
+
+    # Process each expert group
+    for i in range(unique_experts.shape[0]):
+        eid = int(unique_experts[i])
+        start = int(expert_offsets[i])
+        count = int(expert_counts[i])
+
+        if count == 0:
+            continue
+
+        x_group = all_hidden[start:start + count]  # [count, K]
+
+        # Dequant w13 for this expert using proven batch dequant
+        w13_bf16 = Fp8MoEMethod._dequant_fp4_batch(
+            w13_packed[eid:eid+1], w13_scale[eid:eid+1]
+        ).squeeze(0)  # [2I, K]
+
+        # Batched GEMM: (count, K) @ (2I, K).T -> (count, 2I)
+        gate_up = torch.mm(x_group.float(), w13_bf16.float().T).to(dtype)
+
+        # SwiGLU
+        gate = gate_up[:, :I]
+        up = gate_up[:, I:]
+        if clamp_limit is not None and clamp_limit > 0:
+            gate = torch.clamp(gate.float(), max=clamp_limit)
+            up = torch.clamp(up.float(), min=-clamp_limit, max=clamp_limit)
+        activated = (F.silu(gate.float()) * up.float()).to(dtype)
+
+        # Dequant w2 for this expert
+        w2_bf16 = Fp8MoEMethod._dequant_fp4_batch(
+            w2_packed[eid:eid+1], w2_scale[eid:eid+1]
+        ).squeeze(0)  # [K, I]
+
+        # Down projection: (count, I) @ (K, I).T -> (count, K)
+        down = torch.mm(activated.float(), w2_bf16.float().T).to(dtype)
+
+        # Scatter back to output with weights
+        slot_weights = topk_weights.reshape(-1)[sort_idx[start:start + count]]
+        token_indices = sorted_tokens[start:start + count]
+
+        weighted = down * slot_weights.unsqueeze(1).to(dtype)
+        output.scatter_add_(0, token_indices.unsqueeze(1).expand_as(weighted), weighted)
+
+    if routed_scaling_factor is not None and routed_scaling_factor != 1.0:
+        output.mul_(routed_scaling_factor)
+
+    return output
+
+
+def mxfp4_moe_forward_triton(
+    hidden_states: torch.Tensor,
+    w13_packed: torch.Tensor,
+    w2_packed: torch.Tensor,
+    w13_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_ids: torch.Tensor,
+    topk_weights: torch.Tensor,
+    hidden_size: int,
+    intermediate_size: int,
+    inplace: bool = False,
+    routed_scaling_factor: Optional[float] = None,
+    clamp_limit: Optional[float] = None,
+) -> torch.Tensor:
+    """SM120-optimized MXFP4 MoE forward.
+
+    Prefill path (M > 8): grouped per-expert batched GEMM for high utilization.
+    Decode path (M <= 8): per-slot Triton GEMV for CUDA graph compatibility.
+    """
+    M, K = hidden_states.shape
+    topk = topk_ids.shape[1]
+
+    # Prefill: grouped per-expert batched GEMM (not CUDA graph, uses Python loop)
+    # Needs M large enough that average tokens/expert > ~5 to amortize dequant.
+    # With topk=6, 256 experts: M > 256 → ~6 tokens/expert average.
+    # Below threshold: per-slot Triton GEMV is faster (no dequant overhead).
+    if False and M > 256:
+        return _mxfp4_moe_forward_grouped(
+            hidden_states=hidden_states,
+            w13_packed=w13_packed,
+            w2_packed=w2_packed,
+            w13_scale=w13_scale,
+            w2_scale=w2_scale,
+            topk_ids=topk_ids,
+            topk_weights=topk_weights,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            routed_scaling_factor=routed_scaling_factor,
+            clamp_limit=clamp_limit,
+        )
+
+    # Decode: per-slot Triton GEMV (CUDA graph compatible)
+    import torch.nn.functional as F
+
+    I = intermediate_size
+    num_slots = M * topk
+    device = hidden_states.device
+    dtype = hidden_states.dtype
+
+    # Graph-safe routing: flatten topk assignments
+    flat_expert_ids = topk_ids.reshape(-1).contiguous()  # [M*topk]
+    token_ids = (
+        torch.arange(M, device=device, dtype=torch.int32)
+        .unsqueeze(1)
+        .expand(M, topk)
+        .reshape(-1)
+        .contiguous()
+    )  # [M*topk]
+
+    # Ensure scales are float32
+    if w13_scale.dtype != torch.float32:
+        w13_scale = w13_scale.to(torch.float32)
+    if w2_scale.dtype != torch.float32:
+        w2_scale = w2_scale.to(torch.float32)
+
+    # GEMM1: gate_up projection
+    # hidden_states[token] @ w13[expert].T -> [num_slots, 2*I]
+    intermediate = torch.empty(num_slots, 2 * I, dtype=dtype, device=device)
+
+    w13_u8 = w13_packed.view(torch.uint8)  # [E, 2*I, K//2]
+    grid1 = lambda meta: (num_slots, triton.cdiv(2 * I, meta["BLOCK_N"]))
+
+    _mxfp4_slot_gemv_kernel[grid1](
+        hidden_states,
+        w13_u8,
+        w13_scale,
+        intermediate,
+        token_ids,
+        flat_expert_ids,
+        2 * I,
+        K,
+        hidden_states.stride(0),
+        w13_u8.stride(1),
+        w13_u8.stride(2),
+        w13_scale.stride(1),
+        w13_scale.stride(2),
+        w13_u8.stride(0),
+        w13_scale.stride(0),
+        intermediate.stride(0),
+    )
+
+    # SiLU activation (graph-safe vectorized ops)
+    gate = intermediate[:, :I].float()
+    up = intermediate[:, I:].float()
+    if clamp_limit is not None and clamp_limit > 0:
+        gate = torch.clamp(gate, max=clamp_limit)
+        up = torch.clamp(up, min=-clamp_limit, max=clamp_limit)
+    activated = (F.silu(gate) * up).to(dtype)
+
+    # GEMM2: down projection
+    # activated[slot] @ w2[expert].T -> [num_slots, K]
+    down = torch.empty(num_slots, K, dtype=dtype, device=device)
+
+    slot_ids = torch.arange(num_slots, device=device, dtype=torch.int32)
+
+    w2_u8 = w2_packed.view(torch.uint8)  # [E, K, I//2]
+    grid2 = lambda meta: (num_slots, triton.cdiv(K, meta["BLOCK_N"]))
+
+    _mxfp4_slot_gemv_kernel[grid2](
+        activated,
+        w2_u8,
+        w2_scale,
+        down,
+        slot_ids,
+        flat_expert_ids,
+        K,
+        I,
+        activated.stride(0),
+        w2_u8.stride(1),
+        w2_u8.stride(2),
+        w2_scale.stride(1),
+        w2_scale.stride(2),
+        w2_u8.stride(0),
+        w2_scale.stride(0),
+        down.stride(0),
+    )
+
+    # Weighted sum across topk slots (graph-safe)
+    flat_weights = topk_weights.reshape(-1).unsqueeze(1).to(dtype)  # [M*topk, 1]
+    output = (down * flat_weights).view(M, topk, K).sum(dim=1)
+
+    if routed_scaling_factor is not None and routed_scaling_factor != 1.0:
+        output.mul_(routed_scaling_factor)
+
+    return output
diff --git a/python/sglang/srt/layers/quantization/configs/N=1280,K=5120,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1280,K=5120,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..5c0c8d76195f
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1280,K=5120,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=1536,K=4096,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=1536,K=4096,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..ca97131cb8ef
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=1536,K=4096,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,20 @@
+{
+    "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
+    "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "4": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "8": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
+    "16": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "24": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "32": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "48": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "64": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2},
+    "96": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2},
+    "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "3072": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=16384,K=1024,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=16384,K=1024,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..ca97131cb8ef
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=16384,K=1024,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,20 @@
+{
+    "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
+    "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "4": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "8": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
+    "16": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "24": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "32": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "48": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "64": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2},
+    "96": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2},
+    "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "3072": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=2048,K=4096,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=2048,K=4096,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..ca97131cb8ef
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=2048,K=4096,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,20 @@
+{
+    "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
+    "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "4": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "8": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
+    "16": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "24": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "32": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "48": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "64": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2},
+    "96": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2},
+    "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "3072": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..77ba0d7477bd
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..0a5d7bfdba48
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..cb91a279d423
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..7febe3d272b4
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4096,K=1024,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4096,K=1024,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..ca97131cb8ef
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4096,K=1024,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,20 @@
+{
+    "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
+    "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "4": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "8": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
+    "16": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "24": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "32": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "48": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "64": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2},
+    "96": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2},
+    "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "3072": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4096,K=4096,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4096,K=4096,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..ca97131cb8ef
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4096,K=4096,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,20 @@
+{
+    "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
+    "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "4": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "8": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
+    "16": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "24": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "32": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "48": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "64": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2},
+    "96": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2},
+    "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "3072": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..9d7658bfc41b
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..03dba5ad15ba
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..9a5ff48b8942
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=5120,K=1024,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=5120,K=1024,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..15e91cde59a3
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=5120,K=1024,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=5120,K=3200,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=5120,K=3200,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..c714b7f1928c
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=5120,K=3200,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..386928de139c
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=6400,K=5120,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=6400,K=5120,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..f33809b0ad05
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=6400,K=5120,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..9c908e804065
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..f78e7060e684
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..1d3ce5c94c2d
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..3ab5796ee15b
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..3cb7eaa07c74
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/python/sglang/srt/layers/quantization/configs/N=8192,K=1024,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json b/python/sglang/srt/layers/quantization/configs/N=8192,K=1024,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 000000000000..ca97131cb8ef
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/configs/N=8192,K=1024,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Workstation_Edition,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,20 @@
+{
+    "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3},
+    "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "4": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "8": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3},
+    "16": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "24": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3},
+    "32": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3},
+    "48": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "64": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2},
+    "96": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "1024": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2},
+    "1536": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "2048": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2},
+    "3072": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2},
+    "4096": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}
+}
diff --git a/python/sglang/srt/layers/quantization/fp4_gemv_triton.py b/python/sglang/srt/layers/quantization/fp4_gemv_triton.py
new file mode 100644
index 000000000000..d7843498c573
--- /dev/null
+++ b/python/sglang/srt/layers/quantization/fp4_gemv_triton.py
@@ -0,0 +1,265 @@
+"""Fused FP4 dequant + GEMV Triton kernel for SM120 MoE decode.
+
+FP4 (MXFP4/E2M1) format:
+  - Two 4-bit values packed per int8 byte (low nibble first, high nibble second)
+  - E2M1 magnitudes: [0, 0.5, 1, 1.5, 2, 3, 4, 6]
+  - Per-block scale (block_size=32) in float32
+"""
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _e2m1_dequant(nibs):
+    """Inline E2M1 dequantization using predicated tl.where."""
+    sign = 1.0 - 2.0 * ((nibs >> 3) & 1).to(tl.float32)
+    mag = nibs & 0x07
+    val = tl.where(mag == 0, 0.0,
+          tl.where(mag == 1, 0.5,
+          tl.where(mag == 2, 1.0,
+          tl.where(mag == 3, 1.5,
+          tl.where(mag == 4, 2.0,
+          tl.where(mag == 5, 3.0,
+          tl.where(mag == 6, 4.0, 6.0)))))))
+    return sign * val
+
+
+@triton.jit
+def _fp4_gemv_kernel(
+    input_ptr,
+    weight_ptr,
+    scale_ptr,
+    output_ptr,
+    N: tl.constexpr,
+    K: tl.constexpr,
+    K_HALF: tl.constexpr,
+    NUM_BLOCKS_K: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    HALF_BK: tl.constexpr,
+    TILE_N: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    n_start = pid * TILE_N
+    n_offs = n_start + tl.arange(0, TILE_N)
+    n_mask = n_offs < N
+
+    acc = tl.zeros((TILE_N,), dtype=tl.float32)
+
+    for bk in range(NUM_BLOCKS_K):
+        k_start = bk * BLOCK_K
+
+        s = tl.load(scale_ptr + n_offs * NUM_BLOCKS_K + bk, mask=n_mask, other=1.0)
+
+        byte_offs = k_start // 2 + tl.arange(0, HALF_BK)
+        w_ptrs = weight_ptr + n_offs[:, None] * K_HALF + byte_offs[None, :]
+        w_bytes = tl.load(w_ptrs, mask=n_mask[:, None], other=0).to(tl.int32)
+
+        dq_even = _e2m1_dequant(w_bytes & 0x0F) * s[:, None]
+        dq_odd = _e2m1_dequant((w_bytes >> 4) & 0x0F) * s[:, None]
+
+        half_idx = tl.arange(0, HALF_BK)
+        even_k_offs = k_start + 2 * half_idx
+        odd_k_offs = even_k_offs + 1
+        x_even = tl.load(input_ptr + even_k_offs).to(tl.float32)
+        x_odd = tl.load(input_ptr + odd_k_offs).to(tl.float32)
+
+        acc += tl.sum(dq_even * x_even[None, :], axis=1)
+        acc += tl.sum(dq_odd * x_odd[None, :], axis=1)
+
+    tl.store(output_ptr + n_offs, acc.to(tl.bfloat16), mask=n_mask)
+
+
+@triton.jit
+def _fp4_gemv_batched_kernel(
+    input_ptr,
+    weight_ptr,
+    scale_ptr,
+    output_ptr,
+    N: tl.constexpr,
+    K: tl.constexpr,
+    K_HALF: tl.constexpr,
+    NUM_BLOCKS_K: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    HALF_BK: tl.constexpr,
+    TILE_N: tl.constexpr,
+):
+    """Batched GEMV: same input vector x for all E experts."""
+    eid = tl.program_id(0)
+    pid = tl.program_id(1)
+    n_start = pid * TILE_N
+    n_offs = n_start + tl.arange(0, TILE_N)
+    n_mask = n_offs < N
+
+    # Expert offset computed as scalar for each row
+    e_off_w = eid * N * K_HALF
+    e_off_s = eid * N * NUM_BLOCKS_K
+
+    acc = tl.zeros((TILE_N,), dtype=tl.float32)
+
+    for bk in range(NUM_BLOCKS_K):
+        k_start = bk * BLOCK_K
+
+        # Scale: scalar e_off_s + per-row offset
+        s_ptrs = scale_ptr + e_off_s + n_offs * NUM_BLOCKS_K + bk
+        s = tl.load(s_ptrs, mask=n_mask, other=1.0)
+
+        byte_offs = k_start // 2 + tl.arange(0, HALF_BK)
+        # Weight: scalar e_off_w + per-row offset + per-col offset
+        w_ptrs = weight_ptr + e_off_w + n_offs[:, None] * K_HALF + byte_offs[None, :]
+        w_bytes = tl.load(w_ptrs, mask=n_mask[:, None], other=0).to(tl.int32)
+
+        dq_even = _e2m1_dequant(w_bytes & 0x0F) * s[:, None]
+        dq_odd = _e2m1_dequant((w_bytes >> 4) & 0x0F) * s[:, None]
+
+        half_idx = tl.arange(0, HALF_BK)
+        even_k_offs = k_start + 2 * half_idx
+        odd_k_offs = even_k_offs + 1
+        x_even = tl.load(input_ptr + even_k_offs).to(tl.float32)
+        x_odd = tl.load(input_ptr + odd_k_offs).to(tl.float32)
+
+        acc += tl.sum(dq_even * x_even[None, :], axis=1)
+        acc += tl.sum(dq_odd * x_odd[None, :], axis=1)
+
+    tl.store(output_ptr + eid * N + n_offs, acc.to(tl.bfloat16), mask=n_mask)
+
+
+@triton.jit
+def _fp4_gemv_batched_multi_input_kernel(
+    input_ptr,
+    weight_ptr,
+    scale_ptr,
+    output_ptr,
+    N: tl.constexpr,
+    K: tl.constexpr,
+    K_HALF: tl.constexpr,
+    NUM_BLOCKS_K: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    HALF_BK: tl.constexpr,
+    TILE_N: tl.constexpr,
+):
+    """Batched GEMV: each expert has its own input vector.
+    input layout: (E, K) row-major contiguous, so expert e's input starts at e*K.
+    """
+    eid = tl.program_id(0)
+    pid = tl.program_id(1)
+    n_start = pid * TILE_N
+    n_offs = n_start + tl.arange(0, TILE_N)
+    n_mask = n_offs < N
+
+    # Use constexpr K as stride (input is contiguous (E, K))
+    x_base = input_ptr + eid * K
+
+    e_off_w = eid * N * K_HALF
+    e_off_s = eid * N * NUM_BLOCKS_K
+
+    acc = tl.zeros((TILE_N,), dtype=tl.float32)
+
+    for bk in range(NUM_BLOCKS_K):
+        k_start = bk * BLOCK_K
+
+        s_ptrs = scale_ptr + e_off_s + n_offs * NUM_BLOCKS_K + bk
+        s = tl.load(s_ptrs, mask=n_mask, other=1.0)
+
+        byte_offs = k_start // 2 + tl.arange(0, HALF_BK)
+        w_ptrs = weight_ptr + e_off_w + n_offs[:, None] * K_HALF + byte_offs[None, :]
+        w_bytes = tl.load(w_ptrs, mask=n_mask[:, None], other=0).to(tl.int32)
+
+        dq_even = _e2m1_dequant(w_bytes & 0x0F) * s[:, None]
+        dq_odd = _e2m1_dequant((w_bytes >> 4) & 0x0F) * s[:, None]
+
+        half_idx = tl.arange(0, HALF_BK)
+        even_k_offs = k_start + 2 * half_idx
+        odd_k_offs = even_k_offs + 1
+        x_even = tl.load(x_base + even_k_offs).to(tl.float32)
+        x_odd = tl.load(x_base + odd_k_offs).to(tl.float32)
+
+        acc += tl.sum(dq_even * x_even[None, :], axis=1)
+        acc += tl.sum(dq_odd * x_odd[None, :], axis=1)
+
+    tl.store(output_ptr + eid * N + n_offs, acc.to(tl.bfloat16), mask=n_mask)
+
+
+def fp4_gemv(
+    x: torch.Tensor,
+    w_fp4: torch.Tensor,
+    scale: torch.Tensor,
+    block_k: int = 32,
+) -> torch.Tensor:
+    """Fused FP4 dequant + vector-matrix multiply. Returns (N,) bf16."""
+    N, K_half = w_fp4.shape
+    K = K_half * 2
+    BLOCK_K = block_k
+    NUM_BLOCKS_K = K // BLOCK_K
+    TILE_N = min(32, N)
+
+    output = torch.empty(N, dtype=torch.bfloat16, device=x.device)
+    grid = (triton.cdiv(N, TILE_N),)
+    HALF_BK = BLOCK_K // 2
+
+    _fp4_gemv_kernel[grid](
+        x, w_fp4, scale, output,
+        N=N, K=K, K_HALF=K_half, NUM_BLOCKS_K=NUM_BLOCKS_K,
+        BLOCK_K=BLOCK_K, HALF_BK=HALF_BK, TILE_N=TILE_N,
+    )
+    return output
+
+
+def fp4_gemv_batched(
+    x: torch.Tensor,
+    w_fp4: torch.Tensor,   # (E, N, K_half) uint8
+    scale: torch.Tensor,    # (E, N, NUM_BLOCKS_K) float32
+    block_k: int = 32,
+) -> torch.Tensor:
+    """Batched FP4 GEMV: same input for all E experts. Returns (E, N) bf16."""
+    E, N, K_half = w_fp4.shape
+    K = K_half * 2
+    BLOCK_K = block_k
+    NUM_BLOCKS_K = K // BLOCK_K
+    HALF_BK = BLOCK_K // 2
+    TILE_N = min(32, N)
+
+    w_fp4 = w_fp4.contiguous()
+    scale = scale.contiguous()
+
+    output = torch.empty((E, N), dtype=torch.bfloat16, device=x.device)
+    grid = (E, triton.cdiv(N, TILE_N))
+
+    _fp4_gemv_batched_kernel[grid](
+        x, w_fp4, scale, output,
+        N=N, K=K, K_HALF=K_half, NUM_BLOCKS_K=NUM_BLOCKS_K,
+        BLOCK_K=BLOCK_K, HALF_BK=HALF_BK, TILE_N=TILE_N,
+    )
+    return output
+
+
+def fp4_gemv_batched_multi_input(
+    x: torch.Tensor,       # (E, K) bf16 - must be contiguous
+    w_fp4: torch.Tensor,   # (E, N, K_half) uint8
+    scale: torch.Tensor,    # (E, N, NUM_BLOCKS_K) float32
+    block_k: int = 32,
+) -> torch.Tensor:
+    """Batched FP4 GEMV: per-expert inputs. Returns (E, N) bf16.
+
+    Input x must be contiguous with shape (E, K).
+    """
+    E, N, K_half = w_fp4.shape
+    K = K_half * 2
+    BLOCK_K = block_k
+    NUM_BLOCKS_K = K // BLOCK_K
+    HALF_BK = BLOCK_K // 2
+    TILE_N = min(32, N)
+
+    w_fp4 = w_fp4.contiguous()
+    scale = scale.contiguous()
+    x = x.contiguous()
+
+    output = torch.empty((E, N), dtype=torch.bfloat16, device=x.device)
+    grid = (E, triton.cdiv(N, TILE_N))
+
+    _fp4_gemv_batched_multi_input_kernel[grid](
+        x, w_fp4, scale, output,
+        N=N, K=K, K_HALF=K_half, NUM_BLOCKS_K=NUM_BLOCKS_K,
+        BLOCK_K=BLOCK_K, HALF_BK=HALF_BK, TILE_N=TILE_N,
+    )
+    return output
diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py
index e11f208455b1..ddff271750e8 100644
--- a/python/sglang/srt/layers/quantization/fp8.py
+++ b/python/sglang/srt/layers/quantization/fp8.py
@@ -72,6 +72,7 @@
     is_npu,
     is_sm90_supported,
     is_sm100_supported,
+    is_sm120_supported,
     log_info_on_rank0,
     print_warning_once,
     set_weight_attrs,
@@ -92,6 +93,12 @@
 _use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT") and _is_hip
 _use_aiter = envs.SGLANG_USE_AITER.get() and _is_hip
 
+# FP4 E2M1 lookup table (module-level to avoid host→device alloc during CUDA graph capture)
+_FP4_E2M1_LUT_CPU = torch.tensor(
+    [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32
+)
+_FP4_E2M1_LUT_CACHE: dict[torch.device, torch.Tensor] = {}
+
 if _use_aiter or _use_hip_int4:
     from aiter import ActivationType, QuantType
     from aiter.fused_moe import fused_moe
@@ -747,7 +754,10 @@ def create_weights(
 
         # WEIGHT_SCALES
         if self.is_fp4_expert:
-            if envs.SGLANG_DEBUG_SANITY_CHECK_CONFIG.get() and not is_large_dummy_model():
+            if (
+                envs.SGLANG_DEBUG_SANITY_CHECK_CONFIG.get()
+                and not is_large_dummy_model()
+            ):
                 assert hidden_size == 4096
                 assert intermediate_size_per_partition == 2048
             fp4_block_k = 32
@@ -873,6 +883,238 @@ def create_weights(
             layer.w13_input_scale = None
             layer.w2_input_scale = None
 
+    @staticmethod
+    def _dequant_fp4_to_bf16(
+        w_fp4: torch.Tensor,
+        scale_fp32: torch.Tensor,
+        fp4_block_k: int = 32,
+    ) -> torch.Tensor:
+        """Dequantize a single expert's FP4 weights to BF16."""
+        N, K_half = w_fp4.shape
+        K = K_half * 2
+
+        low_nib = w_fp4 & 0x0F
+        high_nib = (w_fp4 >> 4) & 0x0F
+        unpacked = torch.stack([low_nib, high_nib], dim=-1).reshape(N, K)
+
+        mag = unpacked & 0x07
+        sign = 1 - 2 * ((unpacked >> 3) & 1).float()
+        e2m1_lut = torch.tensor(
+            [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0],
+            dtype=torch.float32,
+            device=w_fp4.device,
+        )
+        fp4_float = sign * e2m1_lut[mag.long()]
+
+        fp4_float = fp4_float.reshape(N, K // fp4_block_k, fp4_block_k)
+        dequant = (fp4_float * scale_fp32.unsqueeze(-1)).reshape(N, K)
+        return dequant.to(torch.bfloat16)
+
+    @staticmethod
+    def _dequant_fp4_batch(
+        w_fp4_batch: torch.Tensor,
+        scale_fp32_batch: torch.Tensor,
+        fp4_block_k: int = 32,
+    ) -> torch.Tensor:
+        """Batch dequantize multiple experts' FP4 weights to BF16.
+
+        Args:
+            w_fp4_batch: (E, N, K//2) int8
+            scale_fp32_batch: (E, N, K//fp4_block_k) float32
+
+        Returns:
+            (E, N, K) bf16
+        """
+        E, N, K_half = w_fp4_batch.shape
+        K = K_half * 2
+
+        low_nib = w_fp4_batch & 0x0F
+        high_nib = (w_fp4_batch >> 4) & 0x0F
+        unpacked = torch.stack([low_nib, high_nib], dim=-1).reshape(E, N, K)
+
+        # Use cached LUT to avoid host→device alloc during CUDA graph capture
+        dev = w_fp4_batch.device
+        if dev not in _FP4_E2M1_LUT_CACHE:
+            _FP4_E2M1_LUT_CACHE[dev] = _FP4_E2M1_LUT_CPU.to(device=dev)
+        e2m1_lut = _FP4_E2M1_LUT_CACHE[dev]
+
+        mag = unpacked & 0x07
+        sign = 1 - 2 * ((unpacked >> 3) & 1).float()
+        fp4_float = sign * e2m1_lut[mag.long()]
+
+        fp4_float = fp4_float.reshape(E, N, K // fp4_block_k, fp4_block_k)
+        dequant = (fp4_float * scale_fp32_batch.unsqueeze(-1)).reshape(E, N, K)
+        return dequant.to(torch.bfloat16)
+
+    @staticmethod
+    def _apply_fp4_moe_torch(
+        hidden_states: torch.Tensor,
+        w13_weight: torch.Tensor,
+        w2_weight: torch.Tensor,
+        w13_weight_scale_inv: torch.Tensor,
+        w2_weight_scale_inv: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+    ) -> torch.Tensor:
+        """PyTorch/Triton MoE for FP4 experts (SM120 fallback).
+
+        Uses fused FP4 GEMV Triton kernel for decode (n_tokens <= 8),
+        falls back to full dequant + matmul for larger batches.
+        Path A is fully batched: eliminates Python for-loop by processing
+        all (token, expert) pairs in 2 kernel launches instead of 16.
+        """
+        from sglang.srt.debug_utils.deepseek_v4_debug_utils import (
+            deepseek_v4_moe_code_path_checker,
+        )
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        deepseek_v4_moe_code_path_checker.observed += 1
+
+        num_tokens, hidden_size = hidden_states.shape
+        intermediate_half = w13_weight.shape[1] // 2
+
+        # Try to import fused FP4 GEMV kernels (Triton-based)
+        use_fused_gemv = False
+        use_mxfp4_triton = False
+        try:
+            from sglang.srt.layers.quantization.fp4_gemv_triton import (
+                fp4_gemv_batched_multi_input,
+            )
+
+            use_fused_gemv = True
+        except Exception:
+            pass
+
+        # Try Triton MXFP4 MoE kernel (replaces Python for-loop for prefill)
+        try:
+            if is_sm120_supported() and envs.SGLANG_SM120_TRITON_MOE.get():
+                from sglang.srt.layers.moe.fused_moe_triton.mxfp4_moe_sm120_triton import (
+                    mxfp4_moe_forward_triton,
+                )
+                use_mxfp4_triton = True
+        except Exception:
+            pass
+
+        # Path A: Batched FP4 GEMV for decode (small batch, CUDA-graph compatible)
+        # Processes ALL (token, expert) pairs in 2 kernel launches instead of
+        # num_tokens * 2, eliminating Python for-loop overhead.
+        # Threshold 8: covers CUDA graph capture batch sizes up to 8.
+        if use_fused_gemv and num_tokens <= 8:
+            topk = topk_ids.shape[1]  # (num_tokens, topk)
+            total_pairs = num_tokens * topk
+
+            # Flatten all (token, expert) pairs
+            all_expert_ids = topk_ids.reshape(total_pairs)  # (total_pairs,)
+
+            # Expand hidden_states for all (token, expert) pairs
+            # (num_tokens, hidden_size) → (num_tokens, topk, hidden_size) → (total_pairs, hidden_size)
+            x_all = (
+                hidden_states.unsqueeze(1)
+                .expand(-1, topk, -1)
+                .reshape(total_pairs, hidden_size)
+                .contiguous()
+            )
+
+            # Gather all expert weights at once (4 index_select calls instead of num_tokens * 4)
+            w13_all = torch.index_select(w13_weight, 0, all_expert_ids)
+            s13_all = torch.index_select(w13_weight_scale_inv, 0, all_expert_ids)
+            w2_all = torch.index_select(w2_weight, 0, all_expert_ids)
+            s2_all = torch.index_select(w2_weight_scale_inv, 0, all_expert_ids)
+
+            # Batched w13 GEMV: all (token, expert) pairs at once
+            gate_up_all = fp4_gemv_batched_multi_input(x_all, w13_all, s13_all)
+
+            # SwiGLU
+            gate = gate_up_all[:, :intermediate_half]
+            up = gate_up_all[:, intermediate_half:]
+            hidden_all = torch.nn.functional.silu(gate) * up
+
+            # Batched w2 GEMV
+            expert_outputs = fp4_gemv_batched_multi_input(
+                hidden_all.contiguous(), w2_all, s2_all
+            )
+
+            # Weighted sum: reshape and vectorized reduction
+            expert_outputs = expert_outputs.view(num_tokens, topk, hidden_size)
+            weights = topk_weights.unsqueeze(-1).to(
+                expert_outputs.dtype
+            )  # (num_tokens, topk, 1)
+            output = (expert_outputs * weights).sum(dim=1)  # (num_tokens, hidden_size)
+
+            return StandardCombineInput(hidden_states=output)
+
+        # Path B1: Triton fused MXFP4 MoE (graph-safe, no Python loop)
+        if use_mxfp4_triton:
+            output = mxfp4_moe_forward_triton(
+                hidden_states=hidden_states,
+                w13_packed=w13_weight,
+                w2_packed=w2_weight,
+                w13_scale=w13_weight_scale_inv,
+                w2_scale=w2_weight_scale_inv,
+                topk_ids=topk_ids,
+                topk_weights=topk_weights,
+                hidden_size=hidden_size,
+                intermediate_size=intermediate_half,
+            )
+            return StandardCombineInput(hidden_states=output)
+
+        # Path B2: Dequant + BF16 matmul for prefill (large batch, no CUDA graph)
+        # This is faster for large batches but NOT CUDA-graph compatible.
+        output = torch.zeros(
+            num_tokens,
+            hidden_size,
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+
+        for t in range(num_tokens):
+            expert_ids_t = topk_ids[t]  # (topk,) — CPU sync OK (not in CUDA graph)
+            topk = expert_ids_t.shape[0]
+            x = hidden_states[t]  # (hidden_size,) bf16
+            weights = topk_weights[t]  # (topk,)
+
+            # Gather and dequantize selected experts' FP4 weights to BF16
+            w13_bf16 = Fp8MoEMethod._dequant_fp4_batch(
+                w13_weight[expert_ids_t],
+                w13_weight_scale_inv[expert_ids_t],
+            )  # (topk, N, hidden_size)
+            w2_bf16 = Fp8MoEMethod._dequant_fp4_batch(
+                w2_weight[expert_ids_t],
+                w2_weight_scale_inv[expert_ids_t],
+            )  # (topk, hidden_size, intermediate_half)
+
+            # Batched matrix-vector: gate_up = x @ w13.T per expert
+            # einsum('k,enk->en') computes sum_k x[k]*w[e,n,k] = (w @ x)[n] per expert
+            gate_up_all = torch.einsum(
+                "k,enk->en",
+                x.to(torch.float32),
+                w13_bf16.to(torch.float32),
+            ).to(torch.bfloat16)
+
+            # SwiGLU
+            gate = gate_up_all[:, :intermediate_half]
+            up = gate_up_all[:, intermediate_half:]
+            hidden_all = (
+                torch.nn.functional.silu(gate) * up
+            )  # (topk, intermediate_half)
+
+            # Batched matrix-vector: out = hidden_all @ w2.T per expert
+            expert_outputs = torch.einsum(
+                "ek,emk->em",
+                hidden_all.to(torch.float32),
+                w2_bf16.to(torch.float32),
+            ).to(
+                torch.bfloat16
+            )  # (topk, hidden_size)
+
+            # Weighted sum across topk
+            for k in range(topk):
+                output[t] = output[t] + expert_outputs[k] * weights[k].to(
+                    expert_outputs.dtype
+                )
+
+        return StandardCombineInput(hidden_states=output)
+
     def process_weights_after_loading_block_quant(self, layer: Module) -> None:
         # If ROCm, normalize the weights and scales to e4m3fnuz
         if _is_fp8_fnuz:
@@ -1306,6 +1548,21 @@ def apply(
             )
             return StandardCombineInput(hidden_states=output)
 
+        # SM120 fallback: FP4 experts with Triton runner (DeepGEMM not available)
+        # Use pure PyTorch MoE with on-the-fly FP4 dequantization to avoid
+        # the memory cost of converting all FP4 weights to FP8 (~65 GB extra).
+        if self.is_fp4_expert and self.runner.runner_backend.is_triton():
+            topk_weights, topk_ids, _ = dispatch_output.topk_output
+            return self._apply_fp4_moe_torch(
+                hidden_states=x,
+                w13_weight=layer.w13_weight,
+                w2_weight=layer.w2_weight,
+                w13_weight_scale_inv=layer.w13_weight_scale_inv,
+                w2_weight_scale_inv=layer.w2_weight_scale_inv,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+            )
+
         if self.runner.runner_backend.is_deep_gemm():
 
             w13_weight = layer.w13_weight
diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py
index be642033fe46..1930f3523f75 100644
--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -542,8 +542,29 @@ def _capture_graph(self, graph, pool, stream, run_once_fn):
             if memory_saver_adapter.enabled
             else self.device_module.graph
         )
-        with graph_fn(cuda_graph=graph, pool=pool, stream=stream):
-            out = run_once_fn()
+
+        # SM120 debug: enable sync debug mode to catch implicit syncs during capture
+        import os as _os
+
+        if _os.environ.get("SGLANG_DEBUG_CUDA_GRAPH_CAPTURE") == "1":
+            self.device_module.set_sync_debug_mode("error")
+            logger.info("CUDA sync debug mode enabled for graph capture")
+
+        try:
+            with graph_fn(cuda_graph=graph, pool=pool, stream=stream):
+                out = run_once_fn()
+        except Exception as e:
+            if _os.environ.get("SGLANG_DEBUG_CUDA_GRAPH_CAPTURE") == "1":
+                self.device_module.set_sync_debug_mode("default")
+            logger.error(f"Graph capture FAILED: {type(e).__name__}: {e}")
+            import traceback
+
+            logger.error(traceback.format_exc())
+            raise
+        finally:
+            if _os.environ.get("SGLANG_DEBUG_CUDA_GRAPH_CAPTURE") == "1":
+                self.device_module.set_sync_debug_mode("default")
+
         return out
 
     def _create_device_graph(self):
@@ -734,6 +755,11 @@ def run_once():
             run_once()
             attn_backend.on_after_cuda_graph_warmup_pass()
 
+        # Sync after warmup to catch any async CUDA errors before capture
+        self.device_module.synchronize()
+        self.model_runner.tp_group.barrier()
+        logger.info(f"Warmup completed for bs={bs}, starting capture")
+
         if get_global_graph_memory_pool() is None:
             set_global_graph_memory_pool(self.device_module.graph_pool_handle())
         # Set graph pool id globally to be able to use symmetric memory
diff --git a/python/sglang/srt/models/deepseek_v4.py b/python/sglang/srt/models/deepseek_v4.py
index a71b854352c0..182bbc75157c 100644
--- a/python/sglang/srt/models/deepseek_v4.py
+++ b/python/sglang/srt/models/deepseek_v4.py
@@ -984,22 +984,83 @@ def _is_layer_sparse(self, layer_id: int, is_nextn: bool) -> bool:
             and layer_id % moe_layer_freq == 0
         )
 
-    def hc_pre(
+    def _hc_pre_decode_pure_torch(
         self,
         x: torch.Tensor,
         hc_fn: torch.Tensor,
         hc_scale: torch.Tensor,
         hc_base: torch.Tensor,
     ):
-        @maybe_torch_compile
-        def hc_pre_torch_impl(x, hc_fn):
-            x_flat = x.flatten(1).float()
-            rsqrt = torch.rsqrt(
-                x_flat.square().mean(-1, keepdim=True) + self.rms_norm_eps
-            )
-            mixes = (F.linear(x_flat, hc_fn) * rsqrt).unsqueeze(1)
-            return x_flat, mixes
+        """Decode-optimized hc_pre: pure PyTorch, GPU-only (CUDA graph compatible).
+
+        For decode (small batch), replaces tilelang 32-way split-K (~34 kernel
+        launches) with ~4 PyTorch kernel launches. All ops stay on GPU to
+        support CUDA graph capture.
+        """
+        shape, dtype = x.size(), x.dtype
+        hc_mult = self.hc_mult
+        hidden_size = shape[-1]
+        hc_mult3 = (2 + hc_mult) * hc_mult  # 24
+
+        # Flatten: (tokens, hc_mult, hidden_size) → (tokens, hc_mult * hidden_size)
+        x_flat = x.flatten(1)  # bf16
+
+        # GEMV + sqrsum (2 kernel launches vs 34 for tilelang split-K)
+        x_fp32 = x_flat.float()
+        sqrsum = x_fp32.square().sum(-1, keepdim=True)  # (tokens, 1)
+        linear_out = F.linear(x_fp32, hc_fn)  # (tokens, 24) fp32
 
+        # RMS normalize
+        rsqrt = torch.rsqrt(sqrsum / (hc_mult * hidden_size) + self.rms_norm_eps)
+        mixes_sq = linear_out * rsqrt  # (tokens, 24) fp32, stays on GPU
+
+        # All subsequent ops on GPU (CUDA graph compatible)
+
+        # Extract pre (sigmoid + eps)
+        pre_raw = mixes_sq[:, :hc_mult] * hc_scale[0] + hc_base[:hc_mult]
+        pre_out = torch.sigmoid(pre_raw) + self.hc_eps  # (tokens, hc_mult)
+
+        # Extract post (2 * sigmoid)
+        post_raw = (
+            mixes_sq[:, hc_mult : 2 * hc_mult] * hc_scale[1]
+            + hc_base[hc_mult : 2 * hc_mult]
+        )
+        post_out = 2.0 * torch.sigmoid(post_raw)  # (tokens, hc_mult)
+
+        # Extract comb + Sinkhorn normalize (on GPU, tiny 4x4 matrix)
+        comb_base_mat = hc_base[2 * hc_mult :].view(hc_mult, hc_mult)
+        comb_raw = (
+            mixes_sq[:, 2 * hc_mult :].view(-1, hc_mult, hc_mult) * hc_scale[2]
+            + comb_base_mat
+        )
+        comb_frag = torch.exp(comb_raw - comb_raw.max(dim=-1, keepdim=True).values)
+        row_sum = comb_frag.sum(dim=-1, keepdim=True)
+        comb_frag = comb_frag / row_sum + self.hc_eps
+        col_sum = comb_frag.sum(dim=-2, keepdim=True)
+        comb_frag = comb_frag / (col_sum + self.hc_eps)
+
+        for _ in range(self.hc_sinkhorn_iters - 1):
+            row_sum = comb_frag.sum(dim=-1, keepdim=True)
+            comb_frag = comb_frag / (row_sum + self.hc_eps)
+            col_sum = comb_frag.sum(dim=-2, keepdim=True)
+            comb_frag = comb_frag / (col_sum + self.hc_eps)
+
+        # Convert to bf16 directly on GPU
+        pre = pre_out.to(torch.bfloat16)
+        post = post_out.to(torch.bfloat16)
+        comb = comb_frag.to(torch.bfloat16)
+
+        # Output: weighted sum of residual heads (bf16 multiply)
+        y = (pre.unsqueeze(-1) * x_flat.view(shape)).sum(dim=1)
+        return y, post, comb
+
+    def hc_pre(
+        self,
+        x: torch.Tensor,
+        hc_fn: torch.Tensor,
+        hc_scale: torch.Tensor,
+        hc_base: torch.Tensor,
+    ):
         shape, dtype = x.size(), x.dtype
 
         if x.shape[0] == 0:
@@ -1010,6 +1071,12 @@ def hc_pre_torch_impl(x, hc_fn):
             )
             return y, post, comb
 
+        # Decode optimization: bypass tilelang split-K for small batches.
+        # Tilelang 32-way split-K produces ~34 kernel launches per hc_pre call;
+        # pure PyTorch + CPU Sinkhorn uses ~4 GPU kernels + fast CPU compute.
+        if x.shape[0] <= 8:
+            return self._hc_pre_decode_pure_torch(x, hc_fn, hc_scale, hc_base)
+
         if envs.SGLANG_OPT_USE_TILELANG_MHC_PRE.get():
             from sglang.srt.layers.mhc import mhc_pre
 
@@ -1027,21 +1094,39 @@ def hc_pre_torch_impl(x, hc_fn):
             return y, post.squeeze(-1), comb
 
         if envs.SGLANG_OPT_DEEPGEMM_HC_PRENORM.get():
-            import deep_gemm
-
-            x_flat = x.flatten(1).bfloat16()
-
-            m, k = x_flat.shape
-            mix_hc = hc_fn.size(0)
-            d_out = torch.empty((m, mix_hc), dtype=torch.float, device=x.device)
-            s_out = torch.empty((m,), dtype=torch.float, device=x.device)
-            deep_gemm.tf32_hc_prenorm_gemm(
-                x_flat, hc_fn.float().contiguous(), d_out, s_out, num_splits=None
-            )
-            rsqrt = torch.rsqrt(s_out / k + self.rms_norm_eps)
-            mixes = (d_out * rsqrt.unsqueeze(1)).unsqueeze(1)
+            try:
+                import deep_gemm
+
+                x_flat = x.flatten(1).bfloat16()
+
+                m, k = x_flat.shape
+                mix_hc = hc_fn.size(0)
+                d_out = torch.empty((m, mix_hc), dtype=torch.float, device=x.device)
+                s_out = torch.empty((m,), dtype=torch.float, device=x.device)
+                deep_gemm.tf32_hc_prenorm_gemm(
+                    x_flat,
+                    hc_fn.float().contiguous(),
+                    d_out,
+                    s_out,
+                    num_splits=None,
+                )
+                rsqrt = torch.rsqrt(s_out / k + self.rms_norm_eps)
+                mixes = (d_out * rsqrt.unsqueeze(1)).unsqueeze(1)
+            except (RuntimeError, ImportError, AttributeError):
+                # DeepGEMM not available or unsupported arch (e.g. SM120)
+                x_flat = x.flatten(1).float()
+                sq = x_flat.square()
+                mean = sq.mean(-1, keepdim=True)
+                rsqrt = torch.rsqrt(mean + self.rms_norm_eps)
+                linear_out = F.linear(x_flat, hc_fn)
+                mixes = (linear_out * rsqrt).unsqueeze(1)
         else:
-            x_flat, mixes = hc_pre_torch_impl(x, hc_fn)
+            x_flat = x.flatten(1).float()
+            sq = x_flat.square()
+            mean = sq.mean(-1, keepdim=True)
+            rsqrt = torch.rsqrt(mean + self.rms_norm_eps)
+            linear_out = F.linear(x_flat, hc_fn)
+            mixes = (linear_out * rsqrt).unsqueeze(1)
 
         from sglang.srt.layers.mhc import hc_split_sinkhorn
 
@@ -1053,6 +1138,7 @@ def hc_pre_torch_impl(x, hc_fn):
             self.hc_sinkhorn_iters,
             self.hc_eps,
         )
+
         y = (pre.squeeze(1).unsqueeze(-1) * x_flat.view(shape)).sum(dim=1)
         return y.to(dtype), post.squeeze(1), comb.squeeze(1)
 
@@ -1080,10 +1166,12 @@ def hc_post(
 
         @maybe_torch_compile
         def hc_post_torch_impl(x, residual, post, comb):
-            return (
-                post.unsqueeze(-1) * x.unsqueeze(1)
-                + (comb.unsqueeze(-1) * residual.unsqueeze(2)).sum(dim=1)
-            ).type_as(x)
+            # Ensure type consistency: einsum requires uniform dtypes
+            p = post.type_as(x)
+            c = comb.type_as(residual)
+            return torch.einsum("bi,bd->bid", p, x) + torch.einsum(
+                "bij,bjd->bid", c, residual
+            )
 
         return hc_post_torch_impl(x, residual, post, comb)
 
@@ -1638,7 +1726,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=Fal
                 if envs.SGLANG_DSV4_FP4_EXPERTS.get():
                     weights = _dequant_fp8_wo_a(weights)
                 else:
-                    weights = ((n, t) for n, t in weights if not n.endswith(".wo_a.scale"))
+                    weights = (
+                        (n, t) for n, t in weights if not n.endswith(".wo_a.scale")
+                    )
                 # ------------------------------------------------------------------------
 
         stacked_params_mapping = [
diff --git a/sgl-kernel/csrc/elementwise/topk.cu b/sgl-kernel/csrc/elementwise/topk.cu
index 8bd0f3dcf845..0074b05c2a2d 100644
--- a/sgl-kernel/csrc/elementwise/topk.cu
+++ b/sgl-kernel/csrc/elementwise/topk.cu
@@ -32,7 +32,13 @@ constexpr size_t kSmem = static_cast<size_t>(SGL_TOPK_DYNAMIC_SMEM_BYTES);
 constexpr size_t kSmem = 48 * 1024;  // bytes
 #endif
 #else
-constexpr size_t kSmem = 32 * 1024 * sizeof(uint32_t);  // 128KB (bytes)
+// 80 KB dynamic shared memory — safe for ALL CUDA architectures:
+//   SM90/SM100 (Hopper/Blackwell datacenter): 228 KB/block, 80 KB works (was 128 KB).
+//   SM120 (consumer Blackwell): 99 KB/block — 128 KB would overflow, 80 KB safe.
+//   SM80/SM86/SM89 (Ampere/Ada): 99 KB/block — 128 KB would overflow, 80 KB safe.
+// TopK=2048 requires only 8 KB per ping-pong buffer; 80 KB provides 20 KB per
+// buffer (2.5x headroom beyond 128 KB's 32 KB). Negligible perf difference.
+constexpr size_t kSmem = 20 * 1024 * sizeof(uint32_t);  // 80KB (bytes)
 #endif
 
 struct FastTopKParams {