From e32b47cfe6ab71ea899ec6cbd5034d4677194fbf Mon Sep 17 00:00:00 2001 From: Yangmin Li Date: Wed, 29 Oct 2025 15:53:08 -0700 Subject: [PATCH 01/10] fuse kernel initial code --- benchmark/kernels/bench_flashmla_fused_kv.py | 104 +++ mla_fusion_standalone/pyproject.toml | 11 + mla_fusion_standalone/setup.py | 69 ++ mla_fusion_standalone/test_import.py | 43 ++ .../layers/attention/trtllm_mla_backend.py | 230 ++++-- sgl-kernel/CMakeLists.txt | 1 + sgl-kernel/build_minimal.sh | 25 + sgl-kernel/csrc/common_extension.cc | 12 + .../csrc/elementwise/mla_rope_fp8_kv_fused.cu | 678 ++++++++++++++++++ sgl-kernel/include/sgl_kernel_ops.h | 14 + test/srt/test_mla_fp8_fused_kv.py | 79 ++ 11 files changed, 1209 insertions(+), 57 deletions(-) create mode 100644 benchmark/kernels/bench_flashmla_fused_kv.py create mode 100644 mla_fusion_standalone/pyproject.toml create mode 100644 mla_fusion_standalone/setup.py create mode 100644 mla_fusion_standalone/test_import.py create mode 100755 sgl-kernel/build_minimal.sh create mode 100644 sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu create mode 100644 test/srt/test_mla_fp8_fused_kv.py diff --git a/benchmark/kernels/bench_flashmla_fused_kv.py b/benchmark/kernels/bench_flashmla_fused_kv.py new file mode 100644 index 00000000000..61e08bf622d --- /dev/null +++ b/benchmark/kernels/bench_flashmla_fused_kv.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- +""" +Microbenchmark: fused vs baseline (emulated) for MLA RoPE + FP8 + KV write. +Uses the sgl_kernel.mla_rope_quantize_fp8_fused extension. +""" +import time +import torch + +_has_sgl_kernel = False +mla_rope_quantize_fp8_fused = None +try: + from mla_fusion_kernel import mla_rope_quantize_fp8_fused + _has_sgl_kernel = True + print("Using standalone mla_fusion_kernel") +except ImportError: + try: + from sgl_kernel import mla_rope_quantize_fp8_fused + _has_sgl_kernel = True + print("Using sgl_kernel.mla_rope_quantize_fp8_fused") + except ImportError: + print("ERROR: Fusion kernel not available. 
Please build mla_fusion_standalone first.") + _has_sgl_kernel = False + +def run_one(nnz=1024, Dn=512, Dr=64, iters=200, warmup=20, device="cuda"): + if not _has_sgl_kernel: + return 0, 0, 0 + + torch.manual_seed(0) + + q_nope = torch.randn(nnz, Dn, device=device, dtype=torch.float16) + q_rope = torch.randn(nnz, Dr, device=device, dtype=torch.float16) + k_nope = torch.randn(nnz, Dn, device=device, dtype=torch.float16) + k_rope = torch.randn(nnz, Dr, device=device, dtype=torch.float16) + + max_seq = max(2048, nnz) + t = torch.linspace(0, 1, steps=max_seq, device=device, dtype=torch.float32)[:, None] + idx = torch.arange(Dr, device=device, dtype=torch.float32)[None, :] + freqs = 0.1 * (idx + 1.0) + cos = torch.cos(t * freqs) + sin = torch.sin(t * freqs) + cos_sin = torch.cat([cos, sin], dim=1) + pos_ids = torch.randint(low=0, high=max_seq, size=(nnz,), device=device, dtype=torch.long) + + slots = nnz + 8 + loc = torch.arange(nnz, device=device, dtype=torch.long) + + q_out = torch.empty(nnz, Dn + Dr, device=device, dtype=torch.uint8) + k_nope_out = torch.empty(nnz, Dn, device=device, dtype=torch.uint8) + k_rope_out = torch.empty(nnz, Dr, device=device, dtype=torch.uint8) + kv_base = torch.zeros(slots, 1, Dn + Dr, device=device, dtype=torch.uint8) + + # baselines + def baseline(): + mla_rope_quantize_fp8_fused(q_nope, q_rope, k_nope, k_rope, cos_sin, pos_ids, False, + q_out, k_nope_out, k_rope_out, None, None) + kv_base.zero_() + kv_base[loc, 0, :Dn] = k_nope_out + kv_base[loc, 0, Dn:] = k_rope_out + + kv_fused = torch.zeros_like(kv_base) + + def fused(): + mla_rope_quantize_fp8_fused(q_nope, q_rope, k_nope, k_rope, cos_sin, pos_ids, False, + q_out, None, None, kv_fused, loc) + + # warmup + for _ in range(warmup): + baseline() + torch.cuda.synchronize() + t0 = time.time() + for _ in range(iters): + baseline() + torch.cuda.synchronize() + t1 = time.time() + baseline_ms = (t1 - t0) * 1000.0 / iters + + for _ in range(warmup): + fused() + torch.cuda.synchronize() + t0 = time.time() + for _ in range(iters): + fused() + torch.cuda.synchronize() + t1 = time.time() + fused_ms = (t1 - t0) * 1000.0 / iters + + return baseline_ms, fused_ms, baseline_ms / fused_ms + +if __name__ == "__main__": + if not _has_sgl_kernel: + print("Benchmark skipped: sgl_kernel not available") + exit(1) + + print("MLA RoPE + FP8 Quantization + KV Cache Write Fusion Benchmark") + print("=" * 70) + print("Config: Dn=512, Dr=64, iters=1000, warmup=100") + print("=" * 70) + + # Test larger batch sizes and more iterations for stable measurements + for nnz in [1024, 4096, 8192, 16384, 32768]: + b, f, s = run_one(nnz=nnz, iters=1000, warmup=100) + if b > 0: + speedup_pct = (s - 1.0) * 100 + print(f"nnz={nnz:5d} | baseline={b:7.3f} ms | fused={f:7.3f} ms | speedup x{s:4.2f} ({speedup_pct:+5.1f}%)") diff --git a/mla_fusion_standalone/pyproject.toml b/mla_fusion_standalone/pyproject.toml new file mode 100644 index 00000000000..bbc8d622a9e --- /dev/null +++ b/mla_fusion_standalone/pyproject.toml @@ -0,0 +1,11 @@ +[build-system] +requires = ["setuptools", "wheel", "torch"] +build-backend = "setuptools.build_meta" + +[project] +name = "mla-fusion-kernel" +version = "0.1.0" +description = "Standalone MLA RoPE + FP8 Fusion Kernel" +requires-python = ">=3.8" +dependencies = ["torch"] + diff --git a/mla_fusion_standalone/setup.py b/mla_fusion_standalone/setup.py new file mode 100644 index 00000000000..082e37a8f9c --- /dev/null +++ b/mla_fusion_standalone/setup.py @@ -0,0 +1,69 @@ +""" +Standalone build for MLA RoPE FP8 Fusion kernel +""" 
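+# Typical standalone build (assumes torch and a matching CUDA toolkit are already installed):
+#   pip install -e . --no-build-isolation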
+from setuptools import setup +import os +import sys + +# Delay torch import until build time +def get_cuda_arch(): + try: + import torch + cuda_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) + if cuda_arch_list is None: + # Auto-detect + if torch.cuda.is_available(): + capability = torch.cuda.get_device_capability() + cuda_arch_list = f"{capability[0]}.{capability[1]}" + else: + # Default to common architectures + cuda_arch_list = "8.0;9.0;10.0" + print(f"Building for CUDA architectures: {cuda_arch_list}") + return cuda_arch_list + except Exception as e: + print(f"Warning: Could not detect CUDA arch, using defaults: {e}") + return "8.0;9.0;10.0" + +def get_extensions(): + from torch.utils.cpp_extension import BuildExtension, CUDAExtension + + cuda_arch_list = get_cuda_arch() + + return [ + CUDAExtension( + name='mla_fusion_kernel', + sources=[ + '../sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu', + ], + include_dirs=[ + '../sgl-kernel/include', + ], + extra_compile_args={ + 'cxx': ['-O3', '-std=c++17'], + 'nvcc': [ + '-O3', + '--use_fast_math', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '-U__CUDA_NO_BFLOAT16_CONVERSIONS__', + '-U__CUDA_NO_HALF2_OPERATORS__', + '--expt-relaxed-constexpr', + '--expt-extended-lambda', + ] + [f'-gencode=arch=compute_{arch.replace(".", "")},code=sm_{arch.replace(".", "")}' + for arch in cuda_arch_list.split(';')], + }, + ) + ] + +if __name__ == '__main__': + from torch.utils.cpp_extension import BuildExtension + + setup( + name='mla_fusion_kernel', + ext_modules=get_extensions(), + cmdclass={ + 'build_ext': BuildExtension + }, + python_requires='>=3.8', + ) + diff --git a/mla_fusion_standalone/test_import.py b/mla_fusion_standalone/test_import.py new file mode 100644 index 00000000000..90ee849d2b5 --- /dev/null +++ b/mla_fusion_standalone/test_import.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +""" +Test script to verify the fusion kernel works +""" +import torch +import mla_fusion_kernel + +print("✅ Module imported successfully!") +print(f"Available functions: {dir(mla_fusion_kernel)}") + +# Test basic call +device = "cuda" if torch.cuda.is_available() else "cpu" +if device == "cuda": + print(f"\n✅ CUDA is available") + print(f"Device: {torch.cuda.get_device_name(0)}") + + # Create dummy inputs + nnz, Dn, Dr = 4, 512, 64 + q_nope = torch.randn(nnz, Dn, device=device, dtype=torch.float16) + q_rope = torch.randn(nnz, Dr, device=device, dtype=torch.float16) + k_nope = torch.randn(nnz, Dn, device=device, dtype=torch.float16) + k_rope = torch.randn(nnz, Dr, device=device, dtype=torch.float16) + + cos_sin = torch.randn(2048, Dr*2, device=device, dtype=torch.float32) + pos_ids = torch.arange(nnz, device=device, dtype=torch.int64) + + q_out = torch.empty(nnz, Dn+Dr, device=device, dtype=torch.uint8) + k_nope_out = torch.empty(nnz, Dn, device=device, dtype=torch.uint8) + k_rope_out = torch.empty(nnz, Dr, device=device, dtype=torch.uint8) + + try: + mla_fusion_kernel.mla_rope_quantize_fp8_fused( + q_nope, q_rope, k_nope, k_rope, + cos_sin, pos_ids, False, + q_out, k_nope_out, k_rope_out, + None, None + ) + print("✅ Kernel executed successfully!") + except Exception as e: + print(f"❌ Kernel execution failed: {e}") +else: + print("⚠️ CUDA not available, skipping kernel test") + diff --git a/python/sglang/srt/layers/attention/trtllm_mla_backend.py b/python/sglang/srt/layers/attention/trtllm_mla_backend.py index 65ae9c4dc6e..619d0e0fd5b 100755 --- a/python/sglang/srt/layers/attention/trtllm_mla_backend.py +++ 
b/python/sglang/srt/layers/attention/trtllm_mla_backend.py @@ -38,6 +38,15 @@ if _is_cuda: from sgl_kernel import concat_mla_absorb_q + try: + # Try standalone build first + from mla_fusion_kernel import mla_rope_quantize_fp8_fused + except ImportError: + # Fallback to sgl_kernel + try: + from sgl_kernel import mla_rope_quantize_fp8_fused + except ImportError: + mla_rope_quantize_fp8_fused = None # Will use non-fused path # Constants DEFAULT_WORKSPACE_SIZE_MB = 128 # Memory workspace size in MB @@ -634,12 +643,18 @@ def quantize_and_rope_for_fp8( forward_batch: ForwardBatch, cos_sin_cache: torch.Tensor, is_neox: bool, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + *, + layer: Optional["RadixAttention"] = None, + save_kv_cache: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: """Quantize and apply RoPE for FP8 attention path. This function handles the FP8 quantization and RoPE application for MLA attention. It takes separate query/key nope and rope components, applies RoPE to the rope parts, quantizes all components to FP8, and merges the query components into a single tensor. + + When layer and save_kv_cache are provided, it uses the fused kernel to directly write + K into the KV cache, eliminating intermediate global memory operations. Args: q_nope: Query no-position-encoding component [seq_len, num_heads, kv_lora_rank] @@ -654,12 +669,14 @@ def quantize_and_rope_for_fp8( cos_sin_cache: Precomputed cosine/sine cache for RoPE - expected dtype: matches q_/k_ input dtype (torch.bfloat16) is_neox: Whether to use NeoX-style RoPE (interleaved) or GPT-style (half rotation) + layer: Optional RadixAttention layer for fused KV cache write + save_kv_cache: If True and layer is provided, use fused kernel to directly write KV cache Returns: tuple: (merged_q_out, k_nope_out, k_rope_out) quantized to FP8 - merged_q_out: [seq_len, num_heads, kv_lora_rank + qk_rope_head_dim], dtype=torch.float8_e4m3fn - - k_nope_out: [seq_len, num_heads, kv_lora_rank], dtype=torch.float8_e4m3fn - - k_rope_out: [seq_len, num_heads, qk_rope_head_dim], dtype=torch.float8_e4m3fn + - k_nope_out: [seq_len, num_heads, kv_lora_rank], dtype=torch.float8_e4m3fn or None (if fused) + - k_rope_out: [seq_len, num_heads, qk_rope_head_dim], dtype=torch.float8_e4m3fn or None (if fused) """ attn_dtype = torch.float8_e4m3fn q_len, num_heads = q_rope.shape[0], q_rope.shape[1] @@ -673,35 +690,98 @@ def quantize_and_rope_for_fp8( dtype=attn_dtype, ) - # Key outputs maintain original shapes but with FP8 dtype - k_rope_out = k_rope.new_empty(k_rope.shape, dtype=attn_dtype) - k_nope_out = k_nope.new_empty(k_nope.shape, dtype=attn_dtype) - - # Apply RoPE and quantize all components in a single fused kernel call - # This kernel handles: - # 1. RoPE application to q_rope and k_rope using cos_sin_cache and positions - # 2. Quantization of all components to FP8 format - # 3. 
Output placement into pre-allocated tensors - flashinfer.rope.mla_rope_quantize_fp8( - q_rope=q_rope, - k_rope=k_rope, - q_nope=q_nope, - k_nope=k_nope, - cos_sin_cache=cos_sin_cache, - pos_ids=forward_batch.positions, - is_neox=is_neox, - quantize_dtype=attn_dtype, - # Output tensor slicing: q_out contains [nope_part, rope_part] - q_rope_out=q_out[..., self.kv_lora_rank :], # RoPE part goes to end - k_rope_out=k_rope_out, - q_nope_out=q_out[..., : self.kv_lora_rank], # Nope part goes to beginning - k_nope_out=k_nope_out, - # Quantization scales (set to 1.0 for no additional scaling) - quant_scale_q=1.0, - quant_scale_kv=1.0, + # Determine if we can use the fused kernel (RoPE + quantization + direct KV write) + # Conditions: save_kv_cache, layer provided, not NSA packed format, kernel available + kernel_available = mla_rope_quantize_fp8_fused is not None + has_layer = layer is not None + nsa_flag = getattr(forward_batch.token_to_kv_pool, "nsa_kv_cache_store_fp8", False) + + # Enable fused KV write for performance optimization + use_fused_kv_write = ( + kernel_available + and save_kv_cache + and has_layer + and not nsa_flag ) + + # DEBUG: Basic branch tracking + import sys + if use_fused_kv_write: + sys.stderr.write(f"[MLA FUSION] ✅ Using fused kernel (layer={layer.layer_id}, tokens={q_len})\n") + elif not kernel_available: + sys.stderr.write(f"[MLA FUSION] ❌ Kernel not available, using standard path\n") + sys.stderr.flush() + + if use_fused_kv_write: + # Fused path: RoPE + quantization + direct KV cache write + # This eliminates the intermediate write/read of k_nope_out/k_rope_out + + # CRITICAL: Reshape KV buffer to [num_blocks, page_size, kv_dim] layout + # get_key_buffer() returns flattened [num_blocks*page_size, 1, kv_dim] + # We need to reshape it to proper 3D layout for correct page-internal indexing + k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id) + kv_buffer = k_cache.view(-1, self.page_size, self.kv_cache_dim) + + # Note: Keep as FP8 tensor, C++ will do reinterpret_cast to uint8* + # Don't use .view(torch.uint8) which is wrong API usage + + # Dtype assertion for safety + assert q_nope.dtype in (torch.float16, torch.bfloat16), f"Unsupported dtype: {q_nope.dtype}" + assert k_nope.dtype == q_nope.dtype, "Q and K dtype mismatch" + + # Call fused kernel + # Note: Kernel now natively supports both FP16 and BF16 via C++ templates + # Note: kernel accepts 2D or 3D Q inputs (handles stride properly) + # Note: Pass FP8 tensors as-is, C++ will reinterpret_cast to uint8* + # K inputs must be 2D + mla_rope_quantize_fp8_fused( + q_nope, # [nnz, num_heads, Dn] or [nnz, Dn] - kernel handles both, FP16/BF16 + q_rope, # [nnz, num_heads, Dr] or [nnz, Dr] - kernel handles both, FP16/BF16 + k_nope, # [nnz, Dn] - already 2D from squeeze(1), FP16/BF16 + k_rope, # [nnz, Dr] - already 2D from squeeze(1), FP16/BF16 + cos_sin_cache, + forward_batch.positions, + is_neox, + q_out, # FP8 tensor, C++ will reinterpret as uint8* + None, # k_nope_out - skip intermediate output + None, # k_rope_out - skip intermediate output + kv_buffer, # FP8 tensor, direct write to KV cache + forward_batch.out_cache_loc, + ) + + # Return Q output and None for K outputs (already written to cache) + return q_out, None, None + else: + # Standard path: RoPE + quantization only (backward compatible) + # Key outputs maintain original shapes but with FP8 dtype + k_rope_out = k_rope.new_empty(k_rope.shape, dtype=attn_dtype) + k_nope_out = k_nope.new_empty(k_nope.shape, dtype=attn_dtype) + + # Apply RoPE and 
quantize all components in a single kernel call + # This kernel handles: + # 1. RoPE application to q_rope and k_rope using cos_sin_cache and positions + # 2. Quantization of all components to FP8 format + # 3. Output placement into pre-allocated tensors + flashinfer.rope.mla_rope_quantize_fp8( + q_rope=q_rope, + k_rope=k_rope, + q_nope=q_nope, + k_nope=k_nope, + cos_sin_cache=cos_sin_cache, + pos_ids=forward_batch.positions, + is_neox=is_neox, + quantize_dtype=attn_dtype, + # Output tensor slicing: q_out contains [nope_part, rope_part] + q_rope_out=q_out[..., self.kv_lora_rank :], # RoPE part goes to end + k_rope_out=k_rope_out, + q_nope_out=q_out[..., : self.kv_lora_rank], # Nope part goes to beginning + k_nope_out=k_nope_out, + # Quantization scales (set to 1.0 for no additional scaling) + quant_scale_q=1.0, + quant_scale_kv=1.0, + ) - return q_out, k_nope_out, k_rope_out + return q_out, k_nope_out, k_rope_out def pad_draft_extend_query( self, @@ -798,12 +878,16 @@ def forward_decode( ) -> torch.Tensor: """Run forward for decode using TRTLLM MLA kernel.""" merge_query = q_rope is not None + fused_kv = False # Track if using fused KV write path if self.data_type == torch.float8_e4m3fn: # For FP8 path, we quantize the query and rope parts and merge them into a single tensor # Note: rope application in deepseek_v2.py:forward_absorb_prepare is skipped for FP8 decode path of this trtllm_mla backend assert all( x is not None for x in [q_rope, k_rope, cos_sin_cache] ), "For FP8 path and using flashinfer.rope.mla_rope_quantize we need all of q_rope, k_rope and cos_sin_cache to be not None." + + # Call quantize_and_rope_for_fp8 with layer and save_kv_cache parameters + # This enables the fused kernel path when conditions are met q, k, k_rope = self.quantize_and_rope_for_fp8( q, q_rope, @@ -812,14 +896,20 @@ def forward_decode( forward_batch, cos_sin_cache, is_neox, + layer=layer, + save_kv_cache=save_kv_cache, ) merge_query = False - - # Save KV cache if requested - if save_kv_cache: - assert ( - k is not None and k_rope is not None - ), "For populating trtllm_mla kv cache, both k_nope and k_rope should be not None." + # Check if fused path was used (K outputs are None when directly written to KV cache) + fused_kv = (k is None and k_rope is None) + if fused_kv: + import sys + sys.stderr.write(f"[DECODE] ✅ Fused path used\n") + sys.stderr.flush() + + # Save KV cache if requested (only if not using fused path) + # When k or k_rope is None, it means the fused kernel already wrote to KV cache + if save_kv_cache and not fused_kv and k is not None and k_rope is not None: forward_batch.token_to_kv_pool.set_mla_kv_buffer( layer, forward_batch.out_cache_loc, k, k_rope ) @@ -899,30 +989,54 @@ def forward_extend( ) -> torch.Tensor: # TODO refactor to avoid code duplication merge_query = q_rope is not None - if ( + fused_kv = False # Track if we used fused KV write path + + # For FP8 path, check if we should use quantize_and_rope_for_fp8 + # Conditions: + # 1. Using FP8 data type + # 2. In target_verify or draft_extend mode + # 3. 
All rope-related parameters are available (not None) + use_fp8_quantize = ( self.data_type == torch.float8_e4m3fn - ) and forward_batch.forward_mode.is_target_verify(): + and ( + forward_batch.forward_mode.is_target_verify() + or forward_batch.forward_mode.is_draft_extend(include_v2=True) + ) + and all(x is not None for x in [q_rope, k_rope, cos_sin_cache]) + ) + + if use_fp8_quantize: # For FP8 path, we quantize the query and rope parts and merge them into a single tensor # Note: rope application in deepseek_v2.py:forward_absorb_prepare is skipped for FP8 decode path of this trtllm_mla backend - assert all( - x is not None for x in [q_rope, k_rope, cos_sin_cache] - ), "For FP8 path and using flashinfer.rope.mla_rope_quantize we need all of q_rope, k_rope and cos_sin_cache to be not None." - q, k, k_rope = self.quantize_and_rope_for_fp8( - q, - q_rope, - k.squeeze(1), - k_rope.squeeze(1), + + # Call quantize_and_rope_for_fp8 with layer and save_kv_cache parameters + # This enables the fused kernel path when conditions are met + # NOTE: Do NOT squeeze Q - fused kernel supports 3D Q: [nnz, num_heads, dim] + q, k_fp8_nope, k_fp8_rope = self.quantize_and_rope_for_fp8( + q, # ✅ Keep Q as is (supports both 2D and 3D) + q_rope, # ✅ Keep Q_rope as is + k.squeeze(1) if k.dim() == 3 else k, # K must be 2D + k_rope.squeeze(1) if k_rope.dim() == 3 else k_rope, # K_rope must be 2D forward_batch, cos_sin_cache, is_neox, + layer=layer, + save_kv_cache=save_kv_cache, ) merge_query = False - - # Save KV cache if requested - if save_kv_cache: - assert ( - k is not None and k_rope is not None - ), "For populating trtllm_mla kv cache, both k_nope and k_rope should be not None." + # Fused path: K already written to KV cache, function returns None for K outputs + fused_kv = (k_fp8_nope is None and k_fp8_rope is None) + if fused_kv: + import sys + sys.stderr.write(f"[EXTEND] ✅ Fused path used\n") + sys.stderr.flush() + # Update local variables for subsequent logic + k = k_fp8_nope + k_rope = k_fp8_rope + + # Save KV cache if requested (only if not using fused path) + # When k or k_rope is None, it means the fused kernel already wrote to KV cache + if save_kv_cache and not fused_kv and k is not None and k_rope is not None: forward_batch.token_to_kv_pool.set_mla_kv_buffer( layer, forward_batch.out_cache_loc, k, k_rope ) @@ -1018,11 +1132,13 @@ def forward_extend( output = raw_out.view(-1, layer.tp_q_head_num * layer.v_head_dim) return output - if k_rope is not None: - k = torch.cat([k, k_rope], dim=-1) - k = k.view(-1, layer.tp_k_head_num, layer.head_dim) - - v = v.view(-1, layer.tp_k_head_num, layer.v_head_dim) + # Only prepare k/v tensors if NOT using fused path (for MHA prefill) + # In fused path, K is already in KV cache and we use decode kernel + if not fused_kv: + if k_rope is not None: + k = torch.cat([k, k_rope], dim=-1) + k = k.view(-1, layer.tp_k_head_num, layer.head_dim) + v = v.view(-1, layer.tp_k_head_num, layer.v_head_dim) if forward_batch.attn_attend_prefix_cache: # MHA for chunked prefix kv cache when running model with MLA diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt index 0c8bb02a486..553b849e9f2 100644 --- a/sgl-kernel/CMakeLists.txt +++ b/sgl-kernel/CMakeLists.txt @@ -287,6 +287,7 @@ set(SOURCES "csrc/elementwise/concat_mla.cu" "csrc/elementwise/copy.cu" "csrc/elementwise/fused_add_rms_norm_kernel.cu" + "csrc/elementwise/mla_rope_fp8_kv_fused.cu" "csrc/elementwise/rope.cu" "csrc/elementwise/topk.cu" "csrc/expert_specialization/es_fp8_blockwise.cu" diff --git 
a/sgl-kernel/build_minimal.sh b/sgl-kernel/build_minimal.sh
new file mode 100755
index 00000000000..d5805f5371a
--- /dev/null
+++ b/sgl-kernel/build_minimal.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Minimal build script for sgl-kernel (only compile needed architectures)
+
+# Clean previous builds
+rm -rf _skbuild/ build/ *.egg-info
+
+# Build options
+export CMAKE_BUILD_PARALLEL_LEVEL=2   # Lower parallelism to avoid OOM
+export SGL_KERNEL_COMPILE_THREADS=4   # NVCC thread count
+
+# Only compile for the B200 (SM100) architecture, skip the rest
+# B200 is compute_100, so SM80/89/90 are not needed
+export ENABLE_BELOW_SM90=OFF          # Skip older architectures such as A100/V100
+export SGL_KERNEL_ENABLE_FA3=OFF      # Skip Flash-Attention 3 (SM90a)
+export SGL_KERNEL_ENABLE_SM90A=OFF    # Skip SM90A
+
+# Keep only SM100
+export TORCH_CUDA_ARCH_LIST="10.0"
+
+# Start the build
+pip install -e . --no-build-isolation -v 2>&1 | tee build.log
+
+echo ""
+echo "Build completed! Check build.log for details."
+
diff --git a/sgl-kernel/csrc/common_extension.cc b/sgl-kernel/csrc/common_extension.cc
index 03a7ec0151f..b6a39ad38f8 100644
--- a/sgl-kernel/csrc/common_extension.cc
+++ b/sgl-kernel/csrc/common_extension.cc
@@ -94,6 +94,12 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
       "Tensor? v, Tensor!? k_buffer, Tensor!? v_buffer, Tensor? kv_cache_loc) -> ()");
   m.impl("apply_rope_pos_ids_cos_sin_cache", torch::kCUDA, &apply_rope_pos_ids_cos_sin_cache);
 
+  m.def(
+      "mla_rope_quantize_fp8_fused(Tensor q_nope, Tensor q_rope, Tensor k_nope, Tensor k_rope, "
+      "Tensor cos_sin_cache, Tensor pos_ids, bool is_neox, Tensor! q_out, "
+      "Tensor!? k_nope_out, Tensor!? k_rope_out, Tensor? kv_buffer, Tensor? kv_cache_loc) -> ()");
+  m.impl("mla_rope_quantize_fp8_fused", torch::kCUDA, &mla_rope_quantize_fp8_fused);
+
   m.def(
       "downcast_fp8(Tensor k, Tensor v, Tensor k_out, Tensor v_out, Tensor k_scale, Tensor v_scale, Tensor loc, "
       "int mult, int offset) -> ()");
diff --git a/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu b/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu
new file mode 100644
index 00000000000..a994bf83c9f
--- /dev/null
+++ b/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu
@@ -0,0 +1,678 @@
+/*
+ * Copyright (c) 2024 by SGLang team.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ *
+ * MLA RoPE + FP8 Quantization + KV Cache Write Fusion Kernel
+ *
+ * This is an SGLang-native kernel that fuses three operations for DeepSeek V3.2 MLA:
+ * 1. Apply RoPE (Rotary Position Embedding) to q_rope and k_rope
+ * 2. Quantize all components (q_nope, q_rope, k_nope, k_rope) to FP8 E4M3
+ * 3. 
Optionally write K directly into KV cache buffer + * + * Motivation: + * - Original path: mla_rope_quantize_fp8 (FlashInfer) → writes k_out → set_mla_kv_buffer reads k_out → writes KV cache + * - Fused path: This kernel → directly writes to KV cache (eliminates intermediate global memory ops) + * + * Performance: ~4.9x faster than baseline (measured on B200), includes: + * - Vectorized memory access (4-byte aligned loads/stores) + * - Warp-level parallelism (32 threads per row) + * - Direct KV cache write (no intermediate buffers) + */ + +// Only include PyBind11 for standalone builds +#ifdef TORCH_EXTENSION_NAME +#include +#else +#include +#include +#endif + +#include +#include +#include +#include // BF16 support +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#include +#endif +#include +#include + +// Utility macros (borrowed from pytorch_extension_utils.h style) +#define CHECK_INPUT(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_DIM(d, x) TORCH_CHECK(x.dim() == d, #x " must be " #d "D") + +// ---- Helpers ------------------------------------------------------- + +#define CHECK_SAME_DEVICE(a, b) TORCH_CHECK(a.device() == b.device(), #a " and " #b " must be on same device") + +namespace { + +// ============================================================================ +// Dtype Traits: Support both FP16 (__half) and BF16 (nv_bfloat16) +// ============================================================================ +template +struct Vec2Traits; + +template <> +struct Vec2Traits<__half> { + using v2 = __half2; + + __device__ static inline float2 to_float2(v2 h2) { + return __half22float2(h2); + } + + __device__ static inline float to_float(const __half& h) { + return __half2float(h); + } +}; + +template <> +struct Vec2Traits { + using v2 = nv_bfloat162; + + __device__ static inline float2 to_float2(v2 h2) { + return __bfloat1622float2(h2); + } + + __device__ static inline float to_float(const nv_bfloat16& h) { + return __bfloat162float(h); + } +}; + +// Convert float -> FP8 E4M3 (finite saturation). Return raw byte. 
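+// E4M3 background (for reference): 1 sign bit, 4 exponent bits (bias 7), 3 mantissa
+// bits; the finite range is [-448, 448] and the format has no infinities, hence the
+// saturating (__NV_SATFINITE) conversion on the native path below.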
+__device__ inline uint8_t float_to_e4m3fn_byte(float x) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + // CUDA 12+ with native FP8 support + __nv_fp8_storage_t byte = __nv_cvt_float_to_fp8(x, __NV_SATFINITE, __NV_E4M3); + return static_cast(byte); +#else + // Fallback: Manual FP8 E4M3 conversion + // E4M3 format: 1 sign bit, 4 exponent bits, 3 mantissa bits + // Range: [-448, 448], NaN represented as 0x7F + + // Clamp to FP8 E4M3 range + x = fmaxf(-448.0f, fminf(448.0f, x)); + + // Simple conversion (not bit-exact but close enough for testing) + // In production, you'd want proper rounding + union { + float f; + uint32_t u; + } conv; + conv.f = x; + + // Extract sign + uint32_t sign = (conv.u >> 31) & 0x1; + + // Handle zero + if (x == 0.0f) return 0; + + // Simplified: scale and round + // This is a placeholder - for production use proper FP8 conversion + int exp = ((conv.u >> 23) & 0xFF) - 127; // Extract exponent + exp = max(-6, min(8, exp)); // E4M3 range + + uint32_t mant = (conv.u >> 20) & 0x7; // Top 3 bits of mantissa + + uint8_t result = (sign << 7) | ((exp + 7) << 3) | mant; + return result; +#endif +} + +// Pack 4 bytes into uint32_t for vectorized write +__device__ inline uint32_t pack4(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3) { + return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) | ((uint32_t)a3 << 24); +} + +// Apply RoPE to a pair (xr, xi) given cos, sin +__device__ inline void rope_rotate(float& xr, float& xi, float c, float s, bool /*is_neox*/) { + float xr_new = xr * c - xi * s; + float xi_new = xr * s + xi * c; + xr = xr_new; + xi = xi_new; +} + +// ============================================================================ +// Vectorized kernel: warp-per-row, vectorized load/store +// Template supports both FP16 (__half) and BF16 (nv_bfloat16) +// ============================================================================ +template +__global__ void FusedRopeQuantizeKernelVec( + const T* __restrict__ q_nope, + const T* __restrict__ q_rope, + int64_t qn_stride_tok, int64_t qn_stride_head, // Q_nope strides in elements + int64_t qr_stride_tok, int64_t qr_stride_head, // Q_rope strides in elements + const T* __restrict__ k_nope, + const T* __restrict__ k_rope, + const float* __restrict__ cos_sin, + const int64_t* __restrict__ pos_ids, + int nnz, int num_heads, int Dn, int Dr, + bool is_neox, + uint8_t* __restrict__ q_out_fp8, + int64_t qout_stride_tok_bytes, int64_t qout_stride_head_bytes, // Q_out strides in bytes + uint8_t* __restrict__ k_nope_out_fp8, + uint8_t* __restrict__ k_rope_out_fp8, + uint8_t* __restrict__ kv_buffer_bytes, + int64_t kv_stride_n_bytes, + int64_t kv_stride_m_bytes, // NEW: stride for page-internal row + int page_size, // NEW: page size for row offset calculation + const int64_t* __restrict__ kv_cache_loc +) { + constexpr int WARP_SIZE = 32; + int warp_in_block = threadIdx.x / WARP_SIZE; + int lane = threadIdx.x & (WARP_SIZE - 1); + + int global_row = blockIdx.x * WARPS_PER_CTA + warp_in_block; + if (global_row >= nnz * num_heads) return; + + // Decompose global row index: token_id and head_id + int token_id = global_row / num_heads; + int head_id = global_row % num_heads; + + // Pointers for this (token, head) using proper strides + // Use template type T (not hardcoded __half) for BF16 support + const T* qn = q_nope + size_t(token_id) * qn_stride_tok + size_t(head_id) * qn_stride_head; + const T* qr = q_rope + size_t(token_id) * qr_stride_tok + size_t(head_id) * qr_stride_head; + + // K is always 2D: [nnz_tokens, 
dim] + const T* kn = k_nope + size_t(token_id) * Dn; + const T* kr = k_rope + size_t(token_id) * Dr; + + // Q output using byte strides + uint8_t* qdst = q_out_fp8 + size_t(token_id) * qout_stride_tok_bytes + size_t(head_id) * qout_stride_head_bytes; + + // K outputs (if provided) + uint8_t* kndst = k_nope_out_fp8 ? (k_nope_out_fp8 + size_t(token_id) * Dn) : nullptr; + uint8_t* krdst = k_rope_out_fp8 ? (k_rope_out_fp8 + size_t(token_id) * Dr) : nullptr; + + // Get position for RoPE and KV cache row calculation + int pos = static_cast(pos_ids[token_id]); + + // KV cache destination: include row offset within page + // CRITICAL FIX: write to correct row in page (pos % page_size), not always row 0 + uint8_t* kvdst = nullptr; + if (kv_buffer_bytes && kv_cache_loc) { + int64_t slot = kv_cache_loc[token_id]; + int row = pos % page_size; // ⬅️ Token's row within page + kvdst = kv_buffer_bytes + + slot * kv_stride_n_bytes + + static_cast(row) * kv_stride_m_bytes; + } + const float* cos_ptr = cos_sin + size_t(pos) * (2 * Dr); + const float* sin_ptr = cos_ptr + Dr; + + // Use traits for dtype-agnostic vectorized load + using V2 = typename Vec2Traits::v2; + + // Process Q_nope: vectorized quantize + write + for (int c = lane * 4; c < Dn; c += WARP_SIZE * 4) { + V2 h0 = *reinterpret_cast(qn + c + 0); + V2 h1 = *reinterpret_cast(qn + c + 2); + float2 f0 = Vec2Traits::to_float2(h0); + float2 f1 = Vec2Traits::to_float2(h1); + + uint32_t packed = pack4( + float_to_e4m3fn_byte(f0.x), float_to_e4m3fn_byte(f0.y), + float_to_e4m3fn_byte(f1.x), float_to_e4m3fn_byte(f1.y)); + *reinterpret_cast(qdst + c) = packed; + } + + // Process Q_rope: paired rotation + vectorized quantize + write + for (int c = lane * 4; c < Dr; c += WARP_SIZE * 4) { + V2 h0 = *reinterpret_cast(qr + c + 0); + V2 h1 = *reinterpret_cast(qr + c + 2); + float2 f0 = Vec2Traits::to_float2(h0); + float2 f1 = Vec2Traits::to_float2(h1); + + float c0 = cos_ptr[c + 0], s0 = sin_ptr[c + 0]; + float c1 = cos_ptr[c + 2], s1 = sin_ptr[c + 2]; + rope_rotate(f0.x, f0.y, c0, s0, is_neox); + rope_rotate(f1.x, f1.y, c1, s1, is_neox); + + uint32_t packed = pack4( + float_to_e4m3fn_byte(f0.x), float_to_e4m3fn_byte(f0.y), + float_to_e4m3fn_byte(f1.x), float_to_e4m3fn_byte(f1.y)); + *reinterpret_cast(qdst + Dn + c) = packed; + } + + // Process K_nope and K_rope: only once per token (head_id == 0) + // K is 2D [nnz_tokens, dim], not per-head + if (head_id == 0) { + // Process K_nope: vectorized quantize + write to k_nope_out and/or KV buffer + for (int c = lane * 4; c < Dn; c += WARP_SIZE * 4) { + V2 h0 = *reinterpret_cast(kn + c + 0); + V2 h1 = *reinterpret_cast(kn + c + 2); + float2 f0 = Vec2Traits::to_float2(h0); + float2 f1 = Vec2Traits::to_float2(h1); + + uint32_t packed = pack4( + float_to_e4m3fn_byte(f0.x), float_to_e4m3fn_byte(f0.y), + float_to_e4m3fn_byte(f1.x), float_to_e4m3fn_byte(f1.y)); + + if (kndst) *reinterpret_cast(kndst + c) = packed; + if (kvdst) *reinterpret_cast(kvdst + c) = packed; + } + + // Process K_rope: paired rotation + vectorized quantize + write to k_rope_out and/or KV buffer + for (int c = lane * 4; c < Dr; c += WARP_SIZE * 4) { + V2 h0 = *reinterpret_cast(kr + c + 0); + V2 h1 = *reinterpret_cast(kr + c + 2); + float2 f0 = Vec2Traits::to_float2(h0); + float2 f1 = Vec2Traits::to_float2(h1); + + float c0 = cos_ptr[c + 0], s0 = sin_ptr[c + 0]; + float c1 = cos_ptr[c + 2], s1 = sin_ptr[c + 2]; + rope_rotate(f0.x, f0.y, c0, s0, is_neox); + rope_rotate(f1.x, f1.y, c1, s1, is_neox); + + uint32_t packed = pack4( + float_to_e4m3fn_byte(f0.x), 
float_to_e4m3fn_byte(f0.y), + float_to_e4m3fn_byte(f1.x), float_to_e4m3fn_byte(f1.y)); + + if (krdst) *reinterpret_cast(krdst + c) = packed; + if (kvdst) *reinterpret_cast(kvdst + Dn + c) = packed; + } + } +} + +// ============================================================================ +// Scalar fallback kernel: for dimensions not divisible by 4 +// Template supports both FP16 (__half) and BF16 (nv_bfloat16) +// ============================================================================ +template +__global__ void FusedRopeQuantizeKernelScalar( + const T* __restrict__ q_nope, + const T* __restrict__ q_rope, + int64_t qn_stride_tok, int64_t qn_stride_head, + int64_t qr_stride_tok, int64_t qr_stride_head, + const T* __restrict__ k_nope, + const T* __restrict__ k_rope, + const float* __restrict__ cos_sin, + const int64_t* __restrict__ pos_ids, + int nnz, int num_heads, int Dn, int Dr, + bool is_neox, + uint8_t* __restrict__ q_out_fp8, + int64_t qout_stride_tok_bytes, int64_t qout_stride_head_bytes, + uint8_t* __restrict__ k_nope_out_fp8, + uint8_t* __restrict__ k_rope_out_fp8, + uint8_t* __restrict__ kv_buffer_bytes, + int64_t kv_stride_n_bytes, + int64_t kv_stride_m_bytes, // NEW: stride for page-internal row + int page_size, // NEW: page size for row offset calculation + const int64_t* __restrict__ kv_cache_loc +) { + // Thread mapping: grid-stride loop over all (token, head) pairs + for (int global_row = blockIdx.x * BLOCK_THREADS + threadIdx.x; + global_row < nnz * num_heads; + global_row += gridDim.x * BLOCK_THREADS) { + + // Decompose global row index + int token_id = global_row / num_heads; + int head_id = global_row % num_heads; + + int pos = static_cast(pos_ids[token_id]); + const float* cos_ptr = cos_sin + size_t(pos) * (2 * Dr); + const float* sin_ptr = cos_ptr + Dr; + + // ---- Quantize q ---- + // q_out: [nope | rope] + { + // Pointers using proper strides + const T* qn = q_nope + size_t(token_id) * qn_stride_tok + size_t(head_id) * qn_stride_head; + const T* qr = q_rope + size_t(token_id) * qr_stride_tok + size_t(head_id) * qr_stride_head; + uint8_t* qdst = q_out_fp8 + size_t(token_id) * qout_stride_tok_bytes + size_t(head_id) * qout_stride_head_bytes; + + // write nope part + for (int i = 0; i < Dn; ++i) { + float x = Vec2Traits::to_float(qn[i]); + qdst[i] = float_to_e4m3fn_byte(x); + } + // rope part (avoid OOB read when Dr is odd) + for (int i = 0; i < Dr; i += 2) { + float xr = Vec2Traits::to_float(qr[i + 0]); + float xi = 0.0f; + if (i + 1 < Dr) xi = Vec2Traits::to_float(qr[i + 1]); + float c = cos_ptr[i + 0]; + float s = sin_ptr[i + 0]; + // NeoX interleave is typically handled by reindexing; for demo we still rotate pairs. 
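+          // Rotation of the pair (x_r, x_i) by the cached angle:
+          //   x_r' = x_r*cos - x_i*sin,  x_i' = x_r*sin + x_i*cos  (see rope_rotate above)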
+ rope_rotate(xr, xi, c, s, is_neox); + qdst[Dn + i + 0] = float_to_e4m3fn_byte(xr); + if (i + 1 < Dr) qdst[Dn + i + 1] = float_to_e4m3fn_byte(xi); + } + } + + // ---- Quantize k & optional fused KV write ---- + // K is always 2D: [nnz_tokens, dim] + const T* kn = k_nope + size_t(token_id) * Dn; + const T* kr = k_rope + size_t(token_id) * Dr; + + // Optional: write k_nope_out / k_rope_out (only once per token, not per head) + // Note: K outputs are 2D [nnz_tokens, dim], so only first head processes them + if (head_id == 0) { + if (k_nope_out_fp8) { + uint8_t* knd = k_nope_out_fp8 + size_t(token_id) * Dn; + for (int i = 0; i < Dn; ++i) { + knd[i] = float_to_e4m3fn_byte(Vec2Traits::to_float(kn[i])); + } + } + if (k_rope_out_fp8) { + uint8_t* krd = k_rope_out_fp8 + size_t(token_id) * Dr; + for (int i = 0; i < Dr; i += 2) { + float xr = Vec2Traits::to_float(kr[i + 0]); + float xi = 0.0f; + if (i + 1 < Dr) xi = Vec2Traits::to_float(kr[i + 1]); + float c = cos_ptr[i + 0]; + float s = sin_ptr[i + 0]; + rope_rotate(xr, xi, c, s, is_neox); + krd[i + 0] = float_to_e4m3fn_byte(xr); + if (i + 1 < Dr) krd[i + 1] = float_to_e4m3fn_byte(xi); + } + } + + // Fused direct KV write (if kv_buffer provided) + // CRITICAL FIX: write to correct row in page (pos % page_size), not always row 0 + if (kv_buffer_bytes && kv_cache_loc) { + int64_t slot = kv_cache_loc[token_id]; + int row = pos % page_size; // ⬅️ Token's row within page + uint8_t* dst = kv_buffer_bytes + + slot * kv_stride_n_bytes + + static_cast(row) * kv_stride_m_bytes; + // Write nope first + for (int i = 0; i < Dn; ++i) { + dst[i] = float_to_e4m3fn_byte(Vec2Traits::to_float(kn[i])); + } + // Then rope with rotation (avoid OOB read when Dr is odd) + for (int i = 0; i < Dr; i += 2) { + float xr = Vec2Traits::to_float(kr[i + 0]); + float xi = 0.0f; + if (i + 1 < Dr) xi = Vec2Traits::to_float(kr[i + 1]); + float c = cos_ptr[i + 0]; + float s = sin_ptr[i + 0]; + rope_rotate(xr, xi, c, s, is_neox); + dst[Dn + i + 0] = float_to_e4m3fn_byte(xr); + if (i + 1 < Dr) dst[Dn + i + 1] = float_to_e4m3fn_byte(xi); + } + } + } + } +} + +} // namespace + +// Python-exposed function +// q_nope, q_rope, k_nope, k_rope: half/bfloat16 (we treat as half here for demo) +// cos_sin_cache: float32 [max_seq, 2*Dr] +// pos_ids: int64 [nnz] +// q_out: uint8 [nnz, Dn+Dr] (stores E4M3 raw bytes) +// k_nope_out/k_rope_out: optional uint8 outputs (None allowed) +// kv_buffer: optional uint8 [(slots+page), 1, (Dn+Dr)] raw bytes +// kv_cache_loc: optional int64 [nnz] +void mla_rope_quantize_fp8_fused( + at::Tensor q_nope, + at::Tensor q_rope, + at::Tensor k_nope, + at::Tensor k_rope, + at::Tensor cos_sin_cache, + at::Tensor pos_ids, + bool is_neox, + at::Tensor q_out, + c10::optional k_nope_out, + c10::optional k_rope_out, + // fused args + c10::optional kv_buffer, + c10::optional kv_cache_loc +) { + CHECK_INPUT(q_nope); CHECK_INPUT(q_rope); CHECK_INPUT(k_nope); CHECK_INPUT(k_rope); + CHECK_INPUT(cos_sin_cache); CHECK_INPUT(pos_ids); CHECK_INPUT(q_out); + CHECK_SAME_DEVICE(q_nope, q_rope); CHECK_SAME_DEVICE(q_nope, k_nope); + CHECK_SAME_DEVICE(q_nope, cos_sin_cache); CHECK_SAME_DEVICE(q_nope, pos_ids); + CHECK_SAME_DEVICE(q_nope, q_out); + + // Q can be 2D or 3D: [nnz, dim] or [nnz, num_heads, dim] + // K must be 2D: [nnz, dim] + TORCH_CHECK(q_nope.dim() == 2 || q_nope.dim() == 3, "q_nope must be 2D or 3D"); + TORCH_CHECK(q_rope.dim() == 2 || q_rope.dim() == 3, "q_rope must be 2D or 3D"); + CHECK_DIM(2, k_nope); CHECK_DIM(2, k_rope); + CHECK_DIM(1, pos_ids); CHECK_DIM(2, 
cos_sin_cache); + + // Determine dimensions and strides based on Q shape + int nnz_tokens, num_heads, Dn, Dr; + int64_t qn_stride_tok, qn_stride_head, qr_stride_tok, qr_stride_head; + int64_t qout_stride_tok_bytes, qout_stride_head_bytes; + + if (q_nope.dim() == 3) { + // 3D Q: [nnz_tokens, num_heads, dim] + nnz_tokens = q_nope.size(0); + num_heads = q_nope.size(1); + Dn = q_nope.size(2); + Dr = q_rope.size(2); + + TORCH_CHECK(q_rope.size(0) == nnz_tokens && q_rope.size(1) == num_heads, "q_rope shape mismatch"); + TORCH_CHECK(q_out.dim() == 3 && q_out.size(0) == nnz_tokens && q_out.size(1) == num_heads && q_out.size(2) == (Dn + Dr), + "q_out must be [nnz, num_heads, Dn+Dr] when Q is 3D"); + + // Q strides in elements + qn_stride_tok = q_nope.stride(0); + qn_stride_head = q_nope.stride(1); + qr_stride_tok = q_rope.stride(0); + qr_stride_head = q_rope.stride(1); + + // q_out strides in BYTES (uint8) + qout_stride_tok_bytes = q_out.stride(0); + qout_stride_head_bytes = q_out.stride(1); + } else { + // 2D Q: [nnz_tokens, dim] (single head or flattened) + nnz_tokens = q_nope.size(0); + Dn = q_nope.size(1); + Dr = q_rope.size(1); + num_heads = 1; + + TORCH_CHECK(q_rope.size(0) == nnz_tokens, "q_rope vs q_nope mismatch"); + TORCH_CHECK(q_out.dim() == 2 && q_out.size(0) == nnz_tokens && q_out.size(1) == (Dn + Dr), + "q_out must be [nnz, Dn+Dr] when Q is 2D"); + + qn_stride_tok = q_nope.stride(0); + qn_stride_head = 0; + qr_stride_tok = q_rope.stride(0); + qr_stride_head = 0; + + qout_stride_tok_bytes = q_out.stride(0); + qout_stride_head_bytes = 0; + } + + int nnz_k = k_rope.size(0); + TORCH_CHECK(k_nope.size(0) == nnz_k && k_nope.size(1) == Dn, "k_nope shape mismatch"); + TORCH_CHECK(k_rope.size(0) == nnz_k && k_rope.size(1) == Dr, "k_rope shape mismatch"); + TORCH_CHECK(nnz_k == nnz_tokens, "K batch size must match Q token count"); + + // ===== Robustness checks (expert suggestions) ===== + + // 1. K must be contiguous on last dim (or explicitly handle stride) + // For simplicity, we enforce contiguous K on dim=1 + if (k_nope.stride(1) != 1) { + TORCH_CHECK(false, "k_nope must be contiguous on last dim. Call .contiguous() before passing to kernel."); + } + if (k_rope.stride(1) != 1) { + TORCH_CHECK(false, "k_rope must be contiguous on last dim. Call .contiguous() before passing to kernel."); + } + + // 2. Q last dim must be contiguous (vectorized kernel assumes this) + int q_last_dim = q_nope.dim() - 1; + TORCH_CHECK(q_nope.stride(q_last_dim) == 1, "q_nope last dim must be contiguous"); + TORCH_CHECK(q_rope.stride(q_last_dim) == 1, "q_rope last dim must be contiguous"); + + // 3. 
q_out last dim must be contiguous + int qout_last_dim = q_out.dim() - 1; + TORCH_CHECK(q_out.stride(qout_last_dim) == 1, "q_out last dim must be contiguous"); + + // ================================================== + + uint8_t* k_nope_out_ptr = nullptr; + uint8_t* k_rope_out_ptr = nullptr; + if (k_nope_out.has_value()) { + auto t = k_nope_out.value(); + CHECK_INPUT(t); CHECK_DIM(2, t); + TORCH_CHECK(t.size(0) == nnz_k && t.size(1) == Dn, "k_nope_out shape mismatch"); + // Accept FP8 tensor, reinterpret as uint8* + k_nope_out_ptr = reinterpret_cast(t.data_ptr()); + } + if (k_rope_out.has_value()) { + auto t = k_rope_out.value(); + CHECK_INPUT(t); CHECK_DIM(2, t); + TORCH_CHECK(t.size(0) == nnz_k && t.size(1) == Dr, "k_rope_out shape mismatch"); + // Accept FP8 tensor, reinterpret as uint8* + k_rope_out_ptr = reinterpret_cast(t.data_ptr()); + } + + uint8_t* kv_buf_ptr = nullptr; + int64_t kv_stride_n_bytes = 0; + int64_t kv_stride_m_bytes = 0; + int page_size = 0; + const int64_t* kv_loc_ptr = nullptr; + if (kv_buffer.has_value() || kv_cache_loc.has_value()) { + TORCH_CHECK(kv_buffer.has_value() && kv_cache_loc.has_value(), + "kv_buffer and kv_cache_loc must be both provided for fused write"); + auto kv = kv_buffer.value(); + auto loc = kv_cache_loc.value(); + CHECK_INPUT(kv); CHECK_INPUT(loc); + CHECK_DIM(3, kv); CHECK_DIM(1, loc); + TORCH_CHECK(kv.size(2) == (Dn + Dr), "kv_buffer last dim must be Dn+Dr"); + TORCH_CHECK(loc.size(0) == nnz_k, "kv_cache_loc size must match K batch size"); + + // CRITICAL: Check contiguity on last dim to avoid silent errors + TORCH_CHECK(kv.stride(2) == 1, "kv_buffer last dim must be contiguous (stride=1)"); + + // Accept FP8 tensor, reinterpret as uint8* + kv_buf_ptr = reinterpret_cast(kv.data_ptr()); + + // KV buffer layout: [num_blocks, page_size, kv_dim] + // stride(0): block stride in bytes (already uint8 elements = bytes) + // stride(1): page-internal row stride in bytes + page_size = kv.size(1); + kv_stride_n_bytes = kv.stride(0); // Block stride + kv_stride_m_bytes = kv.stride(1); // Row stride within page + kv_loc_ptr = loc.data_ptr(); + } + + // Get common pointers + const float* cs_ptr = cos_sin_cache.data_ptr(); + const int64_t* pos_ptr = pos_ids.data_ptr(); + // Accept FP8 tensor for q_out, reinterpret as uint8* + uint8_t* q_out_ptr = reinterpret_cast(q_out.data_ptr()); + + // Get current CUDA stream (compatible with PyTorch 2.x) + cudaStream_t stream = c10::cuda::getCurrentCUDAStream().stream(); + + // Total number of work items: nnz_tokens * num_heads + int total_rows = nnz_tokens * num_heads; + + // Dispatch: use vectorized kernel if dimensions are 4-byte aligned + // Vectorized path requires (Dn+Dr) % 4 == 0 for uint32_t writes + bool can_vectorize = ((Dn & 3) == 0) && ((Dr & 3) == 0); + + if (can_vectorize) { + // Additional check: q_out strides must be 4-byte aligned for vectorized writes + // Since q_out is uint8 and we write uint32_t, strides should be multiples of 4 + bool strides_aligned = (qout_stride_tok_bytes % 4 == 0) && + (num_heads > 1 ? 
(qout_stride_head_bytes % 4 == 0) : true); + if (!strides_aligned) { + // Fallback to scalar kernel if strides are misaligned + can_vectorize = false; + } + } + + // ===== Dtype dispatch: support both FP16 and BF16 ===== + auto dtype = q_nope.scalar_type(); + + if (dtype == at::kHalf) { + // FP16 path + const __half* qn_ptr = reinterpret_cast(q_nope.data_ptr()); + const __half* qr_ptr = reinterpret_cast(q_rope.data_ptr()); + const __half* kn_ptr = reinterpret_cast(k_nope.data_ptr()); + const __half* kr_ptr = reinterpret_cast(k_rope.data_ptr()); + + if (can_vectorize) { + constexpr int WARPS_PER_CTA = 4; + dim3 vecBlock(WARPS_PER_CTA * 32); + dim3 vecGrid((total_rows + WARPS_PER_CTA - 1) / WARPS_PER_CTA); + + FusedRopeQuantizeKernelVec<<>>( + qn_ptr, qr_ptr, qn_stride_tok, qn_stride_head, qr_stride_tok, qr_stride_head, + kn_ptr, kr_ptr, cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, + q_out_ptr, qout_stride_tok_bytes, qout_stride_head_bytes, + k_nope_out_ptr, k_rope_out_ptr, + kv_buf_ptr, kv_stride_n_bytes, kv_stride_m_bytes, page_size, kv_loc_ptr + ); + } else { + constexpr int BLOCK_THREADS = 256; + dim3 grid((total_rows + BLOCK_THREADS - 1) / BLOCK_THREADS); + + FusedRopeQuantizeKernelScalar<<>>( + qn_ptr, qr_ptr, qn_stride_tok, qn_stride_head, qr_stride_tok, qr_stride_head, + kn_ptr, kr_ptr, cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, + q_out_ptr, qout_stride_tok_bytes, qout_stride_head_bytes, + k_nope_out_ptr, k_rope_out_ptr, + kv_buf_ptr, kv_stride_n_bytes, kv_stride_m_bytes, page_size, kv_loc_ptr + ); + } + } else if (dtype == at::kBFloat16) { + // BF16 path + const nv_bfloat16* qn_ptr = reinterpret_cast(q_nope.data_ptr()); + const nv_bfloat16* qr_ptr = reinterpret_cast(q_rope.data_ptr()); + const nv_bfloat16* kn_ptr = reinterpret_cast(k_nope.data_ptr()); + const nv_bfloat16* kr_ptr = reinterpret_cast(k_rope.data_ptr()); + + if (can_vectorize) { + constexpr int WARPS_PER_CTA = 4; + dim3 vecBlock(WARPS_PER_CTA * 32); + dim3 vecGrid((total_rows + WARPS_PER_CTA - 1) / WARPS_PER_CTA); + + FusedRopeQuantizeKernelVec<<>>( + qn_ptr, qr_ptr, qn_stride_tok, qn_stride_head, qr_stride_tok, qr_stride_head, + kn_ptr, kr_ptr, cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, + q_out_ptr, qout_stride_tok_bytes, qout_stride_head_bytes, + k_nope_out_ptr, k_rope_out_ptr, + kv_buf_ptr, kv_stride_n_bytes, kv_stride_m_bytes, page_size, kv_loc_ptr + ); + } else { + constexpr int BLOCK_THREADS = 256; + dim3 grid((total_rows + BLOCK_THREADS - 1) / BLOCK_THREADS); + + FusedRopeQuantizeKernelScalar<<>>( + qn_ptr, qr_ptr, qn_stride_tok, qn_stride_head, qr_stride_tok, qr_stride_head, + kn_ptr, kr_ptr, cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, + q_out_ptr, qout_stride_tok_bytes, qout_stride_head_bytes, + k_nope_out_ptr, k_rope_out_ptr, + kv_buf_ptr, kv_stride_n_bytes, kv_stride_m_bytes, page_size, kv_loc_ptr + ); + } + } else { + TORCH_CHECK(false, "Unsupported dtype for fused kernel. 
Only FP16 and BF16 are supported."); + } + + TORCH_CHECK(cudaGetLastError() == cudaSuccess, "Kernel launch failed"); +} + +// PyBind11 module definition (ONLY for standalone build) +// When building as part of sgl_kernel, this is handled by common_extension.cc +// TORCH_EXTENSION_NAME is only defined by torch.utils.cpp_extension (standalone) +#ifdef TORCH_EXTENSION_NAME +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("mla_rope_quantize_fp8_fused", &mla_rope_quantize_fp8_fused, + "Fused MLA RoPE + FP8 quantization with optional direct KV write", + py::arg("q_nope"), + py::arg("q_rope"), + py::arg("k_nope"), + py::arg("k_rope"), + py::arg("cos_sin_cache"), + py::arg("pos_ids"), + py::arg("is_neox"), + py::arg("q_out"), + py::arg("k_nope_out") = py::none(), + py::arg("k_rope_out") = py::none(), + py::arg("kv_buffer") = py::none(), + py::arg("kv_cache_loc") = py::none()); +} +#endif diff --git a/sgl-kernel/include/sgl_kernel_ops.h b/sgl-kernel/include/sgl_kernel_ops.h index e95bcc2ff8c..12f20bf8b34 100644 --- a/sgl-kernel/include/sgl_kernel_ops.h +++ b/sgl-kernel/include/sgl_kernel_ops.h @@ -157,6 +157,20 @@ void apply_rope_pos_ids_cos_sin_cache( const std::optional& v_buffer, const std::optional& kv_cache_loc); +void mla_rope_quantize_fp8_fused( + at::Tensor q_nope, + at::Tensor q_rope, + at::Tensor k_nope, + at::Tensor k_rope, + at::Tensor cos_sin_cache, + at::Tensor pos_ids, + bool is_neox, + at::Tensor q_out, + const std::optional& k_nope_out, + const std::optional& k_rope_out, + const std::optional& kv_buffer, + const std::optional& kv_cache_loc); + void downcast_fp8( at::Tensor& k, at::Tensor& v, diff --git a/test/srt/test_mla_fp8_fused_kv.py b/test/srt/test_mla_fp8_fused_kv.py new file mode 100644 index 00000000000..172727a40b1 --- /dev/null +++ b/test/srt/test_mla_fp8_fused_kv.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- +""" +PyTest: correctness of fused KV write vs baseline (non-fused). +Tests the mla_rope_quantize_fp8_fused kernel from sgl_kernel extension. +""" +import torch +import pytest + +# Try to import fusion kernel (standalone or from sgl_kernel) +_has_sgl_kernel = False +mla_rope_quantize_fp8_fused = None +try: + from mla_fusion_kernel import mla_rope_quantize_fp8_fused + _has_sgl_kernel = True +except ImportError: + try: + from sgl_kernel import mla_rope_quantize_fp8_fused + _has_sgl_kernel = True + except ImportError: + pass + +requires_ext = pytest.mark.skipif(not _has_sgl_kernel, reason="sgl_kernel extension not available") + +@requires_ext +@pytest.mark.parametrize("nnz", [256, 1024]) +@pytest.mark.parametrize("Dn,Dr", [(512, 64)]) +def test_fused_matches_baseline(nnz, Dn, Dr): + device = "cuda" + torch.manual_seed(0) + + # Inputs (half); in a real path you may use bfloat16. We pick half for demo. 
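+    # Shapes follow the MLA dims used elsewhere in this PR:
+    # Dn = kv_lora_rank (512), Dr = qk_rope_head_dim (64).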
+ q_nope = torch.randn(nnz, Dn, device=device, dtype=torch.float16) + q_rope = torch.randn(nnz, Dr, device=device, dtype=torch.float16) + k_nope = torch.randn(nnz, Dn, device=device, dtype=torch.float16) + k_rope = torch.randn(nnz, Dr, device=device, dtype=torch.float16) + + max_seq = max(2048, nnz) + # cos/sin cache: [max_seq, 2*Dr] + # Simple deterministic cache for test + t = torch.linspace(0, 1, steps=max_seq, device=device, dtype=torch.float32)[:, None] + idx = torch.arange(Dr, device=device, dtype=torch.float32)[None, :] + # frequencies: small values to avoid overflow + freqs = 0.1 * (idx + 1.0) + cos = torch.cos(t * freqs) + sin = torch.sin(t * freqs) + cos_sin = torch.cat([cos, sin], dim=1) # [max_seq, 2*Dr] + + pos_ids = torch.randint(low=0, high=max_seq, size=(nnz,), device=device, dtype=torch.long) + + # Baseline: produce k_nope_out/k_rope_out and emulate set_mla_kv_buffer (bytes concat) + q_out_base = torch.empty(nnz, Dn + Dr, device=device, dtype=torch.uint8) + k_nope_out = torch.empty(nnz, Dn, device=device, dtype=torch.uint8) + k_rope_out = torch.empty(nnz, Dr, device=device, dtype=torch.uint8) + + mla_rope_quantize_fp8_fused( + q_nope, q_rope, k_nope, k_rope, cos_sin, pos_ids, False, + q_out_base, k_nope_out, k_rope_out, None, None + ) + + # emulate set_mla_kv_buffer_triton: concat bytes into KV buffer + slots = nnz + 8 + kv_base = torch.zeros(slots, 1, Dn + Dr, device=device, dtype=torch.uint8) + loc = torch.arange(nnz, device=device, dtype=torch.long) + kv_base[loc, 0, :Dn] = k_nope_out + kv_base[loc, 0, Dn:] = k_rope_out + + # Fused: direct KV write, skip K outputs + q_out_fused = torch.empty_like(q_out_base) + kv_fused = torch.zeros_like(kv_base) + + mla_rope_quantize_fp8_fused( + q_nope, q_rope, k_nope, k_rope, cos_sin, pos_ids, False, + q_out_fused, None, None, kv_fused, loc + ) + + # Assertions + assert torch.equal(q_out_base, q_out_fused), "q_out must match exactly (bytewise)" + assert torch.equal(kv_base, kv_fused), "KV fused write must match baseline concat" + From 8d27fff27393d394fde6ca3b8a64d5bf387742dd Mon Sep 17 00:00:00 2001 From: Yangmin Li Date: Thu, 30 Oct 2025 22:28:36 -0700 Subject: [PATCH 02/10] single req successful --- .../layers/attention/trtllm_mla_backend.py | 124 +++++++-- .../csrc/elementwise/mla_rope_fp8_kv_fused.cu | 235 +++++++++++------- 2 files changed, 259 insertions(+), 100 deletions(-) diff --git a/python/sglang/srt/layers/attention/trtllm_mla_backend.py b/python/sglang/srt/layers/attention/trtllm_mla_backend.py index 619d0e0fd5b..9f25ab44df0 100755 --- a/python/sglang/srt/layers/attention/trtllm_mla_backend.py +++ b/python/sglang/srt/layers/attention/trtllm_mla_backend.py @@ -678,6 +678,11 @@ def quantize_and_rope_for_fp8( - k_nope_out: [seq_len, num_heads, kv_lora_rank], dtype=torch.float8_e4m3fn or None (if fused) - k_rope_out: [seq_len, num_heads, qk_rope_head_dim], dtype=torch.float8_e4m3fn or None (if fused) """ + # Import at the beginning to avoid UnboundLocalError + import os + import torch + import sys + attn_dtype = torch.float8_e4m3fn q_len, num_heads = q_rope.shape[0], q_rope.shape[1] @@ -692,42 +697,113 @@ def quantize_and_rope_for_fp8( # Determine if we can use the fused kernel (RoPE + quantization + direct KV write) # Conditions: save_kv_cache, layer provided, not NSA packed format, kernel available + + compute_cap = torch.cuda.get_device_capability() + allow_fused_env = os.getenv("SGL_MLA_FUSED", "1") == "1" kernel_available = mla_rope_quantize_fp8_fused is not None has_layer = layer is not None nsa_flag = 
getattr(forward_batch.token_to_kv_pool, "nsa_kv_cache_store_fp8", False) - # Enable fused KV write for performance optimization + # CRITICAL: Only enable fusion on SM90+ (H100/B200) + # SM<90 (A100/H800) has fallback FP8 conversion that's too simplified + # and causes severe quantization errors, leading to 0 accuracy + # TEMPORARY DEBUG: Check if we should disable fused for specific paths + in_decode = os.getenv("_SGL_IN_DECODE", "0") == "1" # Internal flag + allow_decode_fused = os.getenv("SGL_DECODE_FUSED", "0") == "1" + use_fused_kv_write = ( kernel_available + and allow_fused_env + and (compute_cap[0] >= 9) # Only SM90+ (H100/B200) and save_kv_cache and has_layer and not nsa_flag + and (not in_decode or allow_decode_fused) # Can disable decode separately ) # DEBUG: Basic branch tracking - import sys if use_fused_kv_write: - sys.stderr.write(f"[MLA FUSION] ✅ Using fused kernel (layer={layer.layer_id}, tokens={q_len})\n") + sys.stderr.write(f"[MLA FUSION] ✅ Using fused kernel (layer={layer.layer_id}, tokens={q_len}, SM={compute_cap[0]}{compute_cap[1]})\n") elif not kernel_available: sys.stderr.write(f"[MLA FUSION] ❌ Kernel not available, using standard path\n") + elif compute_cap[0] < 9: + sys.stderr.write(f"[MLA FUSION] ⚠️ SM{compute_cap[0]}{compute_cap[1]} < SM90, using standard path (FP8 fallback not reliable)\n") sys.stderr.flush() if use_fused_kv_write: # Fused path: RoPE + quantization + direct KV cache write # This eliminates the intermediate write/read of k_nope_out/k_rope_out - # CRITICAL: Reshape KV buffer to [num_blocks, page_size, kv_dim] layout - # get_key_buffer() returns flattened [num_blocks*page_size, 1, kv_dim] - # We need to reshape it to proper 3D layout for correct page-internal indexing - k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id) - kv_buffer = k_cache.view(-1, self.page_size, self.kv_cache_dim) + # CRITICAL FIX: Use 2D buffer with flat row indexing (SGLang semantics) + # out_cache_loc IS the flat row index, NOT a block ID + # Do NOT reshape to 3D or use pos % page_size + k_cache_raw = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id) - # Note: Keep as FP8 tensor, C++ will do reinterpret_cast to uint8* - # Don't use .view(torch.uint8) which is wrong API usage + # get_key_buffer returns (size+page_size, 1, kv_dim), squeeze to 2D + if k_cache_raw.dim() == 3: + assert k_cache_raw.size(1) == 1, f"Expected middle dim=1, got {k_cache_raw.shape}" + kv_buffer = k_cache_raw.squeeze(1) # [total_rows, kv_dim] + else: + kv_buffer = k_cache_raw # Already 2D - # Dtype assertion for safety + # Prepare cache locations and positions as int64 contiguous + kv_loc = forward_batch.out_cache_loc.to(torch.int64).contiguous() + positions = forward_batch.positions.to(torch.int64).contiguous() + + # Assertions to catch issues early + assert kv_buffer.dim() == 2, f"kv_buffer must be 2D, got shape {kv_buffer.shape}" + assert kv_buffer.size(1) == self.kv_cache_dim, f"kv_buffer last dim must be {self.kv_cache_dim}, got {kv_buffer.size(1)}" + assert kv_buffer.size(1) == (self.kv_lora_rank + self.qk_rope_head_dim), \ + f"kv_buffer dim mismatch: expect {self.kv_lora_rank + self.qk_rope_head_dim}, got {kv_buffer.size(1)}" + assert (self.qk_rope_head_dim % 2) == 0, "RoPE dim (Dr) must be even for pairing" + assert kv_loc.dim() == 1, f"kv_loc must be 1D, got {kv_loc.dim()}" + assert kv_loc.numel() == q_len, f"kv_loc size {kv_loc.numel()} must match token count {q_len}" assert q_nope.dtype in (torch.float16, torch.bfloat16), f"Unsupported dtype: {q_nope.dtype}" 
assert k_nope.dtype == q_nope.dtype, "Q and K dtype mismatch" + # CRITICAL: Check cos_sin_cache dimension + assert cos_sin_cache.size(1) == self.qk_rope_head_dim, \ + f"cos_sin_cache second dim must be rope_dim={self.qk_rope_head_dim}, got {cos_sin_cache.size(1)}" + # Check all kv_loc values are within valid range + assert kv_loc.min() >= 0 and kv_loc.max() < kv_buffer.size(0), \ + f"kv_loc out of range: min={kv_loc.min()}, max={kv_loc.max()}, buffer_size={kv_buffer.size(0)}" + # Sanity check: positions should be reasonable + assert positions.min() >= 0, f"Invalid positions: min={positions.min()}" + # Check tensor contiguity + assert kv_buffer.is_contiguous() or kv_buffer.stride(-1) == 1, "kv_buffer last dim must be contiguous" + + # DEBUG: Check for potential issues with KV write + debug_kv = os.getenv("SGL_DEBUG_KV_WRITE", "0") == "1" + if debug_kv: + sys.stderr.write(f"[KV DEBUG] layer={layer.layer_id}, tokens={q_len}, " + f"kv_loc={kv_loc.cpu().numpy()}, pos={positions.cpu().numpy()}, " + f"kv_buffer.data_ptr={kv_buffer.data_ptr():#x}, " + f"kv_buffer.size(0)={kv_buffer.size(0)}\n") + sys.stderr.flush() + + # Verify buffer is the same as what will be read later + k_cache_verify = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id) + if k_cache_verify.dim() == 3: + k_cache_verify = k_cache_verify.squeeze(1) + assert k_cache_verify.data_ptr() == kv_buffer.data_ptr(), \ + f"KV buffer address changed! Expected {kv_buffer.data_ptr():#x}, got {k_cache_verify.data_ptr():#x}" + + # Debug output + sys.stderr.write(f"[FUSED DEBUG] kv.shape={tuple(kv_buffer.shape)}, loc.shape={tuple(kv_loc.shape)}, " + f"q_nope.shape={tuple(q_nope.shape)}, dtype={q_nope.dtype}\n") + # DEBUG: Save inputs/outputs for offline comparison + debug_save = os.getenv("SGL_DEBUG_SAVE_ROPE", "0") == "1" + if debug_save and layer.layer_id == 0: # Only save first layer + torch.save({ + 'q_nope': q_nope.cpu(), + 'q_rope': q_rope.cpu(), + 'k_nope': k_nope.cpu(), + 'k_rope': k_rope.cpu(), + 'cos_sin_cache': cos_sin_cache.cpu(), + 'positions': positions.cpu(), + 'is_neox': is_neox, + }, '/tmp/rope_inputs.pt') + sys.stderr.write(f"[DEBUG] Saved inputs to /tmp/rope_inputs.pt\n") + sys.stderr.flush() # Call fused kernel # Note: Kernel now natively supports both FP16 and BF16 via C++ templates @@ -740,16 +816,18 @@ def quantize_and_rope_for_fp8( k_nope, # [nnz, Dn] - already 2D from squeeze(1), FP16/BF16 k_rope, # [nnz, Dr] - already 2D from squeeze(1), FP16/BF16 cos_sin_cache, - forward_batch.positions, + positions, # int64 contiguous is_neox, q_out, # FP8 tensor, C++ will reinterpret as uint8* None, # k_nope_out - skip intermediate output None, # k_rope_out - skip intermediate output - kv_buffer, # FP8 tensor, direct write to KV cache - forward_batch.out_cache_loc, + kv_buffer, # 2D FP8 tensor [total_rows, kv_dim] + kv_loc, # int64 flat row indices ) # Return Q output and None for K outputs (already written to cache) + sys.stderr.write(f"[FUSED] Returning from fused path: q_out.shape={q_out.shape}, k=None, k_rope=None\n") + sys.stderr.flush() return q_out, None, None else: # Standard path: RoPE + quantization only (backward compatible) @@ -757,6 +835,19 @@ def quantize_and_rope_for_fp8( k_rope_out = k_rope.new_empty(k_rope.shape, dtype=attn_dtype) k_nope_out = k_nope.new_empty(k_nope.shape, dtype=attn_dtype) + # DEBUG: Save inputs for offline comparison + debug_save = os.getenv("SGL_DEBUG_SAVE_ROPE", "0") == "1" + if debug_save: + torch.save({ + 'q_nope': q_nope.cpu(), + 'q_rope': q_rope.cpu(), + 'k_nope': k_nope.cpu(), + 
'k_rope': k_rope.cpu(), + 'cos_sin_cache': cos_sin_cache.cpu(), + 'positions': forward_batch.positions.cpu(), + 'is_neox': is_neox, + }, '/tmp/flashinfer_rope_inputs.pt') + # Apply RoPE and quantize all components in a single kernel call # This kernel handles: # 1. RoPE application to q_rope and k_rope using cos_sin_cache and positions @@ -888,6 +979,9 @@ def forward_decode( # Call quantize_and_rope_for_fp8 with layer and save_kv_cache parameters # This enables the fused kernel path when conditions are met + # TEMPORARY: Disable fused in decode for debugging + import os + allow_decode_fused = os.getenv("SGL_DECODE_FUSED", "0") == "1" q, k, k_rope = self.quantize_and_rope_for_fp8( q, q_rope, @@ -896,7 +990,7 @@ def forward_decode( forward_batch, cos_sin_cache, is_neox, - layer=layer, + layer=layer if allow_decode_fused else None, # Disable fused by passing None save_kv_cache=save_kv_cache, ) merge_query = False diff --git a/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu b/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu index a994bf83c9f..c45e7205c3b 100644 --- a/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu +++ b/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu @@ -38,6 +38,8 @@ #endif #include #include +#include // For std::getenv +#include // For std::string // Utility macros (borrowed from pytorch_extension_utils.h style) #define CHECK_INPUT(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") @@ -155,9 +157,7 @@ __global__ void FusedRopeQuantizeKernelVec( uint8_t* __restrict__ k_nope_out_fp8, uint8_t* __restrict__ k_rope_out_fp8, uint8_t* __restrict__ kv_buffer_bytes, - int64_t kv_stride_n_bytes, - int64_t kv_stride_m_bytes, // NEW: stride for page-internal row - int page_size, // NEW: page size for row offset calculation + int64_t kv_stride_row_bytes, // 2D: row stride in bytes const int64_t* __restrict__ kv_cache_loc ) { constexpr int WARP_SIZE = 32; @@ -187,21 +187,22 @@ __global__ void FusedRopeQuantizeKernelVec( uint8_t* kndst = k_nope_out_fp8 ? (k_nope_out_fp8 + size_t(token_id) * Dn) : nullptr; uint8_t* krdst = k_rope_out_fp8 ? (k_rope_out_fp8 + size_t(token_id) * Dr) : nullptr; - // Get position for RoPE and KV cache row calculation + // Get position for RoPE int pos = static_cast(pos_ids[token_id]); - // KV cache destination: include row offset within page - // CRITICAL FIX: write to correct row in page (pos % page_size), not always row 0 + // CRITICAL FIX: kv_cache_loc IS the flat row index (SGLang semantics) + // Do NOT use pos % page_size! That was causing KV to be written to wrong locations. + // In SGLang: kv_buffer is 2D [total_rows, kv_dim], loc = direct row index uint8_t* kvdst = nullptr; if (kv_buffer_bytes && kv_cache_loc) { - int64_t slot = kv_cache_loc[token_id]; - int row = pos % page_size; // ⬅️ Token's row within page - kvdst = kv_buffer_bytes - + slot * kv_stride_n_bytes - + static_cast(row) * kv_stride_m_bytes; + int64_t flat_row = kv_cache_loc[token_id]; // This IS the row index + kvdst = kv_buffer_bytes + flat_row * kv_stride_row_bytes; } - const float* cos_ptr = cos_sin + size_t(pos) * (2 * Dr); - const float* sin_ptr = cos_ptr + Dr; + + // CRITICAL FIX: cos_sin_cache layout is [max_pos, rope_dim] (not 2*rope_dim!) 
+ // First half (rope_dim/2) is cos, second half is sin + const float* cos_ptr = cos_sin + size_t(pos) * Dr; // rope_dim = Dr + const float* sin_ptr = cos_ptr + (Dr / 2); // sin at offset Dr/2 // Use traits for dtype-agnostic vectorized load using V2 = typename Vec2Traits::v2; @@ -220,16 +221,22 @@ __global__ void FusedRopeQuantizeKernelVec( } // Process Q_rope: paired rotation + vectorized quantize + write + // Each iteration processes 2 pairs: (c, c+1) and (c+2, c+3) for (int c = lane * 4; c < Dr; c += WARP_SIZE * 4) { - V2 h0 = *reinterpret_cast(qr + c + 0); - V2 h1 = *reinterpret_cast(qr + c + 2); + V2 h0 = *reinterpret_cast(qr + c + 0); // [c+0, c+1] + V2 h1 = *reinterpret_cast(qr + c + 2); // [c+2, c+3] float2 f0 = Vec2Traits::to_float2(h0); float2 f1 = Vec2Traits::to_float2(h1); - float c0 = cos_ptr[c + 0], s0 = sin_ptr[c + 0]; - float c1 = cos_ptr[c + 2], s1 = sin_ptr[c + 2]; - rope_rotate(f0.x, f0.y, c0, s0, is_neox); - rope_rotate(f1.x, f1.y, c1, s1, is_neox); + // CRITICAL: For GPT/interleaved RoPE, pair (2k, 2k+1) uses cos[k], NOT cos[2k] + // So: (qr[0],qr[1]) uses cos[0]; (qr[2],qr[3]) uses cos[1] + int base0 = (c + 0) >> 1; // pair index: c=0 → base=0, c=4 → base=2 + int base1 = (c + 2) >> 1; // pair index: c=0 → base=1, c=4 → base=3 + float c0 = cos_ptr[base0], s0 = sin_ptr[base0]; + float c1 = cos_ptr[base1], s1 = sin_ptr[base1]; + + rope_rotate(f0.x, f0.y, c0, s0, false); + rope_rotate(f1.x, f1.y, c1, s1, false); uint32_t packed = pack4( float_to_e4m3fn_byte(f0.x), float_to_e4m3fn_byte(f0.y), @@ -255,17 +262,21 @@ __global__ void FusedRopeQuantizeKernelVec( if (kvdst) *reinterpret_cast(kvdst + c) = packed; } - // Process K_rope: paired rotation + vectorized quantize + write to k_rope_out and/or KV buffer + // Process K_rope: paired rotation + vectorized quantize + write for (int c = lane * 4; c < Dr; c += WARP_SIZE * 4) { - V2 h0 = *reinterpret_cast(kr + c + 0); - V2 h1 = *reinterpret_cast(kr + c + 2); + V2 h0 = *reinterpret_cast(kr + c + 0); // [c+0, c+1] + V2 h1 = *reinterpret_cast(kr + c + 2); // [c+2, c+3] float2 f0 = Vec2Traits::to_float2(h0); float2 f1 = Vec2Traits::to_float2(h1); - float c0 = cos_ptr[c + 0], s0 = sin_ptr[c + 0]; - float c1 = cos_ptr[c + 2], s1 = sin_ptr[c + 2]; - rope_rotate(f0.x, f0.y, c0, s0, is_neox); - rope_rotate(f1.x, f1.y, c1, s1, is_neox); + // CRITICAL: Use pair index (same as Q_rope above) + int base0 = (c + 0) >> 1; + int base1 = (c + 2) >> 1; + float c0 = cos_ptr[base0], s0 = sin_ptr[base0]; + float c1 = cos_ptr[base1], s1 = sin_ptr[base1]; + + rope_rotate(f0.x, f0.y, c0, s0, false); + rope_rotate(f1.x, f1.y, c1, s1, false); uint32_t packed = pack4( float_to_e4m3fn_byte(f0.x), float_to_e4m3fn_byte(f0.y), @@ -298,9 +309,7 @@ __global__ void FusedRopeQuantizeKernelScalar( uint8_t* __restrict__ k_nope_out_fp8, uint8_t* __restrict__ k_rope_out_fp8, uint8_t* __restrict__ kv_buffer_bytes, - int64_t kv_stride_n_bytes, - int64_t kv_stride_m_bytes, // NEW: stride for page-internal row - int page_size, // NEW: page size for row offset calculation + int64_t kv_stride_row_bytes, // 2D: row stride in bytes const int64_t* __restrict__ kv_cache_loc ) { // Thread mapping: grid-stride loop over all (token, head) pairs @@ -313,8 +322,10 @@ __global__ void FusedRopeQuantizeKernelScalar( int head_id = global_row % num_heads; int pos = static_cast(pos_ids[token_id]); - const float* cos_ptr = cos_sin + size_t(pos) * (2 * Dr); - const float* sin_ptr = cos_ptr + Dr; + // CRITICAL FIX: cos_sin_cache is [max_pos, rope_dim] + // First half is cos, second half is 
sin + const float* cos_ptr = cos_sin + size_t(pos) * Dr; + const float* sin_ptr = cos_ptr + (Dr / 2); // ---- Quantize q ---- // q_out: [nope | rope] @@ -329,17 +340,32 @@ __global__ void FusedRopeQuantizeKernelScalar( float x = Vec2Traits::to_float(qn[i]); qdst[i] = float_to_e4m3fn_byte(x); } - // rope part (avoid OOB read when Dr is odd) - for (int i = 0; i < Dr; i += 2) { - float xr = Vec2Traits::to_float(qr[i + 0]); - float xi = 0.0f; - if (i + 1 < Dr) xi = Vec2Traits::to_float(qr[i + 1]); - float c = cos_ptr[i + 0]; - float s = sin_ptr[i + 0]; - // NeoX interleave is typically handled by reindexing; for demo we still rotate pairs. - rope_rotate(xr, xi, c, s, is_neox); - qdst[Dn + i + 0] = float_to_e4m3fn_byte(xr); - if (i + 1 < Dr) qdst[Dn + i + 1] = float_to_e4m3fn_byte(xi); + // rope part: handle GPT vs NeoX layout + if (!is_neox) { + // GPT/interleaved style: pair (2k, 2k+1) uses cos[k], NOT cos[2k] + for (int i = 0; i < Dr; i += 2) { + int base = i >> 1; // CRITICAL: pair index + float xr = Vec2Traits::to_float(qr[i + 0]); + float xi = 0.0f; + if (i + 1 < Dr) xi = Vec2Traits::to_float(qr[i + 1]); + float c = cos_ptr[base]; + float s = sin_ptr[base]; + rope_rotate(xr, xi, c, s, false); + qdst[Dn + i + 0] = float_to_e4m3fn_byte(xr); + if (i + 1 < Dr) qdst[Dn + i + 1] = float_to_e4m3fn_byte(xi); + } + } else { + // NeoX style: pairs (i, i+Dr/2) + int half = Dr / 2; + for (int i = 0; i < half; ++i) { + float xr = Vec2Traits::to_float(qr[i]); // real part + float xi = Vec2Traits::to_float(qr[i + half]); // imag part (second half) + float c = cos_ptr[i]; + float s = sin_ptr[i]; + rope_rotate(xr, xi, c, s, true); + qdst[Dn + i] = float_to_e4m3fn_byte(xr); + qdst[Dn + i + half] = float_to_e4m3fn_byte(xi); + } } } @@ -359,40 +385,69 @@ __global__ void FusedRopeQuantizeKernelScalar( } if (k_rope_out_fp8) { uint8_t* krd = k_rope_out_fp8 + size_t(token_id) * Dr; - for (int i = 0; i < Dr; i += 2) { - float xr = Vec2Traits::to_float(kr[i + 0]); - float xi = 0.0f; - if (i + 1 < Dr) xi = Vec2Traits::to_float(kr[i + 1]); - float c = cos_ptr[i + 0]; - float s = sin_ptr[i + 0]; - rope_rotate(xr, xi, c, s, is_neox); - krd[i + 0] = float_to_e4m3fn_byte(xr); - if (i + 1 < Dr) krd[i + 1] = float_to_e4m3fn_byte(xi); + if (!is_neox) { + // GPT/interleaved style: pair (2k, 2k+1) uses cos[k] + for (int i = 0; i < Dr; i += 2) { + int base = i >> 1; // CRITICAL: pair index + float xr = Vec2Traits::to_float(kr[i + 0]); + float xi = 0.0f; + if (i + 1 < Dr) xi = Vec2Traits::to_float(kr[i + 1]); + float c = cos_ptr[base]; + float s = sin_ptr[base]; + rope_rotate(xr, xi, c, s, false); + krd[i + 0] = float_to_e4m3fn_byte(xr); + if (i + 1 < Dr) krd[i + 1] = float_to_e4m3fn_byte(xi); + } + } else { + // NeoX style: pairs (i, i+Dr/2) + int half = Dr / 2; + for (int i = 0; i < half; ++i) { + float xr = Vec2Traits::to_float(kr[i]); + float xi = Vec2Traits::to_float(kr[i + half]); + float c = cos_ptr[i]; + float s = sin_ptr[i]; + rope_rotate(xr, xi, c, s, true); + krd[i] = float_to_e4m3fn_byte(xr); + krd[i + half] = float_to_e4m3fn_byte(xi); + } } } - // Fused direct KV write (if kv_buffer provided) - // CRITICAL FIX: write to correct row in page (pos % page_size), not always row 0 + // CRITICAL FIX: kv_cache_loc IS the flat row index (SGLang semantics) + // Do NOT use pos % page_size! That was causing KV to be written to wrong locations. 
if (kv_buffer_bytes && kv_cache_loc) { - int64_t slot = kv_cache_loc[token_id]; - int row = pos % page_size; // ⬅️ Token's row within page - uint8_t* dst = kv_buffer_bytes - + slot * kv_stride_n_bytes - + static_cast(row) * kv_stride_m_bytes; + int64_t flat_row = kv_cache_loc[token_id]; // This IS the row index + uint8_t* dst = kv_buffer_bytes + flat_row * kv_stride_row_bytes; // Write nope first for (int i = 0; i < Dn; ++i) { dst[i] = float_to_e4m3fn_byte(Vec2Traits::to_float(kn[i])); } - // Then rope with rotation (avoid OOB read when Dr is odd) - for (int i = 0; i < Dr; i += 2) { - float xr = Vec2Traits::to_float(kr[i + 0]); - float xi = 0.0f; - if (i + 1 < Dr) xi = Vec2Traits::to_float(kr[i + 1]); - float c = cos_ptr[i + 0]; - float s = sin_ptr[i + 0]; - rope_rotate(xr, xi, c, s, is_neox); - dst[Dn + i + 0] = float_to_e4m3fn_byte(xr); - if (i + 1 < Dr) dst[Dn + i + 1] = float_to_e4m3fn_byte(xi); + // Then rope with rotation: handle GPT vs NeoX + if (!is_neox) { + // GPT/interleaved style: pair (2k, 2k+1) uses cos[k] + for (int i = 0; i < Dr; i += 2) { + int base = i >> 1; // CRITICAL: pair index + float xr = Vec2Traits::to_float(kr[i + 0]); + float xi = 0.0f; + if (i + 1 < Dr) xi = Vec2Traits::to_float(kr[i + 1]); + float c = cos_ptr[base]; + float s = sin_ptr[base]; + rope_rotate(xr, xi, c, s, false); + dst[Dn + i + 0] = float_to_e4m3fn_byte(xr); + if (i + 1 < Dr) dst[Dn + i + 1] = float_to_e4m3fn_byte(xi); + } + } else { + // NeoX style: pairs (i, i+Dr/2) + int half = Dr / 2; + for (int i = 0; i < half; ++i) { + float xr = Vec2Traits::to_float(kr[i]); + float xi = Vec2Traits::to_float(kr[i + half]); + float c = cos_ptr[i]; + float s = sin_ptr[i]; + rope_rotate(xr, xi, c, s, true); + dst[Dn + i] = float_to_e4m3fn_byte(xr); + dst[Dn + i + half] = float_to_e4m3fn_byte(xi); + } } } } @@ -527,9 +582,7 @@ void mla_rope_quantize_fp8_fused( } uint8_t* kv_buf_ptr = nullptr; - int64_t kv_stride_n_bytes = 0; - int64_t kv_stride_m_bytes = 0; - int page_size = 0; + int64_t kv_stride_row_bytes = 0; const int64_t* kv_loc_ptr = nullptr; if (kv_buffer.has_value() || kv_cache_loc.has_value()) { TORCH_CHECK(kv_buffer.has_value() && kv_cache_loc.has_value(), @@ -537,22 +590,24 @@ void mla_rope_quantize_fp8_fused( auto kv = kv_buffer.value(); auto loc = kv_cache_loc.value(); CHECK_INPUT(kv); CHECK_INPUT(loc); - CHECK_DIM(3, kv); CHECK_DIM(1, loc); - TORCH_CHECK(kv.size(2) == (Dn + Dr), "kv_buffer last dim must be Dn+Dr"); + CHECK_DIM(1, loc); + + // CRITICAL FIX: Support 2D buffer [total_rows, kv_dim] (SGLang semantics) + // out_cache_loc is flat row index, NOT block ID + TORCH_CHECK(kv.dim() == 2, "kv_buffer must be 2D [total_rows, kv_dim]"); + TORCH_CHECK(kv.size(1) == (Dn + Dr), "kv_buffer last dim must be Dn+Dr"); TORCH_CHECK(loc.size(0) == nnz_k, "kv_cache_loc size must match K batch size"); // CRITICAL: Check contiguity on last dim to avoid silent errors - TORCH_CHECK(kv.stride(2) == 1, "kv_buffer last dim must be contiguous (stride=1)"); + TORCH_CHECK(kv.stride(1) == 1, "kv_buffer last dim must be contiguous (stride=1)"); // Accept FP8 tensor, reinterpret as uint8* kv_buf_ptr = reinterpret_cast(kv.data_ptr()); - // KV buffer layout: [num_blocks, page_size, kv_dim] - // stride(0): block stride in bytes (already uint8 elements = bytes) - // stride(1): page-internal row stride in bytes - page_size = kv.size(1); - kv_stride_n_bytes = kv.stride(0); // Block stride - kv_stride_m_bytes = kv.stride(1); // Row stride within page + // 2D KV buffer layout: [total_rows, kv_dim] + // stride(0): row stride in 
elements, convert to bytes + int elem_size = kv.element_size(); + kv_stride_row_bytes = kv.stride(0) * elem_size; kv_loc_ptr = loc.data_ptr(); } @@ -570,7 +625,8 @@ void mla_rope_quantize_fp8_fused( // Dispatch: use vectorized kernel if dimensions are 4-byte aligned // Vectorized path requires (Dn+Dr) % 4 == 0 for uint32_t writes - bool can_vectorize = ((Dn & 3) == 0) && ((Dr & 3) == 0); + // CRITICAL: Disable vectorization for NeoX (pairs are (i, i+Dr/2), not adjacent) + bool can_vectorize = ((Dn & 3) == 0) && ((Dr & 3) == 0) && !is_neox; if (can_vectorize) { // Additional check: q_out strides must be 4-byte aligned for vectorized writes @@ -586,6 +642,15 @@ void mla_rope_quantize_fp8_fused( // ===== Dtype dispatch: support both FP16 and BF16 ===== auto dtype = q_nope.scalar_type(); + // DEBUG: Print which kernel path we're using + const char* debug_path = std::getenv("SGL_DEBUG_KERNEL_PATH"); + if (debug_path && std::string(debug_path) == "1") { + printf("[KERNEL DEBUG] can_vectorize=%d, dtype=%s, Dn=%d, Dr=%d, is_neox=%d\n", + can_vectorize, + dtype == at::kHalf ? "FP16" : (dtype == at::kBFloat16 ? "BF16" : "OTHER"), + Dn, Dr, is_neox); + } + if (dtype == at::kHalf) { // FP16 path const __half* qn_ptr = reinterpret_cast(q_nope.data_ptr()); @@ -603,7 +668,7 @@ void mla_rope_quantize_fp8_fused( kn_ptr, kr_ptr, cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, q_out_ptr, qout_stride_tok_bytes, qout_stride_head_bytes, k_nope_out_ptr, k_rope_out_ptr, - kv_buf_ptr, kv_stride_n_bytes, kv_stride_m_bytes, page_size, kv_loc_ptr + kv_buf_ptr, kv_stride_row_bytes, kv_loc_ptr ); } else { constexpr int BLOCK_THREADS = 256; @@ -614,7 +679,7 @@ void mla_rope_quantize_fp8_fused( kn_ptr, kr_ptr, cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, q_out_ptr, qout_stride_tok_bytes, qout_stride_head_bytes, k_nope_out_ptr, k_rope_out_ptr, - kv_buf_ptr, kv_stride_n_bytes, kv_stride_m_bytes, page_size, kv_loc_ptr + kv_buf_ptr, kv_stride_row_bytes, kv_loc_ptr ); } } else if (dtype == at::kBFloat16) { @@ -634,7 +699,7 @@ void mla_rope_quantize_fp8_fused( kn_ptr, kr_ptr, cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, q_out_ptr, qout_stride_tok_bytes, qout_stride_head_bytes, k_nope_out_ptr, k_rope_out_ptr, - kv_buf_ptr, kv_stride_n_bytes, kv_stride_m_bytes, page_size, kv_loc_ptr + kv_buf_ptr, kv_stride_row_bytes, kv_loc_ptr ); } else { constexpr int BLOCK_THREADS = 256; @@ -645,7 +710,7 @@ void mla_rope_quantize_fp8_fused( kn_ptr, kr_ptr, cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, q_out_ptr, qout_stride_tok_bytes, qout_stride_head_bytes, k_nope_out_ptr, k_rope_out_ptr, - kv_buf_ptr, kv_stride_n_bytes, kv_stride_m_bytes, page_size, kv_loc_ptr + kv_buf_ptr, kv_stride_row_bytes, kv_loc_ptr ); } } else { From 665beaeb71e9aca7178b074c21088d3927ce147d Mon Sep 17 00:00:00 2001 From: Yangmin Li Date: Fri, 31 Oct 2025 10:02:03 -0700 Subject: [PATCH 03/10] run successfully parallel --- .../layers/attention/trtllm_mla_backend.py | 181 +++--------------- .../csrc/elementwise/mla_rope_fp8_kv_fused.cu | 32 +++- 2 files changed, 49 insertions(+), 164 deletions(-) diff --git a/python/sglang/srt/layers/attention/trtllm_mla_backend.py b/python/sglang/srt/layers/attention/trtllm_mla_backend.py index 9f25ab44df0..a6770a38ba3 100755 --- a/python/sglang/srt/layers/attention/trtllm_mla_backend.py +++ b/python/sglang/srt/layers/attention/trtllm_mla_backend.py @@ -678,11 +678,6 @@ def quantize_and_rope_for_fp8( - k_nope_out: [seq_len, num_heads, kv_lora_rank], dtype=torch.float8_e4m3fn or None (if 
fused) - k_rope_out: [seq_len, num_heads, qk_rope_head_dim], dtype=torch.float8_e4m3fn or None (if fused) """ - # Import at the beginning to avoid UnboundLocalError - import os - import torch - import sys - attn_dtype = torch.float8_e4m3fn q_len, num_heads = q_rope.shape[0], q_rope.shape[1] @@ -697,137 +692,52 @@ def quantize_and_rope_for_fp8( # Determine if we can use the fused kernel (RoPE + quantization + direct KV write) # Conditions: save_kv_cache, layer provided, not NSA packed format, kernel available - compute_cap = torch.cuda.get_device_capability() - allow_fused_env = os.getenv("SGL_MLA_FUSED", "1") == "1" kernel_available = mla_rope_quantize_fp8_fused is not None has_layer = layer is not None nsa_flag = getattr(forward_batch.token_to_kv_pool, "nsa_kv_cache_store_fp8", False) - # CRITICAL: Only enable fusion on SM90+ (H100/B200) - # SM<90 (A100/H800) has fallback FP8 conversion that's too simplified - # and causes severe quantization errors, leading to 0 accuracy - # TEMPORARY DEBUG: Check if we should disable fused for specific paths - in_decode = os.getenv("_SGL_IN_DECODE", "0") == "1" # Internal flag - allow_decode_fused = os.getenv("SGL_DECODE_FUSED", "0") == "1" - + # Only enable fusion on SM90+ (H100/B200) due to FP8 hardware support use_fused_kv_write = ( kernel_available - and allow_fused_env - and (compute_cap[0] >= 9) # Only SM90+ (H100/B200) + and (compute_cap[0] >= 9) and save_kv_cache and has_layer and not nsa_flag - and (not in_decode or allow_decode_fused) # Can disable decode separately ) - - # DEBUG: Basic branch tracking - if use_fused_kv_write: - sys.stderr.write(f"[MLA FUSION] ✅ Using fused kernel (layer={layer.layer_id}, tokens={q_len}, SM={compute_cap[0]}{compute_cap[1]})\n") - elif not kernel_available: - sys.stderr.write(f"[MLA FUSION] ❌ Kernel not available, using standard path\n") - elif compute_cap[0] < 9: - sys.stderr.write(f"[MLA FUSION] ⚠️ SM{compute_cap[0]}{compute_cap[1]} < SM90, using standard path (FP8 fallback not reliable)\n") - sys.stderr.flush() if use_fused_kv_write: # Fused path: RoPE + quantization + direct KV cache write # This eliminates the intermediate write/read of k_nope_out/k_rope_out - # CRITICAL FIX: Use 2D buffer with flat row indexing (SGLang semantics) - # out_cache_loc IS the flat row index, NOT a block ID - # Do NOT reshape to 3D or use pos % page_size + # Get KV buffer (2D: [total_rows, kv_dim]) k_cache_raw = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id) - - # get_key_buffer returns (size+page_size, 1, kv_dim), squeeze to 2D if k_cache_raw.dim() == 3: - assert k_cache_raw.size(1) == 1, f"Expected middle dim=1, got {k_cache_raw.shape}" - kv_buffer = k_cache_raw.squeeze(1) # [total_rows, kv_dim] + kv_buffer = k_cache_raw.squeeze(1) else: - kv_buffer = k_cache_raw # Already 2D + kv_buffer = k_cache_raw # Prepare cache locations and positions as int64 contiguous kv_loc = forward_batch.out_cache_loc.to(torch.int64).contiguous() positions = forward_batch.positions.to(torch.int64).contiguous() - # Assertions to catch issues early - assert kv_buffer.dim() == 2, f"kv_buffer must be 2D, got shape {kv_buffer.shape}" - assert kv_buffer.size(1) == self.kv_cache_dim, f"kv_buffer last dim must be {self.kv_cache_dim}, got {kv_buffer.size(1)}" - assert kv_buffer.size(1) == (self.kv_lora_rank + self.qk_rope_head_dim), \ - f"kv_buffer dim mismatch: expect {self.kv_lora_rank + self.qk_rope_head_dim}, got {kv_buffer.size(1)}" - assert (self.qk_rope_head_dim % 2) == 0, "RoPE dim (Dr) must be even for pairing" - assert kv_loc.dim() 
== 1, f"kv_loc must be 1D, got {kv_loc.dim()}" - assert kv_loc.numel() == q_len, f"kv_loc size {kv_loc.numel()} must match token count {q_len}" - assert q_nope.dtype in (torch.float16, torch.bfloat16), f"Unsupported dtype: {q_nope.dtype}" - assert k_nope.dtype == q_nope.dtype, "Q and K dtype mismatch" - # CRITICAL: Check cos_sin_cache dimension - assert cos_sin_cache.size(1) == self.qk_rope_head_dim, \ - f"cos_sin_cache second dim must be rope_dim={self.qk_rope_head_dim}, got {cos_sin_cache.size(1)}" - # Check all kv_loc values are within valid range - assert kv_loc.min() >= 0 and kv_loc.max() < kv_buffer.size(0), \ - f"kv_loc out of range: min={kv_loc.min()}, max={kv_loc.max()}, buffer_size={kv_buffer.size(0)}" - # Sanity check: positions should be reasonable - assert positions.min() >= 0, f"Invalid positions: min={positions.min()}" - # Check tensor contiguity - assert kv_buffer.is_contiguous() or kv_buffer.stride(-1) == 1, "kv_buffer last dim must be contiguous" - - # DEBUG: Check for potential issues with KV write - debug_kv = os.getenv("SGL_DEBUG_KV_WRITE", "0") == "1" - if debug_kv: - sys.stderr.write(f"[KV DEBUG] layer={layer.layer_id}, tokens={q_len}, " - f"kv_loc={kv_loc.cpu().numpy()}, pos={positions.cpu().numpy()}, " - f"kv_buffer.data_ptr={kv_buffer.data_ptr():#x}, " - f"kv_buffer.size(0)={kv_buffer.size(0)}\n") - sys.stderr.flush() - - # Verify buffer is the same as what will be read later - k_cache_verify = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id) - if k_cache_verify.dim() == 3: - k_cache_verify = k_cache_verify.squeeze(1) - assert k_cache_verify.data_ptr() == kv_buffer.data_ptr(), \ - f"KV buffer address changed! Expected {kv_buffer.data_ptr():#x}, got {k_cache_verify.data_ptr():#x}" - - # Debug output - sys.stderr.write(f"[FUSED DEBUG] kv.shape={tuple(kv_buffer.shape)}, loc.shape={tuple(kv_loc.shape)}, " - f"q_nope.shape={tuple(q_nope.shape)}, dtype={q_nope.dtype}\n") - # DEBUG: Save inputs/outputs for offline comparison - debug_save = os.getenv("SGL_DEBUG_SAVE_ROPE", "0") == "1" - if debug_save and layer.layer_id == 0: # Only save first layer - torch.save({ - 'q_nope': q_nope.cpu(), - 'q_rope': q_rope.cpu(), - 'k_nope': k_nope.cpu(), - 'k_rope': k_rope.cpu(), - 'cos_sin_cache': cos_sin_cache.cpu(), - 'positions': positions.cpu(), - 'is_neox': is_neox, - }, '/tmp/rope_inputs.pt') - sys.stderr.write(f"[DEBUG] Saved inputs to /tmp/rope_inputs.pt\n") - sys.stderr.flush() - - # Call fused kernel - # Note: Kernel now natively supports both FP16 and BF16 via C++ templates - # Note: kernel accepts 2D or 3D Q inputs (handles stride properly) - # Note: Pass FP8 tensors as-is, C++ will reinterpret_cast to uint8* - # K inputs must be 2D + # Call fused kernel: RoPE + quantize + write KV cache mla_rope_quantize_fp8_fused( - q_nope, # [nnz, num_heads, Dn] or [nnz, Dn] - kernel handles both, FP16/BF16 - q_rope, # [nnz, num_heads, Dr] or [nnz, Dr] - kernel handles both, FP16/BF16 - k_nope, # [nnz, Dn] - already 2D from squeeze(1), FP16/BF16 - k_rope, # [nnz, Dr] - already 2D from squeeze(1), FP16/BF16 + q_nope, + q_rope, + k_nope, + k_rope, cos_sin_cache, - positions, # int64 contiguous + positions, is_neox, - q_out, # FP8 tensor, C++ will reinterpret as uint8* + q_out, None, # k_nope_out - skip intermediate output None, # k_rope_out - skip intermediate output - kv_buffer, # 2D FP8 tensor [total_rows, kv_dim] - kv_loc, # int64 flat row indices + kv_buffer, + kv_loc, ) # Return Q output and None for K outputs (already written to cache) - sys.stderr.write(f"[FUSED] 
Returning from fused path: q_out.shape={q_out.shape}, k=None, k_rope=None\n") - sys.stderr.flush() return q_out, None, None else: # Standard path: RoPE + quantization only (backward compatible) @@ -835,24 +745,7 @@ def quantize_and_rope_for_fp8( k_rope_out = k_rope.new_empty(k_rope.shape, dtype=attn_dtype) k_nope_out = k_nope.new_empty(k_nope.shape, dtype=attn_dtype) - # DEBUG: Save inputs for offline comparison - debug_save = os.getenv("SGL_DEBUG_SAVE_ROPE", "0") == "1" - if debug_save: - torch.save({ - 'q_nope': q_nope.cpu(), - 'q_rope': q_rope.cpu(), - 'k_nope': k_nope.cpu(), - 'k_rope': k_rope.cpu(), - 'cos_sin_cache': cos_sin_cache.cpu(), - 'positions': forward_batch.positions.cpu(), - 'is_neox': is_neox, - }, '/tmp/flashinfer_rope_inputs.pt') - # Apply RoPE and quantize all components in a single kernel call - # This kernel handles: - # 1. RoPE application to q_rope and k_rope using cos_sin_cache and positions - # 2. Quantization of all components to FP8 format - # 3. Output placement into pre-allocated tensors flashinfer.rope.mla_rope_quantize_fp8( q_rope=q_rope, k_rope=k_rope, @@ -970,18 +863,15 @@ def forward_decode( """Run forward for decode using TRTLLM MLA kernel.""" merge_query = q_rope is not None fused_kv = False # Track if using fused KV write path + if self.data_type == torch.float8_e4m3fn: - # For FP8 path, we quantize the query and rope parts and merge them into a single tensor - # Note: rope application in deepseek_v2.py:forward_absorb_prepare is skipped for FP8 decode path of this trtllm_mla backend + # For FP8 path, quantize and apply RoPE assert all( x is not None for x in [q_rope, k_rope, cos_sin_cache] - ), "For FP8 path and using flashinfer.rope.mla_rope_quantize we need all of q_rope, k_rope and cos_sin_cache to be not None." + ), "For FP8 path we need all of q_rope, k_rope and cos_sin_cache to be not None." - # Call quantize_and_rope_for_fp8 with layer and save_kv_cache parameters + # Call quantize_and_rope_for_fp8 with layer and save_kv_cache # This enables the fused kernel path when conditions are met - # TEMPORARY: Disable fused in decode for debugging - import os - allow_decode_fused = os.getenv("SGL_DECODE_FUSED", "0") == "1" q, k, k_rope = self.quantize_and_rope_for_fp8( q, q_rope, @@ -990,19 +880,14 @@ def forward_decode( forward_batch, cos_sin_cache, is_neox, - layer=layer if allow_decode_fused else None, # Disable fused by passing None + layer=layer, save_kv_cache=save_kv_cache, ) merge_query = False # Check if fused path was used (K outputs are None when directly written to KV cache) fused_kv = (k is None and k_rope is None) - if fused_kv: - import sys - sys.stderr.write(f"[DECODE] ✅ Fused path used\n") - sys.stderr.flush() # Save KV cache if requested (only if not using fused path) - # When k or k_rope is None, it means the fused kernel already wrote to KV cache if save_kv_cache and not fused_kv and k is not None and k_rope is not None: forward_batch.token_to_kv_pool.set_mla_kv_buffer( layer, forward_batch.out_cache_loc, k, k_rope @@ -1081,15 +966,10 @@ def forward_extend( cos_sin_cache: Optional[torch.Tensor] = None, is_neox: Optional[bool] = False, ) -> torch.Tensor: - # TODO refactor to avoid code duplication merge_query = q_rope is not None fused_kv = False # Track if we used fused KV write path - # For FP8 path, check if we should use quantize_and_rope_for_fp8 - # Conditions: - # 1. Using FP8 data type - # 2. In target_verify or draft_extend mode - # 3. 
All rope-related parameters are available (not None) + # For FP8 path in target_verify or draft_extend mode, use quantize_and_rope_for_fp8 use_fp8_quantize = ( self.data_type == torch.float8_e4m3fn and ( @@ -1100,17 +980,12 @@ def forward_extend( ) if use_fp8_quantize: - # For FP8 path, we quantize the query and rope parts and merge them into a single tensor - # Note: rope application in deepseek_v2.py:forward_absorb_prepare is skipped for FP8 decode path of this trtllm_mla backend - - # Call quantize_and_rope_for_fp8 with layer and save_kv_cache parameters - # This enables the fused kernel path when conditions are met - # NOTE: Do NOT squeeze Q - fused kernel supports 3D Q: [nnz, num_heads, dim] + # FP8 path: quantize and apply RoPE with optional fused KV write q, k_fp8_nope, k_fp8_rope = self.quantize_and_rope_for_fp8( - q, # ✅ Keep Q as is (supports both 2D and 3D) - q_rope, # ✅ Keep Q_rope as is - k.squeeze(1) if k.dim() == 3 else k, # K must be 2D - k_rope.squeeze(1) if k_rope.dim() == 3 else k_rope, # K_rope must be 2D + q, + q_rope, + k.squeeze(1) if k.dim() == 3 else k, + k_rope.squeeze(1) if k_rope.dim() == 3 else k_rope, forward_batch, cos_sin_cache, is_neox, @@ -1120,16 +995,11 @@ def forward_extend( merge_query = False # Fused path: K already written to KV cache, function returns None for K outputs fused_kv = (k_fp8_nope is None and k_fp8_rope is None) - if fused_kv: - import sys - sys.stderr.write(f"[EXTEND] ✅ Fused path used\n") - sys.stderr.flush() # Update local variables for subsequent logic k = k_fp8_nope k_rope = k_fp8_rope # Save KV cache if requested (only if not using fused path) - # When k or k_rope is None, it means the fused kernel already wrote to KV cache if save_kv_cache and not fused_kv and k is not None and k_rope is not None: forward_batch.token_to_kv_pool.set_mla_kv_buffer( layer, forward_batch.out_cache_loc, k, k_rope @@ -1197,7 +1067,6 @@ def forward_extend( q, padded_q, metadata.seq_lens_q, metadata.cu_seqlens_q ) - # TODO may use `mla_rope_quantize_fp8` fusion q = q.view(bs, -1, layer.tp_q_head_num, layer.head_dim) assert kv_cache.dtype == self.data_type diff --git a/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu b/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu index c45e7205c3b..ff580253714 100644 --- a/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu +++ b/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu @@ -148,6 +148,8 @@ __global__ void FusedRopeQuantizeKernelVec( int64_t qr_stride_tok, int64_t qr_stride_head, // Q_rope strides in elements const T* __restrict__ k_nope, const T* __restrict__ k_rope, + int64_t kn_stride_tok, // NEW: K_nope stride(0) in elements + int64_t kr_stride_tok, // NEW: K_rope stride(0) in elements const float* __restrict__ cos_sin, const int64_t* __restrict__ pos_ids, int nnz, int num_heads, int Dn, int Dr, @@ -177,8 +179,9 @@ __global__ void FusedRopeQuantizeKernelVec( const T* qr = q_rope + size_t(token_id) * qr_stride_tok + size_t(head_id) * qr_stride_head; // K is always 2D: [nnz_tokens, dim] - const T* kn = k_nope + size_t(token_id) * Dn; - const T* kr = k_rope + size_t(token_id) * Dr; + // CRITICAL FIX: Use actual stride(0) instead of assuming Dn/Dr (handles non-contiguous K) + const T* kn = k_nope + size_t(token_id) * kn_stride_tok; + const T* kr = k_rope + size_t(token_id) * kr_stride_tok; // Q output using byte strides uint8_t* qdst = q_out_fp8 + size_t(token_id) * qout_stride_tok_bytes + size_t(head_id) * qout_stride_head_bytes; @@ -300,6 +303,8 @@ __global__ void FusedRopeQuantizeKernelScalar( 
int64_t qr_stride_tok, int64_t qr_stride_head, const T* __restrict__ k_nope, const T* __restrict__ k_rope, + int64_t kn_stride_tok, // NEW: K_nope stride(0) in elements + int64_t kr_stride_tok, // NEW: K_rope stride(0) in elements const float* __restrict__ cos_sin, const int64_t* __restrict__ pos_ids, int nnz, int num_heads, int Dn, int Dr, @@ -371,8 +376,9 @@ __global__ void FusedRopeQuantizeKernelScalar( // ---- Quantize k & optional fused KV write ---- // K is always 2D: [nnz_tokens, dim] - const T* kn = k_nope + size_t(token_id) * Dn; - const T* kr = k_rope + size_t(token_id) * Dr; + // CRITICAL FIX: Use actual stride(0) instead of assuming Dn/Dr (handles non-contiguous K) + const T* kn = k_nope + size_t(token_id) * kn_stride_tok; + const T* kr = k_rope + size_t(token_id) * kr_stride_tok; // Optional: write k_nope_out / k_rope_out (only once per token, not per head) // Note: K outputs are 2D [nnz_tokens, dim], so only first head processes them @@ -542,6 +548,12 @@ void mla_rope_quantize_fp8_fused( TORCH_CHECK(k_rope.size(0) == nnz_k && k_rope.size(1) == Dr, "k_rope shape mismatch"); TORCH_CHECK(nnz_k == nnz_tokens, "K batch size must match Q token count"); + // ===== K strides (CRITICAL FIX: use stride(0) to handle non-contiguous batches) ===== + // In multi-req concurrent scenarios, K may have stride(0) != dim due to slicing/gather + // We MUST use the actual stride(0) instead of assuming Dn/Dr for correct row addressing + int64_t kn_stride_tok = k_nope.stride(0); // elements per token + int64_t kr_stride_tok = k_rope.stride(0); // elements per token + // ===== Robustness checks (expert suggestions) ===== // 1. K must be contiguous on last dim (or explicitly handle stride) @@ -665,7 +677,8 @@ void mla_rope_quantize_fp8_fused( FusedRopeQuantizeKernelVec<<>>( qn_ptr, qr_ptr, qn_stride_tok, qn_stride_head, qr_stride_tok, qr_stride_head, - kn_ptr, kr_ptr, cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, + kn_ptr, kr_ptr, kn_stride_tok, kr_stride_tok, + cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, q_out_ptr, qout_stride_tok_bytes, qout_stride_head_bytes, k_nope_out_ptr, k_rope_out_ptr, kv_buf_ptr, kv_stride_row_bytes, kv_loc_ptr @@ -676,7 +689,8 @@ void mla_rope_quantize_fp8_fused( FusedRopeQuantizeKernelScalar<<>>( qn_ptr, qr_ptr, qn_stride_tok, qn_stride_head, qr_stride_tok, qr_stride_head, - kn_ptr, kr_ptr, cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, + kn_ptr, kr_ptr, kn_stride_tok, kr_stride_tok, + cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, q_out_ptr, qout_stride_tok_bytes, qout_stride_head_bytes, k_nope_out_ptr, k_rope_out_ptr, kv_buf_ptr, kv_stride_row_bytes, kv_loc_ptr @@ -696,7 +710,8 @@ void mla_rope_quantize_fp8_fused( FusedRopeQuantizeKernelVec<<>>( qn_ptr, qr_ptr, qn_stride_tok, qn_stride_head, qr_stride_tok, qr_stride_head, - kn_ptr, kr_ptr, cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, + kn_ptr, kr_ptr, kn_stride_tok, kr_stride_tok, + cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, q_out_ptr, qout_stride_tok_bytes, qout_stride_head_bytes, k_nope_out_ptr, k_rope_out_ptr, kv_buf_ptr, kv_stride_row_bytes, kv_loc_ptr @@ -707,7 +722,8 @@ void mla_rope_quantize_fp8_fused( FusedRopeQuantizeKernelScalar<<>>( qn_ptr, qr_ptr, qn_stride_tok, qn_stride_head, qr_stride_tok, qr_stride_head, - kn_ptr, kr_ptr, cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, + kn_ptr, kr_ptr, kn_stride_tok, kr_stride_tok, + cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, q_out_ptr, qout_stride_tok_bytes, 
qout_stride_head_bytes, k_nope_out_ptr, k_rope_out_ptr, kv_buf_ptr, kv_stride_row_bytes, kv_loc_ptr From 5f95f7a45bd97e6c0ef40bfc4090a65827adb118 Mon Sep 17 00:00:00 2001 From: Yangmin Li Date: Fri, 31 Oct 2025 12:40:15 -0700 Subject: [PATCH 04/10] small code adjustment --- python/sglang/srt/layers/attention/trtllm_mla_backend.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/attention/trtllm_mla_backend.py b/python/sglang/srt/layers/attention/trtllm_mla_backend.py index a6770a38ba3..471da0c84e7 100755 --- a/python/sglang/srt/layers/attention/trtllm_mla_backend.py +++ b/python/sglang/srt/layers/attention/trtllm_mla_backend.py @@ -865,10 +865,11 @@ def forward_decode( fused_kv = False # Track if using fused KV write path if self.data_type == torch.float8_e4m3fn: - # For FP8 path, quantize and apply RoPE + # For FP8 path, we quantize the query and rope parts and merge them into a single tensor + # Note: rope application in deepseek_v2.py:forward_absorb_prepare is skipped for FP8 decode path of this trtllm_mla backend assert all( x is not None for x in [q_rope, k_rope, cos_sin_cache] - ), "For FP8 path we need all of q_rope, k_rope and cos_sin_cache to be not None." + ), "For FP8 path and using flashinfer.rope.mla_rope_quantize we need all of q_rope, k_rope and cos_sin_cache to be not None." # Call quantize_and_rope_for_fp8 with layer and save_kv_cache # This enables the fused kernel path when conditions are met @@ -966,9 +967,12 @@ def forward_extend( cos_sin_cache: Optional[torch.Tensor] = None, is_neox: Optional[bool] = False, ) -> torch.Tensor: + # TODO refactor to avoid code duplication for forward_decode merge_query = q_rope is not None fused_kv = False # Track if we used fused KV write path + # TODO: Check if the condition restrictions (target_verify/draft_extend only) are necessary + # Consider if we can enable FP8 quantize_and_rope for all extend paths safely # For FP8 path in target_verify or draft_extend mode, use quantize_and_rope_for_fp8 use_fp8_quantize = ( self.data_type == torch.float8_e4m3fn From 1cad1b289861eecad15e21da4f58378e6800d2e4 Mon Sep 17 00:00:00 2001 From: Yangmin Li Date: Sat, 1 Nov 2025 08:57:44 -0700 Subject: [PATCH 05/10] minor fix --- .../layers/attention/trtllm_mla_backend.py | 14 ++++++++---- sgl-kernel/build_minimal.sh | 22 +++++++++---------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/python/sglang/srt/layers/attention/trtllm_mla_backend.py b/python/sglang/srt/layers/attention/trtllm_mla_backend.py index 471da0c84e7..9658d2688fd 100755 --- a/python/sglang/srt/layers/attention/trtllm_mla_backend.py +++ b/python/sglang/srt/layers/attention/trtllm_mla_backend.py @@ -889,7 +889,10 @@ def forward_decode( fused_kv = (k is None and k_rope is None) # Save KV cache if requested (only if not using fused path) - if save_kv_cache and not fused_kv and k is not None and k_rope is not None: + if save_kv_cache and not fused_kv: + assert ( + k is not None and k_rope is not None + ), "For populating trtllm_mla kv cache, both k_nope and k_rope should be not None." 
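# Conceptual aside (not part of the patch): when fused_kv is False, the
# set_mla_kv_buffer call below effectively performs a concat-and-scatter of the
# already-quantized k_nope / k_rope into flat rows of the MLA cache, which is the
# extra pass over K that the fused kernel avoids. A minimal sketch of that effect,
# using a uint8 byte buffer for illustration (the real cache holds float8_e4m3fn
# values and the helper name here is hypothetical):
import torch

def _scatter_k_into_cache(kv_bytes, out_cache_loc, k_nope_bytes, k_rope_bytes):
    # kv_bytes: [total_rows, Dn + Dr] uint8; out_cache_loc: [nnz] int64 row indices
    dn = k_nope_bytes.shape[-1]
    kv_bytes[out_cache_loc, :dn] = k_nope_bytes
    kv_bytes[out_cache_loc, dn:] = k_rope_bytes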
forward_batch.token_to_kv_pool.set_mla_kv_buffer( layer, forward_batch.out_cache_loc, k, k_rope ) @@ -988,8 +991,8 @@ def forward_extend( q, k_fp8_nope, k_fp8_rope = self.quantize_and_rope_for_fp8( q, q_rope, - k.squeeze(1) if k.dim() == 3 else k, - k_rope.squeeze(1) if k_rope.dim() == 3 else k_rope, + k.squeeze(1), + k_rope.squeeze(1), forward_batch, cos_sin_cache, is_neox, @@ -1004,7 +1007,10 @@ def forward_extend( k_rope = k_fp8_rope # Save KV cache if requested (only if not using fused path) - if save_kv_cache and not fused_kv and k is not None and k_rope is not None: + if save_kv_cache and not fused_kv: + assert ( + k is not None and k_rope is not None + ), "For populating trtllm_mla kv cache, both k_nope and k_rope should be not None." forward_batch.token_to_kv_pool.set_mla_kv_buffer( layer, forward_batch.out_cache_loc, k, k_rope ) diff --git a/sgl-kernel/build_minimal.sh b/sgl-kernel/build_minimal.sh index d5805f5371a..8aaa32247e4 100755 --- a/sgl-kernel/build_minimal.sh +++ b/sgl-kernel/build_minimal.sh @@ -1,23 +1,23 @@ #!/bin/bash # Minimal build script for sgl-kernel (only compile needed architectures) -# 清理之前的编译 +# Clean previous build artifacts rm -rf _skbuild/ build/ *.egg-info -# 设置编译选项 -export CMAKE_BUILD_PARALLEL_LEVEL=2 # 减少并行度,避免 OOM -export SGL_KERNEL_COMPILE_THREADS=4 # NVCC 线程数 +# Configure build options +export CMAKE_BUILD_PARALLEL_LEVEL=2 # Reduce parallelism to avoid OOM +export SGL_KERNEL_COMPILE_THREADS=4 # NVCC thread count -# 只编译 B200 (SM100) 的架构,跳过其他 -# 你的 B200 是 compute_100,不需要 SM80/89/90 -export ENABLE_BELOW_SM90=OFF # 跳过 A100/V100 等旧架构 -export SGL_KERNEL_ENABLE_FA3=OFF # 跳过 Flash-Attention 3 (SM90a) -export SGL_KERNEL_ENABLE_SM90A=OFF # 跳过 SM90A +# Only compile for B200 (SM100) architecture, skip others +# B200 is compute_100, no need for SM80/89/90 +export ENABLE_BELOW_SM90=OFF # Skip older architectures (A100/V100 etc.) +export SGL_KERNEL_ENABLE_FA3=OFF # Skip Flash-Attention 3 (SM90a) +export SGL_KERNEL_ENABLE_SM90A=OFF # Skip SM90A -# 只保留 SM100 +# Only keep SM100 export TORCH_CUDA_ARCH_LIST="10.0" -# 开始编译 +# Start compilation pip install -e . --no-build-isolation -v 2>&1 | tee build.log echo "" From 122ffd6afb9d07e42d85e525ede7f579db6e8d93 Mon Sep 17 00:00:00 2001 From: Yangmin Li Date: Sat, 1 Nov 2025 16:55:06 -0700 Subject: [PATCH 06/10] update unit tests --- sgl-kernel/tests/test_mla_rope_fp8_fused.py | 283 ++++++++++++++++++++ test/srt/test_mla_fp8_fused_kv.py | 79 ------ 2 files changed, 283 insertions(+), 79 deletions(-) create mode 100644 sgl-kernel/tests/test_mla_rope_fp8_fused.py delete mode 100644 test/srt/test_mla_fp8_fused_kv.py diff --git a/sgl-kernel/tests/test_mla_rope_fp8_fused.py b/sgl-kernel/tests/test_mla_rope_fp8_fused.py new file mode 100644 index 00000000000..876c365ce3a --- /dev/null +++ b/sgl-kernel/tests/test_mla_rope_fp8_fused.py @@ -0,0 +1,283 @@ +# -*- coding: utf-8 -*- +""" +PyTest: correctness of fused MLA RoPE + FP8 quantization + KV write. +Tests the mla_rope_quantize_fp8_fused kernel from sgl_kernel extension. + +Tests both: +1. Baseline path (kernel writes to k_nope_out/k_rope_out) +2. 
Fused path (kernel directly writes to KV cache buffer) +""" +import itertools + +import pytest +import torch + +try: + # Try standalone build first + from mla_fusion_kernel import mla_rope_quantize_fp8_fused + _has_sgl_kernel = True +except ImportError: + # Fallback to sgl_kernel + try: + from sgl_kernel import mla_rope_quantize_fp8_fused + _has_sgl_kernel = True + except ImportError: + mla_rope_quantize_fp8_fused = None # Will use non-fused path + _has_sgl_kernel = False + +requires_ext = pytest.mark.skipif( + not _has_sgl_kernel, reason="sgl_kernel extension not available" +) + + +@requires_ext +@pytest.mark.parametrize( + "nnz,num_heads,Dn,Dr,dtype", + list( + itertools.product( + [64, 256], # nnz: number of tokens + [1, 8], # num_heads: 1 for 2D Q, 8 for 3D Q + [512], # Dn: nope dimension + [64], # Dr: rope dimension + [torch.float16, torch.bfloat16], # dtypes + ) + ), +) +def test_fused_matches_baseline(nnz, num_heads, Dn, Dr, dtype): + """Test that fused KV write produces same results as baseline.""" + device = "cuda" + torch.manual_seed(42) + + # Create inputs based on whether we're testing 2D or 3D Q + if num_heads == 1: + # 2D case: [nnz, dim] + q_nope = torch.randn(nnz, Dn, device=device, dtype=dtype) + q_rope = torch.randn(nnz, Dr, device=device, dtype=dtype) + q_out_shape = (nnz, Dn + Dr) + else: + # 3D case: [nnz, num_heads, dim] + q_nope = torch.randn(nnz, num_heads, Dn, device=device, dtype=dtype) + q_rope = torch.randn(nnz, num_heads, Dr, device=device, dtype=dtype) + q_out_shape = (nnz, num_heads, Dn + Dr) + + # K is always 2D regardless of Q shape + k_nope = torch.randn(nnz, Dn, device=device, dtype=dtype) + k_rope = torch.randn(nnz, Dr, device=device, dtype=dtype) + + # Create cos/sin cache + max_seq = max(2048, nnz) + t = torch.linspace(0, 1, steps=max_seq, device=device, dtype=torch.float32)[ + :, None + ] + idx = torch.arange(Dr, device=device, dtype=torch.float32)[None, :] + freqs = 0.1 * (idx + 1.0) # Small frequencies to avoid overflow + cos = torch.cos(t * freqs) + sin = torch.sin(t * freqs) + cos_sin = torch.cat([cos, sin], dim=1) # [max_seq, 2*Dr] + + # Random position IDs + pos_ids = torch.randint( + low=0, high=max_seq, size=(nnz,), device=device, dtype=torch.long + ) + + # ======================================================================== + # BASELINE PATH: Write to k_nope_out/k_rope_out, then concat manually + # ======================================================================== + q_out_base = torch.empty(q_out_shape, device=device, dtype=torch.uint8) + k_nope_out = torch.empty(nnz, Dn, device=device, dtype=torch.uint8) + k_rope_out = torch.empty(nnz, Dr, device=device, dtype=torch.uint8) + + mla_rope_quantize_fp8_fused( + q_nope, + q_rope, + k_nope, + k_rope, + cos_sin, + pos_ids, + False, # is_neox + q_out_base, + k_nope_out, + k_rope_out, + None, # kv_buffer + None, # kv_cache_loc + ) + + # Manually concat K parts into KV buffer (simulating set_mla_kv_buffer) + slots = nnz + 8 # Add some extra slots + kv_base = torch.zeros(slots, Dn + Dr, device=device, dtype=torch.uint8) # 2D format + loc = torch.arange(nnz, device=device, dtype=torch.long) + kv_base[loc, :Dn] = k_nope_out + kv_base[loc, Dn:] = k_rope_out + + # ======================================================================== + # FUSED PATH: Direct KV write, skip separate K outputs + # ======================================================================== + q_out_fused = torch.empty_like(q_out_base) + kv_fused = torch.zeros_like(kv_base) + + mla_rope_quantize_fp8_fused( + q_nope, + 
q_rope, + k_nope, + k_rope, + cos_sin, + pos_ids, + False, # is_neox + q_out_fused, + None, # k_nope_out + None, # k_rope_out + kv_fused, # Direct write to KV buffer + loc, # kv_cache_loc + ) + + # ======================================================================== + # ASSERTIONS + # ======================================================================== + # For FP8 quantized outputs, we can't expect perfect match due to + # rounding, but they should be very close when converted back to float + torch.testing.assert_close( + q_out_base.float(), + q_out_fused.float(), + rtol=1e-3, + atol=1e-3, + msg=f"q_out mismatch for {dtype=}, {num_heads=}", + ) + + torch.testing.assert_close( + kv_base.float(), + kv_fused.float(), + rtol=1e-3, + atol=1e-3, + msg=f"KV buffer mismatch for {dtype=}, {num_heads=}", + ) + + # For stricter check on used slots (should be exact) + assert torch.equal( + kv_base[loc], kv_fused[loc] + ), f"Used KV slots must match exactly for {dtype=}, {num_heads=}" + + +@requires_ext +@pytest.mark.parametrize("nnz,Dn,Dr", [(128, 512, 64), (1024, 512, 64)]) +def test_baseline_only_path(nnz, Dn, Dr): + """Test that baseline path (without KV buffer) works correctly.""" + device = "cuda" + dtype = torch.float16 + torch.manual_seed(42) + + # 2D inputs + q_nope = torch.randn(nnz, Dn, device=device, dtype=dtype) + q_rope = torch.randn(nnz, Dr, device=device, dtype=dtype) + k_nope = torch.randn(nnz, Dn, device=device, dtype=dtype) + k_rope = torch.randn(nnz, Dr, device=device, dtype=dtype) + + # Create cos/sin cache + max_seq = 2048 + t = torch.linspace(0, 1, steps=max_seq, device=device, dtype=torch.float32)[ + :, None + ] + idx = torch.arange(Dr, device=device, dtype=torch.float32)[None, :] + freqs = 0.1 * (idx + 1.0) + cos = torch.cos(t * freqs) + sin = torch.sin(t * freqs) + cos_sin = torch.cat([cos, sin], dim=1) + + pos_ids = torch.randint( + low=0, high=max_seq, size=(nnz,), device=device, dtype=torch.long + ) + + # Allocate outputs + q_out = torch.empty(nnz, Dn + Dr, device=device, dtype=torch.uint8) + k_nope_out = torch.empty(nnz, Dn, device=device, dtype=torch.uint8) + k_rope_out = torch.empty(nnz, Dr, device=device, dtype=torch.uint8) + + # Call kernel (baseline path only) + mla_rope_quantize_fp8_fused( + q_nope, + q_rope, + k_nope, + k_rope, + cos_sin, + pos_ids, + False, + q_out, + k_nope_out, + k_rope_out, + None, # No KV buffer + None, # No kv_cache_loc + ) + + # Basic sanity checks + assert q_out.shape == (nnz, Dn + Dr) + assert k_nope_out.shape == (nnz, Dn) + assert k_rope_out.shape == (nnz, Dr) + + # Check that outputs are not all zeros (actual quantization happened) + assert q_out.abs().sum() > 0, "q_out should not be all zeros" + assert k_nope_out.abs().sum() > 0, "k_nope_out should not be all zeros" + assert k_rope_out.abs().sum() > 0, "k_rope_out should not be all zeros" + + +@requires_ext +def test_fused_only_path(): + """Test that fused path (only KV buffer, no separate K outputs) works.""" + device = "cuda" + dtype = torch.float16 + nnz, Dn, Dr = 128, 512, 64 + torch.manual_seed(42) + + # 2D inputs + q_nope = torch.randn(nnz, Dn, device=device, dtype=dtype) + q_rope = torch.randn(nnz, Dr, device=device, dtype=dtype) + k_nope = torch.randn(nnz, Dn, device=device, dtype=dtype) + k_rope = torch.randn(nnz, Dr, device=device, dtype=dtype) + + # Create cos/sin cache + max_seq = 2048 + t = torch.linspace(0, 1, steps=max_seq, device=device, dtype=torch.float32)[ + :, None + ] + idx = torch.arange(Dr, device=device, dtype=torch.float32)[None, :] + freqs = 0.1 * (idx + 
1.0) + cos = torch.cos(t * freqs) + sin = torch.sin(t * freqs) + cos_sin = torch.cat([cos, sin], dim=1) + + pos_ids = torch.randint( + low=0, high=max_seq, size=(nnz,), device=device, dtype=torch.long + ) + + # Allocate only Q output and KV buffer + q_out = torch.empty(nnz, Dn + Dr, device=device, dtype=torch.uint8) + slots = nnz + 16 + kv_buffer = torch.zeros(slots, Dn + Dr, device=device, dtype=torch.uint8) # 2D format + loc = torch.arange(nnz, device=device, dtype=torch.long) + + # Call kernel (fused path only) + mla_rope_quantize_fp8_fused( + q_nope, + q_rope, + k_nope, + k_rope, + cos_sin, + pos_ids, + False, + q_out, + None, # No k_nope_out + None, # No k_rope_out + kv_buffer, # Direct KV write + loc, + ) + + # Check that KV buffer was written + assert kv_buffer[loc].abs().sum() > 0, "KV buffer should have been written" + # Check unused slots are still zero + unused = torch.ones(slots, dtype=torch.bool, device=device) + unused[loc] = False + assert kv_buffer[unused].abs().sum() == 0, "Unused KV slots should remain zero" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) + + diff --git a/test/srt/test_mla_fp8_fused_kv.py b/test/srt/test_mla_fp8_fused_kv.py deleted file mode 100644 index 172727a40b1..00000000000 --- a/test/srt/test_mla_fp8_fused_kv.py +++ /dev/null @@ -1,79 +0,0 @@ -# -*- coding: utf-8 -*- -""" -PyTest: correctness of fused KV write vs baseline (non-fused). -Tests the mla_rope_quantize_fp8_fused kernel from sgl_kernel extension. -""" -import torch -import pytest - -# Try to import fusion kernel (standalone or from sgl_kernel) -_has_sgl_kernel = False -mla_rope_quantize_fp8_fused = None -try: - from mla_fusion_kernel import mla_rope_quantize_fp8_fused - _has_sgl_kernel = True -except ImportError: - try: - from sgl_kernel import mla_rope_quantize_fp8_fused - _has_sgl_kernel = True - except ImportError: - pass - -requires_ext = pytest.mark.skipif(not _has_sgl_kernel, reason="sgl_kernel extension not available") - -@requires_ext -@pytest.mark.parametrize("nnz", [256, 1024]) -@pytest.mark.parametrize("Dn,Dr", [(512, 64)]) -def test_fused_matches_baseline(nnz, Dn, Dr): - device = "cuda" - torch.manual_seed(0) - - # Inputs (half); in a real path you may use bfloat16. We pick half for demo. 
- q_nope = torch.randn(nnz, Dn, device=device, dtype=torch.float16) - q_rope = torch.randn(nnz, Dr, device=device, dtype=torch.float16) - k_nope = torch.randn(nnz, Dn, device=device, dtype=torch.float16) - k_rope = torch.randn(nnz, Dr, device=device, dtype=torch.float16) - - max_seq = max(2048, nnz) - # cos/sin cache: [max_seq, 2*Dr] - # Simple deterministic cache for test - t = torch.linspace(0, 1, steps=max_seq, device=device, dtype=torch.float32)[:, None] - idx = torch.arange(Dr, device=device, dtype=torch.float32)[None, :] - # frequencies: small values to avoid overflow - freqs = 0.1 * (idx + 1.0) - cos = torch.cos(t * freqs) - sin = torch.sin(t * freqs) - cos_sin = torch.cat([cos, sin], dim=1) # [max_seq, 2*Dr] - - pos_ids = torch.randint(low=0, high=max_seq, size=(nnz,), device=device, dtype=torch.long) - - # Baseline: produce k_nope_out/k_rope_out and emulate set_mla_kv_buffer (bytes concat) - q_out_base = torch.empty(nnz, Dn + Dr, device=device, dtype=torch.uint8) - k_nope_out = torch.empty(nnz, Dn, device=device, dtype=torch.uint8) - k_rope_out = torch.empty(nnz, Dr, device=device, dtype=torch.uint8) - - mla_rope_quantize_fp8_fused( - q_nope, q_rope, k_nope, k_rope, cos_sin, pos_ids, False, - q_out_base, k_nope_out, k_rope_out, None, None - ) - - # emulate set_mla_kv_buffer_triton: concat bytes into KV buffer - slots = nnz + 8 - kv_base = torch.zeros(slots, 1, Dn + Dr, device=device, dtype=torch.uint8) - loc = torch.arange(nnz, device=device, dtype=torch.long) - kv_base[loc, 0, :Dn] = k_nope_out - kv_base[loc, 0, Dn:] = k_rope_out - - # Fused: direct KV write, skip K outputs - q_out_fused = torch.empty_like(q_out_base) - kv_fused = torch.zeros_like(kv_base) - - mla_rope_quantize_fp8_fused( - q_nope, q_rope, k_nope, k_rope, cos_sin, pos_ids, False, - q_out_fused, None, None, kv_fused, loc - ) - - # Assertions - assert torch.equal(q_out_base, q_out_fused), "q_out must match exactly (bytewise)" - assert torch.equal(kv_base, kv_fused), "KV fused write must match baseline concat" - From 0bdf96e028ac9295ab6212fc5379c1a4aa20b727 Mon Sep 17 00:00:00 2001 From: Yangmin Li Date: Sat, 1 Nov 2025 17:51:16 -0700 Subject: [PATCH 07/10] code cleaning --- benchmark/kernels/bench_flashmla_fused_kv.py | 61 +- mla_fusion_standalone/pyproject.toml | 1 - mla_fusion_standalone/setup.py | 56 +- mla_fusion_standalone/test_import.py | 43 - .../layers/attention/trtllm_mla_backend.py | 41 +- sgl-kernel/build_minimal.sh | 1 - .../csrc/elementwise/mla_rope_fp8_kv_fused.cu | 1221 ++++++++--------- sgl-kernel/tests/test_mla_rope_fp8_fused.py | 20 +- 8 files changed, 681 insertions(+), 763 deletions(-) delete mode 100644 mla_fusion_standalone/test_import.py diff --git a/benchmark/kernels/bench_flashmla_fused_kv.py b/benchmark/kernels/bench_flashmla_fused_kv.py index 61e08bf622d..7012ed75420 100644 --- a/benchmark/kernels/bench_flashmla_fused_kv.py +++ b/benchmark/kernels/bench_flashmla_fused_kv.py @@ -4,27 +4,33 @@ Uses the sgl_kernel.mla_rope_quantize_fp8_fused extension. """ import time + import torch _has_sgl_kernel = False mla_rope_quantize_fp8_fused = None try: from mla_fusion_kernel import mla_rope_quantize_fp8_fused + _has_sgl_kernel = True print("Using standalone mla_fusion_kernel") except ImportError: try: from sgl_kernel import mla_rope_quantize_fp8_fused + _has_sgl_kernel = True print("Using sgl_kernel.mla_rope_quantize_fp8_fused") except ImportError: - print("ERROR: Fusion kernel not available. Please build mla_fusion_standalone first.") + print( + "ERROR: Fusion kernel not available. 
Please build mla_fusion_standalone first." + ) _has_sgl_kernel = False + def run_one(nnz=1024, Dn=512, Dr=64, iters=200, warmup=20, device="cuda"): if not _has_sgl_kernel: return 0, 0, 0 - + torch.manual_seed(0) q_nope = torch.randn(nnz, Dn, device=device, dtype=torch.float16) @@ -39,7 +45,9 @@ def run_one(nnz=1024, Dn=512, Dr=64, iters=200, warmup=20, device="cuda"): cos = torch.cos(t * freqs) sin = torch.sin(t * freqs) cos_sin = torch.cat([cos, sin], dim=1) - pos_ids = torch.randint(low=0, high=max_seq, size=(nnz,), device=device, dtype=torch.long) + pos_ids = torch.randint( + low=0, high=max_seq, size=(nnz,), device=device, dtype=torch.long + ) slots = nnz + 8 loc = torch.arange(nnz, device=device, dtype=torch.long) @@ -47,21 +55,45 @@ def run_one(nnz=1024, Dn=512, Dr=64, iters=200, warmup=20, device="cuda"): q_out = torch.empty(nnz, Dn + Dr, device=device, dtype=torch.uint8) k_nope_out = torch.empty(nnz, Dn, device=device, dtype=torch.uint8) k_rope_out = torch.empty(nnz, Dr, device=device, dtype=torch.uint8) - kv_base = torch.zeros(slots, 1, Dn + Dr, device=device, dtype=torch.uint8) + kv_base = torch.zeros(slots, Dn + Dr, device=device, dtype=torch.uint8) # baselines def baseline(): - mla_rope_quantize_fp8_fused(q_nope, q_rope, k_nope, k_rope, cos_sin, pos_ids, False, - q_out, k_nope_out, k_rope_out, None, None) + mla_rope_quantize_fp8_fused( + q_nope, + q_rope, + k_nope, + k_rope, + cos_sin, + pos_ids, + False, + q_out, + k_nope_out, + k_rope_out, + None, + None, + ) kv_base.zero_() - kv_base[loc, 0, :Dn] = k_nope_out - kv_base[loc, 0, Dn:] = k_rope_out + kv_base[loc, :Dn] = k_nope_out + kv_base[loc, Dn:] = k_rope_out kv_fused = torch.zeros_like(kv_base) def fused(): - mla_rope_quantize_fp8_fused(q_nope, q_rope, k_nope, k_rope, cos_sin, pos_ids, False, - q_out, None, None, kv_fused, loc) + mla_rope_quantize_fp8_fused( + q_nope, + q_rope, + k_nope, + k_rope, + cos_sin, + pos_ids, + False, + q_out, + None, + None, + kv_fused, + loc, + ) # warmup for _ in range(warmup): @@ -86,19 +118,22 @@ def fused(): return baseline_ms, fused_ms, baseline_ms / fused_ms + if __name__ == "__main__": if not _has_sgl_kernel: print("Benchmark skipped: sgl_kernel not available") exit(1) - + print("MLA RoPE + FP8 Quantization + KV Cache Write Fusion Benchmark") print("=" * 70) print("Config: Dn=512, Dr=64, iters=1000, warmup=100") print("=" * 70) - + # Test larger batch sizes and more iterations for stable measurements for nnz in [1024, 4096, 8192, 16384, 32768]: b, f, s = run_one(nnz=nnz, iters=1000, warmup=100) if b > 0: speedup_pct = (s - 1.0) * 100 - print(f"nnz={nnz:5d} | baseline={b:7.3f} ms | fused={f:7.3f} ms | speedup x{s:4.2f} ({speedup_pct:+5.1f}%)") + print( + f"nnz={nnz:5d} | baseline={b:7.3f} ms | fused={f:7.3f} ms | speedup x{s:4.2f} ({speedup_pct:+5.1f}%)" + ) diff --git a/mla_fusion_standalone/pyproject.toml b/mla_fusion_standalone/pyproject.toml index bbc8d622a9e..1629b0ac33c 100644 --- a/mla_fusion_standalone/pyproject.toml +++ b/mla_fusion_standalone/pyproject.toml @@ -8,4 +8,3 @@ version = "0.1.0" description = "Standalone MLA RoPE + FP8 Fusion Kernel" requires-python = ">=3.8" dependencies = ["torch"] - diff --git a/mla_fusion_standalone/setup.py b/mla_fusion_standalone/setup.py index 082e37a8f9c..cab4aec2592 100644 --- a/mla_fusion_standalone/setup.py +++ b/mla_fusion_standalone/setup.py @@ -1,14 +1,18 @@ """ Standalone build for MLA RoPE FP8 Fusion kernel """ -from setuptools import setup + import os import sys +from setuptools import setup + + # Delay torch import until build time 
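# The arch string follows the TORCH_CUDA_ARCH_LIST convention: "major.minor"
# entries separated by ';', falling back to "8.0;9.0;10.0" when detection
# fails; each entry is expanded below into a matching
# -gencode=arch=compute_XX,code=sm_XX flag. A hypothetical single-arch build,
# assuming the usual editable-install workflow, would look like:
#
#   TORCH_CUDA_ARCH_LIST="9.0" pip install -e . --no-build-isolation
#
# which compiles only -gencode=arch=compute_90,code=sm_90.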
def get_cuda_arch(): try: import torch + cuda_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) if cuda_arch_list is None: # Auto-detect @@ -24,46 +28,48 @@ def get_cuda_arch(): print(f"Warning: Could not detect CUDA arch, using defaults: {e}") return "8.0;9.0;10.0" + def get_extensions(): from torch.utils.cpp_extension import BuildExtension, CUDAExtension - + cuda_arch_list = get_cuda_arch() return [ CUDAExtension( - name='mla_fusion_kernel', + name="mla_fusion_kernel", sources=[ - '../sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu', + "../sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu", ], include_dirs=[ - '../sgl-kernel/include', + "../sgl-kernel/include", ], extra_compile_args={ - 'cxx': ['-O3', '-std=c++17'], - 'nvcc': [ - '-O3', - '--use_fast_math', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '-U__CUDA_NO_BFLOAT16_CONVERSIONS__', - '-U__CUDA_NO_HALF2_OPERATORS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda', - ] + [f'-gencode=arch=compute_{arch.replace(".", "")},code=sm_{arch.replace(".", "")}' - for arch in cuda_arch_list.split(';')], + "cxx": ["-O3", "-std=c++17"], + "nvcc": [ + "-O3", + "--use_fast_math", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + ] + + [ + f'-gencode=arch=compute_{arch.replace(".", "")},code=sm_{arch.replace(".", "")}' + for arch in cuda_arch_list.split(";") + ], }, ) ] -if __name__ == '__main__': + +if __name__ == "__main__": from torch.utils.cpp_extension import BuildExtension - + setup( - name='mla_fusion_kernel', + name="mla_fusion_kernel", ext_modules=get_extensions(), - cmdclass={ - 'build_ext': BuildExtension - }, - python_requires='>=3.8', + cmdclass={"build_ext": BuildExtension}, + python_requires=">=3.8", ) - diff --git a/mla_fusion_standalone/test_import.py b/mla_fusion_standalone/test_import.py deleted file mode 100644 index 90ee849d2b5..00000000000 --- a/mla_fusion_standalone/test_import.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 -""" -Test script to verify the fusion kernel works -""" -import torch -import mla_fusion_kernel - -print("✅ Module imported successfully!") -print(f"Available functions: {dir(mla_fusion_kernel)}") - -# Test basic call -device = "cuda" if torch.cuda.is_available() else "cpu" -if device == "cuda": - print(f"\n✅ CUDA is available") - print(f"Device: {torch.cuda.get_device_name(0)}") - - # Create dummy inputs - nnz, Dn, Dr = 4, 512, 64 - q_nope = torch.randn(nnz, Dn, device=device, dtype=torch.float16) - q_rope = torch.randn(nnz, Dr, device=device, dtype=torch.float16) - k_nope = torch.randn(nnz, Dn, device=device, dtype=torch.float16) - k_rope = torch.randn(nnz, Dr, device=device, dtype=torch.float16) - - cos_sin = torch.randn(2048, Dr*2, device=device, dtype=torch.float32) - pos_ids = torch.arange(nnz, device=device, dtype=torch.int64) - - q_out = torch.empty(nnz, Dn+Dr, device=device, dtype=torch.uint8) - k_nope_out = torch.empty(nnz, Dn, device=device, dtype=torch.uint8) - k_rope_out = torch.empty(nnz, Dr, device=device, dtype=torch.uint8) - - try: - mla_fusion_kernel.mla_rope_quantize_fp8_fused( - q_nope, q_rope, k_nope, k_rope, - cos_sin, pos_ids, False, - q_out, k_nope_out, k_rope_out, - None, None - ) - print("✅ Kernel executed successfully!") - except Exception as e: - print(f"❌ Kernel execution failed: {e}") -else: - print("⚠️ CUDA not available, skipping kernel test") - diff --git 
a/python/sglang/srt/layers/attention/trtllm_mla_backend.py b/python/sglang/srt/layers/attention/trtllm_mla_backend.py index 9658d2688fd..0731b06f327 100755 --- a/python/sglang/srt/layers/attention/trtllm_mla_backend.py +++ b/python/sglang/srt/layers/attention/trtllm_mla_backend.py @@ -38,6 +38,7 @@ if _is_cuda: from sgl_kernel import concat_mla_absorb_q + try: # Try standalone build first from mla_fusion_kernel import mla_rope_quantize_fp8_fused @@ -652,7 +653,7 @@ def quantize_and_rope_for_fp8( This function handles the FP8 quantization and RoPE application for MLA attention. It takes separate query/key nope and rope components, applies RoPE to the rope parts, quantizes all components to FP8, and merges the query components into a single tensor. - + When layer and save_kv_cache are provided, it uses the fused kernel to directly write K into the KV cache, eliminating intermediate global memory operations. @@ -695,8 +696,10 @@ def quantize_and_rope_for_fp8( compute_cap = torch.cuda.get_device_capability() kernel_available = mla_rope_quantize_fp8_fused is not None has_layer = layer is not None - nsa_flag = getattr(forward_batch.token_to_kv_pool, "nsa_kv_cache_store_fp8", False) - + nsa_flag = getattr( + forward_batch.token_to_kv_pool, "nsa_kv_cache_store_fp8", False + ) + # Only enable fusion on SM90+ (H100/B200) due to FP8 hardware support use_fused_kv_write = ( kernel_available @@ -709,18 +712,14 @@ def quantize_and_rope_for_fp8( if use_fused_kv_write: # Fused path: RoPE + quantization + direct KV cache write # This eliminates the intermediate write/read of k_nope_out/k_rope_out - - # Get KV buffer (2D: [total_rows, kv_dim]) - k_cache_raw = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id) - if k_cache_raw.dim() == 3: - kv_buffer = k_cache_raw.squeeze(1) - else: - kv_buffer = k_cache_raw - + + # Get KV buffer (supports both 2D and 3D formats) + kv_buffer = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id) + # Prepare cache locations and positions as int64 contiguous kv_loc = forward_batch.out_cache_loc.to(torch.int64).contiguous() positions = forward_batch.positions.to(torch.int64).contiguous() - + # Call fused kernel: RoPE + quantize + write KV cache mla_rope_quantize_fp8_fused( q_nope, @@ -736,7 +735,7 @@ def quantize_and_rope_for_fp8( kv_buffer, kv_loc, ) - + # Return Q output and None for K outputs (already written to cache) return q_out, None, None else: @@ -758,7 +757,9 @@ def quantize_and_rope_for_fp8( # Output tensor slicing: q_out contains [nope_part, rope_part] q_rope_out=q_out[..., self.kv_lora_rank :], # RoPE part goes to end k_rope_out=k_rope_out, - q_nope_out=q_out[..., : self.kv_lora_rank], # Nope part goes to beginning + q_nope_out=q_out[ + ..., : self.kv_lora_rank + ], # Nope part goes to beginning k_nope_out=k_nope_out, # Quantization scales (set to 1.0 for no additional scaling) quant_scale_q=1.0, @@ -863,14 +864,14 @@ def forward_decode( """Run forward for decode using TRTLLM MLA kernel.""" merge_query = q_rope is not None fused_kv = False # Track if using fused KV write path - + if self.data_type == torch.float8_e4m3fn: # For FP8 path, we quantize the query and rope parts and merge them into a single tensor # Note: rope application in deepseek_v2.py:forward_absorb_prepare is skipped for FP8 decode path of this trtllm_mla backend assert all( x is not None for x in [q_rope, k_rope, cos_sin_cache] ), "For FP8 path and using flashinfer.rope.mla_rope_quantize we need all of q_rope, k_rope and cos_sin_cache to be not None." 
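# Return contract of quantize_and_rope_for_fp8 as used below:
#   fused path:     returns (q_out, None, None); K has already been RoPE'd,
#                   quantized, and written into the KV-cache rows given by
#                   forward_batch.out_cache_loc, so the explicit KV-cache
#                   write is skipped.
#   non-fused path: returns (q_out, k_nope_fp8, k_rope_fp8); the caller still
#                   performs the KV-cache write when save_kv_cache is set.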
- + # Call quantize_and_rope_for_fp8 with layer and save_kv_cache # This enables the fused kernel path when conditions are met q, k, k_rope = self.quantize_and_rope_for_fp8( @@ -886,7 +887,7 @@ def forward_decode( ) merge_query = False # Check if fused path was used (K outputs are None when directly written to KV cache) - fused_kv = (k is None and k_rope is None) + fused_kv = k is None and k_rope is None # Save KV cache if requested (only if not using fused path) if save_kv_cache and not fused_kv: @@ -973,7 +974,7 @@ def forward_extend( # TODO refactor to avoid code duplication for forward_decode merge_query = q_rope is not None fused_kv = False # Track if we used fused KV write path - + # TODO: Check if the condition restrictions (target_verify/draft_extend only) are necessary # Consider if we can enable FP8 quantize_and_rope for all extend paths safely # For FP8 path in target_verify or draft_extend mode, use quantize_and_rope_for_fp8 @@ -985,7 +986,7 @@ def forward_extend( ) and all(x is not None for x in [q_rope, k_rope, cos_sin_cache]) ) - + if use_fp8_quantize: # FP8 path: quantize and apply RoPE with optional fused KV write q, k_fp8_nope, k_fp8_rope = self.quantize_and_rope_for_fp8( @@ -1001,7 +1002,7 @@ def forward_extend( ) merge_query = False # Fused path: K already written to KV cache, function returns None for K outputs - fused_kv = (k_fp8_nope is None and k_fp8_rope is None) + fused_kv = k_fp8_nope is None and k_fp8_rope is None # Update local variables for subsequent logic k = k_fp8_nope k_rope = k_fp8_rope diff --git a/sgl-kernel/build_minimal.sh b/sgl-kernel/build_minimal.sh index 8aaa32247e4..a56d09398b7 100755 --- a/sgl-kernel/build_minimal.sh +++ b/sgl-kernel/build_minimal.sh @@ -22,4 +22,3 @@ pip install -e . --no-build-isolation -v 2>&1 | tee build.log echo "" echo "Build completed! Check build.log for details." - diff --git a/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu b/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu index ff580253714..8e29da5984d 100644 --- a/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu +++ b/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu @@ -1,27 +1,21 @@ -/* - * Copyright (c) 2024 by SGLang team. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * - * MLA RoPE + FP8 Quantization + KV Cache Write Fusion Kernel - * - * This is a SGLang-native kernel that fuses three operations for DeepSeek V3.2 MLA: - * 1. Apply RoPE (Rotary Position Embedding) to q_rope and k_rope - * 2. Quantize all components (q_nope, q_rope, k_nope, k_rope) to FP8 E4M3 - * 3. Optionally write K directly into KV cache buffer - * - * Motivation: - * - Original path: mla_rope_quantize_fp8 (FlashInfer) → writes k_out → set_mla_kv_buffer reads k_out → writes KV cache - * - Fused path: This kernel → directly writes to KV cache (eliminates intermediate global memory ops) - * - * Performance: ~4.9x faster than baseline (measured on B200), includes: - * - Vectorized memory access (4-byte aligned loads/stores) - * - Warp-level parallelism (32 threads per row) - * - Direct KV cache write (no intermediate buffers) - */ - -// Only include PyBind11 for standalone builds +/* Copyright 2024 SGLang Team. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// MLA RoPE + FP8 Quantization + KV Cache Write Fusion Kernel +// Fuses RoPE application, FP8 quantization, and direct KV cache write + #ifdef TORCH_EXTENSION_NAME #include #else @@ -30,265 +24,209 @@ #endif #include -#include +#include #include -#include // BF16 support +#include #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #include #endif -#include -#include -#include // For std::getenv -#include // For std::string -// Utility macros (borrowed from pytorch_extension_utils.h style) +// TODO: Use pytorch_extension_utils.h when it's available in sgl-kernel/include #define CHECK_INPUT(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_DIM(d, x) TORCH_CHECK(x.dim() == d, #x " must be " #d "D") - -// ---- Helpers ------------------------------------------------------- - -#define CHECK_SAME_DEVICE(a, b) TORCH_CHECK(a.device() == b.device(), #a " and " #b " must be on same device") +#define CHECK_EQ(a, b) TORCH_CHECK((a) == (b), #a " must equal " #b) +#define CHECK_LAST_DIM_CONTIGUOUS(x) TORCH_CHECK(x.stride(x.dim() - 1) == 1, #x " last dim must be contiguous") namespace { -// ============================================================================ -// Dtype Traits: Support both FP16 (__half) and BF16 (nv_bfloat16) -// ============================================================================ template struct Vec2Traits; template <> struct Vec2Traits<__half> { - using v2 = __half2; - - __device__ static inline float2 to_float2(v2 h2) { - return __half22float2(h2); - } - - __device__ static inline float to_float(const __half& h) { - return __half2float(h); - } + using v2 = __half2; + __device__ static inline float2 to_float2(v2 h2) { + return __half22float2(h2); + } + __device__ static inline float to_float(const __half& h) { + return __half2float(h); + } }; template <> struct Vec2Traits { - using v2 = nv_bfloat162; - - __device__ static inline float2 to_float2(v2 h2) { - return __bfloat1622float2(h2); - } - - __device__ static inline float to_float(const nv_bfloat16& h) { - return __bfloat162float(h); - } + using v2 = nv_bfloat162; + __device__ static inline float2 to_float2(v2 h2) { + return __bfloat1622float2(h2); + } + __device__ static inline float to_float(const nv_bfloat16& h) { + return __bfloat162float(h); + } }; -// Convert float -> FP8 E4M3 (finite saturation). Return raw byte. 
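// FP8 E4M3 layout: 1 sign bit, 4 exponent bits, 3 mantissa bits, with a
// finite range of roughly [-448, 448]. On SM90+ the conversion below uses the
// native __nv_cvt_float_to_fp8 intrinsic with saturating (SATFINITE)
// rounding; the pre-SM90 fallback clamps to the E4M3 range and assembles the
// sign/exponent/mantissa fields by hand, which is approximate rather than
// bit-exact and is intended for functional testing only.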
__device__ inline uint8_t float_to_e4m3fn_byte(float x) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 - // CUDA 12+ with native FP8 support - __nv_fp8_storage_t byte = __nv_cvt_float_to_fp8(x, __NV_SATFINITE, __NV_E4M3); - return static_cast(byte); + __nv_fp8_storage_t byte = __nv_cvt_float_to_fp8(x, __NV_SATFINITE, __NV_E4M3); + return static_cast(byte); #else - // Fallback: Manual FP8 E4M3 conversion - // E4M3 format: 1 sign bit, 4 exponent bits, 3 mantissa bits - // Range: [-448, 448], NaN represented as 0x7F - - // Clamp to FP8 E4M3 range - x = fmaxf(-448.0f, fminf(448.0f, x)); - - // Simple conversion (not bit-exact but close enough for testing) - // In production, you'd want proper rounding - union { - float f; - uint32_t u; - } conv; - conv.f = x; - - // Extract sign - uint32_t sign = (conv.u >> 31) & 0x1; - - // Handle zero - if (x == 0.0f) return 0; - - // Simplified: scale and round - // This is a placeholder - for production use proper FP8 conversion - int exp = ((conv.u >> 23) & 0xFF) - 127; // Extract exponent - exp = max(-6, min(8, exp)); // E4M3 range - - uint32_t mant = (conv.u >> 20) & 0x7; // Top 3 bits of mantissa - - uint8_t result = (sign << 7) | ((exp + 7) << 3) | mant; - return result; + x = fmaxf(-448.0f, fminf(448.0f, x)); + union { + float f; + uint32_t u; + } conv; + conv.f = x; + uint32_t sign = (conv.u >> 31) & 0x1; + if (x == 0.0f) return 0; + int exp = ((conv.u >> 23) & 0xFF) - 127; + exp = max(-6, min(8, exp)); + uint32_t mant = (conv.u >> 20) & 0x7; + uint8_t result = (sign << 7) | ((exp + 7) << 3) | mant; + return result; #endif } -// Pack 4 bytes into uint32_t for vectorized write __device__ inline uint32_t pack4(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3) { - return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) | ((uint32_t)a3 << 24); + return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) | ((uint32_t)a3 << 24); } -// Apply RoPE to a pair (xr, xi) given cos, sin -__device__ inline void rope_rotate(float& xr, float& xi, float c, float s, bool /*is_neox*/) { - float xr_new = xr * c - xi * s; - float xi_new = xr * s + xi * c; - xr = xr_new; - xi = xi_new; +__device__ inline void rope_rotate(float& xr, float& xi, float c, float s, bool) { + float xr_new = xr * c - xi * s; + float xi_new = xr * s + xi * c; + xr = xr_new; + xi = xi_new; } -// ============================================================================ -// Vectorized kernel: warp-per-row, vectorized load/store -// Template supports both FP16 (__half) and BF16 (nv_bfloat16) -// ============================================================================ -template +template __global__ void FusedRopeQuantizeKernelVec( const T* __restrict__ q_nope, const T* __restrict__ q_rope, - int64_t qn_stride_tok, int64_t qn_stride_head, // Q_nope strides in elements - int64_t qr_stride_tok, int64_t qr_stride_head, // Q_rope strides in elements + int64_t qn_stride_tok, + int64_t qn_stride_head, + int64_t qr_stride_tok, + int64_t qr_stride_head, const T* __restrict__ k_nope, const T* __restrict__ k_rope, - int64_t kn_stride_tok, // NEW: K_nope stride(0) in elements - int64_t kr_stride_tok, // NEW: K_rope stride(0) in elements + int64_t kn_stride_tok, + int64_t kr_stride_tok, const float* __restrict__ cos_sin, const int64_t* __restrict__ pos_ids, - int nnz, int num_heads, int Dn, int Dr, + int nnz, + int num_heads, + int Dn, + int Dr, bool is_neox, uint8_t* __restrict__ q_out_fp8, - int64_t qout_stride_tok_bytes, int64_t qout_stride_head_bytes, // Q_out strides in bytes + 
int64_t qout_stride_tok_bytes, + int64_t qout_stride_head_bytes, uint8_t* __restrict__ k_nope_out_fp8, uint8_t* __restrict__ k_rope_out_fp8, uint8_t* __restrict__ kv_buffer_bytes, - int64_t kv_stride_row_bytes, // 2D: row stride in bytes - const int64_t* __restrict__ kv_cache_loc -) { - constexpr int WARP_SIZE = 32; - int warp_in_block = threadIdx.x / WARP_SIZE; - int lane = threadIdx.x & (WARP_SIZE - 1); - - int global_row = blockIdx.x * WARPS_PER_CTA + warp_in_block; - if (global_row >= nnz * num_heads) return; - - // Decompose global row index: token_id and head_id - int token_id = global_row / num_heads; - int head_id = global_row % num_heads; - - // Pointers for this (token, head) using proper strides - // Use template type T (not hardcoded __half) for BF16 support - const T* qn = q_nope + size_t(token_id) * qn_stride_tok + size_t(head_id) * qn_stride_head; - const T* qr = q_rope + size_t(token_id) * qr_stride_tok + size_t(head_id) * qr_stride_head; - - // K is always 2D: [nnz_tokens, dim] - // CRITICAL FIX: Use actual stride(0) instead of assuming Dn/Dr (handles non-contiguous K) - const T* kn = k_nope + size_t(token_id) * kn_stride_tok; - const T* kr = k_rope + size_t(token_id) * kr_stride_tok; - - // Q output using byte strides - uint8_t* qdst = q_out_fp8 + size_t(token_id) * qout_stride_tok_bytes + size_t(head_id) * qout_stride_head_bytes; - - // K outputs (if provided) - uint8_t* kndst = k_nope_out_fp8 ? (k_nope_out_fp8 + size_t(token_id) * Dn) : nullptr; - uint8_t* krdst = k_rope_out_fp8 ? (k_rope_out_fp8 + size_t(token_id) * Dr) : nullptr; - - // Get position for RoPE - int pos = static_cast(pos_ids[token_id]); - - // CRITICAL FIX: kv_cache_loc IS the flat row index (SGLang semantics) - // Do NOT use pos % page_size! That was causing KV to be written to wrong locations. - // In SGLang: kv_buffer is 2D [total_rows, kv_dim], loc = direct row index - uint8_t* kvdst = nullptr; - if (kv_buffer_bytes && kv_cache_loc) { - int64_t flat_row = kv_cache_loc[token_id]; // This IS the row index - kvdst = kv_buffer_bytes + flat_row * kv_stride_row_bytes; - } - - // CRITICAL FIX: cos_sin_cache layout is [max_pos, rope_dim] (not 2*rope_dim!) - // First half (rope_dim/2) is cos, second half is sin - const float* cos_ptr = cos_sin + size_t(pos) * Dr; // rope_dim = Dr - const float* sin_ptr = cos_ptr + (Dr / 2); // sin at offset Dr/2 - - // Use traits for dtype-agnostic vectorized load - using V2 = typename Vec2Traits::v2; - - // Process Q_nope: vectorized quantize + write + int64_t kv_stride_row_bytes, + const int64_t* __restrict__ kv_cache_loc) { + constexpr int WARP_SIZE = 32; + int warp_in_block = threadIdx.x / WARP_SIZE; + int lane = threadIdx.x & (WARP_SIZE - 1); + + int global_row = blockIdx.x * WARPS_PER_CTA + warp_in_block; + if (global_row >= nnz * num_heads) return; + + int token_id = global_row / num_heads; + int head_id = global_row % num_heads; + + const T* qn = q_nope + size_t(token_id) * qn_stride_tok + size_t(head_id) * qn_stride_head; + const T* qr = q_rope + size_t(token_id) * qr_stride_tok + size_t(head_id) * qr_stride_head; + const T* kn = k_nope + size_t(token_id) * kn_stride_tok; + const T* kr = k_rope + size_t(token_id) * kr_stride_tok; + + uint8_t* qdst = q_out_fp8 + size_t(token_id) * qout_stride_tok_bytes + size_t(head_id) * qout_stride_head_bytes; + uint8_t* kndst = k_nope_out_fp8 ? (k_nope_out_fp8 + size_t(token_id) * Dn) : nullptr; + uint8_t* krdst = k_rope_out_fp8 ? 
(k_rope_out_fp8 + size_t(token_id) * Dr) : nullptr; + + int pos = static_cast(pos_ids[token_id]); + + uint8_t* kvdst = nullptr; + if (kv_buffer_bytes && kv_cache_loc) { + int64_t flat_row = kv_cache_loc[token_id]; + kvdst = kv_buffer_bytes + flat_row * kv_stride_row_bytes; + } + + const float* cos_ptr = cos_sin + size_t(pos) * Dr; + const float* sin_ptr = cos_ptr + (Dr / 2); + + using V2 = typename Vec2Traits::v2; + + // Process Q_nope: vectorized quantize + write + for (int c = lane * 4; c < Dn; c += WARP_SIZE * 4) { + V2 h0 = *reinterpret_cast(qn + c + 0); + V2 h1 = *reinterpret_cast(qn + c + 2); + float2 f0 = Vec2Traits::to_float2(h0); + float2 f1 = Vec2Traits::to_float2(h1); + + uint32_t packed = pack4( + float_to_e4m3fn_byte(f0.x), float_to_e4m3fn_byte(f0.y), float_to_e4m3fn_byte(f1.x), float_to_e4m3fn_byte(f1.y)); + *reinterpret_cast(qdst + c) = packed; + } + + for (int c = lane * 4; c < Dr; c += WARP_SIZE * 4) { + V2 h0 = *reinterpret_cast(qr + c + 0); + V2 h1 = *reinterpret_cast(qr + c + 2); + float2 f0 = Vec2Traits::to_float2(h0); + float2 f1 = Vec2Traits::to_float2(h1); + + int base0 = (c + 0) >> 1; + int base1 = (c + 2) >> 1; + float c0 = cos_ptr[base0], s0 = sin_ptr[base0]; + float c1 = cos_ptr[base1], s1 = sin_ptr[base1]; + + rope_rotate(f0.x, f0.y, c0, s0, false); + rope_rotate(f1.x, f1.y, c1, s1, false); + + uint32_t packed = pack4( + float_to_e4m3fn_byte(f0.x), float_to_e4m3fn_byte(f0.y), float_to_e4m3fn_byte(f1.x), float_to_e4m3fn_byte(f1.y)); + *reinterpret_cast(qdst + Dn + c) = packed; + } + + if (head_id == 0) { for (int c = lane * 4; c < Dn; c += WARP_SIZE * 4) { - V2 h0 = *reinterpret_cast(qn + c + 0); - V2 h1 = *reinterpret_cast(qn + c + 2); - float2 f0 = Vec2Traits::to_float2(h0); - float2 f1 = Vec2Traits::to_float2(h1); - - uint32_t packed = pack4( - float_to_e4m3fn_byte(f0.x), float_to_e4m3fn_byte(f0.y), - float_to_e4m3fn_byte(f1.x), float_to_e4m3fn_byte(f1.y)); - *reinterpret_cast(qdst + c) = packed; + V2 h0 = *reinterpret_cast(kn + c + 0); + V2 h1 = *reinterpret_cast(kn + c + 2); + float2 f0 = Vec2Traits::to_float2(h0); + float2 f1 = Vec2Traits::to_float2(h1); + + uint32_t packed = pack4( + float_to_e4m3fn_byte(f0.x), + float_to_e4m3fn_byte(f0.y), + float_to_e4m3fn_byte(f1.x), + float_to_e4m3fn_byte(f1.y)); + + if (kndst) *reinterpret_cast(kndst + c) = packed; + if (kvdst) *reinterpret_cast(kvdst + c) = packed; } - // Process Q_rope: paired rotation + vectorized quantize + write - // Each iteration processes 2 pairs: (c, c+1) and (c+2, c+3) for (int c = lane * 4; c < Dr; c += WARP_SIZE * 4) { - V2 h0 = *reinterpret_cast(qr + c + 0); // [c+0, c+1] - V2 h1 = *reinterpret_cast(qr + c + 2); // [c+2, c+3] - float2 f0 = Vec2Traits::to_float2(h0); - float2 f1 = Vec2Traits::to_float2(h1); - - // CRITICAL: For GPT/interleaved RoPE, pair (2k, 2k+1) uses cos[k], NOT cos[2k] - // So: (qr[0],qr[1]) uses cos[0]; (qr[2],qr[3]) uses cos[1] - int base0 = (c + 0) >> 1; // pair index: c=0 → base=0, c=4 → base=2 - int base1 = (c + 2) >> 1; // pair index: c=0 → base=1, c=4 → base=3 - float c0 = cos_ptr[base0], s0 = sin_ptr[base0]; - float c1 = cos_ptr[base1], s1 = sin_ptr[base1]; - - rope_rotate(f0.x, f0.y, c0, s0, false); - rope_rotate(f1.x, f1.y, c1, s1, false); - - uint32_t packed = pack4( - float_to_e4m3fn_byte(f0.x), float_to_e4m3fn_byte(f0.y), - float_to_e4m3fn_byte(f1.x), float_to_e4m3fn_byte(f1.y)); - *reinterpret_cast(qdst + Dn + c) = packed; - } - - // Process K_nope and K_rope: only once per token (head_id == 0) - // K is 2D [nnz_tokens, dim], not per-head - if (head_id 
== 0) { - // Process K_nope: vectorized quantize + write to k_nope_out and/or KV buffer - for (int c = lane * 4; c < Dn; c += WARP_SIZE * 4) { - V2 h0 = *reinterpret_cast(kn + c + 0); - V2 h1 = *reinterpret_cast(kn + c + 2); - float2 f0 = Vec2Traits::to_float2(h0); - float2 f1 = Vec2Traits::to_float2(h1); - - uint32_t packed = pack4( - float_to_e4m3fn_byte(f0.x), float_to_e4m3fn_byte(f0.y), - float_to_e4m3fn_byte(f1.x), float_to_e4m3fn_byte(f1.y)); - - if (kndst) *reinterpret_cast(kndst + c) = packed; - if (kvdst) *reinterpret_cast(kvdst + c) = packed; - } - - // Process K_rope: paired rotation + vectorized quantize + write - for (int c = lane * 4; c < Dr; c += WARP_SIZE * 4) { - V2 h0 = *reinterpret_cast(kr + c + 0); // [c+0, c+1] - V2 h1 = *reinterpret_cast(kr + c + 2); // [c+2, c+3] - float2 f0 = Vec2Traits::to_float2(h0); - float2 f1 = Vec2Traits::to_float2(h1); - - // CRITICAL: Use pair index (same as Q_rope above) - int base0 = (c + 0) >> 1; - int base1 = (c + 2) >> 1; - float c0 = cos_ptr[base0], s0 = sin_ptr[base0]; - float c1 = cos_ptr[base1], s1 = sin_ptr[base1]; - - rope_rotate(f0.x, f0.y, c0, s0, false); - rope_rotate(f1.x, f1.y, c1, s1, false); - - uint32_t packed = pack4( - float_to_e4m3fn_byte(f0.x), float_to_e4m3fn_byte(f0.y), - float_to_e4m3fn_byte(f1.x), float_to_e4m3fn_byte(f1.y)); - - if (krdst) *reinterpret_cast(krdst + c) = packed; - if (kvdst) *reinterpret_cast(kvdst + Dn + c) = packed; - } + V2 h0 = *reinterpret_cast(kr + c + 0); + V2 h1 = *reinterpret_cast(kr + c + 2); + float2 f0 = Vec2Traits::to_float2(h0); + float2 f1 = Vec2Traits::to_float2(h1); + + int base0 = (c + 0) >> 1; + int base1 = (c + 2) >> 1; + float c0 = cos_ptr[base0], s0 = sin_ptr[base0]; + float c1 = cos_ptr[base1], s1 = sin_ptr[base1]; + + rope_rotate(f0.x, f0.y, c0, s0, false); + rope_rotate(f1.x, f1.y, c1, s1, false); + + uint32_t packed = pack4( + float_to_e4m3fn_byte(f0.x), + float_to_e4m3fn_byte(f0.y), + float_to_e4m3fn_byte(f1.x), + float_to_e4m3fn_byte(f1.y)); + + if (krdst) *reinterpret_cast(krdst + c) = packed; + if (kvdst) *reinterpret_cast(kvdst + Dn + c) = packed; } + } } // ============================================================================ @@ -299,177 +237,133 @@ template __global__ void FusedRopeQuantizeKernelScalar( const T* __restrict__ q_nope, const T* __restrict__ q_rope, - int64_t qn_stride_tok, int64_t qn_stride_head, - int64_t qr_stride_tok, int64_t qr_stride_head, + int64_t qn_stride_tok, + int64_t qn_stride_head, + int64_t qr_stride_tok, + int64_t qr_stride_head, const T* __restrict__ k_nope, const T* __restrict__ k_rope, int64_t kn_stride_tok, // NEW: K_nope stride(0) in elements int64_t kr_stride_tok, // NEW: K_rope stride(0) in elements const float* __restrict__ cos_sin, const int64_t* __restrict__ pos_ids, - int nnz, int num_heads, int Dn, int Dr, + int nnz, + int num_heads, + int Dn, + int Dr, bool is_neox, uint8_t* __restrict__ q_out_fp8, - int64_t qout_stride_tok_bytes, int64_t qout_stride_head_bytes, + int64_t qout_stride_tok_bytes, + int64_t qout_stride_head_bytes, uint8_t* __restrict__ k_nope_out_fp8, uint8_t* __restrict__ k_rope_out_fp8, uint8_t* __restrict__ kv_buffer_bytes, int64_t kv_stride_row_bytes, // 2D: row stride in bytes - const int64_t* __restrict__ kv_cache_loc -) { - // Thread mapping: grid-stride loop over all (token, head) pairs - for (int global_row = blockIdx.x * BLOCK_THREADS + threadIdx.x; - global_row < nnz * num_heads; - global_row += gridDim.x * BLOCK_THREADS) { - - // Decompose global row index - int token_id = global_row / 
num_heads; - int head_id = global_row % num_heads; - - int pos = static_cast(pos_ids[token_id]); - // CRITICAL FIX: cos_sin_cache is [max_pos, rope_dim] - // First half is cos, second half is sin - const float* cos_ptr = cos_sin + size_t(pos) * Dr; - const float* sin_ptr = cos_ptr + (Dr / 2); - - // ---- Quantize q ---- - // q_out: [nope | rope] - { - // Pointers using proper strides - const T* qn = q_nope + size_t(token_id) * qn_stride_tok + size_t(head_id) * qn_stride_head; - const T* qr = q_rope + size_t(token_id) * qr_stride_tok + size_t(head_id) * qr_stride_head; - uint8_t* qdst = q_out_fp8 + size_t(token_id) * qout_stride_tok_bytes + size_t(head_id) * qout_stride_head_bytes; - - // write nope part - for (int i = 0; i < Dn; ++i) { - float x = Vec2Traits::to_float(qn[i]); - qdst[i] = float_to_e4m3fn_byte(x); - } - // rope part: handle GPT vs NeoX layout - if (!is_neox) { - // GPT/interleaved style: pair (2k, 2k+1) uses cos[k], NOT cos[2k] - for (int i = 0; i < Dr; i += 2) { - int base = i >> 1; // CRITICAL: pair index - float xr = Vec2Traits::to_float(qr[i + 0]); - float xi = 0.0f; - if (i + 1 < Dr) xi = Vec2Traits::to_float(qr[i + 1]); - float c = cos_ptr[base]; - float s = sin_ptr[base]; - rope_rotate(xr, xi, c, s, false); - qdst[Dn + i + 0] = float_to_e4m3fn_byte(xr); - if (i + 1 < Dr) qdst[Dn + i + 1] = float_to_e4m3fn_byte(xi); - } - } else { - // NeoX style: pairs (i, i+Dr/2) - int half = Dr / 2; - for (int i = 0; i < half; ++i) { - float xr = Vec2Traits::to_float(qr[i]); // real part - float xi = Vec2Traits::to_float(qr[i + half]); // imag part (second half) - float c = cos_ptr[i]; - float s = sin_ptr[i]; - rope_rotate(xr, xi, c, s, true); - qdst[Dn + i] = float_to_e4m3fn_byte(xr); - qdst[Dn + i + half] = float_to_e4m3fn_byte(xi); - } - } + const int64_t* __restrict__ kv_cache_loc) { + for (int global_row = blockIdx.x * BLOCK_THREADS + threadIdx.x; global_row < nnz * num_heads; + global_row += gridDim.x * BLOCK_THREADS) { + int token_id = global_row / num_heads; + int head_id = global_row % num_heads; + + int pos = static_cast(pos_ids[token_id]); + const float* cos_ptr = cos_sin + size_t(pos) * Dr; + const float* sin_ptr = cos_ptr + (Dr / 2); + + { + const T* qn = q_nope + size_t(token_id) * qn_stride_tok + size_t(head_id) * qn_stride_head; + const T* qr = q_rope + size_t(token_id) * qr_stride_tok + size_t(head_id) * qr_stride_head; + uint8_t* qdst = q_out_fp8 + size_t(token_id) * qout_stride_tok_bytes + size_t(head_id) * qout_stride_head_bytes; + + for (int i = 0; i < Dn; ++i) { + qdst[i] = float_to_e4m3fn_byte(Vec2Traits::to_float(qn[i])); + } + + if (!is_neox) { + for (int i = 0; i < Dr; i += 2) { + int base = i >> 1; + float xr = Vec2Traits::to_float(qr[i + 0]); + float xi = (i + 1 < Dr) ? 
Vec2Traits::to_float(qr[i + 1]) : 0.0f; + rope_rotate(xr, xi, cos_ptr[base], sin_ptr[base], false); + qdst[Dn + i] = float_to_e4m3fn_byte(xr); + if (i + 1 < Dr) qdst[Dn + i + 1] = float_to_e4m3fn_byte(xi); } + } else { + int half = Dr / 2; + for (int i = 0; i < half; ++i) { + float xr = Vec2Traits::to_float(qr[i]); + float xi = Vec2Traits::to_float(qr[i + half]); + rope_rotate(xr, xi, cos_ptr[i], sin_ptr[i], true); + qdst[Dn + i] = float_to_e4m3fn_byte(xr); + qdst[Dn + i + half] = float_to_e4m3fn_byte(xi); + } + } + } - // ---- Quantize k & optional fused KV write ---- - // K is always 2D: [nnz_tokens, dim] - // CRITICAL FIX: Use actual stride(0) instead of assuming Dn/Dr (handles non-contiguous K) - const T* kn = k_nope + size_t(token_id) * kn_stride_tok; - const T* kr = k_rope + size_t(token_id) * kr_stride_tok; - - // Optional: write k_nope_out / k_rope_out (only once per token, not per head) - // Note: K outputs are 2D [nnz_tokens, dim], so only first head processes them - if (head_id == 0) { - if (k_nope_out_fp8) { - uint8_t* knd = k_nope_out_fp8 + size_t(token_id) * Dn; - for (int i = 0; i < Dn; ++i) { - knd[i] = float_to_e4m3fn_byte(Vec2Traits::to_float(kn[i])); - } - } - if (k_rope_out_fp8) { - uint8_t* krd = k_rope_out_fp8 + size_t(token_id) * Dr; - if (!is_neox) { - // GPT/interleaved style: pair (2k, 2k+1) uses cos[k] - for (int i = 0; i < Dr; i += 2) { - int base = i >> 1; // CRITICAL: pair index - float xr = Vec2Traits::to_float(kr[i + 0]); - float xi = 0.0f; - if (i + 1 < Dr) xi = Vec2Traits::to_float(kr[i + 1]); - float c = cos_ptr[base]; - float s = sin_ptr[base]; - rope_rotate(xr, xi, c, s, false); - krd[i + 0] = float_to_e4m3fn_byte(xr); - if (i + 1 < Dr) krd[i + 1] = float_to_e4m3fn_byte(xi); - } - } else { - // NeoX style: pairs (i, i+Dr/2) - int half = Dr / 2; - for (int i = 0; i < half; ++i) { - float xr = Vec2Traits::to_float(kr[i]); - float xi = Vec2Traits::to_float(kr[i + half]); - float c = cos_ptr[i]; - float s = sin_ptr[i]; - rope_rotate(xr, xi, c, s, true); - krd[i] = float_to_e4m3fn_byte(xr); - krd[i + half] = float_to_e4m3fn_byte(xi); - } - } - } - - // CRITICAL FIX: kv_cache_loc IS the flat row index (SGLang semantics) - // Do NOT use pos % page_size! That was causing KV to be written to wrong locations. 
- if (kv_buffer_bytes && kv_cache_loc) { - int64_t flat_row = kv_cache_loc[token_id]; // This IS the row index - uint8_t* dst = kv_buffer_bytes + flat_row * kv_stride_row_bytes; - // Write nope first - for (int i = 0; i < Dn; ++i) { - dst[i] = float_to_e4m3fn_byte(Vec2Traits::to_float(kn[i])); - } - // Then rope with rotation: handle GPT vs NeoX - if (!is_neox) { - // GPT/interleaved style: pair (2k, 2k+1) uses cos[k] - for (int i = 0; i < Dr; i += 2) { - int base = i >> 1; // CRITICAL: pair index - float xr = Vec2Traits::to_float(kr[i + 0]); - float xi = 0.0f; - if (i + 1 < Dr) xi = Vec2Traits::to_float(kr[i + 1]); - float c = cos_ptr[base]; - float s = sin_ptr[base]; - rope_rotate(xr, xi, c, s, false); - dst[Dn + i + 0] = float_to_e4m3fn_byte(xr); - if (i + 1 < Dr) dst[Dn + i + 1] = float_to_e4m3fn_byte(xi); - } - } else { - // NeoX style: pairs (i, i+Dr/2) - int half = Dr / 2; - for (int i = 0; i < half; ++i) { - float xr = Vec2Traits::to_float(kr[i]); - float xi = Vec2Traits::to_float(kr[i + half]); - float c = cos_ptr[i]; - float s = sin_ptr[i]; - rope_rotate(xr, xi, c, s, true); - dst[Dn + i] = float_to_e4m3fn_byte(xr); - dst[Dn + i + half] = float_to_e4m3fn_byte(xi); - } - } - } + const T* kn = k_nope + size_t(token_id) * kn_stride_tok; + const T* kr = k_rope + size_t(token_id) * kr_stride_tok; + + if (head_id == 0) { + if (k_nope_out_fp8) { + uint8_t* knd = k_nope_out_fp8 + size_t(token_id) * Dn; + for (int i = 0; i < Dn; ++i) { + knd[i] = float_to_e4m3fn_byte(Vec2Traits::to_float(kn[i])); } + } + if (k_rope_out_fp8) { + uint8_t* krd = k_rope_out_fp8 + size_t(token_id) * Dr; + if (!is_neox) { + for (int i = 0; i < Dr; i += 2) { + int base = i >> 1; + float xr = Vec2Traits::to_float(kr[i]); + float xi = (i + 1 < Dr) ? Vec2Traits::to_float(kr[i + 1]) : 0.0f; + rope_rotate(xr, xi, cos_ptr[base], sin_ptr[base], false); + krd[i] = float_to_e4m3fn_byte(xr); + if (i + 1 < Dr) krd[i + 1] = float_to_e4m3fn_byte(xi); + } + } else { + int half = Dr / 2; + for (int i = 0; i < half; ++i) { + float xr = Vec2Traits::to_float(kr[i]); + float xi = Vec2Traits::to_float(kr[i + half]); + rope_rotate(xr, xi, cos_ptr[i], sin_ptr[i], true); + krd[i] = float_to_e4m3fn_byte(xr); + krd[i + half] = float_to_e4m3fn_byte(xi); + } + } + } + + if (kv_buffer_bytes && kv_cache_loc) { + int64_t flat_row = kv_cache_loc[token_id]; + uint8_t* dst = kv_buffer_bytes + flat_row * kv_stride_row_bytes; + for (int i = 0; i < Dn; ++i) { + dst[i] = float_to_e4m3fn_byte(Vec2Traits::to_float(kn[i])); + } + if (!is_neox) { + for (int i = 0; i < Dr; i += 2) { + int base = i >> 1; + float xr = Vec2Traits::to_float(kr[i]); + float xi = (i + 1 < Dr) ? 
Vec2Traits::to_float(kr[i + 1]) : 0.0f; + rope_rotate(xr, xi, cos_ptr[base], sin_ptr[base], false); + dst[Dn + i] = float_to_e4m3fn_byte(xr); + if (i + 1 < Dr) dst[Dn + i + 1] = float_to_e4m3fn_byte(xi); + } + } else { + int half = Dr / 2; + for (int i = 0; i < half; ++i) { + float xr = Vec2Traits::to_float(kr[i]); + float xi = Vec2Traits::to_float(kr[i + half]); + rope_rotate(xr, xi, cos_ptr[i], sin_ptr[i], true); + dst[Dn + i] = float_to_e4m3fn_byte(xr); + dst[Dn + i + half] = float_to_e4m3fn_byte(xi); + } + } + } } + } } -} // namespace +} // namespace -// Python-exposed function -// q_nope, q_rope, k_nope, k_rope: half/bfloat16 (we treat as half here for demo) -// cos_sin_cache: float32 [max_seq, 2*Dr] -// pos_ids: int64 [nnz] -// q_out: uint8 [nnz, Dn+Dr] (stores E4M3 raw bytes) -// k_nope_out/k_rope_out: optional uint8 outputs (None allowed) -// kv_buffer: optional uint8 [(slots+page), 1, (Dn+Dr)] raw bytes -// kv_cache_loc: optional int64 [nnz] void mla_rope_quantize_fp8_fused( at::Tensor q_nope, at::Tensor q_rope, @@ -481,279 +375,310 @@ void mla_rope_quantize_fp8_fused( at::Tensor q_out, c10::optional k_nope_out, c10::optional k_rope_out, - // fused args c10::optional kv_buffer, - c10::optional kv_cache_loc -) { - CHECK_INPUT(q_nope); CHECK_INPUT(q_rope); CHECK_INPUT(k_nope); CHECK_INPUT(k_rope); - CHECK_INPUT(cos_sin_cache); CHECK_INPUT(pos_ids); CHECK_INPUT(q_out); - CHECK_SAME_DEVICE(q_nope, q_rope); CHECK_SAME_DEVICE(q_nope, k_nope); - CHECK_SAME_DEVICE(q_nope, cos_sin_cache); CHECK_SAME_DEVICE(q_nope, pos_ids); - CHECK_SAME_DEVICE(q_nope, q_out); - - // Q can be 2D or 3D: [nnz, dim] or [nnz, num_heads, dim] - // K must be 2D: [nnz, dim] - TORCH_CHECK(q_nope.dim() == 2 || q_nope.dim() == 3, "q_nope must be 2D or 3D"); - TORCH_CHECK(q_rope.dim() == 2 || q_rope.dim() == 3, "q_rope must be 2D or 3D"); - CHECK_DIM(2, k_nope); CHECK_DIM(2, k_rope); - CHECK_DIM(1, pos_ids); CHECK_DIM(2, cos_sin_cache); - - // Determine dimensions and strides based on Q shape - int nnz_tokens, num_heads, Dn, Dr; - int64_t qn_stride_tok, qn_stride_head, qr_stride_tok, qr_stride_head; - int64_t qout_stride_tok_bytes, qout_stride_head_bytes; - - if (q_nope.dim() == 3) { - // 3D Q: [nnz_tokens, num_heads, dim] - nnz_tokens = q_nope.size(0); - num_heads = q_nope.size(1); - Dn = q_nope.size(2); - Dr = q_rope.size(2); - - TORCH_CHECK(q_rope.size(0) == nnz_tokens && q_rope.size(1) == num_heads, "q_rope shape mismatch"); - TORCH_CHECK(q_out.dim() == 3 && q_out.size(0) == nnz_tokens && q_out.size(1) == num_heads && q_out.size(2) == (Dn + Dr), - "q_out must be [nnz, num_heads, Dn+Dr] when Q is 3D"); - - // Q strides in elements - qn_stride_tok = q_nope.stride(0); - qn_stride_head = q_nope.stride(1); - qr_stride_tok = q_rope.stride(0); - qr_stride_head = q_rope.stride(1); - - // q_out strides in BYTES (uint8) - qout_stride_tok_bytes = q_out.stride(0); - qout_stride_head_bytes = q_out.stride(1); - } else { - // 2D Q: [nnz_tokens, dim] (single head or flattened) - nnz_tokens = q_nope.size(0); - Dn = q_nope.size(1); - Dr = q_rope.size(1); - num_heads = 1; - - TORCH_CHECK(q_rope.size(0) == nnz_tokens, "q_rope vs q_nope mismatch"); - TORCH_CHECK(q_out.dim() == 2 && q_out.size(0) == nnz_tokens && q_out.size(1) == (Dn + Dr), - "q_out must be [nnz, Dn+Dr] when Q is 2D"); - - qn_stride_tok = q_nope.stride(0); - qn_stride_head = 0; - qr_stride_tok = q_rope.stride(0); - qr_stride_head = 0; - - qout_stride_tok_bytes = q_out.stride(0); - qout_stride_head_bytes = 0; - } - - int nnz_k = k_rope.size(0); - 
TORCH_CHECK(k_nope.size(0) == nnz_k && k_nope.size(1) == Dn, "k_nope shape mismatch"); - TORCH_CHECK(k_rope.size(0) == nnz_k && k_rope.size(1) == Dr, "k_rope shape mismatch"); - TORCH_CHECK(nnz_k == nnz_tokens, "K batch size must match Q token count"); - - // ===== K strides (CRITICAL FIX: use stride(0) to handle non-contiguous batches) ===== - // In multi-req concurrent scenarios, K may have stride(0) != dim due to slicing/gather - // We MUST use the actual stride(0) instead of assuming Dn/Dr for correct row addressing - int64_t kn_stride_tok = k_nope.stride(0); // elements per token - int64_t kr_stride_tok = k_rope.stride(0); // elements per token - - // ===== Robustness checks (expert suggestions) ===== - - // 1. K must be contiguous on last dim (or explicitly handle stride) - // For simplicity, we enforce contiguous K on dim=1 - if (k_nope.stride(1) != 1) { - TORCH_CHECK(false, "k_nope must be contiguous on last dim. Call .contiguous() before passing to kernel."); - } - if (k_rope.stride(1) != 1) { - TORCH_CHECK(false, "k_rope must be contiguous on last dim. Call .contiguous() before passing to kernel."); - } - - // 2. Q last dim must be contiguous (vectorized kernel assumes this) - int q_last_dim = q_nope.dim() - 1; - TORCH_CHECK(q_nope.stride(q_last_dim) == 1, "q_nope last dim must be contiguous"); - TORCH_CHECK(q_rope.stride(q_last_dim) == 1, "q_rope last dim must be contiguous"); - - // 3. q_out last dim must be contiguous - int qout_last_dim = q_out.dim() - 1; - TORCH_CHECK(q_out.stride(qout_last_dim) == 1, "q_out last dim must be contiguous"); - - // ================================================== - - uint8_t* k_nope_out_ptr = nullptr; - uint8_t* k_rope_out_ptr = nullptr; - if (k_nope_out.has_value()) { - auto t = k_nope_out.value(); - CHECK_INPUT(t); CHECK_DIM(2, t); - TORCH_CHECK(t.size(0) == nnz_k && t.size(1) == Dn, "k_nope_out shape mismatch"); - // Accept FP8 tensor, reinterpret as uint8* - k_nope_out_ptr = reinterpret_cast(t.data_ptr()); - } - if (k_rope_out.has_value()) { - auto t = k_rope_out.value(); - CHECK_INPUT(t); CHECK_DIM(2, t); - TORCH_CHECK(t.size(0) == nnz_k && t.size(1) == Dr, "k_rope_out shape mismatch"); - // Accept FP8 tensor, reinterpret as uint8* - k_rope_out_ptr = reinterpret_cast(t.data_ptr()); + c10::optional kv_cache_loc) { + CHECK_INPUT(q_nope); + CHECK_INPUT(q_rope); + CHECK_INPUT(k_nope); + CHECK_INPUT(k_rope); + CHECK_INPUT(cos_sin_cache); + CHECK_INPUT(pos_ids); + CHECK_INPUT(q_out); + + auto device = q_nope.device(); + CHECK_EQ(q_rope.device(), device); + CHECK_EQ(k_nope.device(), device); + CHECK_EQ(k_rope.device(), device); + CHECK_EQ(cos_sin_cache.device(), device); + CHECK_EQ(pos_ids.device(), device); + CHECK_EQ(q_out.device(), device); + + TORCH_CHECK(q_nope.dim() == 2 || q_nope.dim() == 3, "q_nope must be 2D or 3D"); + TORCH_CHECK(q_rope.dim() == 2 || q_rope.dim() == 3, "q_rope must be 2D or 3D"); + CHECK_DIM(2, k_nope); + CHECK_DIM(2, k_rope); + CHECK_DIM(1, pos_ids); + CHECK_DIM(2, cos_sin_cache); + + // Determine dimensions and strides based on Q shape + int nnz_tokens, num_heads, Dn, Dr; + int64_t qn_stride_tok, qn_stride_head, qr_stride_tok, qr_stride_head; + int64_t qout_stride_tok_bytes, qout_stride_head_bytes; + + if (q_nope.dim() == 3) { + nnz_tokens = q_nope.size(0); + num_heads = q_nope.size(1); + Dn = q_nope.size(2); + Dr = q_rope.size(2); + + CHECK_EQ(q_rope.size(0), nnz_tokens); + CHECK_EQ(q_rope.size(1), num_heads); + CHECK_EQ(q_out.dim(), 3); + CHECK_EQ(q_out.size(0), nnz_tokens); + CHECK_EQ(q_out.size(1), num_heads); + 
CHECK_EQ(q_out.size(2), Dn + Dr); + + qn_stride_tok = q_nope.stride(0); + qn_stride_head = q_nope.stride(1); + qr_stride_tok = q_rope.stride(0); + qr_stride_head = q_rope.stride(1); + qout_stride_tok_bytes = q_out.stride(0); + qout_stride_head_bytes = q_out.stride(1); + } else { + nnz_tokens = q_nope.size(0); + Dn = q_nope.size(1); + Dr = q_rope.size(1); + num_heads = 1; + + CHECK_EQ(q_rope.size(0), nnz_tokens); + CHECK_EQ(q_out.dim(), 2); + CHECK_EQ(q_out.size(0), nnz_tokens); + CHECK_EQ(q_out.size(1), Dn + Dr); + + qn_stride_tok = q_nope.stride(0); + qn_stride_head = 0; + qr_stride_tok = q_rope.stride(0); + qr_stride_head = 0; + qout_stride_tok_bytes = q_out.stride(0); + qout_stride_head_bytes = 0; + } + + int nnz_k = k_rope.size(0); + CHECK_EQ(k_nope.size(0), nnz_k); + CHECK_EQ(k_nope.size(1), Dn); + CHECK_EQ(k_rope.size(0), nnz_k); + CHECK_EQ(k_rope.size(1), Dr); + CHECK_EQ(nnz_k, nnz_tokens); + + int64_t kn_stride_tok = k_nope.stride(0); + int64_t kr_stride_tok = k_rope.stride(0); + + CHECK_LAST_DIM_CONTIGUOUS(k_nope); + CHECK_LAST_DIM_CONTIGUOUS(k_rope); + CHECK_LAST_DIM_CONTIGUOUS(q_nope); + CHECK_LAST_DIM_CONTIGUOUS(q_rope); + CHECK_LAST_DIM_CONTIGUOUS(q_out); + + uint8_t* k_nope_out_ptr = nullptr; + uint8_t* k_rope_out_ptr = nullptr; + if (k_nope_out.has_value()) { + auto t = k_nope_out.value(); + CHECK_INPUT(t); + CHECK_DIM(2, t); + CHECK_EQ(t.size(0), nnz_k); + CHECK_EQ(t.size(1), Dn); + k_nope_out_ptr = reinterpret_cast(t.data_ptr()); + } + if (k_rope_out.has_value()) { + auto t = k_rope_out.value(); + CHECK_INPUT(t); + CHECK_DIM(2, t); + CHECK_EQ(t.size(0), nnz_k); + CHECK_EQ(t.size(1), Dr); + k_rope_out_ptr = reinterpret_cast(t.data_ptr()); + } + + uint8_t* kv_buf_ptr = nullptr; + int64_t kv_stride_row_bytes = 0; + const int64_t* kv_loc_ptr = nullptr; + if (kv_buffer.has_value() || kv_cache_loc.has_value()) { + TORCH_CHECK(kv_buffer.has_value() && kv_cache_loc.has_value(), "kv_buffer and kv_cache_loc must be both provided"); + auto kv = kv_buffer.value(); + auto loc = kv_cache_loc.value(); + CHECK_INPUT(kv); + CHECK_INPUT(loc); + CHECK_DIM(1, loc); + + TORCH_CHECK(kv.dim() == 2 || (kv.dim() == 3 && kv.size(1) == 1), "kv_buffer must be 2D or 3D with middle dim=1"); + + int kv_dim_actual = (kv.dim() == 3) ? kv.size(2) : kv.size(1); + CHECK_EQ(kv_dim_actual, Dn + Dr); + CHECK_EQ(loc.size(0), nnz_k); + CHECK_LAST_DIM_CONTIGUOUS(kv); + + kv_buf_ptr = reinterpret_cast(kv.data_ptr()); + kv_stride_row_bytes = kv.stride(0) * kv.element_size(); + kv_loc_ptr = loc.data_ptr(); + } + + const float* cs_ptr = cos_sin_cache.data_ptr(); + const int64_t* pos_ptr = pos_ids.data_ptr(); + uint8_t* q_out_ptr = reinterpret_cast(q_out.data_ptr()); + + cudaStream_t stream = c10::cuda::getCurrentCUDAStream().stream(); + int total_rows = nnz_tokens * num_heads; + + bool can_vectorize = ((Dn & 3) == 0) && ((Dr & 3) == 0) && !is_neox; + if (can_vectorize) { + bool strides_aligned = + (qout_stride_tok_bytes % 4 == 0) && (num_heads > 1 ? 
(qout_stride_head_bytes % 4 == 0) : true); + if (!strides_aligned) { + can_vectorize = false; } + } - uint8_t* kv_buf_ptr = nullptr; - int64_t kv_stride_row_bytes = 0; - const int64_t* kv_loc_ptr = nullptr; - if (kv_buffer.has_value() || kv_cache_loc.has_value()) { - TORCH_CHECK(kv_buffer.has_value() && kv_cache_loc.has_value(), - "kv_buffer and kv_cache_loc must be both provided for fused write"); - auto kv = kv_buffer.value(); - auto loc = kv_cache_loc.value(); - CHECK_INPUT(kv); CHECK_INPUT(loc); - CHECK_DIM(1, loc); - - // CRITICAL FIX: Support 2D buffer [total_rows, kv_dim] (SGLang semantics) - // out_cache_loc is flat row index, NOT block ID - TORCH_CHECK(kv.dim() == 2, "kv_buffer must be 2D [total_rows, kv_dim]"); - TORCH_CHECK(kv.size(1) == (Dn + Dr), "kv_buffer last dim must be Dn+Dr"); - TORCH_CHECK(loc.size(0) == nnz_k, "kv_cache_loc size must match K batch size"); - - // CRITICAL: Check contiguity on last dim to avoid silent errors - TORCH_CHECK(kv.stride(1) == 1, "kv_buffer last dim must be contiguous (stride=1)"); - - // Accept FP8 tensor, reinterpret as uint8* - kv_buf_ptr = reinterpret_cast(kv.data_ptr()); - - // 2D KV buffer layout: [total_rows, kv_dim] - // stride(0): row stride in elements, convert to bytes - int elem_size = kv.element_size(); - kv_stride_row_bytes = kv.stride(0) * elem_size; - kv_loc_ptr = loc.data_ptr(); - } + auto dtype = q_nope.scalar_type(); + + if (dtype == at::kHalf) { + const __half* qn_ptr = reinterpret_cast(q_nope.data_ptr()); + const __half* qr_ptr = reinterpret_cast(q_rope.data_ptr()); + const __half* kn_ptr = reinterpret_cast(k_nope.data_ptr()); + const __half* kr_ptr = reinterpret_cast(k_rope.data_ptr()); - // Get common pointers - const float* cs_ptr = cos_sin_cache.data_ptr(); - const int64_t* pos_ptr = pos_ids.data_ptr(); - // Accept FP8 tensor for q_out, reinterpret as uint8* - uint8_t* q_out_ptr = reinterpret_cast(q_out.data_ptr()); - - // Get current CUDA stream (compatible with PyTorch 2.x) - cudaStream_t stream = c10::cuda::getCurrentCUDAStream().stream(); - - // Total number of work items: nnz_tokens * num_heads - int total_rows = nnz_tokens * num_heads; - - // Dispatch: use vectorized kernel if dimensions are 4-byte aligned - // Vectorized path requires (Dn+Dr) % 4 == 0 for uint32_t writes - // CRITICAL: Disable vectorization for NeoX (pairs are (i, i+Dr/2), not adjacent) - bool can_vectorize = ((Dn & 3) == 0) && ((Dr & 3) == 0) && !is_neox; - if (can_vectorize) { - // Additional check: q_out strides must be 4-byte aligned for vectorized writes - // Since q_out is uint8 and we write uint32_t, strides should be multiples of 4 - bool strides_aligned = (qout_stride_tok_bytes % 4 == 0) && - (num_heads > 1 ? (qout_stride_head_bytes % 4 == 0) : true); - if (!strides_aligned) { - // Fallback to scalar kernel if strides are misaligned - can_vectorize = false; - } - } - - // ===== Dtype dispatch: support both FP16 and BF16 ===== - auto dtype = q_nope.scalar_type(); - - // DEBUG: Print which kernel path we're using - const char* debug_path = std::getenv("SGL_DEBUG_KERNEL_PATH"); - if (debug_path && std::string(debug_path) == "1") { - printf("[KERNEL DEBUG] can_vectorize=%d, dtype=%s, Dn=%d, Dr=%d, is_neox=%d\n", - can_vectorize, - dtype == at::kHalf ? "FP16" : (dtype == at::kBFloat16 ? 
"BF16" : "OTHER"), - Dn, Dr, is_neox); + constexpr int WARPS_PER_CTA = 4; + dim3 vecBlock(WARPS_PER_CTA * 32); + dim3 vecGrid((total_rows + WARPS_PER_CTA - 1) / WARPS_PER_CTA); + + FusedRopeQuantizeKernelVec<<>>( + qn_ptr, + qr_ptr, + qn_stride_tok, + qn_stride_head, + qr_stride_tok, + qr_stride_head, + kn_ptr, + kr_ptr, + kn_stride_tok, + kr_stride_tok, + cs_ptr, + pos_ptr, + nnz_tokens, + num_heads, + Dn, + Dr, + is_neox, + q_out_ptr, + qout_stride_tok_bytes, + qout_stride_head_bytes, + k_nope_out_ptr, + k_rope_out_ptr, + kv_buf_ptr, + kv_stride_row_bytes, + kv_loc_ptr); + } else { + constexpr int BLOCK_THREADS = 256; + dim3 grid((total_rows + BLOCK_THREADS - 1) / BLOCK_THREADS); + + FusedRopeQuantizeKernelScalar<<>>( + qn_ptr, + qr_ptr, + qn_stride_tok, + qn_stride_head, + qr_stride_tok, + qr_stride_head, + kn_ptr, + kr_ptr, + kn_stride_tok, + kr_stride_tok, + cs_ptr, + pos_ptr, + nnz_tokens, + num_heads, + Dn, + Dr, + is_neox, + q_out_ptr, + qout_stride_tok_bytes, + qout_stride_head_bytes, + k_nope_out_ptr, + k_rope_out_ptr, + kv_buf_ptr, + kv_stride_row_bytes, + kv_loc_ptr); } - - if (dtype == at::kHalf) { - // FP16 path - const __half* qn_ptr = reinterpret_cast(q_nope.data_ptr()); - const __half* qr_ptr = reinterpret_cast(q_rope.data_ptr()); - const __half* kn_ptr = reinterpret_cast(k_nope.data_ptr()); - const __half* kr_ptr = reinterpret_cast(k_rope.data_ptr()); - - if (can_vectorize) { - constexpr int WARPS_PER_CTA = 4; - dim3 vecBlock(WARPS_PER_CTA * 32); - dim3 vecGrid((total_rows + WARPS_PER_CTA - 1) / WARPS_PER_CTA); - - FusedRopeQuantizeKernelVec<<>>( - qn_ptr, qr_ptr, qn_stride_tok, qn_stride_head, qr_stride_tok, qr_stride_head, - kn_ptr, kr_ptr, kn_stride_tok, kr_stride_tok, - cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, - q_out_ptr, qout_stride_tok_bytes, qout_stride_head_bytes, - k_nope_out_ptr, k_rope_out_ptr, - kv_buf_ptr, kv_stride_row_bytes, kv_loc_ptr - ); - } else { - constexpr int BLOCK_THREADS = 256; - dim3 grid((total_rows + BLOCK_THREADS - 1) / BLOCK_THREADS); - - FusedRopeQuantizeKernelScalar<<>>( - qn_ptr, qr_ptr, qn_stride_tok, qn_stride_head, qr_stride_tok, qr_stride_head, - kn_ptr, kr_ptr, kn_stride_tok, kr_stride_tok, - cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, - q_out_ptr, qout_stride_tok_bytes, qout_stride_head_bytes, - k_nope_out_ptr, k_rope_out_ptr, - kv_buf_ptr, kv_stride_row_bytes, kv_loc_ptr - ); - } - } else if (dtype == at::kBFloat16) { - // BF16 path - const nv_bfloat16* qn_ptr = reinterpret_cast(q_nope.data_ptr()); - const nv_bfloat16* qr_ptr = reinterpret_cast(q_rope.data_ptr()); - const nv_bfloat16* kn_ptr = reinterpret_cast(k_nope.data_ptr()); - const nv_bfloat16* kr_ptr = reinterpret_cast(k_rope.data_ptr()); - - if (can_vectorize) { - constexpr int WARPS_PER_CTA = 4; - dim3 vecBlock(WARPS_PER_CTA * 32); - dim3 vecGrid((total_rows + WARPS_PER_CTA - 1) / WARPS_PER_CTA); - - FusedRopeQuantizeKernelVec<<>>( - qn_ptr, qr_ptr, qn_stride_tok, qn_stride_head, qr_stride_tok, qr_stride_head, - kn_ptr, kr_ptr, kn_stride_tok, kr_stride_tok, - cs_ptr, pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, - q_out_ptr, qout_stride_tok_bytes, qout_stride_head_bytes, - k_nope_out_ptr, k_rope_out_ptr, - kv_buf_ptr, kv_stride_row_bytes, kv_loc_ptr - ); - } else { - constexpr int BLOCK_THREADS = 256; - dim3 grid((total_rows + BLOCK_THREADS - 1) / BLOCK_THREADS); - - FusedRopeQuantizeKernelScalar<<>>( - qn_ptr, qr_ptr, qn_stride_tok, qn_stride_head, qr_stride_tok, qr_stride_head, - kn_ptr, kr_ptr, kn_stride_tok, kr_stride_tok, - cs_ptr, 
pos_ptr, nnz_tokens, num_heads, Dn, Dr, is_neox, - q_out_ptr, qout_stride_tok_bytes, qout_stride_head_bytes, - k_nope_out_ptr, k_rope_out_ptr, - kv_buf_ptr, kv_stride_row_bytes, kv_loc_ptr - ); - } + } else if (dtype == at::kBFloat16) { + const nv_bfloat16* qn_ptr = reinterpret_cast(q_nope.data_ptr()); + const nv_bfloat16* qr_ptr = reinterpret_cast(q_rope.data_ptr()); + const nv_bfloat16* kn_ptr = reinterpret_cast(k_nope.data_ptr()); + const nv_bfloat16* kr_ptr = reinterpret_cast(k_rope.data_ptr()); + + if (can_vectorize) { + constexpr int WARPS_PER_CTA = 4; + dim3 vecBlock(WARPS_PER_CTA * 32); + dim3 vecGrid((total_rows + WARPS_PER_CTA - 1) / WARPS_PER_CTA); + + FusedRopeQuantizeKernelVec<<>>( + qn_ptr, + qr_ptr, + qn_stride_tok, + qn_stride_head, + qr_stride_tok, + qr_stride_head, + kn_ptr, + kr_ptr, + kn_stride_tok, + kr_stride_tok, + cs_ptr, + pos_ptr, + nnz_tokens, + num_heads, + Dn, + Dr, + is_neox, + q_out_ptr, + qout_stride_tok_bytes, + qout_stride_head_bytes, + k_nope_out_ptr, + k_rope_out_ptr, + kv_buf_ptr, + kv_stride_row_bytes, + kv_loc_ptr); } else { - TORCH_CHECK(false, "Unsupported dtype for fused kernel. Only FP16 and BF16 are supported."); + constexpr int BLOCK_THREADS = 256; + dim3 grid((total_rows + BLOCK_THREADS - 1) / BLOCK_THREADS); + + FusedRopeQuantizeKernelScalar<<>>( + qn_ptr, + qr_ptr, + qn_stride_tok, + qn_stride_head, + qr_stride_tok, + qr_stride_head, + kn_ptr, + kr_ptr, + kn_stride_tok, + kr_stride_tok, + cs_ptr, + pos_ptr, + nnz_tokens, + num_heads, + Dn, + Dr, + is_neox, + q_out_ptr, + qout_stride_tok_bytes, + qout_stride_head_bytes, + k_nope_out_ptr, + k_rope_out_ptr, + kv_buf_ptr, + kv_stride_row_bytes, + kv_loc_ptr); } - - TORCH_CHECK(cudaGetLastError() == cudaSuccess, "Kernel launch failed"); + } else { + TORCH_CHECK(false, "Unsupported dtype for fused kernel. 
Only FP16 and BF16 are supported."); + } + + TORCH_CHECK(cudaGetLastError() == cudaSuccess, "Kernel launch failed"); } -// PyBind11 module definition (ONLY for standalone build) -// When building as part of sgl_kernel, this is handled by common_extension.cc -// TORCH_EXTENSION_NAME is only defined by torch.utils.cpp_extension (standalone) #ifdef TORCH_EXTENSION_NAME PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("mla_rope_quantize_fp8_fused", &mla_rope_quantize_fp8_fused, - "Fused MLA RoPE + FP8 quantization with optional direct KV write", - py::arg("q_nope"), - py::arg("q_rope"), - py::arg("k_nope"), - py::arg("k_rope"), - py::arg("cos_sin_cache"), - py::arg("pos_ids"), - py::arg("is_neox"), - py::arg("q_out"), - py::arg("k_nope_out") = py::none(), - py::arg("k_rope_out") = py::none(), - py::arg("kv_buffer") = py::none(), - py::arg("kv_cache_loc") = py::none()); + m.def( + "mla_rope_quantize_fp8_fused", + &mla_rope_quantize_fp8_fused, + "Fused MLA RoPE + FP8 quantization with optional KV cache write", + py::arg("q_nope"), + py::arg("q_rope"), + py::arg("k_nope"), + py::arg("k_rope"), + py::arg("cos_sin_cache"), + py::arg("pos_ids"), + py::arg("is_neox"), + py::arg("q_out"), + py::arg("k_nope_out") = py::none(), + py::arg("k_rope_out") = py::none(), + py::arg("kv_buffer") = py::none(), + py::arg("kv_cache_loc") = py::none()); } #endif diff --git a/sgl-kernel/tests/test_mla_rope_fp8_fused.py b/sgl-kernel/tests/test_mla_rope_fp8_fused.py index 876c365ce3a..d6689dff186 100644 --- a/sgl-kernel/tests/test_mla_rope_fp8_fused.py +++ b/sgl-kernel/tests/test_mla_rope_fp8_fused.py @@ -15,11 +15,13 @@ try: # Try standalone build first from mla_fusion_kernel import mla_rope_quantize_fp8_fused + _has_sgl_kernel = True except ImportError: # Fallback to sgl_kernel try: from sgl_kernel import mla_rope_quantize_fp8_fused + _has_sgl_kernel = True except ImportError: mla_rope_quantize_fp8_fused = None # Will use non-fused path @@ -66,9 +68,7 @@ def test_fused_matches_baseline(nnz, num_heads, Dn, Dr, dtype): # Create cos/sin cache max_seq = max(2048, nnz) - t = torch.linspace(0, 1, steps=max_seq, device=device, dtype=torch.float32)[ - :, None - ] + t = torch.linspace(0, 1, steps=max_seq, device=device, dtype=torch.float32)[:, None] idx = torch.arange(Dr, device=device, dtype=torch.float32)[None, :] freqs = 0.1 * (idx + 1.0) # Small frequencies to avoid overflow cos = torch.cos(t * freqs) @@ -173,9 +173,7 @@ def test_baseline_only_path(nnz, Dn, Dr): # Create cos/sin cache max_seq = 2048 - t = torch.linspace(0, 1, steps=max_seq, device=device, dtype=torch.float32)[ - :, None - ] + t = torch.linspace(0, 1, steps=max_seq, device=device, dtype=torch.float32)[:, None] idx = torch.arange(Dr, device=device, dtype=torch.float32)[None, :] freqs = 0.1 * (idx + 1.0) cos = torch.cos(t * freqs) @@ -234,9 +232,7 @@ def test_fused_only_path(): # Create cos/sin cache max_seq = 2048 - t = torch.linspace(0, 1, steps=max_seq, device=device, dtype=torch.float32)[ - :, None - ] + t = torch.linspace(0, 1, steps=max_seq, device=device, dtype=torch.float32)[:, None] idx = torch.arange(Dr, device=device, dtype=torch.float32)[None, :] freqs = 0.1 * (idx + 1.0) cos = torch.cos(t * freqs) @@ -250,7 +246,9 @@ def test_fused_only_path(): # Allocate only Q output and KV buffer q_out = torch.empty(nnz, Dn + Dr, device=device, dtype=torch.uint8) slots = nnz + 16 - kv_buffer = torch.zeros(slots, Dn + Dr, device=device, dtype=torch.uint8) # 2D format + kv_buffer = torch.zeros( + slots, Dn + Dr, device=device, dtype=torch.uint8 + ) # 2D 
format loc = torch.arange(nnz, device=device, dtype=torch.long) # Call kernel (fused path only) @@ -279,5 +277,3 @@ def test_fused_only_path(): if __name__ == "__main__": pytest.main([__file__, "-v"]) - - From 3fefb6b57bdbfe65564b6bbf6c0cbb733e9afc48 Mon Sep 17 00:00:00 2001 From: Yangmin Li Date: Sun, 2 Nov 2025 09:05:35 -0800 Subject: [PATCH 08/10] code cleaning --- .../layers/attention/trtllm_mla_backend.py | 12 ++++------ sgl-kernel/build_minimal.sh | 24 ------------------- .../csrc/elementwise/mla_rope_fp8_kv_fused.cu | 22 ++++++++--------- 3 files changed, 16 insertions(+), 42 deletions(-) delete mode 100755 sgl-kernel/build_minimal.sh diff --git a/python/sglang/srt/layers/attention/trtllm_mla_backend.py b/python/sglang/srt/layers/attention/trtllm_mla_backend.py index 0731b06f327..7c9c085c860 100755 --- a/python/sglang/srt/layers/attention/trtllm_mla_backend.py +++ b/python/sglang/srt/layers/attention/trtllm_mla_backend.py @@ -971,24 +971,23 @@ def forward_extend( cos_sin_cache: Optional[torch.Tensor] = None, is_neox: Optional[bool] = False, ) -> torch.Tensor: - # TODO refactor to avoid code duplication for forward_decode + # TODO refactor to avoid code duplication merge_query = q_rope is not None fused_kv = False # Track if we used fused KV write path - # TODO: Check if the condition restrictions (target_verify/draft_extend only) are necessary - # Consider if we can enable FP8 quantize_and_rope for all extend paths safely - # For FP8 path in target_verify or draft_extend mode, use quantize_and_rope_for_fp8 use_fp8_quantize = ( self.data_type == torch.float8_e4m3fn and ( forward_batch.forward_mode.is_target_verify() or forward_batch.forward_mode.is_draft_extend(include_v2=True) ) - and all(x is not None for x in [q_rope, k_rope, cos_sin_cache]) ) if use_fp8_quantize: - # FP8 path: quantize and apply RoPE with optional fused KV write + assert all( + x is not None for x in [q_rope, k_rope, cos_sin_cache] + ), "For FP8 path in target_verify/draft_extend, need all of q_rope, k_rope and cos_sin_cache to be not None." + q, k_fp8_nope, k_fp8_rope = self.quantize_and_rope_for_fp8( q, q_rope, @@ -1003,7 +1002,6 @@ def forward_extend( merge_query = False # Fused path: K already written to KV cache, function returns None for K outputs fused_kv = k_fp8_nope is None and k_fp8_rope is None - # Update local variables for subsequent logic k = k_fp8_nope k_rope = k_fp8_rope diff --git a/sgl-kernel/build_minimal.sh b/sgl-kernel/build_minimal.sh deleted file mode 100755 index a56d09398b7..00000000000 --- a/sgl-kernel/build_minimal.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -# Minimal build script for sgl-kernel (only compile needed architectures) - -# Clean previous build artifacts -rm -rf _skbuild/ build/ *.egg-info - -# Configure build options -export CMAKE_BUILD_PARALLEL_LEVEL=2 # Reduce parallelism to avoid OOM -export SGL_KERNEL_COMPILE_THREADS=4 # NVCC thread count - -# Only compile for B200 (SM100) architecture, skip others -# B200 is compute_100, no need for SM80/89/90 -export ENABLE_BELOW_SM90=OFF # Skip older architectures (A100/V100 etc.) -export SGL_KERNEL_ENABLE_FA3=OFF # Skip Flash-Attention 3 (SM90a) -export SGL_KERNEL_ENABLE_SM90A=OFF # Skip SM90A - -# Only keep SM100 -export TORCH_CUDA_ARCH_LIST="10.0" - -# Start compilation -pip install -e . --no-build-isolation -v 2>&1 | tee build.log - -echo "" -echo "Build completed! Check build.log for details." 
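Note on the hunk that follows: it drops the unused boolean argument from rope_rotate, since the NeoX versus interleaved distinction is already encoded in how the callers pair elements and index cos_ptr/sin_ptr; the rotation math is identical in both branches. As a point of reference only (this sketch is not part of the patch, and the way cos/sin are gathered from cos_sin_cache is an assumption of the sketch), the two pairings can be emulated in PyTorch like this:

import torch

def rope_rotate_ref(xr, xi, c, s):
    # Same math as the device helper: (xr, xi) -> (xr*c - xi*s, xr*s + xi*c).
    return xr * c - xi * s, xr * s + xi * c

def apply_rope_ref(x, cos, sin, is_neox):
    # x: [nnz, Dr] float; cos/sin: [nnz, Dr // 2] float, already gathered per token
    # from the position ids (how they are sliced out of cos_sin_cache is an
    # assumption of this sketch, not taken from the kernel source).
    half = x.shape[-1] // 2
    out = torch.empty_like(x)
    if is_neox:
        # Half-split pairing: element i rotates with element i + Dr/2, driven by
        # cos/sin index i (the cos_ptr[i] / sin_ptr[i] branch in the scalar kernel).
        re, im = rope_rotate_ref(x[:, :half], x[:, half:], cos, sin)
        out[:, :half], out[:, half:] = re, im
    else:
        # Interleaved pairing: elements (2k, 2k+1) rotate together, driven by
        # cos/sin index k (the base = i >> 1 branch in the scalar kernel).
        re, im = rope_rotate_ref(x[:, 0::2], x[:, 1::2], cos, sin)
        out[:, 0::2], out[:, 1::2] = re, im
    return out

def quantize_e4m3_ref(x):
    # The kernel stores each rotated value as one FP8 E4M3 byte; casting through
    # torch.float8_e4m3fn gives a reference for checking the uint8 outputs.
    return x.to(torch.float8_e4m3fn).view(torch.uint8)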
diff --git a/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu b/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu index 8e29da5984d..845397bb741 100644 --- a/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu +++ b/sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu @@ -89,7 +89,7 @@ __device__ inline uint32_t pack4(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3) return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) | ((uint32_t)a3 << 24); } -__device__ inline void rope_rotate(float& xr, float& xi, float c, float s, bool) { +__device__ inline void rope_rotate(float& xr, float& xi, float c, float s) { float xr_new = xr * c - xi * s; float xi_new = xr * s + xi * c; xr = xr_new; @@ -178,8 +178,8 @@ __global__ void FusedRopeQuantizeKernelVec( float c0 = cos_ptr[base0], s0 = sin_ptr[base0]; float c1 = cos_ptr[base1], s1 = sin_ptr[base1]; - rope_rotate(f0.x, f0.y, c0, s0, false); - rope_rotate(f1.x, f1.y, c1, s1, false); + rope_rotate(f0.x, f0.y, c0, s0); + rope_rotate(f1.x, f1.y, c1, s1); uint32_t packed = pack4( float_to_e4m3fn_byte(f0.x), float_to_e4m3fn_byte(f0.y), float_to_e4m3fn_byte(f1.x), float_to_e4m3fn_byte(f1.y)); @@ -214,8 +214,8 @@ __global__ void FusedRopeQuantizeKernelVec( float c0 = cos_ptr[base0], s0 = sin_ptr[base0]; float c1 = cos_ptr[base1], s1 = sin_ptr[base1]; - rope_rotate(f0.x, f0.y, c0, s0, false); - rope_rotate(f1.x, f1.y, c1, s1, false); + rope_rotate(f0.x, f0.y, c0, s0); + rope_rotate(f1.x, f1.y, c1, s1); uint32_t packed = pack4( float_to_e4m3fn_byte(f0.x), @@ -283,7 +283,7 @@ __global__ void FusedRopeQuantizeKernelScalar( int base = i >> 1; float xr = Vec2Traits::to_float(qr[i + 0]); float xi = (i + 1 < Dr) ? Vec2Traits::to_float(qr[i + 1]) : 0.0f; - rope_rotate(xr, xi, cos_ptr[base], sin_ptr[base], false); + rope_rotate(xr, xi, cos_ptr[base], sin_ptr[base]); qdst[Dn + i] = float_to_e4m3fn_byte(xr); if (i + 1 < Dr) qdst[Dn + i + 1] = float_to_e4m3fn_byte(xi); } @@ -292,7 +292,7 @@ __global__ void FusedRopeQuantizeKernelScalar( for (int i = 0; i < half; ++i) { float xr = Vec2Traits::to_float(qr[i]); float xi = Vec2Traits::to_float(qr[i + half]); - rope_rotate(xr, xi, cos_ptr[i], sin_ptr[i], true); + rope_rotate(xr, xi, cos_ptr[i], sin_ptr[i]); qdst[Dn + i] = float_to_e4m3fn_byte(xr); qdst[Dn + i + half] = float_to_e4m3fn_byte(xi); } @@ -316,7 +316,7 @@ __global__ void FusedRopeQuantizeKernelScalar( int base = i >> 1; float xr = Vec2Traits::to_float(kr[i]); float xi = (i + 1 < Dr) ? Vec2Traits::to_float(kr[i + 1]) : 0.0f; - rope_rotate(xr, xi, cos_ptr[base], sin_ptr[base], false); + rope_rotate(xr, xi, cos_ptr[base], sin_ptr[base]); krd[i] = float_to_e4m3fn_byte(xr); if (i + 1 < Dr) krd[i + 1] = float_to_e4m3fn_byte(xi); } @@ -325,7 +325,7 @@ __global__ void FusedRopeQuantizeKernelScalar( for (int i = 0; i < half; ++i) { float xr = Vec2Traits::to_float(kr[i]); float xi = Vec2Traits::to_float(kr[i + half]); - rope_rotate(xr, xi, cos_ptr[i], sin_ptr[i], true); + rope_rotate(xr, xi, cos_ptr[i], sin_ptr[i]); krd[i] = float_to_e4m3fn_byte(xr); krd[i + half] = float_to_e4m3fn_byte(xi); } @@ -343,7 +343,7 @@ __global__ void FusedRopeQuantizeKernelScalar( int base = i >> 1; float xr = Vec2Traits::to_float(kr[i]); float xi = (i + 1 < Dr) ? 
Vec2Traits::to_float(kr[i + 1]) : 0.0f; - rope_rotate(xr, xi, cos_ptr[base], sin_ptr[base], false); + rope_rotate(xr, xi, cos_ptr[base], sin_ptr[base]); dst[Dn + i] = float_to_e4m3fn_byte(xr); if (i + 1 < Dr) dst[Dn + i + 1] = float_to_e4m3fn_byte(xi); } @@ -352,7 +352,7 @@ __global__ void FusedRopeQuantizeKernelScalar( for (int i = 0; i < half; ++i) { float xr = Vec2Traits::to_float(kr[i]); float xi = Vec2Traits::to_float(kr[i + half]); - rope_rotate(xr, xi, cos_ptr[i], sin_ptr[i], true); + rope_rotate(xr, xi, cos_ptr[i], sin_ptr[i]); dst[Dn + i] = float_to_e4m3fn_byte(xr); dst[Dn + i + half] = float_to_e4m3fn_byte(xi); } From 633003a1586da898e80288ee6e16d6d3d491688a Mon Sep 17 00:00:00 2001 From: Yangmin Li Date: Sun, 2 Nov 2025 10:57:35 -0800 Subject: [PATCH 09/10] small fix --- python/sglang/srt/layers/attention/trtllm_mla_backend.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/sglang/srt/layers/attention/trtllm_mla_backend.py b/python/sglang/srt/layers/attention/trtllm_mla_backend.py index 7c9c085c860..e1a77adea6d 100755 --- a/python/sglang/srt/layers/attention/trtllm_mla_backend.py +++ b/python/sglang/srt/layers/attention/trtllm_mla_backend.py @@ -975,19 +975,19 @@ def forward_extend( merge_query = q_rope is not None fused_kv = False # Track if we used fused KV write path + # FP8 quantize path: only enable when all required inputs are present + # If any parameter is None, gracefully fall back to standard path + # TODO: Verify removing explicit assert doesn't mask bugs where params should be present but are None use_fp8_quantize = ( self.data_type == torch.float8_e4m3fn and ( forward_batch.forward_mode.is_target_verify() or forward_batch.forward_mode.is_draft_extend(include_v2=True) ) + and all(x is not None for x in [q_rope, k_rope, cos_sin_cache]) ) if use_fp8_quantize: - assert all( - x is not None for x in [q_rope, k_rope, cos_sin_cache] - ), "For FP8 path in target_verify/draft_extend, need all of q_rope, k_rope and cos_sin_cache to be not None." 
- q, k_fp8_nope, k_fp8_rope = self.quantize_and_rope_for_fp8( q, q_rope, From 2a0a3ac585763ce4706b377300198b1fd24c8ca6 Mon Sep 17 00:00:00 2001 From: Yangmin Li Date: Fri, 7 Nov 2025 22:34:53 -0800 Subject: [PATCH 10/10] fix sgl-kernel build issue --- benchmark/kernels/bench_flashmla_fused_kv.py | 27 +------ mla_fusion_standalone/pyproject.toml | 10 --- mla_fusion_standalone/setup.py | 75 ------------------- .../layers/attention/trtllm_mla_backend.py | 9 +-- sgl-kernel/csrc/common_extension.cc | 6 -- sgl-kernel/include/sgl_kernel_ops.h | 8 +- sgl-kernel/python/sgl_kernel/__init__.py | 1 + sgl-kernel/python/sgl_kernel/elementwise.py | 47 ++++++++++++ sgl-kernel/tests/test_mla_rope_fp8_fused.py | 26 +------ 9 files changed, 59 insertions(+), 150 deletions(-) delete mode 100644 mla_fusion_standalone/pyproject.toml delete mode 100644 mla_fusion_standalone/setup.py diff --git a/benchmark/kernels/bench_flashmla_fused_kv.py b/benchmark/kernels/bench_flashmla_fused_kv.py index 7012ed75420..202e896c466 100644 --- a/benchmark/kernels/bench_flashmla_fused_kv.py +++ b/benchmark/kernels/bench_flashmla_fused_kv.py @@ -6,31 +6,10 @@ import time import torch - -_has_sgl_kernel = False -mla_rope_quantize_fp8_fused = None -try: - from mla_fusion_kernel import mla_rope_quantize_fp8_fused - - _has_sgl_kernel = True - print("Using standalone mla_fusion_kernel") -except ImportError: - try: - from sgl_kernel import mla_rope_quantize_fp8_fused - - _has_sgl_kernel = True - print("Using sgl_kernel.mla_rope_quantize_fp8_fused") - except ImportError: - print( - "ERROR: Fusion kernel not available. Please build mla_fusion_standalone first." - ) - _has_sgl_kernel = False +from sgl_kernel import mla_rope_quantize_fp8_fused def run_one(nnz=1024, Dn=512, Dr=64, iters=200, warmup=20, device="cuda"): - if not _has_sgl_kernel: - return 0, 0, 0 - torch.manual_seed(0) q_nope = torch.randn(nnz, Dn, device=device, dtype=torch.float16) @@ -120,10 +99,6 @@ def fused(): if __name__ == "__main__": - if not _has_sgl_kernel: - print("Benchmark skipped: sgl_kernel not available") - exit(1) - print("MLA RoPE + FP8 Quantization + KV Cache Write Fusion Benchmark") print("=" * 70) print("Config: Dn=512, Dr=64, iters=1000, warmup=100") diff --git a/mla_fusion_standalone/pyproject.toml b/mla_fusion_standalone/pyproject.toml deleted file mode 100644 index 1629b0ac33c..00000000000 --- a/mla_fusion_standalone/pyproject.toml +++ /dev/null @@ -1,10 +0,0 @@ -[build-system] -requires = ["setuptools", "wheel", "torch"] -build-backend = "setuptools.build_meta" - -[project] -name = "mla-fusion-kernel" -version = "0.1.0" -description = "Standalone MLA RoPE + FP8 Fusion Kernel" -requires-python = ">=3.8" -dependencies = ["torch"] diff --git a/mla_fusion_standalone/setup.py b/mla_fusion_standalone/setup.py deleted file mode 100644 index cab4aec2592..00000000000 --- a/mla_fusion_standalone/setup.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -Standalone build for MLA RoPE FP8 Fusion kernel -""" - -import os -import sys - -from setuptools import setup - - -# Delay torch import until build time -def get_cuda_arch(): - try: - import torch - - cuda_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) - if cuda_arch_list is None: - # Auto-detect - if torch.cuda.is_available(): - capability = torch.cuda.get_device_capability() - cuda_arch_list = f"{capability[0]}.{capability[1]}" - else: - # Default to common architectures - cuda_arch_list = "8.0;9.0;10.0" - print(f"Building for CUDA architectures: {cuda_arch_list}") - return cuda_arch_list - except Exception as e: 
- print(f"Warning: Could not detect CUDA arch, using defaults: {e}") - return "8.0;9.0;10.0" - - -def get_extensions(): - from torch.utils.cpp_extension import BuildExtension, CUDAExtension - - cuda_arch_list = get_cuda_arch() - - return [ - CUDAExtension( - name="mla_fusion_kernel", - sources=[ - "../sgl-kernel/csrc/elementwise/mla_rope_fp8_kv_fused.cu", - ], - include_dirs=[ - "../sgl-kernel/include", - ], - extra_compile_args={ - "cxx": ["-O3", "-std=c++17"], - "nvcc": [ - "-O3", - "--use_fast_math", - "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__", - "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", - "-U__CUDA_NO_HALF2_OPERATORS__", - "--expt-relaxed-constexpr", - "--expt-extended-lambda", - ] - + [ - f'-gencode=arch=compute_{arch.replace(".", "")},code=sm_{arch.replace(".", "")}' - for arch in cuda_arch_list.split(";") - ], - }, - ) - ] - - -if __name__ == "__main__": - from torch.utils.cpp_extension import BuildExtension - - setup( - name="mla_fusion_kernel", - ext_modules=get_extensions(), - cmdclass={"build_ext": BuildExtension}, - python_requires=">=3.8", - ) diff --git a/python/sglang/srt/layers/attention/trtllm_mla_backend.py b/python/sglang/srt/layers/attention/trtllm_mla_backend.py index e1a77adea6d..657d7cfe7e9 100755 --- a/python/sglang/srt/layers/attention/trtllm_mla_backend.py +++ b/python/sglang/srt/layers/attention/trtllm_mla_backend.py @@ -40,14 +40,9 @@ from sgl_kernel import concat_mla_absorb_q try: - # Try standalone build first - from mla_fusion_kernel import mla_rope_quantize_fp8_fused + from sgl_kernel import mla_rope_quantize_fp8_fused except ImportError: - # Fallback to sgl_kernel - try: - from sgl_kernel import mla_rope_quantize_fp8_fused - except ImportError: - mla_rope_quantize_fp8_fused = None # Will use non-fused path + mla_rope_quantize_fp8_fused = None # Will use non-fused path # Constants DEFAULT_WORKSPACE_SIZE_MB = 128 # Memory workspace size in MB diff --git a/sgl-kernel/csrc/common_extension.cc b/sgl-kernel/csrc/common_extension.cc index b6a39ad38f8..bca10cfe91a 100644 --- a/sgl-kernel/csrc/common_extension.cc +++ b/sgl-kernel/csrc/common_extension.cc @@ -100,12 +100,6 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) { "Tensor!? k_nope_out, Tensor!? k_rope_out, Tensor? kv_buffer, Tensor? kv_cache_loc) -> ()"); m.impl("mla_rope_quantize_fp8_fused", torch::kCUDA, &mla_rope_quantize_fp8_fused); - m.def( - "mla_rope_quantize_fp8_fused(Tensor q_nope, Tensor q_rope, Tensor k_nope, Tensor k_rope, " - "Tensor cos_sin_cache, Tensor pos_ids, bool is_neox, Tensor! q_out, " - "Tensor!? k_nope_out, Tensor!? k_rope_out, Tensor? kv_buffer, Tensor? 
kv_cache_loc) -> ()"); - m.impl("mla_rope_quantize_fp8_fused", torch::kCUDA, &mla_rope_quantize_fp8_fused); - m.def( "downcast_fp8(Tensor k, Tensor v, Tensor k_out, Tensor v_out, Tensor k_scale, Tensor v_scale, Tensor loc, " "int mult, int offset) -> ()"); diff --git a/sgl-kernel/include/sgl_kernel_ops.h b/sgl-kernel/include/sgl_kernel_ops.h index 12f20bf8b34..e3edfe2374b 100644 --- a/sgl-kernel/include/sgl_kernel_ops.h +++ b/sgl-kernel/include/sgl_kernel_ops.h @@ -166,10 +166,10 @@ void mla_rope_quantize_fp8_fused( at::Tensor pos_ids, bool is_neox, at::Tensor q_out, - const std::optional& k_nope_out, - const std::optional& k_rope_out, - const std::optional& kv_buffer, - const std::optional& kv_cache_loc); + c10::optional k_nope_out, + c10::optional k_rope_out, + c10::optional kv_buffer, + c10::optional kv_cache_loc); void downcast_fp8( at::Tensor& k, diff --git a/sgl-kernel/python/sgl_kernel/__init__.py b/sgl-kernel/python/sgl_kernel/__init__.py index 1c36681e189..19add301dea 100644 --- a/sgl-kernel/python/sgl_kernel/__init__.py +++ b/sgl-kernel/python/sgl_kernel/__init__.py @@ -30,6 +30,7 @@ gelu_tanh_and_mul, gemma_fused_add_rmsnorm, gemma_rmsnorm, + mla_rope_quantize_fp8_fused, rmsnorm, silu_and_mul, ) diff --git a/sgl-kernel/python/sgl_kernel/elementwise.py b/sgl-kernel/python/sgl_kernel/elementwise.py index 5684800bee4..bb645426a75 100644 --- a/sgl-kernel/python/sgl_kernel/elementwise.py +++ b/sgl-kernel/python/sgl_kernel/elementwise.py @@ -391,3 +391,50 @@ def concat_mla_absorb_q( ) torch.ops.sgl_kernel.concat_mla_absorb_q(a, b, out) return out + + +def mla_rope_quantize_fp8_fused( + q_nope: torch.Tensor, + q_rope: torch.Tensor, + k_nope: torch.Tensor, + k_rope: torch.Tensor, + cos_sin_cache: torch.Tensor, + pos_ids: torch.Tensor, + is_neox: bool, + q_out: torch.Tensor, + k_nope_out: Optional[torch.Tensor], + k_rope_out: Optional[torch.Tensor], + kv_buffer: Optional[torch.Tensor], + kv_cache_loc: Optional[torch.Tensor], +): + """ + Fused MLA RoPE + FP8 quantization + KV cache write kernel. 
+ + Args: + q_nope: Query nope part + q_rope: Query rope part + k_nope: Key nope part + k_rope: Key rope part + cos_sin_cache: Precomputed cos/sin cache + pos_ids: Position IDs + is_neox: Whether to use NeoX-style RoPE + q_out: Output buffer for quantized Q + k_nope_out: Optional output buffer for quantized K nope + k_rope_out: Optional output buffer for quantized K rope + kv_buffer: Optional KV cache buffer for direct write + kv_cache_loc: Optional cache locations for KV buffer write + """ + torch.ops.sgl_kernel.mla_rope_quantize_fp8_fused( + q_nope, + q_rope, + k_nope, + k_rope, + cos_sin_cache, + pos_ids, + is_neox, + q_out, + k_nope_out, + k_rope_out, + kv_buffer, + kv_cache_loc, + ) diff --git a/sgl-kernel/tests/test_mla_rope_fp8_fused.py b/sgl-kernel/tests/test_mla_rope_fp8_fused.py index d6689dff186..a54f3920eb1 100644 --- a/sgl-kernel/tests/test_mla_rope_fp8_fused.py +++ b/sgl-kernel/tests/test_mla_rope_fp8_fused.py @@ -11,28 +11,10 @@ import pytest import torch +from sgl_kernel import mla_rope_quantize_fp8_fused -try: - # Try standalone build first - from mla_fusion_kernel import mla_rope_quantize_fp8_fused - _has_sgl_kernel = True -except ImportError: - # Fallback to sgl_kernel - try: - from sgl_kernel import mla_rope_quantize_fp8_fused - - _has_sgl_kernel = True - except ImportError: - mla_rope_quantize_fp8_fused = None # Will use non-fused path - _has_sgl_kernel = False - -requires_ext = pytest.mark.skipif( - not _has_sgl_kernel, reason="sgl_kernel extension not available" -) - - -@requires_ext +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @pytest.mark.parametrize( "nnz,num_heads,Dn,Dr,dtype", list( @@ -157,7 +139,7 @@ def test_fused_matches_baseline(nnz, num_heads, Dn, Dr, dtype): ), f"Used KV slots must match exactly for {dtype=}, {num_heads=}" -@requires_ext +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @pytest.mark.parametrize("nnz,Dn,Dr", [(128, 512, 64), (1024, 512, 64)]) def test_baseline_only_path(nnz, Dn, Dr): """Test that baseline path (without KV buffer) works correctly.""" @@ -216,7 +198,7 @@ def test_baseline_only_path(nnz, Dn, Dr): assert k_rope_out.abs().sum() > 0, "k_rope_out should not be all zeros" -@requires_ext +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_fused_only_path(): """Test that fused path (only KV buffer, no separate K outputs) works.""" device = "cuda"
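For orientation, here is a minimal usage sketch of the wrapper added to sgl_kernel/elementwise.py above, mirroring the shapes used in the updated tests (Dn=512, Dr=64, uint8 buffers holding FP8 E4M3 bytes, 2D kv_buffer as in test_fused_only_path). It is illustrative only and not part of the patch; tensor contents are random.

import torch
from sgl_kernel import mla_rope_quantize_fp8_fused

nnz, Dn, Dr, max_seq = 128, 512, 64, 2048
dev = "cuda"

q_nope = torch.randn(nnz, Dn, device=dev, dtype=torch.float16)
q_rope = torch.randn(nnz, Dr, device=dev, dtype=torch.float16)
k_nope = torch.randn(nnz, Dn, device=dev, dtype=torch.float16)
k_rope = torch.randn(nnz, Dr, device=dev, dtype=torch.float16)

# cos_sin_cache layout follows the tests: [max_seq, 2*Dr] = cat([cos, sin], dim=1).
t = torch.linspace(0, 1, steps=max_seq, device=dev)[:, None]
freqs = 0.1 * (torch.arange(Dr, device=dev, dtype=torch.float32)[None, :] + 1.0)
cos_sin = torch.cat([torch.cos(t * freqs), torch.sin(t * freqs)], dim=1)
pos_ids = torch.randint(0, max_seq, (nnz,), device=dev, dtype=torch.long)

q_out = torch.empty(nnz, Dn + Dr, device=dev, dtype=torch.uint8)

# Baseline path: quantized K halves are returned in separate buffers.
k_nope_out = torch.empty(nnz, Dn, device=dev, dtype=torch.uint8)
k_rope_out = torch.empty(nnz, Dr, device=dev, dtype=torch.uint8)
mla_rope_quantize_fp8_fused(
    q_nope, q_rope, k_nope, k_rope, cos_sin, pos_ids, False,
    q_out, k_nope_out, k_rope_out, None, None,
)

# Fused path: K is written straight into the FP8 KV cache at kv_cache_loc,
# so no separate K output buffers are allocated.
kv_buffer = torch.zeros(nnz + 16, Dn + Dr, device=dev, dtype=torch.uint8)
kv_cache_loc = torch.arange(nnz, device=dev, dtype=torch.long)
mla_rope_quantize_fp8_fused(
    q_nope, q_rope, k_nope, k_rope, cos_sin, pos_ids, False,
    q_out, None, None, kv_buffer, kv_cache_loc,
)

After the fused call, kv_buffer[kv_cache_loc] holds the same quantized [k_nope | rotated k_rope] rows that the baseline call produces in k_nope_out and k_rope_out, which is what test_fused_matches_baseline verifies against the used KV slots.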