Merged — python/sglang/srt/mem_cache/memory_pool.py: 58 changes (27 additions & 31 deletions)
@@ -45,7 +45,7 @@
     quantize_k_cache,
     quantize_k_cache_separate,
 )
-from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.quantization.fp8_kernel import fp8_dtype, is_fp8_fnuz
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.mem_cache.utils import (
     get_mla_kv_buffer_triton,
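
The dtype selection that the old code computed inside set_mla_kv_buffer (see the deleted lines in the hunk below) is now a shared constant imported from fp8_kernel. A minimal sketch of what that constant presumably looks like, inferred from the deleted per-call expression; the is_fp8_fnuz stub here is an assumption for illustration, not the real helper:

import torch

def is_fp8_fnuz() -> bool:
    # Stub for illustration: the real helper in fp8_kernel detects ROCm
    # hardware that uses the fnuz FP8 variant.
    return torch.version.hip is not None

# Assumed definition of the imported constant, mirroring the deleted
# per-call expression in set_mla_kv_buffer:
fp8_dtype = torch.float8_e4m3fnuz if is_fp8_fnuz() else torch.float8_e4m3fn
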
@@ -1575,37 +1575,33 @@ def set_mla_kv_buffer(
     ):
         layer_id = layer.layer_id

-        if self.nsa_kv_cache_store_fp8:
-            if _is_hip:
-                # HIP FP8 path uses raw MLA KV layout (nope + rope) without per-block scales.
-                # Fuse BF16/FP16 -> FP8 cast with paged KV write.
-                fp8_dtype = (
-                    torch.float8_e4m3fnuz if _is_fp8_fnuz else torch.float8_e4m3fn
-                )
-                set_mla_kv_buffer_triton_fp8_quant(
-                    self.kv_buffer[layer_id - self.start_layer],
-                    loc,
-                    cache_k_nope,
-                    cache_k_rope,
-                    fp8_dtype,
-                )
-            else:
-                # OPTIMIZATION: Quantize k_nope and k_rope separately to avoid concat overhead
-                # This also enables reuse of set_mla_kv_buffer_triton two-tensor write path
-                # quantize_k_cache_separate returns (nope_part, rope_part) as uint8 bytes
-                cache_k_nope_fp8, cache_k_rope_fp8 = quantize_k_cache_separate(
-                    cache_k_nope, cache_k_rope
-                )
+        if _is_hip and self.use_nsa and self.dtype == fp8_dtype:
+            # HIP FP8 path uses raw MLA KV layout (nope + rope) without per-block scales.
+            # Fuse BF16/FP16 -> FP8 cast with paged KV write.
+            set_mla_kv_buffer_triton_fp8_quant(
+                self.kv_buffer[layer_id - self.start_layer],
+                loc,
+                cache_k_nope,
+                cache_k_rope,
+                fp8_dtype,
+            )
+        elif self.nsa_kv_cache_store_fp8:
+            # OPTIMIZATION: Quantize k_nope and k_rope separately to avoid concat overhead
+            # This also enables reuse of set_mla_kv_buffer_triton two-tensor write path
+            # quantize_k_cache_separate returns (nope_part, rope_part) as uint8 bytes
+            cache_k_nope_fp8, cache_k_rope_fp8 = quantize_k_cache_separate(
+                cache_k_nope, cache_k_rope
+            )

-                # Reuse existing two-tensor write kernel (works with FP8 byte layout)
-                # cache_k_nope_fp8: (num_tokens, 1, 528) uint8 [nope_fp8(512) | scales(16)]
-                # cache_k_rope_fp8: (num_tokens, 1, 128) uint8 [rope_bf16_bytes(128)]
-                set_mla_kv_buffer_triton(
-                    self.kv_buffer[layer_id - self.start_layer],
-                    loc,
-                    cache_k_nope_fp8,
-                    cache_k_rope_fp8,
-                )
+            # Reuse existing two-tensor write kernel (works with FP8 byte layout)
+            # cache_k_nope_fp8: (num_tokens, 1, 528) uint8 [nope_fp8(512) | scales(16)]
+            # cache_k_rope_fp8: (num_tokens, 1, 128) uint8 [rope_bf16_bytes(128)]
+            set_mla_kv_buffer_triton(
+                self.kv_buffer[layer_id - self.start_layer],
+                loc,
+                cache_k_nope_fp8,
+                cache_k_rope_fp8,
+            )
         else:
             if cache_k_nope.dtype != self.dtype:
                 cache_k_nope = cache_k_nope.to(self.dtype)
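
To make the fused HIP path concrete, here is a minimal eager-mode sketch of what set_mla_kv_buffer_triton_fp8_quant effectively computes; the Triton kernel fuses the cast with the paged scatter, so the intermediate concatenated FP8 tensor below is never materialized. The function name and the 512/64 nope/rope split are illustrative assumptions (the 64 rope values are consistent with the 128 bf16 bytes noted in the comments above), not code from this PR:

import torch

def set_mla_kv_buffer_fp8_reference(
    kv_buffer: torch.Tensor,     # (num_slots, 1, 576), FP8 storage
    loc: torch.Tensor,           # (num_tokens,), int64 slot indices
    cache_k_nope: torch.Tensor,  # (num_tokens, 1, 512), bf16/fp16
    cache_k_rope: torch.Tensor,  # (num_tokens, 1, 64), bf16/fp16
    fp8_dtype: torch.dtype,      # e4m3fnuz on fnuz hardware, else e4m3fn
) -> None:
    # Raw MLA layout: nope and rope concatenated per token, no per-block scales.
    kv = torch.cat([cache_k_nope, cache_k_rope], dim=-1).to(fp8_dtype)
    # Paged write: scatter each token's row into its KV-cache slot.
    kv_buffer[loc] = kv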
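
The byte layouts noted in the comments (528 = 512 FP8 bytes + 16 scale bytes for nope; 128 raw bf16 bytes for rope) can be reproduced with a sketch like the one below. The per-128-element tiling with one fp32 scale per tile is inferred from the 16 scale bytes and is an assumption about what quantize_k_cache_separate actually does:

import torch

def quantize_k_cache_separate_sketch(
    cache_k_nope: torch.Tensor,  # (num_tokens, 1, 512), bf16/fp16
    cache_k_rope: torch.Tensor,  # (num_tokens, 1, 64), bf16/fp16
    tile: int = 128,             # assumed quantization tile size
):
    n, h, d = cache_k_nope.shape
    # Per-tile symmetric quantization: one fp32 scale per 128 elements,
    # so 512 elements -> 4 scales -> 16 scale bytes.
    tiles = cache_k_nope.float().view(n, h, d // tile, tile)
    scales = tiles.abs().amax(dim=-1, keepdim=True) / torch.finfo(
        torch.float8_e4m3fn
    ).max
    scales = scales.clamp(min=1e-12)
    q = (tiles / scales).to(torch.float8_e4m3fn)
    nope_bytes = q.view(n, h, d).view(torch.uint8)             # (n, 1, 512)
    scale_bytes = scales.view(n, h, -1).view(torch.uint8)      # (n, 1, 16)
    k_nope_fp8 = torch.cat([nope_bytes, scale_bytes], dim=-1)  # (n, 1, 528)
    # Rope part stays in bf16 and is only reinterpreted as raw bytes.
    k_rope_fp8 = cache_k_rope.to(torch.bfloat16).contiguous().view(torch.uint8)
    return k_nope_fp8, k_rope_fp8  # both uint8, ready for the two-tensor write

A design note on the layout: keeping the scales inline after the FP8 payload lets the unmodified set_mla_kv_buffer_triton kernel treat both tensors as opaque byte rows, which is exactly why the existing two-tensor write path can be reused.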