Update sparse MLA examples to support SKV adjustment and correctness checks

LeiWang1999 · LeiWang1999 · commit f1aa27eafe45 · 2025-10-07T16:53:06.000+08:00
- Changed SKV parameter from 32768 to 8192 in sparse MLA backward and forward tests.
- Added check_correctness parameter to test functions for validation of outputs.
- Updated test cases to reflect new SKV values and correctness checks.
diff --git a/examples/deepseek_v32/sparse_mla_bwd.py b/examples/deepseek_v32/sparse_mla_bwd.py
@@ -333,13 +333,14 @@ def ref_sparse_mla_bwd_interface(q, kv, o, do, indices, lse, sm_scale=None, is_c
 
 def test_sparse_mla_bwd(B=1,
                         S=4096,
-                        SKV=32768,
+                        SKV=8192,
                         H=64,
                         HKV=1,
                         DQKV=576,
                         DV=512,
                         topk=2048,
-                        dtype=torch.bfloat16):
+                        dtype=torch.bfloat16,
+                        check_correctness=True):
     # Prepare data
     q = torch.randn((B, S, H, DQKV), dtype=dtype, device='cuda').requires_grad_(True)
     kv = torch.randn((B, SKV, HKV, DQKV), dtype=dtype, device='cuda').requires_grad_(True)
@@ -359,7 +360,7 @@ def test_sparse_mla_bwd(B=1,
     tl_dq, tl_dkv = sparse_mla_bwd(q, kv, tl_out, do, indices, tl_lse)
     ref_dq, ref_dkv = ref_sparse_mla_bwd_interface(q, kv, None, do, indices, None)
 
-    if SKV <= 4096:
+    if check_correctness:
         assert_tensors_similar(tl_dq, ref_dq, eps=1e-4, name="dq")
         assert_tensors_similar(tl_dkv, ref_dkv, eps=1e-4, name="dkv")
         print("assert_tensors_similar passed")
@@ -385,4 +386,13 @@ def fn():
 
 if __name__ == "__main__":
     test_sparse_mla_bwd(
-        B=1, S=4096, SKV=4096, H=64, HKV=1, DQKV=576, DV=512, topk=2048, dtype=torch.bfloat16)
+        B=1,
+        S=4096,
+        SKV=8192,
+        H=64,
+        HKV=1,
+        DQKV=576,
+        DV=512,
+        topk=2048,
+        dtype=torch.bfloat16,
+        check_correctness=True)
diff --git a/examples/deepseek_v32/sparse_mla_fwd.py b/examples/deepseek_v32/sparse_mla_fwd.py
@@ -234,13 +234,14 @@ def ref_sparse_mla_fwd_interface(q, kv, indices, sm_scale=None, is_casual=True):
 
 def test_sparse_mla_fwd(B=1,
                         S=4096,
-                        SKV=4096,
+                        SKV=8192,
                         H=128,
                         HKV=1,
                         DQK=576,
                         DV=512,
                         topk=2048,
-                        dtype=torch.bfloat16):
+                        dtype=torch.bfloat16,
+                        check_correctness=True):
     torch.random.manual_seed(0)
     q = torch.randn((B, S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True)
     kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True)
@@ -254,7 +255,7 @@ def test_sparse_mla_fwd(B=1,
 
     tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices)
 
-    if SKV <= 4096:
+    if check_correctness:
         # otherwise may cause out of memory
         ref_out = ref_sparse_mla_fwd_interface(q, kv, indices)
         assert_tensors_similar(tl_out, ref_out, eps=1e-2, name="out")
@@ -277,4 +278,13 @@ def fn():
 
 if __name__ == "__main__":
     test_sparse_mla_fwd(
-        B=1, S=4096, SKV=4096, H=128, HKV=1, DQK=576, DV=512, topk=2048, dtype=torch.bfloat16)
+        B=1,
+        S=4096,
+        SKV=4096,
+        H=128,
+        HKV=1,
+        DQK=576,
+        DV=512,
+        topk=2048,
+        dtype=torch.bfloat16,
+        check_correctness=True)
diff --git a/examples/deepseek_v32/sparse_mla_fwd_pipelined.py b/examples/deepseek_v32/sparse_mla_fwd_pipelined.py
@@ -399,14 +399,15 @@ def ref_sparse_mla_fwd_interface(q,
 
 def test_sparse_mla_fwd_pipelined(B=1,
                                   S=4096,
-                                  SKV=4096,
+                                  SKV=8192,
                                   H=128,
                                   HKV=1,
                                   DQK=576,
                                   DV=512,
                                   topk=2048,
                                   dtype=torch.bfloat16,
-                                  q_start_s_index=1024):
+                                  q_start_s_index=1024,
+                                  check_correctness=True):
     KV_stride = 1
 
     torch.random.manual_seed(0)
@@ -456,8 +457,8 @@ def fn():
     parser.add_argument("--test_correctness", action="store_true")
     args = parser.parse_args()
     if args.test_correctness:
-        B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 1024, 2048, 128, 1, 576, 512, 2048, torch.bfloat16
+        B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 1024, 8192, 128, 1, 576, 512, 2048, torch.bfloat16
     else:
         B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 4096, 8192, 128, 1, 576, 512, 2048, torch.bfloat16
-    test_sparse_mla_fwd(B, S, SKV, H, HKV, DQK, DV, topk, dtype)
-    test_sparse_mla_fwd(B, S, SKV, H, HKV, DQK, DV, topk, dtype)
+    test_sparse_mla_fwd_pipelined(
+        B, S, SKV, H, HKV, DQK, DV, topk, dtype, check_correctness=args.test_correctness)
diff --git a/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py b/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py
@@ -20,20 +20,23 @@ def test_example_fp8_lighting_indexer():
 @tilelang.testing.requires_cuda_compute_version_ge(9, 0)
 def test_example_sparse_mla_fwd():
     # small shapes for testing
-    test_sparse_mla_fwd(S=1024, SKV=2048, H=128, HKV=1, DQK=576, DV=512, topk=256)
+    test_sparse_mla_fwd(
+        S=1024, SKV=2048, H=128, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False)
 
 
 @tilelang.testing.requires_cuda
 @tilelang.testing.requires_cuda_compute_version_ge(9, 0)
 def test_example_sparse_mla_fwd_pipelined():
     # small shapes for testing
-    test_sparse_mla_fwd_pipelined(S=1024, SKV=2048, H=128, HKV=1, DQK=576, DV=512, topk=256)
+    test_sparse_mla_fwd_pipelined(
+        S=1024, SKV=2048, H=128, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False)
 
 
 @tilelang.testing.requires_cuda
 @tilelang.testing.requires_cuda_compute_version_ge(9, 0)
 def test_example_sparse_mla_bwd():
-    test_sparse_mla_bwd()
+    test_sparse_mla_bwd(
+        S=1024, SKV=2048, H=128, HKV=1, DQKV=576, DV=512, topk=256, check_correctness=False)
 
 
 if __name__ == "__main__":