RubiaCx
diff --git a/‎.clang-tidy‎
Lines changed: 1 addition & 0 deletions b/‎.clang-tidy‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/amd_ci.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/amd_ci.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/metal_ci.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/metal_ci.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/deepseek_v32/sparse_mla_bwd.py‎
Lines changed: 14 additions & 4 deletions b/‎examples/deepseek_v32/sparse_mla_bwd.py‎
Lines changed: 14 additions & 4 deletions
diff --git a/‎examples/deepseek_v32/sparse_mla_fwd.py‎
Lines changed: 14 additions & 4 deletions b/‎examples/deepseek_v32/sparse_mla_fwd.py‎
Lines changed: 14 additions & 4 deletions
diff --git a/‎examples/deepseek_v32/sparse_mla_fwd_pipelined.py‎
Lines changed: 6 additions & 5 deletions b/‎examples/deepseek_v32/sparse_mla_fwd_pipelined.py‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎examples/deepseek_v32/test_tilelang_example_deepseek_v32.py‎
Lines changed: 6 additions & 3 deletions b/‎examples/deepseek_v32/test_tilelang_example_deepseek_v32.py‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎examples/flash_attention/test_example_flash_attention.py‎
Lines changed: 5 additions & 5 deletions b/‎examples/flash_attention/test_example_flash_attention.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎examples/norm/test_rms_norm.py‎
Lines changed: 2 additions & 8 deletions b/‎examples/norm/test_rms_norm.py‎
Lines changed: 2 additions & 8 deletions
@@ -46,6 +46,7 @@ Checks: >
   -cppcoreguidelines-pro-bounds-array-to-pointer-decay,
   -clang-analyzer-deadcode.DeadStores,
   -clang-analyzer-optin.cplusplus.VirtualCall,
+  -clang-diagnostic-tautological-constant-compare,
 
 WarningsAsErrors: '*'
 
 
@@ -119,4 +119,4 @@ jobs:
         source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate"
         cd testing/python/amd
         unset PYTHONPATH
-        python -m pytest -v test_tilelang_test_amd.py
+        python -m pytest -v --cache-clear test_tilelang_test_amd.py
@@ -115,11 +115,11 @@ jobs:
         source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate"
         cd examples
         unset PYTHONPATH
-        python -m pytest -n 4 **/test*.py -v -r fE --durations=0
+        python -m pytest -n 4 **/test*.py -v -r fE --durations=0 --cache-clear
 
     - name: Run tests
       run: |
         source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate"
         cd testing/python
         unset PYTHONPATH
-        python -m pytest -n 4 -v -r fE --durations=0 --timeout=3600
+        python -m pytest -n 4 -v -r fE --durations=0 --cache-clear --timeout=3600
@@ -92,4 +92,4 @@ jobs:
       run: |
         cd testing/python
         unset PYTHONPATH
-        python -m pytest -k metal -v -r fE --durations=0 --timeout=3600
+        python -m pytest -k metal -v -r fE --durations=0 --cache-clear --timeout=3600
@@ -333,13 +333,14 @@ def ref_sparse_mla_bwd_interface(q, kv, o, do, indices, lse, sm_scale=None, is_c
 
 def test_sparse_mla_bwd(B=1,
                         S=4096,
-                        SKV=32768,
+                        SKV=8192,
                         H=64,
                         HKV=1,
                         DQKV=576,
                         DV=512,
                         topk=2048,
-                        dtype=torch.bfloat16):
+                        dtype=torch.bfloat16,
+                        check_correctness=True):
     # Prepare data
     q = torch.randn((B, S, H, DQKV), dtype=dtype, device='cuda').requires_grad_(True)
     kv = torch.randn((B, SKV, HKV, DQKV), dtype=dtype, device='cuda').requires_grad_(True)
@@ -359,7 +360,7 @@ def test_sparse_mla_bwd(B=1,
     tl_dq, tl_dkv = sparse_mla_bwd(q, kv, tl_out, do, indices, tl_lse)
     ref_dq, ref_dkv = ref_sparse_mla_bwd_interface(q, kv, None, do, indices, None)
 
-    if SKV <= 4096:
+    if check_correctness:
         assert_tensors_similar(tl_dq, ref_dq, eps=1e-4, name="dq")
         assert_tensors_similar(tl_dkv, ref_dkv, eps=1e-4, name="dkv")
         print("assert_tensors_similar passed")
@@ -385,4 +386,13 @@ def fn():
 
 if __name__ == "__main__":
     test_sparse_mla_bwd(
-        B=1, S=4096, SKV=4096, H=64, HKV=1, DQKV=576, DV=512, topk=2048, dtype=torch.bfloat16)
+        B=1,
+        S=4096,
+        SKV=8192,
+        H=64,
+        HKV=1,
+        DQKV=576,
+        DV=512,
+        topk=2048,
+        dtype=torch.bfloat16,
+        check_correctness=True)
@@ -234,13 +234,14 @@ def ref_sparse_mla_fwd_interface(q, kv, indices, sm_scale=None, is_casual=True):
 
 def test_sparse_mla_fwd(B=1,
                         S=4096,
-                        SKV=4096,
+                        SKV=8192,
                         H=128,
                         HKV=1,
                         DQK=576,
                         DV=512,
                         topk=2048,
-                        dtype=torch.bfloat16):
+                        dtype=torch.bfloat16,
+                        check_correctness=True):
     torch.random.manual_seed(0)
     q = torch.randn((B, S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True)
     kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True)
@@ -254,7 +255,7 @@ def test_sparse_mla_fwd(B=1,
 
     tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices)
 
-    if SKV <= 4096:
+    if check_correctness:
         # otherwise may cause out of memory
         ref_out = ref_sparse_mla_fwd_interface(q, kv, indices)
         assert_tensors_similar(tl_out, ref_out, eps=1e-2, name="out")
@@ -277,4 +278,13 @@ def fn():
 
 if __name__ == "__main__":
     test_sparse_mla_fwd(
-        B=1, S=4096, SKV=4096, H=128, HKV=1, DQK=576, DV=512, topk=2048, dtype=torch.bfloat16)
+        B=1,
+        S=4096,
+        SKV=4096,
+        H=128,
+        HKV=1,
+        DQK=576,
+        DV=512,
+        topk=2048,
+        dtype=torch.bfloat16,
+        check_correctness=True)
@@ -399,14 +399,15 @@ def ref_sparse_mla_fwd_interface(q,
 
 def test_sparse_mla_fwd_pipelined(B=1,
                                   S=4096,
-                                  SKV=4096,
+                                  SKV=8192,
                                   H=128,
                                   HKV=1,
                                   DQK=576,
                                   DV=512,
                                   topk=2048,
                                   dtype=torch.bfloat16,
-                                  q_start_s_index=1024):
+                                  q_start_s_index=1024,
+                                  check_correctness=True):
     KV_stride = 1
 
     torch.random.manual_seed(0)
@@ -456,8 +457,8 @@ def fn():
     parser.add_argument("--test_correctness", action="store_true")
     args = parser.parse_args()
     if args.test_correctness:
-        B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 1024, 2048, 128, 1, 576, 512, 2048, torch.bfloat16
+        B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 1024, 8192, 128, 1, 576, 512, 2048, torch.bfloat16
     else:
         B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 4096, 8192, 128, 1, 576, 512, 2048, torch.bfloat16
-    test_sparse_mla_fwd(B, S, SKV, H, HKV, DQK, DV, topk, dtype)
-    test_sparse_mla_fwd(B, S, SKV, H, HKV, DQK, DV, topk, dtype)
+    test_sparse_mla_fwd_pipelined(
+        B, S, SKV, H, HKV, DQK, DV, topk, dtype, check_correctness=args.test_correctness)
@@ -20,20 +20,23 @@ def test_example_fp8_lighting_indexer():
 @tilelang.testing.requires_cuda_compute_version_ge(9, 0)
 def test_example_sparse_mla_fwd():
     # small shapes for testing
-    test_sparse_mla_fwd(S=1024, SKV=2048, H=128, HKV=1, DQK=576, DV=512, topk=256)
+    test_sparse_mla_fwd(
+        S=1024, SKV=2048, H=128, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False)
 
 
 @tilelang.testing.requires_cuda
 @tilelang.testing.requires_cuda_compute_version_ge(9, 0)
 def test_example_sparse_mla_fwd_pipelined():
     # small shapes for testing
-    test_sparse_mla_fwd_pipelined(S=1024, SKV=2048, H=128, HKV=1, DQK=576, DV=512, topk=256)
+    test_sparse_mla_fwd_pipelined(
+        S=1024, SKV=2048, H=128, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False)
 
 
 @tilelang.testing.requires_cuda
 @tilelang.testing.requires_cuda_compute_version_ge(9, 0)
 def test_example_sparse_mla_bwd():
-    test_sparse_mla_bwd()
+    test_sparse_mla_bwd(
+        S=1024, SKV=2048, H=64, HKV=1, DQKV=576, DV=512, topk=256, check_correctness=False)
 
 
 if __name__ == "__main__":
 
@@ -27,18 +27,18 @@ def test_example_gqa_bwd_wgmma_pipelined():
 
 @tilelang.testing.requires_cuda
 def test_example_mha_bwd():
-    example_mha_bwd.main()
+    example_mha_bwd.main(BATCH=1)
 
 
 @tilelang.testing.requires_cuda
 def test_example_mha_bwd_bhsd():
-    example_mha_bwd_bhsd.main()
+    example_mha_bwd_bhsd.main(BATCH=1)
 
 
 @tilelang.testing.requires_cuda
 @tilelang.testing.requires_cuda_compute_version_ge(9, 0)
 def test_example_mha_bwd_wgmma_pipelined():
-    example_mha_bwd_wgmma_pipelined.main()
+    example_mha_bwd_wgmma_pipelined.main(BATCH=1)
 
 
 @tilelang.testing.requires_cuda
@@ -66,12 +66,12 @@ def test_example_mha_fwd_bhsd():
 @tilelang.testing.requires_cuda
 @tilelang.testing.requires_cuda_compute_version_ge(9, 0)
 def test_example_mha_fwd_bshd_wgmma_pipelined():
-    example_mha_fwd_bshd_wgmma_pipelined.main()
+    example_mha_fwd_bshd_wgmma_pipelined.main(batch=1, heads=32, seq_len=256)
 
 
 @tilelang.testing.requires_cuda
 def test_example_mha_fwd_bshd():
-    example_mha_fwd_bshd.main()
+    example_mha_fwd_bshd.main(batch=1, seq_len=256)
 
 
 @tilelang.testing.requires_cuda
 
@@ -63,15 +63,9 @@ def ref_program(x):
     return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-12)
 
 
-def test_rms_norm():
-    M, N, blk_m = 8192, 8192, 1
+def test_rms_norm(M=1024, N=1024, blk_m=1):
     program = rms_norm(M, N, blk_m)
-    kernel = tilelang.compile(
-        program,
-        out_idx=-1,
-        target="cuda",
-        execution_backend="cython",
-        pass_configs={"tl.disable_tma_lower": True})
+    kernel = tilelang.compile(program, out_idx=-1, pass_configs={"tl.disable_tma_lower": True})
     profiler = kernel.get_profiler()
     profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01)