Merged

25 commits
ec26c23  [Feature] Introduce WGMMA support and enhance GEMM layout handling (LeiWang1999, Sep 14, 2025)
2ff5cbf  [Refactor] Clean up code formatting and enhance layout function reada… (LeiWang1999, Sep 14, 2025)
0166a90  [Feature] Add descriptor initialization and offset manipulation for W… (LeiWang1999, Sep 15, 2025)
ce83ace  [Refactor] Improve code formatting and readability in various files (LeiWang1999, Sep 15, 2025)
72e900d  [Update] Update subproject commit and refactor layout function call (LeiWang1999, Sep 15, 2025)
22131e7  support more data types (LeiWang1999, Sep 16, 2025)
6632a70  gemm_rs support (LeiWang1999, Sep 16, 2025)
eac5433  lint fix (LeiWang1999, Sep 16, 2025)
51fcf15  wgmma wrapper (LeiWang1999, Sep 17, 2025)
ce9f545  Remove debug logging for wgmma assembly code and refactor swizzle byt… (LeiWang1999, Sep 18, 2025)
70699a9  Merge branch 'main' of https://github.com/tile-ai/tilelang into v2_wg… (LeiWang1999, Oct 7, 2025)
2dbaccc  Refactor GEMM layout functions to replace 'kfactor' with 'k_inner' fo… (LeiWang1999, Oct 7, 2025)
d2db013  Comprehensively support WGMMA GEMM SS (LeiWang1999, Oct 8, 2025)
ce9e2b6  remove debug print (LeiWang1999, Oct 8, 2025)
fef8d2a  lint fix (LeiWang1999, Oct 8, 2025)
bd9bd37  remove debug print (LeiWang1999, Oct 9, 2025)
ff3e04d  reduce bwd test shape (LeiWang1999, Oct 9, 2025)
c6ab014  lint fix (LeiWang1999, Oct 9, 2025)
cc9e32f  clear cache for pytest (LeiWang1999, Oct 9, 2025)
5244c19  lint fix (LeiWang1999, Oct 9, 2025)
3b5c075  Update sparse MLA examples to support SKV adjustment and correctness … (LeiWang1999, Oct 7, 2025)
caa2e51  test fix (LeiWang1999, Oct 9, 2025)
3858d81  adjust test case (LeiWang1999, Oct 9, 2025)
4cdd131  test fix (LeiWang1999, Oct 9, 2025)
8783aad  skip some test currently (LeiWang1999, Oct 9, 2025)
1 change: 1 addition & 0 deletions .clang-tidy
@@ -46,6 +46,7 @@ Checks: >
-cppcoreguidelines-pro-bounds-array-to-pointer-decay,
-clang-analyzer-deadcode.DeadStores,
-clang-analyzer-optin.cplusplus.VirtualCall,
-clang-diagnostic-tautological-constant-compare,

WarningsAsErrors: '*'

2 changes: 1 addition & 1 deletion .github/workflows/amd_ci.yml
@@ -115,4 +115,4 @@ jobs:
source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate"
cd testing/python/amd
unset PYTHONPATH
python -m pytest -v test_tilelang_test_amd.py
python -m pytest -v --cache-clear test_tilelang_test_amd.py
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -111,11 +111,11 @@ jobs:
source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate"
cd examples
unset PYTHONPATH
python -m pytest -n 4 **/test*.py -v -r fE --durations=0
python -m pytest -n 4 **/test*.py -v -r fE --durations=0 --cache-clear

- name: Run tests
run: |
source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate"
cd testing/python
unset PYTHONPATH
python -m pytest -n 4 -v -r fE --durations=0 --timeout=3600
python -m pytest -n 4 -v -r fE --durations=0 --cache-clear --timeout=3600
2 changes: 1 addition & 1 deletion .github/workflows/metal_ci.yml
@@ -88,4 +88,4 @@
run: |
cd testing/python
unset PYTHONPATH
python -m pytest -k metal -v -r fE --durations=0 --timeout=3600
python -m pytest -k metal -v -r fE --durations=0 --cache-clear --timeout=3600
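All three workflows above now pass --cache-clear, so pytest discards any stale .pytest_cache contents at the start of the run. A minimal local equivalent of the same invocation (a sketch, not part of this PR; the test path is the one used in the AMD job):

import pytest

# --cache-clear empties .pytest_cache before collection starts, mirroring what
# the CI jobs above now do on every run.
raise SystemExit(pytest.main(["-v", "--cache-clear", "testing/python/amd/test_tilelang_test_amd.py"]))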
18 changes: 14 additions & 4 deletions examples/deepseek_v32/sparse_mla_bwd.py
@@ -333,13 +333,14 @@ def ref_sparse_mla_bwd_interface(q, kv, o, do, indices, lse, sm_scale=None, is_c

def test_sparse_mla_bwd(B=1,
S=4096,
SKV=32768,
SKV=8192,
H=64,
HKV=1,
DQKV=576,
DV=512,
topk=2048,
dtype=torch.bfloat16):
dtype=torch.bfloat16,
check_correctness=True):
# Prepare data
q = torch.randn((B, S, H, DQKV), dtype=dtype, device='cuda').requires_grad_(True)
kv = torch.randn((B, SKV, HKV, DQKV), dtype=dtype, device='cuda').requires_grad_(True)
@@ -359,7 +360,7 @@ def test_sparse_mla_bwd(B=1,
tl_dq, tl_dkv = sparse_mla_bwd(q, kv, tl_out, do, indices, tl_lse)
ref_dq, ref_dkv = ref_sparse_mla_bwd_interface(q, kv, None, do, indices, None)

if SKV <= 4096:
if check_correctness:
assert_tensors_similar(tl_dq, ref_dq, eps=1e-4, name="dq")
assert_tensors_similar(tl_dkv, ref_dkv, eps=1e-4, name="dkv")
print("assert_tensors_similar passed")
@@ -385,4 +386,13 @@ def fn():

if __name__ == "__main__":
test_sparse_mla_bwd(
B=1, S=4096, SKV=4096, H=64, HKV=1, DQKV=576, DV=512, topk=2048, dtype=torch.bfloat16)
B=1,
S=4096,
SKV=8192,
H=64,
HKV=1,
DQKV=576,
DV=512,
topk=2048,
dtype=torch.bfloat16,
check_correctness=True)
18 changes: 14 additions & 4 deletions examples/deepseek_v32/sparse_mla_fwd.py
@@ -234,13 +234,14 @@ def ref_sparse_mla_fwd_interface(q, kv, indices, sm_scale=None, is_casual=True):

def test_sparse_mla_fwd(B=1,
S=4096,
SKV=4096,
SKV=8192,
H=128,
HKV=1,
DQK=576,
DV=512,
topk=2048,
dtype=torch.bfloat16):
dtype=torch.bfloat16,
check_correctness=True):
torch.random.manual_seed(0)
q = torch.randn((B, S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True)
kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True)
@@ -254,7 +255,7 @@ def test_sparse_mla_fwd(B=1,

tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices)

if SKV <= 4096:
if check_correctness:
# otherwise may cause out of memory
ref_out = ref_sparse_mla_fwd_interface(q, kv, indices)
assert_tensors_similar(tl_out, ref_out, eps=1e-2, name="out")
@@ -277,4 +278,13 @@ def fn():

if __name__ == "__main__":
test_sparse_mla_fwd(
B=1, S=4096, SKV=4096, H=128, HKV=1, DQK=576, DV=512, topk=2048, dtype=torch.bfloat16)
B=1,
S=4096,
SKV=4096,
H=128,
HKV=1,
DQK=576,
DV=512,
topk=2048,
dtype=torch.bfloat16,
check_correctness=True)
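The forward and backward tests now take an explicit check_correctness flag instead of inferring it from SKV, because materializing the dense reference output for a large SKV can exhaust GPU memory. A usage sketch (illustrative only; the import assumes the script is run from examples/deepseek_v32, and SKV=32768 is just an example of a large setting):

import torch
from sparse_mla_fwd import test_sparse_mla_fwd  # assumed import path

# Benchmark-style run with a large KV length: skip the reference comparison so
# the dense reference tensors are never allocated on the GPU.
test_sparse_mla_fwd(B=1, S=4096, SKV=32768, H=128, HKV=1, DQK=576, DV=512,
                    topk=2048, dtype=torch.bfloat16, check_correctness=False)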
11 changes: 6 additions & 5 deletions examples/deepseek_v32/sparse_mla_fwd_pipelined.py
@@ -399,14 +399,15 @@ def ref_sparse_mla_fwd_interface(q,

def test_sparse_mla_fwd_pipelined(B=1,
S=4096,
SKV=4096,
SKV=8192,
H=128,
HKV=1,
DQK=576,
DV=512,
topk=2048,
dtype=torch.bfloat16,
q_start_s_index=1024):
q_start_s_index=1024,
check_correctness=True):
KV_stride = 1

torch.random.manual_seed(0)
@@ -456,8 +457,8 @@ def fn():
parser.add_argument("--test_correctness", action="store_true")
args = parser.parse_args()
if args.test_correctness:
B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 1024, 2048, 128, 1, 576, 512, 2048, torch.bfloat16
B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 1024, 8192, 128, 1, 576, 512, 2048, torch.bfloat16
else:
B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 4096, 8192, 128, 1, 576, 512, 2048, torch.bfloat16
test_sparse_mla_fwd(B, S, SKV, H, HKV, DQK, DV, topk, dtype)
test_sparse_mla_fwd(B, S, SKV, H, HKV, DQK, DV, topk, dtype)
test_sparse_mla_fwd_pipelined(
B, S, SKV, H, HKV, DQK, DV, topk, dtype, check_correctness=args.test_correctness)
9 changes: 6 additions & 3 deletions examples/deepseek_v32/test_tilelang_example_deepseek_v32.py
@@ -20,20 +20,23 @@ def test_example_fp8_lighting_indexer():
@tilelang.testing.requires_cuda_compute_version_ge(9, 0)
def test_example_sparse_mla_fwd():
# small shapes for testing
test_sparse_mla_fwd(S=1024, SKV=2048, H=128, HKV=1, DQK=576, DV=512, topk=256)
test_sparse_mla_fwd(
S=1024, SKV=2048, H=128, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False)


@tilelang.testing.requires_cuda
@tilelang.testing.requires_cuda_compute_version_ge(9, 0)
def test_example_sparse_mla_fwd_pipelined():
# small shapes for testing
test_sparse_mla_fwd_pipelined(S=1024, SKV=2048, H=128, HKV=1, DQK=576, DV=512, topk=256)
test_sparse_mla_fwd_pipelined(
S=1024, SKV=2048, H=128, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False)


@tilelang.testing.requires_cuda
@tilelang.testing.requires_cuda_compute_version_ge(9, 0)
def test_example_sparse_mla_bwd():
test_sparse_mla_bwd()
test_sparse_mla_bwd(
S=1024, SKV=2048, H=64, HKV=1, DQKV=576, DV=512, topk=256, check_correctness=False)


if __name__ == "__main__":
10 changes: 5 additions & 5 deletions examples/flash_attention/test_example_flash_attention.py
@@ -27,18 +27,18 @@ def test_example_gqa_bwd_wgmma_pipelined():

@tilelang.testing.requires_cuda
def test_example_mha_bwd():
example_mha_bwd.main()
example_mha_bwd.main(BATCH=1)


@tilelang.testing.requires_cuda
def test_example_mha_bwd_bhsd():
example_mha_bwd_bhsd.main()
example_mha_bwd_bhsd.main(BATCH=1)


@tilelang.testing.requires_cuda
@tilelang.testing.requires_cuda_compute_version_ge(9, 0)
def test_example_mha_bwd_wgmma_pipelined():
example_mha_bwd_wgmma_pipelined.main()
example_mha_bwd_wgmma_pipelined.main(BATCH=1)


@tilelang.testing.requires_cuda
@@ -66,12 +66,12 @@ def test_example_mha_fwd_bhsd():
@tilelang.testing.requires_cuda
@tilelang.testing.requires_cuda_compute_version_ge(9, 0)
def test_example_mha_fwd_bshd_wgmma_pipelined():
example_mha_fwd_bshd_wgmma_pipelined.main()
example_mha_fwd_bshd_wgmma_pipelined.main(batch=1, heads=32, seq_len=256)


@tilelang.testing.requires_cuda
def test_example_mha_fwd_bshd():
example_mha_fwd_bshd.main()
example_mha_fwd_bshd.main(batch=1, seq_len=256)


@tilelang.testing.requires_cuda
10 changes: 2 additions & 8 deletions examples/norm/test_rms_norm.py
@@ -63,15 +63,9 @@ def ref_program(x):
return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-12)


def test_rms_norm():
M, N, blk_m = 8192, 8192, 1
def test_rms_norm(M=1024, N=1024, blk_m=1):
program = rms_norm(M, N, blk_m)
kernel = tilelang.compile(
program,
out_idx=-1,
target="cuda",
execution_backend="cython",
pass_configs={"tl.disable_tma_lower": True})
kernel = tilelang.compile(program, out_idx=-1, pass_configs={"tl.disable_tma_lower": True})
profiler = kernel.get_profiler()
profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01)

39 changes: 20 additions & 19 deletions src/layout/gemm_layouts.cc
@@ -177,8 +177,8 @@ Fragment makeGemmFragmentCHopper(const int block_m, const int block_n,
const int warp_m, const int warp_n,
const int element_size) {
ICHECK(block_m % warp_m == 0);
// ICHECK(block_n == warp_n);
ICHECK(warp_m % 16 == 0) << "warp_m=" << warp_m;

auto warp_layout = makeGemmFragment8x8()->Repeat({2, warp_n / 8}, false,
false); // 16 x N (1 warp)
auto block_layout = warp_layout->Repeat({block_m / warp_m, block_n / warp_n},
@@ -576,8 +576,8 @@ Layout MakeGemmVoltaBLayoutCongruous(int stride, int continuous) {
}

Layout makeGemmVoltaABLayout(int stride, int continuous, bool is_a,
int kfactor) {
if (kfactor == 2)
bool k_inner) {
if (k_inner)
return MakeGemmVoltaABLayoutCrosswise(stride, continuous);
if (is_a && continuous % 64 == 0)
return MakeGemmVoltaALayoutCongruous(stride, continuous);
@@ -705,29 +705,29 @@ Layout makeGemmSparseAmpereABLayout(int mat_stride, int mat_continuous,
* select specific swizzling strategies. It might be the same as mat_continuous
* or different based on tiling or hardware details.
* \param element_size The size of each element in the matrix, in bits (e.g., 8,
* 16, 32, 64). \param kfactor An integer factor that influences layout
* 16, 32, 64). \param k_inner Whether the K dimension is in the inner loop.
* selection, particularly for fp64 and int8 types. It often relates to how the
* K dimension of the GEMM (M x K * K x N) is handled or tiled.
* - For fp64 (element_size == 64):
* - kfactor == 1 often implies K is in the "outer" loop (e.g.,
* KxN matrix).
* - kfactor == 2 often implies K is in the "inner" loop (e.g.,
* NxK matrix).
* - k_inner == false often implies K is in the "outer" loop
* (e.g., KxN matrix).
* - k_inner == true often implies K is in the "inner" loop
* (e.g., NxK matrix).
* - For int8 (element_size == 8):
* - kfactor == 1 uses a padded layout.
* - k_inner == false uses a padded layout.
* \return A Layout object representing the chosen memory layout.
*/
Layout makeGemmABLayout(int mat_stride, int mat_continuous, int continuity,
int element_size, int kfactor) {
int element_size, bool k_inner) {
if (element_size == 64) {
if (kfactor == 1 && continuity % 16 == 0) // float64 KxN
if (!k_inner && continuity % 16 == 0) // float64 KxN
return makeGemmABLayoutF64_Kouter(mat_stride, mat_continuous);
if (kfactor == 2 && continuity % 16 == 0) // float64 NxK
if (k_inner && continuity % 16 == 0) // float64 NxK
return makeGemmABLayoutF64_Kinner(mat_stride, mat_continuous);
return makeGemmABLayoutPadded(mat_stride, mat_continuous, element_size);
}
int vector_size = 128 / element_size;
if (kfactor == 1 && element_size == 8) // int8 KxN
if (!k_inner && element_size == 8) // int8 KxN
return makeGemmABLayoutPadded(mat_stride, mat_continuous, element_size);
else if (mat_continuous % (vector_size * 8) == 0)
return makeFullBankSwizzleLayout(mat_stride, mat_continuous, element_size);
@@ -739,16 +739,17 @@ Layout makeGemmABLayout(int mat_stride, int mat_continuous, int continuity,
}

Layout makeGemmABLayoutHopper(int mat_stride, int mat_continuous,
int continuity, int element_size, int kfactor) {
int continuity, int element_size, bool k_inner) {
if (element_size == 64) {
if (kfactor == 1 && continuity % 16 == 0) // float64 KxN
if (!k_inner && continuity % 16 == 0) // float64 KxN
return makeGemmABLayoutF64_Kouter(mat_stride, mat_continuous);
if (kfactor == 2 && continuity % 16 == 0) // float64 NxK
if (k_inner && continuity % 16 == 0) // float64 NxK
return makeGemmABLayoutF64_Kinner(mat_stride, mat_continuous);
return makeQuarterBankSwizzleLayout(mat_stride, mat_continuous,
element_size);
}
int vector_size = 128 / element_size;

if (mat_continuous % (vector_size * 8) == 0)
return makeFullBankSwizzleLayout(mat_stride, mat_continuous, element_size);
else if (mat_continuous % (vector_size * 4) == 0)
Expand All @@ -761,11 +762,11 @@ Layout makeGemmABLayoutHopper(int mat_stride, int mat_continuous,
else
ICHECK(0) << "Unsupported layout for Hopper with stride=" << mat_stride
<< ", continuous=" << mat_continuous
<< ", element_size=" << element_size << ", kfactor=" << kfactor;
<< ", element_size=" << element_size << ", k_inner=" << k_inner;
}

Layout makeGemmABLayoutSm100(int mat_stride, int mat_continuous, int continuity,
int element_size, int kfactor) {
int element_size, bool k_inner) {
if (element_size == 64) {
ICHECK(0) << "float64 on sm100 is not supported now";
}
@@ -782,7 +783,7 @@ Layout makeGemmABLayoutSm100(int mat_stride, int mat_continuous, int continuity,
else
ICHECK(0) << "Unsupported layout for sm100 with stride=" << mat_stride
<< ", continuous=" << mat_continuous
<< ", element_size=" << element_size << ", kfactor=" << kfactor;
<< ", element_size=" << element_size << ", k_inner=" << k_inner;
__builtin_unreachable(); // to prevent compiler warning
}

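The rename of kfactor to k_inner above turns layout selection into a boolean: k_inner=true means the K dimension of the GEMM tile is the contiguous (inner) dimension, k_inner=false means K is the outer dimension. A rough sketch of driving this through the FFI entry point registered in src/layout/layout.cc below, for an fp16 64x64 tile (the tvm.get_global_func retrieval path is an assumption about how tilelang's bundled TVM exposes these registrations):

import tvm

# Mirrors the registration below: (stride, continuous, element_size_bits, k_inner, allow_pad).
make_swizzled_layout = tvm.get_global_func("tl.make_swizzled_layout")

# K contiguous (inner): selects the bank-swizzled layouts for fp16.
layout_k_inner = make_swizzled_layout(64, 64, 16, True, True)

# K outer: for fp64 and int8 this is where makeGemmABLayout falls back to the
# K-outer or padded variants described in the comment above.
layout_k_outer = make_swizzled_layout(64, 64, 16, False, True)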
44 changes: 41 additions & 3 deletions src/layout/layout.cc
@@ -484,6 +484,11 @@ TVM_FFI_STATIC_INIT_BLOCK({
[](Layout layout) { return layout->GetForwardIndex(); })
.def("tl.Layout_forward_vars",
[](Layout layout) { return layout->GetForwardVars(); })
.def("tl.Layout_is_equal",
[](Layout layout, Layout other) {
const LayoutNode *other_node = other.as<LayoutNode>();
return layout->IsEqual(other_node);
})
.def_packed("tl.Fragment",
[](PackedArgs args, Any *rv) {
*rv = Fragment(
@@ -492,6 +497,11 @@
/*forward_thread=*/args[2].cast<PrimExpr>(),
/*thread_replicate=*/args[3].cast<IterVar>());
})
.def("tl.Fragment_is_equal",
[](Fragment fragment, Fragment other) {
const FragmentNode *other_node = other.as<FragmentNode>();
return fragment->IsEqual(other_node);
})
.def("tl.Fragment_thread_size",
[](Fragment fragment) { return fragment->ThreadExtent(); })
.def("tl.Fragment_thread",
@@ -509,10 +519,38 @@
.def("tl.Fragment_condense_rep_var",
[](Fragment fragment) { return fragment->CondenseReplicateVar(); })
.def("tl.make_swizzled_layout",
[](int stride, int continuous, int element_size, bool k_inner,
bool allow_pad = true) {
if (allow_pad) {
return makeGemmABLayout(stride, continuous, continuous,
element_size, k_inner);
} else {
return makeGemmABLayoutHopper(stride, continuous, continuous,
element_size, k_inner);
}
})
.def("tl.make_wgmma_swizzled_layout",
[](int stride, int mat_continuous, int continuity, int element_size,
bool k_inner) {
return makeGemmABLayoutHopper(stride, mat_continuous, continuity,
element_size, k_inner);
})
.def("tl.make_full_bank_swizzled_layout",
[](int stride, int continuous, int element_size) {
return makeGemmABLayout(stride, continuous, continuous,
element_size, 0);
});
return makeFullBankSwizzleLayout(stride, continuous, element_size);
})
.def("tl.make_half_bank_swizzled_layout",
[](int stride, int continuous, int element_size) {
return makeHalfBankSwizzleLayout(stride, continuous, element_size);
})
.def("tl.make_quarter_bank_swizzled_layout",
[](int stride, int continuous, int element_size) {
return makeQuarterBankSwizzleLayout(stride, continuous,
element_size);
})
.def("tl.make_linear_layout", [](int stride, int continuous) {
return makeGemmLayoutLinear(stride, continuous);
});
});

TVM_FFI_STATIC_INIT_BLOCK({
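Beyond the swizzle constructors, the block above also exposes structural equality checks for Layout and Fragment objects. A small sketch, under the same FFI-retrieval assumption as the previous example:

import tvm

full_bank = tvm.get_global_func("tl.make_full_bank_swizzled_layout")
layout_is_equal = tvm.get_global_func("tl.Layout_is_equal")

# Arguments are (stride, continuous, element_size in bits); two layouts built
# with identical parameters are expected to compare equal.
a = full_bank(64, 64, 16)
b = full_bank(64, 64, 16)
assert layout_is_equal(a, b)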