Merged
Changes from all commits
31 commits
9c3cecb
Refactor: Replace @tilelang.lazy_jit with @tilelang.jit in examples a…
LeiWang1999 Jan 6, 2026
f48647b
Enhancement: Improve layout annotation handling in T.copy and paralle…
LeiWang1999 Jan 6, 2026
bfecb9b
Merge branch 'main' of https://github.com/tile-ai/tilelang into refac…
LeiWang1999 Jan 6, 2026
d173672
Add out_idx parameter to @jit and validate it's only used in lazy mode
LeiWang1999 Jan 6, 2026
5740b4e
lintfix
LeiWang1999 Jan 6, 2026
431ddbd
Merge branch 'main' of https://github.com/tile-ai/tilelang into refac…
LeiWang1999 Jan 6, 2026
b62ec8e
Refactor: centralize mode inference in JITImpl.__call__ before parse_…
LeiWang1999 Jan 6, 2026
e97de85
test fix
LeiWang1999 Jan 6, 2026
8b7e1ee
Enhancement: Introduce JITNoBuilderError for eager mode checks in T.K…
LeiWang1999 Jan 7, 2026
5473823
lint fix
LeiWang1999 Jan 7, 2026
ce70005
lint fix
LeiWang1999 Jan 7, 2026
edb8500
Enhancement: Update example_triton_sparse_gqa_decode_varlen_indice.py…
LeiWang1999 Jan 7, 2026
49a0f51
Merge branch 'main' of https://github.com/tile-ai/tilelang into jit_m…
LeiWang1999 Jan 7, 2026
c41d1ed
lint fix
LeiWang1999 Jan 7, 2026
f704c6d
Enhancement: Refactor JIT implementation to centralize mode initializ…
LeiWang1999 Jan 7, 2026
e2bcc75
Merge branch 'main' of https://github.com/tile-ai/tilelang into jit_m…
LeiWang1999 Jan 7, 2026
b78f650
Refactor: Remove unused simplify_prim_func decorator from example_gem…
LeiWang1999 Jan 7, 2026
050f4fc
remove claude.md
LeiWang1999 Jan 7, 2026
1a81308
Refactor: Clean up code formatting in layout.cc, lower_tile_op.cc, an…
LeiWang1999 Jan 7, 2026
f7e00eb
Refactor: Replace logical conditions with tir_all in complex expressi…
LeiWang1999 Jan 7, 2026
14d588a
Enhancement: Add @tilelang.testing.requires_cuda decorator to test fu…
LeiWang1999 Jan 7, 2026
b3c4d77
lint fix
LeiWang1999 Jan 7, 2026
62beb83
Refactor: Simplify output shape calculation in layout.cc by directly …
LeiWang1999 Jan 8, 2026
4093759
Refactor: Update boolean condition in test_boolop to use Or and Not f…
LeiWang1999 Jan 8, 2026
d2d0ce6
Refactor: Update test execution flow in test_tilelang_fragment_loop_c…
LeiWang1999 Jan 8, 2026
e8fc7b0
Refactor JIT function handling: Replace LazyJITFunc with JITFunc acro…
LeiWang1999 Jan 9, 2026
3e6f2cc
Update JITImpl documentation: Clarify output tensor index attribute d…
LeiWang1999 Jan 9, 2026
ef07119
Merge branch 'main' of https://github.com/tile-ai/tilelang into jit_m…
LeiWang1999 Jan 9, 2026
d33550b
Merge branch 'jit_merge_0107' of https://github.com/LeiWang1999/tilel…
LeiWang1999 Jan 9, 2026
36f37f4
Refactor TileLang to transition from v2 to eager language module. Upd…
LeiWang1999 Jan 9, 2026
d86f847
Enhance JIT compilation support: Update JIT function handling to impr…
LeiWang1999 Jan 9, 2026
@@ -329,21 +329,15 @@ def main(batch=64, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=1
max_selected_blocks = int(math.ceil(max_cache_seqlen * (1 - sparse_ratio) / block_size))
print("max_selected_blocks: ", max_selected_blocks)
dtype = torch.float16
block_H = 64

Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda")
K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda")
V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda")
cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda")
# cache_seqlens = torch.full((batch,), max_cache_seqlen, dtype=torch.int32, device='cuda')
# Ensure at least one element equals cache_seqlen
random_index = torch.randint(0, batch, (1,), device="cuda").item() # Select a random index
cache_seqlens[random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence

print("cache_seqlens: ", cache_seqlens)

max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int()
print("max_valid_num_blocks: ", max_valid_num_blocks)
# Initialize block_indices with -1 (for padding blocks)
block_indices = torch.full((batch, heads_kv, max_selected_blocks), -1, dtype=torch.int32, device="cuda")

@@ -357,13 +351,7 @@ def main(batch=64, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=1

# Sort indices within each batch-group for consistency
block_indices, _ = block_indices.sort(dim=-1, descending=True)
# print("block_indices: ", block_indices)
actual_num_blocks = torch.sum(block_indices != -1, dim=-1).to(torch.int32)[:, 0]
print("actual_num_blocks: ", actual_num_blocks)
# print(block_indices.shape, actual_num_blocks.shape)

max_num_blocks = torch.max(max_valid_num_blocks).item()
print("max_num_blocks: ", max_num_blocks)

ref = ref_program_torch(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, block_size)

@@ -402,6 +390,7 @@ def main(batch=64, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=1
avg_time = elapsed_time / 1000
avg_flops = total_flops / avg_time
print(f"Average time: {avg_time:.6f} seconds")
print(f"Average FLOPS: {avg_flops:.2f} GFLOPS")

# Measure performance of reference implementation
import flash_attn # noqa: F401
@@ -415,7 +404,7 @@ def main(batch=64, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=1
avg_time_ref = elapsed_time_ref / 1000
avg_flops_ref = total_flops / avg_time_ref
print(f"Average time of ref: {avg_time_ref:.6f} seconds")

print(f"Average FLOPS of ref: {avg_flops_ref:.2f} GFLOPS")
print(f"Speedup: {avg_time_ref / avg_time:.2f}x")


2 changes: 0 additions & 2 deletions examples/gemm/example_gemm_intrinsics.py
@@ -6,7 +6,6 @@
from tilelang.intrinsics.mma_macro_generator import (
TensorCoreIntrinEmitter,
)
from tilelang.transform import simplify_prim_func


def make_swizzle_layout(shared_buf):
@@ -25,7 +24,6 @@ def transform_func(i, j):


@tilelang.jit(out_idx=[2])
@simplify_prim_func
def tl_matmul(
M,
N,
2 changes: 0 additions & 2 deletions examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py
@@ -7,7 +7,6 @@
from tilelang.intrinsics.mma_macro_generator import (
TensorCoreIntrinEmitter,
)
from tilelang.transform import simplify_prim_func
from tilelang.utils.tensor import map_torch_type

tilelang.testing.set_random_seed(0)
@@ -29,7 +28,6 @@ def transform_func(i, j):


@tilelang.jit(out_idx=[2])
@simplify_prim_func
def tl_matmul(
M,
N,
22 changes: 11 additions & 11 deletions examples/lazy_jit/lazyjit.en.ipynb
@@ -53,7 +53,7 @@
"metadata": {},
"outputs": [],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def gemm(\n",
" A,\n",
" B,\n",
@@ -209,7 +209,7 @@
"metadata": {},
"outputs": [],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def gemm_dyn_K(A, B):\n",
" M, N, K = T.dynamic(\"M, N, K\")\n",
" A: T.Tensor[[M, K], T.float16]\n",
@@ -248,7 +248,7 @@
"metadata": {},
"outputs": [],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def as_contingious(A):\n",
" M, N, dM, dN = T.dynamic(\"M, N, dM, dN\")\n",
" A: T.StridedTensor[[M, N], [dM, dN], T.float32]\n",
@@ -307,7 +307,7 @@
"metadata": {},
"outputs": [],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def gemm_ptr(\n",
" A,\n",
" B,\n",
@@ -359,7 +359,7 @@
"metadata": {},
"outputs": [],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def gemm_ptr_dyn(A, B, M, N, K):\n",
" M: T.int32\n",
" N: T.int32\n",
@@ -421,7 +421,7 @@
}
],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def example_wrong_kernel(A):\n",
" M = T.const(\"M\")\n",
" A: T.Tensor[[M * 2, M * 3], T.float32]\n",
@@ -470,7 +470,7 @@
}
],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def dyn_annot(\n",
" A: T.ptr, # 1. T.ptr type annotation\n",
" is_2d=False,\n",
@@ -515,7 +515,7 @@
"metadata": {},
"outputs": [],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def add_one(X, data: T.float32 = 1):\n",
" M, N = T.const(\"M, N\")\n",
" X: T.Tensor[[M, N], T.float32]\n",
@@ -577,7 +577,7 @@
"B = torch.randn(128, 128, dtype=torch.float16, device=\"cuda\")\n",
"\n",
"\n",
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def dummy_kernel(A, B):\n",
" M, N = T.const(\"M, N\")\n",
" A: T.Tensor[[M, N], T.float16]\n",
@@ -797,7 +797,7 @@
"metadata": {},
"outputs": [],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def element_wise(A, fn):\n",
" N = T.dynamic(\"N\")\n",
" A: T.Tensor[[N], T.float32]\n",
@@ -857,7 +857,7 @@
" n31(x * 3 + 1, var)\n",
"\n",
"\n",
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def foo(A: T.Tensor[[1], T.int32], n: int):\n",
" with T.Kernel(1) as _:\n",
" n31(n, A[0])"
22 changes: 11 additions & 11 deletions examples/lazy_jit/lazyjit.zh.ipynb
@@ -53,7 +53,7 @@
"metadata": {},
"outputs": [],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def gemm(\n",
" A,\n",
" B,\n",
@@ -209,7 +209,7 @@
"metadata": {},
"outputs": [],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def gemm_dyn_K(A, B):\n",
" M, N, K = T.dynamic(\"M, N, K\")\n",
" A: T.Tensor[[M, K], T.float16]\n",
@@ -248,7 +248,7 @@
"metadata": {},
"outputs": [],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def as_contingious(A):\n",
" M, N, dM, dN = T.dynamic(\"M, N, dM, dN\")\n",
" A: T.StridedTensor[[M, N], [dM, dN], T.float32]\n",
@@ -307,7 +307,7 @@
"metadata": {},
"outputs": [],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def gemm_ptr(\n",
" A,\n",
" B,\n",
@@ -359,7 +359,7 @@
"metadata": {},
"outputs": [],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def gemm_ptr_dyn(A, B, M, N, K):\n",
" M: T.int32\n",
" N: T.int32\n",
@@ -421,7 +421,7 @@
}
],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def example_wrong_kernel(A):\n",
" M = T.const(\"M\")\n",
" A: T.Tensor[[M * 2, M * 3], T.float32]\n",
@@ -470,7 +470,7 @@
}
],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def dyn_annot(\n",
" A: T.ptr, # 1. T.ptr type annotation\n",
" is_2d=False,\n",
@@ -515,7 +515,7 @@
"metadata": {},
"outputs": [],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def add_one(X, data: T.float32 = 1):\n",
" M, N = T.const(\"M, N\")\n",
" X: T.Tensor[[M, N], T.float32]\n",
@@ -577,7 +577,7 @@
"B = torch.randn(128, 128, dtype=torch.float16, device=\"cuda\")\n",
"\n",
"\n",
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def dummy_kernel(A, B):\n",
" M, N = T.const(\"M, N\")\n",
" A: T.Tensor[[M, N], T.float16]\n",
@@ -797,7 +797,7 @@
"metadata": {},
"outputs": [],
"source": [
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def element_wise(A, fn):\n",
" N = T.dynamic(\"N\")\n",
" A: T.Tensor[[N], T.float32]\n",
@@ -857,7 +857,7 @@
" n31(x * 3 + 1, var)\n",
"\n",
"\n",
"@tilelang.lazy_jit\n",
"@tilelang.jit\n",
"def foo(A: T.Tensor[[1], T.int32], n: int):\n",
" with T.Kernel(1) as _:\n",
" n31(n, A[0])"
2 changes: 1 addition & 1 deletion src/transform/layout_inference.cc
@@ -222,7 +222,7 @@ class BufferUseDefCollector : public IRVisitorWithAnalyzer {
continue;

// Check if buffer exists in use_list_
if (!use_list_.count(buffer)) {
if (!use_list_.count(buffer) && IsFragmentBuffer(buffer)) {
LOG(WARNING) << "Layout inference failed for buffer " << buffer
<< ". "
<< "The buffer cannot be inferred with current layout "
3 changes: 2 additions & 1 deletion src/transform/lower_tile_op.cc
@@ -54,7 +54,6 @@ static Buffer makeBufferWithLayout(const Buffer &buffer, const Layout &layout,
}
Array<PrimExpr> layout_shape = layout->OutputShape();
Array<PrimExpr> output_shape = layout_shape;

if (ptr_type->storage_scope == "shared" ||
ptr_type->storage_scope == "shared.dyn") {
int replicate_extent = 1;
@@ -67,6 +66,8 @@
}
for (size_t i = 0; i < layout_shape.size(); i++) {
auto shape = layout_shape[i].as<IntImmNode>();
ICHECK(shape) << "Layout output shape must be constant integer, but got: "
<< layout_shape[i];
layout_extent *= shape->value;
}
replicate_extent = buffer_extent / layout_extent;
17 changes: 6 additions & 11 deletions testing/python/arith/test_arith_hard.py
@@ -3,6 +3,7 @@
from tvm.arith import Analyzer
from tvm.ir.expr import Range
from tvm.tir.expr import Not, Or
from tvm.tir import all as tir_all


def implies(x, y):
@@ -21,30 +22,25 @@ def check_expr(expr):
if not result:
smtlib2 = analyzer.get_smtlib2(expr)
raise AssertionError(f"Failed to prove: {expr}\nSMT-LIB2:\n{smtlib2}")
# assert result, f"Failed to prove: {expr}"

@T.macro
def complex_expr_1():
return implies(a > 0 and b > 0 and c > 0, ((b - a) // c) * c + a <= b)
return implies(tir_all(a > 0, b > 0, c > 0), ((b - a) // c) * c + a <= b)

check_expr(complex_expr_1())

@T.macro
def complex_expr_2():
return implies(a < b and b < c and a * d < b * d, b * d < c * d)
return implies(tir_all(a < b, b < c, a * d < b * d), b * d < c * d)

check_expr(complex_expr_2())

@T.macro
def complex_expr_3():
return implies(a >= 0 and a < 128, a // 128 == (a // 64 * 32 + a % 32 // 16 * 8) // 64)
return implies(tir_all(a >= 0, a < 128), a // 128 == (a // 64 * 32 + a % 32 // 16 * 8) // 64)

check_expr(complex_expr_3())

@T.macro
def complex_expr_4():
return implies(
a >= 0 and a < 128,
tir_all(a >= 0, a < 128),
(a % 16 * 64 + a // 64 * 32 + a % 8 // 4 * 32 + (a % 32 // 16 + a % 2) % 2 * 8 + 16 - (a // 64 + a % 8 // 4) // 2 * 64) // 512
== (a % 16 * 64 + a // 64 * 32 + a % 8 // 4 * 32 + (a % 32 // 16 + a % 2) % 2 * 8 - (a // 64 + a % 8 // 4) // 2 * 64) // 512,
)
Expand All @@ -59,9 +55,8 @@ def test_smtlib2():
b = T.Var("b", T.int32)
c = T.Var("c", T.int32)

@T.macro
def complex_expr_1():
return implies(a > 0 and b > 0 and c > 0, ((b - a) // c) * c + a <= b)
return implies(tir_all(a > 0, b > 0, c > 0), ((b - a) // c) * c + a <= b)

e = complex_expr_1()
analyzer = Analyzer()
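
The hunks above swap Python's `and` for tvm.tir.all when composing TIR predicates. In the eager path these macro bodies execute as ordinary Python, where `and`/`or` operate on operand truthiness (TVM rejects boolean coercion of symbolic expressions) instead of building And/Or nodes, so the conjunction has to be constructed explicitly. A minimal sketch, assuming only a standard TVM installation and not taken from this PR:

# --- illustrative sketch, not part of the PR diff ---
from tvm import tir
from tvm.tir import all as tir_all

a = tir.Var("a", "int32")
b = tir.Var("b", "int32")

# Builds an explicit conjunction node equivalent to tir.And(a > 0, b > 0).
cond = tir_all(a > 0, b > 0)
print(cond)

# By contrast, `(a > 0) and (b > 0)` would try to coerce the first PrimExpr
# to bool, which does not produce the symbolic And that the prover needs.
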
1 change: 1 addition & 0 deletions testing/python/issue/test_tilelang_issue_1549.py
@@ -4,6 +4,7 @@
import torch


@tilelang.testing.requires_cuda
def test_issue_1549_strange_var_vectorization():
@tl.jit
def get_wrong_kernel(M: int = 4096):
1 change: 1 addition & 0 deletions testing/python/issue/test_tilelang_issue_1601.py
@@ -3,6 +3,7 @@
import tilelang.language as T


@tilelang.testing.requires_cuda
def test_issue_1601():
@tilelang.jit
def qwq():
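
The two test files above gain a @tilelang.testing.requires_cuda guard (added across the suite by commit 14d588a). A minimal usage sketch; the skip-when-no-GPU behavior is an assumption based on how the decorator is applied in this PR:

# --- illustrative sketch, not part of the PR diff ---
import tilelang
import tilelang.testing
import torch


@tilelang.testing.requires_cuda
def test_gpu_only_path():
    # Intended to run only when a CUDA device is present; on CPU-only
    # machines the test should be skipped rather than fail (assumed
    # behavior, mirroring the guards added in this PR).
    assert torch.cuda.is_available()
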
4 changes: 1 addition & 3 deletions testing/python/kernel/test_tilelang_kernel_int4_gemm_mma.py
@@ -6,17 +6,15 @@
from tilelang.intrinsics import (
make_mma_swizzle_layout as make_swizzle_layout,
)

from tilelang.transform import simplify_prim_func
from tilelang.intrinsics.mma_macro_generator import (
INT4TensorCoreIntrinEmitter,
INT4TensorCoreIntrinEmitterWithLadderTransform,
)
from tilelang.transform import simplify_prim_func

tilelang.testing.set_random_seed(42)


# @simplify_prim_func
def tl_matmul(
M,
N,