tile-ai · LeiWang1999 · Dec 1, 2025 · Dec 1, 2025 · Dec 1, 2025 · Dec 1, 2025
diff --git a/examples/gemm_sp/example_custom_compress.py b/examples/gemm_sp/example_custom_compress.py
@@ -275,8 +275,9 @@ def kernel(
             for tm in T.Parallel(block_M):
                 for g_i in range(0, block_K // group):
                     a_k = g_i * group
-                    T.clear(non_zero_cnt)
-                    T.clear(non_zero_elt_log_idx)
+                    non_zero_cnt[0] = 0
+                    for i in range(elem):
+                        non_zero_elt_log_idx[i] = 0
                     for i in range(group):
                         val = A_shared[tm, a_k + i]
                         if val != 0.0:

diff --git a/src/target/codegen_cuda.cc b/src/target/codegen_cuda.cc
@@ -312,7 +312,12 @@ std::string CodeGenTileLangCUDA::Finish() {
 void CodeGenTileLangCUDA::VisitStmt_(const tir::ForNode *op) {
   if (op->kind == tir::ForKind::kUnrolled) {
     PrintIndent();
-    stream << "#pragma unroll\n";
+    if (unroll_factor.count(op->loop_var.get())) {
+      stream << "#pragma unroll "
+             << PrintExpr(unroll_factor[op->loop_var.get()]) << "\n";
+    } else {
+      stream << "#pragma unroll\n";
+    }
   }
   std::string extent =
       PrintExpr(arith::Analyzer().Simplify(op->extent + op->min));
@@ -2661,7 +2666,12 @@ void CodeGenTileLangCUDA::VisitStmt_(const AttrStmtNode *op) {
     this->stream << "const dim3 blockIdx = " << pattern->value << "();\n";
     this->VisitStmt(op->body);
     return;
+  } else if (op->attr_key == "pragma_unroll_factor") {
+    const IntImmNode *factor = op->value.as<IntImmNode>();
+    ICHECK(factor);
+    unroll_factor[op->node.as<VarNode>()] = Downcast<IntImm>(factor);
   }
+
   CodeGenC::VisitStmt_(op);
 }
 

diff --git a/src/target/codegen_cuda.h b/src/target/codegen_cuda.h
@@ -140,6 +140,7 @@ class CodeGenTileLangCUDA final : public CodeGenC {
 
   std::unordered_map<const VarNode *, std::string> fragment_shapes;
   std::unordered_map<const VarNode *, std::string> fragment_layouts;
+  std::unordered_map<const VarNode *, IntImm> unroll_factor;
   friend void PrintConst(const FloatImmNode *op, std::ostream &os,
                          CodeGenTileLangCUDA *p);
   void PrintWmmaScope(const std::string &scope, DataType t,

diff --git a/testing/python/language/test_tilelang_language_unroll.py b/testing/python/language/test_tilelang_language_unroll.py
@@ -0,0 +1,37 @@
+import tilelang.testing
+from tilelang import tvm as tvm
+from tilelang import language as T
+
+
+def test_unroll_with_step():
+    """Stepped ``T.unroll`` must still emit ``#pragma unroll`` in the generated source."""
+    @T.prim_func
+    def main(A_ptr: T.handle):
+        A = T.match_buffer(A_ptr, (16, 16), dtype="float32", align=16)
+
+        for _blockIdx in T.thread_binding(1, thread="blockIdx.x"):
+            for _threadIdx in T.thread_binding(128, thread="threadIdx.x"):
+                for i in T.unroll(0, 16, step=4):
+                    A[0, i] = 1.0
+
+    kernel = tilelang.compile(main, target="cuda")
+    assert "#pragma unroll" in kernel.get_kernel_source()
+
+
+def test_unroll_with_unroll_factor():
+    """``T.unroll(..., unroll_factor=4)`` must emit ``#pragma unroll 4`` in the source."""
+    @T.prim_func
+    def main(A_ptr: T.handle):
+        A = T.match_buffer(A_ptr, (16, 16), dtype="float32", align=16)
+
+        for _blockIdx in T.thread_binding(1, thread="blockIdx.x"):
+            for _threadIdx in T.thread_binding(128, thread="threadIdx.x"):
+                for i in T.unroll(0, 16, unroll_factor=4):
+                    A[0, i] = 1.0
+
+    kernel = tilelang.compile(main, target="cuda")
+    assert "#pragma unroll 4" in kernel.get_kernel_source()
+
+
+if __name__ == "__main__":
+    tilelang.testing.main()
diff --git a/tilelang/language/__init__.py b/tilelang/language/__init__.py
@@ -24,7 +24,15 @@
     LocalBuffer,  # noqa: F401
     Ref,  # noqa: F401
 )
-from .loop import serial, Parallel, Persistent, Pipelined  # noqa: F401
+from .loop import (
+    Parallel,  # noqa: F401
+    Persistent,  # noqa: F401
+    Pipelined,  # noqa: F401
+    serial,  # noqa: F401
+    unroll,  # noqa: F401
+    Serial,  # noqa: F401
+    Unroll,  # noqa: F401
+)
 from .frame import has_let_value, get_let_value  # noqa: F401
 from .math_intrinsics import *  # noqa: F401
 from .kernel import (

diff --git a/tilelang/language/experimental/gemm_sp.py b/tilelang/language/experimental/gemm_sp.py
@@ -198,7 +198,7 @@ def legalize_arguments(arg: tir.Buffer | tir.Var):
     C_arg = buffer_region_to_tile_region(C_region, "rw", [r for r in C_shape])
     return tir.call_intrin(
         "handle",
-        tir.op.Op.get("tl.gemm_sp_py"),
+        tir.op.Op.get("tl.tileop.gemm_sp_py"),
         A_arg,
         E_arg,
         B_arg,

diff --git a/tilelang/language/loop.py b/tilelang/language/loop.py
@@ -4,8 +4,9 @@
 from tvm import tir
 from tvm.tir import IntImm
 import tvm.script.ir_builder.tir as tb_tir
-from .v2.builder import SerialForWithStep
+from .v2.builder import SerialForWithStep, UnrollForWithStep
 from tilelang import _ffi_api
+from tvm.script.ir_builder.tir import frame
 
 
 def Parallel(*extents: tir.PrimExpr, coalesced_width: int | None = None):
@@ -97,7 +98,7 @@ def serial(start: tir.PrimExpr,
            stop: tir.PrimExpr | None = None,
            step: tir.PrimExpr | None = None,
            *,
-           annotations: dict[str, Any] | None = None):
+           annotations: dict[str, Any] | None = None) -> frame.ForFrame:
     step_is_one = False
     step_is_one |= isinstance(step, int) and step == 1
     step_is_one |= isinstance(step, IntImm) and step.value == 1
@@ -108,3 +109,70 @@ def serial(start: tir.PrimExpr,
             stop = start
             start = IntImm(start.dtype, 0) if hasattr(start, "dtype") else 0
         return SerialForWithStep(start, stop, step, annotations=annotations)
+
+
+def unroll(start: tir.PrimExpr,
+           stop: tir.PrimExpr | None = None,
+           step: tir.PrimExpr | None = None,
+           *,
+           explicit: bool = False,
+           unroll_factor: int | None = None,
+           annotations: dict[str, Any] | None = None) -> frame.ForFrame:
+    """Create an unrolled For loop, optionally strided by ``step``.
+
+    Parameters
+    ----------
+    start : PrimExpr
+        The minimum value of iteration. When ``stop`` is omitted this value
+        is used as ``stop`` and the loop starts at 0.
+    stop : PrimExpr
+        The maximum value of iteration.
+    step : PrimExpr
+        The step size of the iteration. ``None`` or a constant 1 selects the
+        plain ``tb_tir.unroll`` frame.
+    explicit : bool
+        Value used for the ``pragma_unroll_explicit`` annotation unless
+        ``annotations`` already provides one.
+    unroll_factor : int
+        Stored as the ``pragma_unroll_factor`` annotation when given.
+    annotations : Dict[str, Any]
+        The optional annotations of the For statement. The dict is copied.
+
+    Returns
+    -------
+    res : frame.ForFrame
+        The ForFrame, or an ``UnrollForWithStep`` for a non-unit ``step``.
+
+    Raises
+    ------
+    ValueError
+        If ``unroll_factor`` is given while ``pragma_unroll_explicit`` is set.
+    """
+
+    # A constant step of 1 is treated like no step, mirroring the check in `serial`.
+    step_is_one = isinstance(step, int) and step == 1
+    step_is_one |= isinstance(step, IntImm) and step.value == 1
+
+    if stop is None:
+        # Single-bound form: iterate from 0 up to the given value.
+        stop = start
+        start = IntImm(start.dtype, 0) if hasattr(start, "dtype") else 0
+
+    # Work on a copy so the caller's dict is never mutated; a caller-supplied
+    # "pragma_unroll_explicit" takes precedence over the `explicit` argument.
+    annotations = {} if annotations is None else dict(annotations)
+    annotations.setdefault("pragma_unroll_explicit", explicit)
+
+    if unroll_factor is not None:
+        # A factor is rejected when explicit unrolling has been requested.
+        if annotations["pragma_unroll_explicit"]:
+            raise ValueError("unroll_factor requires pragma_unroll_explicit=False")
+        annotations["pragma_unroll_factor"] = unroll_factor
+
+    if step is None or step_is_one:
+        return tb_tir.unroll(start, stop, annotations=annotations)
+    return UnrollForWithStep(start, stop, step, annotations=annotations)
+
+
+Serial = serial  # capitalized alias of `serial`
+Unroll = unroll  # capitalized alias of `unroll`
diff --git a/tilelang/language/v2/builder.py b/tilelang/language/v2/builder.py
@@ -112,6 +112,11 @@ class SerialForWithStep:
     annotations: dict[str, Any] | None = None
 
 
+@dataclass
+class UnrollForWithStep(SerialForWithStep):
+    """Stepped-loop descriptor that ``ctx_for`` lowers via ``tir.unroll``, not ``tir.serial``."""
+
+
 # Python 3.9 compatibility: avoid PEP 604 unions at runtime
 # Use tuple for isinstance checks and typing.Union for annotations/aliases
 ContinueOrBreak = (ContinueFrame, BreakFrame)
@@ -270,7 +275,7 @@ def eval(self, val: Any):
     def ctx_for(self, it):
         self.check_continue_break()
         it = unwrap_expr(it)
-        if isinstance(it, SerialForWithStep):
+        if isinstance(it, (SerialForWithStep, UnrollForWithStep)):
             # Validate and compute the trip count before constructing the frame
             if isinstance(it.step, (int, IntImm)):
                 step_value = it.step if isinstance(it.step, int) else it.step.value
@@ -285,7 +290,14 @@ def ctx_for(self, it):
                     f'Using a non-constant step `{it.step}` in stepped serial may lead to undefined behavior in tilelang'
                 )
                 real_stop = tir.ceildiv(it.stop - it.start, it.step)
-            real_frame = tir.serial(real_stop, annotations=it.annotations)
+            if isinstance(it, UnrollForWithStep):
+                real_frame = tir.unroll(real_stop, annotations=it.annotations)
+            elif isinstance(it, SerialForWithStep):
+                real_frame = tir.serial(real_stop, annotations=it.annotations)
+            else:
+                raise TypeError(
+                    f"Invalid for loop, got {it}({type(it)}), expect one of the following: "
+                    "range, T.serial, T.unroll, T.grid, T.parallel, T.vectorized, T.thread_binding")
             with self.with_frame(real_frame) as v:
                 IRBuilder.name('_tmp', v)
                 yield it.start + v * it.step