import tilelang.testing
import tilelang
import torch


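# GEMM kernel factory: returns a tile-language prim_func specialized for the given
# problem size (M, N, K), tile sizes (block_M, block_N, block_K), layouts, and dtypes.
# The @tilelang.jit decorator wraps it for JIT compilation.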
@tilelang.jit(
    out_idx=-1,  # create the output tensor (the last argument, C) at runtime
    verbose=True,
)
def matmul_kernel_jit(
    M,
    N,
    K,
    block_M,
    block_N,
    block_K,
    trans_A=False,
    trans_B=True,
    in_dtype='float16',
    out_dtype='float32',
    accum_dtype='float32',
    num_stages=2,
    threads=128,
):
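    # Global tensor shapes and shared-memory tile shapes depend on whether
    # A and B are stored transposed.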
    A_shape = (K, M) if trans_A else (M, K)
    B_shape = (N, K) if trans_B else (K, N)
    A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K)
    B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N)

    import tilelang.language as T

    @T.prim_func
    def main(
            A: T.Tensor(A_shape, in_dtype),
            B: T.Tensor(B_shape, in_dtype),
            C: T.Tensor((M, N), out_dtype),
    ):
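        # Launch a 2D grid of thread blocks: one block per (block_M, block_N) tile of C.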
        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by):
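            # Shared-memory staging buffers for the A/B tiles and an accumulator
            # fragment for this block's tile of C.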
            A_shared = T.alloc_shared(A_shared_shape, in_dtype)
            B_shared = T.alloc_shared(B_shared_shape, in_dtype)
            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
            T.clear(C_local)
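            # Software-pipelined reduction over K: each iteration stages one tile of A
            # and one tile of B into shared memory, then accumulates into C_local.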
            for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
                if trans_A:
                    T.copy(A[k * block_K, by * block_M], A_shared)
                else:
                    T.copy(A[by * block_M, k * block_K], A_shared)
                if trans_B:
                    T.copy(B[bx * block_N, k * block_K], B_shared)
                else:
                    T.copy(B[k * block_K, bx * block_N], B_shared)
                T.gemm(A_shared, B_shared, C_local, trans_A, trans_B)
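            # Epilogue: write the accumulated tile back to global memory.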
            T.copy(C_local, C[by * block_M, bx * block_N])

    return main


def test_par_compile():
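    # (M, N, K, block_M, block_N, block_K) configurations to compile.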
    configs = [
        (1024, 1024, 1024, 128, 128, 32),
        (2048, 2048, 2048, 256, 256, 64),
        (4096, 4096, 4096, 64, 64, 128),
    ]
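    # Compile one kernel per configuration; par_compile builds them in parallel.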
    kernels = matmul_kernel_jit.par_compile(configs)
    for (M, N, K, _, _, _), kernel in zip(configs, kernels):
        A = torch.randn(M, K, dtype=torch.float16).cuda()
        B = torch.randn(N, K, dtype=torch.float16).cuda()
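        # trans_B defaults to True, so B is stored as (N, K) and the reference is A @ B.T.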
        ref = (A @ B.T).float()
        C = kernel(A, B)
        tilelang.testing.torch_assert_close(C, ref, rtol=1e-2, atol=1e-2)


if __name__ == "__main__":
    tilelang.testing.main()