
Commit d99853b

[Language] Add Correctness and performance check scripts for V2 (#1174)
* fix
* lint fix
* fix
* lint fix
* fix
* upd
1 parent aef0a6b commit d99853b

File tree

9 files changed: +878, -60 lines


maint/gemm_v2/correctness_evaluation.py

Lines changed: 726 additions & 0 deletions
Large diffs are not rendered by default.

maint/gemm_v2/latency.py

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
import tilelang
import tilelang.language as T
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--use_v2", action="store_true")
args = parser.parse_args()

use_v2 = args.use_v2


# @tilelang.jit(target="cuda")
# target can currently be "cuda", "hip", or "cpu";
# if not specified, it is inferred from the input tensors at compile time
@tilelang.jit
def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):

    @T.prim_func
    def matmul_relu_kernel(
            A: T.Tensor((M, K), dtype),
            B: T.Tensor((K, N), dtype),
            C: T.Tensor((M, N), dtype),
    ):
        # Initialize the kernel context
        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
            A_shared = T.alloc_shared((block_M, block_K), dtype)
            B_shared = T.alloc_shared((block_K, block_N), dtype)
            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)

            # Enable rasterization for better L2 cache locality (optional)
            # T.use_swizzle(panel_size=10, enable=True)

            # Clear the local accumulator
            T.clear(C_local)

            for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):
                # Copy a tile of A; T.copy is sugar for a parallelized copy
                T.copy(A[by * block_M, ko * block_K], A_shared)

                # Copy a tile of B
                T.copy(B[ko * block_K, bx * block_N], B_shared)

                # Perform a tile-level GEMM on the shared buffers.
                # Currently this dispatches to cute on NVIDIA GPUs and hip on AMD GPUs.
                if use_v2:
                    T.gemm_v2(A_shared, B_shared, C_local)
                else:
                    T.gemm_v1(A_shared, B_shared, C_local)

            # ReLU
            for i, j in T.Parallel(block_M, block_N):
                C_local[i, j] = T.max(C_local[i, j], 0)

            # Copy the result back to global memory
            T.copy(C_local, C[by * block_M, bx * block_N])

    return matmul_relu_kernel


M = 16384  # M = T.dynamic("m") if you want to use a dynamic shape
N = 16384
K = 16384
block_M = 128
block_N = 128
block_K = 64

# 1. Define the kernel (matmul) and compile/lower it into an executable module
matmul_relu_kernel = matmul(M, N, K, block_M, block_N, block_K)

# 2. Test the kernel in Python with PyTorch data
import torch

# Create random input tensors on the GPU
a = torch.randn(M, K, device="cuda", dtype=torch.float16)
b = torch.randn(K, N, device="cuda", dtype=torch.float16)
c = torch.empty(M, N, device="cuda", dtype=torch.float16)

# Run the kernel
matmul_relu_kernel(a, b, c)

print(c)
# Reference multiplication using PyTorch
ref_c = torch.relu(a @ b)

# Validate correctness
torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2)
print("Kernel output matches PyTorch reference.")

# 3. Retrieve and inspect the generated CUDA source (optional)
# cuda_source = matmul_relu_kernel.get_kernel_source()
# print("Generated CUDA kernel:\n", cuda_source)

# 4. Profile kernel latency
profiler = matmul_relu_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal)

latency = profiler.do_bench()

print(f"Latency: {latency} ms")

src/op/gemm.cc

Lines changed: 0 additions & 2 deletions
@@ -122,8 +122,6 @@ bool GemmNode::AllowWGMMA(int block_size, Target target) const {
 GemmInst GemmNode::GetGemmInst(int block_size, Target target) const {
   bool allow_tcgen5mma = AllowTCGEN5MMA(target);
   bool allow_wgmma = AllowWGMMA(block_size, target);
-  LOG(INFO) << "allow_tcgen5mma: " << allow_tcgen5mma
-            << ", allow_wgmma: " << allow_wgmma;
   if (allow_tcgen5mma) {
     return GemmInst::kTCGEN5MMA;
   } else if (allow_wgmma) {

src/target/codegen_cuda.cc

Lines changed: 38 additions & 18 deletions
@@ -1749,10 +1749,19 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
       "reinterpret_cast<const (ARegType)*>((A_ptr) + (A_offset)), "
       "reinterpret_cast<const (BRegType)*>((B_ptr) + (B_offset)));\n";
   tl::codegen::Replacer replacer;
+  std::string AType = tl::codegen::ptx::DTypeEnumToString(dtype_a_enum);
+  if (AType == "tl::DataType::kFloat32") {
+    AType = "tl::DataType::kTensorFloat32";
+  }
+  std::string BType = tl::codegen::ptx::DTypeEnumToString(dtype_b_enum);
+  if (BType == "tl::DataType::kFloat32") {
+    BType = "tl::DataType::kTensorFloat32";
+  }
+
   replacer.register_rule("(AType)",
-                         tl::codegen::ptx::DTypeEnumToString(dtype_a_enum));
+                         tl::codegen::ptx::DTypeEnumToString(AType));
   replacer.register_rule("(BType)",
-                         tl::codegen::ptx::DTypeEnumToString(dtype_b_enum));
+                         tl::codegen::ptx::DTypeEnumToString(BType));
   replacer.register_rule("(CType)",
                          tl::codegen::ptx::DTypeEnumToString(dtype_c_enum));
   replacer.register_rule("(M)", std::to_string(m));

@@ -1838,16 +1847,12 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
   std::string B_offset = this->PrintExpr(op->args[9]);
   std::string c_ref = this->PrintExpr(op->args[10]);
   std::string c_offset = this->PrintExpr(op->args[11]);
-  bool scale_out = Downcast<Bool>(op->args[12])->value;
+  std::string scale_out = this->PrintExpr(op->args[12]);
   bool scale_in_a = Downcast<Bool>(op->args[13])->value;
   bool scale_in_b = Downcast<Bool>(op->args[14])->value;

   const bool a_is_shared = true;
   this->PrintIndent();
-  std::string asm_code = PrintWGMMAAssembly(
-      shape, a_is_k_major, b_is_k_major, A_dtype, B_dtype, C_dtype, a_desc,
-      A_offset, b_desc, B_offset, c_ref, c_offset, scale_out, scale_in_a,
-      scale_in_b, a_is_shared, "", "", "", false);
   auto [m, n, k] = tl::codegen::ptx::ParseMMAShape(shape);
   need_wgmma_instruction_h_ = true;
   std::string wgmma_asm_code =

@@ -1856,10 +1861,18 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
       "uint64_t((desc_b) + (B_offset)), ((uint32_t*)((C))), (scale_out));\n";
   // replace patterns
   tl::codegen::Replacer replacer;
-  replacer.register_rule("(AType)",
-                         tl::codegen::ptx::DTypeEnumToString(A_dtype));
-  replacer.register_rule("(BType)",
-                         tl::codegen::ptx::DTypeEnumToString(B_dtype));
+
+  std::string AType = tl::codegen::ptx::DTypeEnumToString(A_dtype);
+  if (AType == "tl::DataType::kFloat32") {
+    AType = "tl::DataType::kTensorFloat32";
+  }
+  std::string BType = tl::codegen::ptx::DTypeEnumToString(B_dtype);
+  if (BType == "tl::DataType::kFloat32") {
+    BType = "tl::DataType::kTensorFloat32";
+  }
+
+  replacer.register_rule("(AType)", AType);
+  replacer.register_rule("(BType)", BType);
   replacer.register_rule("(CType)",
                          tl::codegen::ptx::DTypeEnumToString(C_dtype));
   replacer.register_rule("(M)", std::to_string(m));

@@ -1874,7 +1887,7 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
   replacer.register_rule("(desc_b)", b_desc);
   replacer.register_rule("(B_offset)", B_offset);
   replacer.register_rule("(C)", c_ref + " + " + c_offset);
-  replacer.register_rule("(scale_out)", scale_out ? "true" : "false");
+  replacer.register_rule("(scale_out)", scale_out);
   wgmma_asm_code = replacer.rewrite(wgmma_asm_code);
   this->stream << wgmma_asm_code;
 } else if (op->op.same_as(tl::ptx_wgmma_rs())) {

@@ -1904,7 +1917,7 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
   std::string B_offset = this->PrintExpr(op->args[8]);
   std::string c_ref = this->PrintExpr(op->args[9]);
   std::string c_offset = this->PrintExpr(op->args[10]);
-  bool scale_out = Downcast<Bool>(op->args[11])->value;
+  std::string scale_out = this->PrintExpr(op->args[11]);
   bool scale_in_a = Downcast<Bool>(op->args[12])->value;
   bool scale_in_b = Downcast<Bool>(op->args[13])->value;

@@ -1924,10 +1937,17 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
       "(scale_out));\n";

   tl::codegen::Replacer replacer;
-  replacer.register_rule("(AType)",
-                         tl::codegen::ptx::DTypeEnumToString(dtype_a_enum));
-  replacer.register_rule("(BType)",
-                         tl::codegen::ptx::DTypeEnumToString(dtype_b_enum));
+  std::string AType = tl::codegen::ptx::DTypeEnumToString(A_dtype);
+  if (AType == "tl::DataType::kFloat32") {
+    AType = "tl::DataType::kTensorFloat32";
+  }
+  std::string BType = tl::codegen::ptx::DTypeEnumToString(B_dtype);
+  if (BType == "tl::DataType::kFloat32") {
+    BType = "tl::DataType::kTensorFloat32";
+  }
+
+  replacer.register_rule("(AType)", AType);
+  replacer.register_rule("(BType)", BType);
   replacer.register_rule("(CType)",
                          tl::codegen::ptx::DTypeEnumToString(dtype_c_enum));
   replacer.register_rule("(M)", std::to_string(m));

@@ -1943,7 +1963,7 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
   replacer.register_rule("(B_offset)", B_offset);
   replacer.register_rule("(C_ptr)", c_ref);
   replacer.register_rule("(C_offset)", c_offset);
-  replacer.register_rule("(scale_out)", scale_out ? "true" : "false");
+  replacer.register_rule("(scale_out)", scale_out);
   wgmma_call = replacer.rewrite(wgmma_call);
   this->stream << wgmma_call;
 } else if (op->op.same_as(tl::ptx_tcgen05_mma_ss())) {

src/tl_templates/cuda/instruction/mma.h

Lines changed: 9 additions & 0 deletions
@@ -127,6 +127,15 @@ TL_DEFINE_MMA_DISPATCHER(kFloat8_e5m2, kFloat8_e5m2, kFloat16, 16, 8, 32, false,
 TL_DEFINE_MMA_DISPATCHER(kFloat8_e5m2, kFloat8_e5m2, kFloat32, 16, 8, 32, false,
                          true, false, cute::SM89_16x8x32_F32E5M2E5M2F32_TN)

+// TF32 inputs (FP32 math on Tensor Cores)
+// Support both k=4 and k=8 variants on SM80
+TL_DEFINE_MMA_DISPATCHER(kTensorFloat32, kTensorFloat32, kFloat32, 16, 8, 4,
+                         false, true, false,
+                         cute::SM80_16x8x4_F32TF32TF32F32_TN)
+TL_DEFINE_MMA_DISPATCHER(kTensorFloat32, kTensorFloat32, kFloat32, 16, 8, 8,
+                         false, true, false,
+                         cute::SM80_16x8x8_F32TF32TF32F32_TN)
+
 #undef TL_DEFINE_MMA_DISPATCHER

 } // namespace detail
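
With these dispatchers registered, float32 operands (remapped to kTensorFloat32 by the codegen_cuda.cc change above) can reach SM80 Tensor-Core MMA. A minimal Python sketch of a float32 tile GEMM that would exercise this path, assuming the same tilelang API as latency.py above; the shapes, block sizes, and staging here are illustrative, not from this commit:

import tilelang
import tilelang.language as T


@tilelang.jit
def matmul_fp32(M, N, K, block_M, block_N, block_K):

    @T.prim_func
    def kernel(
            A: T.Tensor((M, K), "float32"),
            B: T.Tensor((K, N), "float32"),
            C: T.Tensor((M, N), "float32"),
    ):
        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
            A_shared = T.alloc_shared((block_M, block_K), "float32")
            B_shared = T.alloc_shared((block_K, block_N), "float32")
            C_local = T.alloc_fragment((block_M, block_N), "float32")
            T.clear(C_local)
            for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=2):
                T.copy(A[by * block_M, ko * block_K], A_shared)
                T.copy(B[ko * block_K, bx * block_N], B_shared)
                # float32 inputs are rewritten to TF32 during codegen, so this
                # GEMM should dispatch to the SM80 TF32 MMA entries added above
                T.gemm_v2(A_shared, B_shared, C_local)
            T.copy(C_local, C[by * block_M, bx * block_N])

    return kernel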

testing/python/tilelibrary/test_tilelang_tilelibrary_gemm.py

Lines changed: 1 addition & 0 deletions
@@ -397,6 +397,7 @@ def test_gemm_sr():
     run_gemm_sr(128, 128, 128, True, True, "float8_e5m2", "float8_e5m2", "float32", 128, 128, 32, 2)

     # float32 tests
+    # TODO(lei): fix in future
     run_gemm_sr(128, 128, 128, False, False, "float", "float", "float32", 128, 128, 32, 2)
     run_gemm_sr(128, 128, 128, False, True, "float", "float", "float32", 128, 128, 32, 2)
     run_gemm_sr(128, 128, 128, True, False, "float", "float", "float32", 128, 128, 32, 2)

testing/python/transform/test_tilelang_transform_inject_fence_proxy.py

Lines changed: 0 additions & 38 deletions
@@ -186,43 +186,5 @@ def visit(node):
     assert order.index("tl.fence_proxy_async") < order.index("tl.ptx_wgmma_ss")


-def test_wgmma_after_descriptor():
-
-    @T.prim_func
-    def before():
-        with T.Kernel(1):
-            desc_a = T.decl_buffer((1,), "uint64", scope="local.descriptor.wgmma")
-            desc_b = T.decl_buffer((1,), "uint64", scope="local.descriptor.wgmma")
-            C_local = T.decl_buffer((32,), "float16", scope="local")
-            T.initialize_wgmma_descriptor(desc_a, T.uint64(0), 2, 1, 32)
-            T.initialize_wgmma_descriptor(desc_b, T.uint64(0), 2, 1, 32)
-            T.warpgroup_arrive()
-            T.ptx_wgmma_ss("float16", "m64n64k16", T.bool(True), T.bool(True), "fp16", "fp16",
-                           "fp16", desc_a.data, T.int32(0), desc_b.data, T.int32(0), C_local.data,
-                           T.int32(0), T.bool(True), 1, 1)
-
-    mod = tvm.IRModule.from_expr(before.with_attr("global_symbol", "main"))
-    mod = tvm.tir.transform.BindTarget(auto_target)(mod)
-    mod = tl.transform.InjectFenceProxy()(mod)
-
-    fence_count = 0
-    order = []
-
-    def visit(node):
-        nonlocal fence_count
-        if isinstance(node, tir.Evaluate):
-            call = node.value
-            if isinstance(call, tir.Call):
-                name = getattr(call.op, "name", "")
-                order.append(name)
-                if name == "tl.fence_proxy_async":
-                    fence_count += 1
-
-    tir.stmt_functor.post_order_visit(mod["main"].body, visit)
-    assert fence_count >= 1
-    assert "tl.warpgroup_arrive" in order
-    assert order.index("tl.fence_proxy_async") < order.index("tl.warpgroup_arrive")
-
-
 if __name__ == "__main__":
     tilelang.testing.main()

tilelang/language/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@
     alloc_tcgen05_instr_desc,  # noqa: F401
 )
 from .copy import copy, c2d_im2col  # noqa: F401
-from .gemm import GemmWarpPolicy, gemm, gemm_v2  # noqa: F401
+from .gemm import GemmWarpPolicy, gemm, gemm_v1, gemm_v2  # noqa: F401
 from .experimental.gemm_sp import gemm_sp  # noqa: F401
 from .fill import fill, clear  # noqa: F401
 from .reduce import (

tilelang/language/gemm.py

Lines changed: 4 additions & 1 deletion
@@ -7,7 +7,7 @@
 from tilelang.utils.language import get_buffer_region_from_load


-def gemm(
+def gemm_v1(
     A: tir.Buffer | tir.Var,
     B: tir.Buffer | tir.Var,
     C: tir.Buffer | tir.Var,

@@ -432,3 +432,6 @@ def retrieve_offset(object: tir.Buffer | tir.BufferRegion) -> tir.PrimExpr:
         C_coords[0],
         C_coords[1],
     )
+
+
+gemm = gemm_v1
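
Since gemm is rebound to gemm_v1, existing calls to T.gemm keep their old behavior, and both versions remain addressable by name (latency.py above selects between them via --use_v2). A minimal check of the aliasing, assuming only the exports added in this commit:

import tilelang.language as T

# gemm is re-exported as an alias of the renamed gemm_v1 ...
assert T.gemm is T.gemm_v1
# ... while gemm_v2 is a separate entry point
assert T.gemm is not T.gemm_v2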
