import numpy as np


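+# Maps element (i, j) of a 16x16 tile to the (thread_id, local_id) fragment layout
+# used by ldmatrix / mma.sync: the tile is distributed over the 32 threads of a
+# warp, with 8 elements held per thread.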
+def shared_16x16_to_ldmatrix_32x8_layout(i, j):
+    thread_id = 4 * (i % 8) + (j % 8) // 2
+    return thread_id, 4 * (j // 8) + (i // 8) * 2 + j % 2
+
+
+@tvm._ffi.register_func("tir.index_map.shared_16x16_to_ldmatrix_32x8_layout")
+def index_map_shared_16x16_to_ldmatrix_32x8_layout(i, j):
+    thread_id, local_id = shared_16x16_to_ldmatrix_32x8_layout(i, j)
+    return tvm.runtime.convert([thread_id, local_id])
+
+
@T.prim_func
def ldmatrix_a_desc(a: T.handle, c: T.handle) -> None:
    A_shared = T.match_buffer(a, (16, 16), "float16", align=128, offset_factor=16, scope="shared")
@@ -21,10 +32,10 @@ def ldmatrix_a_desc(a: T.handle, c: T.handle) -> None:
            with T.block("A_shared_warp"):
                v0, v1 = T.axis.remap("SS", [ax0, ax1])
                T.reads(A_shared[v0, v1])
-                T.writes(A_warp[v0 % 8 * 4 + v1 % 8 // 2, v1 // 8 * 4 + v0 // 8 * 2 + v1 % 2])
-                A_warp[v0 % 8 * 4 + v1 % 8 // 2, v1 // 8 * 4 + v0 // 8 * 2 + v1 % 2] = A_shared[
-                    v0, v1
-                ]
+
+                thread_id, local_id = shared_16x16_to_ldmatrix_32x8_layout(v0, v1)
+                T.writes(A_warp[thread_id, local_id])
+                A_warp[thread_id, local_id] = A_shared[v0, v1]


@T.prim_func
@@ -74,10 +85,9 @@ def ldmatrix_b_desc(a: T.handle, c: T.handle) -> None:
            with T.block("B_shared_warp"):
                v0, v1 = T.axis.remap("SS", [ax0, ax1])
                T.reads(B_shared[v0, v1])
-                T.writes(B_warp[v0 % 8 * 4 + v1 % 8 // 2, v1 // 8 * 4 + v0 // 8 * 2 + v1 % 2])
-                B_warp[v0 % 8 * 4 + v1 % 8 // 2, v1 // 8 * 4 + v0 // 8 * 2 + v1 % 2] = B_shared[
-                    v0, v1
-                ]
+                thread_id, local_id = shared_16x16_to_ldmatrix_32x8_layout(v0, v1)
+                T.writes(B_warp[thread_id, local_id])
+                B_warp[thread_id, local_id] = B_shared[v0, v1]


@T.prim_func
@@ -126,15 +136,19 @@ def mma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
        for i, j, k in T.grid(16, 16, 16):
            with T.block("C"):
                i, j, k = T.axis.remap("SSR", [i, j, k])
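+                # A is indexed by (i, k), B by (k, j), C by (i, j); each index pair
+                # maps to its own (thread_id, local_id) in the warp-level fragment.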
+                thread_id_C, local_id_C = shared_16x16_to_ldmatrix_32x8_layout(i, j)
+                thread_id_A, local_id_A = shared_16x16_to_ldmatrix_32x8_layout(i, k)
+                thread_id_B, local_id_B = shared_16x16_to_ldmatrix_32x8_layout(k, j)
+
                T.reads(
-                    C[i % 8 * 4 + j % 8 // 2, j % 16 // 8 * 4 + i % 16 // 8 * 2 + j % 2],
-                    A[i % 8 * 4 + k % 8 // 2, k % 16 // 8 * 4 + i % 16 // 8 * 2 + k % 2],
-                    B[k % 8 * 4 + j % 8 // 2, j % 16 // 8 * 4 + k % 16 // 8 * 2 + j % 2],
+                    C[thread_id_C, local_id_C],
+                    A[thread_id_A, local_id_A],
+                    B[thread_id_B, local_id_B],
+                )
+                T.writes(C[thread_id_C, local_id_C])
+                C[thread_id_C, local_id_C] += (
+                    A[thread_id_A, local_id_A] * B[thread_id_B, local_id_B]
                )
-                T.writes(C[i % 8 * 4 + j % 8 // 2, j % 16 // 8 * 4 + i % 16 // 8 * 2 + j % 2])
-                C[i % 8 * 4 + j % 8 // 2, j % 16 // 8 * 4 + i % 16 // 8 * 2 + j % 8 % 2] = C[
-                    i % 8 * 4 + j % 8 // 2, j % 16 // 8 * 4 + i % 16 // 8 * 2 + j % 8 % 2
-                ] + A[i % 8 * 4 + k % 8 // 2, k % 16 // 8 * 4 + i % 16 // 8 * 2 + k % 8 % 2] * B[k % 8 * 4 + j % 8 // 2, j % 16 // 8 * 4 + k % 16 // 8 * 2 + j % 8 % 2]


@T.prim_func
@@ -196,14 +210,13 @@ def mma_store_desc(a: T.handle, c: T.handle) -> None:
    with T.block("root"):
        T.reads(C_warp[0:32, 0:8])
        T.writes(C[0:16, 0:16])
-        for ax1_0, i0, i1 in T.grid(2, 32, 4):
+        for i0, i1 in T.grid(16, 16):
            with T.block("C_warp"):
-                v0 = T.axis.spatial(16, i1 // 2 * 8 + i0 // 4)
-                v1 = T.axis.spatial(16, ax1_0 * 8 + i0 % 4 * 2 + i1 % 2)
-
-                T.reads(C_warp[v0 % 8 * 4 + v1 % 8 // 2, v1 // 8 * 4 + v0 // 8 * 2 + v1 % 2])
+                v0, v1 = T.axis.remap("SS", [i0, i1])
+                thread_id, local_id = shared_16x16_to_ldmatrix_32x8_layout(v0, v1)
+                T.reads(C_warp[thread_id, local_id])
                T.writes(C[v0, v1])
-                C[v0, v1] = C_warp[v0 % 8 * 4 + v1 % 8 // 2, v1 // 8 * 4 + v0 // 8 * 2 + v1 % 2]
+                C[v0, v1] = C_warp[thread_id, local_id]


@T.prim_func
@@ -236,21 +249,13 @@ def mma_fill_desc(a: T.handle) -> None:
    with T.block("root"):
        T.reads()
        T.writes(C_warp[0:32, 0:8])
-        for i0, i1 in T.grid(32, 8):
+        for i0, i1 in T.grid(16, 16):
            with T.block("C_warp"):
-                i_init = T.axis.spatial(16, i1 // 4 * 8 + i0 // 4)
-                j_init = T.axis.spatial(16, (i0 % 4) * 4 + i1 % 4)
+                i_init, j_init = T.axis.remap("SS", [i0, i1])
+                thread_id, local_id = shared_16x16_to_ldmatrix_32x8_layout(i_init, j_init)
                T.reads()
-                T.writes(
-                    C_warp[
-                        i_init % 8 * 4 + j_init % 8 // 2,
-                        j_init % 16 // 8 * 4 + i_init % 16 // 8 * 2 + j_init % 2,
-                    ]
-                )
-                C_warp[
-                    i_init % 8 * 4 + j_init % 8 // 2,
-                    j_init % 16 // 8 * 4 + i_init % 16 // 8 * 2 + j_init % 8 % 2,
-                ] = T.float16(0)
+                T.writes(C_warp[thread_id, local_id])
+                C_warp[thread_id, local_id] = T.float16(0)


@T.prim_func
@@ -276,6 +281,7 @@ def mma_fill_impl(a: T.handle) -> None:
M = 4096
K = 4096

+
def matmul_fp16(n, m, k):
    a = te.placeholder((n, k), name="A", dtype="float16")
    b = te.placeholder((k, m), name="B", dtype="float16")
@@ -373,8 +379,6 @@ def fetch_to_shared(block, idx, ndim):
    A_sh = fetch_to_shared(block_outer, 0, 2)
    B_sh = fetch_to_shared(block_outer, 1, 2)

-    loop = sch.get_loops(block_outer)[-1]
-
    A_warp = sch.cache_read(block_outer, 0, "warp")
    B_warp = sch.cache_read(block_outer, 1, "warp")

@@ -389,7 +393,8 @@ def fetch_to_shared(block, idx, ndim):
    jo, ji = sch.split(jj, factors=[None, 16])
    sch.reorder(io, jo, ii, ji)

-    block_init_c = sch.decompose_reduction(block_outer, sch.get_loops(block_outer)[3])
+    sch.decompose_reduction(block_outer, sch.get_loops(block_outer)[3])
+    block_init_c = sch.get_block("C_init")

    def tile_wmma_fragment(block_read, height):
        i, j = sch.get_loops(block_read)[-2:]
@@ -398,47 +403,25 @@ def tile_wmma_fragment(block_read, height):
        sch.reorder(i0, j0, i1, j1)
        return i1

-    def shared_16x16_to_ldmatrix_32x8_layout(i, j):
-        i_0 = i // 16
-        j_0 = j // 16
-
-        i = i % 16
-        j = j % 16
-
-        thread_id = 4 * (i % 8) + (j % 8) // 2
-        return i_0, j_0, thread_id, 4 * (j // 8) + (i // 8) * 2 + (j % 8) % 2
-
    loop_a = tile_wmma_fragment(A_warp, 16)
    loop_b = tile_wmma_fragment(B_warp, 16)

-    sch.transform_layout(A_warp, 0, "write", index_map=shared_16x16_to_ldmatrix_32x8_layout)
-    sch.transform_layout(B_warp, 0, "write", index_map=shared_16x16_to_ldmatrix_32x8_layout)
-    sch.transform_layout(C_warp, 0, "read", index_map=shared_16x16_to_ldmatrix_32x8_layout)
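+    # The warp-scoped caches can span multiple 16x16 tiles, so the index map first
+    # derives the 16x16 tile coordinates and then applies the within-tile ldmatrix
+    # layout defined at the top of the file.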
+    def index_map(i, j):
+        return (
+            i // 16,
+            j // 16,
+            *shared_16x16_to_ldmatrix_32x8_layout(i % 16, j % 16),
+        )
+
+    sch.transform_layout(A_warp, 0, "write", index_map)
+    sch.transform_layout(B_warp, 0, "write", index_map)
+    sch.transform_layout(C_warp, 0, "read", index_map)

    sch.tensorize(loop_a, "mma.ldmatrix_a")
    sch.tensorize(loop_b, "mma.ldmatrix_b")
-
-    mma_loop = sch.get_loops(block_inner)[-3]
-    sch.tensorize(mma_loop, "mma_sync")
-
-    block_init_c = sch.get_block("C_init")
-    init_loop1, init_loop2 = sch.get_loops(block_init_c)[-2:]
-    f_0, f_1 = sch.split(init_loop1, factors=[None, 8])
-    f_2, f_3 = sch.split(init_loop2, factors=[None, 4])
-    sch.reorder(f_1, f_2, f_0, f_3)
-    fused_1 = sch.fuse(f_1, f_2)
-    fused_2 = sch.fuse(f_0, f_3)
-    sch.tensorize(fused_1, "mma_fill")
-
-    warp_loop1, warp_loop2 = sch.get_loops(C_warp)[-2:]
-    f_0, f_1 = sch.split(warp_loop1, factors=[None, 8])
-    outer, f_2, f_3 = sch.split(warp_loop2, factors=[2, 4, 2])
-    sch.reorder(outer, f_1, f_2, f_0, f_3)
-    fused_1 = sch.fuse(f_1, f_2)
-    fused_2 = sch.fuse(f_0, f_3)
-    sch.tensorize(outer, "mma_store")
-    # print(sch.mod.script())
-    # return
+    sch.tensorize(sch.get_loops(block_inner)[-3], "mma_sync")
+    sch.tensorize(sch.get_loops(block_init_c)[-2], "mma_fill")
+    sch.tensorize(sch.get_loops(C_warp)[-2], "mma_store")


ir_module = tvm.IRModule({"main": workload})