stash

junrushao · junrushao · commit 888fc75712c3 · 2022-10-25T10:21:37.000-07:00
diff --git a/main.py b/main.py
@@ -0,0 +1,165 @@
+import tvm
+from tvm import tir
+from tvm.script import tir as T
+
+
+@T.prim_func
+def func(  # TIR workload; the commented-out attrs below name it conv2d_nchw_winograd_without_weight_transform.cuda
+    p0: T.Buffer[(1, 64, 56, 56), "float32"],  # read only by block "data_pad"
+    p1: T.Buffer[(6, 6, 64, 64), "float32"],  # read only by block "bgemm"
+    p2: T.Buffer[(1, 64, 1, 1), "float32"],  # not referenced anywhere in this body
+    output: T.Buffer[(1, 64, 56, 56), "float32"],  # written only by block "output"
+) -> None:
+    # function attr dict
+    T.func_attr({"global_symbol": "main", "tir.noalias": True})
+    # body: intermediate buffers first, then one loop nest per block
+    # with T.block("root")
+    data_pad = T.alloc_buffer([1, 64, 58, 58], dtype="float32")  # p0 with a one-element zero border on both spatial axes
+    d = T.alloc_buffer([64, 196, 6, 6], dtype="float32")  # 6x6 windows gathered from data_pad, indexed [c, p, eps, nu]
+    B = T.alloc_buffer([6, 6], dtype="float32")  # constant 6x6 coefficient table
+    data_pack = T.alloc_buffer([6, 6, 64, 196], dtype="float32")  # d contracted with B along both window axes
+    bgemm = T.alloc_buffer([6, 6, 64, 196], dtype="float32")  # data_pack contracted with p1 over ci
+    A = T.alloc_buffer([6, 4], dtype="float32")  # constant 6x4 coefficient table
+    inverse = T.alloc_buffer([64, 196, 4, 4], dtype="float32")  # bgemm contracted with A along both leading axes
+    for i0, i1, i2, i3 in T.grid(1, 64, 58, 58):
+        with T.block("data_pad"):  # copy p0 shifted by one; indices outside [1, 57) on either spatial axis get 0
+            i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3])
+            T.reads(p0[i0_1, i1_1, i2_1 - 1, i3_1 - 1])
+            T.writes(data_pad[i0_1, i1_1, i2_1, i3_1])
+            # fmt: off
+            data_pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i2_1 and i2_1 < 57 and 1 <= i3_1 and i3_1 < 57, p0[i0_1, i1_1, i2_1 - 1, i3_1 - 1], T.float32(0), dtype="float32")
+            # fmt: on
+    for i0, i1, i2, i3 in T.grid(64, 196, 6, 6):  # 196 = 14 * 14 windows of 6x6, stepping 4 along each spatial axis
+        with T.block("d"):  # window p starts at row p % 196 // 14 * 4 and column p % 14 * 4 of data_pad
+            c, p, eps, nu = T.axis.remap("SSSS", [i0, i1, i2, i3])
+            T.reads(data_pad[p // 196, c, p % 196 // 14 * 4 + eps, p % 14 * 4 + nu])
+            T.writes(d[c, p, eps, nu])
+            d[c, p, eps, nu] = data_pad[p // 196, c, p % 196 // 14 * 4 + eps, p % 14 * 4 + nu]
+    for i0, i1 in T.grid(6, 6):
+        with T.block("B"):  # each (i, j) selects one literal from the nested Select chain
+            i, j = T.axis.remap("SS", [i0, i1])
+            T.reads()
+            T.writes(B[i, j])
+            # T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"})
+            # fmt: off
+            B[i, j] = T.Select(i % 6 == 5 and j % 6 == 5, T.float32(1), T.Select(i % 6 == 5 and j % 6 == 4, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 3, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 2, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 1, T.float32(0), T.Select(i % 6 == 5 and j % 6 == 0, T.float32(0), T.Select(i % 6 == 4 and j % 6 == 5, T.float32(1.5), T.Select(i % 6 == 4 and j % 6 == 4, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 3, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 2, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 1, T.float32(1), T.Select(i % 6 == 4 and j % 6 == 0, T.float32(1), T.Select(i % 6 == 3 and j % 6 == 5, T.float32(-2), T.Select(i % 6 == 3 and j % 6 == 4, T.float32(-0.5), T.Select(i % 6 == 3 and j % 6 == 3, T.float32(2), T.Select(i % 6 == 3 and j % 6 == 2, T.float32(2.5), T.Select(i % 6 == 3 and j % 6 == 1, T.float32(0.5), T.Select(i % 6 == 3 and j % 6 == 0, T.float32(1.5), T.Select(i % 6 == 2 and j % 6 == 5, T.float32(-1.5), T.Select(i % 6 == 2 and j % 6 == 4, T.float32(-1), T.Select(i % 6 == 2 and j % 6 == 3, T.float32(-1), T.Select(i % 6 == 2 and j % 6 == 2, T.float32(0.5), T.Select(i % 6 == 2 and j % 6 == 1, T.float32(-2.5), T.Select(i % 6 == 2 and j % 6 == 0, T.float32(-2), T.Select(i % 6 == 1 and j % 6 == 5, T.float32(1), T.Select(i % 6 == 1 and j % 6 == 4, T.float32(0.5), T.Select(i % 6 == 1 and j % 6 == 3, T.float32(-2), T.Select(i % 6 == 1 and j % 6 == 2, T.float32(-1), T.Select(i % 6 == 1 and j % 6 == 1, T.float32(1), T.Select(i % 6 == 1 and j % 6 == 0, T.float32(-1.5), T.Select(i % 6 == 0 and j % 6 == 5, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 4, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 3, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 2, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 1, T.float32(0), T.Select(i % 6 == 0 and j % 6 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))))))))))))))
+            # fmt: on
+    for i0, i1, i2, i3, i4, i5 in T.grid(6, 6, 64, 196, 6, 6):
+        with T.block("data_pack"):  # sum over (r_a, r_a_1) of d * B[r_a, eps] * B[r_a_1, nu]
+            eps, nu, ci, p, r_a, r_a_1 = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
+            T.reads(  # the B region is the bounding box of the two elements B[r_a, eps] and B[r_a_1, nu]
+                d[ci, p, r_a, r_a_1],
+                B[T.min(r_a, r_a_1) : T.max(r_a, r_a_1) + 1, T.min(eps, nu) : T.max(eps, nu) + 1],
+            )
+            T.writes(data_pack[eps, nu, ci, p])
+            # T.block_attr({"schedule_rule":"meta_schedule.winograd_data_pack.nchw.cuda"})
+            with T.init():
+                data_pack[eps, nu, ci, p] = T.float32(0)  # accumulator starts at zero
+            data_pack[eps, nu, ci, p] = (
+                data_pack[eps, nu, ci, p] + d[ci, p, r_a, r_a_1] * B[r_a, eps] * B[r_a_1, nu]
+            )
+    for i0, i1, i2, i3, i4 in T.grid(6, 6, 64, 196, 64):
+        with T.block("bgemm"):  # for each (eps, nu): sum over ci of p1[eps, nu, ci, co] * data_pack[eps, nu, ci, p]
+            eps, nu, co, p, ci = T.axis.remap("SSSSR", [i0, i1, i2, i3, i4])
+            T.reads(p1[eps, nu, ci, co], data_pack[eps, nu, ci, p])
+            T.writes(bgemm[eps, nu, co, p])
+            with T.init():
+                bgemm[eps, nu, co, p] = T.float32(0)  # accumulator starts at zero
+            bgemm[eps, nu, co, p] = (
+                bgemm[eps, nu, co, p] + p1[eps, nu, ci, co] * data_pack[eps, nu, ci, p]
+            )
+    for i0, i1 in T.grid(6, 4):
+        with T.block("A"):  # each (i, j) selects one literal from the nested Select chain
+            i, j = T.axis.remap("SS", [i0, i1])
+            T.reads()
+            T.writes(A[i, j])
+            # T.block_attr({"const_matrix":True, "schedule_rule":"meta_schedule.compute_inline"})
+            # fmt: off
+            A[i, j] = T.Select(i % 6 == 5 and j % 4 == 3, T.float32(1), T.Select(i % 6 == 5 and j % 4 == 2, T.float32(0), T.Select(i % 6 == 5 and j % 4 == 1, T.float32(0), T.Select(i % 6 == 5 and j % 4 == 0, T.float32(0), T.Select(i % 6 == 4 and j % 4 == 3, T.float32(-8), T.Select(i % 6 == 4 and j % 4 == 2, T.float32(4), T.Select(i % 6 == 4 and j % 4 == 1, T.float32(-2), T.Select(i % 6 == 4 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 3 and j % 4 == 3, T.float32(0.125), T.Select(i % 6 == 3 and j % 4 == 2, T.float32(0.25), T.Select(i % 6 == 3 and j % 4 == 1, T.float32(0.5), T.Select(i % 6 == 3 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 3, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 2, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 1, T.float32(1), T.Select(i % 6 == 2 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 1 and j % 4 == 3, T.float32(-1), T.Select(i % 6 == 1 and j % 4 == 2, T.float32(1), T.Select(i % 6 == 1 and j % 4 == 1, T.float32(-1), T.Select(i % 6 == 1 and j % 4 == 0, T.float32(1), T.Select(i % 6 == 0 and j % 4 == 3, T.float32(0), T.Select(i % 6 == 0 and j % 4 == 2, T.float32(0), T.Select(i % 6 == 0 and j % 4 == 1, T.float32(0), T.Select(i % 6 == 0 and j % 4 == 0, T.float32(1), T.float32(0)))))))))))))))))))))))))
+            # fmt: on
+    for i0, i1, i2, i3, i4, i5 in T.grid(64, 196, 4, 4, 6, 6):
+        with T.block("inverse"):  # sum over (r_a_2, r_a_3) of bgemm * A[r_a_2, vh] * A[r_a_3, vw]
+            co, p, vh, vw, r_a_2, r_a_3 = T.axis.remap("SSSSRR", [i0, i1, i2, i3, i4, i5])
+            T.reads(  # the A region is the bounding box of the two elements A[r_a_2, vh] and A[r_a_3, vw]
+                bgemm[r_a_2, r_a_3, co, p],
+                A[T.min(r_a_2, r_a_3) : T.max(r_a_2, r_a_3) + 1, T.min(vh, vw) : T.max(vh, vw) + 1],
+            )
+            T.writes(inverse[co, p, vh, vw])
+            # T.block_attr({"schedule_rule":"meta_schedule.winograd_inverse.nchw.cuda"})
+            with T.init():
+                inverse[co, p, vh, vw] = T.float32(0)  # accumulator starts at zero
+            inverse[co, p, vh, vw] = (
+                inverse[co, p, vh, vw] + bgemm[r_a_2, r_a_3, co, p] * A[r_a_2, vh] * A[r_a_3, vw]
+            )
+    for i0, i1, i2, i3 in T.grid(1, 64, 56, 56):
+        with T.block("output"):  # copy element (h % 4, w % 4) of tile h // 4 * 14 + w // 4 to output position (h, w)
+            n, co, h, w = T.axis.remap("SSSS", [i0, i1, i2, i3])
+            T.reads(inverse[co, n * 196 + h // 4 * 14 + w // 4, h % 4, w % 4])
+            T.writes(output[n, co, h, w])
+            # T.block_attr({"schedule_rule":"meta_schedule.winograd_output.nchw.cuda", "winograd_tile_size":4, "workload":["conv2d_nchw_winograd_without_weight_transform.cuda", ["TENSOR", [1, 64, 56, 56], "float32"], ["TENSOR", [6, 6, 64, 64], "float32"], [1, 1], [1, 1, 1, 1], [1, 1], "float32"]})
+            output[n, co, h, w] = inverse[co, n * 196 + h // 4 * 14 + w // 4, h % 4, w % 4]
+
+
+def schedule_data_pack(sch: tir.Schedule, data_pack: tir.schedule.BlockRV):
+    """Tile the loop nest of ``data_pack`` and return the inner tile loop.
+
+    Loops 2 and 3 of the block are fused, and the fused loop is split so that
+    the inner part has extent 128 while the outer extent is left as ``None``
+    for the schedule to infer.  The nest is then reordered to: outer tile,
+    inner tile, loops 0, 1, 4 and 5.
+
+    Parameters
+    ----------
+    sch : tir.Schedule
+        Schedule that owns ``data_pack``; it is modified in place.
+    data_pack : tir.schedule.BlockRV
+        Block whose loops are rewritten.  Entries 0 through 5 of
+        ``sch.get_loops(data_pack)`` are used.
+
+    Returns
+    -------
+    inner : LoopRV
+        The inner loop produced by the split.
+
+    Notes
+    -----
+    Two alternatives are not applied by this function:
+
+    * splitting loops 2 and 3 separately, each with factors from
+      ``sch.sample_perfect_tile(loop, n=2, max_innermost_factor=64)``, and
+      reordering the four resulting tiles ahead of loops 0, 1, 4 and 5;
+    * calling ``sch.unroll`` on loops 0, 1, 4 and 5.
+    """
+    nest = sch.get_loops(data_pack)
+
+    # Collapse the two middle loops, then carve the result into
+    # (outer, 128-wide inner) tiles.
+    outer, inner = sch.split(sch.fuse(nest[2], nest[3]), factors=[None, 128])
+
+    # Tiles go outermost; the other four loops keep their relative order.
+    sch.reorder(outer, inner, nest[0], nest[1], nest[4], nest[5])
+
+    return inner
+
+
+def main():
+    """Schedule the ``data_pack`` stage of ``func`` by hand and show the lowered module."""
+    sch = tir.Schedule(func)
+    # Inline the two constant-table blocks before touching data_pack.
+    for name in ("A", "B"):
+        sch.compute_inline(sch.get_block(name))
+    data_pack = sch.get_block("data_pack")
+    # Walk the producer chain upwards; each unpacking insists on exactly one producer.
+    [input_tile] = sch.get_producers(data_pack)
+    [data_pad] = sch.get_producers(input_tile)
+    tile_loop = schedule_data_pack(sch, data_pack)
+    # Python counterpart of the C++ ComputeAt / SetScope / ComputeInline sequence.
+    sch.compute_at(input_tile, tile_loop, preserve_unit_loops=True)
+    sch.set_scope(input_tile, buffer_index=0, storage_scope="local")
+    sch.compute_inline(data_pad)
+    lowered = tvm.lower(sch.mod)
+    lowered.show()
+
+
+if __name__ == "__main__":  # true only when this file is executed as a script, not when imported
+    main()  # build the schedule and show the lowered module
diff --git a/python/tvm/topi/cuda/conv2d_winograd.py b/python/tvm/topi/cuda/conv2d_winograd.py
@@ -174,6 +174,15 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_
     return output
 
 
+FN_INPUTS = None  # last value handed to debug_store_fn_inputs; stays None until that hook is first called
+
+
+@tvm._ffi.register_func("debug_store_fn_inputs")  # registered as a TVM global function under this name
+def debug_store_fn_inputs(fn_inputs):  # debug hook: keep fn_inputs in the module-level FN_INPUTS
+    global FN_INPUTS
+    FN_INPUTS = fn_inputs  # overwrites whatever an earlier call stored
+
+
 def schedule_winograd_cuda(cfg, s, output, pre_computed):
     """Schedule winograd template"""
     # get stages
@@ -183,27 +192,47 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed):
     input_tile, B = s[data_pack].op.input_tensors
     pad_data = s[input_tile].op.input_tensors[0]
 
+    def _print_lower(msg):  # debug aid: print `msg`, then show the schedule `s` lowered at this point
+        if pre_computed and FN_INPUTS is not None:  # FN_INPUTS is None until the debug hook has run; skip instead of raising
+            print(msg)
+            p0 = pad_data.op.input_tensors[0]  # tensor feeding the padding stage
+            p1 = kernel_pack
+            pn = FN_INPUTS[2:]  # presumably the first two entries correspond to p0 and p1 — TODO confirm
+            tvm.lower(s, [p0, p1, *pn, output]).show()
+
+    def _print_tir():  # debug aid: show the PrimFunc that create_prim_func builds from the TE tensors
+        if pre_computed and FN_INPUTS is not None:  # FN_INPUTS is None until the debug hook has run; skip instead of raising
+            p0 = pad_data.op.input_tensors[0]  # tensor feeding the padding stage
+            p1 = kernel_pack
+            pn = FN_INPUTS[2:]  # presumably the first two entries correspond to p0 and p1 — TODO confirm
+            tvm.te.create_prim_func([p0, p1, *pn, output]).show()
+
+    _print_lower("initial")
+    _print_tir()
+
     # data transform
     s[B].compute_inline()
 
     data_l = s.cache_write(data_pack, "local")
     eps, nu, c, p = s[data_l].op.axis
     r_a, r_b = s[data_l].op.reduce_axis
-    for axis in [eps, nu, r_a, r_b]:
-        s[data_l].unroll(axis)
+    # for axis in [eps, nu, r_a, r_b]:
+    #     s[data_l].unroll(axis)
 
     eps, nu, c, p = s[data_pack].op.axis
     p, pi = s[data_pack].split(p, 1)
     fused = s[data_pack].fuse(c, p)
     bb, tt = s[data_pack].split(fused, 128)
     s[data_pack].reorder(bb, tt, pi, eps, nu)
-    s[data_pack].bind(bb, te.thread_axis("blockIdx.x"))
-    s[data_pack].bind(tt, te.thread_axis("threadIdx.x"))
+    # s[data_pack].bind(bb, te.thread_axis("blockIdx.x"))
+    # s[data_pack].bind(tt, te.thread_axis("threadIdx.x"))
 
     s[data_l].compute_at(s[data_pack], pi)
     s[input_tile].compute_at(s[data_pack], pi)
     s[pad_data].compute_inline()
 
+    _print_lower("after `data_pack`")
+
     # transform kernel
     if not pre_computed:
         kernel, G = s[kernel_pack].op.input_tensors
@@ -296,8 +325,8 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed):
         s[load].bind(ty, te.thread_axis("threadIdx.y"))
         s[load].bind(tx, te.thread_axis("threadIdx.x"))
 
-    s[C].pragma(bgemm_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
-    s[C].pragma(bgemm_scope, "unroll_explicit", cfg["unroll_explicit"].val)
+    # s[C].pragma(bgemm_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
+    # s[C].pragma(bgemm_scope, "unroll_explicit", cfg["unroll_explicit"].val)
 
     # schedule inverse, output and fusion
     if output.op in s.outputs:
@@ -328,6 +357,8 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed):
         s[inverse].unroll(axis)
     s[inverse].compute_at(s[output], tt)
 
+    if pre_computed:
+        breakpoint()
     return s
 
 
diff --git a/src/meta_schedule/schedule_rule/winograd.cc b/src/meta_schedule/schedule_rule/winograd.cc
@@ -55,16 +55,8 @@ inline LoopRV ScheduleDataPack(Schedule sch, BlockRV block) {
   Array<LoopRV> t1 = sch->Split(loops[3], {factors.begin(), factors.end()});
   ICHECK_EQ(t1.size(), 2);
 
-  if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[0]))) {
-    if (*i <= 16) {
-      sch->Unroll(loops[0]);
-    }
-  }
-  if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[1]))) {
-    if (*i <= 16) {
-      sch->Unroll(loops[1]);
-    }
-  }
+  sch->Unroll(loops[0]);
+  sch->Unroll(loops[1]);
   sch->Unroll(loops[4]);
   sch->Unroll(loops[5]);
   sch->Reorder({
@@ -127,16 +119,8 @@ inline LoopRV ScheduleDataPackNCHW(Schedule sch, BlockRV block) {
   Array<LoopRV> loops = sch->GetLoops(block);
   ICHECK_EQ(loops.size(), 6);
 
-  if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[0]))) {
-    if (*i <= 16) {
-      sch->Unroll(loops[0]);
-    }
-  }
-  if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[1]))) {
-    if (*i <= 16) {
-      sch->Unroll(loops[1]);
-    }
-  }
+  sch->Unroll(loops[0]);
+  sch->Unroll(loops[1]);
   sch->Unroll(loops[4]);
   sch->Unroll(loops[5]);
 
@@ -185,16 +169,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.winograd_inverse.nchw.cuda")
       sch->SetScope(inverse, /*buffer_index=*/0, /*storage_scope=*/"local");
       Array<LoopRV> loops = sch->GetLoops(inverse);
       ICHECK_EQ(loops.size(), 6);
-      if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[2]))) {
-        if (*i <= 16) {
-          sch->Unroll(loops[2]);
-        }
-      }
-      if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[3]))) {
-        if (*i <= 16) {
-          sch->Unroll(loops[3]);
-        }
-      }
+      sch->Unroll(loops[2]);
+      sch->Unroll(loops[3]);
       sch->Unroll(loops[4]);
       sch->Unroll(loops[5]);
       return {sch};
@@ -204,16 +180,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.winograd_kernel_pack.nchw.cuda")
     .set_body_typed([](Schedule sch, BlockRV kernel_pack) -> Array<Schedule> {
       Array<LoopRV> loops = sch->GetLoops(kernel_pack);
       ICHECK_EQ(loops.size(), 6);
-      if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[0]))) {
-        if (*i <= 16) {
-          sch->Unroll(loops[0]);
-        }
-      }
-      if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[1]))) {
-        if (*i <= 16) {
-          sch->Unroll(loops[1]);
-        }
-      }
+      sch->Unroll(loops[0]);
+      sch->Unroll(loops[1]);
       sch->Unroll(loops[4]);
       sch->Unroll(loops[5]);
 
diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc
@@ -477,6 +477,8 @@ class ScheduleBuilder : public ExprVisitor {
     LowerToTECompute lower_te_compute(target_);
     Array<te::Tensor> tensor_outs = lower_te_compute.Lower(relay_func);
     Array<te::Tensor> fn_inputs = lower_te_compute.fn_inputs_;
+    static const auto* f_store_fn_inputs = runtime::Registry::Get("debug_store_fn_inputs");
+    if (f_store_fn_inputs != nullptr) (*f_store_fn_inputs)(fn_inputs);  // nullptr if unregistered
     VisitExpr(relay_func->body);
 
     // TODO(mbs): This should be the definitive global by which the PrimFunc is known and