Commit 5d715aa

[Dlight] Enhance Decode-GEMV Rules

1 parent b51bee8

4 files changed: +180 -68 lines

python/tvm/dlight/gpu/decode_gemv.py
Lines changed: 88 additions & 64 deletions

@@ -17,8 +17,7 @@
 # pylint: disable=missing-docstring
 """A fallback schedule rule for GPU operators."""
 # pylint: disable=invalid-name
-
-from typing import List, Optional, Union
+from typing import List, Optional, Set, Tuple, Union
 
 from tvm import tir
 from tvm._ffi import get_global_func
@@ -27,6 +26,7 @@
 from tvm.target import Target
 
 from ..base import ScheduleRule, normalize_prim_func, try_inline_contiguous_spatial
+from . import utils
 
 
 def _get_reduction_expr(block: tir.Block) -> Optional[tir.PrimExpr]:
@@ -47,7 +47,7 @@ def _get_reduction_expr(block: tir.Block) -> Optional[tir.PrimExpr]:
 
 def _detect_dominant_read(block: tir.Block) -> tir.PrimExpr:
     dominant_read, read_iters = None, None
-    tir_vars = set()
+    tir_vars: Set[tir.Var] = set()
     for buffer_region in block.reads:
         tir_vars.clear()
 
@@ -67,25 +67,37 @@ def _collect_tir_var(e):
     return result
 
 
-class DecodeGEMV(ScheduleRule):
-    def __init__(self) -> None:
-        super().__init__()
-        self.get_loop_iter_type = get_global_func("tir.schedule.GetLoopIterType")
+_get_loop_iter_type = get_global_func("tir.schedule.GetLoopIterType")
+
+
+def _fuse_spatial_reduction_loops(
+    sch: tir.Schedule,
+    loops: List[tir.schedule.LoopRV],
+) -> Tuple[tir.schedule.LoopRV, tir.schedule.LoopRV]:
+    s_loops, r_loops = [], []
+    for loop_rv in loops:
+        iter_type = _get_loop_iter_type(sch, loop_rv)
+        if iter_type == "S":
+            s_loops.append(loop_rv)
+        elif iter_type == "R":
+            r_loops.append(loop_rv)
+        else:
+            raise RuntimeError("Unknown loop type " + str(iter_type))
+    sch.reorder(*s_loops, *r_loops)
+    s_ctr = sch.fuse(*s_loops)
+    r_ctr = sch.fuse(*r_loops)
+    return s_ctr, r_ctr
 
-    def apply(  # pylint: disable=too-many-locals
+
+class DecodeGEMV(ScheduleRule):
+    def apply(  # pylint: disable=too-many-locals,too-many-branches,too-many-return-statements
         self,
         func: tir.PrimFunc,
         target: Target,
         _: bool,
     ) -> Union[None, tir.Schedule, List[tir.Schedule]]:
         if not isinstance(func, tir.PrimFunc):
             return None
-
-        if target.kind.name == "cuda":
-            len_tx, len_ty = 16, 16
-        else:
-            len_tx, len_ty = 8, 8
-
         sch = tir.Schedule(func)
         block_infos = try_inline_contiguous_spatial(sch, normalize_prim_func(sch))
 
@@ -113,6 +125,7 @@ def apply(  # pylint: disable=too-many-locals
             return None
         iter_to_info = {i.var: i for i in block_info.iters}
         s_loops, r_loops, c_loops = [], [], []
+        c_loop_factor = None
        for split in sorted_iter_access.args:
            block_var = split.source.source
            block_var_info = iter_to_info[block_var]
@@ -122,71 +135,82 @@ def apply(  # pylint: disable=too-many-locals
                c_loop_factor = split.lower_factor
                loop_rv, c_loop = sch.split(loop_rv, factors=[None, c_loop_factor])
                c_loops.append(c_loop)
-                is_loop_c_reduction = is_inner_reduction
+                if is_inner_reduction:
+                    c_loop_factor = None
            if is_inner_reduction:
                r_loops.append(loop_rv)
            else:
                s_loops.append(loop_rv)
 
-        if len(c_loops) > 1:
+        if len(c_loops) > 1 or len(s_loops) == 0 or len(r_loops) == 0:
            return None
        if len(s_loops) != len([_ for i in block_info.iters if i.kind == "S"]):
            return None
-        if len(s_loops) == 0 or len(r_loops) == 0:
-            return None
 
        sch.reorder(*s_loops, *r_loops, *c_loops)
        s = sch.fuse(*s_loops)
        r = sch.fuse(*r_loops)
-
-        if is_inner_reduction:
-            _, tx = sch.split(r, factors=[None, len_tx * len_ty])
-            rf = sch.rfactor(tx, 0)
-            s, r, tx = sch.get_loops(rf)[:3]
-            sch.reorder(s, tx, r)
-            sch.reverse_compute_at(block, s, preserve_unit_loops=True)
-            sch.bind(tx, "threadIdx.x")
-            sch.bind(s, "blockIdx.x")
-        else:
-            sch.split(s, factors=[None, len_tx])
-            _, ty = sch.split(r, factors=[None, len_ty])
-            rf = sch.rfactor(ty, 0)
-            bx, tx, r, ty = sch.get_loops(rf)[:4]
-            sch.reorder(bx, tx, ty, r)
-            sch.reverse_compute_at(block, bx, preserve_unit_loops=True)
-            sch.bind(tx, "threadIdx.x")
-            sch.bind(ty, "threadIdx.y")
-            sch.bind(bx, "blockIdx.x")
-
-        s_loops, r_loops = [], []
-        for loop_rv in sch.get_loops(block)[1:]:
-            iter_type = self.get_loop_iter_type(sch, loop_rv)
-            if iter_type == "S":
-                s_loops.append(loop_rv)
-            elif iter_type == "R":
-                r_loops.append(loop_rv)
-            else:
-                raise RuntimeError("Unknown loop type " + str(iter_type))
-        sch.reorder(*s_loops, *r_loops)
-        s_ctr = sch.fuse(*s_loops)
-        r_ctr = sch.fuse(*r_loops)
-
-        if c_loops and not is_loop_c_reduction:
-            s_ctr, inner = sch.split(s_ctr, factors=[None, c_loop_factor])
-            sch.reorder(s_ctr, r_ctr, inner)
-
        if is_inner_reduction:
-            sch.bind(r_ctr, "threadIdx.x")
-            sch.set_scope(rf, 0, "local")
-            sch.decompose_reduction(rf, sch.get_loops(rf)[2])
+            self._sch_inner_reduction(sch, block, target, s, r, c_loop_factor)
        else:
-            sch.bind(s_ctr, "threadIdx.x")
-            sch.bind(r_ctr, "threadIdx.y")
-            sch.set_scope(rf, 0, "local")
-            sch.decompose_reduction(rf, sch.get_loops(rf)[3])
-
+            self._sch_inner_spatial(sch, block, target, s, r, c_loop_factor)
        if len(block_infos) == 2:
            sch.set_scope(block, 0, "local")
            sch.reverse_compute_at(block_infos[1].block_rv, sch.get_loops(block)[0])
-
        return sch
+
+    def _sch_inner_reduction(  # pylint: disable=too-many-arguments
+        self,
+        sch: tir.Schedule,
+        block: tir.schedule.BlockRV,
+        target: Target,
+        _: tir.schedule.LoopRV,
+        r: tir.schedule.LoopRV,
+        unroll_spatial_factor: Optional[int],
+    ):
+        (len_tx,) = utils.suggest_threads_per_block(  # pylint: disable=unbalanced-tuple-unpacking
+            target, [sch.get(r)]
+        )
+
+        _, tx = sch.split(r, factors=[None, len_tx])
+        rf = sch.rfactor(tx, 0)
+        s, r, tx = sch.get_loops(rf)[:3]
+        sch.reorder(s, tx, r)
+        sch.reverse_compute_at(block, s, preserve_unit_loops=True)
+        sch.bind(tx, "threadIdx.x")
+        sch.bind(s, "blockIdx.x")
+        s_ctr, r_ctr = _fuse_spatial_reduction_loops(sch, sch.get_loops(block)[1:])
+        if unroll_spatial_factor:
+            s_ctr, inner = sch.split(s_ctr, factors=[None, unroll_spatial_factor])
+            sch.reorder(s_ctr, r_ctr, inner)
+        sch.bind(r_ctr, "threadIdx.x")
+        sch.set_scope(rf, 0, "local")
+        sch.decompose_reduction(rf, sch.get_loops(rf)[2])
+
+    def _sch_inner_spatial(  # pylint: disable=too-many-locals,too-many-arguments
+        self,
+        sch: tir.Schedule,
+        block: tir.schedule.BlockRV,
+        target: Target,
+        s: tir.schedule.LoopRV,
+        r: tir.schedule.LoopRV,
+        unroll_spatial_factor: Optional[int],
+    ):
+        len_tx, len_ty = 16, 16
+        sch.split(s, factors=[None, len_tx])
+        _, ty = sch.split(r, factors=[None, len_ty])
+        rf = sch.rfactor(ty, 0)
+        bx, tx, r, ty = sch.get_loops(rf)[:4]
+        sch.reorder(bx, tx, ty, r)
+        sch.reverse_compute_at(block, bx, preserve_unit_loops=True)
+        sch.bind(tx, "threadIdx.x")
+        sch.bind(ty, "threadIdx.y")
+        sch.bind(bx, "blockIdx.x")
+        s_ctr, r_ctr = _fuse_spatial_reduction_loops(sch, sch.get_loops(block)[1:])
+        if unroll_spatial_factor:
+            s_ctr, inner = sch.split(s_ctr, factors=[None, unroll_spatial_factor])
+            sch.reorder(s_ctr, r_ctr, inner)
+        sch.bind(s_ctr, "threadIdx.x")
+        sch.bind(r_ctr, "threadIdx.y")
+        sch.set_scope(rf, 0, "local")
+        sch.decompose_reduction(rf, sch.get_loops(rf)[3])
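
For orientation, the refactored rule can be driven directly on a PrimFunc, the same way the dlight tests exercise schedule rules. The sketch below is illustrative only and is not part of this commit: the GEMV workload, its shapes, and the plain "cuda" target are hypothetical, and DecodeGEMV.apply may legitimately return None if the workload does not match its decode-GEMV pattern.

# Hypothetical driver sketch; not part of this commit.
import tvm
from tvm.script import tir as T
from tvm.dlight.gpu.decode_gemv import DecodeGEMV


@T.prim_func
def gemv(
    A: T.Buffer((4096, 4096), "float16"),
    X: T.Buffer((1, 4096), "float16"),
    Y: T.Buffer((1, 4096), "float16"),
):
    # A toy workload with spatial axes (i, j) and one reduction axis (k).
    for i, j, k in T.grid(1, 4096, 4096):
        with T.block("gemv"):
            vi, vj, vk = T.axis.remap("SSR", [i, j, k])
            with T.init():
                Y[vi, vj] = T.float16(0)
            Y[vi, vj] = Y[vi, vj] + A[vj, vk] * X[vi, vk]


# apply() returns a scheduled tir.Schedule, or None when the rule does not match.
sch = DecodeGEMV().apply(gemv, tvm.target.Target("cuda"), False)
if sch is not None:
    sch.mod.show()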

python/tvm/dlight/gpu/fallback.py
Lines changed: 3 additions & 2 deletions

@@ -21,7 +21,8 @@
 from tvm import tir
 from tvm.target import Target
 
-from ..base import ScheduleRule, analysis, normalize_prim_func, try_inline
+from ..base import ScheduleRule, normalize_prim_func, try_inline
+from . import utils
 
 
 class Fallback(ScheduleRule):
@@ -36,7 +37,7 @@ def apply(  # pylint: disable=too-many-locals,missing-docstring
         target: Target,
         _: bool,
     ) -> tir.Schedule:
-        max_threads_per_block = analysis.get_max_threads_per_block(target)
+        max_threads_per_block = utils.max_threads_per_block(target)
 
         sch = tir.Schedule(func)
         block_infos = try_inline(sch, normalize_prim_func(sch))
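
As a quick sanity check of the swapped-in helper (my own illustration, not part of the commit): utils.max_threads_per_block first consults the target's max_threads_per_block / max_num_threads attributes and only then falls back to the per-kind defaults defined in the new utils.py below.

# Illustrative check; not part of this commit.
from tvm.target import Target
from tvm.dlight.gpu import utils

print(utils.max_threads_per_block(Target("cuda")))    # target attribute, or the CUDA fallback of 1024
print(utils.max_threads_per_block(Target("opencl")))  # target attribute, or the generic fallback of 256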

python/tvm/dlight/gpu/utils.py
Lines changed: 87 additions & 0 deletions

@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-docstring
+"""Utility methods for generic GPU."""
+from typing import List, Optional
+
+from tvm import tir
+from tvm.target import Target
+
+
+def max_threads_per_block(target: Target) -> int:
+    """Get the maximum number of threads per block for a given target.
+
+    Parameters
+    ----------
+    target : Target
+        The target to get the maximum number of threads per block for.
+
+    Returns
+    -------
+    max_threads_per_block : int
+        The maximum number of threads per block for the given target.
+    """
+    for name in ["max_threads_per_block", "max_num_threads"]:
+        result = target.attrs.get(name, None)
+        if result is not None:
+            return result
+    if target.kind.name == "cuda":
+        return 1024
+    return 256
+
+
+def suggest_threads_per_block(
+    target: Target,
+    loops: List[tir.For],
+    max_threads_for_dynamic_loop: int = 32,
+) -> List[int]:
+    if target.kind.name == "cuda":
+        threads = 256
+    else:
+        threads = 64
+    results: List[Optional[int]] = []
+    dynamic: List[int] = []
+    for i, loop in enumerate(loops):
+        loop_extent = loop.extent
+        if isinstance(loop_extent, tir.IntImm):
+            loop_extent = loop_extent.value
+            extent = 1
+            while extent <= loop_extent and extent <= threads:
+                extent *= 2
+            extent //= 2
+            assert extent >= 1
+            assert threads % extent == 0
+            threads //= extent
+            results.append(extent)
+        else:
+            results.append(None)
+            dynamic.append(i)
+
+    for i in dynamic:
+        extent = 1
+        while extent <= max_threads_for_dynamic_loop and extent <= threads:
+            extent *= 2
+        extent //= 2
+        assert extent >= 1
+        assert threads % extent == 0
+        threads //= extent
+        results[i] = extent
+
+    if dynamic:
+        results[dynamic[0]] *= threads
+
+    return results
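
To make the allocation policy concrete, here is a hand-built example (my own reading of the code above, not from the commit): with the 256-thread CUDA budget, a static extent of 10 receives the largest power of two not exceeding it, and a dynamic-extent loop receives what remains, capped by max_threads_for_dynamic_loop.

# Hand-built illustration of suggest_threads_per_block; not part of this commit.
from tvm import tir
from tvm.target import Target
from tvm.dlight.gpu.utils import suggest_threads_per_block

n = tir.Var("n", "int32")  # a dynamic (symbolic) extent
loops = [
    tir.For(tir.Var("i", "int32"), 0, 10, tir.ForKind.SERIAL, tir.Evaluate(0)),
    tir.For(tir.Var("j", "int32"), 0, n, tir.ForKind.SERIAL, tir.Evaluate(0)),
]
# Expected result by my reading: [8, 32], i.e. 8 threads for the static loop
# and the 32-thread dynamic cap (scaled by any leftover budget) for the dynamic one.
print(suggest_threads_per_block(Target("cuda"), loops))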

tests/python/dlight/test_gpu_reduction.py
Lines changed: 2 additions & 2 deletions

@@ -100,7 +100,7 @@ def main(p_lv44: T.handle, p_output0: T.handle):
                 v0 = T.axis.spatial(T.int64(32), ax0_ax1_fused // n + ax0)
                 v1 = T.axis.spatial(n, ax0_ax1_fused % n + ax1)
                 v2 = T.axis.reduce(m, ax2_fused_0 * T.int64(256) + ax2_fused_1)
-                T.where(T.int64(0) <= ax0_ax1_fused // n and ax0_ax1_fused // n < T.int64(32) and T.int64(0) <= ax0_ax1_fused % n and ax0_ax1_fused % n < n and ax2_fused_0 * T.int64(256) + ax2_fused_1 < m)
+                T.where(ax2_fused_0 * T.int64(256) + ax2_fused_1 < m)
                 T.reads(lv44[T.int64(0), v0, v1, v2])
                 T.writes(T_softmax_maxelem_shared[T.int64(0), v0, v1])
                 with T.init():
@@ -112,7 +112,7 @@ def main(p_lv44: T.handle, p_output0: T.handle):
                 v0 = T.axis.spatial(T.int64(32), ax0_ax1_fused // n + ax0)
                 v1 = T.axis.spatial(n, ax0_ax1_fused % n + ax1)
                 v2 = T.axis.reduce(m, ax2_fused_0 * T.int64(256) + ax2_fused_1)
-                T.where(T.int64(0) <= ax0_ax1_fused // n and ax0_ax1_fused // n < T.int64(32) and T.int64(0) <= ax0_ax1_fused % n and ax0_ax1_fused % n < n and ax2_fused_0 * T.int64(256) + ax2_fused_1 < m)
+                T.where(ax2_fused_0 * T.int64(256) + ax2_fused_1 < m)
                 T.reads(lv44[T.int64(0), v0, v1, v2], T_softmax_maxelem_shared[T.int64(0), v0, v1])
                 T.writes(T_softmax_expsum_shared[T.int64(0), v0, v1])
                 with T.init():

0 commit comments