[TOPI] Remove blockIdx.z in topi sort

Siyuan Feng · Siyuan Feng · commit 4b5cd425c90a · 2024-05-08T21:05:53.000+08:00
Because `blockIdx.z` is not allowed in WebGPU, this PR folds the former
`blockIdx.z` extent into `blockIdx.y` (recovering both indices with `//` and `%`) so the sort kernels work on WebGPU.
diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py
@@ -57,18 +57,15 @@ def traverse(op):
     return s
 
 
-def _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz):
+def _get_threads(ib, nthread_tx, nthread_bx, nthread_by):
     tx = te.thread_axis("threadIdx.x")
     bx = te.thread_axis("blockIdx.x")
     ib.scope_attr(tx, "thread_extent", nthread_tx)
     ib.scope_attr(bx, "thread_extent", nthread_bx)
 
     by = te.thread_axis("blockIdx.y")
-    bz = te.thread_axis("blockIdx.z")
     ib.scope_attr(by, "thread_extent", nthread_by)
-    ib.scope_attr(bz, "thread_extent", nthread_bz)
-
-    return tx, bx, by, bz
+    return tx, bx, by
 
 
 def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_func=None):
@@ -87,14 +84,13 @@ def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_f
     max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
     nthread_tx = max_threads
     nthread_bx = ceil_div(shape[axis], max_threads)
-    nthread_by = axis_mul_before
-    nthread_bz = axis_mul_after
+    nthread_by = axis_mul_before * axis_mul_after
 
     # Copy the keys_in to initial output
     with ib.new_scope():
-        tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz)
+        tx, bx, by = _get_threads(ib, nthread_tx, nthread_bx, nthread_by)
         tid = bx * nthread_tx + tx
-        idx = (by * shape[axis] + tid) * axis_mul_after + bz
+        idx = (by // axis_mul_after * shape[axis] + tid) * axis_mul_after + (by % axis_mul_after)
         with ib.if_scope(tid < shape[axis]):
             keys_out[idx] = keys_in[idx]
             if values_out is not None:
@@ -122,11 +118,10 @@ def _odd_even_sort(
 ):
     nthread_tx = block_size // 2
     nthread_bx = ceil_div(size, block_size)
-    nthread_by = axis_mul_before
-    nthread_bz = axis_mul_after
+    nthread_by = axis_mul_before * axis_mul_after
     with ib.new_scope():
         ib.scope_attr(tvm.tir.const(0), "hand_threaded", 0)
-        tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz)
+        tx, bx, by = _get_threads(ib, nthread_tx, nthread_bx, nthread_by)
         tid = 2 * tx
         start = bx * block_size
 
@@ -153,7 +148,7 @@ def _odd_even_sort(
         temp_cond1 = ib.allocate(keys_swap.dtype, (1,), name="temp_cond1", scope="local")
         temp_cond2 = ib.allocate(keys_swap.dtype, (1,), name="temp_cond2", scope="local")
         # Copy data to scratch space
-        base_idx = by * size * axis_mul_after + bz
+        base_idx = (by // axis_mul_after) * size * axis_mul_after + (by % axis_mul_after)
         with ib.for_range(0, 2) as n:
             with ib.if_scope((tid + n + start) < size):
                 tmp_keys_swap[tid + n] = keys[base_idx + (tid + n + start) * axis_mul_after]
@@ -222,7 +217,6 @@ def _sort_common(
 
     max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
     nthread_by = axis_mul_before * axis_mul_after
-    nthread_bz = 1
     nthread_tx = max_threads
     nthread_bx = ceil_div(size, nthread_tx)
 
@@ -334,12 +328,12 @@ def assign_j():
                 ntx = max_threads
                 nbx = tvm.tir.generic.cast(ceil_div(width, max_threads * thread_work), "int32")
                 nbz = tvm.tir.generic.cast(ceil_div(size, width), "int32")
-                tx, bx, by, bz = _get_threads(ib, ntx, nbx, nthread_by, nbz)
+                tx, bx, by = _get_threads(ib, ntx, nbx, nthread_by * nbz)
             else:
                 ntx = tvm.tir.generic.cast(tvm.te.min(max_threads, width), "int32")
                 nbx = tvm.tir.generic.cast(ceil_div(width, max_threads * thread_work), "int32")
                 nbz = tvm.tir.generic.cast(ceil_div(size, width), "int32")
-                tx, bx, by, bz = _get_threads(ib, ntx, nbx, nthread_by, nbz)
+                tx, bx, by = _get_threads(ib, ntx, nbx, nthread_by * nbz)
 
             def mergepath(
                 source,
@@ -392,7 +386,7 @@ def merge(source, dest, source_idx, dest_idx):
 
             def mergesort(source, dest, source_idx, dest_idx, size, width, even):
                 # calculate the start, mid, and end points of this section
-                start = width * bz
+                start = width * (by % nbz)
                 middle = cast(tvm.te.min(start + tvm.tir.indexdiv(width, 2), size), "int64")
                 end = cast(tvm.te.min(start + width, size), "int64")
                 with ib.if_scope(start < size):
@@ -471,18 +465,17 @@ def do_merge(first, last):
                 width,
                 tvm.tir.indexmod(l2_width, 2) == 0,
             )
-    nthread_by = axis_mul_before
-    nthread_bz = axis_mul_after
+    nthread_by = axis_mul_before * axis_mul_after
     nthread_tx = max_threads
     nthread_bx = ceil_div(size, nthread_tx)
     ## if the final sorted data ended up in the swap, copy it to the real output
     with ib.if_scope(
         tvm.tir.all(upper_lim > lower_lim, tvm.tir.indexmod(upper_lim - lower_lim, 2) == 1)
     ):
         with ib.new_scope():
-            tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz)
+            tx, bx, by = _get_threads(ib, nthread_tx, nthread_bx, nthread_by)
             tid = bx * nthread_tx + tx
-            idx = (by * axis_mul_after + bz) * size + tid
+            idx = by * size + tid
             with ib.if_scope(tid < size):
                 keys[idx] = keys_swap[idx]
                 if values is not None: