apache
diff --git a/‎python/tvm/relax/backend/dispatch_sort_scan.py‎
Lines changed: 53 additions & 17 deletions b/‎python/tvm/relax/backend/dispatch_sort_scan.py‎
Lines changed: 53 additions & 17 deletions
diff --git a/‎python/tvm/relax/frontend/nn/op.py‎
Lines changed: 106 additions & 0 deletions b/‎python/tvm/relax/frontend/nn/op.py‎
Lines changed: 106 additions & 0 deletions
diff --git a/‎python/tvm/te/operation.py‎
Lines changed: 8 additions & 8 deletions b/‎python/tvm/te/operation.py‎
Lines changed: 8 additions & 8 deletions
@@ -17,13 +17,16 @@
 # pylint: disable=invalid-name, unused-argument, redefined-argument-from-local
 """Dispatch sort and scan operators to platform dependent implementation."""
 
-from tvm import topi, dlight, relax
+from functools import reduce
+from operator import mul
+
+from tvm import DataType, dlight, relax, topi
+from tvm.contrib.thrust import can_use_thrust
 from tvm.ir import Op
 from tvm.ir.module import IRModule
 from tvm.ir.transform import PassContext, module_pass
-from tvm.target import Target
-from tvm.contrib.thrust import can_use_thrust
 from tvm.relax import PyExprMutator, expr_functor
+from tvm.target import Target
 
 
 @expr_functor.mutator
@@ -80,23 +83,24 @@ def visit_call_(self, call: relax.Call) -> relax.Expr:
         if call.op.name == "relax.sort":
             tgt = self._get_target(call.struct_info)
             te_func = topi.sort
+            kwargs = {}
             with tgt:
                 if can_use_thrust(tgt, "tvm.contrib.thrust.sort"):
                     te_func = topi.cuda.sort_thrust
+                    kwargs["workspace"] = self.allocate_workspace(call)
                 elif tgt.kind.name == "cuda":
                     te_func = topi.cuda.sort
             return self.builder_.call_te(
-                te_func,
-                call.args[0],
-                call.attrs.axis,
-                not call.attrs.descending,
+                te_func, call.args[0], call.attrs.axis, not call.attrs.descending, **kwargs
             )
         if call.op.name == "relax.argsort":
             tgt = self._get_target(call.struct_info)
             te_func = topi.argsort
+            kwargs = {}
             with tgt:
                 if can_use_thrust(tgt, "tvm.contrib.thrust.sort"):
                     te_func = topi.cuda.argsort_thrust
+                    kwargs["workspace"] = self.allocate_workspace(call)
                 elif tgt.kind.name == "cuda":
                     te_func = topi.cuda.argsort
             return self.builder_.call_te(
@@ -105,12 +109,15 @@ def visit_call_(self, call: relax.Call) -> relax.Expr:
                 axis=call.attrs.axis,
                 is_ascend=not call.attrs.descending,
                 dtype=call.attrs.dtype,
+                **kwargs,
             )
         if call.op.name == "relax.topk":
             tgt = self._get_target(call.struct_info)
             te_func = topi.topk
+            kwargs = {}
             if can_use_thrust(tgt, "tvm.contrib.thrust.sort"):
                 te_func = topi.cuda.topk_thrust
+                kwargs["workspace"] = self.allocate_workspace(call)
             elif tgt.kind.name == "cuda":
                 te_func = topi.cuda.topk
             tir_call = self.builder_.call_te(
@@ -121,6 +128,7 @@ def visit_call_(self, call: relax.Call) -> relax.Expr:
                 ret_type=call.attrs.ret_type,
                 is_ascend=not call.attrs.largest,
                 dtype=call.attrs.dtype,
+                **kwargs,
             )
             if tgt.kind.name != "cuda":
                 return tir_call
@@ -130,23 +138,51 @@ def visit_call_(self, call: relax.Call) -> relax.Expr:
         if call.op.name in ("relax.cumprod", "relax.cumsum"):
             tgt = self._get_target(call.struct_info)
             axis = int(call.attrs.axis) if call.attrs.axis is not None else call.attrs.axis
-            te_func = topi.cuda.cumsum if tgt.kind.name == "cuda" else topi.cumsum
-            if call.op.name == "relax.cumprod":
-                te_func = topi.cuda.cumprod if tgt.kind.name == "cuda" else topi.cumprod
-            tir_call = self.builder_.call_te(
-                te_func,
-                call.args[0],
-                axis,
-                call.attrs.dtype,
-                call.attrs.exclusive,
-            )
+            kwargs = {}
+            with tgt:
+                if call.op.name == "relax.cumsum":
+                    te_func = topi.cuda.cumsum if tgt.kind.name == "cuda" else topi.cumsum
+                    if can_use_thrust(tgt, "tvm.contrib.thrust.sum_scan"):
+                        kwargs["workspace"] = self.allocate_workspace(call)
+                elif call.op.name == "relax.cumprod":
+                    te_func = topi.cuda.cumprod if tgt.kind.name == "cuda" else topi.cumprod
+                else:
+                    raise ValueError(f"Unsupported op: {call.op.name}")
+                tir_call = self.builder_.call_te(
+                    te_func,
+                    call.args[0],
+                    axis,
+                    call.attrs.dtype,
+                    call.attrs.exclusive,
+                    **kwargs,
+                )
             if tgt.kind.name != "cuda":
                 return tir_call
             # apply dlight gpu fallback
             self._apply_dlight_gpu_fallback(tgt, tir_call)
             return tir_call
         return super().visit_call_(call)
 
+    def estimate_thrust_workspace_size(self, call: relax.Call) -> int:
+        """Estimate the workspace size for thrust sort/argsort/topk/cumsum.
+
+        Parameters
+        ----------
+        call : relax.Call
+            The operator call. Only the shape and dtype of its first argument are read.
+
+        Returns
+        -------
+        size : int
+            The estimated workspace size in bytes: twice the input size plus 4MB.
+        """
+        data_sinfo = call.args[0].struct_info
+        # Round up to whole bytes so that a sub-byte dtype is not counted as 0 bytes per
+        # element, which would drop the input-dependent part of the estimate entirely.
+        input_byte_per_elem = (DataType(data_sinfo.dtype).bits + 7) // 8
+        input_size = reduce(mul, data_sinfo.shape, 1) * input_byte_per_elem
+        # Most GPU algorithms take O(n) space or less, we choose 2N + 4MB as a safe estimation
+        return 2 * input_size + 4 * 1024 * 1024
+
+    def allocate_workspace(self, call: relax.Call) -> relax.Var:
+        """Emit the allocation of a workspace tensor for thrust sort/argsort/topk/cumsum.
+
+        The workspace is a 1-D "uint8" tensor whose length is the byte count returned by
+        `estimate_thrust_workspace_size` for `call`.
+
+        Parameters
+        ----------
+        call : relax.Call
+            The operator call the workspace is sized for.
+
+        Returns
+        -------
+        workspace : relax.Var
+            The value returned by the block builder after emitting the allocation.
+        """
+        num_bytes = self.estimate_thrust_workspace_size(call)
+        workspace_shape = relax.ShapeExpr((num_bytes,))
+        return self.builder_.emit(
+            relax.op.builtin.alloc_tensor(workspace_shape, "uint8", runtime_device_index=0)
+        )
+
 
 @module_pass(opt_level=0, name="DispatchSortScan")
 class DispatchSortScan:
 
@@ -2241,6 +2241,112 @@ def cumsum(
     return wrap_nested(_op.cumsum(data._expr, axis, dtype, exclusive), name)
 
 
+def sort(x: Tensor, axis: int = -1, descending: bool = False, name: str = "sort"):
+    """Performs sorting along the given axis and returns an array
+    in sorted order.
+
+    Parameters
+    ----------
+    x : Tensor
+        The input tensor.
+
+    axis : int
+        Axis along which to sort the input tensor.
+        By default the last axis of the input is used.
+
+    descending : bool
+        Whether to sort in descending order, the default is False
+
+    name : str
+        Name hint.
+
+    Returns
+    -------
+    out : Tensor
+        The sorted tensor.
+    """
+    # Hand the op the wrapped expression (`_expr`) rather than the Tensor wrapper
+    # itself, the same way `cumsum` does.
+    return wrap_nested(_op.sort(x._expr, axis, descending), name=name)
+
+
+def argsort(
+    data: Tensor,
+    axis: int = -1,
+    descending: bool = False,
+    dtype: str = "int32",
+    name: str = "argsort",
+):
+    """Performs sorting along the given axis and returns an array of indices
+    having the same shape as the input array that index data in sorted order.
+
+    Parameters
+    ----------
+    data : Tensor
+        The input data tensor.
+
+    axis : int
+        Axis along which to sort the input tensor.
+        By default the last axis of the input is used.
+
+    descending : bool
+        Whether to sort in descending order, the default is False
+
+    dtype : str
+        The data type of the output indices.
+
+    name : str
+        Name hint.
+
+    Returns
+    -------
+    out : Tensor
+        The indices of the sorted tensor.
+    """
+    # Hand the op the wrapped expression (`_expr`) rather than the Tensor wrapper
+    # itself, the same way `cumsum` does.
+    return wrap_nested(_op.argsort(data._expr, axis, descending, dtype), name=name)
+
+
+def topk(
+    data: Tensor,
+    k: int = 1,
+    axis: int = -1,
+    ret_type: str = "both",
+    largest: bool = True,
+    dtype: str = "int32",
+    name: str = "topk",
+):
+    """Get the top k elements in an input tensor along the given axis.
+
+    ret_type specifies the return type, can be one of ("both", "values", "indices").
+
+    Parameters
+    ----------
+    data : Tensor
+        The input data tensor.
+
+    k : int
+        Number of top elements to select. Return all elements if k < 1.
+
+    axis : int
+        Axis along which to sort the input tensor.
+        By default the last axis of the input is used.
+
+    ret_type: str
+        The return type [both, values, indices].
+        "both": return both top k data and indices.
+        "values": return top k data only.
+        "indices": return top k indices only.
+
+    largest : bool
+        Whether to return largest or smallest elements.
+        The k smallest elements are returned if largest is False.
+
+    dtype : str
+        The data type of the indices output.
+
+    name : str
+        Name hint.
+
+    Returns
+    -------
+    out : Tensor or Tuple[Tensor, Tensor]
+        The computed result.
+    """
+    # Hand the op the wrapped expression (`_expr`) rather than the Tensor wrapper
+    # itself, the same way `cumsum` does.
+    return wrap_nested(_op.topk(data._expr, k, axis, ret_type, largest, dtype), name=name)
+
+
 def multinomial_from_uniform(
     prob: Tensor,
     uniform_sample: Tensor,
 
@@ -333,15 +333,15 @@ def extern(
             )
         types.add(t.dtype)
 
-    if dtype is None:
-        if len(types) != 1:
-            raise ValueError("Cannot infer output type, please provide dtype argument")
-        infered_type = types.pop()
-        dtype = [infered_type for _ in shape]
-    if isinstance(dtype, str):
-        dtype = [dtype]
-
     if out_buffers is None:
+        if dtype is None:
+            if len(types) != 1:
+                raise ValueError("Cannot infer output type, please provide dtype argument")
+            infered_type = types.pop()
+            dtype = [infered_type for _ in shape]
+        if isinstance(dtype, str):
+            dtype = [dtype]
+
         for shp, dt in zip(shape, dtype):
             output_placeholders.append(
                 tvm.tir.decl_buffer(shp, dt, name, elem_offset=tvm.tir.Var("elem_offset", "int32"))