Add test case where PrimFunc is used both in-place and DPS

slyubomirsky · slyubomirsky · commit 3b44ff1cc243 · 2024-02-05T19:16:03.000-05:00
diff --git a/tests/python/relax/test_transform_fuse_tir.py b/tests/python/relax/test_transform_fuse_tir.py
@@ -2162,5 +2162,97 @@ def main(
     _check(Module, Expected)
 
 
+def test_use_as_inplace_and_dps():  # one PrimFunc called both in DPS form and in-place
+    @I.ir_module
+    class Module:
+        # `add` is called below both in DPS form (call_tir) and in-place (call_tir_inplace)
+        @T.prim_func(private=True)
+        def add(
+            A: T.Buffer((T.int64(10), T.int64(20)), "float32"),
+            B: T.Buffer((), "float32"),
+            Out: T.Buffer((T.int64(10), T.int64(20)), "float32"),
+        ):
+            T.func_attr({"tir.noalias": T.bool(True)})
+            for ax0, ax1 in T.grid(T.int64(10), T.int64(20)):
+                with T.block("T_add"):
+                    v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1])
+                    Out[v_ax0, v_ax1] = A[v_ax0, v_ax1] + B[()]
+
+        @R.function(private=True)
+        def fused_sums(
+            x: R.Tensor((10, 20), dtype="float32"), p0: R.Tensor((), dtype="float32")
+        ) -> R.Tensor((10, 20), dtype="float32"):
+            R.func_attr({"Primitive": 1})
+            cls = Module
+            with R.dataflow():
+                lv = R.call_tir(  # DPS form: only the inputs (x, p0) are passed
+                    cls.add,
+                    (x, p0),
+                    out_sinfo=R.Tensor((10, 20), dtype="float32"),
+                )
+                lv1 = R.call_tir_inplace(  # in-place form: lv is passed at index 2
+                    cls.add,
+                    (x, p0, lv),
+                    inplace_indices=[2],
+                    out_sinfo=R.Tensor((10, 20), dtype="float32"),
+                )
+                lv2 = R.call_tir_inplace(  # same again, now with lv1 at index 2
+                    cls.add,
+                    (x, p0, lv1),
+                    inplace_indices=[2],
+                    out_sinfo=R.Tensor((10, 20), dtype="float32"),
+                )
+                R.output(lv2)
+            return lv2
+
+        @R.function
+        def main(
+            x: R.Tensor((10, 20), dtype="float32"), p0: R.Tensor((), dtype="float32")
+        ) -> R.Tensor((10, 20), dtype="float32"):
+            cls = Module
+            with R.dataflow():
+                gv1: R.Tensor((10, 20), dtype="float32") = cls.fused_sums(x, p0)
+                R.output(gv1)
+            return gv1
+
+    @I.ir_module
+    class Expected:  # one PrimFunc holding three T_add blocks, all writing p_output0
+        @T.prim_func(private=True)
+        def fused_sums(
+            x: T.Buffer((T.int64(10), T.int64(20)), "float32"),
+            p0: T.Buffer((), "float32"),
+            p_output0: T.Buffer((T.int64(10), T.int64(20)), "float32"),
+        ):
+            T.func_attr({"tir.noalias": T.bool(True)})
+            for ax0, ax1 in T.grid(T.int64(10), T.int64(20)):
+                with T.block("T_add"):
+                    v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1])
+                    p_output0[v_ax0, v_ax1] = x[v_ax0, v_ax1] + p0[()]
+            for ax0, ax1 in T.grid(T.int64(10), T.int64(20)):
+                with T.block("T_add"):
+                    v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1])
+                    p_output0[v_ax0, v_ax1] = x[v_ax0, v_ax1] + p0[()]
+            for ax0, ax1 in T.grid(T.int64(10), T.int64(20)):
+                with T.block("T_add"):
+                    v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1])
+                    p_output0[v_ax0, v_ax1] = x[v_ax0, v_ax1] + p0[()]
+
+        @R.function
+        def main(
+            x: R.Tensor((10, 20), dtype="float32"), p0: R.Tensor((), dtype="float32")
+        ) -> R.Tensor((10, 20), dtype="float32"):
+            cls = Expected
+            with R.dataflow():
+                gv1: R.Tensor((10, 20), dtype="float32") = R.call_tir(
+                    cls.fused_sums,
+                    (x, p0),
+                    out_sinfo=R.Tensor((10, 20), dtype="float32"),
+                )
+                R.output(gv1)
+            return gv1
+
+    _check(Module, Expected)  # helper defined outside this hunk; presumably runs FuseTIR (confirm)
+
+
+
 if __name__ == "__main__":
     tvm.testing.main()