
Commit 4b60b69 (1 parent: 4e505c0)
[Unity][Transform] Extract partial-tuple-usage from FuseTIR
Prior to this commit, the `FuseTIR` pass explicitly tracked usage of tuple arguments in order to minimize the set of arguments provided to each kernel. This additional tracking and handling of partially-used tuples made it difficult to follow the primary changes being made by `FuseTIR`.

This commit implements the same functionality in terms of the `ExpandTupleArguments` and `RemoveUnusedParameters` transforms, introduced in #16115 and #16116 respectively. By running these passes first, partial tuple usage is already handled by the time the main `FuseTIR` logic runs.

This commit is intended to minimize any changes to user-facing behavior, so these pre-process passes are currently applied internally by `FuseTIR`. This internal delegation may be avoided in the future by pulling it out into a lowering pipeline.
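As a rough illustration of the new flow (a sketch, not code from this commit: the module and names below are hypothetical, and assume the two passes are exposed under `tvm.relax.transform` as introduced in #16115 and #16116):

```python
import tvm
from tvm import relax
from tvm.script import ir as I, relax as R


@I.ir_module
class Module:
    @R.function
    def main(
        a: R.Tensor([16], "float32"), b: R.Tensor([16], "float32")
    ) -> R.Tensor([16], "float32"):
        cls = Module
        gv = cls.subroutine(R.tuple(a, b))
        return gv

    @R.function(private=True)
    def subroutine(
        pair: R.Tuple(R.Tensor([16], "float32"), R.Tensor([16], "float32")),
    ) -> R.Tensor([16], "float32"):
        # Only the first tuple field is used; the second is dead.
        gv = pair[0]
        return gv


# ExpandTupleArguments replaces the tuple parameter with one parameter per
# field; RemoveUnusedParameters then drops the untouched field.
seq = tvm.transform.Sequential(
    [
        relax.transform.ExpandTupleArguments(),
        relax.transform.RemoveUnusedParameters(),
    ]
)
print(seq(Module))
```

After the sequence runs, `subroutine` should accept a single tensor parameter, so the fused-kernel construction never sees a partially-used tuple.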

2 files changed: +18 −25 lines
src/relax/transform/fuse_tir.cc

Lines changed: 13 additions & 20 deletions
```diff
@@ -376,13 +376,6 @@ class FusedTIRConstructor : public ExprVisitor {
       }
     }
 
-    PostOrderVisit(func->body, [=, &tuple_param](Expr e) {
-      if (auto tup_get = e.as<TupleGetItemNode>();
-          tup_get && tuple_param.count(tup_get->tuple.get())) {
-        func_info_.used_tuple_field_indices[tup_get->tuple.get()].insert(tup_get->index);
-      }
-    });
-
     for (const Var& relax_param : func->params) {
       auto sinfo = GetStructInfo(relax_param);
       if (sinfo->IsInstance<ShapeStructInfoNode>()) {
@@ -397,7 +390,7 @@ class FusedTIRConstructor : public ExprVisitor {
       int index = 0;
       Array<tir::Var> params;
       Array<tir::Buffer> buffers;
-      for (auto i : func_info_.used_tuple_field_indices[relax_param.get()]) {
+      for (size_t i = 0; i < tuple->fields.size(); i++) {
         auto [ret_params, ret_buffers] =
             CreateParamsAndBuffers(tuple->fields[i], relax_param->name_hint(), index);
         ICHECK_EQ(ret_params.size(), ret_buffers.size());
@@ -529,12 +522,7 @@ class FusedTIRConstructor : public ExprVisitor {
     int end_buf_idx = 0;
     const TupleType& tuple_type = Downcast<TupleType>(tuple_get_item->tuple->checked_type());
     for (int i = 0; i < tuple_get_item->index; ++i) {
-      auto it = func_info_.used_tuple_field_indices.find(tuple_get_item->tuple.get());
-      // If this tuple is not passed as a parameter, or if the field at the index i is actually
-      // used, the corresponding buffer needs to be taken into account by this function.
-      if (it == func_info_.used_tuple_field_indices.end() || it->second.count(i)) {
-        begin_buf_idx += GetTotalTensorSize(tuple_type->fields[i]);
-      }
+      begin_buf_idx += GetTotalTensorSize(tuple_type->fields[i]);
     }
     end_buf_idx = begin_buf_idx + GetTotalTensorSize(tuple_type->fields[tuple_get_item->index]);
     func_info_.expr2buffers.Set(
@@ -835,8 +823,6 @@ class FusedTIRConstructor : public ExprVisitor {
   std::string global_name = "fused";
   /*! \brief The map from symbolic var to its corresponding var in the fused function */
   tir::SymbolicMatcher symbolic_var_matcher = tir::SymbolicMatcher(&symbolic_var_remap);
-  /*! \brief Record indices of tuple fields that are actually accessed. */
-  std::unordered_map<const Object*, std::unordered_set<size_t>> used_tuple_field_indices;
 };
 
 /*! \brief The IRModule */
@@ -1040,10 +1026,17 @@ namespace transform {
 Pass FuseTIR() {
   runtime::TypedPackedFunc<IRModule(IRModule, PassContext)> pass_func =  //
       [=](IRModule m, PassContext pc) { return relax::FuseTIR(m); };
-  return CreateModulePass(/*pass_function=*/pass_func,  //
-                          /*opt_level=*/0,              //
-                          /*pass_name=*/"FuseTIR",      //
-                          /*required=*/{});
+  auto inner_pass = CreateModulePass(/*pass_function=*/pass_func,   //
+                                     /*opt_level=*/0,               //
+                                     /*pass_name=*/"FuseTIRInner",  //
+                                     /*required=*/{});
+  return tvm::transform::Sequential(
+      {
+          ExpandTupleArguments(),
+          RemoveUnusedParameters(),
+          inner_pass,
+      },
+      "FuseTIR");
 }
 
 TVM_REGISTER_GLOBAL("relax.transform.FuseTIR").set_body_typed(FuseTIR);
```
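For reference, a rough Python-level sketch of the same composition (the real pass is constructed in C++ as shown above; `FuseTIRInner` is built internally and has no Python constructor, so it appears here only as a comment):

```python
import tvm
from tvm import relax

# Sketch only: approximates the structure of the new FuseTIR pass.
fuse_tir_like = tvm.transform.Sequential(
    [
        relax.transform.ExpandTupleArguments(),
        relax.transform.RemoveUnusedParameters(),
        # ...followed by the original FuseTIR logic ("FuseTIRInner"),
        # which is constructed internally in C++ and not exposed here.
    ],
    name="FuseTIR",
)
```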

tests/python/relax/test_transform_fuse_tir.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -205,7 +205,7 @@ def fused_exp_squeeze(x):
     with bb.function("main", [x]):
         with bb.dataflow():
             lv = bb.emit_te(fused_exp_squeeze, x)
-            lv2 = bb.emit_te(fused_exp_squeeze, lv)
+            lv2 = bb.call_te(fused_exp_squeeze, lv)
             gv = bb.emit_output(lv2)
         bb.emit_func_output(gv)
     return bb.get()
```
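The test updates replace `bb.emit_te` with `bb.call_te` where the result feeds directly into another emit. A minimal sketch of that distinction, assuming the public `relax.BlockBuilder` API:

```python
from tvm import relax, topi
from tvm.script import relax as R

bb = relax.BlockBuilder()
x = relax.Var("x", R.Tensor([10, 20], "float32"))
with bb.function("main", [x]):
    with bb.dataflow():
        # emit_te builds a call into a TE-derived PrimFunc and binds it to a var.
        lv = bb.emit_te(topi.exp, x)
        # call_te only constructs the call expression; emit_output provides
        # the binding instead of a separate emit.
        gv = bb.emit_output(bb.call_te(topi.exp, lv))
    bb.emit_func_output(gv)
mod = bb.get()
```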
```diff
@@ -245,7 +245,7 @@ def fused_exp_exp_squeeze(x):
     x = relax.Var("x", R.Tensor([10, 20], "float32"))
     with bb.function("main", [x]):
         with bb.dataflow():
-            lv = bb.emit_te(fused_exp_exp_squeeze, x)
+            lv = bb.call_te(fused_exp_exp_squeeze, x)
             gv = bb.emit_output(lv)
         bb.emit_func_output(gv)
     return bb.get()
@@ -373,7 +373,7 @@ def fused_exp_squeeze(x):
     with bb.function("main", [x]):
         with bb.dataflow():
             lv = bb.emit_te(fused_exp_squeeze, x)
-            lv2 = bb.emit_te(topi.add, lv, relax.const(1, "float32"))
+            lv2 = bb.call_te(topi.add, lv, relax.const(1, "float32"))
             gv = bb.emit_output(lv2)
         bb.emit_func_output(gv)
     return bb.get()
@@ -414,7 +414,7 @@ def fused_add_exp_squeeze(x, y):
     x = relax.Var("x", R.Tensor([10, 20], "float32"))
     with bb.function("main", [x]):
         with bb.dataflow():
-            lv = bb.emit_te(fused_add_exp_squeeze, x, relax.const(1, "float32"))
+            lv = bb.call_te(fused_add_exp_squeeze, x, relax.const(1, "float32"))
             gv = bb.emit_output(lv)
         bb.emit_func_output(gv)
     return bb.get()
```
```diff
@@ -1115,7 +1115,7 @@ def reshape(
                     (v_ax2 * T.int64(64) + v_ax3) % T.int64(2048),
                 ]
 
-            @R.function
+            @R.function(private=True)
             def fused_reshape(
                 lv: R.Tuple(
                     R.Tensor((4, 8, 2048), dtype="float32"), R.Tensor((4, 8, 2048), dtype="float32")
```
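The expected output now marks the fused function with `@R.function(private=True)`. As I understand it, a private function carries no `global_symbol` attribute, which is what lets signature-changing passes such as `RemoveUnusedParameters` rewrite it. A minimal sketch:

```python
from tvm.script import ir as I, relax as R


@I.ir_module
class Example:
    @R.function(private=True)  # no "global_symbol" attribute is attached
    def fused_fn(x: R.Tensor((4,), "float32")) -> R.Tensor((4,), "float32"):
        return x


print(Example["fused_fn"].attrs)  # no "global_symbol" entry for a private function
```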
