[Codegen] Emit tir::Let as var assignment explicitly

MasterJH5574 · MasterJH5574 · commit 5457abb49778 · 2024-08-19T10:06:50.000-04:00
Prior to this PR, the PrimExpr `tir::Let` was inlined during codegen,
which rendered any common subexpression elimination (CSE) effort that
uses `tir::Let` at the TIR level ineffective.

This PR updates codegen so that `tir::Let` is emitted as an explicit
variable assignment, so the generated code reflects the CSE efforts.
diff --git a/python/tvm/relax/frontend/nn/op.py b/python/tvm/relax/frontend/nn/op.py
@@ -2544,7 +2544,7 @@ def _cumsum_mask(cumsum_sorted, top_p, top_k, i, j):
 
     @T.prim_func(private=True)
     def _get_renorm_prob(A: T.handle, B: T.handle, C: T.handle, D: T.handle):
-        batch, vocab_size = T.int64(), T.int64()
+        batch, vocab_size = T.int64(is_size_var=True), T.int64(is_size_var=True)
         cumsum_sorted = T.match_buffer(A, (batch, vocab_size), prob_dtype)
         top_p = T.match_buffer(B, (batch, 1), prob_dtype)
         top_k = T.match_buffer(C, (batch, 1), index_dtype)
@@ -2564,8 +2564,8 @@ def _get_renorm_prob(A: T.handle, B: T.handle, C: T.handle, D: T.handle):
     def _get_index_from_sorted(
         A: T.handle, B: T.handle, C: T.handle, D: T.handle, E: T.handle, F: T.handle
     ):
-        batch, vocab_size = T.int64(), T.int64()
-        out_batch = T.int64()
+        batch, vocab_size = T.int64(is_size_var=True), T.int64(is_size_var=True)
+        out_batch = T.int64(is_size_var=True)
         cumsum_sorted = T.match_buffer(A, (batch, vocab_size), prob_dtype)
         indices = T.match_buffer(B, (batch, vocab_size), index_dtype)
         renorm_prob = T.match_buffer(C, (batch, 1), prob_dtype)
diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc
@@ -887,8 +887,27 @@ void CodeGenC::VisitExpr_(const LetNode* op, std::ostream& os) {  // NOLINT(*)
     let_binding_[op->var] = op;
   }
   std::string value = PrintExpr(op->value);
-  var_idmap_[op->var.get()] = value;
+  if (print_ssa_form_) {
+    ICHECK(!var_idmap_.count(op->var.get()));
+    var_idmap_[op->var.get()] = value;
+  } else {
+    PrintIndent();
+    if (op->var.dtype() == DataType::Handle() && handle_data_type_.count(op->var.get())) {
+      PrintType(handle_data_type_.at(op->var.get()), this->stream);
+      this->stream << "* " << AllocVarID(op->var.get()) << " = (";
+      PrintType(handle_data_type_.at(op->var.get()), this->stream);
+      this->stream << "*)" << value << ";\n";
+    } else {
+      PrintType(op->var.dtype(), this->stream);
+      this->stream << ' ' << AllocVarID(op->var.get()) << " = " << value << ";\n";
+    }
+  }
   os << PrintExpr(op->body);
+  // Pop the defined var from var_idmap_ when exiting its scope.
+  // We do this because it is hard to completely avoid the same LetNode
+  // appearing at different places.
+  bool removed = var_idmap_.erase(op->var.get());
+  ICHECK(removed);
 }
 
 void CodeGenC::VisitExpr_(const RampNode* op, std::ostream& os) {  // NOLINT(*)
diff --git a/tests/python/relax/test_frontend_nn_op.py b/tests/python/relax/test_frontend_nn_op.py
@@ -947,7 +947,7 @@ def foo(
     class Expected:
         @T.prim_func(private=True)
         def get_index_from_sorted(A: T.handle, B: T.handle, C: T.handle, D: T.handle, E: T.handle, F: T.handle):
-            batch, vocab_size = T.int64(), T.int64()
+            batch, vocab_size = T.int64(is_size_var=True), T.int64(is_size_var=True)
             cumsum_sorted = T.match_buffer(A, (batch, vocab_size))
             indices = T.match_buffer(B, (batch, vocab_size), "int64")
             renorm_prob = T.match_buffer(C, (batch, 1))
@@ -970,7 +970,7 @@ def get_index_from_sorted(A: T.handle, B: T.handle, C: T.handle, D: T.handle, E:
 
         @T.prim_func(private=True)
         def get_renorm_prob(A: T.handle, B: T.handle, C: T.handle, D: T.handle):
-            batch, vocab_size = T.int64(), T.int64()
+            batch, vocab_size = T.int64(is_size_var=True), T.int64(is_size_var=True)
             cumsum_sorted = T.match_buffer(A, (batch, vocab_size))
             top_p = T.match_buffer(B, (batch, 1))
             top_k = T.match_buffer(C, (batch, 1), "int64")