Commit dadf5d3

[Pass] Attach memory-planning attributes for dynamic func output (#1604)
This PR adds a pass to the model compilation pipeline that attaches the attribute `"relax.memory_plan_dynamic_func_output"` to each Relax function in the IRModule. The attribute indicates that the functions' output tensors, despite having dynamic shapes, are statically plannable. This ensures that in serving scenarios our memory allocation is completely static once it has stabilized, so we no longer need to worry about ever-growing memory usage and can allocate more memory to the KV cache. This PR can be merged early, but it will not take effect until apache/tvm#16111 is merged.
1 parent 3e9d185 commit dadf5d3
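
For illustration only (not part of this commit), here is a minimal sketch of the kind of Relax function the attribute targets: the output tensor's shape depends on a symbolic dimension, so it is dynamic, yet once that dimension is bounded (e.g. by `AttachVariableBounds`) its allocation can still be planned statically. The module, function name, and shapes below are hypothetical.

```python
# Hypothetical example (not from this PR): a Relax function whose output
# tensor has a dynamic first dimension `seq_len`.
from tvm.script import ir as I, relax as R


@I.ir_module
class ExampleModule:
    @R.function
    def prefill(
        x: R.Tensor(("seq_len", 4096), "float16")
    ) -> R.Tensor(("seq_len", 4096), "float16"):
        # The output shares the input's dynamic shape; with an upper bound on
        # `seq_len`, its buffer can still be allocated statically.
        y = R.add(x, x)
        return y
```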

File tree: 2 files changed, +18 −1 lines


python/mlc_chat/compiler_pass/attach_to_ir_module.py

Lines changed: 12 additions & 0 deletions
@@ -32,3 +32,15 @@ def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassContext) -> IRModule:
         for func_name, func in self.functions.items():
             mod[func_name] = func.with_attr("global_symbol", func_name)
         return mod
+
+
+@tvm.transform.module_pass(opt_level=0, name="AttachMemoryPlanAttr")
+class AttachMemoryPlanAttr:  # pylint: disable=too-few-public-methods
+    """Attach memory planning attribute for dynamic function output planning to Relax functions."""
+
+    def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassContext) -> IRModule:
+        """Entrypoint"""
+        for g_var, func in mod.functions_items():
+            if isinstance(func, relax.Function):
+                mod[g_var] = func.with_attr("relax.memory_plan_dynamic_func_output", True)
+        return mod
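
A hedged usage sketch, not part of the diff: instantiate the decorated pass and apply it to a module directly, then inspect the attached attribute. The toy module below is hypothetical; `functions_items()` and the attribute key come from the code above.

```python
from tvm import relax
from tvm.script import ir as I, relax as R

from mlc_chat.compiler_pass.attach_to_ir_module import AttachMemoryPlanAttr


@I.ir_module
class Module:
    @R.function
    def main(x: R.Tensor(("n", 8), "float32")) -> R.Tensor(("n", 8), "float32"):
        y = R.add(x, x)
        return y


# The module_pass-decorated class is instantiated and called like any TVM pass.
mod = AttachMemoryPlanAttr()(Module)

for g_var, func in mod.functions_items():
    if isinstance(func, relax.Function):
        # Every Relax function should now carry the planning hint.
        print(g_var.name_hint, func.attrs["relax.memory_plan_dynamic_func_output"])
```

In the full pipeline (see the second file below) this pass runs in Phase 0; the downstream memory-planning change in apache/tvm#16111 is what actually consumes the attribute.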

python/mlc_chat/compiler_pass/pipeline.py

Lines changed: 6 additions & 1 deletion
@@ -10,7 +10,11 @@
 
 from mlc_chat.support import logging
 
-from .attach_to_ir_module import AttachAdditionalPrimFuncs, AttachVariableBounds
+from .attach_to_ir_module import (
+    AttachAdditionalPrimFuncs,
+    AttachMemoryPlanAttr,
+    AttachVariableBounds,
+)
 from .clean_up_tir_attrs import CleanUpTIRAttrs
 from .cublas_dispatch import CublasDispatch
 from .estimate_memory_usage import AttachMetadataWithMemoryUsage
@@ -81,6 +85,7 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.IRModule:
                 PruneRelaxFunc(flashinfer=flashinfer),
                 AttachVariableBounds(variable_bounds),
                 AttachAdditionalPrimFuncs(additional_tirs),
+                AttachMemoryPlanAttr(),
                 _DebugDump("debug-phase0.py", debug_dump, show_meta=False),
                 # Phase 1. Passes on high-level operator graph
                 _LogProgress("Running TVM Relax graph-level optimizations"),
