apache
diff --git a/‎3rdparty/cutlass_fpA_intB_gemm‎ b/‎3rdparty/cutlass_fpA_intB_gemm‎
diff --git a/‎CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cmake/modules/CUDA.cmake‎
Lines changed: 4 additions & 21 deletions b/‎cmake/modules/CUDA.cmake‎
Lines changed: 4 additions & 21 deletions
diff --git a/‎python/tvm/contrib/msc/core/codegen/codegen.py‎
Lines changed: 15 additions & 1 deletion b/‎python/tvm/contrib/msc/core/codegen/codegen.py‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎python/tvm/contrib/msc/framework/tvm/codegen/codegen.py‎
Lines changed: 15 additions & 1 deletion b/‎python/tvm/contrib/msc/framework/tvm/codegen/codegen.py‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎python/tvm/driver/tvmc/compiler.py‎
Lines changed: 28 additions & 3 deletions b/‎python/tvm/driver/tvmc/compiler.py‎
Lines changed: 28 additions & 3 deletions
diff --git a/‎python/tvm/ir/instrument.py‎
Lines changed: 18 additions & 0 deletions b/‎python/tvm/ir/instrument.py‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎python/tvm/relax/backend/dispatch_sort_scan.py‎
Lines changed: 1 addition & 0 deletions b/‎python/tvm/relax/backend/dispatch_sort_scan.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎python/tvm/relax/transform/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎python/tvm/relax/transform/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎python/tvm/relax/transform/transform.py‎
Lines changed: 17 additions & 0 deletions b/‎python/tvm/relax/transform/transform.py‎
Lines changed: 17 additions & 0 deletions
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.18)
+cmake_minimum_required(VERSION 3.24)
 project(tvm C CXX)
 
 # Utility functions
 
@@ -38,27 +38,10 @@ if(USE_CUDA)
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDA_LIBRARY})
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_NVRTC_LIBRARY})
 
-  # Compatibility with cmake 3.18+
-  #
-  # The updates to the cutlass kernels made in TVM PR#16244 require
-  # symbols provided in cuda 7.5+.  While the cuda architecture is
-  # specified by setting `NVCC_FLAGS` in the `CMakeLists.txt` for each
-  # kernel, cmake 3.18+ also sets it based on the
-  # `CMAKE_CUDA_ARCHITECTURES` value.  If not set, cmake will explicitly
-  # pass the compute capability as nvidia's default of 5.2, *EVEN IF* it
-  # has already been specified in `NVCC_FLAGS`.  Because the kernels
-  # cannot compile with compute capability of 5.2, this causes
-  # compilation errors.
-  #
-  # By setting `CMAKE_CUDA_ARCHITECTURES` to `OFF`, cmake does not add
-  # 5.2 as a target architecture.
-  #
-  # See https://cmake.org/cmake/help/latest/policy/CMP0104.html for
-  # details on CMake's policy for CUDA architecture flags.
-  #
-  # See https://cmake.org/cmake/help/latest/policy/CMP0104.html for the
-  # default CUDA architecture for each version of CUDA.
-  set(CMAKE_CUDA_ARCHITECTURES OFF)
+  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    message(STATUS "CMAKE_CUDA_ARCHITECTURES not set, using native")
+    set(CMAKE_CUDA_ARCHITECTURES native)
+  endif()
 
   if(USE_CUDNN)
     message(STATUS "Build with cuDNN support")
 
@@ -172,4 +172,18 @@ def relay_to_relax(
     def _bind_weights(mod: tvm.IRModule, folder: msc_utils.MSCDirectory) -> tvm.IRModule:
         return BindParams("main", weights)(mod)
 
-    return codegen.load(inputs, post_load=_bind_weights)
+    mod = codegen.load(inputs, post_load=_bind_weights)
+
+    mod = tvm.ir.transform.Sequential(
+        [
+            # The canonicalization of relax variable bindings is not required
+            # for correctness.  It does, however, remove trivial `x = y`
+            # bindings, preventing test cases from depending on their
+            # presence.
+            tvm.relax.transform.CanonicalizeBindings(),
+            tvm.relax.transform.ConvertToDataflow(min_size=1),
+        ],
+        name="tvm.contrib.msc.core.codegen.relay_to_relax_postproc",
+    )(mod)
+
+    return mod
@@ -71,4 +71,18 @@ def _bind_weights(mod: tvm.IRModule, folder: msc_utils.MSCDirectory) -> tvm.IRMo
         return mod
 
     codegen = CodeGen(graph, _ffi_api.GetRelaxSources, codegen_config, print_config, build_folder)
-    return codegen.load(inputs, pre_load=_save_weights, post_load=_bind_weights)
+    mod = codegen.load(inputs, pre_load=_save_weights, post_load=_bind_weights)
+
+    mod = tvm.ir.transform.Sequential(
+        [
+            # The canonicalization of relax variable bindings is not required
+            # for correctness.  It does, however, remove trivial `x = y`
+            # bindings, preventing test cases from depending on their
+            # presence.
+            tvm.relax.transform.CanonicalizeBindings(),
+            tvm.relax.transform.ConvertToDataflow(min_size=1),
+        ],
+        name="tvm.contrib.msc.framework.tvm.codegen.to_relax_postproc",
+    )(mod)
+
+    return mod
@@ -31,7 +31,7 @@
 from tvm import autotvm, auto_scheduler
 from tvm import relay
 from tvm.driver.tvmc.registry import generate_registry_args, reconstruct_registry_entity
-from tvm.ir.instrument import PassInstrument, PassTimingInstrument
+from tvm.ir.instrument import PassInstrument, PassTimingInstrument, PassPrintingInstrument
 from tvm.ir.memory_pools import WorkspaceMemoryPools
 from tvm.target import Target
 from tvm.relay.backend import Executor, Runtime
@@ -162,6 +162,18 @@ def add_compile_parser(subparsers, _, json_params):
         action="store_true",
         help="print compilation time per pass",
     )
+    parser.add_argument(
+        "--print-ir-before",
+        help="print IR before each named pass of a comma-separated list of pass names."
+        "e.g. '--print-ir-before [tir.SplitHostDevice,tir.ConvertSSA]' ",
+        default="",
+    )
+    parser.add_argument(
+        "--print-ir-after",
+        help="print IR after each named pass of a comma-separated list of pass names."
+        "e.g. '--print-ir-after [tir.SplitHostDevice,tir.ConvertSSA]' ",
+        default="",
+    )
     for one_entry in json_params:
         parser.set_defaults(**one_entry)
 
@@ -220,6 +232,8 @@ def drive_compile(args):
             workspace_pools_recombobulate(args, [workspace_pools_target], extra_targets)
         ),
         print_pass_times=args.print_pass_times,
+        print_ir_before=args.print_ir_before,
+        print_ir_after=args.print_ir_after,
         **transform_args,
     )
 
@@ -247,6 +261,8 @@ def compile_model(
     mod_name: Optional[str] = "default",
     workspace_pools: Optional[WorkspaceMemoryPools] = None,
     print_pass_times: bool = False,
+    print_ir_before: Optional[List[str]] = None,
+    print_ir_after: Optional[List[str]] = None,
     instruments: Optional[Sequence[PassInstrument]] = None,
     desired_layout: Optional[str] = None,
     desired_layout_ops: Optional[List[str]] = None,
@@ -295,7 +311,7 @@ def compile_model(
         needs to be generated.
     disabled_pass: str, optional
         Comma-separated list of passes which needs to be disabled
-        during compilation
+        during compilation.
     pass_context_configs: list[str], optional
         List of strings containing a set of configurations to be passed to the
         PassContext.
@@ -310,6 +326,10 @@ def compile_model(
         compilation.
     print_pass_times: bool
         To enable printing a breakdown of compilation times by pass. Disabled by default.
+    print_ir_before: list[str], optional
+        To print IR before each named pass of a comma-separated list of passes.
+    print_ir_after: list[str], optional
+        To print IR after each named pass of a comma-separated list of passes.
     instruments: Optional[Sequence[PassInstrument]]
         The list of pass instrument implementations.
     desired_layout: str, optional
@@ -369,6 +389,12 @@ def compile_model(
         timing_inst = PassTimingInstrument()
         instruments = [timing_inst] if instruments is None else [timing_inst] + instruments
 
+    if print_ir_before or print_ir_after:
+        print_ir_instr = PassPrintingInstrument(
+            print_before_pass_names=print_ir_before, print_after_pass_names=print_ir_after
+        )
+        instruments = [print_ir_instr] if instruments is None else [print_ir_instr] + instruments
+
     with tvm.transform.PassContext(
         opt_level=opt_level,
         config=config,
@@ -581,7 +607,6 @@ def dump_operation_offloads(mod: tvm.ir.IRModule, initial_mod: tvm.ir.IRModule,
     save_to_file = all([dump_path != "-", dump_path != ""])
 
     if print_to_console or save_to_file:
-
         operations_distribution = analyze_operations_distribution(mod)
 
         def annotate_f(x):
 
@@ -255,3 +255,21 @@ def render():
                 profiles = timing_inst.render()
         """
         return _ffi_instrument_api.RenderTimePassProfiles()
+
+
+@pass_instrument
+class PassPrintingInstrument:
+    """A pass instrument that prints the IR before and/or after named passes.
+
+    Parameters
+    ----------
+    print_before_pass_names : Optional[Container[str]]
+        Names of the passes before which the IR is printed.  Membership
+        is tested with ``in``.  ``None`` means "no pass".
+
+    print_after_pass_names : Optional[Container[str]]
+        Names of the passes after which the IR is printed.  Membership
+        is tested with ``in``.  ``None`` means "no pass".
+    """
+
+    def __init__(self, print_before_pass_names, print_after_pass_names):
+        # Callers may request only one of the two printing modes and leave
+        # the other as None.  Normalize None to an empty list so that the
+        # `in` checks below do not raise TypeError.
+        self.print_before_pass_names = (
+            [] if print_before_pass_names is None else print_before_pass_names
+        )
+        self.print_after_pass_names = (
+            [] if print_after_pass_names is None else print_after_pass_names
+        )
+
+    def run_before_pass(self, mod, pass_info):
+        """Print `mod` if the pass about to run was requested by name."""
+        if pass_info.name in self.print_before_pass_names:
+            print(f"Print IR before: {pass_info.name}\n{mod}\n\n")
+
+    def run_after_pass(self, mod, pass_info):
+        """Print `mod` if the pass that just ran was requested by name."""
+        if pass_info.name in self.print_after_pass_names:
+            print(f"Print IR after: {pass_info.name}\n{mod}\n\n")
@@ -116,6 +116,7 @@ def visit_call_(self, call: relax.Call) -> relax.Expr:
             tir_call = self.builder_.call_te(
                 te_func,
                 call.args[0],
+                k=call.attrs.k,
                 axis=call.attrs.axis,
                 ret_type=call.attrs.ret_type,
                 is_ascend=not call.attrs.largest,
 
@@ -17,6 +17,7 @@
 """Relax transformations. """
 
 from .transform import (
+    AdjustMatmulOrder,
     AllocateWorkspace,
     AlterOpImpl,
     AnnotateTIROpPattern,
 
@@ -1249,6 +1249,23 @@ def UpdateParamStructInfo(sinfo_func: Callable[[Var], Optional[StructInfo]]):
     return _ffi_api.UpdateParamStructInfo(sinfo_func)  # type: ignore
 
 
+def AdjustMatmulOrder():
+    """Create a pass that rewrites `x*(A*B)` as `(x*A)*B`.
+
+    Intended for LoRA computations: `matmul(x, LoraA*LoraB)` may
+    instead be evaluated as `matmul(matmul(x, LoraA), LoraB)`, which
+    reduces the total memory usage.
+
+    Returns
+    -------
+    ret : tvm.transform.Pass
+        The corresponding pass.
+    """
+
+    reorder_pass = _ffi_api.AdjustMatmulOrder()  # type: ignore
+    return reorder_pass
+
+
 def ReorderTakeAfterMatmul():
     """Reorder `matmul(x, take(weights, indices))` to `take(matmul(x,weights),indices)`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-cmake_minimum_required(VERSION 3.18)`
	`1`	`+cmake_minimum_required(VERSION 3.24)`
`2`	`2`	`project(tvm C CXX)`
`3`	`3`
`4`	`4`	`# Utility functions`