
Commit 04eef9a

yelite authored and sunggg committed
Cutlass offload (mlc-ai#2)
* Add cutlass offload
* Add comments
1 parent 5056620 commit 04eef9a

6 files changed (+68, -23 lines)


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ __pycache__/
 *.so
 
 build*
+!build.py
 
 *.ll
 .npm

build.py

Lines changed: 24 additions & 7 deletions
@@ -8,10 +8,12 @@
 import tvm
 from tvm import meta_schedule as ms
 from tvm import relax
+from tvm.relax.backend.pattern_registry import get_pattern
 
 import mlc_llm
 from mlc_llm import utils
 from mlc_llm.relax_model import gpt_neox, llama, moss
+from mlc_llm.transform import rewrite_attention
 
 
 def _parse_args():
@@ -36,6 +38,7 @@ def _parse_args():
         choices=[*utils.quantization_dict.keys()],
         default=list(utils.quantization_dict.keys())[0],
     )
+    args.add_argument("--cutlass-offload", action="store_true", default=False)
     args.add_argument("--max-seq-len", type=int, default=-1)
     args.add_argument("--target", type=str, default="auto")
     args.add_argument(
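The new flag defaults to off and, per the mod_transform_before_build change below, only takes effect when the target kind resolves to "cuda". A hypothetical invocation (all other build.py options elided) would therefore look like:

    python build.py --target cuda --cutlass-offload ...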
@@ -185,6 +188,8 @@ def debug_dump_script(mod, name, args):
         return
     dump_path = os.path.join(args.artifact_path, "debug", name)
     with open(dump_path, "w", encoding="utf-8") as outfile:
+        # Remove runtime modules from external codegen so that the IR module can be printed.
+        mod = mod.without_attr("external_mods").without_attr("const_name_to_constant")
         outfile.write(mod.script(show_meta=True))
     print(f"Dump mod to {dump_path}")
 
@@ -240,11 +245,23 @@ def mod_transform_before_build(
         storage_nbit=args.quantization.storage_nbit,
         dtype=args.quantization.model_dtype,
     )(mod)
-    mod = mlc_llm.transform.FuseTransposeMatmul()(mod)  # pylint: disable=not-callable
-    mod = relax.pipeline.get_pipeline()(mod)  # pylint: disable=no-value-for-parameter
-    mod = mlc_llm.transform.FuseDecodeMatmulEwise(  # pylint: disable=not-callable
-        args.quantization.model_dtype, args.target_kind
-    )(mod)
+    if args.target_kind == "cuda" and args.cutlass_offload:
+        from tvm.relax.backend.contrib.cutlass import partition_for_cutlass
+
+        debug_dump_script(mod, "mod_before_cutlass.py", args)
+        mod = partition_for_cutlass(mod)
+        debug_dump_script(mod, "mod_after_cutlass_partition.py", args)
+        codegen_pass = relax.transform.RunCodegen(
+            {"cutlass": {"sm": 80, "find_first_valid": False}},
+            entry_functions=model_names,
+        )
+        mod = codegen_pass(mod)
+        debug_dump_script(mod, "mod_after_cutlass_codegen.py", args)
+
+    mod = mlc_llm.transform.FuseTransposeMatmul()(mod)
+
+    mod = relax.pipeline.get_pipeline()(mod)
+    mod = mlc_llm.transform.FuseDecodeMatmulEwise(args.dtype)(mod)
     mod = relax.transform.DeadCodeElimination(model_names)(mod)
     mod = relax.transform.LiftTransformParams()(mod)
     mod_transform, mod_deploy = utils.split_transform_deploy_mod(mod, model_names)
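The two added passes are the standard Relax BYOC flow. A minimal standalone sketch of the same sequence, assuming mod is a Relax IRModule and model_names lists its entry functions (both taken from the hunk above):

    from tvm import relax
    from tvm.relax.backend.contrib.cutlass import partition_for_cutlass

    def offload_to_cutlass(mod, model_names, sm=80):
        # Group CUTLASS-supported subgraphs (matmul, attention, ...) into composite
        # functions annotated for the "cutlass" external codegen.
        mod = partition_for_cutlass(mod)
        # Compile the partitioned functions with CUTLASS; the generated runtime modules
        # are attached to the IRModule as the "external_mods" attribute (see the
        # debug_dump_script and utils.py changes in this commit).
        mod = relax.transform.RunCodegen(
            {"cutlass": {"sm": sm, "find_first_valid": False}},
            entry_functions=model_names,
        )(mod)
        return mod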
@@ -317,10 +334,10 @@ def build(mod_deploy: tvm.IRModule, args: argparse.Namespace) -> None:
     ex = relax.build(mod_deploy, args.target, system_lib=args.system_lib)
 
     output_filename = (
-        f"{args.model}-{args.quantization.name}-{target_kind}.{args.lib_format}"
+        f"{args.model}-{args.quantization.name}-{target_kind}_{args.dtype}.{args.lib_format}"
     )
 
-    debug_dump_shader(ex, f"{args.model}_{args.quantization.name}_{target_kind}", args)
+    debug_dump_shader(ex, f"{args.model}_{args.quantization.name}_{target_kind}_{args.dtype}", args)
     lib_path = os.path.join(args.artifact_path, output_filename)
     ex.export_library(lib_path, **args.export_kwargs)
     print(f"Finish exporting to {lib_path}")

mlc_llm/relax_model/llama.py

Lines changed: 5 additions & 16 deletions
@@ -303,6 +303,8 @@ def forward(
                 attention_mask.struct_info.shape.values,
                 (bsz, tvm.tir.IntImm("int64", 1), q_len, kv_seq_len),
             )
+
+            attn_weights = nn.emit(relax.op.add(attn_weights, attention_mask))
 
             attn_weights = nn.emit(
                 maximum(
@@ -315,12 +317,7 @@ def forward(
             )
             attn_weights = nn.emit(relax.op.minimum(attn_weights, attention_mask))
 
-        # upcast attention to fp32
-        if attn_weights.struct_info.dtype != "float32":
-            attn_weights = astype(attn_weights, "float32")
         attn_weights = nn.emit(softmax(attn_weights, axis=-1))
-        if attn_weights.struct_info.dtype != query_states.struct_info.dtype:
-            attn_weights = astype(attn_weights, query_states.struct_info.dtype)
         attn_output = nn.emit(matmul(attn_weights, value_states))
 
         tvm.ir.assert_structural_equal(
@@ -402,7 +399,7 @@ def min_max_triu_te():
         return te.compute(
             (tgt_len, tgt_len),
             lambda i, j: tvm.tir.Select(
-                j > i, tvm.tir.min_value(dtype), tvm.tir.max_value(dtype)
+                j > i, tvm.tir.min_value(dtype), tvm.tir.FloatImm(dtype, 0)
             ),
             name="make_diag_mask_te",
         )
@@ -416,9 +413,7 @@ def extend_te(x, tgt_len, src_len):
         return te.compute(
             (bsz, 1, tgt_len, src_len),
             lambda b, _, i, j: te.if_then_else(
-                j < src_len - tgt_len,
-                tvm.tir.max_value(dtype),
-                x[b, _, i, j - (src_len - tgt_len)],
+                j < src_len - tgt_len, tvm.tir.FloatImm(dtype, 0), x[b, _, i, j - (src_len - tgt_len)]
             ),
             name="concat_te",
         )
@@ -451,13 +446,7 @@ def _prepare_decoder_attention_mask(self, input_shape, src_len, dtype):
             # Get src_len from input parameters
             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            bsz, tgt_len = input_shape
-            combined_attention_mask = nn.emit(
-                relax.op.full(
-                    (bsz, 1, tgt_len, src_len),
-                    relax.const(tvm.tir.max_value(dtype).value, dtype),
-                    dtype,
-                )
-            )
+            combined_attention_mask = nn.emit(relax.op.full((bsz, 1, tgt_len, src_len), relax.const(0, dtype), dtype))
         return combined_attention_mask
 
     def forward(
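Taken together, the llama.py changes switch the causal mask from the old min/max clamp encoding to an additive-bias convention: visible positions now carry 0 instead of max_value(dtype), and the mask is added to the attention scores before softmax. The fp32 upcast around softmax is dropped, presumably so the whole matmul -> add-bias -> softmax -> matmul chain stays in one dtype and matches the rewrite_attention pattern added by this commit. A small NumPy sketch of the additive convention (illustrative only, not part of the diff):

    import numpy as np

    # Visible positions contribute 0, masked positions contribute the dtype minimum,
    # so adding the mask drives masked logits toward -inf before softmax.
    scores = np.array([[1.0, 0.5, 0.2]], dtype="float32")                    # raw attention logits
    mask = np.array([[0.0, 0.0, np.finfo("float32").min]], dtype="float32")  # last position masked
    logits = scores + mask
    probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)
    # probs is approximately [[0.62, 0.38, 0.0]]: the masked position gets no attention weight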

mlc_llm/transform/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -2,3 +2,5 @@
 from .lift_tir_global_buffer_alloc import LiftTIRGlobalBufferAlloc
 from .quantization import GroupQuantize
 from .transpose_matmul import FuseTransposeMatmul
+from .decode_matmul_ewise import FuseDecodeMatmulEwise
+from .rewrite_attention import rewrite_attention
mlc_llm/transform/rewrite_attention.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+from tvm.relax.dpl import PatternContext, is_const, is_op, rewrite_call, wildcard
+from tvm.script import relax as R
+
+
+def rewrite_attention(f):
+    Q = wildcard()
+    K = wildcard()
+    V = wildcard()
+    bias = wildcard()
+
+    Q_BNSH = is_op("relax.permute_dims")(Q)
+    K_BNSH = is_op("relax.permute_dims")(K)
+    V_BNSH = is_op("relax.permute_dims")(V)
+
+    K_BNSH_T = is_op("relax.permute_dims")(K_BNSH)
+
+    matmul1 = is_op("relax.matmul")(Q_BNSH, K_BNSH_T)
+    divide = is_op("relax.divide")(matmul1, is_const())
+    with_bias = is_op("relax.add")(divide, bias)
+    softmax = is_op("relax.nn.softmax")(with_bias)
+    matmul2 = is_op("relax.matmul")(softmax, V_BNSH)
+
+    pattern = is_op("relax.permute_dims")(matmul2)
+
+    def callback(_, matchings):
+        return R.nn.attention(matchings[Q], matchings[K], matchings[V], matchings[bias])
+
+    return rewrite_call(pattern, callback, f)
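rewrite_attention works on one Relax function at a time: it matches the permute_dims -> matmul -> divide-by-constant -> add-bias -> softmax -> matmul -> permute_dims chain emitted by the attention layers and rewrites it into a single R.nn.attention call that the CUTLASS backend can offload. A hypothetical driver for applying it across a whole module (assuming mod is a Relax IRModule; this helper is not part of the commit):

    from tvm import relax
    from mlc_llm.transform import rewrite_attention

    def rewrite_attention_in_module(mod):
        # Run the pattern rewrite over every Relax function in the module.
        for gv, func in mod.functions.items():
            if isinstance(func, relax.Function):
                mod[gv] = rewrite_attention(func)
        return mod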

mlc_llm/utils.py

Lines changed: 8 additions & 0 deletions
@@ -104,6 +104,14 @@ def split_transform_deploy_mod(
     )
     mod_deploy = relax.transform.DeadCodeElimination(model_names)(mod_deploy)
 
+    # Copy the runtime module from external codegen
+    mod_deploy = mod_deploy.with_attrs(
+        {
+            "external_mods": mod.get_attr("external_mods"),
+            "const_name_to_constant": mod.get_attr("const_name_to_constant"),
+        }
+    )
+
     return mod_transform, mod_deploy
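This mirrors the debug_dump_script change in build.py: RunCodegen stores the compiled CUTLASS runtime modules in the module-level "external_mods" attribute and their bound constants in "const_name_to_constant", and the split produces a fresh deploy module without those attributes, so they must be copied over explicitly or relax.build would have nothing to link against. A hypothetical sanity check after the split (not part of the commit):

    # If the source module carried BYOC artifacts, the deploy module must too.
    if mod.get_attr("external_mods") is not None:
        assert mod_deploy.get_attr("external_mods") is not None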