
Commit 1ac89bc

committed
...
1 parent 2bbd9d7 commit 1ac89bc

7 files changed: +852 −852 lines changed

python/mlc_chat/model/gpt2/gpt2_model.py

Lines changed: 7 additions & 21 deletions
@@ -3,13 +3,13 @@
 TODO: add docstring
 """
 import dataclasses
-import math
 from typing import Any, Dict, Optional
 
 from tvm import te, tir
 from tvm.relax.frontend import nn
 from tvm.relax.frontend.nn import Tensor, op
 
+from mlc_chat import op as op_ext
 from mlc_chat.support import logging
 from mlc_chat.support.config import ConfigBase
 from mlc_chat.support.style import bold
@@ -110,29 +110,15 @@ def forward(
 
         self.k_cache.append(op.squeeze(k, axis=0))
         self.v_cache.append(op.squeeze(v, axis=0))
-        k = op.reshape(self.k_cache.view(t), (b, t, h, d))
-        v = op.reshape(self.v_cache.view(t), (b, t, h, d))
-
-        q = q.permute_dims([0, 2, 1, 3])  # [b, h, s, d]
-        k = k.permute_dims([0, 2, 1, 3])  # [b, h, t, d]
-        v = v.permute_dims([0, 2, 1, 3])  # [b, h, t, d]
-
-        attn_weights = op.matmul(
-            q, k.permute_dims([0, 1, 3, 2])  # [b, h, s, d] x [b, h, d, t] = [b, h, s, t]
-        ) / math.sqrt(d)
+        k = self.k_cache.view(t)
+        v = self.v_cache.view(t)
 
         if self.scale_attn_by_inverse_layer_idx:
-            attn_weights = attn_weights / float(self.layer_idx + 1)
-
-        dtype = attn_weights.dtype
-        attn_weights = attn_weights.maximum(tir.min_value(dtype)).minimum(attention_mask)
-        if dtype == "float32":
-            attn_weights = op.softmax(attn_weights, axis=-1)
+            attn_score_scaling_factor = 1.0 / float(self.layer_idx + 1)
         else:
-            attn_weights = op.softmax(attn_weights.astype("float32"), axis=-1).astype(dtype)
-        # [b, h, s, t] x [b, h, t, d] => [b, h, s, d] => [b, s, h, d]
-        output = op.matmul(attn_weights, v)
-        return self.c_proj(output.permute_dims([0, 2, 1, 3]).reshape((b, s, h * d)))
+            attn_score_scaling_factor = 1.0
+        output = op_ext.attention(q, k, v, attention_mask, attn_score_scaling_factor)
+        return self.c_proj(output)
 
 
 class GPT2MLP(nn.Module):
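
Note: the removed GPT-2-specific attention math is now delegated to op_ext.attention, with scale_attn_by_inverse_layer_idx folded into a single multiplicative factor. A minimal NumPy sketch (illustrative only, not the compiled TVM path) of why dividing the scores by (layer_idx + 1) is the same as passing attn_score_scaling_factor = 1 / (layer_idx + 1):

import numpy as np

def scores_old(q, k, d, layer_idx):
    # old GPT-2 path: scale scores by 1/sqrt(d), then divide by (layer_idx + 1)
    s = q @ k.transpose(0, 1, 3, 2) / np.sqrt(d)   # [b, h, s, t]
    return s / float(layer_idx + 1)

def scores_new(q, k, d, layer_idx):
    # new path: identical math expressed as a scaling factor handed to the kernel
    factor = 1.0 / float(layer_idx + 1)
    return q @ k.transpose(0, 1, 3, 2) / np.sqrt(d) * factor

rng = np.random.default_rng(0)
q = rng.standard_normal((1, 2, 3, 4))   # [b, h, s, d]
k = rng.standard_normal((1, 2, 5, 4))   # [b, h, t, d]
assert np.allclose(scores_old(q, k, 4, 7), scores_new(q, k, 4, 7))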

python/mlc_chat/model/mixtral/mixtral_model.py

Lines changed: 49 additions & 83 deletions
@@ -1,12 +1,9 @@
-"""
-Implementation for Mistral architecture.
-"""
+"""Implementation for Mistral architecture."""
 import dataclasses
 
-from tvm import te, tir
+from tvm import tir
 from tvm.relax.frontend import nn
 from tvm.relax.frontend.nn import Tensor, op
-from tvm.topi.cuda.scan import inclusive_scan
 
 from mlc_chat import op as op_ext
 from mlc_chat.model.mistral.mistral_model import (
@@ -39,112 +36,81 @@ class MixtralMoE(nn.Module):
 
     def __init__(self, config: MixtralConfig):
         super().__init__()
-        self.gate = nn.Linear(
-            in_features=config.hidden_size, out_features=config.num_local_experts, bias=False
-        )
         self.num_experts_per_tok = config.num_experts_per_tok
         self.num_local_experts = config.num_local_experts
         self.intermediate_size = config.intermediate_size // config.tensor_parallel_shards
+        self.gate = nn.Linear(
+            in_features=config.hidden_size,
+            out_features=config.num_local_experts,
+            bias=False,
+        )
         self.e1_e3 = MixtralExperts(
             self.num_local_experts,
-            self.num_experts_per_tok,
             in_features=config.hidden_size,
             out_features=2 * self.intermediate_size,
         )
         self.e2 = MixtralExperts(
             self.num_local_experts,
-            self.num_experts_per_tok,
             in_features=self.intermediate_size,
             out_features=config.hidden_size,
         )
         self.dtype = "float32"
 
-    # TODO: replace with cumsum nn op when it's ready
-    def cumsum(self, data: Tensor, dim: int) -> Tensor:
-        return op.tensor_expr_op(inclusive_scan, "cumsum", args=[data, dim, "int32"])
-
-    def sum(self, x):
-        # dlight cannot handle too small reduction axis extent
-        # so we manually transform it into spatial op.
-        if self.num_experts_per_tok == 2:
-
-            def te_add(x):
-                new_shape = (x.shape[0], x.shape[2])
-                return te.compute(
-                    new_shape,
-                    lambda i, j: x[i, 0, j] + x[i, 1, j],
-                    name="add",
-                )
-
-            return op.tensor_expr_op(te_add, "topk_mask", args=[x])
-        return op.sum(x, axis=1)
-
     def forward(self, x: Tensor):
-        assert x.ndim == 3
-        input_shape = x.shape
-        x = op.reshape(x, (input_shape[0] * input_shape[1], input_shape[2]))
-        num_tokens = input_shape[0] * input_shape[1]
-
-        # MoE data preparation
+        def _expert_forward(x: Tensor, indptr: Tensor):
+            # x: [num_tokens, hidden_size]
+            x1_x3 = self.e1_e3(x, indptr)
+            # x1, x3: [experts_per_tok, intermediate_size]
+            x1, x3 = op.split(x1_x3, indices_or_sections=2, axis=-1)
+            # x:
+            # - batched: [num_tokens, hidden_size]
+            # - single: [experts_per_tok, hidden_size]
+            x = self.e2(op.silu(x1) * x3, indptr)
+            return x
+
+        experts_per_tok = self.num_experts_per_tok  # activated experts per token
+        local_experts = self.num_local_experts  # total number of experts
+        batch_size, seq_len, hidden_size = x.shape
+        num_tokens = batch_size * seq_len
+        x = x.reshape(num_tokens, hidden_size)
+        # gate: [num_tokens, local_experts]
         gate: Tensor = self.gate(x)
-        expert_weights, expert_indices = op_ext.topk(
-            gate, self.num_experts_per_tok, self.num_local_experts, self.dtype, "int32"
-        )
+        # expert_weights: [num_tokens, experts_per_tok]
+        # expert_indices: [num_tokens, experts_per_tok]
+        expert_weights, expert_indices = op_ext.moe.topk(gate, experts_per_tok)
         expert_weights = op.softmax(expert_weights.astype("float32"), axis=-1).astype(self.dtype)
         if num_tokens == 1:
-            # single batch decode
-            expert_indices = op.reshape(expert_indices, (self.num_experts_per_tok,))
-            concat_x1_x3 = self.e1_e3(x, expert_indices, single_batch_decode=True)
-            x1, x3 = op.split(concat_x1_x3, indices_or_sections=2, axis=-1)
-            linear_out = self.e2(op.silu(x1) * x3, expert_indices, single_batch_decode=True)
-            unflattened = op.reshape(
-                linear_out, (num_tokens, self.num_experts_per_tok, linear_out.shape[-1])
-            )
+            # x: [num_tokens * experts_per_tok, hidden_size]
+            x = _expert_forward(x, expert_indices)
         else:
-            expert_mask = op_ext.topk_mask(
-                expert_indices, self.num_experts_per_tok, self.num_local_experts
-            )
-            mask_T_flattened = op.reshape(
-                op.permute_dims(expert_mask), (expert_mask.shape[0] * expert_mask.shape[1],)
-            )
-            cumsum_colwise_flattened = self.cumsum(mask_T_flattened, dim=0)
-            flattened_indices = op_ext.get_indices(
-                cumsum_colwise_flattened, expert_indices, self.num_experts_per_tok
-            )
-            indptr = op_ext.get_indptr(cumsum_colwise_flattened, self.num_local_experts)
-            token_indices = op.divide(
-                flattened_indices, Tensor.from_const(self.num_experts_per_tok)
-            )
-            gathered_x = op.take(x, token_indices, axis=0)
-
-            # expert forward begin
-            concat_x1_x3 = self.e1_e3(gathered_x, indptr)
-            x1, x3 = op.split(concat_x1_x3, indices_or_sections=2, axis=-1)
-            linear_out = self.e2(op.silu(x1) * x3, indptr)
-            # expert forward end
-
-            # MoE result post-processing
-            unpermuted = op_ext.scatter_output(flattened_indices, linear_out, self.dtype)
-            unflattened = op.reshape(
-                unpermuted, (num_tokens, self.num_experts_per_tok, unpermuted.shape[1])
-            )
-        expert_weights = op.reshape(expert_weights, (num_tokens, self.num_experts_per_tok, 1))
-        weighted_sum = self.sum(unflattened * expert_weights)
-        weighted_sum = op.reshape(
-            weighted_sum, (input_shape[0], input_shape[1], weighted_sum.shape[-1])
-        )
-        return weighted_sum
+            # cumsum: [num_tokens * total_experts]
+            cumsum = op_ext.moe.moe_cumsum(expert_indices, local_experts)
+            # indices: [num_tokens * experts_per_tok]
+            indices = op_ext.moe.get_indices(cumsum, expert_indices)
+            # indptr: [num_local_experts + 1]
+            indptr = op_ext.moe.get_indptr(cumsum, local_experts)
+            # x: [num_tokens * experts_per_tok, hidden_size]
+            x = op.take(x, indices / experts_per_tok, axis=0)
+            x = _expert_forward(x, indptr)
+            x = op_ext.moe.scatter_output(x, indices)
+        # x: [num_tokens, experts_per_tok, hidden_size]
+        x = x.reshape(num_tokens, experts_per_tok, hidden_size)
+        x = x * expert_weights.reshape(num_tokens, experts_per_tok, 1)
+        # x: [num_tokens, hidden_size]
+        x = op_ext.moe.moe_sum(x, dim=1)
+        x = x.reshape(batch_size, seq_len, hidden_size)
+        return x
 
 
 class MixtralDecoderLayer(nn.Module):
     """Mixtral decoder layer"""
 
     def __init__(self, config: MixtralConfig, rotary_embedding: RotaryEmbedding):
-        rms_norm_eps = config.rms_norm_eps
+        eps = config.rms_norm_eps
         self.self_attn = MistralAttention(config, rotary_embedding)
         self.moe = MixtralMoE(config)
-        self.input_layernorm = nn.RMSNorm(config.hidden_size, -1, rms_norm_eps, bias=False)
-        self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, -1, rms_norm_eps, bias=False)
+        self.input_layernorm = nn.RMSNorm(config.hidden_size, -1, eps, bias=False)
+        self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, -1, eps, bias=False)
 
     def _set_tp():
         def _set(layer, hint):
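
Note: the batched branch of MixtralMoE.forward permutes token rows so that all rows routed to the same expert become contiguous, runs one grouped GEMM per expert, then scatters rows back and takes the gating-weighted sum. A hedged NumPy reference of that data flow (illustrative only: names are hypothetical, a single square weight per expert stands in for the e1_e3 → SiLU → e2 pipeline, and the op_ext.moe helpers are assumed to behave as their names suggest):

import numpy as np

def moe_forward_reference(x, gate_logits, w, experts_per_tok):
    # x: [num_tokens, hidden], gate_logits: [num_tokens, num_experts],
    # w: [num_experts, hidden, hidden] (square here for simplicity).
    num_tokens, hidden = x.shape
    num_experts = gate_logits.shape[1]
    # top-k routing, then softmax over the selected logits (op_ext.moe.topk + op.softmax)
    expert_indices = np.argsort(-gate_logits, axis=1)[:, :experts_per_tok]
    expert_weights = np.take_along_axis(gate_logits, expert_indices, axis=1)
    expert_weights = np.exp(expert_weights - expert_weights.max(axis=1, keepdims=True))
    expert_weights /= expert_weights.sum(axis=1, keepdims=True)
    # stable sort of (token, slot) pairs by expert id: this is the role of the
    # moe_cumsum / get_indices / get_indptr combination
    flat_expert = expert_indices.reshape(-1)                 # [num_tokens * experts_per_tok]
    order = np.argsort(flat_expert, kind="stable")
    indptr = np.zeros(num_experts + 1, dtype=np.int64)
    indptr[1:] = np.cumsum(np.bincount(flat_expert, minlength=num_experts))
    gathered = x[order // experts_per_tok]                   # gather token rows (op.take)
    out = np.empty_like(gathered)
    for e in range(num_experts):                             # grouped GEMM, one slice per expert
        lo, hi = indptr[e], indptr[e + 1]
        out[lo:hi] = gathered[lo:hi] @ w[e].T
    unpermuted = np.empty_like(out)
    unpermuted[order] = out                                  # scatter_output
    unpermuted = unpermuted.reshape(num_tokens, experts_per_tok, hidden)
    return (unpermuted * expert_weights[..., None]).sum(axis=1)   # moe_sum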

python/mlc_chat/nn/expert.py

Lines changed: 15 additions & 44 deletions
@@ -2,59 +2,30 @@
 from tvm.relax.frontend import nn
 from tvm.relax.frontend.nn import Tensor
 
-from mlc_chat import op as op_ext
+from mlc_chat.op import moe_matmul
 
 
 class MixtralExperts(nn.Module):
     """Mixtral experts"""
 
-    def __init__(self, num_local_experts, num_experts_per_tok, in_features, out_features):
+    def __init__(self, num_local_experts, in_features, out_features):
         self.num_local_experts = num_local_experts
-        self.num_experts_per_tok = num_experts_per_tok
         self.in_features = in_features
         self.out_features = out_features
         self.weight = nn.Parameter((num_local_experts, out_features, in_features))
         self.dtype = "float32"
 
-    def forward(  # pylint: disable=missing-function-docstring,invalid-name
-        self,
-        x: Tensor,
-        indptr: Tensor,
-        single_batch_decode: bool = False,
-    ):
-        assert x.ndim == 2
-        if single_batch_decode:
-            # single-batch decode
-            assert x.shape[1] == self.in_features
-            assert indptr.ndim == 1
-            if x.shape[0] == 1:
-                return op_ext.gemv_e1_e3(
-                    x,
-                    self.weight,
-                    indptr,
-                    self.in_features,
-                    self.out_features,
-                    self.num_experts_per_tok,
-                    self.num_local_experts,
-                    self.dtype,
-                )
-            return op_ext.gemv_e2(
-                x,
-                self.weight,
-                indptr,
-                self.in_features,
-                self.out_features,
-                self.num_experts_per_tok,
-                self.num_local_experts,
-                self.dtype,
-            )
+    def _forward_single(self, x: Tensor, indptr: Tensor):  # pylint: disable=invalid-name
+        assert x.ndim == 2 and indptr.ndim == 2
+        assert indptr.shape[0] == 1
+        return moe_matmul.gemv(x, self.weight, indptr)
 
-        return op_ext.group_gemm(
-            x,
-            self.weight,
-            indptr,
-            self.in_features,
-            self.out_features,
-            self.num_local_experts,
-            self.dtype,
-        )
+    def _forward_batched(self, x: Tensor, indptr: Tensor):  # pylint: disable=invalid-name
+        assert x.ndim == 2 and indptr.ndim == 1
+        return moe_matmul.group_gemm(x, self.weight, indptr)
+
+    def forward(self, x: Tensor, indptr: Tensor):  # pylint: disable=invalid-name,missing-docstring
+        assert x.ndim == 2 and indptr.ndim in [1, 2]
+        if indptr.ndim == 1:
+            return self._forward_batched(x, indptr)
+        return self._forward_single(x, indptr)
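
Note: forward() now dispatches on indptr.ndim — a 2-D indptr of shape [1, experts_per_tok] carries the expert ids chosen for a single token (GEMV path), while a 1-D indptr of length num_local_experts + 1 delimits contiguous row ranges for the grouped GEMM. A small NumPy sketch of the assumed semantics of the two moe_matmul kernels (illustrative only; the real implementations are TIR kernels):

import numpy as np

def gemv_reference(x, weight, indptr):
    # x: [1, in_features]; weight: [num_local_experts, out_features, in_features];
    # indptr: [1, experts_per_tok] holds the selected expert ids.
    return np.stack([x[0] @ weight[e].T for e in indptr[0]])   # [experts_per_tok, out_features]

def group_gemm_reference(x, weight, indptr):
    # x: [total_rows, in_features] with rows already grouped by expert;
    # indptr: [num_local_experts + 1] marks each expert's contiguous row range.
    out = np.empty((x.shape[0], weight.shape[1]), dtype=x.dtype)
    for e in range(weight.shape[0]):
        lo, hi = indptr[e], indptr[e + 1]
        out[lo:hi] = x[lo:hi] @ weight[e].T
    return out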

python/mlc_chat/op/__init__.py

Lines changed: 1 addition & 13 deletions
@@ -1,18 +1,6 @@
 """Extern module for compiler."""
+from . import moe, moe_matmul
 from .attention import attention
 from .extern import configure, enable, get_store
 from .gemm import faster_transformer_dequantize_gemm
-from .moe import (
-    gemv_e1_e3,
-    gemv_e2,
-    get_indices,
-    get_indptr,
-    group_dequantize_gemv_e1_e3,
-    group_dequantize_gemv_e2,
-    group_dequantize_group_gemm,
-    group_gemm,
-    scatter_output,
-    topk,
-    topk_mask,
-)
 from .position_embedding import llama_rope
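
Note: with the flat re-exports gone, call sites reference the sub-modules instead, as the other files in this commit do. A sketch of the updated import style:

from mlc_chat import op as op_ext
from mlc_chat.op import moe_matmul

# expert_weights, expert_indices = op_ext.moe.topk(gate, experts_per_tok)
# out = moe_matmul.group_gemm(x, weight, indptr)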

python/mlc_chat/op/attention.py

Lines changed: 12 additions & 8 deletions
@@ -16,11 +16,12 @@
 WARN_FLASHINFER_HEAD_DIM = False
 
 
-def attention(  # pylint: disable=invalid-name,too-many-locals
+def attention(  # pylint: disable=invalid-name,too-many-locals,too-many-statements
     q: nn.Tensor,
     k: nn.Tensor,
     v: nn.Tensor,
     casual_mask: nn.Tensor,
+    attn_score_scaling_factor: float = 1.0,
 ) -> nn.Tensor:
     """Attention with casual mask.
 
@@ -47,7 +48,7 @@ def attention(  # pylint: disable=invalid-name,too-many-locals
             v = v.repeat(h_q // h_kv, axis=1)
         q -> [b, h, s, d]
         k, v -> [b, h, t, d]
-        attn = q @ k^T / sqrt(d)  # [b, h, s, t]
+        attn = q @ k^T / sqrt(d) * attn_score_scaling_factor  # [b, h, s, t]
         attn = softmax_with_mask(attn, casual_mask, axis=-1)
         o = attn @ v  # [b, h, s, d]
         o -> [b, s, h * d]
@@ -67,27 +68,30 @@ def _fallback():
         if h_kv != h_q:
             k = k.repeat(h_q // h_kv, axis=2)
             v = v.repeat(h_q // h_kv, axis=2)
-        q = q.permute_dims([0, 2, 1, 3])
-        k = k.permute_dims([0, 2, 1, 3])
-        v = v.permute_dims([0, 2, 1, 3])
+        q = op.permute_dims(q, [0, 2, 1, 3])
+        k = op.permute_dims(k, [0, 2, 1, 3])
+        v = op.permute_dims(v, [0, 2, 1, 3])
         attn_weights = op.matmul(  # [b, h, s, t]
             q,  # [b, h, s, d]
-            k.permute_dims([0, 1, 3, 2]),  # [b, h, d, t]
+            op.permute_dims(k, [0, 1, 3, 2]),  # [b, h, d, t]
         ) / math.sqrt(d)
+        if attn_score_scaling_factor != 1.0:
+            attn_weights = attn_weights * attn_score_scaling_factor
         dtype = attn_weights.dtype
         attn_weights = attn_weights.maximum(tir.min_value(dtype)).minimum(casual_mask)
         if dtype == "float32":
             attn_weights = op.softmax(attn_weights, axis=-1)
         else:
             attn_weights = op.softmax(attn_weights.astype("float32"), axis=-1).astype(dtype)
         output = op.matmul(attn_weights, v)  # [b, h, s, d] <= [b, h, s, t] x [b, h, t, d]
-        output = output.permute_dims([0, 2, 1, 3])  # [b, s, h, d]
-        output = output.reshape([b, s, h_q * d])  # [b, s, h * d]
+        output = op.permute_dims(output, [0, 2, 1, 3])  # [b, s, h, d]
+        output = op.reshape(output, [b, s, h_q * d])  # [b, s, h * d]
         return output
 
     # FlashInfer Implementation
     if (
         _extern.get_store().flashinfer
+        and attn_score_scaling_factor == 1.0
         and q.dtype == "float16"
         and k.dtype == "float16"
         and v.dtype == "float16"