
Commit 04be5a7

kaiyux, syuoni, and nzmora-nvidia authored
[None] [fix] Fix missing ActivationType issue (#9171)
Signed-off-by: Kaiyu Xie <[email protected]>
Signed-off-by: Enwei Zhu <[email protected]>
Signed-off-by: Neta Zmora <[email protected]>
Co-authored-by: Enwei Zhu <[email protected]>
Co-authored-by: Neta Zmora <[email protected]>
1 parent 86cfb3e commit 04be5a7

File tree

10 files changed: +35 -26 lines changed

cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h

Lines changed: 2 additions & 1 deletion
@@ -19,7 +19,8 @@
 namespace tensorrt_llm::kernels::cutlass_kernels
 {
 
-// Note update moe.py to match
+// IMPORTANT: Keep the same order of activation functions in this enum and the activation functions in
+// cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu::doActivationKernel().
 enum class ActivationType
 {
     Gelu = 0,

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu

Lines changed: 6 additions & 3 deletions
@@ -2292,6 +2292,8 @@ void doActivation(T* output, GemmOutputType const* gemm_result, float const* fp8
 
     auto fn = [&]()
     {
+        // IMPORTANT: Keep the order of the activation functions in the same order as the ActivationType enum in
+        // common.h
         auto fn = [&](auto block_scaling_type)
         {
             auto fn_list = std::array{
@@ -2307,11 +2309,12 @@ void doActivation(T* output, GemmOutputType const* gemm_result, float const* fp8
                     decltype(block_scaling_type)::value>, // Geglu
                 &doActivationKernel<T, GemmOutputType, ScaleBiasType, SwigluBiasAdaptor,
                     decltype(block_scaling_type)::value>, // SwigluBias
-                &doActivationKernel<T, GemmOutputType, ScaleBiasType, IdentityAdaptor<cutlass::epilogue::thread::Relu2>,
-                    decltype(block_scaling_type)::value>, // Relu2
                 &doActivationKernel<T, GemmOutputType, ScaleBiasType,
                     IdentityAdaptor<cutlass::epilogue::thread::Identity>,
-                    decltype(block_scaling_type)::value> // Identity
+                    decltype(block_scaling_type)::value>, // Identity
+                &doActivationKernel<T, GemmOutputType, ScaleBiasType, IdentityAdaptor<cutlass::epilogue::thread::Relu2>,
+                    decltype(block_scaling_type)::value> // Relu2
+
             };
             return fn_list[static_cast<int>(activation_type.activation_type)];
         };
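
The crux of this hunk: doActivation() picks a kernel with fn_list[static_cast<int>(activation_type.activation_type)], so the table order must mirror the ActivationType enum exactly. The change moves the Relu2 entry after Identity, matching the enum order in common.h (and the Python enum in utils.py below, which now reads Identity = 6, Relu2 = 7). A minimal Python sketch of why the order matters — abridged values and stand-in functions, with relu2 assumed to mean squared ReLU:

from enum import IntEnum

class Act(IntEnum):          # abridged; the real enum runs Gelu = 0 .. InvalidType = 8
    Identity = 0
    Relu2 = 1

def identity(x): return x
def relu2(x): return max(x, 0.0) ** 2   # stand-in for cutlass::epilogue::thread::Relu2

fn_list = [identity, relu2]  # position i must implement Act(i), as in moe_kernels.cu

def do_activation(x, act):
    return fn_list[int(act)](x)

assert do_activation(-3.0, Act.Identity) == -3.0
assert do_activation(2.0, Act.Relu2) == 4.0
# With the entries swapped, both calls would run the wrong function without
# raising any error — the silent mismatch the added comments guard against.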

jenkins/L0_MergeRequest.groovy

Lines changed: 1 addition & 0 deletions
@@ -698,6 +698,7 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
         "tensorrt_llm/_ipc_utils.py",
         "tensorrt_llm/_torch/compilation/patterns/ar_residual_norm.py",
         "tensorrt_llm/_torch/compilation/patterns/ub_allreduce.py",
+        "tensorrt_llm/_torch/custom_ops/torch_custom_ops.py",
         "tensorrt_llm/_torch/custom_ops/userbuffers_custom_ops.py",
         "tensorrt_llm/_torch/models/modeling_llama.py",
         "tensorrt_llm/_torch/modules/fused_moe/",

tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 import torch
 
-from tensorrt_llm._torch.custom_ops.torch_custom_ops import ActivationType
+from tensorrt_llm._torch.utils import ActivationType
 
 
 @torch.library.custom_op("auto_deploy::trtllm_moe_fused", mutates_args=())

tensorrt_llm/_torch/custom_ops/torch_custom_ops.py

Lines changed: 1 addition & 16 deletions
@@ -13,7 +13,7 @@
                         OptimizationProfile, TunableRunner, TuningConfig)
 from ..modules.multi_stream_utils import do_multi_stream
 from ..modules.swiglu import silu_and_mul_kernel
-from ..utils import (fp4_scale_infer_shape,
+from ..utils import (ActivationType, fp4_scale_infer_shape,
                      get_last_power_of_2_num_tokens_buckets,
                      last_positive_power_of_2)
 
@@ -24,21 +24,6 @@ def bmm_out(a: torch.Tensor, b: torch.Tensor, out: torch.Tensor) -> None:
     torch.bmm(a, b, out=out)
 
 
-from enum import IntEnum
-
-
-class ActivationType(IntEnum):
-    Gelu = 0
-    Relu = 1
-    Silu = 2
-    Swiglu = 3
-    Geglu = 4
-    SwigluBias = 5
-    Relu2 = 6
-    Identity = 7
-    InvalidType = 8
-
-
 class MoERunner(TunableRunner):
     # avoid overhead of creating a new runner in forward pass
     runner_dict = dict()
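
With the local definition deleted, ActivationType has a single home in tensorrt_llm/_torch/utils.py. Note the removed copy had Relu2 = 6 and Identity = 7; the shared enum swaps them to match the C++ side. A quick sanity check against the relocated enum, assuming the package is importable:

from tensorrt_llm._torch.utils import ActivationType

assert ActivationType.Identity == 6 and ActivationType.Relu2 == 7
assert ActivationType.Swiglu == 3   # the default used by the MoE interface below
# The old deep import path no longer works:
#   from tensorrt_llm._torch.custom_ops.torch_custom_ops import ActivationType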

tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py

Lines changed: 1 addition & 0 deletions
@@ -565,6 +565,7 @@ def forward_chunk(
             tune_max_num_tokens=self.tune_max_num_tokens,
             tuner_num_tokens=tuner_num_tokens,
             tuner_top_k=tuner_top_k,
+            activation_type=self.activation_type,
             unpadded_hidden_size=self.unpadded_hidden_size,
             out_tensor=moe_output,
         )

tensorrt_llm/_torch/modules/fused_moe/interface.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88

99
from ...distributed.ops import reducescatter
1010
from ...model_config import ModelConfig
11-
from ...utils import (AuxStreamType, Fp4QuantizedTensor, get_model_extra_attrs,
12-
is_torch_compiling)
11+
from ...utils import (ActivationType, AuxStreamType, Fp4QuantizedTensor,
12+
get_model_extra_attrs, is_torch_compiling)
1313
from .routing import BaseMoeRoutingMethod
1414

1515

@@ -144,6 +144,7 @@ def __init__(
144144
swiglu_beta: Optional[torch.Tensor] = None,
145145
swiglu_limit: Optional[torch.Tensor] = None,
146146
layer_idx: Optional[int] = None,
147+
activation_type: ActivationType = ActivationType.Swiglu,
147148
):
148149
from ...distributed import AllReduce
149150

@@ -161,6 +162,7 @@ def __init__(
161162
self.swiglu_limit = swiglu_limit
162163
self.layer_idx = layer_idx
163164
self.layer_idx_str = str(layer_idx) if layer_idx is not None else None
165+
self.activation_type = int(activation_type)
164166

165167
self._register_layer(model_config)
166168

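The new keyword defaults to Swiglu, so existing call sites keep their old behavior, and the value is stored as a plain int rather than the IntEnum member. A minimal sketch of the pattern (MyMoE is a hypothetical stand-in, not the actual interface class):

from tensorrt_llm._torch.utils import ActivationType

class MyMoE:  # hypothetical stand-in
    def __init__(self, activation_type: ActivationType = ActivationType.Swiglu):
        self.activation_type = int(activation_type)  # plain int for the op boundary

assert MyMoE().activation_type == 3                      # default is unchanged behavior
assert MyMoE(ActivationType.Relu2).activation_type == 7  # opt-in override
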
tensorrt_llm/_torch/modules/fused_moe/ops/moe_op_cutlass.py

Lines changed: 3 additions & 1 deletion
@@ -82,6 +82,7 @@ def finalize_tactic(
                                     False),
             min_latency_mode=min_latency_mode,
             use_fused_finalize=use_fused_finalize,
+            activation_type=module.activation_type,
         )
 
         # Set tuning configuration
@@ -164,6 +165,7 @@ def compute_moe(
         swiglu_beta = module.swiglu_beta
         swiglu_limit = module.swiglu_limit
         use_w4_group_scaling = getattr(module, 'has_w4afp8', False)
+        activation_type = module.activation_type
 
         # Determine weight dtype for view operation if needed
         weight_dtype = w3_w1_weight.dtype
@@ -199,7 +201,7 @@ def compute_moe(
             input_sf, swizzled_input_sf, swiglu_alpha, swiglu_beta,
             swiglu_limit, tp_size, tp_rank, ep_size, ep_rank,
             cluster_size, cluster_rank, use_all_to_all,
-            min_latency_mode, self.gemm_tactics,
+            min_latency_mode, self.gemm_tactics, activation_type,
             unpadded_hidden_size, tuner_num_tokens, None)
 
         # Return output based on latency mode
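
Both the tuner configuration and the fused-MoE op call now receive the module's activation type; in the op call it is threaded positionally, between self.gemm_tactics and unpadded_hidden_size. A sketch of the pattern, with run_moe_stub as a hypothetical stand-in for the real op:

def run_moe_stub(hidden_states, gemm_tactics, activation_type, unpadded_hidden_size):
    # Stand-in for the fused MoE op: the new argument rides along as a plain int.
    assert isinstance(activation_type, int)
    return f"moe(act={activation_type}, hidden={unpadded_hidden_size})"

class Module:                 # hypothetical: mirrors what interface.py stores
    activation_type = 3       # int(ActivationType.Swiglu)

print(run_moe_stub("x", [], Module().activation_type, 4096))  # moe(act=3, hidden=4096)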

tensorrt_llm/_torch/utils.py

Lines changed: 15 additions & 1 deletion
@@ -1,7 +1,7 @@
 import contextlib
 import threading
 from dataclasses import dataclass
-from enum import Enum
+from enum import Enum, IntEnum
 from typing import Dict, List
 
 import torch
@@ -31,6 +31,20 @@
 )
 
 
+# IMPORTANT: Keep the same order of activation functions in this enum and the enum in
+# cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h
+class ActivationType(IntEnum):
+    Gelu = 0
+    Relu = 1
+    Silu = 2
+    Swiglu = 3
+    Geglu = 4
+    SwigluBias = 5
+    Identity = 6
+    Relu2 = 7
+    InvalidType = 8
+
+
 def set_torch_compiling(enable: bool):
     global is_torch_compiling_flag
     is_torch_compiling_flag = enable
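
Nothing enforces this Python/C++ ordering invariant at compile time, so a pinned-value test is one cheap guard. An illustrative sketch, not part of the commit:

from tensorrt_llm._torch.utils import ActivationType

EXPECTED = {"Gelu": 0, "Relu": 1, "Silu": 2, "Swiglu": 3, "Geglu": 4,
            "SwigluBias": 5, "Identity": 6, "Relu2": 7, "InvalidType": 8}

def test_activation_type_matches_cpp_order():
    # Fails loudly if anyone reorders the Python enum without touching common.h.
    assert {m.name: int(m) for m in ActivationType} == EXPECTED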

tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_trtllm_moe.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 from utils.util import skip_pre_hopper
 
 import tensorrt_llm._torch.auto_deploy.custom_ops  # noqa: F401
-from tensorrt_llm._torch.custom_ops.torch_custom_ops import ActivationType
+from tensorrt_llm._torch.utils import ActivationType
 
 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
 FP8_DTYPE = torch.float8_e4m3fn
