Merged
36 commits
adcdca7
Fixes CPU kernel
apsonawane Aug 3, 2025
541e08b
Additional fixes
apsonawane Aug 3, 2025
764b55a
Optimizations
apsonawane Aug 3, 2025
27d05d5
Fix pipelines
apsonawane Aug 3, 2025
c1500b8
Address comments
apsonawane Aug 4, 2025
85268ad
Address comments
apsonawane Aug 4, 2025
37c0858
Revert "Address comments"
apsonawane Aug 4, 2025
85874ff
Fix the memory optimization issue
apsonawane Aug 4, 2025
1c9f927
Fix race condition
apsonawane Aug 6, 2025
f774682
Fix unused variables
apsonawane Aug 7, 2025
728d7a8
Optimizations
apsonawane Aug 12, 2025
c2386f5
Fix
apsonawane Aug 12, 2025
a6da84d
Debugging alot
apsonawane Aug 13, 2025
e2c5d68
Remove comments
apsonawane Aug 13, 2025
4c905ae
Some modifications
apsonawane Aug 20, 2025
c364758
FC1 fixed
apsonawane Aug 21, 2025
ed52e13
Working fix
apsonawane Aug 21, 2025
1ea12bc
Remove print statements
apsonawane Aug 21, 2025
f5be0ce
Low diff values
apsonawane Aug 22, 2025
e450158
Rebase with main
apsonawane Aug 22, 2025
471bb8b
Fix
apsonawane Aug 22, 2025
b015c3d
Fix tests
apsonawane Aug 22, 2025
2b67465
Fix pipelines
apsonawane Aug 22, 2025
f85a9f1
refactoring
tianleiwu Aug 22, 2025
1bcb20d
format
tianleiwu Aug 22, 2025
25aa31b
parallel optimization
tianleiwu Aug 23, 2025
ca180b6
fix build
tianleiwu Aug 23, 2025
6a48486
eliminate the intermediate memcpy after SwiGLU
tianleiwu Aug 23, 2025
c369322
parallelize the routing logic
tianleiwu Aug 23, 2025
73a437c
format
tianleiwu Aug 25, 2025
94a2729
refactoring output
tianleiwu Aug 25, 2025
5de1b21
Fix pipelines
apsonawane Aug 25, 2025
27c1c05
Update cpu tests to use same python reference implementation as cuda …
apsonawane Aug 25, 2025
81e6713
Fix tests
apsonawane Aug 26, 2025
d11f51c
Remove failing CPU test
apsonawane Aug 26, 2025
a7978f8
Add legacy shape check back
apsonawane Aug 26, 2025
6 changes: 4 additions & 2 deletions onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
@@ -106,7 +106,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, QEmbedLayerNormalization);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QGemm);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QGemm);
-class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QMoE);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MLFloat16, QMoE);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, QMoE);
 // ******** End: Quantization ******************* //

 #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
@@ -272,7 +273,8 @@ Status RegisterQuantizationKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, QEmbedLayerNormalization)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QGemm)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QGemm)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QMoE)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MLFloat16, QMoE)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, QMoE)>,
   };

   for (auto& function_table_entry : function_table) {
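The two typed CLASS_NAME declarations above correspond to per-type kernel definitions that presumably live in the un-rendered moe_quantization_cpu.cc. As a hedged sketch only (not copied from the PR; the "T" type-constraint name and builder details are assumptions), the usual ONNX Runtime registration for one instantiation looks roughly like:

```cpp
// Sketch: typical contrib-op registration of a templated kernel for one data type.
// This is repository-context code, not a standalone program.
ONNX_OPERATOR_TYPED_KERNEL_EX(
    QMoE,                     // op name
    kMSDomain,                // com.microsoft domain
    1,                        // since version
    float,                    // data type of this instantiation
    kCpuExecutionProvider,
    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
    QMoECPU<float>);          // kernel class defined in moe_quantization_cpu.h
```

A second registration with MLFloat16 in place of float would match the MLFloat16 entry in the kernel table.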
12 changes: 11 additions & 1 deletion onnxruntime/contrib_ops/cpu/moe/moe_base_cpu.h
@@ -6,7 +6,8 @@
 #include "core/common/common.h"
 #include "core/framework/tensor_shape.h"
 #include "core/framework/op_kernel.h"
-#include "contrib_ops/cpu/quantization/moe_helper.h"
+#include "contrib_ops/cpu/moe/moe_helper.h"
+#include <limits>

 namespace onnxruntime {
 namespace contrib {

Check warning (GitHub Actions / Optional Lint C++, cpplint via reviewdog) on onnxruntime/contrib_ops/cpu/moe/moe_base_cpu.h:10: Found C++ system header after other header. Should be: moe_base_cpu.h, c system, c++ system, other. [build/include_order] [4]

@@ -46,12 +47,21 @@
     if (use_sparse_mixer_) {
       ORT_ENFORCE(k_ == 2, "Sparse mixer only supports k=2");
     }
+
+    swiglu_fusion_ = op_kernel_info.GetAttrOrDefault<int64_t>("swiglu_fusion", 0);
+    swiglu_limit_ = op_kernel_info.GetAttrOrDefault<float>("swiglu_limit", std::numeric_limits<float>::infinity());
+    activation_alpha_ = op_kernel_info.GetAttrOrDefault<float>("activation_alpha", 1.0f);
+    activation_beta_ = op_kernel_info.GetAttrOrDefault<float>("activation_beta", 0.0f);
   }

   bool normalize_routing_weights_;
   bool use_sparse_mixer_;
   int64_t k_;
   ActivationType activation_type_;
+  float activation_alpha_;
+  float activation_beta_;
+  float swiglu_limit_;
+  int64_t swiglu_fusion_;
 };

 }  // namespace contrib
393 changes: 393 additions & 0 deletions onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc

Large diffs are not rendered by default.

34 changes: 34 additions & 0 deletions onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.h
@@ -0,0 +1,34 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/common/common.h"
#include "core/framework/op_kernel.h"
#include "contrib_ops/cpu/moe/moe_base_cpu.h"

namespace onnxruntime {
namespace contrib {

/**
* @brief QMoE is the templated CPU implementation of the Quantized Mixture of Experts operator.
*
* This kernel supports both float and MLFloat16 data types for activations, scales, and outputs.
* It parallelizes expert computation using the ONNX Runtime thread pool and minimizes memory
* usage through on-the-fly block dequantization of weights.
*
* @tparam T The data type for the kernel (float or MLFloat16).
*/
template <typename T>
class QMoECPU final : public OpKernel, public MoEBaseCPU {
public:
explicit QMoECPU(const OpKernelInfo& op_kernel_info);
Status Compute(OpKernelContext* context) const override;

private:
int64_t expert_weight_bits_;
int64_t block_size_;
};

} // namespace contrib
} // namespace onnxruntime
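The header's comment mentions on-the-fly block dequantization of the expert weights. As a rough, self-contained illustration only (not the kernel's actual code; the symmetric 4-bit layout, nibble order, and zero-point of 8 are assumptions), dequantizing one block of packed weights could look like:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical sketch: two 4-bit values are packed per byte, and each block of
// `block_size` weights shares one scale. The packing order and zero-point used
// by the real QMoE kernel are assumptions here.
void DequantizeBlock4Bit(const uint8_t* packed, float scale, int64_t block_size, float* out) {
  for (int64_t i = 0; i < block_size; ++i) {
    uint8_t byte = packed[i / 2];
    uint8_t q = (i % 2 == 0) ? (byte & 0x0F) : (byte >> 4);  // low nibble first (assumed)
    out[i] = (static_cast<int>(q) - 8) * scale;              // assumed zero-point of 8
  }
}

int main() {
  std::vector<uint8_t> packed = {0x98, 0x7A};  // four packed 4-bit values
  std::vector<float> w(4);
  DequantizeBlock4Bit(packed.data(), 0.5f, 4, w.data());
  for (float v : w) std::printf("%f\n", v);
  return 0;
}
```

Dequantizing a block immediately before its GEMM, instead of materializing all expert weights in float, is what keeps the kernel's memory footprint small.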
66 changes: 12 additions & 54 deletions onnxruntime/contrib_ops/cpu/moe/moe_utils.cc
@@ -4,6 +4,7 @@
 #include "contrib_ops/cpu/moe/moe_utils.h"
 #include <cmath>
 #include <algorithm>
+#include "core/common/common.h"

 namespace onnxruntime {
 namespace contrib {
@@ -19,74 +20,31 @@ float ApplyActivation(float x, ActivationType activation_type) {
     case ActivationType::Identity:
       return x;
     case ActivationType::SwiGLU:
-      // SwiGLU: This is handled specially as it requires gating, not applied here
+      // SwiGLU is a special case handled by ApplySwiGLUActivation, this is just a placeholder
       return x;
     default:
-      return x;  // Default to identity
+      return x;
   }
 }

-// Helper method for applying SwiGLU activation with different memory layouts
-void ApplySwiGLUActivation(float* data, int64_t inter_size, bool is_interleaved_format) {
-  constexpr float swiglu_alpha = 1.702f;
-  constexpr float clamp_limit = 7.0f;  // Clamping limit as specified
+void ApplySwiGLUActivation(const float* input_data, float* output_data, int64_t inter_size, bool is_interleaved_format,
+                           float activation_alpha, float activation_beta, float clamp_limit) {
   if (is_interleaved_format) {
-    // For interleaved format [linear, gate, linear, gate, ...], process directly
-    // Make a temporary copy of each pair of values before modifying them
     for (int64_t i = 0; i < inter_size; ++i) {
-      const size_t idx = static_cast<size_t>(i);
-      const size_t linear_idx = 2 * idx;
-      const size_t gate_idx = linear_idx + 1;
+      float gate_val = input_data[2 * i];
+      float linear_val = input_data[2 * i + 1];

-      // Store original values
-      float linear_val = data[linear_idx];  // Interleaved: even index
-      float gate_val = data[gate_idx];      // Interleaved: odd index
+      gate_val = std::min(gate_val, clamp_limit);
+      linear_val = std::clamp(linear_val, -clamp_limit, clamp_limit);

-      // Apply clamping to the values
-      if (gate_val > clamp_limit) gate_val = clamp_limit;        // Clamp gate max only
-      if (linear_val > clamp_limit) linear_val = clamp_limit;    // Clamp linear min/max
-      if (linear_val < -clamp_limit) linear_val = -clamp_limit;
-
-      // SwiGLU: gate * sigmoid(alpha * gate) * (linear + 1)
-      float sigmoid_arg = swiglu_alpha * gate_val;
+      float sigmoid_arg = activation_alpha * gate_val;
       float sigmoid_out = 1.0f / (1.0f + std::exp(-sigmoid_arg));
       float swish_out = gate_val * sigmoid_out;
-      float result = swish_out * (linear_val + 1.0f);
-
-      // Store result in first element (linear position)
-      data[idx] = result;
+      output_data[i] = swish_out * (linear_val + activation_beta);
     }
   } else {
-    // For chunked layout [linear..., gate...], handle separately
-    // Need to work with original data in-place
-    // First, store all the gate computations since they depend on original gate values
-    std::vector<float> computed_gates(static_cast<size_t>(inter_size));
-
-    for (int64_t i = 0; i < inter_size; ++i) {
-      const size_t idx = static_cast<size_t>(i);
-      float gate_val = data[idx + static_cast<size_t>(inter_size)];
-
-      // Apply clamping to the gate value (max only)
-      if (gate_val > clamp_limit) gate_val = clamp_limit;
-
-      // Compute the gate part of SwiGLU
-      float sigmoid_arg = swiglu_alpha * gate_val;
-      float sigmoid_out = 1.0f / (1.0f + std::exp(-sigmoid_arg));
-      computed_gates[idx] = gate_val * sigmoid_out;
-    }
-
-    // Now apply the full activation with the precomputed gate values
-    for (int64_t i = 0; i < inter_size; ++i) {
-      const size_t idx = static_cast<size_t>(i);
-      float linear_val = data[idx];
-
-      // Apply clamping to the linear value (min/max)
-      if (linear_val > clamp_limit) linear_val = clamp_limit;
-      if (linear_val < -clamp_limit) linear_val = -clamp_limit;
-
-      data[idx] = computed_gates[idx] * (linear_val + 1.0f);
-    }
+    ORT_NOT_IMPLEMENTED("Non-interleaved format not supported for SwiGLU activation");
   }
 }
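The rewritten ApplySwiGLUActivation reads an interleaved buffer of [gate, linear] pairs and writes inter_size outputs, with the hard-coded alpha (1.702), beta (implicit +1.0f) and clamp limit (7.0f) replaced by the new attributes. A standalone re-implementation of the same math for illustration (not the repository function; the sample parameter values are assumptions that mirror the previously hard-coded constants):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Standalone sketch of the interleaved SwiGLU used above:
// input  = [gate0, linear0, gate1, linear1, ...]  (2 * inter_size floats)
// output = swish(gate; alpha) * (linear + beta)   (inter_size floats)
// gate is clamped above by `limit`, linear is clamped to [-limit, limit].
std::vector<float> SwiGLUInterleaved(const std::vector<float>& in, float alpha, float beta, float limit) {
  std::vector<float> out(in.size() / 2);
  for (size_t i = 0; i < out.size(); ++i) {
    float gate = std::min(in[2 * i], limit);
    float linear = std::clamp(in[2 * i + 1], -limit, limit);
    float swish = gate / (1.0f + std::exp(-alpha * gate));  // gate * sigmoid(alpha * gate)
    out[i] = swish * (linear + beta);
  }
  return out;
}

int main() {
  // alpha = 1.702, beta = 1.0, limit = 7.0 match the constants the old code hard-coded;
  // the attribute defaults in moe_base_cpu.h are alpha = 1.0, beta = 0.0, limit = +inf.
  std::vector<float> in = {0.5f, 2.0f, -1.0f, 9.0f};
  for (float v : SwiGLUInterleaved(in, 1.702f, 1.0f, 7.0f)) std::printf("%f\n", v);
  return 0;
}
```

Note that the new signature also splits input and output buffers, which is what lets the caller drop the intermediate memcpy after SwiGLU mentioned in commit 6a48486.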
4 changes: 3 additions & 1 deletion onnxruntime/contrib_ops/cpu/moe/moe_utils.h
@@ -9,7 +9,9 @@ namespace onnxruntime {
 namespace contrib {

 float ApplyActivation(float x, ActivationType activation_type);
-void ApplySwiGLUActivation(float* data, int64_t inter_size, bool is_interleaved_format);
+
+void ApplySwiGLUActivation(const float* input_data, float* output_data, int64_t inter_size, bool is_interleaved_format,
+                           float activation_alpha, float activation_beta, float clamp_limit);

 }  // namespace contrib
 }  // namespace onnxruntime