ml-explore · awni · Sep 13, 2025 · Sep 9, 2025 · Sep 9, 2025 · Sep 9, 2025
diff --git a/ACKNOWLEDGMENTS.md b/ACKNOWLEDGMENTS.md
@@ -13,11 +13,11 @@ MLX LM was developed with contributions from the following individuals:
    THUKEG's `GLM4`, Rednote `dots.llm1`, Baisu's `Ernie4.5 MoE`, inclusionAI's
    `Bailing MoE e.g. Ling-family`, Klear team - Kuaishou Technology's `Klear`,
    IBM's `Granite MoE`, Meituan's `LongCat`, Nvidia's `Nemotron H`, Swiss-AI's
-   `Apertus`, Nikity's `Lille130m`, and Allenai's `OLMoE`; Added support for the
-   following training algorithms: `Full Weight Fine-Tuning`, and the `Muon`
+   `Apertus`, Nikity's `Lille130m`, Alibaba Qwen's `Qwen3Next`, and Allenai's `OLMoE`;
+   Helped add support for the following model architectures: Alibaba Qwen's `Qwen3 & Qwen3MoE`;
+   Added support for the following training algorithms: `Full Weight Fine-Tuning`, and the `Muon`
    optimizer; Added support for the following other features: `Multiple Optimizers
-   to choose for training`, and `reporting training metrics to WandB (Weights &
-   Biases)`.
+   to choose for training`, and `reporting training metrics to WandB (Weights & Biases)`.
 - Prince Canuma: Helped add support for the following model architectures:
   HuggingFace's `Starcoder2`, Cohere's `Cohere (1 and 2)`, Alibaba Qwen's `Qwen
   (2, 3 and MoE)`, Microsoft's `Phi (3 and 3.5 MoE)`, `BitNet1.58`, Meta's `Llama

diff --git a/mlx_lm/models/qwen3_moe.py b/mlx_lm/models/qwen3_moe.py
@@ -127,7 +127,7 @@ def __call__(
         gates = mx.softmax(gates, axis=-1, precise=True)
 
         k = self.top_k
-        inds = mx.stop_gradient(mx.argpartition(-gates, kth=k - 1, axis=-1)[..., :k])
+        inds = mx.argpartition(gates, kth=-k, axis=-1)[..., -k:]
         scores = mx.take_along_axis(gates, inds, axis=-1)
         if self.norm_topk_prob:
             scores /= mx.sum(scores, axis=-1, keepdims=True)