From ec6b0f02c9033e9ebf939617bc99c574df89fd22 Mon Sep 17 00:00:00 2001
From: Kaihang Jiang
Date: Thu, 16 Apr 2026 12:54:09 -0400
Subject: [PATCH 1/3] Add B200 GPU configuration for MiniMax-M2.7

Co-Authored-By: Claude Opus 4.6 (1M context)
Signed-off-by: Kaihang Jiang
---
 MiniMax/MiniMax-M2.md | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/MiniMax/MiniMax-M2.md b/MiniMax/MiniMax-M2.md
index 1cabeb19..1d0f1cb0 100644
--- a/MiniMax/MiniMax-M2.md
+++ b/MiniMax/MiniMax-M2.md
@@ -115,7 +115,7 @@ uv pip install vllm \
 
 ### NVIDIA GPU
 
-You can use 4x H200/H20/H100 or 4x A100/A800 GPUs to launch this model.
+You can use 4x H200/H20/H100 or 4x A100/A800 or 4x B200 GPUs to launch this model.
 
 run tensor-parallel like this:
 
@@ -129,6 +129,21 @@ vllm serve MiniMaxAI/MiniMax-M2.7 \
   --trust-remote-code
 ```
 
+- B200 (4x B200)
+
+On B200 GPUs, TP4 with the following configuration is recommended.
+
+```bash
+vllm serve MiniMaxAI/MiniMax-M2.7 \
+  --trust-remote-code \
+  --tensor-parallel-size 4 \
+  --enable-auto-tool-choice \
+  --tool-call-parser minimax_m2 \
+  --reasoning-parser minimax_m2_append_think
+```
+
+> **Note**: For improved performance, you may set `VLLM_FLOAT32_MATMUL_PRECISION="high"` to enable TF32 TensorCore acceleration for float32 matmuls. This deviates from the original implementation, which uses full FP32 precision for MoE gating, but evaluations show no observable differences on GSM8K, MMLU-Pro, and tool-calling benchmarks.
+
 Note that pure TP8 is not supported.
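The serve command in patch 1 exposes vLLM's OpenAI-compatible API. As a quick smoke test against the launched server (a sketch, not part of the patch series — the prompt and `max_tokens` are illustrative, and the URL assumes vLLM's default port 8000), a minimal chat-completions request body can be assembled and inspected before sending it with any HTTP client:

```python
import json

# Minimal chat-completions payload for the server launched above.
# The model name matches the serve command; the prompt is illustrative.
payload = {
    "model": "MiniMaxAI/MiniMax-M2.7",
    "messages": [{"role": "user", "content": "Say hello."}],
    "max_tokens": 64,
}

# vLLM's OpenAI-compatible endpoint (default port 8000).
url = "http://localhost:8000/v1/chat/completions"

body = json.dumps(payload)
print(body)
```

Any OpenAI-compatible client (or a plain `curl -d "$body" "$url"`) can then POST this body to the running server.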
 To run the model with >4 GPUs, please use DP+EP or TP+EP:
 - DP8+EP

From 74949e316fac99fb3e1a82baef09dc223e76f60c Mon Sep 17 00:00:00 2001
From: Kaihang Jiang
Date: Thu, 16 Apr 2026 14:13:46 -0400
Subject: [PATCH 2/3] Align B200 config with H200 recipe

Co-Authored-By: Claude Opus 4.6 (1M context)
Signed-off-by: Kaihang Jiang
---
 MiniMax/MiniMax-M2.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/MiniMax/MiniMax-M2.md b/MiniMax/MiniMax-M2.md
index 1d0f1cb0..ad8d31a8 100644
--- a/MiniMax/MiniMax-M2.md
+++ b/MiniMax/MiniMax-M2.md
@@ -135,11 +135,12 @@ On B200 GPUs, TP4 with the following configuration is recommended.
 
 ```bash
 vllm serve MiniMaxAI/MiniMax-M2.7 \
-  --trust-remote-code \
   --tensor-parallel-size 4 \
-  --enable-auto-tool-choice \
   --tool-call-parser minimax_m2 \
-  --reasoning-parser minimax_m2_append_think
+  --reasoning-parser minimax_m2 \
+  --compilation-config '{"mode":3,"pass_config":{"fuse_minimax_qk_norm":true}}' \
+  --enable-auto-tool-choice \
+  --trust-remote-code
 ```
 
 > **Note**: For improved performance, you may set `VLLM_FLOAT32_MATMUL_PRECISION="high"` to enable TF32 TensorCore acceleration for float32 matmuls. This deviates from the original implementation, which uses full FP32 precision for MoE gating, but evaluations show no observable differences on GSM8K, MMLU-Pro, and tool-calling benchmarks.
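Patch 2 passes an inline JSON value to `--compilation-config`, which is easy to break with shell quoting. As a local sanity check (a sketch, not part of the patch series), the exact string from the patch can be parsed to confirm it is well-formed JSON with the expected fields before the server ever starts:

```python
import json

# The exact string passed to --compilation-config in patch 2. Parsing it
# locally catches shell-quoting or typo mistakes early.
raw = '{"mode":3,"pass_config":{"fuse_minimax_qk_norm":true}}'
cfg = json.loads(raw)

print(cfg["mode"], cfg["pass_config"]["fuse_minimax_qk_norm"])
```

Single-quoting the value in bash, as the patch does, keeps the inner double quotes intact; re-serializing with `json.dumps(cfg)` gives a known-good string to paste back into the command line.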
From f63d8f75c20e50604cdf233ce9a0e904bad790a3 Mon Sep 17 00:00:00 2001
From: Kaihang Jiang
Date: Thu, 16 Apr 2026 15:13:18 -0400
Subject: [PATCH 3/3] Remove duplicate B200 section, keep
 VLLM_FLOAT32_MATMUL_PRECISION note

Co-Authored-By: Claude Opus 4.6 (1M context)
Signed-off-by: Kaihang Jiang
---
 MiniMax/MiniMax-M2.md | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/MiniMax/MiniMax-M2.md b/MiniMax/MiniMax-M2.md
index ad8d31a8..858b38e2 100644
--- a/MiniMax/MiniMax-M2.md
+++ b/MiniMax/MiniMax-M2.md
@@ -129,20 +129,6 @@ vllm serve MiniMaxAI/MiniMax-M2.7 \
   --trust-remote-code
 ```
 
-- B200 (4x B200)
-
-On B200 GPUs, TP4 with the following configuration is recommended.
-
-```bash
-vllm serve MiniMaxAI/MiniMax-M2.7 \
-  --tensor-parallel-size 4 \
-  --tool-call-parser minimax_m2 \
-  --reasoning-parser minimax_m2 \
-  --compilation-config '{"mode":3,"pass_config":{"fuse_minimax_qk_norm":true}}' \
-  --enable-auto-tool-choice \
-  --trust-remote-code
-```
-
 > **Note**: For improved performance, you may set `VLLM_FLOAT32_MATMUL_PRECISION="high"` to enable TF32 TensorCore acceleration for float32 matmuls. This deviates from the original implementation, which uses full FP32 precision for MoE gating, but evaluations show no observable differences on GSM8K, MMLU-Pro, and tool-calling benchmarks.
 
 Note that pure TP8 is not supported.
 To run the model with >4 GPUs, please use DP+EP or TP+EP:
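The `VLLM_FLOAT32_MATMUL_PRECISION` note kept by patch 3 is an environment variable, so it must be set in the environment the server inherits. A hedged sketch of a launcher wrapper (the subprocess call is commented out here; set the variable only when the TF32 trade-off described in the note is acceptable):

```python
import os

# Copy the current environment and add the precision hint from the note.
# "high" requests TF32 tensor-core matmuls for float32 (per the patch note);
# leaving the variable unset keeps the full-FP32 default for MoE gating.
env = dict(os.environ, VLLM_FLOAT32_MATMUL_PRECISION="high")

# The server would then be launched with this environment, e.g.:
# subprocess.run(["vllm", "serve", "MiniMaxAI/MiniMax-M2.7", ...], env=env)
print(env["VLLM_FLOAT32_MATMUL_PRECISION"])
```

Equivalently, `VLLM_FLOAT32_MATMUL_PRECISION="high" vllm serve ...` in the shell sets it for a single launch without touching the parent environment.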