vllm-project · bnellnm · Apr 15, 2026 · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026
diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml
@@ -90,6 +90,19 @@ steps:
   commands:
     - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
 
+- label: MoE Nightly Integration Test (H100)
+  device: h100
+  optional: true
+  num_devices: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/models-nightly-small.txt
+
+- label: MoE Nightly Integration Test (B200)
+  device: b200
+  optional: true
+  num_devices: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/models-nightly-small.txt
 
 - label: LM Eval TurboQuant KV Cache
   timeout_in_minutes: 75

@@ -0,0 +1,5 @@
+model_name: "baidu/ERNIE-4.5-VL-28B-A3B-Thinking"
+accuracy_threshold: 0.75
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 2"
@@ -0,0 +1,5 @@
+model_name: "zai-org/GLM-4.7-Flash"
+accuracy_threshold: 0.84
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 4096"
@@ -0,0 +1,5 @@
+model_name: "openai/gpt-oss-20b"
+accuracy_threshold: 0.31
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 2"
@@ -0,0 +1,5 @@
+model_name: "google/gemma-4-26B-A4B-it"
+accuracy_threshold: 0.30
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 2 --enable-expert-parallel"
@@ -0,0 +1,5 @@
+model_name: "ibm-granite/granite-4.0-h-small"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 4096"
@@ -0,0 +1,5 @@
+model_name: "ai21labs/AI21-Jamba2-Mini"
+accuracy_threshold: 0.76
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 4096"
@@ -0,0 +1,5 @@
+model_name: "LiquidAI/LFM2.5-350M"
+accuracy_threshold: 0.21
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 4096"
@@ -0,0 +1,5 @@
+model_name: "MiniMaxAI/MiniMax-M2.7"
+accuracy_threshold: 0.92
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 2"
@@ -0,0 +1,5 @@
+model_name: "allenai/OLMoE-1B-7B-0125-Instruct"
+accuracy_threshold: 0.69
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 4096 --enable-expert-parallel"
@@ -0,0 +1,5 @@
+model_name: "microsoft/Phi-tiny-MoE-instruct"
+accuracy_threshold: 0.70
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 4096 --enable-expert-parallel"
@@ -0,0 +1,5 @@
+model_name: "sarvamai/sarvam-30b"
+accuracy_threshold: 0.66
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 2"
@@ -0,0 +1,5 @@
+model_name: "stepfun-ai/Step-3.5-Flash"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 4"
@@ -0,0 +1,5 @@
+model_name: "arcee-ai/Trinity-Mini"
+accuracy_threshold: 0.84
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 4096"
@@ -0,0 +1,2 @@
+DeepSeek-R1-TP.yaml
+Step-3.5-Flash.yaml
@@ -0,0 +1,16 @@
+Trinity-Mini.yaml
+Gemma-4-26B-A4B-it.yaml
+GLM-4.7-Flash.yaml
+GPT-OSS-20B.yaml
+Granite-4.0-H-Small.yaml
+Jamba2-Mini.yaml
+LFM2.5-350M.yaml
+MiniMax-M2.7.yaml
+moe-refactor/Mixtral-8x7B-BF16-triton.yaml
+Nemotron-3-Super-120B-A12B-NVFP4.yaml
+OLMoE-1B-7B-0125-Instruct.yaml
+Phi-Tiny-MoE-Instruct.yaml
+Sarvam-30B.yaml
+ERNIE-4.5-VL-28B-A3B-Thinking.yaml
+# This model times out during weight loading. See #40535
+#moe-refactor/Llama-4-Scout-BF16-triton.yaml
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		DeepSeek-R1-TP.yaml
Comment thread bnellnm marked this conversation as resolved.
		Step-3.5-Flash.yaml