# Patch summary (leading text before the first "diff --git" header; not part of any hunk).
#
# What this patch adds:
#   * .buildkite/test_areas/lm_eval.yaml: two optional steps, "MoE Nightly Integration Test (H100)"
#     and "MoE Nightly Integration Test (B200)". Each requests num_devices: 2 and runs
#     evals/gsm8k/test_gsm8k_correctness.py with
#     --config-list-file=evals/gsm8k/configs/models-nightly-small.txt.
#   * tests/evals/gsm8k/configs/: 13 new per-model YAML configs. Each sets model_name,
#     accuracy_threshold, num_questions: 1319, num_fewshot: 5 and server_args.
#   * tests/evals/gsm8k/configs/models-nightly-small.txt: 14 active entries, plus a commented-out
#     Llama-4-Scout entry with a pointer to #40535.
#   * tests/evals/gsm8k/configs/models-nightly-large.txt: 2 entries (DeepSeek-R1-TP.yaml,
#     Step-3.5-Flash.yaml).
#
# NOTE(review): the two new steps set no timeout_in_minutes, while the neighbouring
#   "LM Eval TurboQuant KV Cache" step in the hunk context sets 75 -- confirm the pipeline default
#   is long enough for 14 configs at 1319 questions each.
# NOTE(review): models-nightly-large.txt is added here but no step in this patch references it --
#   confirm a consumer exists or is planned.
# NOTE(review): OLMoE-1B-7B-0125-Instruct.yaml and Phi-Tiny-MoE-Instruct.yaml pass
#   --enable-expert-parallel without a --tensor-parallel-size -- confirm expert parallelism is
#   actually exercised in that setup.
# NOTE(review): models-nightly-small.txt lists moe-refactor/Mixtral-8x7B-BF16-triton.yaml and
#   Nemotron-3-Super-120B-A12B-NVFP4.yaml, which this patch does not add -- presumably they already
#   exist in the configs directory; verify.
# NOTE(review): newlines appear to be collapsed into spaces in this copy of the patch; the original
#   line structure would need to be restored before it can be applied.
diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml index a07d702cf3ce..4cf23cd8fd10 100644 --- a/.buildkite/test_areas/lm_eval.yaml +++ b/.buildkite/test_areas/lm_eval.yaml @@ -90,6 +90,19 @@ steps: commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt +- label: MoE Nightly Integration Test (H100) + device: h100 + optional: true + num_devices: 2 + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/models-nightly-small.txt + +- label: MoE Nightly Integration Test (B200) + device: b200 + optional: true + num_devices: 2 + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/models-nightly-small.txt - label: LM Eval TurboQuant KV Cache timeout_in_minutes: 75 diff --git a/tests/evals/gsm8k/configs/ERNIE-4.5-VL-28B-A3B-Thinking.yaml b/tests/evals/gsm8k/configs/ERNIE-4.5-VL-28B-A3B-Thinking.yaml new file mode 100644 index 000000000000..420414f6829c --- /dev/null +++ b/tests/evals/gsm8k/configs/ERNIE-4.5-VL-28B-A3B-Thinking.yaml @@ -0,0 +1,5 @@ +model_name: "baidu/ERNIE-4.5-VL-28B-A3B-Thinking" +accuracy_threshold: 0.75 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 2" diff --git a/tests/evals/gsm8k/configs/GLM-4.7-Flash.yaml b/tests/evals/gsm8k/configs/GLM-4.7-Flash.yaml new file mode 100644 index 000000000000..0c599ca42067 --- /dev/null +++ b/tests/evals/gsm8k/configs/GLM-4.7-Flash.yaml @@ -0,0 +1,5 @@ +model_name: "zai-org/GLM-4.7-Flash" +accuracy_threshold: 0.84 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 4096" diff --git a/tests/evals/gsm8k/configs/GPT-OSS-20B.yaml b/tests/evals/gsm8k/configs/GPT-OSS-20B.yaml new file mode 100644 index 000000000000..e797dfed3be3 --- /dev/null +++ b/tests/evals/gsm8k/configs/GPT-OSS-20B.yaml @@ -0,0 +1,5 @@ 
+model_name: "openai/gpt-oss-20b" +accuracy_threshold: 0.31 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 2" diff --git a/tests/evals/gsm8k/configs/Gemma-4-26B-A4B-it.yaml b/tests/evals/gsm8k/configs/Gemma-4-26B-A4B-it.yaml new file mode 100644 index 000000000000..79180a554c49 --- /dev/null +++ b/tests/evals/gsm8k/configs/Gemma-4-26B-A4B-it.yaml @@ -0,0 +1,5 @@ +model_name: "google/gemma-4-26B-A4B-it" +accuracy_threshold: 0.30 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 2 --enable-expert-parallel" diff --git a/tests/evals/gsm8k/configs/Granite-4.0-H-Small.yaml b/tests/evals/gsm8k/configs/Granite-4.0-H-Small.yaml new file mode 100644 index 000000000000..b0b4a0728c33 --- /dev/null +++ b/tests/evals/gsm8k/configs/Granite-4.0-H-Small.yaml @@ -0,0 +1,5 @@ +model_name: "ibm-granite/granite-4.0-h-small" +accuracy_threshold: 0.85 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 4096" diff --git a/tests/evals/gsm8k/configs/Jamba2-Mini.yaml b/tests/evals/gsm8k/configs/Jamba2-Mini.yaml new file mode 100644 index 000000000000..62eb87b56162 --- /dev/null +++ b/tests/evals/gsm8k/configs/Jamba2-Mini.yaml @@ -0,0 +1,5 @@ +model_name: "ai21labs/AI21-Jamba2-Mini" +accuracy_threshold: 0.76 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 4096" diff --git a/tests/evals/gsm8k/configs/LFM2.5-350M.yaml b/tests/evals/gsm8k/configs/LFM2.5-350M.yaml new file mode 100644 index 000000000000..ad207cde0f5d --- /dev/null +++ b/tests/evals/gsm8k/configs/LFM2.5-350M.yaml @@ -0,0 +1,5 @@ +model_name: "LiquidAI/LFM2.5-350M" +accuracy_threshold: 0.21 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 4096" diff --git a/tests/evals/gsm8k/configs/MiniMax-M2.7.yaml b/tests/evals/gsm8k/configs/MiniMax-M2.7.yaml new file mode 100644 index 000000000000..8f60e7259814 --- 
/dev/null +++ b/tests/evals/gsm8k/configs/MiniMax-M2.7.yaml @@ -0,0 +1,5 @@ +model_name: "MiniMaxAI/MiniMax-M2.7" +accuracy_threshold: 0.92 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 2" diff --git a/tests/evals/gsm8k/configs/OLMoE-1B-7B-0125-Instruct.yaml b/tests/evals/gsm8k/configs/OLMoE-1B-7B-0125-Instruct.yaml new file mode 100644 index 000000000000..db839334766a --- /dev/null +++ b/tests/evals/gsm8k/configs/OLMoE-1B-7B-0125-Instruct.yaml @@ -0,0 +1,5 @@ +model_name: "allenai/OLMoE-1B-7B-0125-Instruct" +accuracy_threshold: 0.69 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 4096 --enable-expert-parallel" diff --git a/tests/evals/gsm8k/configs/Phi-Tiny-MoE-Instruct.yaml b/tests/evals/gsm8k/configs/Phi-Tiny-MoE-Instruct.yaml new file mode 100644 index 000000000000..273754eb0e23 --- /dev/null +++ b/tests/evals/gsm8k/configs/Phi-Tiny-MoE-Instruct.yaml @@ -0,0 +1,5 @@ +model_name: "microsoft/Phi-tiny-MoE-instruct" +accuracy_threshold: 0.70 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 4096 --enable-expert-parallel" diff --git a/tests/evals/gsm8k/configs/Sarvam-30B.yaml b/tests/evals/gsm8k/configs/Sarvam-30B.yaml new file mode 100644 index 000000000000..7d528327dd34 --- /dev/null +++ b/tests/evals/gsm8k/configs/Sarvam-30B.yaml @@ -0,0 +1,5 @@ +model_name: "sarvamai/sarvam-30b" +accuracy_threshold: 0.66 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 2" diff --git a/tests/evals/gsm8k/configs/Step-3.5-Flash.yaml b/tests/evals/gsm8k/configs/Step-3.5-Flash.yaml new file mode 100644 index 000000000000..29a681dbf13c --- /dev/null +++ b/tests/evals/gsm8k/configs/Step-3.5-Flash.yaml @@ -0,0 +1,5 @@ +model_name: "stepfun-ai/Step-3.5-Flash" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 4096 
--tensor-parallel-size 4" diff --git a/tests/evals/gsm8k/configs/Trinity-Mini.yaml b/tests/evals/gsm8k/configs/Trinity-Mini.yaml new file mode 100644 index 000000000000..2e58926b2458 --- /dev/null +++ b/tests/evals/gsm8k/configs/Trinity-Mini.yaml @@ -0,0 +1,5 @@ +model_name: "arcee-ai/Trinity-Mini" +accuracy_threshold: 0.84 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 4096" diff --git a/tests/evals/gsm8k/configs/models-nightly-large.txt b/tests/evals/gsm8k/configs/models-nightly-large.txt new file mode 100644 index 000000000000..a1931bf0b527 --- /dev/null +++ b/tests/evals/gsm8k/configs/models-nightly-large.txt @@ -0,0 +1,2 @@ +DeepSeek-R1-TP.yaml +Step-3.5-Flash.yaml diff --git a/tests/evals/gsm8k/configs/models-nightly-small.txt b/tests/evals/gsm8k/configs/models-nightly-small.txt new file mode 100644 index 000000000000..57b4e2395a8c --- /dev/null +++ b/tests/evals/gsm8k/configs/models-nightly-small.txt @@ -0,0 +1,16 @@ +Trinity-Mini.yaml +Gemma-4-26B-A4B-it.yaml +GLM-4.7-Flash.yaml +GPT-OSS-20B.yaml +Granite-4.0-H-Small.yaml +Jamba2-Mini.yaml +LFM2.5-350M.yaml +MiniMax-M2.7.yaml +moe-refactor/Mixtral-8x7B-BF16-triton.yaml +Nemotron-3-Super-120B-A12B-NVFP4.yaml +OLMoE-1B-7B-0125-Instruct.yaml +Phi-Tiny-MoE-Instruct.yaml +Sarvam-30B.yaml +ERNIE-4.5-VL-28B-A3B-Thinking.yaml +# This model times out in weight loading. See #40535 +#moe-refactor/Llama-4-Scout-BF16-triton.yaml