Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .buildkite/test_areas/lm_eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,19 @@ steps:
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt

- label: MoE Nightly Integration Test (H100)
device: h100
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/models-nightly-small.txt

- label: MoE Nightly Integration Test (B200)
device: b200
optional: true
num_devices: 2
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/models-nightly-small.txt

- label: LM Eval TurboQuant KV Cache
timeout_in_minutes: 75
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
model_name: "baidu/ERNIE-4.5-VL-28B-A3B-Thinking"
accuracy_threshold: 0.75
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 2"
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/GLM-4.7-Flash.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
model_name: "zai-org/GLM-4.7-Flash"
accuracy_threshold: 0.84
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 4096"
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/GPT-OSS-20B.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
model_name: "openai/gpt-oss-20b"
accuracy_threshold: 0.31
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 2"
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/Gemma-4-26B-A4B-it.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
model_name: "google/gemma-4-26B-A4B-it"
accuracy_threshold: 0.30
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 2 --enable-expert-parallel"
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/Granite-4.0-H-Small.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
model_name: "ibm-granite/granite-4.0-h-small"
accuracy_threshold: 0.85
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 4096"
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/Jamba2-Mini.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
model_name: "ai21labs/AI21-Jamba2-Mini"
accuracy_threshold: 0.76
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 4096"
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/LFM2.5-350M.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
model_name: "LiquidAI/LFM2.5-350M"
accuracy_threshold: 0.21
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 4096"
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/MiniMax-M2.7.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
model_name: "MiniMaxAI/MiniMax-M2.7"
accuracy_threshold: 0.92
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 2"
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/OLMoE-1B-7B-0125-Instruct.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
model_name: "allenai/OLMoE-1B-7B-0125-Instruct"
accuracy_threshold: 0.69
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 4096 --enable-expert-parallel"
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/Phi-Tiny-MoE-Instruct.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
model_name: "microsoft/Phi-tiny-MoE-instruct"
accuracy_threshold: 0.70
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 4096 --enable-expert-parallel"
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/Sarvam-30B.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
model_name: "sarvamai/sarvam-30b"
accuracy_threshold: 0.66
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 2"
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/Step-3.5-Flash.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
model_name: "stepfun-ai/Step-3.5-Flash"
accuracy_threshold: 0.88
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 4096 --tensor-parallel-size 4"
5 changes: 5 additions & 0 deletions tests/evals/gsm8k/configs/Trinity-Mini.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
model_name: "arcee-ai/Trinity-Mini"
accuracy_threshold: 0.84
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 4096"
2 changes: 2 additions & 0 deletions tests/evals/gsm8k/configs/models-nightly-large.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
DeepSeek-R1-TP.yaml
Comment thread
bnellnm marked this conversation as resolved.
Step-3.5-Flash.yaml
16 changes: 16 additions & 0 deletions tests/evals/gsm8k/configs/models-nightly-small.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Trinity-Mini.yaml
Gemma-4-26B-A4B-it.yaml
GLM-4.7-Flash.yaml
GPT-OSS-20B.yaml
Granite-4.0-H-Small.yaml
Jamba2-Mini.yaml
LFM2.5-350M.yaml
MiniMax-M2.7.yaml
moe-refactor/Mixtral-8x7B-BF16-triton.yaml
Nemotron-3-Super-120B-A12B-NVFP4.yaml
OLMoE-1B-7B-0125-Instruct.yaml
Phi-Tiny-MoE-Instruct.yaml
Sarvam-30B.yaml
ERNIE-4.5-VL-28B-A3B-Thinking.yaml
# This model times out in weight loading. See #40535
#moe-refactor/Llama-4-Scout-BF16-triton.yaml
Loading