@@ -8,5 +8,4 @@ server_args: >-
   --tensor-parallel-size 2
   --enable-expert-parallel
   --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}'
-env:
-  VLLM_USE_FLASHINFER_MOE_FP4: "1"
+  --moe-backend=flashinfer_trtllm
3 changes: 1 addition & 2 deletions tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
@@ -7,5 +7,4 @@ server_args: >-
   --tensor-parallel-size 2
   --enable-expert-parallel
   --async-scheduling
-env:
-  VLLM_USE_FLASHINFER_MOE_FP8: "1"
+  --moe-backend=flashinfer_trtllm
@@ -2,7 +2,6 @@ model_name: "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
 accuracy_threshold: 0.92
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --moe-backend=triton"
 env:
-  VLLM_USE_FLASHINFER_MOE_FP8: "0"
   VLLM_USE_DEEP_GEMM: "0"
@@ -2,7 +2,4 @@ model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4"
 accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_low_latency"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP4: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "masked_gemm"
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_low_latency --moe-backend=flashinfer_cutedsl"
@@ -2,7 +2,4 @@ model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4"
 accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP4: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --moe-backend=flashinfer_cutlass"
@@ -2,7 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
 accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_low_latency"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP4: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "masked_gemm"
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_low_latency --moe-backend=flashinfer_cutedsl"
@@ -2,7 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
 accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP4: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --moe-backend=flashinfer_cutlass"
@@ -2,7 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
 accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP4: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "latency"
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --moe-backend=flashinfer_trtllm"
@@ -2,8 +2,4 @@ model_name: "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 accuracy_threshold: 0.92
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --enable-expert-parallel"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP16: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
-
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --enable-expert-parallel --moe-backend=flashinfer_cutlass"
@@ -2,7 +2,4 @@ model_name: "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
 accuracy_threshold: 0.92
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP8: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=flashinfer_cutlass"
@@ -2,7 +2,4 @@ model_name: "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
 accuracy_threshold: 0.92
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP8: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "latency"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=flashinfer_trtllm"
@@ -2,6 +2,4 @@ model_name: "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
 accuracy_threshold: 0.92
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP8: "0"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=triton"
@@ -2,7 +2,4 @@ model_name: "mistralai/Mixtral-8x7B-v0.1"
 accuracy_threshold: 0.58
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --enable-expert-parallel"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP16: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --enable-expert-parallel --moe-backend=flashinfer_cutlass"
@@ -3,7 +3,4 @@
 # accuracy_threshold: 0.62
 # num_questions: 1319
 # num_fewshot: 5
-# server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
-# env:
-#   VLLM_USE_FLASHINFER_MOE_FP8: "1"
-#   VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+# server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=flashinfer_cutlass"
@@ -2,7 +2,4 @@ model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
 accuracy_threshold: 0.29
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP8: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "latency"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=flashinfer_trtllm"
@@ -2,7 +2,4 @@ model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4"
 accuracy_threshold: 0.29
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP4: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=flashinfer_cutlass"
@@ -2,6 +2,4 @@ model_name: "Qwen/Qwen3-30B-A3B"
 accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --enable-expert-parallel"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP16: "1"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --enable-expert-parallel --moe-backend=flashinfer_cutlass"
@@ -2,7 +2,4 @@ model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
 accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP8: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=flashinfer_cutlass"
@@ -2,7 +2,4 @@ model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
 accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP8: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "latency"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=flashinfer_trtllm"
@@ -2,7 +2,6 @@ model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
 accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=triton"
 env:
-  VLLM_USE_FLASHINFER_MOE_FP8: "0"
   VLLM_USE_DEEP_GEMM: "0"
@@ -2,7 +2,4 @@ model_name: "RedHatAI/Qwen3-30B-A3B-FP8-block"
 accuracy_threshold: 0.85
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP8: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=flashinfer_cutlass"
@@ -2,7 +2,6 @@ model_name: "RedHatAI/Qwen3-30B-A3B-FP8-block"
 accuracy_threshold: 0.85
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=triton"
 env:
-  VLLM_USE_FLASHINFER_MOE_FP8: "0"
   VLLM_USE_DEEP_GEMM: "0"
@@ -2,7 +2,4 @@ model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4"
 accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP4: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=flashinfer_cutlass"
@@ -2,7 +2,4 @@ model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4"
 accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP4: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "latency"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=flashinfer_trtllm"
@@ -2,6 +2,4 @@ model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4"
 accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP4: "0"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=cutlass"
@@ -2,7 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
 accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP4: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=flashinfer_cutlass"
@@ -2,7 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
 accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP4: "1"
-  VLLM_FLASHINFER_MOE_BACKEND: "latency"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=flashinfer_trtllm"
@@ -2,6 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-NVFP4"
 accuracy_threshold: 0.88
 num_questions: 1319
 num_fewshot: 5
-server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
-env:
-  VLLM_USE_FLASHINFER_MOE_FP4: "0"
+server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=cutlass"
70 changes: 42 additions & 28 deletions tests/quantization/test_blackwell_moe.py
@@ -85,43 +85,46 @@ def can_initialize(
     )
 )
 def test_llama4_fp8_tensor_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
-    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
     can_initialize(
-        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
+        hf_overrides=HF_OVERRIDE_MM,
+        extra_args=["--moe-backend=flashinfer_cutlass"],
     )


 def test_llama4_fp8_tensor_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
-    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
     can_initialize(
-        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
+        hf_overrides=HF_OVERRIDE_MM,
+        extra_args=["--moe-backend=flashinfer_trtllm"],
    )


 def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
-    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
     can_initialize(
-        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+        hf_overrides=HF_OVERRIDE_MM,
+        extra_args=["--moe-backend=flashinfer_cutlass"],
    )


 def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
-    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
     can_initialize(
-        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+        hf_overrides=HF_OVERRIDE_MM,
+        extra_args=["--moe-backend=flashinfer_trtllm"],
    )


 ## DeepSeekV3 ##


 def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
-    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
+    can_initialize(
+        "deepseek-ai/DeepSeek-V3.1",
+        hf_overrides=HF_OVERRIDE_TEXT,
+        extra_args=["--moe-backend=deep_gemm"],
+    )


 @pytest.mark.skip(
@@ -131,27 +134,35 @@ def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
     )
 )
 def test_deepseek_fp8_block_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
-    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
+    can_initialize(
+        "deepseek-ai/DeepSeek-V3.1",
+        hf_overrides=HF_OVERRIDE_TEXT,
+        extra_args=["--moe-backend=flashinfer_cutlass"],
+    )


 def test_deepseek_fp8_block_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
-    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
+    can_initialize(
+        "deepseek-ai/DeepSeek-V3.1",
+        hf_overrides=HF_OVERRIDE_TEXT,
+        extra_args=["--moe-backend=flashinfer_trtllm"],
+    )


 def test_deepseek_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
-    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)
+    can_initialize(
+        "nvidia/DeepSeek-R1-0528-FP4-v2",
+        hf_overrides=HF_OVERRIDE_TEXT,
+        extra_args=["--moe-backend=flashinfer_cutlass"],
+    )


 def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
-    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)
+    can_initialize(
+        "nvidia/DeepSeek-R1-0528-FP4-v2",
+        hf_overrides=HF_OVERRIDE_TEXT,
+        extra_args=["--moe-backend=flashinfer_trtllm"],
+    )


 ## GPT-OSS ##
@@ -184,5 +195,8 @@ def test_gptoss_eager(monkeypatch: pytest.MonkeyPatch):


 def test_qwen3_next_bf16_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1")
-    can_initialize("Qwen/Qwen3-Next-80B-A3B-Instruct", hf_overrides=HF_OVERRIDE_TEXT)
+    can_initialize(
+        "Qwen/Qwen3-Next-80B-A3B-Instruct",
+        hf_overrides=HF_OVERRIDE_TEXT,
+        extra_args=["--moe-backend=flashinfer_trtllm"],
+    )
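
For readers reconstructing a whole file from these fragments: after this change each eval config selects the MoE kernel inline in server_args via --moe-backend (values appearing in this diff: triton, cutlass, deep_gemm, flashinfer_cutlass, flashinfer_cutedsl, flashinfer_trtllm) rather than through VLLM_USE_FLASHINFER_MOE_* and VLLM_FLASHINFER_MOE_BACKEND environment variables. A minimal sketch of one full config in the new style, with values copied from the RedHatAI/Qwen3-30B-A3B-NVFP4 hunks above and the field order assumed from the hunk context:

# Assembled example, not a file from this PR; values taken from the hunks above.
model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4"
accuracy_threshold: 0.88
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --moe-backend=flashinfer_trtllm"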