diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml new file mode 100644 index 000000000000..1328fdedf0c4 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml @@ -0,0 +1,8 @@ +model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_low_latency" +env: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_FLASHINFER_MOE_BACKEND: "masked_gemm" diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutedsl-deepep-ll.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutedsl-deepep-ll.yaml new file mode 100644 index 000000000000..87fac0e708c5 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutedsl-deepep-ll.yaml @@ -0,0 +1,8 @@ +model_name: "nvidia/Qwen3-30B-A3B-NVFP4" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_low_latency" +env: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_FLASHINFER_MOE_BACKEND: "masked_gemm" diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml new file mode 100644 index 000000000000..44f8700e4b46 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml @@ -0,0 +1,8 @@ +model_name: "nvidia/Qwen3-30B-A3B-NVFP4" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel" +env: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_FLASHINFER_MOE_BACKEND: "throughput" diff --git a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt index c1b405fd1d00..53e2fa8a7dd1 100644 --- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt +++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt @@ -1,8 +1,10 @@ +Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml +Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutedsl-deepep-ll.yaml +Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml +Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ht.yaml Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ht.yaml Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml -Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml -Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass-fi-a2av.yaml diff --git a/tests/evals/gsm8k/configs/moe-refactor/config-test.txt b/tests/evals/gsm8k/configs/moe-refactor/config-test.txt new file mode 100644 index 000000000000..1816666bec0a --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/config-test.txt @@ -0,0 +1 @@ +Qwen3-30B-A3B-NvFp4-CT-marlin.yaml \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index eaf45ead5afd..912ff5a4a12a 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -456,7 +456,7 @@ def prepare_nvfp4_moe_layer_for_fi_or_cutlass( NvFp4MoeBackend.VLLM_CUTLASS, NvFp4MoeBackend.FLASHINFER_CUTLASS, NvFp4MoeBackend.FLASHINFER_TRTLLM, - NvFp4MoeBackend.FLASHINFER_TRTLLM, + NvFp4MoeBackend.FLASHINFER_CUTEDSL, ] # Reorder [w1, w3] to [w3, w1] for FI NVFP4 MoE kernels.