diff --git a/tests/evals/gsm8k/configs/moe-refactor/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16-triton.yaml b/tests/evals/gsm8k/configs/moe-refactor/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16-triton.yaml new file mode 100644 index 000000000000..6c880b9e6124 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16-triton.yaml @@ -0,0 +1,5 @@ +model_name: "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" +accuracy_threshold: 0.85 +num_questions: 1319 +num_fewshot: 10 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" diff --git a/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt b/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt index 9d86e432e84f..15677b399113 100644 --- a/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt +++ b/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt @@ -11,3 +11,4 @@ Qwen3-30B-A3B-NvFp4-ModelOpt-marlin.yaml Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass-dp-ep.yaml +NVIDIA-Nemotron-3-Nano-30B-A3B-BF16-triton.yaml diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index bcda7b42c2ec..8856c98a1f61 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -733,6 +733,7 @@ def __init__( block_quant=False, tp_size=moe_config.moe_parallel_config.tp_size, with_lora_support=self.moe.is_lora_enabled, + is_act_and_mul=self.moe.is_act_and_mul, ) self.kernel: mk.FusedMoEModularKernel | None = None