From 2aa9c1cb9b24485f5b205f6e7476f2720c652f9c Mon Sep 17 00:00:00 2001
From: MerkyorLynn <268568828+MerkyorLynn@users.noreply.github.com>
Date: Sat, 6 Jun 2026 00:59:35 +0800
Subject: [PATCH] Add ModelOpt W4A16 lm_head regression tests

Signed-off-by: MerkyorLynn <268568828+MerkyorLynn@users.noreply.github.com>
---
Reviewer notes (ignored by git-am):
  * test_qwen3_moe_lm_head_receives_quant_config asserts that
    Qwen3MoeForCausalLM passes vllm_config.quant_config to ParallelLMHead
    as the ``quant_config`` keyword argument.
  * test_modelopt_mixed_precision_quantizes_w4a16_parallel_lm_head asserts
    that a mixed-precision config whose "lm_head" entry uses W4A16_NVFP4
    yields ModelOptNvFp4W4A16LinearMethod for the mocked LM head, for both
    the "lm_head" and "model.lm_head" prefixes.

 .../test_qwen3_5_quantization.py              | 48 +++++++++++++++++++
 tests/quantization/test_modelopt.py           | 17 +++++++
 2 files changed, 65 insertions(+)

diff --git a/tests/model_executor/test_qwen3_5_quantization.py b/tests/model_executor/test_qwen3_5_quantization.py
index 7100990de71f..d18b46820328 100644
--- a/tests/model_executor/test_qwen3_5_quantization.py
+++ b/tests/model_executor/test_qwen3_5_quantization.py
@@ -76,3 +76,51 @@ def test_qwen3_5_mtp_lm_head_receives_quant_config():
     MockLMHead.assert_called_once()
     call_kwargs = MockLMHead.call_args.kwargs
     assert call_kwargs["quant_config"] is mock_quant_config
+
+
+def test_qwen3_moe_lm_head_receives_quant_config():
+    from vllm.model_executor.models.qwen3_moe import (
+        Qwen3MoeDecoderLayer,
+        Qwen3MoeForCausalLM,
+        Qwen3MoeSparseMoeBlock,
+    )
+
+    mock_quant_config = Mock()
+
+    mock_hf_config = Mock()
+    mock_hf_config.tie_word_embeddings = False
+    mock_hf_config.vocab_size = 128
+    mock_hf_config.hidden_size = 64
+
+    mock_vllm_config = Mock()
+    mock_vllm_config.model_config.hf_text_config = mock_hf_config
+    mock_vllm_config.quant_config = mock_quant_config
+
+    # Build just enough of a Qwen3-MoE layer to satisfy the metadata scan in
+    # Qwen3MoeForCausalLM.__init__ without constructing the full model.
+    fake_layer = object.__new__(Qwen3MoeDecoderLayer)
+    fake_mlp = object.__new__(Qwen3MoeSparseMoeBlock)
+    for attr, value in {
+        "experts": Mock(),
+        "n_logical_experts": 2,
+        "n_physical_experts": 2,
+        "n_local_physical_experts": 2,
+        "n_routed_experts": 2,
+        "n_redundant_experts": 0,
+    }.items():
+        object.__setattr__(fake_mlp, attr, value)
+    object.__setattr__(fake_layer, "mlp", fake_mlp)
+
+    with (
+        patch("vllm.model_executor.models.qwen3_moe.Qwen3MoeModel") as MockModel,
+        patch("vllm.model_executor.models.qwen3_moe.ParallelLMHead") as MockLMHead,
+        patch("vllm.model_executor.models.qwen3_moe.LogitsProcessor"),
+    ):
+        MockModel.return_value.make_empty_intermediate_tensors = Mock()
+        MockModel.return_value.layers = [fake_layer]
+
+        Qwen3MoeForCausalLM(vllm_config=mock_vllm_config)
+
+    MockLMHead.assert_called_once()
+    call_kwargs = MockLMHead.call_args.kwargs
+    assert call_kwargs["quant_config"] is mock_quant_config
diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py
index 2655295c8597..99e414f2cfcc 100644
--- a/tests/quantization/test_modelopt.py
+++ b/tests/quantization/test_modelopt.py
@@ -20,6 +20,7 @@
     ModelOptMixedPrecisionConfig,
     ModelOptNvFp4Config,
     ModelOptNvFp4LinearMethod,
+    ModelOptNvFp4W4A16LinearMethod,
 )
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -126,6 +127,22 @@ def test_modelopt_mixed_precision_quantizes_parallel_lm_head():
     assert isinstance(method, ModelOptNvFp4LinearMethod)
 
 
+@pytest.mark.parametrize("prefix", ["lm_head", "model.lm_head"])
+def test_modelopt_mixed_precision_quantizes_w4a16_parallel_lm_head(prefix):
+    """Official ModelOpt mixed-precision NVFP4 checkpoints may quantize
+    ``lm_head`` as W4A16_NVFP4 instead of leaving it BF16. Keep this covered
+    separately from generic linear layers because LM heads are implemented by
+    ``ParallelLMHead`` rather than ``LinearBase``.
+    """
+    config = _mixed_precision_config(
+        {"lm_head": {"quant_algo": "W4A16_NVFP4", "group_size": 16}}
+    )
+
+    method = config.get_quant_method(_mock_lm_head(), prefix=prefix)
+
+    assert isinstance(method, ModelOptNvFp4W4A16LinearMethod)
+
+
 def test_vocab_parallel_embedding_weight_loader_accepts_scalar_scale():
     holder = Mock()
     scale = torch.nn.Parameter(torch.empty(1))