From 8b88807037f137d8e0e2b43b0ec030f3dcca398c Mon Sep 17 00:00:00 2001
From: Matthias Gehre
Date: Wed, 18 Feb 2026 02:47:10 -0700
Subject: [PATCH] [Bugfix] Add regression test for MoE quant_config under torch.compile

The code fix landed via #34371 (31d992d). This adds a regression test
to prevent future regressions:

test_w4a16_moe_torch_compile loads a W4A16 MoE model with
enforce_eager=False and verifies inference succeeds without the
"Hidden size mismatch" assertion error.

Signed-off-by: Matthias Gehre
---
 tests/quantization/test_compressed_tensors.py | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 795591ec35e6..e5a047a7c34a 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -816,3 +816,26 @@ def check_model(model):
         # Verify the model can generate output
         output = llm.generate_greedy("Hello, my name is", max_tokens=4)
         assert output
+
+
+def test_w4a16_moe_torch_compile(vllm_runner):
+    """Regression test: MoE quant_config must be initialized inside the
+    moe_forward custom op, not just in forward_native which is compiled by
+    Dynamo (attribute mutations are not replayed at runtime).
+
+    Without the fix in _moe_forward/_moe_forward_shared, this hits:
+        AssertionError: Hidden size mismatch 2048 != 1024
+    because use_int4_w4a16 is False (moe_quant_config stays None).
+    """
+    model_path = "nm-testing/tinysmokeqwen3moe-W4A16-first-only-CTstable"
+
+    with vllm_runner(
+        model_path,
+        enforce_eager=False,
+        max_model_len=256,
+        compilation_config={
+            "cudagraph_mode": "NONE",
+        },
+    ) as llm:
+        output = llm.generate_greedy("Hi", max_tokens=1)
+        assert output