From 8b88807037f137d8e0e2b43b0ec030f3dcca398c Mon Sep 17 00:00:00 2001
From: Matthias Gehre <matthias.gehre@amd.com>
Date: Wed, 18 Feb 2026 02:47:10 -0700
Subject: [PATCH] [Bugfix] Add regression test for MoE quant_config under
 torch.compile

The code fix landed via #34371 (31d992d). This adds a regression test
to guard it: test_w4a16_moe_torch_compile loads a W4A16 MoE model with
enforce_eager=False and verifies that inference succeeds without the
"Hidden size mismatch" assertion error.

Signed-off-by: Matthias Gehre <matthias.gehre@amd.com>
---
 tests/quantization/test_compressed_tensors.py | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 795591ec35e6..e5a047a7c34a 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -816,3 +816,26 @@ def check_model(model):
         # Verify the model can generate output
         output = llm.generate_greedy("Hello, my name is", max_tokens=4)
         assert output
+
+
+def test_w4a16_moe_torch_compile(vllm_runner):
+    """Regression test: MoE quant_config must be initialized inside the
+    moe_forward custom op, not just in forward_native which is compiled by
+    Dynamo (attribute mutations are not replayed at runtime).
+
+    Without the fix in _moe_forward/_moe_forward_shared, this hits:
+        AssertionError: Hidden size mismatch 2048 != 1024
+    because use_int4_w4a16 is False (moe_quant_config stays None).
+    """
+    model_path = "nm-testing/tinysmokeqwen3moe-W4A16-first-only-CTstable"
+
+    with vllm_runner(
+        model_path,
+        enforce_eager=False,  # presumably routes through torch.compile; confirm
+        max_model_len=256,  # short context; the prompt below is a single word
+        compilation_config={
+            "cudagraph_mode": "NONE",  # presumably skips CUDA graphs; confirm
+        },
+    ) as llm:
+        output = llm.generate_greedy("Hi", max_tokens=1)  # one token suffices
+        assert output  # smoke check; the regression raises before this line