Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions tests/quantization/test_compressed_tensors.py
Original file line number Diff line number Diff line change
Expand Up @@ -816,3 +816,26 @@ def check_model(model):
# Verify the model can generate output
output = llm.generate_greedy("Hello, my name is", max_tokens=4)
assert output


def test_w4a16_moe_torch_compile(vllm_runner):
    """Guard against a MoE quant_config regression under torch.compile.

    The MoE quant_config has to be initialized inside the moe_forward
    custom op rather than only in forward_native: Dynamo compiles
    forward_native, and attribute mutations performed there are not
    replayed at runtime.

    With the fix in _moe_forward/_moe_forward_shared absent, generation
    fails with::

        AssertionError: Hidden size mismatch 2048 != 1024

    because moe_quant_config stays None and use_int4_w4a16 remains False.
    """
    model_path = "nm-testing/tinysmokeqwen3moe-W4A16-first-only-CTstable"

    # Compiled path (enforce_eager=False) with CUDA graphs disabled so the
    # failure mode, if reintroduced, surfaces during plain generation.
    runner_kwargs = {
        "enforce_eager": False,
        "max_model_len": 256,
        "compilation_config": {"cudagraph_mode": "NONE"},
    }
    with vllm_runner(model_path, **runner_kwargs) as llm:
        result = llm.generate_greedy("Hi", max_tokens=1)
        assert result