From 12c729dcdec7890e0f2970025648197f0e56a77a Mon Sep 17 00:00:00 2001 From: marksverdhei Date: Thu, 29 Jan 2026 13:17:38 +0100 Subject: [PATCH 1/3] [Perf] Regional torch.compile for code predictor decoder layers Apply regionally_compile(mode="reduce-overhead") to the 5 Qwen3TTSDecoderLayer blocks inside the code predictor, reducing per-kernel launch overhead across the 31-iteration generate loop. Uses the existing diffusion/compile.py pattern. Falls back gracefully if compilation fails on a given GPU/CUDA version. Also adds a debug log for code_predictor.config.use_cache to inform Phase 1b KV cache work. Co-Authored-By: Claude Opus 4.5 --- .../models/qwen3_tts/test_qwen3_tts.py | 334 ++++++++++++++++++ .../models/qwen3_tts/modeling_qwen3_tts.py | 1 + .../models/qwen3_tts/qwen3_tts.py | 12 + 3 files changed, 347 insertions(+) create mode 100644 tests/model_executor/models/qwen3_tts/test_qwen3_tts.py diff --git a/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py b/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py new file mode 100644 index 00000000000..4f5a955fa3e --- /dev/null +++ b/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py @@ -0,0 +1,334 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Regression tests for Qwen3 TTS model wrapper. + +Tests cover: + - Profile run short-circuit (regression for PR #1082) + - Flash-attn detection and fallback + - Code predictor regional compilation (Phase 1a) + +These tests mock heavy dependencies (vllm, transformers, librosa, etc.) so +they can run without GPU, model weights, or the full vllm engine. + +The module under test is compiled and executed in a synthetic namespace +to completely bypass the vllm_omni.__init__ import chain. +""" + +import logging +import sys +import types +from pathlib import Path +from typing import NamedTuple +from unittest.mock import MagicMock, patch + +import numpy as np +import torch + +# --------------------------------------------------------------------------- +# Bootstrap: build a minimal set of stub modules, then compile+exec the +# target .py file in a module whose __package__ resolves relative imports +# to our stubs. +# --------------------------------------------------------------------------- + +_REPO = Path(__file__).resolve().parents[4] # repo root +_TARGET = _REPO / "vllm_omni" / "model_executor" / "models" / "qwen3_tts" / "qwen3_tts.py" + +# Full set of modules the target file references (directly or via relative +# imports that we intercept). +_STUB_FQNS = [ + # vllm (direct imports in qwen3_tts.py) + "vllm", + "vllm.config", + "vllm.logger", + "vllm.sequence", + # transformers (direct import in qwen3_tts.py) + "transformers", + # audio I/O libs (direct imports in qwen3_tts.py) + "librosa", + "soundfile", + # vllm_omni package tree (relative imports resolve to these) + "vllm_omni", + "vllm_omni.diffusion", + "vllm_omni.diffusion.compile", + "vllm_omni.model_executor", + "vllm_omni.model_executor.models", + "vllm_omni.model_executor.models.output_templates", + "vllm_omni.model_executor.models.qwen3_tts", + "vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts", + "vllm_omni.model_executor.models.qwen3_tts.modeling_qwen3_tts", + "vllm_omni.model_executor.models.qwen3_tts.processing_qwen3_tts", +] + +_saved_modules: dict[str, types.ModuleType | None] = {} + + +def _make_stub(fqn: str) -> types.ModuleType: + parts = fqn.split(".") + for i in range(1, len(parts) + 1): + key = ".".join(parts[:i]) + if key not in sys.modules: + mod = types.ModuleType(key) + mod.__path__ = [str(_REPO / key.replace(".", "/"))] + mod.__package__ = key + mod.__spec__ = None + sys.modules[key] = mod + return sys.modules[fqn] + + +_mock_regionally_compile = MagicMock(name="regionally_compile") + + +def _setup(): + # Save originals so they can be restored if needed + for fqn in _STUB_FQNS: + _saved_modules[fqn] = sys.modules.get(fqn) + _make_stub(fqn) + + # Wire parent.child attributes + for fqn in _STUB_FQNS: + parts = fqn.split(".") + if len(parts) > 1: + parent = sys.modules.get(".".join(parts[:-1])) + child = sys.modules.get(fqn) + if parent and child: + setattr(parent, parts[-1], child) + + # ---- Concrete stubs for names the target file actually uses ---- + + # vllm.logger + sys.modules["vllm.logger"].init_logger = lambda name: logging.getLogger(name) + + # vllm.config + sys.modules["vllm.config"].VllmConfig = type("VllmConfig", (), {}) + + # vllm.sequence + class IntermediateTensors: + def __init__(self, d=None): + self.tensors = d or {} + sys.modules["vllm.sequence"].IntermediateTensors = IntermediateTensors + + # OmniOutput (from vllm_omni.model_executor.models.output_templates) + class OmniOutput(NamedTuple): + text_hidden_states: object + multimodal_outputs: dict | None = None + intermediate_tensors: object | None = None + next_token_id: object | None = None + sys.modules["vllm_omni.model_executor.models.output_templates"].OmniOutput = OmniOutput + + # vllm_omni.diffusion.compile + sys.modules["vllm_omni.diffusion.compile"].regionally_compile = _mock_regionally_compile + + # Relative-import siblings + sys.modules[ + "vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts" + ].Qwen3TTSConfig = type("Qwen3TTSConfig", (), {}) + + sys.modules[ + "vllm_omni.model_executor.models.qwen3_tts.modeling_qwen3_tts" + ].Qwen3TTSForConditionalGeneration = type("Qwen3TTSForConditionalGeneration", (), {}) + + sys.modules[ + "vllm_omni.model_executor.models.qwen3_tts.processing_qwen3_tts" + ].Qwen3TTSProcessor = type("Qwen3TTSProcessor", (), {}) + + # transformers + sys.modules["transformers"].AutoConfig = MagicMock() + sys.modules["transformers"].AutoModel = MagicMock() + sys.modules["transformers"].AutoProcessor = MagicMock() + + +_setup() + +# Compile and exec the target file in a synthetic module, setting __package__ +# so that `from .foo import bar` resolves via sys.modules, not the file system. +_MOD_FQN = "vllm_omni.model_executor.models.qwen3_tts.qwen3_tts" +_mod = types.ModuleType(_MOD_FQN) +_mod.__file__ = str(_TARGET) +_mod.__package__ = "vllm_omni.model_executor.models.qwen3_tts" +_mod.__spec__ = None +sys.modules[_MOD_FQN] = _mod + +_source = _TARGET.read_text() +_code = compile(_source, str(_TARGET), "exec") +exec(_code, _mod.__dict__) # noqa: S102 + +Qwen3TTSModelForGeneration = _mod.Qwen3TTSModelForGeneration +Qwen3TTSModel = _mod.Qwen3TTSModel + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_vllm_config(model_path: str = "Qwen/Qwen3-TTS-12Hz-0.6B-Base") -> MagicMock: + cfg = MagicMock() + cfg.model_config.model = model_path + return cfg + + +def _make_wrapper(vllm_config=None): + """Instantiate the wrapper with a mocked Qwen3TTSModel.from_pretrained.""" + if vllm_config is None: + vllm_config = _make_vllm_config() + + with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp: + mock_fp.return_value = MagicMock() + wrapper = Qwen3TTSModelForGeneration(vllm_config=vllm_config) + + return wrapper + + +def _builtins_import(): + import builtins + return builtins.__import__ + + +# --------------------------------------------------------------------------- +# Test A: Profile run short-circuit (regression for PR #1082) +# --------------------------------------------------------------------------- + +class TestProfileRunShortCircuit: + """Empty text triggers a dummy audio return instead of hanging.""" + + def test_empty_text_returns_dummy_audio(self): + wrapper = _make_wrapper() + result = wrapper.forward( + runtime_additional_information=[{"text": [""]}], + ) + + assert result.multimodal_outputs is not None + audio = result.multimodal_outputs["model_outputs"] + assert audio.shape == (24000,) + assert result.multimodal_outputs["sr"].item() == 24000 + + def test_empty_text_skips_generation(self): + wrapper = _make_wrapper() + wrapper.forward( + runtime_additional_information=[{"text": [""]}], + ) + + model = wrapper.model + model.generate_voice_clone.assert_not_called() + model.generate_custom_voice.assert_not_called() + model.generate_voice_design.assert_not_called() + + def test_nonempty_text_proceeds_to_generation(self): + wrapper = _make_wrapper() + dummy_wav = np.zeros(24000, dtype=np.float32) + wrapper.model.generate_voice_clone.return_value = ([dummy_wav], 24000) + + wrapper.forward( + runtime_additional_information=[{ + "text": ["Hello"], + "task_type": ["Base"], + "language": ["Auto"], + }], + ) + + wrapper.model.generate_voice_clone.assert_called_once() + + +# --------------------------------------------------------------------------- +# Test B: Flash-attn detection +# --------------------------------------------------------------------------- + +class TestFlashAttnDetection: + """Verify attn_implementation kwarg passed to Qwen3TTSModel.from_pretrained.""" + + def test_no_attn_kwarg_without_flash_attn(self): + """When flash_attn is not importable, from_pretrained gets no attn_implementation.""" + saved = sys.modules.pop("flash_attn", None) + try: + real_import = _builtins_import() + + def _fake_import(name, *args, **kwargs): + if name == "flash_attn": + raise ImportError("mocked: no flash_attn") + return real_import(name, *args, **kwargs) + + vllm_config = _make_vllm_config() + with ( + patch("builtins.__import__", side_effect=_fake_import), + patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp, + ): + mock_fp.return_value = MagicMock() + Qwen3TTSModelForGeneration(vllm_config=vllm_config) + + mock_fp.assert_called_once() + _, call_kwargs = mock_fp.call_args + assert "attn_implementation" not in call_kwargs, \ + f"Expected no attn_implementation kwarg, got: {call_kwargs}" + finally: + if saved is not None: + sys.modules["flash_attn"] = saved + + def test_flash_attn_preferred_when_available(self): + """When flash_attn is importable, from_pretrained receives flash_attention_2.""" + vllm_config = _make_vllm_config() + + fake_flash = types.ModuleType("flash_attn") + sys.modules["flash_attn"] = fake_flash + try: + with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp: + mock_fp.return_value = MagicMock() + Qwen3TTSModelForGeneration(vllm_config=vllm_config) + + mock_fp.assert_called_once() + _, call_kwargs = mock_fp.call_args + assert call_kwargs.get("attn_implementation") == "flash_attention_2", \ + f"Expected attn_implementation='flash_attention_2', got: {call_kwargs}" + finally: + sys.modules.pop("flash_attn", None) + + +# --------------------------------------------------------------------------- +# Test C: Code predictor regional compilation (Phase 1a) +# --------------------------------------------------------------------------- + +class TestCodePredictorCompilation: + """Verify regionally_compile is called on the code predictor model.""" + + def test_regionally_compile_called_on_init(self): + """regionally_compile is called with the code predictor model during __init__.""" + _mock_regionally_compile.reset_mock() + + vllm_config = _make_vllm_config() + with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp: + mock_model = MagicMock() + mock_fp.return_value = mock_model + Qwen3TTSModelForGeneration(vllm_config=vllm_config) + + _mock_regionally_compile.assert_called_once() + call_args, call_kwargs = _mock_regionally_compile.call_args + # First positional arg is the code predictor's inner model + assert call_args[0] is mock_model.model.code_predictor.model + assert call_kwargs.get("mode") == "reduce-overhead" + + def test_repeated_blocks_set_before_compile(self): + """_repeated_blocks attribute is set on the code predictor model.""" + _mock_regionally_compile.reset_mock() + + vllm_config = _make_vllm_config() + with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp: + mock_model = MagicMock() + mock_fp.return_value = mock_model + Qwen3TTSModelForGeneration(vllm_config=vllm_config) + + cp_model = mock_model.model.code_predictor.model + assert cp_model._repeated_blocks == ["Qwen3TTSDecoderLayer"] + + def test_compile_failure_does_not_crash(self): + """If regionally_compile raises, __init__ still succeeds.""" + _mock_regionally_compile.reset_mock() + _mock_regionally_compile.side_effect = RuntimeError("compile failed") + + try: + vllm_config = _make_vllm_config() + with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp: + mock_fp.return_value = MagicMock() + # Should not raise + wrapper = Qwen3TTSModelForGeneration(vllm_config=vllm_config) + assert wrapper is not None + finally: + _mock_regionally_compile.side_effect = None diff --git a/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py b/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py index 75fe5dbf403..455ec8ae69b 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py +++ b/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py @@ -1629,6 +1629,7 @@ def forward( # Generate else: last_id_hidden = self.get_input_embeddings()(input_ids) + logger.debug("Code predictor generate: use_cache=%s", self.code_predictor.config.use_cache) predictor_result = self.code_predictor.generate( inputs_embeds=torch.cat((past_hidden, last_id_hidden), dim=1), max_new_tokens=self.config.num_code_groups - 1, diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py index 73c7c2743c3..788b072eabd 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py @@ -81,6 +81,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): torch_dtype=torch.bfloat16, **attn_kwargs, ) + + # Compile code predictor decoder layers for reduced kernel launch overhead + try: + from vllm_omni.diffusion.compile import regionally_compile + + code_predictor_model = self.model.model.code_predictor.model + code_predictor_model._repeated_blocks = ["Qwen3TTSDecoderLayer"] + regionally_compile(code_predictor_model, mode="reduce-overhead") + logger.info("Code predictor decoder layers compiled with torch.compile.") + except Exception as e: + logger.warning("Failed to compile code predictor layers: %s. Continuing without compilation.", e) + self.task_type = model_path.split("-")[-1].strip("/") # Mark that this model produces multimodal outputs self.have_multimodal_outputs = True From d00e92af56ea9459f1e6b208a7454b6810956877 Mon Sep 17 00:00:00 2001 From: marksverdhei Date: Thu, 29 Jan 2026 13:48:41 +0100 Subject: [PATCH 2/3] Fix critical review findings for code predictor compilation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix attribute chain: .model.model.code_predictor.model → .model.model.talker.code_predictor.model (was silently failing) - Use dynamic=True instead of mode="reduce-overhead" to avoid CUDA graph shape mismatches in autoregressive generate() loop - Narrow except: separate ImportError from RuntimeError to avoid masking real bugs like wrong attribute paths - Add enforce_eager gate: skip compilation when enforce_eager=True, matching the diffusion model runner pattern - Use structured mock in tests to catch attribute chain bugs - Add test for enforce_eager gate Co-Authored-By: Claude Opus 4.5 --- .../models/qwen3_tts/test_qwen3_tts.py | 75 +++++++++++++++---- .../models/qwen3_tts/qwen3_tts.py | 27 ++++--- 2 files changed, 78 insertions(+), 24 deletions(-) diff --git a/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py b/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py index 4f5a955fa3e..307ea193e54 100644 --- a/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py +++ b/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py @@ -161,9 +161,13 @@ class OmniOutput(NamedTuple): # Helpers # --------------------------------------------------------------------------- -def _make_vllm_config(model_path: str = "Qwen/Qwen3-TTS-12Hz-0.6B-Base") -> MagicMock: +def _make_vllm_config( + model_path: str = "Qwen/Qwen3-TTS-12Hz-0.6B-Base", + enforce_eager: bool = False, +) -> MagicMock: cfg = MagicMock() cfg.model_config.model = model_path + cfg.model_config.enforce_eager = enforce_eager return cfg @@ -286,49 +290,92 @@ def test_flash_attn_preferred_when_available(self): # Test C: Code predictor regional compilation (Phase 1a) # --------------------------------------------------------------------------- +def _make_structured_model_mock(): + """Build a mock with explicit attribute hierarchy matching the real model. + + Real hierarchy: + Qwen3TTSModel (wrapper .model attr) + └── Qwen3TTSForConditionalGeneration (.model on wrapper) + └── .talker (Qwen3TTSTalkerForConditionalGeneration) + └── .code_predictor (Qwen3TTSTalkerCodePredictorModelForConditionalGeneration) + └── .model (Qwen3TTSTalkerCodePredictorModel — has .layers) + + Using a structured mock prevents MagicMock from auto-creating wrong paths + (e.g. .model.model.code_predictor.model without .talker). + """ + cp_inner_model = MagicMock(name="Qwen3TTSTalkerCodePredictorModel") + code_predictor = MagicMock(name="CodePredictorForCG") + code_predictor.model = cp_inner_model + + talker = MagicMock(name="TalkerForCG") + talker.code_predictor = code_predictor + + hf_model = MagicMock(name="Qwen3TTSForConditionalGeneration") + hf_model.talker = talker + # Ensure accessing .code_predictor directly on hf_model raises, + # so tests would fail if the production code skips .talker + del hf_model.code_predictor + + wrapper_model = MagicMock(name="Qwen3TTSModel") + wrapper_model.model = hf_model + + return wrapper_model, cp_inner_model + + class TestCodePredictorCompilation: """Verify regionally_compile is called on the code predictor model.""" def test_regionally_compile_called_on_init(self): - """regionally_compile is called with the code predictor model during __init__.""" + """regionally_compile is called with the code predictor's inner model and dynamic=True.""" _mock_regionally_compile.reset_mock() vllm_config = _make_vllm_config() + wrapper_model, cp_inner = _make_structured_model_mock() with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp: - mock_model = MagicMock() - mock_fp.return_value = mock_model + mock_fp.return_value = wrapper_model Qwen3TTSModelForGeneration(vllm_config=vllm_config) _mock_regionally_compile.assert_called_once() call_args, call_kwargs = _mock_regionally_compile.call_args - # First positional arg is the code predictor's inner model - assert call_args[0] is mock_model.model.code_predictor.model - assert call_kwargs.get("mode") == "reduce-overhead" + assert call_args[0] is cp_inner + assert call_kwargs.get("dynamic") is True + assert "mode" not in call_kwargs def test_repeated_blocks_set_before_compile(self): """_repeated_blocks attribute is set on the code predictor model.""" _mock_regionally_compile.reset_mock() vllm_config = _make_vllm_config() + wrapper_model, cp_inner = _make_structured_model_mock() with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp: - mock_model = MagicMock() - mock_fp.return_value = mock_model + mock_fp.return_value = wrapper_model Qwen3TTSModelForGeneration(vllm_config=vllm_config) - cp_model = mock_model.model.code_predictor.model - assert cp_model._repeated_blocks == ["Qwen3TTSDecoderLayer"] + assert cp_inner._repeated_blocks == ["Qwen3TTSDecoderLayer"] def test_compile_failure_does_not_crash(self): - """If regionally_compile raises, __init__ still succeeds.""" + """If regionally_compile raises RuntimeError, __init__ still succeeds.""" _mock_regionally_compile.reset_mock() _mock_regionally_compile.side_effect = RuntimeError("compile failed") try: vllm_config = _make_vllm_config() + wrapper_model, _ = _make_structured_model_mock() with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp: - mock_fp.return_value = MagicMock() - # Should not raise + mock_fp.return_value = wrapper_model wrapper = Qwen3TTSModelForGeneration(vllm_config=vllm_config) assert wrapper is not None finally: _mock_regionally_compile.side_effect = None + + def test_enforce_eager_skips_compilation(self): + """When enforce_eager=True, regionally_compile is not called.""" + _mock_regionally_compile.reset_mock() + + vllm_config = _make_vllm_config(enforce_eager=True) + wrapper_model, _ = _make_structured_model_mock() + with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp: + mock_fp.return_value = wrapper_model + Qwen3TTSModelForGeneration(vllm_config=vllm_config) + + _mock_regionally_compile.assert_not_called() diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py index 788b072eabd..7474c30fbd0 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py @@ -82,16 +82,23 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): **attn_kwargs, ) - # Compile code predictor decoder layers for reduced kernel launch overhead - try: - from vllm_omni.diffusion.compile import regionally_compile - - code_predictor_model = self.model.model.code_predictor.model - code_predictor_model._repeated_blocks = ["Qwen3TTSDecoderLayer"] - regionally_compile(code_predictor_model, mode="reduce-overhead") - logger.info("Code predictor decoder layers compiled with torch.compile.") - except Exception as e: - logger.warning("Failed to compile code predictor layers: %s. Continuing without compilation.", e) + # Compile code predictor decoder layers for reduced kernel launch overhead. + # Uses dynamic=True (not mode="reduce-overhead") because the autoregressive + # generate() loop has variable KV cache / sequence length shapes per step. + enforce_eager = getattr(getattr(vllm_config, "model_config", None), "enforce_eager", False) + if not enforce_eager: + try: + from vllm_omni.diffusion.compile import regionally_compile + except ImportError: + logger.info("regionally_compile not available, skipping code predictor compilation.") + else: + code_predictor_model = self.model.model.talker.code_predictor.model + code_predictor_model._repeated_blocks = ["Qwen3TTSDecoderLayer"] + try: + regionally_compile(code_predictor_model, dynamic=True) + logger.info("Code predictor decoder layers compiled with torch.compile.") + except RuntimeError as e: + logger.warning("Failed to compile code predictor layers: %s. Continuing without compilation.", e) self.task_type = model_path.split("-")[-1].strip("/") # Mark that this model produces multimodal outputs From 22e0d535c44a3480a233ee6fa4b15a1266e14d86 Mon Sep 17 00:00:00 2001 From: marksverdhei Date: Thu, 29 Jan 2026 14:31:41 +0100 Subject: [PATCH 3/3] Address remaining review findings (M1-M4, test updates) - M1: Rate-limit debug log to fire once instead of 31x per token - M2: Add atexit teardown to restore sys.modules after tests - M3: Add __init__.py to test directory - M4: Add autouse pytest fixture for mock reset between tests - Update Test A for new profile run behavior (caps max_new_tokens=2 instead of returning dummy audio) Co-Authored-By: Claude Opus 4.5 --- .../models/qwen3_tts/__init__.py | 0 .../models/qwen3_tts/test_qwen3_tts.py | 165 ++++++++++-------- .../models/qwen3_tts/modeling_qwen3_tts.py | 4 +- 3 files changed, 99 insertions(+), 70 deletions(-) create mode 100644 tests/model_executor/models/qwen3_tts/__init__.py diff --git a/tests/model_executor/models/qwen3_tts/__init__.py b/tests/model_executor/models/qwen3_tts/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py b/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py index 307ea193e54..8c64dd4e237 100644 --- a/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py +++ b/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py @@ -4,7 +4,7 @@ """Regression tests for Qwen3 TTS model wrapper. Tests cover: - - Profile run short-circuit (regression for PR #1082) + - Profile run cap (regression for PR #1082 / #995) - Flash-attn detection and fallback - Code predictor regional compilation (Phase 1a) @@ -15,6 +15,7 @@ to completely bypass the vllm_omni.__init__ import chain. """ +import atexit import logging import sys import types @@ -23,6 +24,7 @@ from unittest.mock import MagicMock, patch import numpy as np +import pytest import torch # --------------------------------------------------------------------------- @@ -80,7 +82,7 @@ def _make_stub(fqn: str) -> types.ModuleType: def _setup(): - # Save originals so they can be restored if needed + # Save originals so they can be restored on teardown for fqn in _STUB_FQNS: _saved_modules[fqn] = sys.modules.get(fqn) _make_stub(fqn) @@ -138,7 +140,18 @@ class OmniOutput(NamedTuple): sys.modules["transformers"].AutoProcessor = MagicMock() +def _teardown(): + """Restore sys.modules to pre-test state.""" + for fqn in reversed(_STUB_FQNS): + orig = _saved_modules.get(fqn) + if orig is None: + sys.modules.pop(fqn, None) + else: + sys.modules[fqn] = orig + + _setup() +atexit.register(_teardown) # Compile and exec the target file in a synthetic module, setting __package__ # so that `from .foo import bar` resolves via sys.modules, not the file system. @@ -157,6 +170,17 @@ class OmniOutput(NamedTuple): Qwen3TTSModel = _mod.Qwen3TTSModel +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture(autouse=True) +def _reset_compile_mock(): + """Reset the regionally_compile mock before each test.""" + _mock_regionally_compile.reset_mock() + _mock_regionally_compile.side_effect = None + + # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -188,34 +212,62 @@ def _builtins_import(): return builtins.__import__ +def _make_structured_model_mock(): + """Build a mock with explicit attribute hierarchy matching the real model. + + Real hierarchy: + Qwen3TTSModel (wrapper .model attr) + +-- Qwen3TTSForConditionalGeneration (.model on wrapper) + +-- .talker (Qwen3TTSTalkerForConditionalGeneration) + +-- .code_predictor (Qwen3TTSTalkerCodePredictorModelForConditionalGeneration) + +-- .model (Qwen3TTSTalkerCodePredictorModel -- has .layers) + + Using a structured mock prevents MagicMock from auto-creating wrong paths + (e.g. .model.model.code_predictor.model without .talker). + """ + cp_inner_model = MagicMock(name="Qwen3TTSTalkerCodePredictorModel") + code_predictor = MagicMock(name="CodePredictorForCG") + code_predictor.model = cp_inner_model + + talker = MagicMock(name="TalkerForCG") + talker.code_predictor = code_predictor + + hf_model = MagicMock(name="Qwen3TTSForConditionalGeneration") + hf_model.talker = talker + # Ensure accessing .code_predictor directly on hf_model raises, + # so tests would fail if the production code skips .talker + del hf_model.code_predictor + + wrapper_model = MagicMock(name="Qwen3TTSModel") + wrapper_model.model = hf_model + + return wrapper_model, cp_inner_model + + # --------------------------------------------------------------------------- -# Test A: Profile run short-circuit (regression for PR #1082) +# Test A: Profile run cap (regression for PR #1082 / #995) # --------------------------------------------------------------------------- -class TestProfileRunShortCircuit: - """Empty text triggers a dummy audio return instead of hanging.""" +class TestProfileRunCap: + """Empty text caps max_new_tokens and proceeds to generation.""" - def test_empty_text_returns_dummy_audio(self): + def test_empty_text_caps_max_new_tokens(self): + """Profile run sets max_new_tokens=2 and still calls generation.""" wrapper = _make_wrapper() - result = wrapper.forward( - runtime_additional_information=[{"text": [""]}], - ) - - assert result.multimodal_outputs is not None - audio = result.multimodal_outputs["model_outputs"] - assert audio.shape == (24000,) - assert result.multimodal_outputs["sr"].item() == 24000 + dummy_wav = np.zeros(24000, dtype=np.float32) + wrapper.model.generate_voice_clone.return_value = ([dummy_wav], 24000) - def test_empty_text_skips_generation(self): - wrapper = _make_wrapper() wrapper.forward( - runtime_additional_information=[{"text": [""]}], + runtime_additional_information=[{ + "text": [""], + "task_type": ["Base"], + "language": ["Auto"], + }], ) - model = wrapper.model - model.generate_voice_clone.assert_not_called() - model.generate_custom_voice.assert_not_called() - model.generate_voice_design.assert_not_called() + wrapper.model.generate_voice_clone.assert_called_once() + _, call_kwargs = wrapper.model.generate_voice_clone.call_args + assert call_kwargs.get("max_new_tokens") == 2 def test_nonempty_text_proceeds_to_generation(self): wrapper = _make_wrapper() @@ -232,6 +284,23 @@ def test_nonempty_text_proceeds_to_generation(self): wrapper.model.generate_voice_clone.assert_called_once() + def test_nonempty_text_does_not_cap_max_new_tokens(self): + """Non-profile runs should not inject max_new_tokens=2.""" + wrapper = _make_wrapper() + dummy_wav = np.zeros(24000, dtype=np.float32) + wrapper.model.generate_voice_clone.return_value = ([dummy_wav], 24000) + + wrapper.forward( + runtime_additional_information=[{ + "text": ["Hello world"], + "task_type": ["Base"], + "language": ["Auto"], + }], + ) + + _, call_kwargs = wrapper.model.generate_voice_clone.call_args + assert call_kwargs.get("max_new_tokens") != 2 + # --------------------------------------------------------------------------- # Test B: Flash-attn detection @@ -290,45 +359,11 @@ def test_flash_attn_preferred_when_available(self): # Test C: Code predictor regional compilation (Phase 1a) # --------------------------------------------------------------------------- -def _make_structured_model_mock(): - """Build a mock with explicit attribute hierarchy matching the real model. - - Real hierarchy: - Qwen3TTSModel (wrapper .model attr) - └── Qwen3TTSForConditionalGeneration (.model on wrapper) - └── .talker (Qwen3TTSTalkerForConditionalGeneration) - └── .code_predictor (Qwen3TTSTalkerCodePredictorModelForConditionalGeneration) - └── .model (Qwen3TTSTalkerCodePredictorModel — has .layers) - - Using a structured mock prevents MagicMock from auto-creating wrong paths - (e.g. .model.model.code_predictor.model without .talker). - """ - cp_inner_model = MagicMock(name="Qwen3TTSTalkerCodePredictorModel") - code_predictor = MagicMock(name="CodePredictorForCG") - code_predictor.model = cp_inner_model - - talker = MagicMock(name="TalkerForCG") - talker.code_predictor = code_predictor - - hf_model = MagicMock(name="Qwen3TTSForConditionalGeneration") - hf_model.talker = talker - # Ensure accessing .code_predictor directly on hf_model raises, - # so tests would fail if the production code skips .talker - del hf_model.code_predictor - - wrapper_model = MagicMock(name="Qwen3TTSModel") - wrapper_model.model = hf_model - - return wrapper_model, cp_inner_model - - class TestCodePredictorCompilation: """Verify regionally_compile is called on the code predictor model.""" def test_regionally_compile_called_on_init(self): """regionally_compile is called with the code predictor's inner model and dynamic=True.""" - _mock_regionally_compile.reset_mock() - vllm_config = _make_vllm_config() wrapper_model, cp_inner = _make_structured_model_mock() with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp: @@ -343,8 +378,6 @@ def test_regionally_compile_called_on_init(self): def test_repeated_blocks_set_before_compile(self): """_repeated_blocks attribute is set on the code predictor model.""" - _mock_regionally_compile.reset_mock() - vllm_config = _make_vllm_config() wrapper_model, cp_inner = _make_structured_model_mock() with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp: @@ -355,23 +388,17 @@ def test_repeated_blocks_set_before_compile(self): def test_compile_failure_does_not_crash(self): """If regionally_compile raises RuntimeError, __init__ still succeeds.""" - _mock_regionally_compile.reset_mock() _mock_regionally_compile.side_effect = RuntimeError("compile failed") - try: - vllm_config = _make_vllm_config() - wrapper_model, _ = _make_structured_model_mock() - with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp: - mock_fp.return_value = wrapper_model - wrapper = Qwen3TTSModelForGeneration(vllm_config=vllm_config) - assert wrapper is not None - finally: - _mock_regionally_compile.side_effect = None + vllm_config = _make_vllm_config() + wrapper_model, _ = _make_structured_model_mock() + with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp: + mock_fp.return_value = wrapper_model + wrapper = Qwen3TTSModelForGeneration(vllm_config=vllm_config) + assert wrapper is not None def test_enforce_eager_skips_compilation(self): """When enforce_eager=True, regionally_compile is not called.""" - _mock_regionally_compile.reset_mock() - vllm_config = _make_vllm_config(enforce_eager=True) wrapper_model, _ = _make_structured_model_mock() with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp: diff --git a/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py b/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py index 455ec8ae69b..bf81d2c4ab1 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py +++ b/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py @@ -1629,7 +1629,9 @@ def forward( # Generate else: last_id_hidden = self.get_input_embeddings()(input_ids) - logger.debug("Code predictor generate: use_cache=%s", self.code_predictor.config.use_cache) + if not getattr(self, "_logged_use_cache", False): + logger.debug("Code predictor generate: use_cache=%s", self.code_predictor.config.use_cache) + self._logged_use_cache = True predictor_result = self.code_predictor.generate( inputs_embeds=torch.cat((past_hidden, last_id_hidden), dim=1), max_new_tokens=self.config.num_code_groups - 1,