From 12c729dcdec7890e0f2970025648197f0e56a77a Mon Sep 17 00:00:00 2001
From: marksverdhei <marksverdhei@hotmail.com>
Date: Thu, 29 Jan 2026 13:17:38 +0100
Subject: [PATCH 1/3] [Perf] Regional torch.compile for code predictor decoder
 layers

Apply regionally_compile(mode="reduce-overhead") to the 5
Qwen3TTSDecoderLayer blocks inside the code predictor, reducing
per-kernel launch overhead across the 31-iteration generate loop.
Uses the existing diffusion/compile.py pattern. Falls back gracefully
if compilation fails on a given GPU/CUDA version.

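Also adds a debug log for code_predictor.config.use_cache to inform
Phase 1b KV cache work, and a regression test module for the wrapper
(profile run short-circuit, flash-attn detection, regional compilation).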

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 .../models/qwen3_tts/test_qwen3_tts.py        | 334 ++++++++++++++++++
 .../models/qwen3_tts/modeling_qwen3_tts.py    |   1 +
 .../models/qwen3_tts/qwen3_tts.py             |  12 +
 3 files changed, 347 insertions(+)
 create mode 100644 tests/model_executor/models/qwen3_tts/test_qwen3_tts.py

diff --git a/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py b/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py
new file mode 100644
index 00000000000..4f5a955fa3e
--- /dev/null
+++ b/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py
@@ -0,0 +1,334 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Regression tests for Qwen3 TTS model wrapper.
+
+Tests cover:
+  - Profile run short-circuit (regression for PR #1082)
+  - Flash-attn detection and fallback
+  - Code predictor regional compilation (Phase 1a)
+
+These tests mock heavy dependencies (vllm, transformers, librosa, etc.) so
+they can run without GPU, model weights, or the full vllm engine.
+
+The module under test is compiled and executed in a synthetic namespace
+to completely bypass the vllm_omni.__init__ import chain.
+"""
+
+import logging
+import sys
+import types
+from pathlib import Path
+from typing import NamedTuple
+from unittest.mock import MagicMock, patch
+
+import numpy as np
+import torch
+
+# ---------------------------------------------------------------------------
+# Bootstrap: build a minimal set of stub modules, then compile+exec the
+# target .py file in a module whose __package__ resolves relative imports
+# to our stubs.
+# ---------------------------------------------------------------------------
+
+_REPO = Path(__file__).resolve().parents[4]  # repo root
+_TARGET = _REPO / "vllm_omni" / "model_executor" / "models" / "qwen3_tts" / "qwen3_tts.py"
+
+# Full set of modules the target file references (directly or via relative
+# imports that we intercept).
+_STUB_FQNS = [
+    # vllm (direct imports in qwen3_tts.py)
+    "vllm",
+    "vllm.config",
+    "vllm.logger",
+    "vllm.sequence",
+    # transformers (direct import in qwen3_tts.py)
+    "transformers",
+    # audio I/O libs (direct imports in qwen3_tts.py)
+    "librosa",
+    "soundfile",
+    # vllm_omni package tree (relative imports resolve to these)
+    "vllm_omni",
+    "vllm_omni.diffusion",
+    "vllm_omni.diffusion.compile",
+    "vllm_omni.model_executor",
+    "vllm_omni.model_executor.models",
+    "vllm_omni.model_executor.models.output_templates",
+    "vllm_omni.model_executor.models.qwen3_tts",
+    "vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts",
+    "vllm_omni.model_executor.models.qwen3_tts.modeling_qwen3_tts",
+    "vllm_omni.model_executor.models.qwen3_tts.processing_qwen3_tts",
+]
+
+_saved_modules: dict[str, types.ModuleType | None] = {}
+
+
+def _make_stub(fqn: str) -> types.ModuleType:
+    parts = fqn.split(".")
+    for i in range(1, len(parts) + 1):
+        key = ".".join(parts[:i])
+        if key not in sys.modules:
+            mod = types.ModuleType(key)
+            mod.__path__ = [str(_REPO / key.replace(".", "/"))]
+            mod.__package__ = key
+            mod.__spec__ = None
+            sys.modules[key] = mod
+    return sys.modules[fqn]
+
+
+_mock_regionally_compile = MagicMock(name="regionally_compile")
+
+
+def _setup():
+    # Save originals so they can be restored if needed
+    for fqn in _STUB_FQNS:
+        _saved_modules[fqn] = sys.modules.get(fqn)
+        _make_stub(fqn)
+
+    # Wire parent.child attributes
+    for fqn in _STUB_FQNS:
+        parts = fqn.split(".")
+        if len(parts) > 1:
+            parent = sys.modules.get(".".join(parts[:-1]))
+            child = sys.modules.get(fqn)
+            if parent and child:
+                setattr(parent, parts[-1], child)
+
+    # ---- Concrete stubs for names the target file actually uses ----
+
+    # vllm.logger
+    sys.modules["vllm.logger"].init_logger = lambda name: logging.getLogger(name)
+
+    # vllm.config
+    sys.modules["vllm.config"].VllmConfig = type("VllmConfig", (), {})
+
+    # vllm.sequence
+    class IntermediateTensors:
+        def __init__(self, d=None):
+            self.tensors = d or {}
+    sys.modules["vllm.sequence"].IntermediateTensors = IntermediateTensors
+
+    # OmniOutput (from vllm_omni.model_executor.models.output_templates)
+    class OmniOutput(NamedTuple):
+        text_hidden_states: object
+        multimodal_outputs: dict | None = None
+        intermediate_tensors: object | None = None
+        next_token_id: object | None = None
+    sys.modules["vllm_omni.model_executor.models.output_templates"].OmniOutput = OmniOutput
+
+    # vllm_omni.diffusion.compile
+    sys.modules["vllm_omni.diffusion.compile"].regionally_compile = _mock_regionally_compile
+
+    # Relative-import siblings
+    sys.modules[
+        "vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts"
+    ].Qwen3TTSConfig = type("Qwen3TTSConfig", (), {})
+
+    sys.modules[
+        "vllm_omni.model_executor.models.qwen3_tts.modeling_qwen3_tts"
+    ].Qwen3TTSForConditionalGeneration = type("Qwen3TTSForConditionalGeneration", (), {})
+
+    sys.modules[
+        "vllm_omni.model_executor.models.qwen3_tts.processing_qwen3_tts"
+    ].Qwen3TTSProcessor = type("Qwen3TTSProcessor", (), {})
+
+    # transformers
+    sys.modules["transformers"].AutoConfig = MagicMock()
+    sys.modules["transformers"].AutoModel = MagicMock()
+    sys.modules["transformers"].AutoProcessor = MagicMock()
+
+
+_setup()
+
+# Compile and exec the target file in a synthetic module, setting __package__
+# so that `from .foo import bar` resolves via sys.modules, not the file system.
+_MOD_FQN = "vllm_omni.model_executor.models.qwen3_tts.qwen3_tts"
+_mod = types.ModuleType(_MOD_FQN)
+_mod.__file__ = str(_TARGET)
+_mod.__package__ = "vllm_omni.model_executor.models.qwen3_tts"
+_mod.__spec__ = None
+sys.modules[_MOD_FQN] = _mod
+
+_source = _TARGET.read_text()
+_code = compile(_source, str(_TARGET), "exec")
+exec(_code, _mod.__dict__)  # noqa: S102
+
+Qwen3TTSModelForGeneration = _mod.Qwen3TTSModelForGeneration
+Qwen3TTSModel = _mod.Qwen3TTSModel
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_vllm_config(model_path: str = "Qwen/Qwen3-TTS-12Hz-0.6B-Base") -> MagicMock:
+    cfg = MagicMock()
+    cfg.model_config.model = model_path
+    return cfg
+
+
+def _make_wrapper(vllm_config=None):
+    """Instantiate the wrapper with a mocked Qwen3TTSModel.from_pretrained."""
+    if vllm_config is None:
+        vllm_config = _make_vllm_config()
+
+    with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp:
+        mock_fp.return_value = MagicMock()
+        wrapper = Qwen3TTSModelForGeneration(vllm_config=vllm_config)
+
+    return wrapper
+
+
+def _builtins_import():
+    import builtins
+    return builtins.__import__
+
+
+# ---------------------------------------------------------------------------
+# Test A: Profile run short-circuit (regression for PR #1082)
+# ---------------------------------------------------------------------------
+
+class TestProfileRunShortCircuit:
+    """Empty text triggers a dummy audio return instead of hanging."""
+
+    def test_empty_text_returns_dummy_audio(self):
+        wrapper = _make_wrapper()
+        result = wrapper.forward(
+            runtime_additional_information=[{"text": [""]}],
+        )
+
+        assert result.multimodal_outputs is not None
+        audio = result.multimodal_outputs["model_outputs"]
+        assert audio.shape == (24000,)
+        assert result.multimodal_outputs["sr"].item() == 24000
+
+    def test_empty_text_skips_generation(self):
+        wrapper = _make_wrapper()
+        wrapper.forward(
+            runtime_additional_information=[{"text": [""]}],
+        )
+
+        model = wrapper.model
+        model.generate_voice_clone.assert_not_called()
+        model.generate_custom_voice.assert_not_called()
+        model.generate_voice_design.assert_not_called()
+
+    def test_nonempty_text_proceeds_to_generation(self):
+        wrapper = _make_wrapper()
+        dummy_wav = np.zeros(24000, dtype=np.float32)
+        wrapper.model.generate_voice_clone.return_value = ([dummy_wav], 24000)
+
+        wrapper.forward(
+            runtime_additional_information=[{
+                "text": ["Hello"],
+                "task_type": ["Base"],
+                "language": ["Auto"],
+            }],
+        )
+
+        wrapper.model.generate_voice_clone.assert_called_once()
+
+
+# ---------------------------------------------------------------------------
+# Test B: Flash-attn detection
+# ---------------------------------------------------------------------------
+
+class TestFlashAttnDetection:
+    """Verify attn_implementation kwarg passed to Qwen3TTSModel.from_pretrained."""
+
+    def test_no_attn_kwarg_without_flash_attn(self):
+        """When flash_attn is not importable, from_pretrained gets no attn_implementation."""
+        saved = sys.modules.pop("flash_attn", None)
+        try:
+            real_import = _builtins_import()
+
+            def _fake_import(name, *args, **kwargs):
+                if name == "flash_attn":
+                    raise ImportError("mocked: no flash_attn")
+                return real_import(name, *args, **kwargs)
+
+            vllm_config = _make_vllm_config()
+            with (
+                patch("builtins.__import__", side_effect=_fake_import),
+                patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp,
+            ):
+                mock_fp.return_value = MagicMock()
+                Qwen3TTSModelForGeneration(vllm_config=vllm_config)
+
+                mock_fp.assert_called_once()
+                _, call_kwargs = mock_fp.call_args
+                assert "attn_implementation" not in call_kwargs, \
+                    f"Expected no attn_implementation kwarg, got: {call_kwargs}"
+        finally:
+            if saved is not None:
+                sys.modules["flash_attn"] = saved
+
+    def test_flash_attn_preferred_when_available(self):
+        """When flash_attn is importable, from_pretrained receives flash_attention_2."""
+        vllm_config = _make_vllm_config()
+
+        fake_flash = types.ModuleType("flash_attn")
+        sys.modules["flash_attn"] = fake_flash
+        try:
+            with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp:
+                mock_fp.return_value = MagicMock()
+                Qwen3TTSModelForGeneration(vllm_config=vllm_config)
+
+                mock_fp.assert_called_once()
+                _, call_kwargs = mock_fp.call_args
+                assert call_kwargs.get("attn_implementation") == "flash_attention_2", \
+                    f"Expected attn_implementation='flash_attention_2', got: {call_kwargs}"
+        finally:
+            sys.modules.pop("flash_attn", None)
+
+
+# ---------------------------------------------------------------------------
+# Test C: Code predictor regional compilation (Phase 1a)
+# ---------------------------------------------------------------------------
+
+class TestCodePredictorCompilation:
+    """Verify regionally_compile is called on the code predictor model."""
+
+    def test_regionally_compile_called_on_init(self):
+        """regionally_compile is called with the code predictor model during __init__."""
+        _mock_regionally_compile.reset_mock()
+
+        vllm_config = _make_vllm_config()
+        with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp:
+            mock_model = MagicMock()
+            mock_fp.return_value = mock_model
+            Qwen3TTSModelForGeneration(vllm_config=vllm_config)
+
+        _mock_regionally_compile.assert_called_once()
+        call_args, call_kwargs = _mock_regionally_compile.call_args
+        # First positional arg is the code predictor's inner model
+        assert call_args[0] is mock_model.model.code_predictor.model
+        assert call_kwargs.get("mode") == "reduce-overhead"
+
+    def test_repeated_blocks_set_before_compile(self):
+        """_repeated_blocks attribute is set on the code predictor model."""
+        _mock_regionally_compile.reset_mock()
+
+        vllm_config = _make_vllm_config()
+        with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp:
+            mock_model = MagicMock()
+            mock_fp.return_value = mock_model
+            Qwen3TTSModelForGeneration(vllm_config=vllm_config)
+
+        cp_model = mock_model.model.code_predictor.model
+        assert cp_model._repeated_blocks == ["Qwen3TTSDecoderLayer"]
+
+    def test_compile_failure_does_not_crash(self):
+        """If regionally_compile raises, __init__ still succeeds."""
+        _mock_regionally_compile.reset_mock()
+        _mock_regionally_compile.side_effect = RuntimeError("compile failed")
+
+        try:
+            vllm_config = _make_vllm_config()
+            with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp:
+                mock_fp.return_value = MagicMock()
+                # Should not raise
+                wrapper = Qwen3TTSModelForGeneration(vllm_config=vllm_config)
+                assert wrapper is not None
+        finally:
+            _mock_regionally_compile.side_effect = None
diff --git a/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py b/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py
index 75fe5dbf403..455ec8ae69b 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py
@@ -1629,6 +1629,7 @@ def forward(
         # Generate
         else:
             last_id_hidden = self.get_input_embeddings()(input_ids)
+            logger.debug("Code predictor generate: use_cache=%s", self.code_predictor.config.use_cache)
             predictor_result = self.code_predictor.generate(
                 inputs_embeds=torch.cat((past_hidden, last_id_hidden), dim=1),
                 max_new_tokens=self.config.num_code_groups - 1,
diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py
index 73c7c2743c3..788b072eabd 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py
@@ -81,6 +81,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             torch_dtype=torch.bfloat16,
             **attn_kwargs,
         )
+
+        # Compile code predictor decoder layers for reduced kernel launch overhead
+        try:
+            from vllm_omni.diffusion.compile import regionally_compile
+
+            code_predictor_model = self.model.model.code_predictor.model
+            code_predictor_model._repeated_blocks = ["Qwen3TTSDecoderLayer"]
+            regionally_compile(code_predictor_model, mode="reduce-overhead")
+            logger.info("Code predictor decoder layers compiled with torch.compile.")
+        except Exception as e:
+            logger.warning("Failed to compile code predictor layers: %s. Continuing without compilation.", e)
+
         self.task_type = model_path.split("-")[-1].strip("/")
         # Mark that this model produces multimodal outputs
         self.have_multimodal_outputs = True

From d00e92af56ea9459f1e6b208a7454b6810956877 Mon Sep 17 00:00:00 2001
From: marksverdhei <marksverdhei@hotmail.com>
Date: Thu, 29 Jan 2026 13:48:41 +0100
Subject: [PATCH 2/3] Fix critical review findings for code predictor
 compilation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix attribute chain: .model.model.code_predictor.model →
  .model.model.talker.code_predictor.model (the old path raised
  AttributeError, which the broad except reduced to a warning, so
  compilation never actually ran)
- Use dynamic=True instead of mode="reduce-overhead" to avoid
  CUDA graph shape mismatches in autoregressive generate() loop
- Narrow except: separate ImportError from RuntimeError to avoid
  masking real bugs like wrong attribute paths
- Add enforce_eager gate: skip compilation when enforce_eager=True,
  matching the diffusion model runner pattern
- Use structured mock in tests to catch attribute chain bugs
- Add test for enforce_eager gate

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 .../models/qwen3_tts/test_qwen3_tts.py        | 75 +++++++++++++++----
 .../models/qwen3_tts/qwen3_tts.py             | 27 ++++---
 2 files changed, 78 insertions(+), 24 deletions(-)

diff --git a/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py b/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py
index 4f5a955fa3e..307ea193e54 100644
--- a/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py
+++ b/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py
@@ -161,9 +161,13 @@ class OmniOutput(NamedTuple):
 # Helpers
 # ---------------------------------------------------------------------------
 
-def _make_vllm_config(model_path: str = "Qwen/Qwen3-TTS-12Hz-0.6B-Base") -> MagicMock:
+def _make_vllm_config(
+    model_path: str = "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
+    enforce_eager: bool = False,
+) -> MagicMock:
     cfg = MagicMock()
     cfg.model_config.model = model_path
+    cfg.model_config.enforce_eager = enforce_eager
     return cfg
 
 
@@ -286,49 +290,92 @@ def test_flash_attn_preferred_when_available(self):
 # Test C: Code predictor regional compilation (Phase 1a)
 # ---------------------------------------------------------------------------
 
+def _make_structured_model_mock():
+    """Build a mock with explicit attribute hierarchy matching the real model.
+
+    Real hierarchy:
+        Qwen3TTSModel (wrapper .model attr)
+          └── Qwen3TTSForConditionalGeneration (.model on wrapper)
+                └── .talker  (Qwen3TTSTalkerForConditionalGeneration)
+                      └── .code_predictor  (Qwen3TTSTalkerCodePredictorModelForConditionalGeneration)
+                            └── .model  (Qwen3TTSTalkerCodePredictorModel — has .layers)
+
+    Using a structured mock prevents MagicMock from auto-creating wrong paths
+    (e.g. .model.model.code_predictor.model without .talker).
+    """
+    cp_inner_model = MagicMock(name="Qwen3TTSTalkerCodePredictorModel")
+    code_predictor = MagicMock(name="CodePredictorForCG")
+    code_predictor.model = cp_inner_model
+
+    talker = MagicMock(name="TalkerForCG")
+    talker.code_predictor = code_predictor
+
+    hf_model = MagicMock(name="Qwen3TTSForConditionalGeneration")
+    hf_model.talker = talker
+    # Ensure accessing .code_predictor directly on hf_model raises,
+    # so tests would fail if the production code skips .talker
+    del hf_model.code_predictor
+
+    wrapper_model = MagicMock(name="Qwen3TTSModel")
+    wrapper_model.model = hf_model
+
+    return wrapper_model, cp_inner_model
+
+
 class TestCodePredictorCompilation:
     """Verify regionally_compile is called on the code predictor model."""
 
     def test_regionally_compile_called_on_init(self):
-        """regionally_compile is called with the code predictor model during __init__."""
+        """regionally_compile is called with the code predictor's inner model and dynamic=True."""
         _mock_regionally_compile.reset_mock()
 
         vllm_config = _make_vllm_config()
+        wrapper_model, cp_inner = _make_structured_model_mock()
         with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp:
-            mock_model = MagicMock()
-            mock_fp.return_value = mock_model
+            mock_fp.return_value = wrapper_model
             Qwen3TTSModelForGeneration(vllm_config=vllm_config)
 
         _mock_regionally_compile.assert_called_once()
         call_args, call_kwargs = _mock_regionally_compile.call_args
-        # First positional arg is the code predictor's inner model
-        assert call_args[0] is mock_model.model.code_predictor.model
-        assert call_kwargs.get("mode") == "reduce-overhead"
+        assert call_args[0] is cp_inner
+        assert call_kwargs.get("dynamic") is True
+        assert "mode" not in call_kwargs
 
     def test_repeated_blocks_set_before_compile(self):
         """_repeated_blocks attribute is set on the code predictor model."""
         _mock_regionally_compile.reset_mock()
 
         vllm_config = _make_vllm_config()
+        wrapper_model, cp_inner = _make_structured_model_mock()
         with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp:
-            mock_model = MagicMock()
-            mock_fp.return_value = mock_model
+            mock_fp.return_value = wrapper_model
             Qwen3TTSModelForGeneration(vllm_config=vllm_config)
 
-        cp_model = mock_model.model.code_predictor.model
-        assert cp_model._repeated_blocks == ["Qwen3TTSDecoderLayer"]
+        assert cp_inner._repeated_blocks == ["Qwen3TTSDecoderLayer"]
 
     def test_compile_failure_does_not_crash(self):
-        """If regionally_compile raises, __init__ still succeeds."""
+        """If regionally_compile raises RuntimeError, __init__ still succeeds."""
         _mock_regionally_compile.reset_mock()
         _mock_regionally_compile.side_effect = RuntimeError("compile failed")
 
         try:
             vllm_config = _make_vllm_config()
+            wrapper_model, _ = _make_structured_model_mock()
             with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp:
-                mock_fp.return_value = MagicMock()
-                # Should not raise
+                mock_fp.return_value = wrapper_model
                 wrapper = Qwen3TTSModelForGeneration(vllm_config=vllm_config)
                 assert wrapper is not None
         finally:
             _mock_regionally_compile.side_effect = None
+
+    def test_enforce_eager_skips_compilation(self):
+        """When enforce_eager=True, regionally_compile is not called."""
+        _mock_regionally_compile.reset_mock()
+
+        vllm_config = _make_vllm_config(enforce_eager=True)
+        wrapper_model, _ = _make_structured_model_mock()
+        with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp:
+            mock_fp.return_value = wrapper_model
+            Qwen3TTSModelForGeneration(vllm_config=vllm_config)
+
+        _mock_regionally_compile.assert_not_called()
diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py
index 788b072eabd..7474c30fbd0 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts.py
@@ -82,16 +82,23 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             **attn_kwargs,
         )
 
-        # Compile code predictor decoder layers for reduced kernel launch overhead
-        try:
-            from vllm_omni.diffusion.compile import regionally_compile
-
-            code_predictor_model = self.model.model.code_predictor.model
-            code_predictor_model._repeated_blocks = ["Qwen3TTSDecoderLayer"]
-            regionally_compile(code_predictor_model, mode="reduce-overhead")
-            logger.info("Code predictor decoder layers compiled with torch.compile.")
-        except Exception as e:
-            logger.warning("Failed to compile code predictor layers: %s. Continuing without compilation.", e)
+        # Compile code predictor decoder layers for reduced kernel launch overhead.
+        # Uses dynamic=True (not mode="reduce-overhead") because the autoregressive
+        # generate() loop has variable KV cache / sequence length shapes per step.
+        enforce_eager = getattr(getattr(vllm_config, "model_config", None), "enforce_eager", False)
+        if not enforce_eager:
+            try:
+                from vllm_omni.diffusion.compile import regionally_compile
+            except ImportError:
+                logger.info("regionally_compile not available, skipping code predictor compilation.")
+            else:
+                code_predictor_model = self.model.model.talker.code_predictor.model
+                code_predictor_model._repeated_blocks = ["Qwen3TTSDecoderLayer"]
+                try:
+                    regionally_compile(code_predictor_model, dynamic=True)
+                    logger.info("Code predictor decoder layers compiled with torch.compile.")
+                except RuntimeError as e:
+                    logger.warning("Failed to compile code predictor layers: %s. Continuing without compilation.", e)
 
         self.task_type = model_path.split("-")[-1].strip("/")
         # Mark that this model produces multimodal outputs

From 22e0d535c44a3480a233ee6fa4b15a1266e14d86 Mon Sep 17 00:00:00 2001
From: marksverdhei <marksverdhei@hotmail.com>
Date: Thu, 29 Jan 2026 14:31:41 +0100
Subject: [PATCH 3/3] Address remaining review findings (M1-M4, test updates)

- M1: Rate-limit the use_cache debug log to fire once instead of before
  every code_predictor.generate() call
- M2: Add atexit teardown to restore sys.modules after tests
- M3: Add __init__.py to test directory
- M4: Add autouse pytest fixture for mock reset between tests
- Update Test A for new profile run behavior (caps max_new_tokens=2
  instead of returning dummy audio)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 .../models/qwen3_tts/__init__.py              |   0
 .../models/qwen3_tts/test_qwen3_tts.py        | 165 ++++++++++--------
 .../models/qwen3_tts/modeling_qwen3_tts.py    |   4 +-
 3 files changed, 99 insertions(+), 70 deletions(-)
 create mode 100644 tests/model_executor/models/qwen3_tts/__init__.py

diff --git a/tests/model_executor/models/qwen3_tts/__init__.py b/tests/model_executor/models/qwen3_tts/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py b/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py
index 307ea193e54..8c64dd4e237 100644
--- a/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py
+++ b/tests/model_executor/models/qwen3_tts/test_qwen3_tts.py
@@ -4,7 +4,7 @@
 """Regression tests for Qwen3 TTS model wrapper.
 
 Tests cover:
-  - Profile run short-circuit (regression for PR #1082)
+  - Profile run cap (regression for PR #1082 / #995)
   - Flash-attn detection and fallback
   - Code predictor regional compilation (Phase 1a)
 
@@ -15,6 +15,7 @@
 to completely bypass the vllm_omni.__init__ import chain.
 """
 
+import atexit
 import logging
 import sys
 import types
@@ -23,6 +24,7 @@
 from unittest.mock import MagicMock, patch
 
 import numpy as np
+import pytest
 import torch
 
 # ---------------------------------------------------------------------------
@@ -80,7 +82,7 @@ def _make_stub(fqn: str) -> types.ModuleType:
 
 
 def _setup():
-    # Save originals so they can be restored if needed
+    # Save originals so they can be restored on teardown
     for fqn in _STUB_FQNS:
         _saved_modules[fqn] = sys.modules.get(fqn)
         _make_stub(fqn)
@@ -138,7 +140,18 @@ class OmniOutput(NamedTuple):
     sys.modules["transformers"].AutoProcessor = MagicMock()
 
 
+def _teardown():
+    """Restore sys.modules to pre-test state."""
+    for fqn in reversed(_STUB_FQNS):
+        orig = _saved_modules.get(fqn)
+        if orig is None:
+            sys.modules.pop(fqn, None)
+        else:
+            sys.modules[fqn] = orig
+
+
 _setup()
+atexit.register(_teardown)
 
 # Compile and exec the target file in a synthetic module, setting __package__
 # so that `from .foo import bar` resolves via sys.modules, not the file system.
@@ -157,6 +170,17 @@ class OmniOutput(NamedTuple):
 Qwen3TTSModel = _mod.Qwen3TTSModel
 
 
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+@pytest.fixture(autouse=True)
+def _reset_compile_mock():
+    """Reset the regionally_compile mock before each test."""
+    _mock_regionally_compile.reset_mock()
+    _mock_regionally_compile.side_effect = None
+
+
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
@@ -188,34 +212,62 @@ def _builtins_import():
     return builtins.__import__
 
 
+def _make_structured_model_mock():
+    """Build a mock with explicit attribute hierarchy matching the real model.
+
+    Real hierarchy:
+        Qwen3TTSModel (wrapper .model attr)
+          +-- Qwen3TTSForConditionalGeneration (.model on wrapper)
+                +-- .talker  (Qwen3TTSTalkerForConditionalGeneration)
+                      +-- .code_predictor  (Qwen3TTSTalkerCodePredictorModelForConditionalGeneration)
+                            +-- .model  (Qwen3TTSTalkerCodePredictorModel -- has .layers)
+
+    Using a structured mock prevents MagicMock from auto-creating wrong paths
+    (e.g. .model.model.code_predictor.model without .talker).
+    """
+    cp_inner_model = MagicMock(name="Qwen3TTSTalkerCodePredictorModel")
+    code_predictor = MagicMock(name="CodePredictorForCG")
+    code_predictor.model = cp_inner_model
+
+    talker = MagicMock(name="TalkerForCG")
+    talker.code_predictor = code_predictor
+
+    hf_model = MagicMock(name="Qwen3TTSForConditionalGeneration")
+    hf_model.talker = talker
+    # Ensure accessing .code_predictor directly on hf_model raises,
+    # so tests would fail if the production code skips .talker
+    del hf_model.code_predictor
+
+    wrapper_model = MagicMock(name="Qwen3TTSModel")
+    wrapper_model.model = hf_model
+
+    return wrapper_model, cp_inner_model
+
+
 # ---------------------------------------------------------------------------
-# Test A: Profile run short-circuit (regression for PR #1082)
+# Test A: Profile run cap (regression for PR #1082 / #995)
 # ---------------------------------------------------------------------------
 
-class TestProfileRunShortCircuit:
-    """Empty text triggers a dummy audio return instead of hanging."""
+class TestProfileRunCap:
+    """Empty text caps max_new_tokens and proceeds to generation."""
 
-    def test_empty_text_returns_dummy_audio(self):
+    def test_empty_text_caps_max_new_tokens(self):
+        """Profile run sets max_new_tokens=2 and still calls generation."""
         wrapper = _make_wrapper()
-        result = wrapper.forward(
-            runtime_additional_information=[{"text": [""]}],
-        )
-
-        assert result.multimodal_outputs is not None
-        audio = result.multimodal_outputs["model_outputs"]
-        assert audio.shape == (24000,)
-        assert result.multimodal_outputs["sr"].item() == 24000
+        dummy_wav = np.zeros(24000, dtype=np.float32)
+        wrapper.model.generate_voice_clone.return_value = ([dummy_wav], 24000)
 
-    def test_empty_text_skips_generation(self):
-        wrapper = _make_wrapper()
         wrapper.forward(
-            runtime_additional_information=[{"text": [""]}],
+            runtime_additional_information=[{
+                "text": [""],
+                "task_type": ["Base"],
+                "language": ["Auto"],
+            }],
         )
 
-        model = wrapper.model
-        model.generate_voice_clone.assert_not_called()
-        model.generate_custom_voice.assert_not_called()
-        model.generate_voice_design.assert_not_called()
+        wrapper.model.generate_voice_clone.assert_called_once()
+        _, call_kwargs = wrapper.model.generate_voice_clone.call_args
+        assert call_kwargs.get("max_new_tokens") == 2
 
     def test_nonempty_text_proceeds_to_generation(self):
         wrapper = _make_wrapper()
@@ -232,6 +284,23 @@ def test_nonempty_text_proceeds_to_generation(self):
 
         wrapper.model.generate_voice_clone.assert_called_once()
 
+    def test_nonempty_text_does_not_cap_max_new_tokens(self):
+        """Non-profile runs must reach generation without the profile-run max_new_tokens=2 cap."""
+        wrapper = _make_wrapper()
+        dummy_wav = np.zeros(24000, dtype=np.float32)
+        wrapper.model.generate_voice_clone.return_value = ([dummy_wav], 24000)
+
+        wrapper.forward(
+            runtime_additional_information=[{
+                "text": ["Hello world"],
+                "task_type": ["Base"],
+                "language": ["Auto"],
+            }],
+        )
+
+        wrapper.model.generate_voice_clone.assert_called_once()  # call_args is None if never called
+        assert wrapper.model.generate_voice_clone.call_args.kwargs.get("max_new_tokens") != 2
+
 
 # ---------------------------------------------------------------------------
 # Test B: Flash-attn detection
@@ -290,45 +359,11 @@ def test_flash_attn_preferred_when_available(self):
 # Test C: Code predictor regional compilation (Phase 1a)
 # ---------------------------------------------------------------------------
 
-def _make_structured_model_mock():
-    """Build a mock with explicit attribute hierarchy matching the real model.
-
-    Real hierarchy:
-        Qwen3TTSModel (wrapper .model attr)
-          └── Qwen3TTSForConditionalGeneration (.model on wrapper)
-                └── .talker  (Qwen3TTSTalkerForConditionalGeneration)
-                      └── .code_predictor  (Qwen3TTSTalkerCodePredictorModelForConditionalGeneration)
-                            └── .model  (Qwen3TTSTalkerCodePredictorModel — has .layers)
-
-    Using a structured mock prevents MagicMock from auto-creating wrong paths
-    (e.g. .model.model.code_predictor.model without .talker).
-    """
-    cp_inner_model = MagicMock(name="Qwen3TTSTalkerCodePredictorModel")
-    code_predictor = MagicMock(name="CodePredictorForCG")
-    code_predictor.model = cp_inner_model
-
-    talker = MagicMock(name="TalkerForCG")
-    talker.code_predictor = code_predictor
-
-    hf_model = MagicMock(name="Qwen3TTSForConditionalGeneration")
-    hf_model.talker = talker
-    # Ensure accessing .code_predictor directly on hf_model raises,
-    # so tests would fail if the production code skips .talker
-    del hf_model.code_predictor
-
-    wrapper_model = MagicMock(name="Qwen3TTSModel")
-    wrapper_model.model = hf_model
-
-    return wrapper_model, cp_inner_model
-
-
 class TestCodePredictorCompilation:
     """Verify regionally_compile is called on the code predictor model."""
 
     def test_regionally_compile_called_on_init(self):
         """regionally_compile is called with the code predictor's inner model and dynamic=True."""
-        _mock_regionally_compile.reset_mock()
-
         vllm_config = _make_vllm_config()
         wrapper_model, cp_inner = _make_structured_model_mock()
         with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp:
@@ -343,8 +378,6 @@ def test_regionally_compile_called_on_init(self):
 
     def test_repeated_blocks_set_before_compile(self):
         """_repeated_blocks attribute is set on the code predictor model."""
-        _mock_regionally_compile.reset_mock()
-
         vllm_config = _make_vllm_config()
         wrapper_model, cp_inner = _make_structured_model_mock()
         with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp:
@@ -355,23 +388,17 @@ def test_repeated_blocks_set_before_compile(self):
 
     def test_compile_failure_does_not_crash(self):
         """If regionally_compile raises RuntimeError, __init__ still succeeds."""
-        _mock_regionally_compile.reset_mock()
         _mock_regionally_compile.side_effect = RuntimeError("compile failed")
 
-        try:
-            vllm_config = _make_vllm_config()
-            wrapper_model, _ = _make_structured_model_mock()
-            with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp:
-                mock_fp.return_value = wrapper_model
-                wrapper = Qwen3TTSModelForGeneration(vllm_config=vllm_config)
-                assert wrapper is not None
-        finally:
-            _mock_regionally_compile.side_effect = None
+        vllm_config = _make_vllm_config()
+        wrapper_model, _ = _make_structured_model_mock()
+        with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp:
+            mock_fp.return_value = wrapper_model
+            wrapper = Qwen3TTSModelForGeneration(vllm_config=vllm_config)
+            assert wrapper is not None
 
     def test_enforce_eager_skips_compilation(self):
         """When enforce_eager=True, regionally_compile is not called."""
-        _mock_regionally_compile.reset_mock()
-
         vllm_config = _make_vllm_config(enforce_eager=True)
         wrapper_model, _ = _make_structured_model_mock()
         with patch.object(Qwen3TTSModel, "from_pretrained") as mock_fp:
diff --git a/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py b/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py
index 455ec8ae69b..bf81d2c4ab1 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/modeling_qwen3_tts.py
@@ -1629,7 +1629,9 @@ def forward(
         # Generate
         else:
             last_id_hidden = self.get_input_embeddings()(input_ids)
-            logger.debug("Code predictor generate: use_cache=%s", self.code_predictor.config.use_cache)
+            if not getattr(self, "_logged_use_cache", False):
+                logger.debug("Code predictor generate: use_cache=%s", self.code_predictor.config.use_cache)
+                self._logged_use_cache = True
             predictor_result = self.code_predictor.generate(
                 inputs_embeds=torch.cat((past_hidden, last_id_hidden), dim=1),
                 max_new_tokens=self.config.num_code_groups - 1,