diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md
index 6c451936233..0d03f919ddf 100644
--- a/docs/contributing/ci/CI_5levels.md
+++ b/docs/contributing/ci/CI_5levels.md
@@ -604,7 +604,7 @@ When you want to add L5-level stability test cases, add or extend the appropriat
     "test_name": "test_qwen3_omni_stability",
     "server_params": {
         "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
-        "stage_config_name": "qwen3_omni.yaml"
+        "stage_config_name": "qwen3_omni_moe.yaml"
     },
     "benchmark_params": [
         {
@@ -640,7 +640,7 @@ When you want to add L5-level stability test cases, add or extend the appropriat
 | Parameter         | Required | Example                            | Description                         |
 | ----------------- | -------- | ---------------------------------- | ----------------------------------- |
 | model             | Yes      | "Qwen/Qwen3-Omni-30B-A3B-Instruct" | Model name or path                  |
-| stage_config_name | Yes      | "qwen3_omni.yaml"                  | Stage configuration file name       |
+| stage_config_name | Yes      | "qwen3_omni_moe.yaml"              | Stage configuration file name       |
 
 ##### Dynamic Configuration (update/delete)
 
diff --git a/docs/contributing/ci/test_examples/l4_performance_tests.inc.md b/docs/contributing/ci/test_examples/l4_performance_tests.inc.md
index f1f3073dc52..1329f53872c 100644
--- a/docs/contributing/ci/test_examples/l4_performance_tests.inc.md
+++ b/docs/contributing/ci/test_examples/l4_performance_tests.inc.md
@@ -5,7 +5,7 @@ When you want to add L4-level ***performance test*** cases, you can refer to the
     "test_name": "test_qwen3_omni",
     "server_params": {
         "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
-        "stage_config_name": "qwen3_omni.yaml"
+        "stage_config_name": "qwen3_omni_moe.yaml"
     },
     "benchmark_params": [
         {
@@ -43,7 +43,7 @@ When you want to add L4-level ***performance test*** cases, you can refer to the
 | Parameter         | Required | Example                            | Description                   |
 | ----------------- | -------- | ---------------------------------- | ----------------------------- |
 | model             | Yes      | "Qwen/Qwen3-Omni-30B-A3B-Instruct" | Model name or path            |
-| stage_config_name | Yes      | "qwen3_omni.yaml"                  | Stage configuration file name |
+| stage_config_name | Yes      | "qwen3_omni_moe.yaml"              | Stage configuration file name |
 
 *Dynamic Configuration (update/delete)*
 
diff --git a/docs/contributing/ci/tests_style.md b/docs/contributing/ci/tests_style.md
index 3a8cb0f127c..a62297a8391 100644
--- a/docs/contributing/ci/tests_style.md
+++ b/docs/contributing/ci/tests_style.md
@@ -235,7 +235,7 @@ models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
 
 #If you use the default configuration file, you can directly use the following address.
 def get_default_config():
-    return get_deploy_config_path("ci/qwen3_omni_moe.yaml")
+    return get_deploy_config_path("qwen3_omni_moe.yaml")
 
 #If you need to modify the configuration file, you can use modify_stage_config.
 def get_chunk_config():
diff --git a/tests/dfx/conftest.py b/tests/dfx/conftest.py
index 0c59bed994f..8dee350bd0c 100644
--- a/tests/dfx/conftest.py
+++ b/tests/dfx/conftest.py
@@ -11,7 +11,7 @@
 
 from tests.dfx.reliability.helpers import list_remote_process_pids_by_pattern, post_chat_completions_raw
 from tests.helpers.runtime import OmniServerParams
-from tests.helpers.stage_config import modify_stage_config
+from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config
 from vllm_omni.platforms import current_omni_platform
 
 
@@ -69,7 +69,7 @@ def _build_serve_args(serve_args: Any) -> list[str]:
 
 def create_unique_server_params(
     configs: list[dict[str, Any]],
-    stage_configs_dir: Path,
+    stage_configs_dir: Path | None = None,
 ) -> list[tuple[str, str, str | None, str | None, tuple[str, ...], bool]]:
     """Return one row per unique server configuration.
 
@@ -87,7 +87,11 @@ def create_unique_server_params(
         model = server_params["model"]
         stage_config_name = server_params.get("stage_config_name")
         if stage_config_name:
-            stage_config_path = str(stage_configs_dir / stage_config_name)
+            stage_config_path = (
+                str(stage_configs_dir / stage_config_name)
+                if stage_configs_dir is not None
+                else get_deploy_config_path(stage_config_name)
+            )
             delete = server_params.get("delete", None)
             update = server_params.get("update", None)
             stage_config_path = modify_stage(stage_config_path, update, delete)
diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py
index c1f3264c18e..868ba97f7bc 100644
--- a/tests/dfx/perf/scripts/run_benchmark.py
+++ b/tests/dfx/perf/scripts/run_benchmark.py
@@ -51,8 +51,7 @@ def _get_config_file_from_argv() -> str | None:
 OMNI_RESULT_TEMPLATE_PATH = Path(__file__).parent / "result_omni_template.json"
 
 
-DEPLOY_CONFIGS_DIR = Path(__file__).parent.parent / "deploy"
-test_params = create_unique_server_params(BENCHMARK_CONFIGS, DEPLOY_CONFIGS_DIR)
+test_params = create_unique_server_params(BENCHMARK_CONFIGS)
 server_to_benchmark_mapping = create_test_parameter_mapping(BENCHMARK_CONFIGS)
 
 _omni_server_lock = threading.Lock()
diff --git a/tests/diffusion/test_profiler.py b/tests/diffusion/test_profiler.py
new file mode 100644
index 00000000000..3fcddf79183
--- /dev/null
+++ b/tests/diffusion/test_profiler.py
@@ -0,0 +1,321 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Unit tests for profiler trace collection across ranks.
+
+Tests that:
+- OmniTorchProfilerWrapper writes trace files for each rank
+- DiffusionWorker start/stop_profile lifecycle works per rank
+- OmniStage handles profiler tasks via inline engine when queues are absent
+"""
+
+import os
+import tempfile
+
+import pytest
+from pytest_mock import MockerFixture
+from vllm.config import ProfilerConfig
+
+from vllm_omni.entrypoints.omni_stage import OmniStage
+from vllm_omni.entrypoints.stage_utils import OmniStageTaskType
+from vllm_omni.profiler.omni_torch_profiler import OmniTorchProfilerWrapper
+
+pytestmark = [pytest.mark.cpu]
+
+
+# ---------------------------------------------------------------------------
+# OmniTorchProfilerWrapper: per-rank trace file naming
+# ---------------------------------------------------------------------------
+
+
+class TestProfilerTraceNaming:
+    """Verify that each rank produces a uniquely named trace file."""
+
+    def test_trace_filename_includes_rank(self):
+        """_on_trace_ready should produce <filename>_rank<N>.json."""
+        with tempfile.TemporaryDirectory() as trace_dir:
+            config = ProfilerConfig(
+                profiler="torch",
+                torch_profiler_dir=trace_dir,
+            )
+            for rank in (0, 1):
+                profiler = OmniTorchProfilerWrapper(
+                    profiler_config=config,
+                    worker_name=f"test_rank_{rank}",
+                    local_rank=rank,
+                    activities=["CPU"],
+                )
+                profiler.set_trace_filename("test_trace")
+
+                # Start → do nothing → stop triggers _on_trace_ready
+                profiler.start()
+                profiler.stop()
+
+            # Both rank files should exist
+            files = sorted(os.listdir(trace_dir))
+            rank0_files = [f for f in files if "_rank0.json" in f]
+            rank1_files = [f for f in files if "_rank1.json" in f]
+            assert rank0_files, f"No rank-0 trace found in {files}"
+            assert rank1_files, f"No rank-1 trace found in {files}"
+
+    def test_trace_filename_with_full_path(self):
+        """When filename already contains a directory, use as-is."""
+        with tempfile.TemporaryDirectory() as trace_dir:
+            config = ProfilerConfig(
+                profiler="torch",
+                torch_profiler_dir=trace_dir,
+            )
+            profiler = OmniTorchProfilerWrapper(
+                profiler_config=config,
+                worker_name="test",
+                local_rank=3,
+                activities=["CPU"],
+            )
+            full_path = os.path.join(trace_dir, "subdir", "my_trace")
+            profiler.set_trace_filename(full_path)
+            profiler.start()
+            profiler.stop()
+
+            expected = f"{full_path}_rank3.json"
+            assert os.path.exists(expected), (
+                f"Expected {expected}, found: {os.listdir(os.path.dirname(expected))}"
+            )
+
+    def test_get_results_returns_trace_path(self):
+        """get_results() should return the path of the exported trace."""
+        with tempfile.TemporaryDirectory() as trace_dir:
+            config = ProfilerConfig(
+                profiler="torch",
+                torch_profiler_dir=trace_dir,
+                torch_profiler_use_gzip=False,
+            )
+            profiler = OmniTorchProfilerWrapper(
+                profiler_config=config,
+                worker_name="test",
+                local_rank=0,
+                activities=["CPU"],
+            )
+            profiler.set_trace_filename("results_test")
+            profiler.start()
+            profiler.stop()
+
+            results = profiler.get_results()
+            assert results["trace"] is not None
+            assert results["trace"].endswith("_rank0.json")
+            assert os.path.exists(results["trace"])
+
+
+# ---------------------------------------------------------------------------
+# DiffusionWorker: profiler lifecycle
+# ---------------------------------------------------------------------------
+
+
+class TestDiffusionWorkerProfiler:
+    """Test DiffusionWorker.start_profile / stop_profile."""
+
+    @pytest.fixture
+    def worker_with_profiler(self, mocker: MockerFixture):
+        """Create a DiffusionWorker with a real profiler (CPU-only)."""
+        from vllm_omni.diffusion.worker.diffusion_worker import DiffusionWorker
+
+        config = mocker.Mock()
+        config.num_gpus = 1
+        config.master_port = 12345
+        config.enable_sleep_mode = False
+        config.cache_backend = None
+        config.cache_config = None
+        config.model = "test-model"
+        config.profiler_config = ProfilerConfig(
+            profiler="torch",
+            torch_profiler_dir=tempfile.mkdtemp(),
+            torch_profiler_use_gzip=False,
+        )
+
+        mocker.patch.object(DiffusionWorker, "init_device")
+        mocker.patch.object(DiffusionWorker, "load_model")
+        mocker.patch.object(DiffusionWorker, "init_lora_manager")
+
+        worker = DiffusionWorker(
+            local_rank=0, rank=0, od_config=config, skip_load_model=True,
+        )
+        worker.model_runner = mocker.Mock()
+        return worker
+
+    def test_start_stop_creates_trace(self, worker_with_profiler):
+        """start_profile + stop_profile should produce a trace file."""
+        worker = worker_with_profiler
+        trace_dir = worker.od_config.profiler_config.torch_profiler_dir
+
+        template = os.path.join(trace_dir, "test_worker")
+        worker.start_profile(template)
+        worker.stop_profile()
+
+        files = os.listdir(trace_dir)
+        assert any("_rank0.json" in f for f in files), f"No rank-0 trace in {files}"
+
+    def test_stop_profile_returns_results(self, worker_with_profiler):
+        """stop_profile should return dict with trace path."""
+        worker = worker_with_profiler
+        trace_dir = worker.od_config.profiler_config.torch_profiler_dir
+
+        template = os.path.join(trace_dir, "test_results")
+        worker.start_profile(template)
+        result = worker.stop_profile()
+
+        assert isinstance(result, dict)
+        assert "trace" in result
+        assert result["trace"] is not None
+        assert os.path.exists(result["trace"])
+
+    def test_multiple_ranks_produce_separate_traces(self, mocker: MockerFixture):
+        """Two workers with different local_rank should write separate files."""
+        from vllm_omni.diffusion.worker.diffusion_worker import DiffusionWorker
+
+        trace_dir = tempfile.mkdtemp()
+
+        workers = []
+        for rank in (0, 1):
+            config = mocker.Mock()
+            config.num_gpus = 2
+            config.master_port = 12345
+            config.enable_sleep_mode = False
+            config.cache_backend = None
+            config.cache_config = None
+            config.model = "test-model"
+            config.profiler_config = ProfilerConfig(
+                profiler="torch",
+                torch_profiler_dir=trace_dir,
+                torch_profiler_use_gzip=False,
+            )
+
+            mocker.patch.object(DiffusionWorker, "init_device")
+            mocker.patch.object(DiffusionWorker, "load_model")
+            mocker.patch.object(DiffusionWorker, "init_lora_manager")
+
+            worker = DiffusionWorker(
+                local_rank=rank, rank=rank, od_config=config, skip_load_model=True,
+            )
+            worker.model_runner = mocker.Mock()
+            workers.append(worker)
+
+        # Start and stop profiling on both workers
+        template = os.path.join(trace_dir, "multi_rank")
+        for w in workers:
+            w.start_profile(template)
+        for w in workers:
+            w.stop_profile()
+
+        files = os.listdir(trace_dir)
+        rank0_files = [f for f in files if "_rank0.json" in f]
+        rank1_files = [f for f in files if "_rank1.json" in f]
+        assert rank0_files, f"Missing rank-0 trace in {files}"
+        assert rank1_files, f"Missing rank-1 trace in {files}"
+
+
+# ---------------------------------------------------------------------------
+# OmniStage: inline engine profiler routing
+# ---------------------------------------------------------------------------
+
+
+class TestOmniStageInlineProfiler:
+    """Test that OmniStage routes profiler tasks to inline engine."""
+
+    @pytest.fixture
+    def stage_with_inline_engine(self, mocker: MockerFixture):
+        """Create an OmniStage with a mock inline engine (no queues)."""
+        stage_config = mocker.Mock()
+        stage_config.stage_id = 0
+        stage_config.engine_args = mocker.Mock()
+        stage_config.engine_args.model_stage = "diffusion"
+        stage_config.engine_args.engine_output_type = None
+        stage_config.engine_args.stage_id = 0
+        stage_config.runtime = mocker.Mock()
+        stage_config.runtime.requires_multimodal_data = False
+        stage_config.stage_type = "diffusion"
+        stage_config.final_output = True
+        stage_config.final_output_type = "video"
+        stage_config.is_comprehension = False
+        # No custom_process_input_func
+        del stage_config.custom_process_input_func
+        # No prompt_expand_func
+        del stage_config.prompt_expand_func
+        # Default sampling params
+        stage_config.default_sampling_params = {}
+        # No input sources
+        stage_config.input_sources = []
+        stage_config.engine_input_source = []
+
+        # Patch SamplingParams import to avoid full init
+        mocker.patch(
+            "vllm_omni.entrypoints.omni_stage.OmniDiffusionSamplingParams",
+            return_value=mocker.Mock(),
+        )
+
+        stage = OmniStage(stage_config)
+
+        # Attach a mock inline engine (simulates inline diffusion mode)
+        mock_engine = mocker.Mock()
+        mock_engine.start_profile = mocker.Mock()
+        mock_engine.stop_profile = mocker.Mock(return_value={"traces": ["t.json"], "tables": []})
+        stage._inline_engine = mock_engine
+
+        return stage, mock_engine
+
+    def test_submit_profiler_start_routes_to_inline_engine(self, stage_with_inline_engine):
+        """submit(PROFILER_START) should call inline_engine.start_profile()."""
+        stage, mock_engine = stage_with_inline_engine
+
+        stage.submit({"type": OmniStageTaskType.PROFILER_START})
+
+        mock_engine.start_profile.assert_called_once()
+
+    def test_submit_profiler_stop_routes_to_inline_engine(self, stage_with_inline_engine):
+        """submit(PROFILER_STOP) should call inline_engine.stop_profile()."""
+        stage, mock_engine = stage_with_inline_engine
+
+        stage.submit({"type": OmniStageTaskType.PROFILER_STOP})
+
+        mock_engine.stop_profile.assert_called_once()
+
+    def test_stop_profile_returns_inline_engine_result(self, stage_with_inline_engine):
+        """stop_profile() should return the inline engine's result directly."""
+        stage, mock_engine = stage_with_inline_engine
+
+        result = stage.stop_profile()
+
+        mock_engine.stop_profile.assert_called_once()
+        assert result == {"traces": ["t.json"], "tables": []}
+
+    def test_submit_asserts_when_no_queue_and_no_inline_engine(self, mocker: MockerFixture):
+        """submit() should assert when neither queues nor inline engine available."""
+        stage_config = mocker.Mock()
+        stage_config.stage_id = 0
+        stage_config.engine_args = mocker.Mock()
+        stage_config.engine_args.model_stage = "diffusion"
+        stage_config.engine_args.engine_output_type = None
+        stage_config.engine_args.stage_id = 0
+        stage_config.runtime = mocker.Mock()
+        stage_config.runtime.requires_multimodal_data = False
+        stage_config.stage_type = "diffusion"
+        stage_config.final_output = False
+        stage_config.final_output_type = None
+        stage_config.is_comprehension = False
+        del stage_config.custom_process_input_func
+        del stage_config.prompt_expand_func
+        stage_config.default_sampling_params = {}
+        stage_config.input_sources = []
+        stage_config.engine_input_source = []
+
+        mocker.patch(
+            "vllm_omni.entrypoints.omni_stage.OmniDiffusionSamplingParams",
+            return_value=mocker.Mock(),
+        )
+
+        stage = OmniStage(stage_config)
+        # No inline engine, no queues
+        assert stage._inline_engine is None
+        assert stage._in_q is None
+
+        with pytest.raises(AssertionError):
+            stage.submit({"type": OmniStageTaskType.PROFILER_START})
diff --git a/tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py b/tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py
index 7fb71b28d77..6b3bb5e90e3 100644
--- a/tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py
+++ b/tests/e2e/accuracy/qwen3_omni/run_qwen_omni_acc_benchmark.py
@@ -297,8 +297,10 @@ def build_arg_parser() -> argparse.ArgumentParser:
     p.add_argument("--daily-omni-input-mode", choices=("all", "visual", "audio"), default="all")
     p.add_argument(
         "--daily-extra-body-json",
-        default='{"modalities":["text"]}',
-        help="JSON merged into each chat request for Daily-Omni (default matches common L4 / text-output runs).",
+        default='{"modalities":["text"],"max_tokens":8192}',
+        help="JSON merged into each chat request for Daily-Omni. max_tokens:8192 gives the thinker "
+        "enough room to complete its reasoning trace before producing the final MCQ answer "
+        "(the production server default of 2048 can be insufficient for complex multimodal questions).",
     )
     p.add_argument(
         "--daily-omni-save-eval-items",
diff --git a/tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py b/tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py
index 773f7c1108c..6ef21a7ccbf 100644
--- a/tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py
+++ b/tests/e2e/accuracy/qwen3_omni/test_qwen3_omni.py
@@ -51,8 +51,7 @@
 )
 from tests.helpers.mark import hardware_test
 from tests.helpers.runtime import OmniServerParams
-from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config
-from vllm_omni.platforms import current_omni_platform
+from tests.helpers.stage_config import QWEN3_OMNI_MOE_DEPLOY
 
 _E2E_ROOT = Path(__file__).resolve().parent.parent.parent
 
@@ -60,26 +59,10 @@
 
 pytestmark = [pytest.mark.full_model, pytest.mark.omni]
 
-_CI_DEPLOY = get_deploy_config_path("ci/qwen3_omni_moe.yaml")
+_DEPLOY = QWEN3_OMNI_MOE_DEPLOY
 
 
-def get_chunk_config(config_path: str | None = None):
-    """Load the qwen3_omni CI deploy yaml with async_chunk modifications for streaming mode."""
-    if config_path is None:
-        config_path = _CI_DEPLOY
-    # TODO: remove this workaround once legacy `stage_args` path is deleted.
-    # The pipeline (qwen3_omni/pipeline.py) already wires
-    # thinker2talker_async_chunk / talker2code2wav_async_chunk on stage 0/1,
-    # so only async_chunk needs flipping. Writing nested `engine_args:` into
-    # the new-schema overlay trips _parse_stage_deploy's legacy branch and
-    # drops flat fields (load_format, max_num_seqs, ...).
-    return modify_stage_config(config_path, updates={"async_chunk": True})
-
-
-if current_omni_platform.is_xpu():
-    stage_configs = [_CI_DEPLOY]
-else:  # CUDA + ROCm MI325 share the same deploy config
-    stage_configs = [get_chunk_config()]
+stage_configs = [_DEPLOY]
 
 test_params = [
     OmniServerParams(model=model, stage_config_path=stage_config) for model in models for stage_config in stage_configs
diff --git a/tests/e2e/offline_inference/test_qwen3_omni.py b/tests/e2e/offline_inference/test_qwen3_omni.py
index c4d257b5114..9e47efeb15f 100644
--- a/tests/e2e/offline_inference/test_qwen3_omni.py
+++ b/tests/e2e/offline_inference/test_qwen3_omni.py
@@ -11,20 +11,20 @@
 
 from tests.helpers.mark import hardware_test
 from tests.helpers.media import generate_synthetic_video
-from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config
+from tests.helpers.stage_config import QWEN3_OMNI_MOE_DEPLOY, modify_stage_config
 from vllm_omni.platforms import current_omni_platform
 
 models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
 
 
-# Single CI deploy YAML; rocm/xpu deltas are picked automatically via the
+# Single deploy YAML; rocm/xpu deltas are picked automatically via the
 # platforms: section. Only CUDA needs an extra enforce_eager tweak.
-_CI_DEPLOY = get_deploy_config_path("ci/qwen3_omni_moe.yaml")
+_DEPLOY = QWEN3_OMNI_MOE_DEPLOY
 
 
 def get_cuda_graph_config():
     return modify_stage_config(
-        _CI_DEPLOY,
+        _DEPLOY,
         updates={
             "stages": {
                 0: {"enforce_eager": True},
@@ -35,7 +35,7 @@ def get_cuda_graph_config():
 
 
 if current_omni_platform.is_rocm() or current_omni_platform.is_xpu():
-    stage_configs = [_CI_DEPLOY]
+    stage_configs = [_DEPLOY]
 else:
     stage_configs = [get_cuda_graph_config()]
 
diff --git a/tests/e2e/offline_inference/test_qwen3_omni_autoround_w4a16.py b/tests/e2e/offline_inference/test_qwen3_omni_autoround_w4a16.py
index 3a3c874b64b..6b1690a26fe 100644
--- a/tests/e2e/offline_inference/test_qwen3_omni_autoround_w4a16.py
+++ b/tests/e2e/offline_inference/test_qwen3_omni_autoround_w4a16.py
@@ -20,7 +20,7 @@
     generate_synthetic_image,
     generate_synthetic_video,
 )
-from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config
+from tests.helpers.stage_config import QWEN3_OMNI_MOE_DEPLOY, modify_stage_config
 
 QUANTIZED_MODEL = "Intel/Qwen3-Omni-30B-A3B-Instruct-int4-AutoRound"
 BASELINE_MODEL = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
@@ -29,7 +29,7 @@
 QUANTIZED_MODEL = os.environ.get("QWEN3_OMNI_AUTOROUND_MODEL", QUANTIZED_MODEL)
 BASELINE_MODEL = os.environ.get("QWEN3_OMNI_BASELINE_MODEL", BASELINE_MODEL)
 
-_CI_DEPLOY = get_deploy_config_path("ci/qwen3_omni_moe.yaml")
+_DEPLOY = QWEN3_OMNI_MOE_DEPLOY
 
 
 @pytest.fixture(scope="module", autouse=True)
@@ -48,7 +48,7 @@ def _qwen3_omni_env():
 def _get_stage_config():
     """Build a CI-friendly stage config with eager mode."""
     return modify_stage_config(
-        _CI_DEPLOY,
+        _DEPLOY,
         updates={
             "stages": {
                 0: {"enforce_eager": True},
diff --git a/tests/e2e/online_serving/test_qwen3_omni.py b/tests/e2e/online_serving/test_qwen3_omni.py
index ed210dc9c2f..4c586c7e9a3 100644
--- a/tests/e2e/online_serving/test_qwen3_omni.py
+++ b/tests/e2e/online_serving/test_qwen3_omni.py
@@ -9,8 +9,7 @@
 from tests.helpers.mark import hardware_test
 from tests.helpers.media import generate_synthetic_audio, generate_synthetic_image, generate_synthetic_video
 from tests.helpers.runtime import OmniServerParams, dummy_messages_from_mix_data
-from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config
-from vllm_omni.platforms import current_omni_platform
+from tests.helpers.stage_config import QWEN3_OMNI_MOE_DEPLOY, modify_stage_config
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
@@ -21,20 +20,7 @@
 # Set VLLM_TEST_PD_MODE=1 to test PD disaggregation (follow-up — deploy overlay not yet migrated).
 _USE_PD = os.environ.get("VLLM_TEST_PD_MODE", "0") == "1"
 
-_CI_DEPLOY = get_deploy_config_path("ci/qwen3_omni_moe.yaml")
-
-
-def get_chunk_config(config_path: str | None = None):
-    """Load the qwen3_omni CI deploy yaml with async_chunk modifications for streaming mode."""
-    if config_path is None:
-        config_path = _CI_DEPLOY
-    # TODO: remove this workaround once legacy `stage_args` path is deleted.
-    # The pipeline (qwen3_omni/pipeline.py) already wires
-    # thinker2talker_async_chunk / talker2code2wav_async_chunk on stage 0/1,
-    # so only async_chunk needs flipping. Writing nested `engine_args:` into
-    # the new-schema overlay trips _parse_stage_deploy's legacy branch and
-    # drops flat fields (load_format, max_num_seqs, ...).
-    return modify_stage_config(config_path, updates={"async_chunk": True})
+_DEPLOY = QWEN3_OMNI_MOE_DEPLOY
 
 
 def get_prefix_caching_config(config_path: str):
@@ -42,24 +28,21 @@ def get_prefix_caching_config(config_path: str):
     path = modify_stage_config(
         config_path,
         updates={
-            "stage_args": {
-                0: {"engine_args.enable_prefix_caching": True},
+            "stages": {
+                0: {"enable_prefix_caching": True},
             },
         },
     )
     return path
 
 
-# Platform-specific overrides live inside the new deploy yaml's ``platforms:``
-# section, so a single ``_CI_DEPLOY`` path serves CUDA, ROCm, and XPU.
+# Platform-specific overrides live inside the deploy yaml's ``platforms:``
+# section, so a single ``_DEPLOY`` path serves CUDA, ROCm, and XPU.
 # TODO: re-add VLLM_TEST_PD_MODE branch once the PD-disaggregation deploy
 # overlay has been migrated to the new schema (previously used the deleted
 # ``qwen3_omni_moe_pd_ci.yaml`` stage-configs file).
-if current_omni_platform.is_xpu():
-    stage_configs = [_CI_DEPLOY]
-else:  # CUDA + ROCm MI325 share the same deploy config
-    stage_configs = [get_chunk_config()]
-prefix_caching_stage_configs = [get_prefix_caching_config(_CI_DEPLOY)]
+stage_configs = [_DEPLOY]
+prefix_caching_stage_configs = [get_prefix_caching_config(_DEPLOY)]
 
 # Create parameter combinations for model and stage config
 test_params = [
diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
index 9601057d44c..a6bef9d135f 100644
--- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
@@ -11,7 +11,7 @@
 from tests.helpers.mark import hardware_test
 from tests.helpers.media import generate_synthetic_audio, generate_synthetic_image, generate_synthetic_video
 from tests.helpers.runtime import OmniServerParams, dummy_messages_from_mix_data
-from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config
+from tests.helpers.stage_config import QWEN3_OMNI_MOE_DEPLOY, modify_stage_config
 
 pytestmark = [pytest.mark.full_model, pytest.mark.omni]
 
@@ -66,11 +66,9 @@ def get_async_chunk_config(default_path):
     )
 
 
-# CI deploy YAML (single file; xpu deltas applied via ``platforms:`` section).
-# The overlay explicitly sets ``async_chunk: False``, so ``default`` tests the
-# sync path and ``async_chunk`` tests the streaming path with a longer thinker
-# output — two distinct scenarios, kept as separate parametrizations.
-default_path = get_deploy_config_path("ci/qwen3_omni_moe.yaml")
+# Qwen3-Omni uses the default deploy YAML. The sync variant disables async
+# chunk through CLI so both parametrizations share the same config source.
+default_path = QWEN3_OMNI_MOE_DEPLOY
 
 test_params = [
     pytest.param(
diff --git a/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py b/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py
index 90f8897c58f..dae5a254d4c 100644
--- a/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py
+++ b/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py
@@ -23,7 +23,7 @@
     generate_synthetic_audio,
 )
 from tests.helpers.runtime import OmniServerParams
-from tests.helpers.stage_config import get_deploy_config_path
+from tests.helpers.stage_config import QWEN3_OMNI_MOE_DEPLOY
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
@@ -32,9 +32,9 @@
 # Synthetic input for realtime E2E (``generate_synthetic_audio``); distinct cache file per phrase.
 REALTIME_SYNTH_PHRASE_TEXT = "Translate into Chinese: Beijing is the Capital of China"
 
-# The new-schema CI overlay bakes in async_chunk: False and covers CUDA/ROCm/XPU
-# via its ``platforms:`` section, so one path serves all three.
-default_stage_config = get_deploy_config_path("ci/qwen3_omni_moe.yaml")
+# Use the default deploy config; the sync realtime path disables async chunk
+# through CLI.
+default_stage_config = QWEN3_OMNI_MOE_DEPLOY
 
 realtime_server_params = [
     pytest.param(
diff --git a/tests/examples/online_serving/test_qwen3_omni.py b/tests/examples/online_serving/test_qwen3_omni.py
index 1bb577ed656..ade101bd065 100644
--- a/tests/examples/online_serving/test_qwen3_omni.py
+++ b/tests/examples/online_serving/test_qwen3_omni.py
@@ -21,15 +21,14 @@
 from tests.helpers.mark import hardware_test
 from tests.helpers.media import convert_audio_file_to_text, cosine_similarity_text
 from tests.helpers.runtime import OmniServerParams
-from tests.helpers.stage_config import get_deploy_config_path
+from tests.helpers.stage_config import QWEN3_OMNI_MOE_DEPLOY
 
 pytestmark = [pytest.mark.full_model, pytest.mark.example, pytest.mark.omni]
 
 models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
 
 
-stage_configs = [get_deploy_config_path("ci/qwen3_omni_moe.yaml")]
-
+stage_configs = [QWEN3_OMNI_MOE_DEPLOY]
 
 example_dir = str(Path(__file__).parent.parent.parent.parent / "examples" / "online_serving")
 # Create parameter combinations for model and stage config
@@ -205,15 +204,14 @@ def test_stream_001(omni_server) -> None:
     text_content_tmp = extract_content_after_keyword("content:", result)
     text_content = strip_audio_saved_to_lines(text_content_tmp)
 
-    # Verify text output same as audio output
+    # In streaming mode, audio is emitted as multiple small chunks; only the last
+    # chunk path is captured by extract_last_audio_saved_path, so keyword
+    # verification must use text_content (the complete accumulated response).
     wav_path = extract_last_audio_saved_path(result)
     audio_content = convert_audio_file_to_text(output_path=f"./{wav_path}")
     print(f"text content is: {text_content}")
-    assert "cherry blossom" in audio_content, "The output does not contain any of the keywords."
     print(f"audio content is: {audio_content}")
-    similarity = cosine_similarity_text(audio_content.lower(), text_content.lower())
-    print(f"similarity is: {similarity}")
-    assert similarity > 0.9, "The audio content is not same as the text"
+    assert "cherry blossom" in text_content, "The output does not contain any of the keywords."
     # TODO: Verify the E2E latency after confirmation baseline.
 
 
diff --git a/tests/helpers/stage_config.py b/tests/helpers/stage_config.py
index 66a58378d32..fcde3161d17 100644
--- a/tests/helpers/stage_config.py
+++ b/tests/helpers/stage_config.py
@@ -325,92 +325,6 @@ def delete_by_path(config_dict: dict, path: str) -> None:
             },
         },
     },
-    "qwen3_omni_moe": {
-        "base_config": "qwen3_omni_moe.yaml",
-        "async_chunk": False,
-        "stages": [
-            {
-                "stage_id": 0,
-                "max_num_seqs": 5,
-                "max_model_len": 32768,
-                "mm_processor_cache_gb": 0,
-                "load_format": "dummy",
-                "default_sampling_params": {"max_tokens": 150, "ignore_eos": False},
-            },
-            {
-                "stage_id": 1,
-                "gpu_memory_utilization": 0.5,
-                "max_num_seqs": 5,
-                "max_model_len": 32768,
-                "load_format": "dummy",
-                "default_sampling_params": {"max_tokens": 1000},
-            },
-            {
-                "stage_id": 2,
-                "max_num_seqs": 5,
-                "max_num_batched_tokens": 100000,
-                "load_format": "dummy",
-                "default_sampling_params": {"max_tokens": 2000},
-            },
-        ],
-        "platforms": {
-            "rocm": {
-                "stages": [
-                    {"stage_id": 0, "max_num_seqs": 1, "default_sampling_params": {"max_tokens": 100}},
-                    {
-                        "stage_id": 1,
-                        "max_num_seqs": 1,
-                        "enforce_eager": True,
-                        "default_sampling_params": {"max_tokens": 100},
-                    },
-                    {
-                        "stage_id": 2,
-                        "max_num_seqs": 1,
-                        "max_num_batched_tokens": 1000000,
-                        "default_sampling_params": {"max_tokens": 200},
-                    },
-                ],
-            },
-            "xpu": {
-                "stages": [
-                    {
-                        "stage_id": 0,
-                        "gpu_memory_utilization": 0.85,
-                        "max_num_seqs": 1,
-                        "tensor_parallel_size": 4,
-                        "enforce_eager": True,
-                        "max_num_batched_tokens": 4096,
-                        "max_model_len": 4096,
-                        "max_cudagraph_capture_size": 0,
-                        "skip_mm_profiling": True,
-                        "devices": "0,1,2,3",
-                        "default_sampling_params": {"max_tokens": 100, "ignore_eos": False},
-                    },
-                    {
-                        "stage_id": 1,
-                        "gpu_memory_utilization": 0.6,
-                        "max_num_seqs": 1,
-                        "enforce_eager": True,
-                        "max_num_batched_tokens": 4096,
-                        "max_model_len": 4096,
-                        "max_cudagraph_capture_size": 0,
-                        "skip_mm_profiling": True,
-                        "devices": "4",
-                    },
-                    {
-                        "stage_id": 2,
-                        "gpu_memory_utilization": 0.3,
-                        "max_num_seqs": 1,
-                        "max_num_batched_tokens": 100000,
-                        "max_cudagraph_capture_size": 0,
-                        "skip_mm_profiling": True,
-                        "devices": "5",
-                        "default_sampling_params": {"max_tokens": 2000},
-                    },
-                ],
-            },
-        },
-    },
     "bagel": {
         "base_config": "bagel.yaml",
         "stages": [
@@ -588,7 +502,10 @@ def get_deploy_config_path(rel_path: str) -> str:
     return str(_DEPLOY_DIR / rel_path)
 
 
+QWEN3_OMNI_MOE_DEPLOY = get_deploy_config_path("qwen3_omni_moe.yaml")
+
 __all__ = [
     "modify_stage_config",
     "get_deploy_config_path",
+    "QWEN3_OMNI_MOE_DEPLOY",
 ]
diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py
index 22dcf26d097..dd61c7360f9 100644
--- a/tests/test_config_factory.py
+++ b/tests/test_config_factory.py
@@ -136,6 +136,15 @@ def test_to_omegaconf_max_batch_size_deprecation(self):
             assert len(deprecation_warnings) == 1
             assert "max_batch_size" in str(deprecation_warnings[0].message)
 
+    def test_to_omegaconf_leaves_max_num_seqs_unset_by_default(self):
+        """Let vLLM choose its default max_num_seqs when stage config omits it."""
+        config = StageConfig(
+            stage_id=0,
+            model_stage="thinker",
+        )
+        omega_config = config.to_omegaconf()
+        assert "max_num_seqs" not in omega_config.engine_args
+
     def test_to_omegaconf_max_num_seqs_in_engine_args(self):
         """Test that max_num_seqs in yaml_engine_args takes precedence."""
         config = StageConfig(
@@ -1156,41 +1165,30 @@ def test_thinker_only_yaml_loads_and_merges(self):
 class TestBaseConfigInheritance:
     """Test deploy YAML base_config inheritance."""
 
-    def test_ci_inherits_from_main(self):
-        from tests.helpers.stage_config import get_deploy_config_path
+    def test_qwen3_omni_deploy_config(self):
         from vllm_omni.config.stage_config import load_deploy_config
 
-        ci_path = Path(get_deploy_config_path("ci/qwen3_omni_moe.yaml"))
-        if not ci_path.exists():
-            pytest.skip("CI deploy config not found")
+        deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml"
+        if not deploy_path.exists():
+            pytest.skip("Qwen3-Omni deploy config not found")
 
-        deploy = load_deploy_config(ci_path)
+        deploy = load_deploy_config(deploy_path)
         assert len(deploy.stages) == 3
-        # CI overrides
-        assert deploy.stages[0].engine_extras.get("load_format") == "dummy"
-        assert deploy.stages[0].max_num_seqs == 5
-        # Inherited from base
-        assert deploy.stages[0].gpu_memory_utilization == 0.9
+        assert deploy.stages[0].engine_extras.get("gpu_memory_utilization") == 0.9
         assert deploy.connectors is not None
         assert "connector_of_shared_memory" in deploy.connectors
-        # CI overlay explicitly sets async_chunk: False (see
-        # tests.helpers.stage_config._CI_OVERLAYS and PR #2383 discussion). Overlay
-        # bool overrides base even when the base yaml has async_chunk: true.
-        assert deploy.async_chunk is False
+        assert deploy.async_chunk is True
 
-    def test_ci_sampling_merge(self):
-        from tests.helpers.stage_config import get_deploy_config_path
+    def test_qwen3_omni_deploy_sampling_params(self):
         from vllm_omni.config.stage_config import load_deploy_config
 
-        ci_path = Path(get_deploy_config_path("ci/qwen3_omni_moe.yaml"))
-        if not ci_path.exists():
-            pytest.skip("CI deploy config not found")
+        deploy_path = Path(__file__).parent.parent / "vllm_omni" / "deploy" / "qwen3_omni_moe.yaml"
+        if not deploy_path.exists():
+            pytest.skip("Qwen3-Omni deploy config not found")
 
-        deploy = load_deploy_config(ci_path)
+        deploy = load_deploy_config(deploy_path)
         s0 = deploy.stages[0].default_sampling_params
-        # CI overrides max_tokens
-        assert s0["max_tokens"] == 150
-        # Inherited from base
+        assert s0["max_tokens"] == 2048
         assert s0["temperature"] == 0.4
         assert s0["seed"] == 42
 
@@ -1207,7 +1205,7 @@ def test_pure_inheritance_overlay(self, tmp_path):
 
         deploy = load_deploy_config(overlay)
         assert len(deploy.stages) == 3
-        assert deploy.stages[0].gpu_memory_utilization == 0.9
+        assert deploy.stages[0].engine_extras.get("gpu_memory_utilization") == 0.9
 
     def test_single_field_overlay(self, tmp_path):
         """An overlay overriding one stage field merges with the base."""
@@ -1221,9 +1219,10 @@ def test_single_field_overlay(self, tmp_path):
         overlay.write_text(f"base_config: {base}\nstages:\n  - stage_id: 2\n    max_num_batched_tokens: 1000000\n")
 
         deploy = load_deploy_config(overlay)
-        assert deploy.stages[2].max_num_batched_tokens == 1000000
-        # Rest inherited
-        assert deploy.stages[0].gpu_memory_utilization == 0.9
+        # max_num_batched_tokens goes into engine_extras (not a StageDeployConfig field)
+        assert deploy.stages[2].engine_extras.get("max_num_batched_tokens") == 1000000
+        # max_num_seqs is in engine_extras (no longer a direct StageDeployConfig field)
+        assert deploy.stages[0].engine_extras.get("max_num_seqs") == 64
 
 
 class TestPlatformOverrides:
@@ -1241,11 +1240,11 @@ def test_npu_overrides(self):
         deploy = load_deploy_config(deploy_path)
         deploy = _apply_platform_overrides(deploy, platform="npu")
 
-        assert deploy.stages[0].gpu_memory_utilization == 0.6
-        assert deploy.stages[0].tensor_parallel_size == 2
+        assert deploy.stages[0].engine_extras.get("gpu_memory_utilization") == 0.6
+        assert deploy.stages[0].engine_extras.get("tensor_parallel_size") == 2
         assert deploy.stages[0].devices == "0,1"
         # Stage 2 unaffected fields stay at base
-        assert deploy.stages[2].enforce_eager is True
+        assert deploy.stages[2].engine_extras.get("enforce_eager") is True
 
     def test_xpu_overrides(self):
         from pathlib import Path
@@ -1259,7 +1258,7 @@ def test_xpu_overrides(self):
         deploy = load_deploy_config(deploy_path)
         deploy = _apply_platform_overrides(deploy, platform="xpu")
 
-        assert deploy.stages[0].tensor_parallel_size == 4
+        assert deploy.stages[0].engine_extras.get("tensor_parallel_size") == 4
         assert deploy.stages[0].devices == "0,1,2,3"
         assert deploy.stages[0].engine_extras.get("max_cudagraph_capture_size") == 0
 
@@ -1273,9 +1272,9 @@ def test_unknown_platform_noop(self):
             pytest.skip("Deploy config not found")
 
         deploy = load_deploy_config(deploy_path)
-        original_mem = deploy.stages[0].gpu_memory_utilization
+        original_mem = deploy.stages[0].engine_extras.get("gpu_memory_utilization")
         deploy = _apply_platform_overrides(deploy, platform="unknown_hw")
-        assert deploy.stages[0].gpu_memory_utilization == original_mem
+        assert deploy.stages[0].engine_extras.get("gpu_memory_utilization") == original_mem
 
     def test_platforms_deep_merge_inheritance(self, tmp_path):
         """Overlay's platforms: block layers onto base's, per-stage."""
@@ -1305,10 +1304,10 @@ def test_platforms_deep_merge_inheritance(self, tmp_path):
         deploy = load_deploy_config(overlay)
         deploy = _apply_platform_overrides(deploy, platform="rocm")
         # Both base's enforce_eager and overlay's max_num_seqs should apply.
-        assert deploy.stages[0].enforce_eager is True
-        assert deploy.stages[0].max_num_seqs == 1
+        assert deploy.stages[0].engine_extras.get("enforce_eager") is True
+        assert deploy.stages[0].engine_extras.get("max_num_seqs") == 1
         # Inherited stage default not touched by overlay platforms section.
-        assert deploy.stages[0].gpu_memory_utilization == 0.9
+        assert deploy.stages[0].engine_extras.get("gpu_memory_utilization") == 0.9
 
 
 class TestCLIOverrideFlow:
diff --git a/vllm_omni/benchmarks/data_modules/daily_omni_eval.py b/vllm_omni/benchmarks/data_modules/daily_omni_eval.py
index f191cf2febc..a84792dd58b 100644
--- a/vllm_omni/benchmarks/data_modules/daily_omni_eval.py
+++ b/vllm_omni/benchmarks/data_modules/daily_omni_eval.py
@@ -47,6 +47,11 @@ def extract_choice_letter_official(text: str | None) -> str | None:
         return None
     match = re.search(r"assistant\s*([\s\S]*)$", raw, flags=re.IGNORECASE)
     candidate = match.group(1).strip() if match else raw
+    # Strip <think>...</think> reasoning traces (Qwen3-Omni thinking model output) so we look
+    # only at the final answer, not option letters mentioned inside the thinking trace.
+    post_think = re.sub(r"<think>[\s\S]*?</think>", "", candidate, flags=re.IGNORECASE).strip()
+    if post_think:
+        candidate = post_think
     direct = re.match(r"(?i)^\s*([A-D])(?:[\s\.\):：]|$)", candidate)
     if direct:
         return direct.group(1).upper()
diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py
index 44cc83baea8..22510980b48 100644
--- a/vllm_omni/config/stage_config.py
+++ b/vllm_omni/config/stage_config.py
@@ -402,13 +402,6 @@ class StageDeployConfig:
     """
 
     stage_id: int
-    max_num_seqs: int = 64
-    gpu_memory_utilization: float = 0.9
-    tensor_parallel_size: int = 1
-    enforce_eager: bool = False
-    max_num_batched_tokens: int = 32768
-    max_model_len: int | None = None
-    async_scheduling: bool | None = None
     devices: str = "0"
     output_connectors: dict[str, str] | None = None
     input_connectors: dict[str, str] | None = None
@@ -455,6 +448,7 @@ class DeployConfig:
         "output_connectors",
         "input_connectors",
         "default_sampling_params",
+        "subtalker_sampling_params",
         "engine_extras",
     }
 )
@@ -883,8 +877,6 @@ def to_omegaconf(self) -> Any:
             effective_mbs = int(cli_mbs or legacy_mbs or 1)
             engine_args.setdefault("max_num_seqs", effective_mbs)
 
-        engine_args.setdefault("max_num_seqs", 1)
-
         # Build full config dict
         config_dict: dict[str, Any] = {
             "stage_id": self.stage_id,
diff --git a/vllm_omni/deploy/qwen2_5_omni.yaml b/vllm_omni/deploy/qwen2_5_omni.yaml
index 41aef0df6f6..fe84005baf3 100644
--- a/vllm_omni/deploy/qwen2_5_omni.yaml
+++ b/vllm_omni/deploy/qwen2_5_omni.yaml
@@ -3,10 +3,9 @@
 # flashinfer; the autotune dummy run OOMs the shared cuda:0 device otherwise.
 #
 # Fields omitted from a stage fall back to StageDeployConfig dataclass
-# defaults (see vllm_omni/config/stage_config.py). For instance, every
-# stage here uses vLLM's default max_num_batched_tokens=32768 because
-# chat-sized prefill comfortably fits; only models with codec prefill
-# (Qwen3-Omni, Qwen3-TTS) need to bump it above 32k.
+# defaults (see vllm_omni/config/stage_config.py). Omitting
+# max_num_batched_tokens inherits vLLM's hardware-specific default
+# (e.g., 16384 for H100, 8192 for others).
 #
 # enforce_eager policy across the three deploy YAMLs:
 #   * code2wav / generation stages:  always true (cudagraph incompatible with
diff --git a/vllm_omni/deploy/qwen3_omni_moe.yaml b/vllm_omni/deploy/qwen3_omni_moe.yaml
index 39baed6bd7b..9ea180e137e 100644
--- a/vllm_omni/deploy/qwen3_omni_moe.yaml
+++ b/vllm_omni/deploy/qwen3_omni_moe.yaml
@@ -23,6 +23,8 @@ connectors:
 stages:
   - stage_id: 0
     gpu_memory_utilization: 0.9
+    max_num_seqs: 64
+    max_num_batched_tokens: 32768
     devices: "0"
     default_sampling_params:
       temperature: 0.4
@@ -34,7 +36,9 @@ stages:
 
   - stage_id: 1
     gpu_memory_utilization: 0.6
+    max_num_seqs: 64
     devices: "1"
+    max_num_batched_tokens: 32768
     input_connectors:
       from_stage_0: connector_of_shared_memory
     default_sampling_params:
@@ -46,6 +50,7 @@ stages:
 
   - stage_id: 2
     gpu_memory_utilization: 0.1
+    max_num_seqs: 64
     enforce_eager: true
     async_scheduling: false
     max_num_batched_tokens: 51200