From 4e429ada3b4363e6390111615b3a4aa5b5e2cecd Mon Sep 17 00:00:00 2001
From: ZeldaHuang <hzm414167@alibaba-inc.com>
Date: Thu, 29 Jan 2026 17:23:32 +0800
Subject: [PATCH 1/3] change qwen3 omni yaml placement strategy

Signed-off-by: ZeldaHuang <hzm414167@alibaba-inc.com>
---
 .../offline_inference/stage_configs/qwen3_omni_ci.yaml | 10 +++++-----
 .../stage_configs/rocm/qwen3_omni_ci.yaml              | 10 +++++-----
 .../online_serving/stage_configs/qwen3_omni_ci.yaml    | 10 +++++-----
 .../stage_configs/rocm/qwen3_omni_ci.yaml              | 10 +++++-----
 tests/e2e/stage_configs/qwen3_omni_ci.yaml             | 10 +++++-----
 .../model_executor/stage_configs/qwen3_omni_moe.yaml   | 10 +++++-----
 .../stage_configs/qwen3_omni_moe_async_chunk.yaml      | 10 +++++-----
 .../stage_configs/qwen3_omni_moe_multiconnector.yaml   | 10 +++++-----
 8 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml
index 7ac435ea5ab..2048485946a 100644
--- a/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml
+++ b/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml
@@ -7,21 +7,21 @@
 stage_args:
   - stage_id: 0
     runtime:
-      devices: "0,1"
+      devices: "0"
       max_batch_size: 1
     engine_args:
       model_stage: thinker
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.6
+      gpu_memory_utilization: 0.85
       enforce_eager: false
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
       distributed_executor_backend: "mp"
       enable_prefix_caching: false
       hf_config_name: thinker_config
-      tensor_parallel_size: 2
+      tensor_parallel_size: 1
       load_format: dummy
     final_output: true
     final_output_type: text
@@ -44,7 +44,7 @@ stage_args:
        model_arch: Qwen3OmniMoeForConditionalGeneration
        worker_type: ar
        scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-       gpu_memory_utilization: 0.3
+       gpu_memory_utilization: 0.6
        enforce_eager: true
        trust_remote_code: true
        engine_output_type: latent  # Output codec codes for code2wav
@@ -68,7 +68,7 @@ stage_args:
 
   - stage_id: 2
     runtime:
-      devices: "0"
+      devices: "1"
       max_batch_size: 1
     engine_args:
       model_stage: code2wav
diff --git a/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
index 7ac435ea5ab..2048485946a 100644
--- a/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
+++ b/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
@@ -7,21 +7,21 @@
 stage_args:
   - stage_id: 0
     runtime:
-      devices: "0,1"
+      devices: "0"
       max_batch_size: 1
     engine_args:
       model_stage: thinker
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.6
+      gpu_memory_utilization: 0.85
       enforce_eager: false
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
       distributed_executor_backend: "mp"
       enable_prefix_caching: false
       hf_config_name: thinker_config
-      tensor_parallel_size: 2
+      tensor_parallel_size: 1
       load_format: dummy
     final_output: true
     final_output_type: text
@@ -44,7 +44,7 @@ stage_args:
        model_arch: Qwen3OmniMoeForConditionalGeneration
        worker_type: ar
        scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-       gpu_memory_utilization: 0.3
+       gpu_memory_utilization: 0.6
        enforce_eager: true
        trust_remote_code: true
        engine_output_type: latent  # Output codec codes for code2wav
@@ -68,7 +68,7 @@ stage_args:
 
   - stage_id: 2
     runtime:
-      devices: "0"
+      devices: "1"
       max_batch_size: 1
     engine_args:
       model_stage: code2wav
diff --git a/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml
index 4db11e11ffa..9f4be615615 100644
--- a/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml
+++ b/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml
@@ -8,14 +8,14 @@ stage_args:
   - stage_id: 0
     stage_type: llm  # Use llm stage type to launch OmniLLM
     runtime:
-      devices: "0,1"
+      devices: "0"
       max_batch_size: 5
     engine_args:
       model_stage: thinker
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.6
+      gpu_memory_utilization: 0.85
       enforce_eager: false
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
@@ -23,7 +23,7 @@ stage_args:
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
       hf_config_name: thinker_config
-      tensor_parallel_size: 2
+      tensor_parallel_size: 1
       load_format: dummy
     final_output: true
     final_output_type: text
@@ -47,7 +47,7 @@ stage_args:
        model_arch: Qwen3OmniMoeForConditionalGeneration
        worker_type: ar
        scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-       gpu_memory_utilization: 0.3
+       gpu_memory_utilization: 0.6
        enforce_eager: false
        trust_remote_code: true
        engine_output_type: latent  # Output codec codes for code2wav
@@ -72,7 +72,7 @@ stage_args:
   - stage_id: 2
     stage_type: llm  # Use llm stage type to launch OmniLLM
     runtime:
-      devices: "0"
+      devices: "1"
       max_batch_size: 1
     engine_args:
       model_stage: code2wav
diff --git a/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml b/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml
index 3297347c48b..2387f309d03 100644
--- a/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml
+++ b/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml
@@ -6,21 +6,21 @@
 stage_args:
   - stage_id: 0
     runtime:
-      devices: "0,1"
+      devices: "0"
       max_batch_size: 5
     engine_args:
       model_stage: thinker
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.6
+      gpu_memory_utilization: 0.85
       enforce_eager: false
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
       distributed_executor_backend: "mp"
       enable_prefix_caching: false
       hf_config_name: thinker_config
-      tensor_parallel_size: 2
+      tensor_parallel_size: 1
     final_output: true
     final_output_type: text
     is_comprehension: true
@@ -42,7 +42,7 @@ stage_args:
        model_arch: Qwen3OmniMoeForConditionalGeneration
        worker_type: ar
        scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-       gpu_memory_utilization: 0.3
+       gpu_memory_utilization: 0.6
        enforce_eager: true
        trust_remote_code: true
        engine_output_type: latent  # Output codec codes for code2wav
@@ -65,7 +65,7 @@ stage_args:
 
   - stage_id: 2
     runtime:
-      devices: "0"
+      devices: "1"
       max_batch_size: 1
     engine_args:
       model_stage: code2wav
diff --git a/tests/e2e/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/stage_configs/qwen3_omni_ci.yaml
index 148a949d886..67ff8802911 100644
--- a/tests/e2e/stage_configs/qwen3_omni_ci.yaml
+++ b/tests/e2e/stage_configs/qwen3_omni_ci.yaml
@@ -7,14 +7,14 @@
 stage_args:
 - stage_id: 0
   runtime:
-    devices: "0,1"
+    devices: "0"
     max_batch_size: 5
   engine_args:
     model_stage: thinker
     model_arch: Qwen3OmniMoeForConditionalGeneration
     worker_type: ar
     scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-    gpu_memory_utilization: 0.6
+    gpu_memory_utilization: 0.85
     enforce_eager: false
     trust_remote_code: true
     engine_output_type: latent  # Output hidden states for talker
@@ -23,7 +23,7 @@ stage_args:
     max_model_len: 32768
     enable_prefix_caching: false
     hf_config_name: thinker_config
-    tensor_parallel_size: 2
+    tensor_parallel_size: 1
   final_output: true
   final_output_type: text
   is_comprehension: true
@@ -46,7 +46,7 @@ stage_args:
     model_arch: Qwen3OmniMoeForConditionalGeneration
     worker_type: ar
     scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-    gpu_memory_utilization: 0.3
+    gpu_memory_utilization: 0.6
     enforce_eager: false
     trust_remote_code: true
     engine_output_type: latent  # Output codec codes for code2wav
@@ -68,7 +68,7 @@ stage_args:
 
 - stage_id: 2
   runtime:
-    devices: "0"
+    devices: "1"
     max_batch_size: 1
   engine_args:
     model_stage: code2wav
diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml
index d0380a39d95..48442d8264a 100644
--- a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml
+++ b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml
@@ -9,14 +9,14 @@ stage_args:
   - stage_id: 0
     stage_type: llm  # Use llm stage type to launch OmniLLM
     runtime:
-      devices: "0,1"
+      devices: "0"
       max_batch_size: 64
     engine_args:
       model_stage: thinker
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.6
+      gpu_memory_utilization: 0.85
       enforce_eager: true
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
@@ -24,7 +24,7 @@ stage_args:
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
       hf_config_name: thinker_config
-      tensor_parallel_size: 2
+      tensor_parallel_size: 1
     final_output: true
     final_output_type: text
     is_comprehension: true
@@ -47,7 +47,7 @@ stage_args:
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.3
+      gpu_memory_utilization: 0.6
       enforce_eager: true
       trust_remote_code: true
       engine_output_type: latent  # Output codec codes for code2wav
@@ -71,7 +71,7 @@ stage_args:
   - stage_id: 2
     stage_type: llm  # Use llm stage type to launch OmniLLM
     runtime:
-      devices: "0"
+      devices: "1"
       max_batch_size: 1
     engine_args:
       model_stage: code2wav
diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml
index 42748960e77..48d7a3d6368 100644
--- a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml
+++ b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml
@@ -9,14 +9,14 @@ stage_args:
   - stage_id: 0
     stage_type: llm  # Use llm stage type to launch OmniLLM
     runtime:
-      devices: "0,1"
+      devices: "0"
       max_batch_size: 64
     engine_args:
       model_stage: thinker
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.6
+      gpu_memory_utilization: 0.85
       enforce_eager: false
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
@@ -24,7 +24,7 @@ stage_args:
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
       hf_config_name: thinker_config
-      tensor_parallel_size: 2
+      tensor_parallel_size: 1
       custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk
     final_output: true
     final_output_type: text
@@ -41,14 +41,14 @@ stage_args:
   - stage_id: 1
     stage_type: llm  # Use llm stage type to launch OmniLLM
     runtime:
-      devices: "0"
+      devices: "1"
       max_batch_size: 64
     engine_args:
       model_stage: talker
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.3
+      gpu_memory_utilization: 0.6
       enforce_eager: false
       trust_remote_code: true
       engine_output_type: latent  # Output codec codes for code2wav
diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml
index 69102df6ff7..301a23c32c7 100644
--- a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml
+++ b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml
@@ -8,21 +8,21 @@ stage_args:
   - stage_id: 0
     stage_type: llm  # Use llm stage type to launch OmniLLM
     runtime:
-      devices: "0,1"
+      devices: "0"
       max_batch_size: 1
     engine_args:
       model_stage: thinker
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.6
+      gpu_memory_utilization: 0.85
       enforce_eager: true
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
       distributed_executor_backend: "mp"
       enable_prefix_caching: false
       hf_config_name: thinker_config
-      tensor_parallel_size: 2
+      tensor_parallel_size: 1
     final_output: true
     final_output_type: text
     is_comprehension: true
@@ -48,7 +48,7 @@ stage_args:
        model_arch: Qwen3OmniMoeForConditionalGeneration
        worker_type: ar
        scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-       gpu_memory_utilization: 0.3
+       gpu_memory_utilization: 0.6
        enforce_eager: true
        trust_remote_code: true
        engine_output_type: latent  # Output codec codes for code2wav
@@ -77,7 +77,7 @@ stage_args:
   - stage_id: 2
     stage_type: llm  # Use llm stage type to launch OmniLLM
     runtime:
-      devices: "0"
+      devices: "1"
       max_batch_size: 1
     engine_args:
       model_stage: code2wav

From 515dfdd3eca6b9b108b7245f7d01a6db6f8b129e Mon Sep 17 00:00:00 2001
From: ZeldaHuang <hzm414167@alibaba-inc.com>
Date: Thu, 29 Jan 2026 17:48:44 +0800
Subject: [PATCH 2/3] apply qwen3 omni placement change to rocm platform stage config

Signed-off-by: ZeldaHuang <hzm414167@alibaba-inc.com>
---
 .../platforms/rocm/stage_configs/qwen3_omni_moe.yaml   | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml b/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml
index 93e05ea9c4a..863128170fc 100644
--- a/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml
+++ b/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml
@@ -7,14 +7,14 @@
 stage_args:
   - stage_id: 0
     runtime:
-      devices: "0,1"
+      devices: "0"
       max_batch_size: 1
     engine_args:
       model_stage: thinker
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.6
+      gpu_memory_utilization: 0.85
       enforce_eager: true
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
@@ -22,7 +22,7 @@ stage_args:
       enable_prefix_caching: false
       max_num_batched_tokens: 32768
       hf_config_name: thinker_config
-      tensor_parallel_size: 2
+      tensor_parallel_size: 1
     final_output: true
     final_output_type: text
     is_comprehension: true
@@ -44,7 +44,7 @@ stage_args:
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.3
+      gpu_memory_utilization: 0.6
       enforce_eager: true
       trust_remote_code: true
       engine_output_type: latent  # Output codec codes for code2wav
@@ -68,7 +68,7 @@ stage_args:
 
   - stage_id: 2
     runtime:
-      devices: "0"
+      devices: "1"
       max_batch_size: 1
     engine_args:
       model_stage: code2wav

From 01f5e08d5b1cec6da2ba060fe2e726d54288c2fc Mon Sep 17 00:00:00 2001
From: ZeldaHuang <hzm414167@alibaba-inc.com>
Date: Thu, 29 Jan 2026 18:01:27 +0800
Subject: [PATCH 3/3] raise qwen3 omni thinker gpu_memory_utilization from 0.85 to 0.9

Signed-off-by: ZeldaHuang <hzm414167@alibaba-inc.com>
---
 tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml    | 2 +-
 .../e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml | 2 +-
 tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml       | 2 +-
 tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml  | 2 +-
 tests/e2e/stage_configs/qwen3_omni_ci.yaml                      | 2 +-
 vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml      | 2 +-
 .../stage_configs/qwen3_omni_moe_async_chunk.yaml               | 2 +-
 .../stage_configs/qwen3_omni_moe_multiconnector.yaml            | 2 +-
 vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml      | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml
index 2048485946a..477e6e59f29 100644
--- a/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml
+++ b/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml
@@ -14,7 +14,7 @@ stage_args:
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.85
+      gpu_memory_utilization: 0.9
       enforce_eager: false
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
diff --git a/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
index 2048485946a..477e6e59f29 100644
--- a/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
+++ b/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
@@ -14,7 +14,7 @@ stage_args:
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.85
+      gpu_memory_utilization: 0.9
       enforce_eager: false
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
diff --git a/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml
index 9f4be615615..8f0161edd2d 100644
--- a/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml
+++ b/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml
@@ -15,7 +15,7 @@ stage_args:
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.85
+      gpu_memory_utilization: 0.9
       enforce_eager: false
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
diff --git a/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml b/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml
index 2387f309d03..59642a77b6f 100644
--- a/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml
+++ b/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml
@@ -13,7 +13,7 @@ stage_args:
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.85
+      gpu_memory_utilization: 0.9
       enforce_eager: false
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
diff --git a/tests/e2e/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/stage_configs/qwen3_omni_ci.yaml
index 67ff8802911..479e4d6e99d 100644
--- a/tests/e2e/stage_configs/qwen3_omni_ci.yaml
+++ b/tests/e2e/stage_configs/qwen3_omni_ci.yaml
@@ -14,7 +14,7 @@ stage_args:
     model_arch: Qwen3OmniMoeForConditionalGeneration
     worker_type: ar
     scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-    gpu_memory_utilization: 0.85
+    gpu_memory_utilization: 0.9
     enforce_eager: false
     trust_remote_code: true
     engine_output_type: latent  # Output hidden states for talker
diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml
index 48442d8264a..e3dcf940f4b 100644
--- a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml
+++ b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml
@@ -16,7 +16,7 @@ stage_args:
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.85
+      gpu_memory_utilization: 0.9
       enforce_eager: true
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml
index 48d7a3d6368..1a8b60aa1a7 100644
--- a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml
+++ b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml
@@ -16,7 +16,7 @@ stage_args:
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.85
+      gpu_memory_utilization: 0.9
       enforce_eager: false
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml
index 301a23c32c7..0bc6e48e594 100644
--- a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml
+++ b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml
@@ -15,7 +15,7 @@ stage_args:
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.85
+      gpu_memory_utilization: 0.9
       enforce_eager: true
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
diff --git a/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml b/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml
index 863128170fc..31312673ae8 100644
--- a/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml
+++ b/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml
@@ -14,7 +14,7 @@ stage_args:
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.85
+      gpu_memory_utilization: 0.9
       enforce_eager: true
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker