From 4e429ada3b4363e6390111615b3a4aa5b5e2cecd Mon Sep 17 00:00:00 2001 From: ZeldaHuang Date: Thu, 29 Jan 2026 17:23:32 +0800 Subject: [PATCH 1/3] change qwen3 omni yaml placement strategy Signed-off-by: ZeldaHuang --- .../offline_inference/stage_configs/qwen3_omni_ci.yaml | 10 +++++----- .../stage_configs/rocm/qwen3_omni_ci.yaml | 10 +++++----- .../online_serving/stage_configs/qwen3_omni_ci.yaml | 10 +++++----- .../stage_configs/rocm/qwen3_omni_ci.yaml | 10 +++++----- tests/e2e/stage_configs/qwen3_omni_ci.yaml | 10 +++++----- .../model_executor/stage_configs/qwen3_omni_moe.yaml | 10 +++++----- .../stage_configs/qwen3_omni_moe_async_chunk.yaml | 10 +++++----- .../stage_configs/qwen3_omni_moe_multiconnector.yaml | 10 +++++----- 8 files changed, 40 insertions(+), 40 deletions(-) diff --git a/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml index 7ac435ea5ab..2048485946a 100644 --- a/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml +++ b/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml @@ -7,21 +7,21 @@ stage_args: - stage_id: 0 runtime: - devices: "0,1" + devices: "0" max_batch_size: 1 engine_args: model_stage: thinker model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 + gpu_memory_utilization: 0.85 enforce_eager: false trust_remote_code: true engine_output_type: latent # Output hidden states for talker distributed_executor_backend: "mp" enable_prefix_caching: false hf_config_name: thinker_config - tensor_parallel_size: 2 + tensor_parallel_size: 1 load_format: dummy final_output: true final_output_type: text @@ -44,7 +44,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.3 + gpu_memory_utilization: 0.6 enforce_eager: true trust_remote_code: true engine_output_type: latent # Output codec codes for code2wav @@ -68,7 +68,7 @@ stage_args: - stage_id: 2 runtime: - devices: "0" + devices: "1" max_batch_size: 1 engine_args: model_stage: code2wav diff --git a/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml index 7ac435ea5ab..2048485946a 100644 --- a/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml +++ b/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml @@ -7,21 +7,21 @@ stage_args: - stage_id: 0 runtime: - devices: "0,1" + devices: "0" max_batch_size: 1 engine_args: model_stage: thinker model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 + gpu_memory_utilization: 0.85 enforce_eager: false trust_remote_code: true engine_output_type: latent # Output hidden states for talker distributed_executor_backend: "mp" enable_prefix_caching: false hf_config_name: thinker_config - tensor_parallel_size: 2 + tensor_parallel_size: 1 load_format: dummy final_output: true final_output_type: text @@ -44,7 +44,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.3 + gpu_memory_utilization: 0.6 enforce_eager: true trust_remote_code: true engine_output_type: latent # Output codec codes for code2wav @@ -68,7 +68,7 @@ stage_args: - stage_id: 2 runtime: - devices: "0" + devices: "1" max_batch_size: 1 engine_args: model_stage: code2wav diff --git a/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml index 4db11e11ffa..9f4be615615 100644 --- a/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml +++ b/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml @@ -8,14 +8,14 @@ stage_args: - stage_id: 0 stage_type: llm # Use llm stage type to launch OmniLLM runtime: - devices: "0,1" + devices: "0" max_batch_size: 5 engine_args: model_stage: thinker model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 + gpu_memory_utilization: 0.85 enforce_eager: false trust_remote_code: true engine_output_type: latent # Output hidden states for talker @@ -23,7 +23,7 @@ stage_args: enable_prefix_caching: false max_num_batched_tokens: 32768 hf_config_name: thinker_config - tensor_parallel_size: 2 + tensor_parallel_size: 1 load_format: dummy final_output: true final_output_type: text @@ -47,7 +47,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.3 + gpu_memory_utilization: 0.6 enforce_eager: false trust_remote_code: true engine_output_type: latent # Output codec codes for code2wav @@ -72,7 +72,7 @@ stage_args: - stage_id: 2 stage_type: llm # Use llm stage type to launch OmniLLM runtime: - devices: "0" + devices: "1" max_batch_size: 1 engine_args: model_stage: code2wav diff --git a/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml b/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml index 3297347c48b..2387f309d03 100644 --- a/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml +++ b/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml @@ -6,21 +6,21 @@ stage_args: - stage_id: 0 runtime: - devices: "0,1" + devices: "0" max_batch_size: 5 engine_args: model_stage: thinker model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 + gpu_memory_utilization: 0.85 enforce_eager: false trust_remote_code: true engine_output_type: latent # Output hidden states for talker distributed_executor_backend: "mp" enable_prefix_caching: false hf_config_name: thinker_config - tensor_parallel_size: 2 + tensor_parallel_size: 1 final_output: true final_output_type: text is_comprehension: true @@ -42,7 +42,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.3 + gpu_memory_utilization: 0.6 enforce_eager: true trust_remote_code: true engine_output_type: latent # Output codec codes for code2wav @@ -65,7 +65,7 @@ stage_args: - stage_id: 2 runtime: - devices: "0" + devices: "1" max_batch_size: 1 engine_args: model_stage: code2wav diff --git a/tests/e2e/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/stage_configs/qwen3_omni_ci.yaml index 148a949d886..67ff8802911 100644 --- a/tests/e2e/stage_configs/qwen3_omni_ci.yaml +++ b/tests/e2e/stage_configs/qwen3_omni_ci.yaml @@ -7,14 +7,14 @@ stage_args: - stage_id: 0 runtime: - devices: "0,1" + devices: "0" max_batch_size: 5 engine_args: model_stage: thinker model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 + gpu_memory_utilization: 0.85 enforce_eager: false trust_remote_code: true engine_output_type: latent # Output hidden states for talker @@ -23,7 +23,7 @@ stage_args: max_model_len: 32768 enable_prefix_caching: false hf_config_name: thinker_config - tensor_parallel_size: 2 + tensor_parallel_size: 1 final_output: true final_output_type: text is_comprehension: true @@ -46,7 +46,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.3 + gpu_memory_utilization: 0.6 enforce_eager: false trust_remote_code: true engine_output_type: latent # Output codec codes for code2wav @@ -68,7 +68,7 @@ stage_args: - stage_id: 2 runtime: - devices: "0" + devices: "1" max_batch_size: 1 engine_args: model_stage: code2wav diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml index d0380a39d95..48442d8264a 100644 --- a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml +++ b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml @@ -9,14 +9,14 @@ stage_args: - stage_id: 0 stage_type: llm # Use llm stage type to launch OmniLLM runtime: - devices: "0,1" + devices: "0" max_batch_size: 64 engine_args: model_stage: thinker model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 + gpu_memory_utilization: 0.85 enforce_eager: true trust_remote_code: true engine_output_type: latent # Output hidden states for talker @@ -24,7 +24,7 @@ stage_args: enable_prefix_caching: false max_num_batched_tokens: 32768 hf_config_name: thinker_config - tensor_parallel_size: 2 + tensor_parallel_size: 1 final_output: true final_output_type: text is_comprehension: true @@ -47,7 +47,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.3 + gpu_memory_utilization: 0.6 enforce_eager: true trust_remote_code: true engine_output_type: latent # Output codec codes for code2wav @@ -71,7 +71,7 @@ stage_args: - stage_id: 2 stage_type: llm # Use llm stage type to launch OmniLLM runtime: - devices: "0" + devices: "1" max_batch_size: 1 engine_args: model_stage: code2wav diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml index 42748960e77..48d7a3d6368 100644 --- a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml +++ b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml @@ -9,14 +9,14 @@ stage_args: - stage_id: 0 stage_type: llm # Use llm stage type to launch OmniLLM runtime: - devices: "0,1" + devices: "0" max_batch_size: 64 engine_args: model_stage: thinker model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 + gpu_memory_utilization: 0.85 enforce_eager: false trust_remote_code: true engine_output_type: latent # Output hidden states for talker @@ -24,7 +24,7 @@ stage_args: enable_prefix_caching: false max_num_batched_tokens: 32768 hf_config_name: thinker_config - tensor_parallel_size: 2 + tensor_parallel_size: 1 custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk final_output: true final_output_type: text @@ -41,14 +41,14 @@ stage_args: - stage_id: 1 stage_type: llm # Use llm stage type to launch OmniLLM runtime: - devices: "0" + devices: "1" max_batch_size: 64 engine_args: model_stage: talker model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.3 + gpu_memory_utilization: 0.6 enforce_eager: false trust_remote_code: true engine_output_type: latent # Output codec codes for code2wav diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml index 69102df6ff7..301a23c32c7 100644 --- a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml +++ b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml @@ -8,21 +8,21 @@ stage_args: - stage_id: 0 stage_type: llm # Use llm stage type to launch OmniLLM runtime: - devices: "0,1" + devices: "0" max_batch_size: 1 engine_args: model_stage: thinker model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 + gpu_memory_utilization: 0.85 enforce_eager: true trust_remote_code: true engine_output_type: latent # Output hidden states for talker distributed_executor_backend: "mp" enable_prefix_caching: false hf_config_name: thinker_config - tensor_parallel_size: 2 + tensor_parallel_size: 1 final_output: true final_output_type: text is_comprehension: true @@ -48,7 +48,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.3 + gpu_memory_utilization: 0.6 enforce_eager: true trust_remote_code: true engine_output_type: latent # Output codec codes for code2wav @@ -77,7 +77,7 @@ stage_args: - stage_id: 2 stage_type: llm # Use llm stage type to launch OmniLLM runtime: - devices: "0" + devices: "1" max_batch_size: 1 engine_args: model_stage: code2wav From 515dfdd3eca6b9b108b7245f7d01a6db6f8b129e Mon Sep 17 00:00:00 2001 From: ZeldaHuang Date: Thu, 29 Jan 2026 17:48:44 +0800 Subject: [PATCH 2/3] update Signed-off-by: ZeldaHuang --- .../platforms/rocm/stage_configs/qwen3_omni_moe.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml b/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml index 93e05ea9c4a..863128170fc 100644 --- a/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml +++ b/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml @@ -7,14 +7,14 @@ stage_args: - stage_id: 0 runtime: - devices: "0,1" + devices: "0" max_batch_size: 1 engine_args: model_stage: thinker model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 + gpu_memory_utilization: 0.85 enforce_eager: true trust_remote_code: true engine_output_type: latent # Output hidden states for talker @@ -22,7 +22,7 @@ stage_args: enable_prefix_caching: false max_num_batched_tokens: 32768 hf_config_name: thinker_config - tensor_parallel_size: 2 + tensor_parallel_size: 1 final_output: true final_output_type: text is_comprehension: true @@ -44,7 +44,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.3 + gpu_memory_utilization: 0.6 enforce_eager: true trust_remote_code: true engine_output_type: latent # Output codec codes for code2wav @@ -68,7 +68,7 @@ stage_args: - stage_id: 2 runtime: - devices: "0" + devices: "1" max_batch_size: 1 engine_args: model_stage: code2wav From 01f5e08d5b1cec6da2ba060fe2e726d54288c2fc Mon Sep 17 00:00:00 2001 From: ZeldaHuang Date: Thu, 29 Jan 2026 18:01:27 +0800 Subject: [PATCH 3/3] update Signed-off-by: ZeldaHuang --- tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml | 2 +- .../e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml | 2 +- tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml | 2 +- tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml | 2 +- tests/e2e/stage_configs/qwen3_omni_ci.yaml | 2 +- vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml | 2 +- .../stage_configs/qwen3_omni_moe_async_chunk.yaml | 2 +- .../stage_configs/qwen3_omni_moe_multiconnector.yaml | 2 +- vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml index 2048485946a..477e6e59f29 100644 --- a/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml +++ b/tests/e2e/offline_inference/stage_configs/qwen3_omni_ci.yaml @@ -14,7 +14,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.85 + gpu_memory_utilization: 0.9 enforce_eager: false trust_remote_code: true engine_output_type: latent # Output hidden states for talker diff --git a/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml index 2048485946a..477e6e59f29 100644 --- a/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml +++ b/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml @@ -14,7 +14,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.85 + gpu_memory_utilization: 0.9 enforce_eager: false trust_remote_code: true engine_output_type: latent # Output hidden states for talker diff --git a/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml index 9f4be615615..8f0161edd2d 100644 --- a/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml +++ b/tests/e2e/online_serving/stage_configs/qwen3_omni_ci.yaml @@ -15,7 +15,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.85 + gpu_memory_utilization: 0.9 enforce_eager: false trust_remote_code: true engine_output_type: latent # Output hidden states for talker diff --git a/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml b/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml index 2387f309d03..59642a77b6f 100644 --- a/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml +++ b/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml @@ -13,7 +13,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.85 + gpu_memory_utilization: 0.9 enforce_eager: false trust_remote_code: true engine_output_type: latent # Output hidden states for talker diff --git a/tests/e2e/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/stage_configs/qwen3_omni_ci.yaml index 67ff8802911..479e4d6e99d 100644 --- a/tests/e2e/stage_configs/qwen3_omni_ci.yaml +++ b/tests/e2e/stage_configs/qwen3_omni_ci.yaml @@ -14,7 +14,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.85 + gpu_memory_utilization: 0.9 enforce_eager: false trust_remote_code: true engine_output_type: latent # Output hidden states for talker diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml index 48442d8264a..e3dcf940f4b 100644 --- a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml +++ b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml @@ -16,7 +16,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.85 + gpu_memory_utilization: 0.9 enforce_eager: true trust_remote_code: true engine_output_type: latent # Output hidden states for talker diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml index 48d7a3d6368..1a8b60aa1a7 100644 --- a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml +++ b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml @@ -16,7 +16,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.85 + gpu_memory_utilization: 0.9 enforce_eager: false trust_remote_code: true engine_output_type: latent # Output hidden states for talker diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml index 301a23c32c7..0bc6e48e594 100644 --- a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml +++ b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_multiconnector.yaml @@ -15,7 +15,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.85 + gpu_memory_utilization: 0.9 enforce_eager: true trust_remote_code: true engine_output_type: latent # Output hidden states for talker diff --git a/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml b/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml index 863128170fc..31312673ae8 100644 --- a/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml +++ b/vllm_omni/platforms/rocm/stage_configs/qwen3_omni_moe.yaml @@ -14,7 +14,7 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.85 + gpu_memory_utilization: 0.9 enforce_eager: true trust_remote_code: true engine_output_type: latent # Output hidden states for talker