From faf496f0a4ade54acf992347c161c59afc9b23fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BD=90=E4=BF=9D=E5=85=83?= <qibaoyuan@xiaomi.com>
Date: Tue, 3 Mar 2026 19:07:46 +0800
Subject: [PATCH 1/7] [mimo-audio] tp>1 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 齐保元 <qibaoyuan@xiaomi.com>
---
 .../models/mimo_audio/mimo_audio_llm.py       | 65 +++++++++++++++++--
 1 file changed, 61 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py b/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py
index 3b9f94f2317..7975ad1d934 100644
--- a/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py
+++ b/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py
@@ -12,6 +12,7 @@
     Qwen2Model as TransformerQwen2Model,
 )
 from vllm.config import VllmConfig
+from vllm.distributed import get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.linear import ColumnParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -759,7 +760,59 @@ def _embed_input_ids(
         inputs_embeds.masked_fill_(is_multimodal.unsqueeze(-1), 0.0)
 
         if multimodal_embeddings is not None and len(multimodal_embeddings) != 0:
-            inputs_embeds = inputs_embeds + multimodal_embeddings
+            # Flatten multimodal embeddings: tuple of [N_i, H] -> [sum(N_i), H]
+            mm_flat = torch.cat(
+                [
+                    t.reshape(-1, t.shape[-1]) if isinstance(t, torch.Tensor) and t.dim() >= 2 else t.reshape(1, -1)
+                    for t in multimodal_embeddings
+                ],
+                dim=0,
+            )
+
+            # is_multimodal: [S] or [B, S] -> positions where we should insert mm embeddings
+            pos = is_multimodal.nonzero(as_tuple=False)  # [N, 1] if 1D else [N, 2]
+            num_expected = pos.shape[0]
+            if mm_flat.shape[0] != num_expected:
+                raise ValueError(
+                    f"MiMo multimodal embedding count mismatch: got {mm_flat.shape[0]} "
+                    f"tokens, expected {num_expected} (is_multimodal.sum()). "
+                    "Check that embed_multimodal output matches prompt_ids empty positions."
+                )
+
+            if pos.dim() == 2 and pos.shape[1] == 2:
+                b_idx = pos[:, 0]
+                s_idx = pos[:, 1]
+            else:
+                # is_multimodal was 1D [S] -> single batch, seq indices only
+                b_idx = torch.zeros(num_expected, dtype=torch.long, device=inputs_embeds.device)
+                s_idx = pos[:, 0].to(inputs_embeds.device)
+
+            if inputs_embeds.dim() == 3:
+                dst = inputs_embeds[b_idx, s_idx, :]
+            else:
+                dst = inputs_embeds[s_idx, :]
+
+            hidden_size = inputs_embeds.shape[-1]
+            mm_hidden = mm_flat.shape[-1]
+            if mm_hidden == hidden_size:
+                dst.copy_(mm_flat.to(inputs_embeds.dtype))
+            else:
+                # TP: mm_flat is the local shard from ColumnParallelLinear (e.g. 2048);
+                # inputs_embeds has full hidden (e.g. 4096). Write shard into correct slice.
+                tp_rank = get_tensor_model_parallel_rank()
+                tp_world = get_tensor_model_parallel_world_size()
+                shard_size = hidden_size // tp_world
+                start = tp_rank * shard_size
+                end = start + shard_size
+                if mm_hidden != shard_size:
+                    raise ValueError(
+                        f"MiMo TP shard size mismatch: mm_flat has {mm_hidden}, "
+                        f"expected {shard_size} (hidden_size={hidden_size}, tp={tp_world})"
+                    )
+                if inputs_embeds.dim() == 3:
+                    inputs_embeds[b_idx, s_idx, start:end] = mm_flat.to(inputs_embeds.dtype)
+                else:
+                    inputs_embeds[s_idx, start:end] = mm_flat.to(inputs_embeds.dtype)
 
         inputs_embeds = inputs_embeds.to(torch.bfloat16)
         return inputs_embeds
@@ -776,11 +829,13 @@ def embed_input_ids(
         if multimodal_embeddings is None or is_multimodal is None:
             return super().embed_input_ids(input_ids)
 
-        return super().embed_input_ids(
+        # Use MiMo's own merge logic instead of vllm's _merge_multimodal_embeddings.
+        # vllm expects num_mm_tokens == is_multimodal.sum(); MiMo's embed_multimodal
+        # format may differ, and masked_scatter_ triggers CUDA assert on mismatch.
+        return self._embed_input_ids(
             input_ids,
             multimodal_embeddings=multimodal_embeddings,
             is_multimodal=is_multimodal,
-            handle_oov_mm_token=handle_oov_mm_token,
         )
 
     def base_local_forward(
@@ -1015,7 +1070,9 @@ def forward(
         **kwargs: object,
     ) -> torch.Tensor | IntermediateTensors:
         _forward_context = get_forward_context()
-        _default_query_start_loc = torch.tensor([0, input_ids.shape[-1]], device=input_ids.device)
+        # Keep on CPU to avoid device-side assert when tp>1 (indexing with CPU tensor is valid)
+        _seq_len = int(input_ids.shape[-1])
+        _default_query_start_loc = torch.tensor([0, _seq_len], dtype=torch.long, device="cpu")
         query_start_loc = (
             next(iter(_forward_context.attn_metadata.values())).query_start_loc
             if _forward_context.attn_metadata is not None

From 53c3d46e0fd52e38b2a8d017bbcd9697f4ea14ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BD=90=E4=BF=9D=E5=85=83?= <qibaoyuan@xiaomi.com>
Date: Wed, 4 Mar 2026 09:59:17 +0800
Subject: [PATCH 2/7] [mimo-audio] tp>1 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 齐保元 <qibaoyuan@xiaomi.com>
---
 vllm_omni/model_executor/stage_configs/mimo_audio.yaml      | 6 ++++--
 .../stage_configs/mimo_audio_async_chunk.yaml               | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/model_executor/stage_configs/mimo_audio.yaml b/vllm_omni/model_executor/stage_configs/mimo_audio.yaml
index 552ba2ab5fd..82c85c33c5e 100644
--- a/vllm_omni/model_executor/stage_configs/mimo_audio.yaml
+++ b/vllm_omni/model_executor/stage_configs/mimo_audio.yaml
@@ -7,7 +7,7 @@ stage_args:
     stage_type: llm
     runtime:
       process: true           # Run this stage in a separate process
-      devices: "0"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      devices: "0,1"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
       max_batch_size: 1
     engine_args:
       dtype: bfloat16
@@ -15,6 +15,7 @@ stage_args:
       model_arch: MiMoAudioForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      tensor_parallel_size: 2
       gpu_memory_utilization: 0.3
       enforce_eager: true    # need to discuss
       trust_remote_code: true
@@ -38,13 +39,14 @@ stage_args:
     stage_type: llm
     runtime:
       process: true            # Run this stage in a separate process
-      devices: "0"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      devices: "0,1"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
       max_batch_size: 1
     engine_args:
       model_stage: code2wav
       model_arch: MiMoAudioForConditionalGeneration
       worker_type: generation
       scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      tensor_parallel_size: 2
       gpu_memory_utilization: 0.2
       enforce_eager: true
       trust_remote_code: true
diff --git a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml
index a60fc566a99..c66620f337c 100644
--- a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml
+++ b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml
@@ -7,7 +7,7 @@ stage_args:
     stage_type: llm
     runtime:
       process: true           # Run this stage in a separate process
-      devices: "0"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      devices: "0,1"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
       max_batch_size: 1
     engine_args:
       dtype: bfloat16
@@ -15,6 +15,7 @@ stage_args:
       model_arch: MiMoAudioForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      tensor_parallel_size: 1
       gpu_memory_utilization: 0.3
       enforce_eager: true    # need to discuss
       trust_remote_code: true
@@ -39,13 +40,14 @@ stage_args:
     stage_type: llm
     runtime:
       process: true            # Run this stage in a separate process
-      devices: "0"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      devices: "0,1"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
       max_batch_size: 1
     engine_args:
       model_stage: code2wav
       model_arch: MiMoAudioForConditionalGeneration
       worker_type: generation
       scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      tensor_parallel_size: 1
       gpu_memory_utilization: 0.2
       enforce_eager: true
       trust_remote_code: true

From 750385c7e9295ccebb6f4e9715ceefc606faca6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BD=90=E4=BF=9D=E5=85=83?= <qibaoyuan@xiaomi.com>
Date: Wed, 4 Mar 2026 16:06:12 +0800
Subject: [PATCH 3/7] [mimo-audio] tp>1 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 齐保元 <qibaoyuan@xiaomi.com>
---
 .../model_executor/stage_configs/mimo_audio_async_chunk.yaml  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml
index c66620f337c..9e61d71d7c3 100644
--- a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml
+++ b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml
@@ -15,7 +15,7 @@ stage_args:
       model_arch: MiMoAudioForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      tensor_parallel_size: 1
+      tensor_parallel_size: 2
       gpu_memory_utilization: 0.3
       enforce_eager: true    # need to discuss
       trust_remote_code: true
@@ -47,7 +47,7 @@ stage_args:
       model_arch: MiMoAudioForConditionalGeneration
       worker_type: generation
       scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
-      tensor_parallel_size: 1
+      tensor_parallel_size: 2
       gpu_memory_utilization: 0.2
       enforce_eager: true
       trust_remote_code: true

From 094685f5a990e93a1e94bd4572e4a3d97f1cb800 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BD=90=E4=BF=9D=E5=85=83?= <qibaoyuan@xiaomi.com>
Date: Wed, 4 Mar 2026 18:18:39 +0800
Subject: [PATCH 4/7] [mimo-audio] tp>1 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 齐保元 <qibaoyuan@xiaomi.com>
---
 .../stage_configs/mimo_audio_async_chunk.yaml             | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml
index 9e61d71d7c3..e20b0840681 100644
--- a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml
+++ b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml
@@ -7,7 +7,7 @@ stage_args:
     stage_type: llm
     runtime:
       process: true           # Run this stage in a separate process
-      devices: "0,1"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      devices: "0"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
       max_batch_size: 1
     engine_args:
       dtype: bfloat16
@@ -15,7 +15,7 @@ stage_args:
       model_arch: MiMoAudioForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      tensor_parallel_size: 2
+      tensor_parallel_size: 1
       gpu_memory_utilization: 0.3
       enforce_eager: true    # need to discuss
       trust_remote_code: true
@@ -40,14 +40,14 @@ stage_args:
     stage_type: llm
     runtime:
       process: true            # Run this stage in a separate process
-      devices: "0,1"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      devices: "0"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
       max_batch_size: 1
     engine_args:
       model_stage: code2wav
       model_arch: MiMoAudioForConditionalGeneration
       worker_type: generation
       scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
-      tensor_parallel_size: 2
+      tensor_parallel_size: 1
       gpu_memory_utilization: 0.2
       enforce_eager: true
       trust_remote_code: true

From 2014da80f6ddab86a14c8192b621abece8080320 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BD=90=E4=BF=9D=E5=85=83?= <qibaoyuan@xiaomi.com>
Date: Thu, 5 Mar 2026 19:16:28 +0800
Subject: [PATCH 5/7] [mimo-audio] tp>1 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 齐保元 <qibaoyuan@xiaomi.com>
---
 .../models/mimo_audio/mimo_audio_llm.py       | 68 ++-----------------
 1 file changed, 7 insertions(+), 61 deletions(-)

diff --git a/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py b/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py
index 7975ad1d934..c1b71e076e4 100644
--- a/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py
+++ b/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py
@@ -12,7 +12,6 @@
     Qwen2Model as TransformerQwen2Model,
 )
 from vllm.config import VllmConfig
-from vllm.distributed import get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.linear import ColumnParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -557,12 +556,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             config.hidden_size,
             bias=False,
             return_bias=False,
+            gather_output=True,
         )
         self.hidden_states_downcast = ColumnParallelLinear(
             config.hidden_size,
             self.local_config.hidden_size,
             bias=False,
             return_bias=False,
+            gather_output=True,
         )
 
         self.lm_head = ColumnParallelLinear(
@@ -570,6 +571,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             config.vocab_size,
             bias=False,
             return_bias=False,
+            gather_output=True,
         )
 
         # Re-encode the sum of multi-layer RVQ embeddings to obtain true Audio Code Embeddings
@@ -760,59 +762,7 @@ def _embed_input_ids(
         inputs_embeds.masked_fill_(is_multimodal.unsqueeze(-1), 0.0)
 
         if multimodal_embeddings is not None and len(multimodal_embeddings) != 0:
-            # Flatten multimodal embeddings: tuple of [N_i, H] -> [sum(N_i), H]
-            mm_flat = torch.cat(
-                [
-                    t.reshape(-1, t.shape[-1]) if isinstance(t, torch.Tensor) and t.dim() >= 2 else t.reshape(1, -1)
-                    for t in multimodal_embeddings
-                ],
-                dim=0,
-            )
-
-            # is_multimodal: [S] or [B, S] -> positions where we should insert mm embeddings
-            pos = is_multimodal.nonzero(as_tuple=False)  # [N, 1] if 1D else [N, 2]
-            num_expected = pos.shape[0]
-            if mm_flat.shape[0] != num_expected:
-                raise ValueError(
-                    f"MiMo multimodal embedding count mismatch: got {mm_flat.shape[0]} "
-                    f"tokens, expected {num_expected} (is_multimodal.sum()). "
-                    "Check that embed_multimodal output matches prompt_ids empty positions."
-                )
-
-            if pos.dim() == 2 and pos.shape[1] == 2:
-                b_idx = pos[:, 0]
-                s_idx = pos[:, 1]
-            else:
-                # is_multimodal was 1D [S] -> single batch, seq indices only
-                b_idx = torch.zeros(num_expected, dtype=torch.long, device=inputs_embeds.device)
-                s_idx = pos[:, 0].to(inputs_embeds.device)
-
-            if inputs_embeds.dim() == 3:
-                dst = inputs_embeds[b_idx, s_idx, :]
-            else:
-                dst = inputs_embeds[s_idx, :]
-
-            hidden_size = inputs_embeds.shape[-1]
-            mm_hidden = mm_flat.shape[-1]
-            if mm_hidden == hidden_size:
-                dst.copy_(mm_flat.to(inputs_embeds.dtype))
-            else:
-                # TP: mm_flat is the local shard from ColumnParallelLinear (e.g. 2048);
-                # inputs_embeds has full hidden (e.g. 4096). Write shard into correct slice.
-                tp_rank = get_tensor_model_parallel_rank()
-                tp_world = get_tensor_model_parallel_world_size()
-                shard_size = hidden_size // tp_world
-                start = tp_rank * shard_size
-                end = start + shard_size
-                if mm_hidden != shard_size:
-                    raise ValueError(
-                        f"MiMo TP shard size mismatch: mm_flat has {mm_hidden}, "
-                        f"expected {shard_size} (hidden_size={hidden_size}, tp={tp_world})"
-                    )
-                if inputs_embeds.dim() == 3:
-                    inputs_embeds[b_idx, s_idx, start:end] = mm_flat.to(inputs_embeds.dtype)
-                else:
-                    inputs_embeds[s_idx, start:end] = mm_flat.to(inputs_embeds.dtype)
+            inputs_embeds = inputs_embeds + multimodal_embeddings
 
         inputs_embeds = inputs_embeds.to(torch.bfloat16)
         return inputs_embeds
@@ -829,13 +779,11 @@ def embed_input_ids(
         if multimodal_embeddings is None or is_multimodal is None:
             return super().embed_input_ids(input_ids)
 
-        # Use MiMo's own merge logic instead of vllm's _merge_multimodal_embeddings.
-        # vllm expects num_mm_tokens == is_multimodal.sum(); MiMo's embed_multimodal
-        # format may differ, and masked_scatter_ triggers CUDA assert on mismatch.
-        return self._embed_input_ids(
+        return super().embed_input_ids(
             input_ids,
             multimodal_embeddings=multimodal_embeddings,
             is_multimodal=is_multimodal,
+            handle_oov_mm_token=handle_oov_mm_token,
         )
 
     def base_local_forward(
@@ -1070,9 +1018,7 @@ def forward(
         **kwargs: object,
     ) -> torch.Tensor | IntermediateTensors:
         _forward_context = get_forward_context()
-        # Keep on CPU to avoid device-side assert when tp>1 (indexing with CPU tensor is valid)
-        _seq_len = int(input_ids.shape[-1])
-        _default_query_start_loc = torch.tensor([0, _seq_len], dtype=torch.long, device="cpu")
+        _default_query_start_loc = torch.tensor([0, input_ids.shape[-1]], device=input_ids.device)
         query_start_loc = (
             next(iter(_forward_context.attn_metadata.values())).query_start_loc
             if _forward_context.attn_metadata is not None

From 8181b2effdfc11e8e83fe1f2894f11d83c75fd09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BD=90=E4=BF=9D=E5=85=83?= <qibaoyuan@xiaomi.com>
Date: Thu, 5 Mar 2026 20:00:44 +0800
Subject: [PATCH 6/7] [mimo-audio] add tp in yaml
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 齐保元 <qibaoyuan@xiaomi.com>
---
 vllm_omni/model_executor/stage_configs/mimo_audio.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/model_executor/stage_configs/mimo_audio.yaml b/vllm_omni/model_executor/stage_configs/mimo_audio.yaml
index 82c85c33c5e..130ca5c9c38 100644
--- a/vllm_omni/model_executor/stage_configs/mimo_audio.yaml
+++ b/vllm_omni/model_executor/stage_configs/mimo_audio.yaml
@@ -7,7 +7,7 @@ stage_args:
     stage_type: llm
     runtime:
       process: true           # Run this stage in a separate process
-      devices: "0,1"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      devices: "0"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
       max_batch_size: 1
     engine_args:
       dtype: bfloat16
@@ -15,7 +15,7 @@ stage_args:
       model_arch: MiMoAudioForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      tensor_parallel_size: 2
+      tensor_parallel_size: 1
       gpu_memory_utilization: 0.3
       enforce_eager: true    # need to discuss
       trust_remote_code: true
@@ -39,14 +39,14 @@ stage_args:
     stage_type: llm
     runtime:
       process: true            # Run this stage in a separate process
-      devices: "0,1"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      devices: "0"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
       max_batch_size: 1
     engine_args:
       model_stage: code2wav
       model_arch: MiMoAudioForConditionalGeneration
       worker_type: generation
       scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
-      tensor_parallel_size: 2
+      tensor_parallel_size: 1
       gpu_memory_utilization: 0.2
       enforce_eager: true
       trust_remote_code: true

From abe62c1ff11a11e5f1182eeb2d54971cba1df717 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BD=90=E4=BF=9D=E5=85=83?= <qibaoyuan@xiaomi.com>
Date: Fri, 6 Mar 2026 09:37:42 +0800
Subject: [PATCH 7/7] [mimo-audio] add comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 齐保元 <qibaoyuan@xiaomi.com>
---
 vllm_omni/model_executor/stage_configs/mimo_audio.yaml        | 4 ++--
 .../model_executor/stage_configs/mimo_audio_async_chunk.yaml  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm_omni/model_executor/stage_configs/mimo_audio.yaml b/vllm_omni/model_executor/stage_configs/mimo_audio.yaml
index 130ca5c9c38..b824fcb41f3 100644
--- a/vllm_omni/model_executor/stage_configs/mimo_audio.yaml
+++ b/vllm_omni/model_executor/stage_configs/mimo_audio.yaml
@@ -15,7 +15,7 @@ stage_args:
       model_arch: MiMoAudioForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      tensor_parallel_size: 1
+      tensor_parallel_size: 1 # TP size for this stage; for multi-GPU inference raise it (e.g., 4 for 4 GPUs) and list that many GPU IDs in `devices` above
       gpu_memory_utilization: 0.3
       enforce_eager: true    # need to discuss
       trust_remote_code: true
@@ -46,7 +46,7 @@ stage_args:
       model_arch: MiMoAudioForConditionalGeneration
       worker_type: generation
       scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
-      tensor_parallel_size: 1
+      tensor_parallel_size: 1 # TP size for this stage; for multi-GPU inference raise it (e.g., 4 for 4 GPUs) and list that many GPU IDs in `devices` above
       gpu_memory_utilization: 0.2
       enforce_eager: true
       trust_remote_code: true
diff --git a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml
index e20b0840681..7177aa80921 100644
--- a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml
+++ b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml
@@ -15,7 +15,7 @@ stage_args:
       model_arch: MiMoAudioForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      tensor_parallel_size: 1
+      tensor_parallel_size: 1 # TP size for this stage; for multi-GPU inference raise it (e.g., 4 for 4 GPUs) and list that many GPU IDs in `devices` above
       gpu_memory_utilization: 0.3
       enforce_eager: true    # need to discuss
       trust_remote_code: true
@@ -47,7 +47,7 @@ stage_args:
       model_arch: MiMoAudioForConditionalGeneration
       worker_type: generation
       scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
-      tensor_parallel_size: 1
+      tensor_parallel_size: 1 # TP size for this stage; for multi-GPU inference raise it (e.g., 4 for 4 GPUs) and list that many GPU IDs in `devices` above
       gpu_memory_utilization: 0.2
       enforce_eager: true
       trust_remote_code: true