From faf496f0a4ade54acf992347c161c59afc9b23fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BD=90=E4=BF=9D=E5=85=83?= Date: Tue, 3 Mar 2026 19:07:46 +0800 Subject: [PATCH 1/7] [mimo-audio] tp>1 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 齐保元 --- .../models/mimo_audio/mimo_audio_llm.py | 65 +++++++++++++++++-- 1 file changed, 61 insertions(+), 4 deletions(-) diff --git a/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py b/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py index 3b9f94f2317..7975ad1d934 100644 --- a/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py +++ b/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py @@ -12,6 +12,7 @@ Qwen2Model as TransformerQwen2Model, ) from vllm.config import VllmConfig +from vllm.distributed import get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size from vllm.forward_context import get_forward_context from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -759,7 +760,59 @@ def _embed_input_ids( inputs_embeds.masked_fill_(is_multimodal.unsqueeze(-1), 0.0) if multimodal_embeddings is not None and len(multimodal_embeddings) != 0: - inputs_embeds = inputs_embeds + multimodal_embeddings + # Flatten multimodal embeddings: tuple of [N_i, H] -> [sum(N_i), H] + mm_flat = torch.cat( + [ + t.reshape(-1, t.shape[-1]) if isinstance(t, torch.Tensor) and t.dim() >= 2 else t.reshape(1, -1) + for t in multimodal_embeddings + ], + dim=0, + ) + + # is_multimodal: [S] or [B, S] -> positions where we should insert mm embeddings + pos = is_multimodal.nonzero(as_tuple=False) # [N, 1] if 1D else [N, 2] + num_expected = pos.shape[0] + if mm_flat.shape[0] != num_expected: + raise ValueError( + f"MiMo multimodal embedding count mismatch: got {mm_flat.shape[0]} " + f"tokens, expected {num_expected} (is_multimodal.sum()). " + "Check that embed_multimodal output matches prompt_ids empty positions." + ) + + if pos.dim() == 2 and pos.shape[1] == 2: + b_idx = pos[:, 0] + s_idx = pos[:, 1] + else: + # is_multimodal was 1D [S] -> single batch, seq indices only + b_idx = torch.zeros(num_expected, dtype=torch.long, device=inputs_embeds.device) + s_idx = pos[:, 0].to(inputs_embeds.device) + + if inputs_embeds.dim() == 3: + dst = inputs_embeds[b_idx, s_idx, :] + else: + dst = inputs_embeds[s_idx, :] + + hidden_size = inputs_embeds.shape[-1] + mm_hidden = mm_flat.shape[-1] + if mm_hidden == hidden_size: + dst.copy_(mm_flat.to(inputs_embeds.dtype)) + else: + # TP: mm_flat is the local shard from ColumnParallelLinear (e.g. 2048); + # inputs_embeds has full hidden (e.g. 4096). Write shard into correct slice. + tp_rank = get_tensor_model_parallel_rank() + tp_world = get_tensor_model_parallel_world_size() + shard_size = hidden_size // tp_world + start = tp_rank * shard_size + end = start + shard_size + if mm_hidden != shard_size: + raise ValueError( + f"MiMo TP shard size mismatch: mm_flat has {mm_hidden}, " + f"expected {shard_size} (hidden_size={hidden_size}, tp={tp_world})" + ) + if inputs_embeds.dim() == 3: + inputs_embeds[b_idx, s_idx, start:end] = mm_flat.to(inputs_embeds.dtype) + else: + inputs_embeds[s_idx, start:end] = mm_flat.to(inputs_embeds.dtype) inputs_embeds = inputs_embeds.to(torch.bfloat16) return inputs_embeds @@ -776,11 +829,13 @@ def embed_input_ids( if multimodal_embeddings is None or is_multimodal is None: return super().embed_input_ids(input_ids) - return super().embed_input_ids( + # Use MiMo's own merge logic instead of vllm's _merge_multimodal_embeddings. + # vllm expects num_mm_tokens == is_multimodal.sum(); MiMo's embed_multimodal + # format may differ, and masked_scatter_ triggers CUDA assert on mismatch. + return self._embed_input_ids( input_ids, multimodal_embeddings=multimodal_embeddings, is_multimodal=is_multimodal, - handle_oov_mm_token=handle_oov_mm_token, ) def base_local_forward( @@ -1015,7 +1070,9 @@ def forward( **kwargs: object, ) -> torch.Tensor | IntermediateTensors: _forward_context = get_forward_context() - _default_query_start_loc = torch.tensor([0, input_ids.shape[-1]], device=input_ids.device) + # Keep on CPU to avoid device-side assert when tp>1 (indexing with CPU tensor is valid) + _seq_len = int(input_ids.shape[-1]) + _default_query_start_loc = torch.tensor([0, _seq_len], dtype=torch.long, device="cpu") query_start_loc = ( next(iter(_forward_context.attn_metadata.values())).query_start_loc if _forward_context.attn_metadata is not None From 53c3d46e0fd52e38b2a8d017bbcd9697f4ea14ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BD=90=E4=BF=9D=E5=85=83?= Date: Wed, 4 Mar 2026 09:59:17 +0800 Subject: [PATCH 2/7] [mimo-audio] tp>1 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 齐保元 --- vllm_omni/model_executor/stage_configs/mimo_audio.yaml | 6 ++++-- .../stage_configs/mimo_audio_async_chunk.yaml | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm_omni/model_executor/stage_configs/mimo_audio.yaml b/vllm_omni/model_executor/stage_configs/mimo_audio.yaml index 552ba2ab5fd..82c85c33c5e 100644 --- a/vllm_omni/model_executor/stage_configs/mimo_audio.yaml +++ b/vllm_omni/model_executor/stage_configs/mimo_audio.yaml @@ -7,7 +7,7 @@ stage_args: stage_type: llm runtime: process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) + devices: "0,1" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) max_batch_size: 1 engine_args: dtype: bfloat16 @@ -15,6 +15,7 @@ stage_args: model_arch: MiMoAudioForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + tensor_parallel_size: 2 gpu_memory_utilization: 0.3 enforce_eager: true # need to discuss trust_remote_code: true @@ -38,13 +39,14 @@ stage_args: stage_type: llm runtime: process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) + devices: "0,1" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) max_batch_size: 1 engine_args: model_stage: code2wav model_arch: MiMoAudioForConditionalGeneration worker_type: generation scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + tensor_parallel_size: 2 gpu_memory_utilization: 0.2 enforce_eager: true trust_remote_code: true diff --git a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml index a60fc566a99..c66620f337c 100644 --- a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml +++ b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml @@ -7,7 +7,7 @@ stage_args: stage_type: llm runtime: process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) + devices: "0,1" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) max_batch_size: 1 engine_args: dtype: bfloat16 @@ -15,6 +15,7 @@ stage_args: model_arch: MiMoAudioForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + tensor_parallel_size: 1 gpu_memory_utilization: 0.3 enforce_eager: true # need to discuss trust_remote_code: true @@ -39,13 +40,14 @@ stage_args: stage_type: llm runtime: process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) + devices: "0,1" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) max_batch_size: 1 engine_args: model_stage: code2wav model_arch: MiMoAudioForConditionalGeneration worker_type: generation scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + tensor_parallel_size: 1 gpu_memory_utilization: 0.2 enforce_eager: true trust_remote_code: true From 750385c7e9295ccebb6f4e9715ceefc606faca6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BD=90=E4=BF=9D=E5=85=83?= Date: Wed, 4 Mar 2026 16:06:12 +0800 Subject: [PATCH 3/7] [mimo-audio] tp>1 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 齐保元 --- .../model_executor/stage_configs/mimo_audio_async_chunk.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml index c66620f337c..9e61d71d7c3 100644 --- a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml +++ b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml @@ -15,7 +15,7 @@ stage_args: model_arch: MiMoAudioForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - tensor_parallel_size: 1 + tensor_parallel_size: 2 gpu_memory_utilization: 0.3 enforce_eager: true # need to discuss trust_remote_code: true @@ -47,7 +47,7 @@ stage_args: model_arch: MiMoAudioForConditionalGeneration worker_type: generation scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - tensor_parallel_size: 1 + tensor_parallel_size: 2 gpu_memory_utilization: 0.2 enforce_eager: true trust_remote_code: true From 094685f5a990e93a1e94bd4572e4a3d97f1cb800 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BD=90=E4=BF=9D=E5=85=83?= Date: Wed, 4 Mar 2026 18:18:39 +0800 Subject: [PATCH 4/7] [mimo-audio] tp>1 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 齐保元 --- .../stage_configs/mimo_audio_async_chunk.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml index 9e61d71d7c3..e20b0840681 100644 --- a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml +++ b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml @@ -7,7 +7,7 @@ stage_args: stage_type: llm runtime: process: true # Run this stage in a separate process - devices: "0,1" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) + devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) max_batch_size: 1 engine_args: dtype: bfloat16 @@ -15,7 +15,7 @@ stage_args: model_arch: MiMoAudioForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - tensor_parallel_size: 2 + tensor_parallel_size: 1 gpu_memory_utilization: 0.3 enforce_eager: true # need to discuss trust_remote_code: true @@ -40,14 +40,14 @@ stage_args: stage_type: llm runtime: process: true # Run this stage in a separate process - devices: "0,1" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) + devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) max_batch_size: 1 engine_args: model_stage: code2wav model_arch: MiMoAudioForConditionalGeneration worker_type: generation scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - tensor_parallel_size: 2 + tensor_parallel_size: 1 gpu_memory_utilization: 0.2 enforce_eager: true trust_remote_code: true From 2014da80f6ddab86a14c8192b621abece8080320 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BD=90=E4=BF=9D=E5=85=83?= Date: Thu, 5 Mar 2026 19:16:28 +0800 Subject: [PATCH 5/7] [mimo-audio] tp>1 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 齐保元 --- .../models/mimo_audio/mimo_audio_llm.py | 68 ++----------------- 1 file changed, 7 insertions(+), 61 deletions(-) diff --git a/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py b/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py index 7975ad1d934..c1b71e076e4 100644 --- a/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py +++ b/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py @@ -12,7 +12,6 @@ Qwen2Model as TransformerQwen2Model, ) from vllm.config import VllmConfig -from vllm.distributed import get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size from vllm.forward_context import get_forward_context from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -557,12 +556,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size, bias=False, return_bias=False, + gather_output=True, ) self.hidden_states_downcast = ColumnParallelLinear( config.hidden_size, self.local_config.hidden_size, bias=False, return_bias=False, + gather_output=True, ) self.lm_head = ColumnParallelLinear( @@ -570,6 +571,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size, bias=False, return_bias=False, + gather_output=True, ) # Re-encode the sum of multi-layer RVQ embeddings to obtain true Audio Code Embeddings @@ -760,59 +762,7 @@ def _embed_input_ids( inputs_embeds.masked_fill_(is_multimodal.unsqueeze(-1), 0.0) if multimodal_embeddings is not None and len(multimodal_embeddings) != 0: - # Flatten multimodal embeddings: tuple of [N_i, H] -> [sum(N_i), H] - mm_flat = torch.cat( - [ - t.reshape(-1, t.shape[-1]) if isinstance(t, torch.Tensor) and t.dim() >= 2 else t.reshape(1, -1) - for t in multimodal_embeddings - ], - dim=0, - ) - - # is_multimodal: [S] or [B, S] -> positions where we should insert mm embeddings - pos = is_multimodal.nonzero(as_tuple=False) # [N, 1] if 1D else [N, 2] - num_expected = pos.shape[0] - if mm_flat.shape[0] != num_expected: - raise ValueError( - f"MiMo multimodal embedding count mismatch: got {mm_flat.shape[0]} " - f"tokens, expected {num_expected} (is_multimodal.sum()). " - "Check that embed_multimodal output matches prompt_ids empty positions." - ) - - if pos.dim() == 2 and pos.shape[1] == 2: - b_idx = pos[:, 0] - s_idx = pos[:, 1] - else: - # is_multimodal was 1D [S] -> single batch, seq indices only - b_idx = torch.zeros(num_expected, dtype=torch.long, device=inputs_embeds.device) - s_idx = pos[:, 0].to(inputs_embeds.device) - - if inputs_embeds.dim() == 3: - dst = inputs_embeds[b_idx, s_idx, :] - else: - dst = inputs_embeds[s_idx, :] - - hidden_size = inputs_embeds.shape[-1] - mm_hidden = mm_flat.shape[-1] - if mm_hidden == hidden_size: - dst.copy_(mm_flat.to(inputs_embeds.dtype)) - else: - # TP: mm_flat is the local shard from ColumnParallelLinear (e.g. 2048); - # inputs_embeds has full hidden (e.g. 4096). Write shard into correct slice. - tp_rank = get_tensor_model_parallel_rank() - tp_world = get_tensor_model_parallel_world_size() - shard_size = hidden_size // tp_world - start = tp_rank * shard_size - end = start + shard_size - if mm_hidden != shard_size: - raise ValueError( - f"MiMo TP shard size mismatch: mm_flat has {mm_hidden}, " - f"expected {shard_size} (hidden_size={hidden_size}, tp={tp_world})" - ) - if inputs_embeds.dim() == 3: - inputs_embeds[b_idx, s_idx, start:end] = mm_flat.to(inputs_embeds.dtype) - else: - inputs_embeds[s_idx, start:end] = mm_flat.to(inputs_embeds.dtype) + inputs_embeds = inputs_embeds + multimodal_embeddings inputs_embeds = inputs_embeds.to(torch.bfloat16) return inputs_embeds @@ -829,13 +779,11 @@ def embed_input_ids( if multimodal_embeddings is None or is_multimodal is None: return super().embed_input_ids(input_ids) - # Use MiMo's own merge logic instead of vllm's _merge_multimodal_embeddings. - # vllm expects num_mm_tokens == is_multimodal.sum(); MiMo's embed_multimodal - # format may differ, and masked_scatter_ triggers CUDA assert on mismatch. - return self._embed_input_ids( + return super().embed_input_ids( input_ids, multimodal_embeddings=multimodal_embeddings, is_multimodal=is_multimodal, + handle_oov_mm_token=handle_oov_mm_token, ) def base_local_forward( @@ -1070,9 +1018,7 @@ def forward( **kwargs: object, ) -> torch.Tensor | IntermediateTensors: _forward_context = get_forward_context() - # Keep on CPU to avoid device-side assert when tp>1 (indexing with CPU tensor is valid) - _seq_len = int(input_ids.shape[-1]) - _default_query_start_loc = torch.tensor([0, _seq_len], dtype=torch.long, device="cpu") + _default_query_start_loc = torch.tensor([0, input_ids.shape[-1]], device=input_ids.device) query_start_loc = ( next(iter(_forward_context.attn_metadata.values())).query_start_loc if _forward_context.attn_metadata is not None From 8181b2effdfc11e8e83fe1f2894f11d83c75fd09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BD=90=E4=BF=9D=E5=85=83?= Date: Thu, 5 Mar 2026 20:00:44 +0800 Subject: [PATCH 6/7] [mimo-audio] add tp in yaml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 齐保元 --- vllm_omni/model_executor/stage_configs/mimo_audio.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm_omni/model_executor/stage_configs/mimo_audio.yaml b/vllm_omni/model_executor/stage_configs/mimo_audio.yaml index 82c85c33c5e..130ca5c9c38 100644 --- a/vllm_omni/model_executor/stage_configs/mimo_audio.yaml +++ b/vllm_omni/model_executor/stage_configs/mimo_audio.yaml @@ -7,7 +7,7 @@ stage_args: stage_type: llm runtime: process: true # Run this stage in a separate process - devices: "0,1" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) + devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) max_batch_size: 1 engine_args: dtype: bfloat16 @@ -15,7 +15,7 @@ stage_args: model_arch: MiMoAudioForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - tensor_parallel_size: 2 + tensor_parallel_size: 1 gpu_memory_utilization: 0.3 enforce_eager: true # need to discuss trust_remote_code: true @@ -39,14 +39,14 @@ stage_args: stage_type: llm runtime: process: true # Run this stage in a separate process - devices: "0,1" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) + devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) max_batch_size: 1 engine_args: model_stage: code2wav model_arch: MiMoAudioForConditionalGeneration worker_type: generation scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - tensor_parallel_size: 2 + tensor_parallel_size: 1 gpu_memory_utilization: 0.2 enforce_eager: true trust_remote_code: true From abe62c1ff11a11e5f1182eeb2d54971cba1df717 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BD=90=E4=BF=9D=E5=85=83?= Date: Fri, 6 Mar 2026 09:37:42 +0800 Subject: [PATCH 7/7] [mimo-audio] add comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 齐保元 --- vllm_omni/model_executor/stage_configs/mimo_audio.yaml | 4 ++-- .../model_executor/stage_configs/mimo_audio_async_chunk.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm_omni/model_executor/stage_configs/mimo_audio.yaml b/vllm_omni/model_executor/stage_configs/mimo_audio.yaml index 130ca5c9c38..b824fcb41f3 100644 --- a/vllm_omni/model_executor/stage_configs/mimo_audio.yaml +++ b/vllm_omni/model_executor/stage_configs/mimo_audio.yaml @@ -15,7 +15,7 @@ stage_args: model_arch: MiMoAudioForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - tensor_parallel_size: 1 + tensor_parallel_size: 1 # Change to desired TP size for multi-GPU inference (e.g., 4 for 4 GPUs) gpu_memory_utilization: 0.3 enforce_eager: true # need to discuss trust_remote_code: true @@ -46,7 +46,7 @@ stage_args: model_arch: MiMoAudioForConditionalGeneration worker_type: generation scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - tensor_parallel_size: 1 + tensor_parallel_size: 1 # Change to desired TP size for multi-GPU inference (e.g., 4 for 4 GPUs) gpu_memory_utilization: 0.2 enforce_eager: true trust_remote_code: true diff --git a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml index e20b0840681..7177aa80921 100644 --- a/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml +++ b/vllm_omni/model_executor/stage_configs/mimo_audio_async_chunk.yaml @@ -15,7 +15,7 @@ stage_args: model_arch: MiMoAudioForConditionalGeneration worker_type: ar scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - tensor_parallel_size: 1 + tensor_parallel_size: 1 # Change to desired TP size for multi-GPU inference (e.g., 4 for 4 GPUs) gpu_memory_utilization: 0.3 enforce_eager: true # need to discuss trust_remote_code: true @@ -47,7 +47,7 @@ stage_args: model_arch: MiMoAudioForConditionalGeneration worker_type: generation scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - tensor_parallel_size: 1 + tensor_parallel_size: 1 # Change to desired TP size for multi-GPU inference (e.g., 4 for 4 GPUs) gpu_memory_utilization: 0.2 enforce_eager: true trust_remote_code: true