Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -556,19 +556,22 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config.hidden_size,
bias=False,
return_bias=False,
gather_output=True,
Comment thread
qibaoyuan marked this conversation as resolved.
)
self.hidden_states_downcast = ColumnParallelLinear(
config.hidden_size,
self.local_config.hidden_size,
bias=False,
return_bias=False,
gather_output=True,
)

self.lm_head = ColumnParallelLinear(
config.hidden_size,
config.vocab_size,
bias=False,
return_bias=False,
gather_output=True,
)

# Re-encode the sum of multi-layer RVQ embeddings to obtain true Audio Code Embeddings
Expand Down
2 changes: 2 additions & 0 deletions vllm_omni/model_executor/stage_configs/mimo_audio.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ stage_args:
model_arch: MiMoAudioForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
tensor_parallel_size: 1 # Change to desired TP size for multi-GPU inference (e.g., 4 for 4 GPUs)
gpu_memory_utilization: 0.3
enforce_eager: true # TODO: discuss whether enforce_eager should stay enabled for this stage
trust_remote_code: true
Expand Down Expand Up @@ -45,6 +46,7 @@ stage_args:
model_arch: MiMoAudioForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
tensor_parallel_size: 1 # Change to desired TP size for multi-GPU inference (e.g., 4 for 4 GPUs)
gpu_memory_utilization: 0.2
enforce_eager: true
trust_remote_code: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ stage_args:
model_arch: MiMoAudioForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
tensor_parallel_size: 1 # Change to desired TP size for multi-GPU inference (e.g., 4 for 4 GPUs)
gpu_memory_utilization: 0.3
enforce_eager: true # TODO: discuss whether enforce_eager should stay enabled for this stage
trust_remote_code: true
Expand Down Expand Up @@ -46,6 +47,7 @@ stage_args:
model_arch: MiMoAudioForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
tensor_parallel_size: 1 # Change to desired TP size for multi-GPU inference (e.g., 4 for 4 GPUs)
gpu_memory_utilization: 0.2
enforce_eager: true
trust_remote_code: true
Expand Down