Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions docker/Dockerfile.npu
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
ARG VLLM_ASCEND_IMAGE=quay.io/ascend/vllm-ascend
ARG VLLM_ASCEND_TAG=v0.14.0rc1
ARG VLLM_ASCEND_TAG=v0.17.0rc1
FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}

WORKDIR /vllm-workspace/vllm-ascend
RUN git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
RUN pip install -v -e .

ARG APP_DIR=/vllm-workspace/vllm-omni
WORKDIR ${APP_DIR}

COPY . .

RUN pip install -v -e .
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /vllm-workspace/vllm-omni/ --no-build-isolation

ENV VLLM_WORKER_MULTIPROC_METHOD=spawn

Expand Down
12 changes: 6 additions & 6 deletions docker/Dockerfile.npu.a3
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
ARG VLLM_ASCEND_IMAGE=quay.io/ascend/vllm-ascend
ARG VLLM_ASCEND_TAG=v0.14.0rc1-a3
ARG VLLM_ASCEND_TAG=v0.17.0rc1-a3
FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}

WORKDIR /vllm-workspace/vllm-ascend
RUN git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
RUN pip install -v -e .

ARG APP_DIR=/vllm-workspace/vllm-omni
WORKDIR ${APP_DIR}

COPY . .

RUN pip install -v -e .
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /vllm-workspace/vllm-omni/ --no-build-isolation

ENV VLLM_WORKER_MULTIPROC_METHOD=spawn

Expand Down
21 changes: 14 additions & 7 deletions docs/getting_started/installation/npu/npu.inc.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,25 @@ docker run --rm \
-p 8000:8000 \
-it $IMAGE bash

cd /vllm-workspace/vllm
git pull origin main
git fetch origin --tags
git checkout v0.16.0

# vllm-ascend v0.16.0rc1 will only be released after vllm-omni 0.16.0,
# so for now we have to pin vllm-ascend to a specific commit.
cd /vllm-workspace/vllm-ascend
git pull origin main
git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
pip install -v -e .

# Inside the container, install vLLM-Omni from source
cd /vllm-workspace
git clone -b v0.16.0 https://github.com/vllm-project/vllm-omni.git

cd vllm-omni
pip install -v -e .
pip install -v -e . --no-build-isolation
# or VLLM_OMNI_TARGET_DEVICE=npu pip install -v -e .

export VLLM_WORKER_MULTIPROC_METHOD=spawn
```

Expand All @@ -61,22 +68,22 @@ We are keeping [issue #886](https://github.com/vllm-project/vllm-omni/issues/886
You can also build vLLM-Omni from the latest main branch if you want the latest features or bug fixes. Note that the main branch may be temporarily broken on NPU; check [issue #886](https://github.com/vllm-project/vllm-omni/issues/886) for the status of the latest vLLM-Omni main-branch commit on NPU.

```bash
# Pin vLLM version to 0.16.0
# Pin vLLM version to 0.17.0
cd /vllm-workspace/vllm
git pull origin main
git fetch origin --tags
git checkout v0.16.0
git checkout v0.17.0
VLLM_TARGET_DEVICE=empty pip install -v -e .

# Because vllm-ascend has not yet entered continuous development and has not been officially released, we need to pin it to a specific commit. Please note that this commit may change over time.
cd ../vllm-ascend
cd /vllm-workspace/vllm-ascend
git pull origin main
git fetch origin --tags
git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
git checkout v0.17.0
pip install -v -e .

# Install vLLM-Omni from the latest main branch
cd ../vllm-omni
cd /vllm-workspace/vllm-omni
git clone https://github.com/vllm-project/vllm-omni.git
pip install -v -e . --no-build-isolation
# or VLLM_OMNI_TARGET_DEVICE=npu pip install -v -e .
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding

from vllm_omni.platforms import current_omni_platform

logger = init_logger(__name__)


Expand Down Expand Up @@ -343,6 +345,10 @@ def _ensure_cached_refs(self) -> None:
def _ensure_model_fwd(self) -> None:
if self._model_fwd is not None:
return
if not current_omni_platform.supports_torch_inductor():
logger.warning_once("code_predictor: torch.compile disabled")
self._model_fwd = self.model.forward
return
self._model_fwd = torch.compile(
self.model.forward,
mode="default",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
)
from vllm.model_executor.models.utils import is_pp_missing_parameter

from vllm_omni.platforms import current_omni_platform

from .configuration_qwen3_tts import Qwen3TTSTalkerCodePredictorConfig, Qwen3TTSTalkerConfig

logger = init_logger(__name__)
Expand Down Expand Up @@ -410,6 +412,10 @@ def _setup_compile(self) -> None:
"""
if self._compiled_model_fwd is not None:
return
if not current_omni_platform.supports_torch_inductor():
logger.warning_once("code_predictor: torch.compile disabled")
self._compiled_model_fwd = self.model.forward
return
self._compiled_model_fwd = torch.compile(
self.model.forward,
mode="default",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
# Stage 0: Thinker (multimodal understanding + text generation)
# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes)
# Stage 2: Code2Wav (16-layer RVQ codes → audio waveform)

# The following config has been verified on 2x H100-80G GPUs.
async_chunk: true
stage_args:
# Stage 0 — thinker. Per the file header this stage does multimodal
# understanding and text generation; it emits the final text output and
# hands latent output on to the next stage.
- stage_id: 0
stage_type: llm # Use llm stage type to launch OmniLLM
runtime:
# Two devices are listed, matching tensor_parallel_size: 2 below.
devices: "0,1"
max_batch_size: 10
engine_args:
model_stage: thinker
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.9
# This is the only stage in this file with enforce_eager disabled.
enforce_eager: false
trust_remote_code: true
engine_output_type: latent # Output hidden states for talker
distributed_executor_backend: "mp"
enable_prefix_caching: false
max_num_batched_tokens: 32768
hf_config_name: thinker_config
tensor_parallel_size: 2
# Hook that prepares this stage's output for the talker stage
# (async-chunk variant; the file sets async_chunk: true at top level).
custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk
final_output: true
final_output_type: text
is_comprehension: true
default_sampling_params:
temperature: 0.4
top_p: 0.9
# NOTE(review): top_k: 1 presumably makes sampling effectively greedy
# regardless of temperature/top_p — confirm this is intended.
top_k: 1
max_tokens: 2048
seed: 42
detokenize: True
repetition_penalty: 1.05

# Stage 1 — talker. Per the file header this stage turns text embeddings
# into 16-layer RVQ codec codes for the code2wav stage.
- stage_id: 1
stage_type: llm # Use llm stage type to launch OmniLLM
runtime:
# Same device string as stage 2 ("2"); the two stages' gpu_memory_utilization
# values (0.6 here, 0.3 there) sum to 0.9.
devices: "2"
max_batch_size: 10
engine_args:
model_stage: talker
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.6
enforce_eager: true
trust_remote_code: true
engine_output_type: latent # Output codec codes for code2wav
enable_prefix_caching: false
max_num_batched_tokens: 32768
distributed_executor_backend: "mp"
hf_config_name: talker_config
# Hook that prepares this stage's output for the code2wav stage
# (async-chunk variant).
custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk
# Presumably a list of upstream stage ids; [0] would be the thinker stage — verify.
engine_input_source: [0]
# Left disabled: this stage is not marked as producing a final output.
# final_output: true
# final_output_type: text
default_sampling_params:
temperature: 0.9
top_k: 50
max_tokens: 4096
seed: 42
detokenize: False
repetition_penalty: 1.0
# NOTE(review): 2150 is presumably the codec end-of-sequence token id — confirm.
stop_token_ids: [2150]

# Stage 2 — code2wav. Per the file header this stage converts the 16-layer
# RVQ codes into an audio waveform; it emits the final audio output.
- stage_id: 2
stage_type: llm # Use llm stage type to launch OmniLLM
runtime:
# Same device string as stage 1 ("2"); see gpu_memory_utilization below.
devices: "2"
max_batch_size: 10
engine_args:
model_stage: code2wav
model_arch: Qwen3OmniMoeForConditionalGeneration
# Unlike stages 0 and 1 (worker_type: ar, OmniARScheduler), this stage uses
# the generation worker type and the generation scheduler.
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
enforce_eager: true
trust_remote_code: true
async_scheduling: false
enable_prefix_caching: false
engine_output_type: audio # Final output: audio waveform
# 0.3 here plus 0.6 for stage 1 sums to 0.9 on the shared device string.
gpu_memory_utilization: 0.3
distributed_executor_backend: "mp"
max_num_batched_tokens: 51200 # [TODO] if max_num_batched_tokens < max_batch_size * 800, there will be a precision problem.
# NOTE(review): this stage points at thinker_config rather than a
# code2wav-specific config name — confirm this is intentional.
hf_config_name: thinker_config
# Presumably a list of upstream stage ids; [1] would be the talker stage — verify.
engine_input_source: [1]
final_output: true
final_output_type: audio
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 65536
seed: 42
detokenize: True
repetition_penalty: 1.1
Loading
Loading