diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index 2d18af238c3..4f049be0d8f 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -7,3 +7,4 @@ sphinx-togglebutton
 myst-parser
 msgspec
 sphinx-substitution-extensions
+snowballstemmer<3.0.0
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 866dd11dfde..130143a3667 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -72,6 +72,8 @@
     # This value should be updated when cut down release.
     'pip_vllm_ascend_version': "0.7.3rc2",
     'pip_vllm_version': "0.7.3",
+    # The matching MindIE Turbo version for vLLM Ascend
+    'pip_mindie_turbo_version': "2.0rc1",
     # CANN image tag
     'cann_image_tag': "8.1.rc1-910b-ubuntu22.04-py3.10",
 }
diff --git a/docs/source/installation.md b/docs/source/installation.md
index f1d6612711f..97e1d2ab861 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -195,6 +195,16 @@ The default workdir is `/workspace`, vLLM and vLLM Ascend code are placed in `/v
 
 :::::
 
+## (Optional) Install MindIE Turbo
+
+Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+   :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
 ## Extra information
 
 ### Verify installation
@@ -254,3 +264,10 @@ Prompt: 'The president of the United States is', Generated text: ' a very import
 Prompt: 'The capital of France is', Generated text: ' Paris. The oldest part of the city is Saint-Germain-des-Pr'
 Prompt: 'The future of AI is', Generated text: ' not bright\n\nThere is no doubt that the evolution of AI will have a huge'
 ```
+
+### Compile Enhancement
+
+To get more performance gains, you can optimize Python and torch-npu with the Bisheng compiler by following these official tutorials:
+
+- [Optimizing Python with Bisheng](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0063.html)
+- [Optimizing torch-npu with Bisheng](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0058.html)
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index 58b28e0c9b8..7cda0dbaa76 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -33,6 +33,16 @@ docker run --rm \
 
 The default workdir is `/workspace`, vLLM and vLLM Ascend code are placed in `/vllm-workspace` and installed in [development mode](https://setuptools.pypa.io/en/latest/userguide/development_mode.html)(`pip install -e`) to help developer immediately take place changes without requiring a new installation.
 
+## (Optional) Install MindIE Turbo
+
+Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+   :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
 ## Usage
 
 You can use Modelscope mirror to speed up download:
@@ -130,4 +140,8 @@ INFO: Application shutdown complete.
 Finally, you can exit container by using `ctrl-D`.
 
 ::::
-:::::
\ No newline at end of file
+:::::
+
+### Performance enhancement related environment variables in MindIE Turbo
+
+Currently, some performance enhancement features in MindIE Turbo are restricted to certain scenarios, so environment variables are used to control whether they are enabled. For the related environment variables, see the MindIE Turbo [official documentation](https://www.hiascend.com/document/detail/zh/mindie/20RC1/AcceleratePlugin/turbodev/mindie-turbo-0010.html).
diff --git a/docs/source/tutorials/multi_node.md b/docs/source/tutorials/multi_node.md
index d674d8a5890..ddeb18509fb 100644
--- a/docs/source/tutorials/multi_node.md
+++ b/docs/source/tutorials/multi_node.md
@@ -30,6 +30,14 @@ docker run --rm \
 -it quay.io/ascend/vllm-ascend:|vllm_ascend_version| bash
 ```
 
+(Optional) Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+   :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
 Choose one machine as head node, the other are worker nodes, then start ray on each machine:
 
 :::{note}
diff --git a/docs/source/tutorials/multi_npu.md b/docs/source/tutorials/multi_npu.md
index e59b72568f8..7f162f2e36a 100644
--- a/docs/source/tutorials/multi_npu.md
+++ b/docs/source/tutorials/multi_npu.md
@@ -27,6 +27,14 @@ docker run --rm \
 -it $IMAGE bash
 ```
 
+(Optional) Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+   :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
 Setup environment variables:
 
 ```bash
diff --git a/docs/source/tutorials/single_npu.md b/docs/source/tutorials/single_npu.md
index 445d9518bba..ab2ee392f28 100644
--- a/docs/source/tutorials/single_npu.md
+++ b/docs/source/tutorials/single_npu.md
@@ -26,6 +26,14 @@ docker run --rm \
 -it $IMAGE bash
 ```
 
+(Optional) Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+   :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
 Setup environment variables:
 
 ```bash
@@ -90,7 +98,20 @@ docker run --rm \
 -p 8000:8000 \
 -e VLLM_USE_MODELSCOPE=True \
 -e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
--it $IMAGE \
+-it $IMAGE bash
+```
+
+(Optional) Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+   :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
+Run the following command to start the vLLM server:
+
+```bash
 vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240
 ```
diff --git a/docs/source/tutorials/single_npu_multimodal.md b/docs/source/tutorials/single_npu_multimodal.md
index 3b01397a964..5d68a5387e3 100644
--- a/docs/source/tutorials/single_npu_multimodal.md
+++ b/docs/source/tutorials/single_npu_multimodal.md
@@ -26,6 +26,14 @@ docker run --rm \
 -it $IMAGE bash
 ```
 
+(Optional) Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+   :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
 Setup environment variables:
 
 ```bash
@@ -143,7 +151,20 @@ docker run --rm \
 -p 8000:8000 \
 -e VLLM_USE_MODELSCOPE=True \
 -e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
--it $IMAGE \
+-it $IMAGE bash
+```
+
+(Optional) Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+   :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
+Run the following command to start the vLLM server:
+
+```bash
 vllm serve Qwen/Qwen2.5-VL-7B-Instruct --dtype bfloat16 --max_model_len 16384 --max-num-batched-tokens 16384
 ```
diff --git a/docs/source/user_guide/suppoted_features.md b/docs/source/user_guide/suppoted_features.md
index 34aa09b6011..3980fa359ca 100644
--- a/docs/source/user_guide/suppoted_features.md
+++ b/docs/source/user_guide/suppoted_features.md
@@ -1,21 +1,38 @@
 # Feature Support
 
-| Feature | Supported | CI Coverage | Guidance Document | Current Status | Next Step |
-|--------------------------|-----------|-------------|-------------------|---------------------------|--------------------|
-| Chunked Prefill | ❌ | | | NA | Rely on CANN 8.1 NNAL package release |
-| Automatic Prefix Caching | ✅ | | | Basic functions available | Rely on CANN 8.1 NNAL package release |
-| LoRA | ❌ | | | NA | Plan in 2025.06.30 |
-| Prompt adapter | ❌ | | | NA | Plan in 2025.06.30 |
-| Speculative decoding | ✅ | | | Basic functions available | Need fully test |
-| Pooling | ✅ | | | Basic functions available(Bert) | Need fully test and add more models support|
-| Enc-dec | ❌ | | | NA | Plan in 2025.06.30|
-| Multi Modality | ✅ | | ✅ | Basic functions available(LLaVA/Qwen2-vl/Qwen2-audio/internVL)| Improve perforamance, and add more models support |
-| LogProbs | ✅ | | | Basic functions available | Need fully test |
-| Prompt logProbs | ✅ | | | Basic functions available | Need fully test |
-| Async output | ✅ | | | Basic functions available | Need fully test |
-| Multi step scheduler | ✅ | | | Basic functions available | Need fully test, Find more details at [ Blog ](https://blog.vllm.ai/2024/09/05/perf-update.html#batch-scheduling-multiple-steps-ahead-pr-7000), [ RFC ](https://github.com/vllm-project/vllm/issues/6854) and [issue](https://github.com/vllm-project/vllm/pull/7000) |
-| Best of | ✅ | | | Basic functions available | Need fully test |
-| Beam search | ✅ | | | Basic functions available | Need fully test |
-| Guided Decoding | ✅ | | | Basic functions available | Find more details at the [issue](https://github.com/vllm-project/vllm-ascend/issues/177) |
-| Tensor Parallel | ✅ | | | Basic functions available | Need fully test |
-| Pipeline Parallel | ✅ | | | Basic functions available | Need fully test |
+The feature support principle of vLLM Ascend is: **aligned with vLLM**. We are also actively collaborating with the community to accelerate feature support.
+
+vLLM Ascend provides functional support for most features in vLLM, and their usage stays the same as in vLLM except for some limitations.
+
+```{note}
+MindIE Turbo is an optional performance optimization plugin. Find more information about the feature support of MindIE Turbo here(UPDATE_ME_AS_A_LINK).
+```
+
+| Feature                       | vLLM Ascend     | MindIE Turbo    | Notes                                                                   |
+|-------------------------------|-----------------|-----------------|-------------------------------------------------------------------------|
+| V1Engine                      | 🔵 Experimental | 🔵 Experimental | Will be enhanced in v0.8.x                                              |
+| Chunked Prefill               | 🟢 Functional   | 🟢 Functional   | /                                                                       |
+| Automatic Prefix Caching      | 🟢 Functional   | 🟢 Functional   | [Usage Limits](https://github.com/vllm-project/vllm-ascend/issues/732) |
+| LoRA                          | 🟢 Functional   | 🟢 Functional   | /                                                                       |
+| Prompt adapter                | 🟡 Planned      | 🟡 Planned      | /                                                                       |
+| Speculative decoding          | 🟢 Functional   | 🟢 Functional   | [Usage Limits](https://github.com/vllm-project/vllm-ascend/issues/734) |
+| Pooling                       | 🟢 Functional   | 🟢 Functional   | /                                                                       |
+| Enc-dec                       | 🟡 Planned      | 🟡 Planned      | /                                                                       |
+| Multi Modality                | 🟢 Functional   | 🟢 Functional   | /                                                                       |
+| LogProbs                      | 🟢 Functional   | 🟢 Functional   | /                                                                       |
+| Prompt logProbs               | 🟢 Functional   | 🟢 Functional   | /                                                                       |
+| Async output                  | 🟢 Functional   | 🟢 Functional   | /                                                                       |
+| Multi step scheduler          | 🟢 Functional   | 🟢 Functional   | /                                                                       |
+| Best of                       | 🟢 Functional   | 🟢 Functional   | /                                                                       |
+| Beam search                   | 🟢 Functional   | 🟢 Functional   | /                                                                       |
+| Guided Decoding               | 🟢 Functional   | 🟢 Functional   | /                                                                       |
+| Tensor Parallel               | 🟢 Functional   | ⚡ Optimized    | /                                                                       |
+| Pipeline Parallel             | 🟢 Functional   | ⚡ Optimized    | /                                                                       |
+| Expert Parallel               | 🟡 Planned      | 🟡 Planned      | Will be supported in v0.8.x                                             |
+| Data Parallel                 | 🟡 Planned      | 🟡 Planned      | Will be supported in v0.8.x                                             |
+| Prefill Decode Disaggregation | 🟢 Functional   | 🟢 Functional   | todo                                                                    |
+| Quantization                  | 🟡 Planned      | 🟢 Functional   | Will be supported in v0.8.x                                             |
+| Graph Mode                    | 🟡 Planned      | 🟡 Planned      | Will be supported in v0.8.x                                             |
+| Sleep Mode                    | 🟢 Functional   | 🟢 Functional   | [Usage Limits](https://github.com/vllm-project/vllm-ascend/issues/733) |
+| MTP                           | 🟢 Functional   | 🟢 Functional   | [Usage Limits](https://github.com/vllm-project/vllm-ascend/issues/734) |
+| Custom Scheduler              | 🟢 Functional   | 🟢 Functional   | [Usage Limits](https://github.com/vllm-project/vllm-ascend/issues/788) |
diff --git a/setup.py b/setup.py
index 542af85151a..64c04af7d74 100644
--- a/setup.py
+++ b/setup.py
@@ -368,7 +368,7 @@ def _read_requirements(filename: str) -> List[str]:
     install_requires=get_requirements(),
     ext_modules=ext_modules,
     cmdclass=cmdclass,
-    extras_require={},
+    extras_require={"mindie_turbo": ["mindie-turbo==2.0rc1"]},
     entry_points={
         "vllm.platform_plugins": ["ascend = vllm_ascend:register"],
         "vllm.general_plugins":
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index dda11198707..ad079e3e482 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -20,7 +20,9 @@
 from vllm.logger import logger
 
 
-def try_register_lib(lib_name: str, lib_info: str = ""):
+def try_register_lib(lib_name: str,
+                     lib_info: str = "",
+                     exception_info: str = ""):
     import importlib
     import importlib.util
     try:
@@ -30,4 +32,4 @@ def try_register_lib(lib_name: str, lib_info: str = ""):
             if lib_info:
                 logger.info(lib_info)
     except Exception:
-        pass
+        logger.warning_once(exception_info)
diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py
index 147673754b8..120d343b7c9 100644
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -77,7 +77,8 @@ def __init__(
         # Try to import mindie_turbo to accelerate vLLM inference.
         try_register_lib(
             "mindie_turbo",
-            "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo."
+            "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo.",
+            "MindIE Turbo is installed but `import mindie_turbo` failed, skipping MindIE Turbo acceleration."
         )
         # distribute related config
         self.parallel_config.rank = rank
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index cc300282f57..c1ed5b39d64 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -77,7 +77,8 @@ def __init__(self,
         # Try to import mindie_turbo to accelerate vLLM inference.
         try_register_lib(
             "mindie_turbo",
-            "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo."
+            "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo.",
+            "MindIE Turbo is installed but `import mindie_turbo` failed, skipping MindIE Turbo acceleration."
         )
 
         if self.cache_config.cache_dtype == "auto":
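
For context on the `try_register_lib` change above, here is a minimal, standalone sketch (not the exact repository code) of how the updated helper is expected to behave. The function body between `try:` and the `lib_info` logging is elided by the diff context, so the `find_spec`/`import_module` lookup shown below is an assumption, and Python's standard `logging` stands in for vLLM's `logger` (which provides `warning_once`):

```python
import importlib
import importlib.util
import logging

logger = logging.getLogger("vllm_ascend")


def try_register_lib(lib_name: str, lib_info: str = "", exception_info: str = ""):
    """Import an optional library if it is present; log `lib_info` on success.

    Before this change an import failure was silently swallowed (`pass`);
    now the caller-supplied `exception_info` hint is logged as a warning.
    """
    try:
        # Assumed lookup (elided in the diff): check whether the package exists.
        module_spec = importlib.util.find_spec(lib_name)
        if module_spec is not None:
            importlib.import_module(lib_name)
            if lib_info:
                logger.info(lib_info)
    except Exception:
        # Surface the failure instead of hiding it.
        logger.warning(exception_info)


# Usage mirroring worker.py / worker_v1.py:
try_register_lib(
    "mindie_turbo",
    "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo.",
    "MindIE Turbo is installed but `import mindie_turbo` failed, skipping MindIE Turbo acceleration.",
)
```

With the new `extras_require` entry in `setup.py`, the plugin can presumably also be pulled in at install time via `pip install 'vllm-ascend[mindie_turbo]'` (assuming the published distribution name is `vllm-ascend`).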