diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index 2d18af238c3..4f049be0d8f 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -7,3 +7,4 @@ sphinx-togglebutton
myst-parser
msgspec
sphinx-substitution-extensions
+snowballstemmer<3.0.0
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 866dd11dfde..130143a3667 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -72,6 +72,8 @@
# This value should be updated when cut down release.
'pip_vllm_ascend_version': "0.7.3rc2",
'pip_vllm_version': "0.7.3",
+ # The matching MindIE Turbo version for vLLM Ascend
+ 'pip_mindie_turbo_version': "2.0rc1",
# CANN image tag
'cann_image_tag': "8.1.rc1-910b-ubuntu22.04-py3.10",
}
diff --git a/docs/source/installation.md b/docs/source/installation.md
index f1d6612711f..97e1d2ab861 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -195,6 +195,16 @@ The default workdir is `/workspace`, vLLM and vLLM Ascend code are placed in `/v
:::::
+## (Optional) Install MindIE Turbo
+
+Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+ :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
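+vLLM Ascend also declares MindIE Turbo as an optional extra in `setup.py`, so you can pull it in together with vLLM Ascend itself. The snippet below is a minimal sketch, assuming it is run from a vLLM Ascend source checkout in the same Python environment:
+
+```bash
+# Editable install of vLLM Ascend plus the optional MindIE Turbo dependency
+pip install -e ".[mindie_turbo]"
+
+# Quick sanity check that the mindie_turbo module can be imported
+python -c "import mindie_turbo; print('MindIE Turbo is available')"
+```
+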
## Extra information
### Verify installation
@@ -254,3 +264,10 @@ Prompt: 'The president of the United States is', Generated text: ' a very import
Prompt: 'The capital of France is', Generated text: ' Paris. The oldest part of the city is Saint-Germain-des-Pr'
Prompt: 'The future of AI is', Generated text: ' not bright\n\nThere is no doubt that the evolution of AI will have a huge'
```
+
+### Compile Enhancement
+
+You can get more performance gains by optimizing Python and torch-npu with the Bisheng compiler. Please follow these official tutorials:
+
+- [Optimizing Python with Bisheng](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0063.html)
+- [Optimizing torch-npu with Bisheng](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0058.html)
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index 58b28e0c9b8..7cda0dbaa76 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -33,6 +33,15 @@ docker run --rm \
The default workdir is `/workspace`, vLLM and vLLM Ascend code are placed in `/vllm-workspace` and installed in [development mode](https://setuptools.pypa.io/en/latest/userguide/development_mode.html)(`pip install -e`) to help developer immediately take place changes without requiring a new installation.
+## (Optional) Install MindIE Turbo
+
+Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+ :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
## Usage
You can use Modelscope mirror to speed up download:
@@ -130,4 +139,8 @@ INFO: Application shutdown complete.
Finally, you can exit container by using `ctrl-D`.
::::
-:::::
\ No newline at end of file
+:::::
+
+### Performance enhancement related environment variables in MindIE Turbo
+
+Currently, some performance enhancement features in MindIE Turbo are only applicable to certain scenarios, so environment variables are used to control whether they are enabled. For the related environment variables, see the MindIE Turbo [official documentation](https://www.hiascend.com/document/detail/zh/mindie/20RC1/AcceleratePlugin/turbodev/mindie-turbo-0010.html).
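+
+As a generic illustration only (the variable name below is a placeholder, not a real MindIE Turbo switch; take the actual names from the documentation linked above), such a feature switch is exported in the shell before launching vLLM:
+
+```bash
+# Placeholder variable name for illustration only; replace it with a real
+# environment variable from the MindIE Turbo documentation.
+export MINDIE_TURBO_FEATURE_SWITCH=1
+vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240
+```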
diff --git a/docs/source/tutorials/multi_node.md b/docs/source/tutorials/multi_node.md
index d674d8a5890..ddeb18509fb 100644
--- a/docs/source/tutorials/multi_node.md
+++ b/docs/source/tutorials/multi_node.md
@@ -30,6 +30,14 @@ docker run --rm \
-it quay.io/ascend/vllm-ascend:|vllm_ascend_version| bash
```
+(Optional) Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+ :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
Choose one machine as head node, the other are worker nodes, then start ray on each machine:
:::{note}
diff --git a/docs/source/tutorials/multi_npu.md b/docs/source/tutorials/multi_npu.md
index e59b72568f8..7f162f2e36a 100644
--- a/docs/source/tutorials/multi_npu.md
+++ b/docs/source/tutorials/multi_npu.md
@@ -27,6 +27,14 @@ docker run --rm \
-it $IMAGE bash
```
+(Optional) Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+ :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
Setup environment variables:
```bash
diff --git a/docs/source/tutorials/single_npu.md b/docs/source/tutorials/single_npu.md
index 445d9518bba..ab2ee392f28 100644
--- a/docs/source/tutorials/single_npu.md
+++ b/docs/source/tutorials/single_npu.md
@@ -26,6 +26,14 @@ docker run --rm \
-it $IMAGE bash
```
+(Optional) Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+ :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
Setup environment variables:
```bash
@@ -90,7 +98,20 @@ docker run --rm \
-p 8000:8000 \
-e VLLM_USE_MODELSCOPE=True \
-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
--it $IMAGE \
+-it $IMAGE bash
+```
+
+(Optional) Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+ :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
+Run the following command to start the vLLM server:
+
+```bash
vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240
```
diff --git a/docs/source/tutorials/single_npu_multimodal.md b/docs/source/tutorials/single_npu_multimodal.md
index 3b01397a964..5d68a5387e3 100644
--- a/docs/source/tutorials/single_npu_multimodal.md
+++ b/docs/source/tutorials/single_npu_multimodal.md
@@ -26,6 +26,14 @@ docker run --rm \
-it $IMAGE bash
```
+(Optional) Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+ :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
Setup environment variables:
```bash
@@ -143,7 +151,20 @@ docker run --rm \
-p 8000:8000 \
-e VLLM_USE_MODELSCOPE=True \
-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
--it $IMAGE \
+-it $IMAGE bash
+```
+
+(Optional) Install MindIE Turbo for performance acceleration:
+
+```{code-block} bash
+ :substitutions:
+
+pip install mindie_turbo==|pip_mindie_turbo_version|
+```
+
+Run the following command to start the vLLM server:
+
+```bash
vllm serve Qwen/Qwen2.5-VL-7B-Instruct --dtype bfloat16 --max_model_len 16384 --max-num-batched-tokens 16384
```
diff --git a/docs/source/user_guide/suppoted_features.md b/docs/source/user_guide/suppoted_features.md
index 34aa09b6011..3980fa359ca 100644
--- a/docs/source/user_guide/suppoted_features.md
+++ b/docs/source/user_guide/suppoted_features.md
@@ -1,21 +1,38 @@
# Feature Support
-| Feature | Supported | CI Coverage | Guidance Document | Current Status | Next Step |
-|--------------------------|-----------|-------------|-------------------|---------------------------|--------------------|
-| Chunked Prefill | ❌ | | | NA | Rely on CANN 8.1 NNAL package release |
-| Automatic Prefix Caching | ✅ | | | Basic functions available | Rely on CANN 8.1 NNAL package release |
-| LoRA | ❌ | | | NA | Plan in 2025.06.30 |
-| Prompt adapter | ❌ | | | NA | Plan in 2025.06.30 |
-| Speculative decoding | ✅ | | | Basic functions available | Need fully test |
-| Pooling | ✅ | | | Basic functions available(Bert) | Need fully test and add more models support|
-| Enc-dec | ❌ | | | NA | Plan in 2025.06.30|
-| Multi Modality | ✅ | | ✅ | Basic functions available(LLaVA/Qwen2-vl/Qwen2-audio/internVL)| Improve perforamance, and add more models support |
-| LogProbs | ✅ | | | Basic functions available | Need fully test |
-| Prompt logProbs | ✅ | | | Basic functions available | Need fully test |
-| Async output | ✅ | | | Basic functions available | Need fully test |
-| Multi step scheduler | ✅ | | | Basic functions available | Need fully test, Find more details at [ Blog ](https://blog.vllm.ai/2024/09/05/perf-update.html#batch-scheduling-multiple-steps-ahead-pr-7000), [ RFC ](https://github.com/vllm-project/vllm/issues/6854) and [issue](https://github.com/vllm-project/vllm/pull/7000) |
-| Best of | ✅ | | | Basic functions available | Need fully test |
-| Beam search | ✅ | | | Basic functions available | Need fully test |
-| Guided Decoding | ✅ | | | Basic functions available | Find more details at the [issue](https://github.com/vllm-project/vllm-ascend/issues/177) |
-| Tensor Parallel | ✅ | | | Basic functions available | Need fully test |
-| Pipeline Parallel | ✅ | | | Basic functions available | Need fully test |
+The feature support principle of vLLM Ascend is: **stay aligned with vLLM**. We are also actively collaborating with the community to accelerate feature support.
+
+vLLM Ascend provides functional support for most features in vLLM, and the usage stays the same as vLLM except for a few limitations.
+
+```{note}
+MindIE Turbo is an optional performance optimization plugin. Find more information about the feature support of MindIE Turbo here(UPDATE_ME_AS_A_LINK).
+```
+
+| Feature | vLLM Ascend | MindIE Turbo | Notes |
+|-------------------------------|----------------|-----------------|------------------------------------------------------------------------|
+| V1Engine                      | 🔵 Experimental | 🔵 Experimental | Will be enhanced in v0.8.x |
+| Chunked Prefill | 🟢 Functional | 🟢 Functional | / |
+| Automatic Prefix Caching      | 🟢 Functional  | 🟢 Functional   | [Usage Limits (#732)](https://github.com/vllm-project/vllm-ascend/issues/732) |
+| LoRA | 🟢 Functional | 🟢 Functional | / |
+| Prompt adapter | 🟡 Planned | 🟡 Planned | / |
+| Speculative decoding          | 🟢 Functional  | 🟢 Functional   | [Usage Limits (#734)](https://github.com/vllm-project/vllm-ascend/issues/734) |
+| Pooling | 🟢 Functional | 🟢 Functional | / |
+| Enc-dec | 🟡 Planned | 🟡 Planned | / |
+| Multi Modality | 🟢 Functional | 🟢 Functional | / |
+| LogProbs | 🟢 Functional | 🟢 Functional | / |
+| Prompt logProbs | 🟢 Functional | 🟢 Functional | / |
+| Async output | 🟢 Functional | 🟢 Functional | / |
+| Multi step scheduler | 🟢 Functional | 🟢 Functional | / |
+| Best of | 🟢 Functional | 🟢 Functional | / |
+| Beam search | 🟢 Functional | 🟢 Functional | / |
+| Guided Decoding | 🟢 Functional | 🟢 Functional | / |
+| Tensor Parallel               | 🟢 Functional  | ⚡ Optimized    | / |
+| Pipeline Parallel             | 🟢 Functional  | ⚡ Optimized    | / |
+| Expert Parallel               | 🟡 Planned     | 🟡 Planned      | Will be supported in v0.8.x |
+| Data Parallel                 | 🟡 Planned     | 🟡 Planned      | Will be supported in v0.8.x |
+| Prefill Decode Disaggregation | 🟢 Functional | 🟢 Functional | todo |
+| Quantization                  | 🟡 Planned     | 🟢 Functional   | Will be supported in v0.8.x |
+| Graph Mode                    | 🟡 Planned     | 🟡 Planned      | Will be supported in v0.8.x |
+| Sleep Mode                    | 🟢 Functional  | 🟢 Functional   | [Usage Limits (#733)](https://github.com/vllm-project/vllm-ascend/issues/733) |
+| MTP                           | 🟢 Functional  | 🟢 Functional   | [Usage Limits (#734)](https://github.com/vllm-project/vllm-ascend/issues/734) |
+| Custom Scheduler              | 🟢 Functional  | 🟢 Functional   | [Usage Limits (#788)](https://github.com/vllm-project/vllm-ascend/issues/788) |
diff --git a/setup.py b/setup.py
index 542af85151a..64c04af7d74 100644
--- a/setup.py
+++ b/setup.py
@@ -368,7 +368,7 @@ def _read_requirements(filename: str) -> List[str]:
install_requires=get_requirements(),
ext_modules=ext_modules,
cmdclass=cmdclass,
- extras_require={},
+ extras_require={"mindie_turbo": ["mindie-turbo==2.0rc1"]},
entry_points={
"vllm.platform_plugins": ["ascend = vllm_ascend:register"],
"vllm.general_plugins":
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index dda11198707..ad079e3e482 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -20,7 +20,9 @@
from vllm.logger import logger
-def try_register_lib(lib_name: str, lib_info: str = ""):
+def try_register_lib(lib_name: str,
+ lib_info: str = "",
+ exception_info: str = ""):
import importlib
import importlib.util
try:
@@ -30,4 +32,4 @@ def try_register_lib(lib_name: str, lib_info: str = ""):
if lib_info:
logger.info(lib_info)
except Exception:
- pass
+ logger.warning_once(exception_info)
diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py
index 147673754b8..120d343b7c9 100644
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -77,7 +77,8 @@ def __init__(
# Try to import mindie_turbo to accelerate vLLM inference.
try_register_lib(
"mindie_turbo",
- "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo."
+ "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo.",
+ "MindIE Turbo is installed but unable to `import mindie_turbo`, skip MindIE Turbo acceleration."
)
# distribute related config
self.parallel_config.rank = rank
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index cc300282f57..c1ed5b39d64 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -77,7 +77,8 @@ def __init__(self,
# Try to import mindie_turbo to accelerate vLLM inference.
try_register_lib(
"mindie_turbo",
- "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo."
+ "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo.",
+ "MindIE Turbo is installed but unable to `import mindie_turbo`, skip MindIE Turbo acceleration."
)
if self.cache_config.cache_dtype == "auto":