From 92f29cef77c46e6f9d003e5e72c742ffd97e4d53 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Tue, 3 Feb 2026 15:01:39 +0800 Subject: [PATCH] [doc] refactor tutorial Signed-off-by: wangxiyuan --- docs/source/index.md | 4 +- docs/source/installation.md | 20 +++++----- docs/source/quick_start.md | 8 ++-- docs/source/tutorials/features/index.md | 15 ++++++++ ...ng_sequence_context_parallel_multi_node.md | 8 ++-- ...g_sequence_context_parallel_single_node.md | 4 +- .../pd_colocated_mooncake_multi_instance.md | 0 .../pd_disaggregation_mooncake_multi_node.md | 0 .../pd_disaggregation_mooncake_single_node.md | 0 docs/source/tutorials/{ => features}/ray.md | 0 .../suffix_speculative_decoding.md | 0 docs/source/tutorials/{ => hardwares}/310p.md | 0 docs/source/tutorials/hardwares/index.md | 9 +++++ .../tutorials/{ => models}/DeepSeek-R1.md | 14 +++---- .../tutorials/{ => models}/DeepSeek-V3.1.md | 14 +++---- .../tutorials/{ => models}/DeepSeek-V3.2.md | 14 +++---- docs/source/tutorials/{ => models}/GLM4.x.md | 10 ++--- .../{ => models}/Kimi-K2-Thinking.md | 0 .../tutorials/{ => models}/PaddleOCR-VL.md | 2 +- .../tutorials/{ => models}/Qwen-VL-Dense.md | 8 ++-- .../tutorials/{ => models}/Qwen2.5-7B.md | 8 ++-- .../tutorials/{ => models}/Qwen2.5-Omni.md | 10 ++--- .../tutorials/{ => models}/Qwen3-235B-A22B.md | 18 ++++----- .../tutorials/{ => models}/Qwen3-30B-A3B.md | 0 .../tutorials/{ => models}/Qwen3-32B-W4A4.md | 0 .../tutorials/{ => models}/Qwen3-8B-W4A8.md | 0 .../{ => models}/Qwen3-Coder-30B-A3B.md | 10 ++--- .../tutorials/{ => models}/Qwen3-Dense.md | 12 +++--- .../tutorials/{ => models}/Qwen3-Next.md | 8 ++-- .../Qwen3-Omni-30B-A3B-Thinking.md | 4 +- .../Qwen3-VL-235B-A22B-Instruct.md | 18 ++++----- .../{ => models}/Qwen3-VL-30B-A3B-Instruct.md | 4 +- .../{ => models}/Qwen3-VL-Embedding.md | 6 +-- .../{ => models}/Qwen3-VL-Reranker.md | 6 +-- .../tutorials/{ => models}/Qwen3_embedding.md | 6 +-- .../tutorials/{ => models}/Qwen3_reranker.md | 6 +-- 
docs/source/tutorials/{ => models}/index.md | 24 ++---------- .../user_guide/feature_guide/quantization.md | 2 - .../support_matrix/supported_models.md | 38 +++++++++---------- 39 files changed, 159 insertions(+), 151 deletions(-) create mode 100644 docs/source/tutorials/features/index.md rename docs/source/tutorials/{ => features}/long_sequence_context_parallel_multi_node.md (94%) rename docs/source/tutorials/{ => features}/long_sequence_context_parallel_single_node.md (97%) rename docs/source/tutorials/{ => features}/pd_colocated_mooncake_multi_instance.md (100%) rename docs/source/tutorials/{ => features}/pd_disaggregation_mooncake_multi_node.md (100%) rename docs/source/tutorials/{ => features}/pd_disaggregation_mooncake_single_node.md (100%) rename docs/source/tutorials/{ => features}/ray.md (100%) rename docs/source/tutorials/{ => features}/suffix_speculative_decoding.md (100%) rename docs/source/tutorials/{ => hardwares}/310p.md (100%) create mode 100644 docs/source/tutorials/hardwares/index.md rename docs/source/tutorials/{ => models}/DeepSeek-R1.md (93%) rename docs/source/tutorials/{ => models}/DeepSeek-V3.1.md (97%) rename docs/source/tutorials/{ => models}/DeepSeek-V3.2.md (97%) rename docs/source/tutorials/{ => models}/GLM4.x.md (91%) rename docs/source/tutorials/{ => models}/Kimi-K2-Thinking.md (100%) rename docs/source/tutorials/{ => models}/PaddleOCR-VL.md (98%) rename docs/source/tutorials/{ => models}/Qwen-VL-Dense.md (97%) rename docs/source/tutorials/{ => models}/Qwen2.5-7B.md (91%) rename docs/source/tutorials/{ => models}/Qwen2.5-Omni.md (92%) rename docs/source/tutorials/{ => models}/Qwen3-235B-A22B.md (96%) rename docs/source/tutorials/{ => models}/Qwen3-30B-A3B.md (100%) rename docs/source/tutorials/{ => models}/Qwen3-32B-W4A4.md (100%) rename docs/source/tutorials/{ => models}/Qwen3-8B-W4A8.md (100%) rename docs/source/tutorials/{ => models}/Qwen3-Coder-30B-A3B.md (86%) rename docs/source/tutorials/{ => models}/Qwen3-Dense.md (97%) rename 
docs/source/tutorials/{ => models}/Qwen3-Next.md (92%) rename docs/source/tutorials/{ => models}/Qwen3-Omni-30B-A3B-Thinking.md (98%) rename docs/source/tutorials/{ => models}/Qwen3-VL-235B-A22B-Instruct.md (93%) rename docs/source/tutorials/{ => models}/Qwen3-VL-30B-A3B-Instruct.md (97%) rename docs/source/tutorials/{ => models}/Qwen3-VL-Embedding.md (96%) rename docs/source/tutorials/{ => models}/Qwen3-VL-Reranker.md (98%) rename docs/source/tutorials/{ => models}/Qwen3_embedding.md (96%) rename docs/source/tutorials/{ => models}/Qwen3_reranker.md (97%) rename docs/source/tutorials/{ => models}/index.md (54%) diff --git a/docs/source/index.md b/docs/source/index.md index 36776e71659..a8a2d28d41e 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -35,7 +35,9 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l :maxdepth: 1 quick_start installation -tutorials/index.md +tutorials/models/index +tutorials/features/index +tutorials/hardwares/index faqs ::: diff --git a/docs/source/installation.md b/docs/source/installation.md index 5b0cc2cf0f7..d316608e219 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -136,7 +136,7 @@ pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/si ```bash # For torch-npu dev version or x86 machine -pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/ https://mirrors.huaweicloud.com/ascend/repos/pypi" +pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/" ``` Then you can install `vllm` and `vllm-ascend` from **pre-built wheel**: @@ -187,12 +187,12 @@ Supported images as following. 
| image name | Hardware | OS | |-|-|-| -| image-tag | Atlas A2 | Ubuntu | -| image-tag-openeuler | Atlas A2 | openEuler | -| image-tag-a3 | Atlas A3 | Ubuntu | -| image-tag-a3-openeuler | Atlas A3 | openEuler | -| image-tag-310p | Atlas 300I | Ubuntu | -| image-tag-310p-openeuler | Atlas 300I | openEuler | +| vllm-ascend:<image-tag> | Atlas A2 | Ubuntu | +| vllm-ascend:<image-tag>-openeuler | Atlas A2 | openEuler | +| vllm-ascend:<image-tag>-a3 | Atlas A3 | Ubuntu | +| vllm-ascend:<image-tag>-a3-openeuler | Atlas A3 | openEuler | +| vllm-ascend:<image-tag>-310p | Atlas 300I | Ubuntu | +| vllm-ascend:<image-tag>-310p-openeuler | Atlas 300I | openEuler | :::{dropdown} Click here to see "Build from Dockerfile" or build IMAGE from **source code**: @@ -258,7 +258,7 @@ prompts = [ # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. -llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct") +llm = LLM(model="Qwen/Qwen3-0.6B") # Generate texts from the prompts. outputs = llm.generate(prompts, sampling_params) @@ -277,7 +277,7 @@ python example.py If you encounter a connection error with Hugging Face (e.g., `We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files.`), run the following commands to use ModelScope as an alternative: ```bash -export VLLM_USE_MODELSCOPE = true +export VLLM_USE_MODELSCOPE=true pip install modelscope python example.py ``` @@ -292,7 +292,7 @@ INFO 02-18 08:49:58 __init__.py:34] set environment variable VLLM_PLUGINS to con INFO 02-18 08:49:58 __init__.py:42] plugin ascend loaded. INFO 02-18 08:49:58 __init__.py:174] Platform plugin ascend is activated INFO 02-18 08:50:12 config.py:526] This model supports multiple tasks: {'embed', 'classify', 'generate', 'score', 'reward'}. Defaulting to 'generate'. 
-INFO 02-18 08:50:12 llm_engine.py:232] Initializing a V0 LLM engine (v0.7.1) with config: model='./Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='./Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=npu, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=./Qwen2.5-0.5B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False, +INFO 02-18 08:50:12 llm_engine.py:232] Initializing a V0 LLM engine (v0.7.1) with config: model='./Qwen3-0.6B', speculative_config=None, tokenizer='./Qwen3-0.6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=npu, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, 
collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=./Qwen3-0.6B, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False, Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00 ```bash # Deploy vLLM server (The first run will take about 3-5 mins (10 MB/s) to download models) -vllm serve Qwen/Qwen2.5-0.5B-Instruct & +vllm serve Qwen/Qwen3-0.6B & ``` If you see a log as below: @@ -166,7 +166,7 @@ You can also query the model with input prompts: curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "Qwen/Qwen2.5-0.5B-Instruct", + "model": "Qwen/Qwen3-0.6B", "prompt": "Beijing is a", "max_completion_tokens": 5, "temperature": 0 diff --git a/docs/source/tutorials/features/index.md b/docs/source/tutorials/features/index.md new file mode 100644 index 00000000000..b285fd2464b --- /dev/null +++ b/docs/source/tutorials/features/index.md @@ -0,0 +1,15 @@ +# Feature Tutorials + +This section provides tutorials for different features of vLLM Ascend. 
+ +:::{toctree} +:caption: Feature Tutorials +:maxdepth: 1 +pd_colocated_mooncake_multi_instance +pd_disaggregation_mooncake_single_node +pd_disaggregation_mooncake_multi_node +long_sequence_context_parallel_single_node +long_sequence_context_parallel_multi_node +suffix_speculative_decoding +ray +::: diff --git a/docs/source/tutorials/long_sequence_context_parallel_multi_node.md b/docs/source/tutorials/features/long_sequence_context_parallel_multi_node.md similarity index 94% rename from docs/source/tutorials/long_sequence_context_parallel_multi_node.md rename to docs/source/tutorials/features/long_sequence_context_parallel_multi_node.md index 3cbe67489f1..eb179d8e7fa 100644 --- a/docs/source/tutorials/long_sequence_context_parallel_multi_node.md +++ b/docs/source/tutorials/features/long_sequence_context_parallel_multi_node.md @@ -20,13 +20,13 @@ It is recommended to download the model weight to the shared directory of multip ### Verify Multi-node Communication -Refer to [verify multi-node communication environment](../installation.md#verify-multi-node-communication) to verify multi-node communication. +Refer to [verify multi-node communication environment](../../installation.md#verify-multi-node-communication) to verify multi-node communication. ### Installation You can use our official docker image to run `DeepSeek-V3.1` directly. -Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker). +Select an image based on your machine type and start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker). ```{code-block} bash :substitutions: @@ -331,7 +331,7 @@ Here are two accuracy evaluation methods. ### Using AISBench -1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details. +1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details. 2. 
After execution, you can get the result, here is the result of `DeepSeek-V3.1-w8a8` for reference only. @@ -343,7 +343,7 @@ Here are two accuracy evaluation methods. ### Using AISBench -Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. +Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. ### Using vLLM Benchmark diff --git a/docs/source/tutorials/long_sequence_context_parallel_single_node.md b/docs/source/tutorials/features/long_sequence_context_parallel_single_node.md similarity index 97% rename from docs/source/tutorials/long_sequence_context_parallel_single_node.md rename to docs/source/tutorials/features/long_sequence_context_parallel_single_node.md index cc49fa4248e..ef84706f0f6 100644 --- a/docs/source/tutorials/long_sequence_context_parallel_single_node.md +++ b/docs/source/tutorials/features/long_sequence_context_parallel_single_node.md @@ -139,7 +139,7 @@ Here are two accuracy evaluation methods. ### Using AISBench -1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details. +1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details. 2. After execution, you can get the result, here is the result of `Qwen3-235B-A22B-w8a8` for reference only. @@ -151,7 +151,7 @@ Here are two accuracy evaluation methods. ### Using AISBench -Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. +Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. 
### Using vLLM Benchmark diff --git a/docs/source/tutorials/pd_colocated_mooncake_multi_instance.md b/docs/source/tutorials/features/pd_colocated_mooncake_multi_instance.md similarity index 100% rename from docs/source/tutorials/pd_colocated_mooncake_multi_instance.md rename to docs/source/tutorials/features/pd_colocated_mooncake_multi_instance.md diff --git a/docs/source/tutorials/pd_disaggregation_mooncake_multi_node.md b/docs/source/tutorials/features/pd_disaggregation_mooncake_multi_node.md similarity index 100% rename from docs/source/tutorials/pd_disaggregation_mooncake_multi_node.md rename to docs/source/tutorials/features/pd_disaggregation_mooncake_multi_node.md diff --git a/docs/source/tutorials/pd_disaggregation_mooncake_single_node.md b/docs/source/tutorials/features/pd_disaggregation_mooncake_single_node.md similarity index 100% rename from docs/source/tutorials/pd_disaggregation_mooncake_single_node.md rename to docs/source/tutorials/features/pd_disaggregation_mooncake_single_node.md diff --git a/docs/source/tutorials/ray.md b/docs/source/tutorials/features/ray.md similarity index 100% rename from docs/source/tutorials/ray.md rename to docs/source/tutorials/features/ray.md diff --git a/docs/source/tutorials/suffix_speculative_decoding.md b/docs/source/tutorials/features/suffix_speculative_decoding.md similarity index 100% rename from docs/source/tutorials/suffix_speculative_decoding.md rename to docs/source/tutorials/features/suffix_speculative_decoding.md diff --git a/docs/source/tutorials/310p.md b/docs/source/tutorials/hardwares/310p.md similarity index 100% rename from docs/source/tutorials/310p.md rename to docs/source/tutorials/hardwares/310p.md diff --git a/docs/source/tutorials/hardwares/index.md b/docs/source/tutorials/hardwares/index.md new file mode 100644 index 00000000000..23f4740b079 --- /dev/null +++ b/docs/source/tutorials/hardwares/index.md @@ -0,0 +1,9 @@ +# Hardware Tutorials + +This section provides tutorials on different hardware 
supported by vLLM Ascend. + +:::{toctree} +:caption: Hardware Tutorials +:maxdepth: 1 +310p +::: diff --git a/docs/source/tutorials/DeepSeek-R1.md b/docs/source/tutorials/models/DeepSeek-R1.md similarity index 93% rename from docs/source/tutorials/DeepSeek-R1.md rename to docs/source/tutorials/models/DeepSeek-R1.md index 5371ec8c791..34e9ee00b60 100644 --- a/docs/source/tutorials/DeepSeek-R1.md +++ b/docs/source/tutorials/models/DeepSeek-R1.md @@ -7,9 +7,9 @@ This article takes the `DeepSeek-R1-W8A8` version as an example to introduce the ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. +Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. -Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration. +Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration. ## Environment Preparation @@ -21,13 +21,13 @@ It is recommended to download the model weight to the shared directory of multip ### Verify Multi-node Communication(Optional) -If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../installation.md#verify-multi-node-communication). +If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../../installation.md#verify-multi-node-communication). ### Installation You can use our official docker image to run `DeepSeek-R1-W8A8` directly. -Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker). +Select an image based on your machine type and start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker). 
```{code-block} bash :substitutions: @@ -254,7 +254,7 @@ Here are two accuracy evaluation methods. ### Using AISBench -1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details. +1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details. 2. After execution, you can get the result, here is the result of `DeepSeek-R1-W8A8` in `vllm-ascend:0.11.0rc2` for reference only. @@ -267,7 +267,7 @@ Here are two accuracy evaluation methods. As an example, take the `gsm8k` dataset as a test dataset, and run accuracy evaluation of `DeepSeek-R1-W8A8` in online mode. -1. Refer to [Using lm_eval](../developer_guide/evaluation/using_lm_eval.md) for `lm_eval` installation. +1. Refer to [Using lm_eval](../../developer_guide/evaluation/using_lm_eval.md) for `lm_eval` installation. 2. Run `lm_eval` to execute the accuracy evaluation. @@ -285,7 +285,7 @@ lm_eval \ ### Using AISBench -Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. +Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. ### Using vLLM Benchmark diff --git a/docs/source/tutorials/DeepSeek-V3.1.md b/docs/source/tutorials/models/DeepSeek-V3.1.md similarity index 97% rename from docs/source/tutorials/DeepSeek-V3.1.md rename to docs/source/tutorials/models/DeepSeek-V3.1.md index acac9c17263..3e067a667bb 100644 --- a/docs/source/tutorials/DeepSeek-V3.1.md +++ b/docs/source/tutorials/models/DeepSeek-V3.1.md @@ -16,9 +16,9 @@ This document will show the main verification steps of the model, including supp ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. 
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. -Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration. +Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration. ## Environment Preparation @@ -34,13 +34,13 @@ It is recommended to download the model weight to the shared directory of multip ### Verify Multi-node Communication(Optional) -If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../installation.md#verify-multi-node-communication). +If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../../installation.md#verify-multi-node-communication). ### Installation You can use our official docker image to run `DeepSeek-V3.1` directly. -Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker). +Select an image based on your machine type and start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker). ```{code-block} bash :substitutions: @@ -252,7 +252,7 @@ vllm serve /weights/DeepSeek-V3.1-w8a8-mtp-QuaRot \ ### Prefill-Decode Disaggregation -We recommend using Mooncake for deployment: [Mooncake](./pd_disaggregation_mooncake_multi_node.md). +We recommend using Mooncake for deployment: [Mooncake](../features/pd_disaggregation_mooncake_multi_node.md). Take Atlas 800 A3 (64G × 16) for example, we recommend to deploy 2P1D (4 nodes) rather than 1P1D (2 nodes), because there is no enough NPU memory to serve high concurrency in 1P1D case. @@ -672,7 +672,7 @@ Here are two accuracy evaluation methods. ### Using AISBench -1. 
Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details. +1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details. 2. After execution, you can get the result, here is the result of `DeepSeek-V3.1-w8a8-mtp-QuaRot` in `vllm-ascend:0.11.0rc1` for reference only. @@ -689,7 +689,7 @@ Not test yet. ### Using AISBench -Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. +Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. The performance result is: diff --git a/docs/source/tutorials/DeepSeek-V3.2.md b/docs/source/tutorials/models/DeepSeek-V3.2.md similarity index 97% rename from docs/source/tutorials/DeepSeek-V3.2.md rename to docs/source/tutorials/models/DeepSeek-V3.2.md index 528176f3ae9..9435aec4769 100644 --- a/docs/source/tutorials/DeepSeek-V3.2.md +++ b/docs/source/tutorials/models/DeepSeek-V3.2.md @@ -8,9 +8,9 @@ This document will show the main verification steps of the model, including supp ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. +Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. -Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration. +Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration. 
## Environment Preparation @@ -25,7 +25,7 @@ It is recommended to download the model weight to the shared directory of multip ### Verify Multi-node Communication(Optional) -If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../installation.md#verify-multi-node-communication). +If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../../installation.md#verify-multi-node-communication). ### Installation @@ -116,7 +116,7 @@ docker run --rm \ In addition, if you don't want to use the docker image as above, you can also build all from source: -- Install `vllm-ascend` from source, refer to [installation](../installation.md). +- Install `vllm-ascend` from source, refer to [installation](../../installation.md). If you want to deploy multi-node environment, you need to set up environment on each node. @@ -851,7 +851,7 @@ Here are two accuracy evaluation methods. ### Using AISBench -1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details. +1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details. 2. After execution, you can get the result. @@ -859,7 +859,7 @@ Here are two accuracy evaluation methods. As an example, take the `gsm8k` dataset as a test dataset, and run accuracy evaluation of `DeepSeek-V3.2-W8A8` in online mode. -1. Refer to [Using lm_eval](../developer_guide/evaluation/using_lm_eval.md) for `lm_eval` installation. +1. Refer to [Using lm_eval](../../developer_guide/evaluation/using_lm_eval.md) for `lm_eval` installation. 2. Run `lm_eval` to execute the accuracy evaluation. @@ -877,7 +877,7 @@ lm_eval \ ### Using AISBench -Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. 
+Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. The performance result is: diff --git a/docs/source/tutorials/GLM4.x.md b/docs/source/tutorials/models/GLM4.x.md similarity index 91% rename from docs/source/tutorials/GLM4.x.md rename to docs/source/tutorials/models/GLM4.x.md index 5a6d3178e2c..6cb96c90f16 100644 --- a/docs/source/tutorials/GLM4.x.md +++ b/docs/source/tutorials/models/GLM4.x.md @@ -10,9 +10,9 @@ This document will show the main verification steps of the model, including supp ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. +Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. -Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration. +Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration. ## Environment Preparation @@ -31,7 +31,7 @@ It is recommended to download the model weight to the shared directory of multip You can use our official docker image to run `GLM-4.x` directly. -Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker). +Select an image based on your machine type and start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker). ```{code-block} bash :substitutions: @@ -121,7 +121,7 @@ Here are two accuracy evaluation methods. ### Using AISBench -1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details. +1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details. 2. 
After execution, you can get the result, here is the result of `GLM4.6` in `vllm-ascend:main` (after `vllm-ascend:0.13.0rc1`) for reference only. @@ -138,7 +138,7 @@ Not test yet. ### Using AISBench -Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. +Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. ### Using vLLM Benchmark diff --git a/docs/source/tutorials/Kimi-K2-Thinking.md b/docs/source/tutorials/models/Kimi-K2-Thinking.md similarity index 100% rename from docs/source/tutorials/Kimi-K2-Thinking.md rename to docs/source/tutorials/models/Kimi-K2-Thinking.md diff --git a/docs/source/tutorials/PaddleOCR-VL.md b/docs/source/tutorials/models/PaddleOCR-VL.md similarity index 98% rename from docs/source/tutorials/PaddleOCR-VL.md rename to docs/source/tutorials/models/PaddleOCR-VL.md index 8ba020a7578..e73424bcf1f 100644 --- a/docs/source/tutorials/PaddleOCR-VL.md +++ b/docs/source/tutorials/models/PaddleOCR-VL.md @@ -24,7 +24,7 @@ It is recommended to download the model weights to a local directory (e.g., `./P You can use our official docker image to run `PaddleOCR-VL` directly. -Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker). +Select an image based on your machine type and start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker). 
```{code-block} bash :substitutions: diff --git a/docs/source/tutorials/Qwen-VL-Dense.md b/docs/source/tutorials/models/Qwen-VL-Dense.md similarity index 97% rename from docs/source/tutorials/Qwen-VL-Dense.md rename to docs/source/tutorials/models/Qwen-VL-Dense.md index 55f71a1259d..6426b796330 100644 --- a/docs/source/tutorials/Qwen-VL-Dense.md +++ b/docs/source/tutorials/models/Qwen-VL-Dense.md @@ -10,9 +10,9 @@ This tutorial uses the vLLM-Ascend `v0.11.0rc3-a3` version for demonstration, sh ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. +Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. -Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration. +Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration. ## Environment Preparation @@ -484,7 +484,7 @@ You can refer to the [monitoring configuration](https://github.com/vllm-project/ As an example, take the `mmmu_val` dataset as a test dataset, and run accuracy evaluation of `Qwen3-VL-8B-Instruct` in offline mode. -1. Refer to [Using lm_eval](../developer_guide/evaluation/using_lm_eval.md) for more details on `lm_eval` installation. +1. Refer to [Using lm_eval](../../developer_guide/evaluation/using_lm_eval.md) for more details on `lm_eval` installation. ```shell pip install lm_eval @@ -515,7 +515,7 @@ lm_eval \ As an example, take the `mmmu_val` dataset as a test dataset, and run accuracy evaluation of `Qwen2.5-VL-32B-Instruct` in offline mode. -1. Refer to [Using lm_eval](../developer_guide/evaluation/using_lm_eval.md) for more details on `lm_eval` installation. +1. Refer to [Using lm_eval](../../developer_guide/evaluation/using_lm_eval.md) for more details on `lm_eval` installation. 
```shell pip install lm_eval diff --git a/docs/source/tutorials/Qwen2.5-7B.md b/docs/source/tutorials/models/Qwen2.5-7B.md similarity index 91% rename from docs/source/tutorials/Qwen2.5-7B.md rename to docs/source/tutorials/models/Qwen2.5-7B.md index be92c1cdc6e..1ecdc765099 100644 --- a/docs/source/tutorials/Qwen2.5-7B.md +++ b/docs/source/tutorials/models/Qwen2.5-7B.md @@ -10,9 +10,9 @@ The `Qwen2.5-7B-Instruct` model was supported since `vllm-ascend:v0.9.0`. ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. +Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. -Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration. +Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration. ## Environment Preparation @@ -138,7 +138,7 @@ A valid response (e.g., `"Beijing is a vibrant and historic capital city"`) indi ### Using AISBench -Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details. +Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details. Results and logs are saved to `benchmark/outputs/default/`. A sample accuracy report is shown below: @@ -150,7 +150,7 @@ Results and logs are saved to `benchmark/outputs/default/`. A sample accuracy re ### Using AISBench -Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. +Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. 
### Using vLLM Benchmark diff --git a/docs/source/tutorials/Qwen2.5-Omni.md b/docs/source/tutorials/models/Qwen2.5-Omni.md similarity index 92% rename from docs/source/tutorials/Qwen2.5-Omni.md rename to docs/source/tutorials/models/Qwen2.5-Omni.md index ff5eecaa7f6..55436958021 100644 --- a/docs/source/tutorials/Qwen2.5-Omni.md +++ b/docs/source/tutorials/models/Qwen2.5-Omni.md @@ -8,9 +8,9 @@ The `Qwen2.5-Omni` model was supported since `vllm-ascend:v0.11.0rc0`. This docu ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. +Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. -Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration. +Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration. ## Environment Preparation @@ -25,7 +25,7 @@ Following examples use the 7B version by default. You can use our official docker image to run `Qwen2.5-Omni` directly. -Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker). +Select an image based on your machine type and start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker). ```{code-block} bash :substitutions: @@ -174,7 +174,7 @@ Qwen2.5-Omni on vllm-ascend has been test on AISBench. ### Using AISBench -1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details. +1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details. 2. After execution, you can get the result, here is the result of `Qwen2.5-Omni-7B` with `vllm-ascend:0.11.0rc0` for reference only. @@ -187,7 +187,7 @@ Qwen2.5-Omni on vllm-ascend has been test on AISBench. 
### Using AISBench -Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. +Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. ### Using vLLM Benchmark diff --git a/docs/source/tutorials/Qwen3-235B-A22B.md b/docs/source/tutorials/models/Qwen3-235B-A22B.md similarity index 96% rename from docs/source/tutorials/Qwen3-235B-A22B.md rename to docs/source/tutorials/models/Qwen3-235B-A22B.md index 3a2c0a9f738..5ab6f8e62b0 100644 --- a/docs/source/tutorials/Qwen3-235B-A22B.md +++ b/docs/source/tutorials/models/Qwen3-235B-A22B.md @@ -10,9 +10,9 @@ The `Qwen3-235B-A22B` model is first supported in `vllm-ascend:v0.8.4rc2`. ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. +Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. -Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration. +Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration. ## Environment Preparation @@ -25,7 +25,7 @@ It is recommended to download the model weight to the shared directory of multip ### Verify Multi-node Communication(Optional) -If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../installation.md#verify-multi-node-communication). +If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../../installation.md#verify-multi-node-communication). 
### Installation @@ -34,7 +34,7 @@ If you want to deploy multi-node environment, you need to verify multi-node comm For example, using images `quay.io/ascend/vllm-ascend:v0.11.0rc2`(for Atlas 800 A2) and `quay.io/ascend/vllm-ascend:v0.11.0rc2-a3`(for Atlas 800 A3). -Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker). +Select an image based on your machine type and start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker). ```{code-block} bash :substitutions: @@ -76,7 +76,7 @@ Select an image based on your machine type and start the docker image on your no You can build all from source. -- Install `vllm-ascend`, refer to [set up using python](../installation.md#set-up-using-python). +- Install `vllm-ascend`, refer to [set up using python](../../installation.md#set-up-using-python). :::: ::::: @@ -253,11 +253,11 @@ INFO: Application startup complete. ### Multi-node Deployment with Ray -- refer to [Ray Distributed (Qwen/Qwen3-235B-A22B)](./ray.md). +- refer to [Ray Distributed (Qwen/Qwen3-235B-A22B)](../features/ray.md). ### Prefill-Decode Disaggregation -- refer to [Prefill-Decode Disaggregation Mooncake Verification (Qwen)](./pd_disaggregation_mooncake_multi_node.md) +- refer to [Prefill-Decode Disaggregation Mooncake Verification (Qwen)](../features/pd_disaggregation_mooncake_multi_node.md) ## Functional Verification @@ -280,7 +280,7 @@ Here are two accuracy evaluation methods. ### Using AISBench -1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details. +1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details. 2. After execution, you can get the result, here is the result of `Qwen3-235B-A22B-w8a8` in `vllm-ascend:0.11.0rc0` for reference only. @@ -292,7 +292,7 @@ Here are two accuracy evaluation methods. 
### Using AISBench -Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. +Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. ### Using vLLM Benchmark diff --git a/docs/source/tutorials/Qwen3-30B-A3B.md b/docs/source/tutorials/models/Qwen3-30B-A3B.md similarity index 100% rename from docs/source/tutorials/Qwen3-30B-A3B.md rename to docs/source/tutorials/models/Qwen3-30B-A3B.md diff --git a/docs/source/tutorials/Qwen3-32B-W4A4.md b/docs/source/tutorials/models/Qwen3-32B-W4A4.md similarity index 100% rename from docs/source/tutorials/Qwen3-32B-W4A4.md rename to docs/source/tutorials/models/Qwen3-32B-W4A4.md diff --git a/docs/source/tutorials/Qwen3-8B-W4A8.md b/docs/source/tutorials/models/Qwen3-8B-W4A8.md similarity index 100% rename from docs/source/tutorials/Qwen3-8B-W4A8.md rename to docs/source/tutorials/models/Qwen3-8B-W4A8.md diff --git a/docs/source/tutorials/Qwen3-Coder-30B-A3B.md b/docs/source/tutorials/models/Qwen3-Coder-30B-A3B.md similarity index 86% rename from docs/source/tutorials/Qwen3-Coder-30B-A3B.md rename to docs/source/tutorials/models/Qwen3-Coder-30B-A3B.md index 03fb0085b72..8a627f5899e 100644 --- a/docs/source/tutorials/Qwen3-Coder-30B-A3B.md +++ b/docs/source/tutorials/models/Qwen3-Coder-30B-A3B.md @@ -8,9 +8,9 @@ This document will show the main verification steps of the model, including supp ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. +Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. -Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration. 
+Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration. ## Environment Preparation @@ -52,7 +52,7 @@ docker run --rm \ In addition, if you don't want to use the docker image as above, you can also build all from source: -- Install `vllm-ascend` from source, refer to [installation](../installation.md). +- Install `vllm-ascend` from source, refer to [installation](../../installation.md). ## Deployment @@ -90,7 +90,7 @@ curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/jso ### Using AISBench -1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details. +1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details. 2. After execution, you can get the result, here is the result of `Qwen3-Coder-30B-A3B-Instruct` in `vllm-ascend:0.11.0rc0` for reference only. @@ -102,4 +102,4 @@ curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/jso ### Using AISBench -Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. +Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. diff --git a/docs/source/tutorials/Qwen3-Dense.md b/docs/source/tutorials/models/Qwen3-Dense.md similarity index 97% rename from docs/source/tutorials/Qwen3-Dense.md rename to docs/source/tutorials/models/Qwen3-Dense.md index 413d125d37d..70814d7a39d 100644 --- a/docs/source/tutorials/Qwen3-Dense.md +++ b/docs/source/tutorials/models/Qwen3-Dense.md @@ -16,9 +16,9 @@ This example requires version **v0.11.0rc2**. Earlier versions may lack certain ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. 
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. -Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration. +Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration. ## Environment Preparation @@ -38,7 +38,7 @@ It is recommended to download the model weight to the shared directory of multip ### Verify Multi-node Communication(Optional) -If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../installation.md#verify-multi-node-communication). +If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../../installation.md#verify-multi-node-communication). ### Installation @@ -97,7 +97,7 @@ In the [Run docker container](./Qwen3-Dense.md#run-docker-container), detailed e In addition, if you don't want to use the docker image as above, you can also build all from source: -- Install `vllm-ascend` from source, refer to [installation](../installation.md). +- Install `vllm-ascend` from source, refer to [installation](../../installation.md). If you want to deploy multi-node environment, you need to set up environment on each node. @@ -269,7 +269,7 @@ Here is one accuracy evaluation methods. ### Using AISBench -1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details. +1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details. 2. After execution, you can get the result, here is the result of `Qwen3-32B-W8A8` in `vllm-ascend:0.11.0rc2` for reference only. @@ -283,7 +283,7 @@ Here is one accuracy evaluation methods. 
### Using AISBench -Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. +Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. ### Using vLLM Benchmark diff --git a/docs/source/tutorials/Qwen3-Next.md b/docs/source/tutorials/models/Qwen3-Next.md similarity index 92% rename from docs/source/tutorials/Qwen3-Next.md rename to docs/source/tutorials/models/Qwen3-Next.md index 3c92f56518d..d0f5becbc56 100644 --- a/docs/source/tutorials/Qwen3-Next.md +++ b/docs/source/tutorials/models/Qwen3-Next.md @@ -10,9 +10,9 @@ The `Qwen3-Next` model is first supported in `vllm-ascend:v0.10.2rc1`. ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. +Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. -Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration. +Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration. ## Weight Preparation @@ -134,7 +134,7 @@ Prompt: 'Who are you?', Generated text: ' What do you know about me?\n\nHello! I ### Using AISBench -1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details. +1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details. 2. After execution, you can get the result, here is the result of `Qwen3-Next-80B-A3B-Instruct` in `vllm-ascend:0.13.0rc1` for reference only. @@ -146,7 +146,7 @@ Prompt: 'Who are you?', Generated text: ' What do you know about me?\n\nHello! I ### Using AISBench -Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. 
+Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. ### Using vLLM Benchmark diff --git a/docs/source/tutorials/Qwen3-Omni-30B-A3B-Thinking.md b/docs/source/tutorials/models/Qwen3-Omni-30B-A3B-Thinking.md similarity index 98% rename from docs/source/tutorials/Qwen3-Omni-30B-A3B-Thinking.md rename to docs/source/tutorials/models/Qwen3-Omni-30B-A3B-Thinking.md index 578d91a79e2..c2be1f3a13f 100644 --- a/docs/source/tutorials/Qwen3-Omni-30B-A3B-Thinking.md +++ b/docs/source/tutorials/models/Qwen3-Omni-30B-A3B-Thinking.md @@ -26,7 +26,7 @@ It is recommended to download the model weight to the shared directory of multip You can use our official docker image to run Qwen3-Omni-30B-A3B-Thinking directly -Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker). +Select an image based on your machine type and start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker). ```{code-block} bash :substitutions: @@ -63,7 +63,7 @@ docker run --rm \ You can build all from source. -- Install `vllm-ascend`, refer to [set up using python](../installation.md#set-up-using-python). +- Install `vllm-ascend`, refer to [set up using python](../../installation.md#set-up-using-python). 
:::: ::::: diff --git a/docs/source/tutorials/Qwen3-VL-235B-A22B-Instruct.md b/docs/source/tutorials/models/Qwen3-VL-235B-A22B-Instruct.md similarity index 93% rename from docs/source/tutorials/Qwen3-VL-235B-A22B-Instruct.md rename to docs/source/tutorials/models/Qwen3-VL-235B-A22B-Instruct.md index 55a5d28077f..8107315d8ee 100644 --- a/docs/source/tutorials/Qwen3-VL-235B-A22B-Instruct.md +++ b/docs/source/tutorials/models/Qwen3-VL-235B-A22B-Instruct.md @@ -10,9 +10,9 @@ This tutorial uses the vLLM-Ascend `v0.11.0rc2` version for demonstration, sho ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. +Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. -Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration. +Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration. ## Environment Preparation @@ -24,7 +24,7 @@ It is recommended to download the model weight to the shared directory of multip ### Verify Multi-node Communication(Optional) -If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../installation.md#verify-multi-node-communication). +If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../../installation.md#verify-multi-node-communication). ### Installation @@ -33,7 +33,7 @@ If you want to deploy multi-node environment, you need to verify multi-node comm For example, using images `quay.io/ascend/vllm-ascend:v0.11.0rc2`(for Atlas 800 A2) and `quay.io/ascend/vllm-ascend:v0.11.0rc2-a3`(for Atlas 800 A3). 
-Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker). +Select an image based on your machine type and start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker). ```{code-block} bash :substitutions: @@ -76,7 +76,7 @@ Select an image based on your machine type and start the docker image on your no You can build all from source. -- Install `vllm-ascend`, refer to [set up using python](../installation.md#set-up-using-python). +- Install `vllm-ascend`, refer to [set up using python](../../installation.md#set-up-using-python). :::: ::::: @@ -209,11 +209,11 @@ INFO: Application startup complete. ### Multi-node Deployment with Ray -- refer to [Ray Distributed (Qwen/Qwen3-235B-A22B)](./ray.md). +- refer to [Ray Distributed (Qwen/Qwen3-235B-A22B)](../features/ray.md). ### Prefill-Decode Disaggregation -- refer to [Prefill-Decode Disaggregation Mooncake Verification](./pd_disaggregation_mooncake_multi_node.md) +- refer to [Prefill-Decode Disaggregation Mooncake Verification](../features/pd_disaggregation_mooncake_multi_node.md) ## Functional Verification @@ -240,7 +240,7 @@ Here are two accuracy evaluation methods. ### Using AISBench -1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details. +1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details. 2. After execution, you can get the result, here is the result of `Qwen3-VL-235B-A22B-Instruct` in `vllm-ascend:0.11.0rc2` for reference only. @@ -252,7 +252,7 @@ Here are two accuracy evaluation methods. ### Using AISBench -Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. +Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details. 
### Using vLLM Benchmark diff --git a/docs/source/tutorials/Qwen3-VL-30B-A3B-Instruct.md b/docs/source/tutorials/models/Qwen3-VL-30B-A3B-Instruct.md similarity index 97% rename from docs/source/tutorials/Qwen3-VL-30B-A3B-Instruct.md rename to docs/source/tutorials/models/Qwen3-VL-30B-A3B-Instruct.md index d1368d8460b..28c52eb2a6f 100644 --- a/docs/source/tutorials/Qwen3-VL-30B-A3B-Instruct.md +++ b/docs/source/tutorials/models/Qwen3-VL-30B-A3B-Instruct.md @@ -8,8 +8,8 @@ This document will show the main verification steps of the `Qwen3-VL-30B-A3B-Ins ## Supported Features -- Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. -- Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration. +- Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. +- Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration. ## Environment Preparation diff --git a/docs/source/tutorials/Qwen3-VL-Embedding.md b/docs/source/tutorials/models/Qwen3-VL-Embedding.md similarity index 96% rename from docs/source/tutorials/Qwen3-VL-Embedding.md rename to docs/source/tutorials/models/Qwen3-VL-Embedding.md index d39aed9ce5e..a6694fc90f7 100644 --- a/docs/source/tutorials/Qwen3-VL-Embedding.md +++ b/docs/source/tutorials/models/Qwen3-VL-Embedding.md @@ -6,7 +6,7 @@ The Qwen3-VL-Embedding and Qwen3-VL-Reranker model series are the latest additio ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. +Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. 
## Environment Preparation @@ -21,11 +21,11 @@ It is recommended to download the model weight to the shared directory of multip You can use our official docker image to run `Qwen3-VL-Embedding` series models. -- Start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker). +- Start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker). If you don't want to use the docker image as above, you can also build all from source: -- Install `vllm-ascend` from source, refer to [installation](../installation.md). +- Install `vllm-ascend` from source, refer to [installation](../../installation.md). ## Deployment diff --git a/docs/source/tutorials/Qwen3-VL-Reranker.md b/docs/source/tutorials/models/Qwen3-VL-Reranker.md similarity index 98% rename from docs/source/tutorials/Qwen3-VL-Reranker.md rename to docs/source/tutorials/models/Qwen3-VL-Reranker.md index 740e1a1ca6e..deba9104f27 100644 --- a/docs/source/tutorials/Qwen3-VL-Reranker.md +++ b/docs/source/tutorials/models/Qwen3-VL-Reranker.md @@ -6,7 +6,7 @@ The Qwen3-VL-Embedding and Qwen3-VL-Reranker model series are the latest additio ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. +Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. ## Environment Preparation @@ -21,11 +21,11 @@ It is recommended to download the model weight to the shared directory of multip You can use our official docker image to run `Qwen3-VL-Reranker` series models. -- Start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker). +- Start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker). 
If you don't want to use the docker image as above, you can also build all from source: -- Install `vllm-ascend` from source, refer to [installation](../installation.md). +- Install `vllm-ascend` from source, refer to [installation](../../installation.md). ## Deployment diff --git a/docs/source/tutorials/Qwen3_embedding.md b/docs/source/tutorials/models/Qwen3_embedding.md similarity index 96% rename from docs/source/tutorials/Qwen3_embedding.md rename to docs/source/tutorials/models/Qwen3_embedding.md index d2369ba934e..7e490e7aa2e 100644 --- a/docs/source/tutorials/Qwen3_embedding.md +++ b/docs/source/tutorials/models/Qwen3_embedding.md @@ -6,7 +6,7 @@ The Qwen3 Embedding model series is the latest proprietary model of the Qwen fam ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. +Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. ## Environment Preparation @@ -22,11 +22,11 @@ It is recommended to download the model weight to the shared directory of multip You can use our official docker image to run `Qwen3-Embedding` series models. -- Start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker). +- Start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker). if you don't want to use the docker image as above, you can also build all from source: -- Install `vllm-ascend` from source, refer to [installation](../installation.md). +- Install `vllm-ascend` from source, refer to [installation](../../installation.md). 
## Deployment diff --git a/docs/source/tutorials/Qwen3_reranker.md b/docs/source/tutorials/models/Qwen3_reranker.md similarity index 97% rename from docs/source/tutorials/Qwen3_reranker.md rename to docs/source/tutorials/models/Qwen3_reranker.md index d8ef6a15a59..94c1c8b60fe 100644 --- a/docs/source/tutorials/Qwen3_reranker.md +++ b/docs/source/tutorials/models/Qwen3_reranker.md @@ -6,7 +6,7 @@ The Qwen3 Reranker model series is the latest proprietary model of the Qwen fami ## Supported Features -Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. +Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix. ## Environment Preparation @@ -22,11 +22,11 @@ It is recommended to download the model weight to the shared directory of multip You can use our official docker image to run `Qwen3-Reranker` series models. -- Start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker). +- Start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker). if you don't want to use the docker image as above, you can also build all from source: -- Install `vllm-ascend` from source, refer to [installation](../installation.md). +- Install `vllm-ascend` from source, refer to [installation](../../installation.md). ## Deployment diff --git a/docs/source/tutorials/index.md b/docs/source/tutorials/models/index.md similarity index 54% rename from docs/source/tutorials/index.md rename to docs/source/tutorials/models/index.md index ea344fde5b0..78181feda27 100644 --- a/docs/source/tutorials/index.md +++ b/docs/source/tutorials/models/index.md @@ -1,7 +1,9 @@ -# Tutorials +# Model Tutorials + +This section provides tutorials for different models of vLLM Ascend. 
:::{toctree} -:caption: Models +:caption: Model Tutorials :maxdepth: 1 Qwen2.5-Omni.md Qwen2.5-7B.md @@ -27,21 +29,3 @@ GLM4.x.md Kimi-K2-Thinking.md PaddleOCR-VL.md ::: - -:::{toctree} -:caption: Features -:maxdepth: 1 -pd_colocated_mooncake_multi_instance.md -pd_disaggregation_mooncake_single_node.md -pd_disaggregation_mooncake_multi_node.md -long_sequence_context_parallel_single_node.md -long_sequence_context_parallel_multi_node.md -suffix_speculative_decoding.md -ray -::: - -:::{toctree} -:caption: Hardware -:maxdepth: 1 -310p.md -::: diff --git a/docs/source/user_guide/feature_guide/quantization.md b/docs/source/user_guide/feature_guide/quantization.md index 90a2cf901b6..ecf665882de 100644 --- a/docs/source/user_guide/feature_guide/quantization.md +++ b/docs/source/user_guide/feature_guide/quantization.md @@ -155,8 +155,6 @@ python -m vllm.entrypoints.api_server \ --quantization ascend ``` -The above commands are for reference only. For more details, consult the [official guide](../../tutorials/index.md). - ## References - [ModelSlim Documentation](https://gitcode.com/Ascend/msit/blob/master/msmodelslim/README.md) diff --git a/docs/source/user_guide/support_matrix/supported_models.md b/docs/source/user_guide/support_matrix/supported_models.md index 3dbd9d66f1f..1f70c5cb2f5 100644 --- a/docs/source/user_guide/support_matrix/supported_models.md +++ b/docs/source/user_guide/support_matrix/supported_models.md @@ -16,16 +16,16 @@ Get the latest info here: