From 92f29cef77c46e6f9d003e5e72c742ffd97e4d53 Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Tue, 3 Feb 2026 15:01:39 +0800
Subject: [PATCH] [doc] refactor tutorial

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
---
 docs/source/index.md                          |  4 +-
 docs/source/installation.md                   | 20 +++++-----
 docs/source/quick_start.md                    |  8 ++--
 docs/source/tutorials/features/index.md       | 15 ++++++++
 ...ng_sequence_context_parallel_multi_node.md |  8 ++--
 ...g_sequence_context_parallel_single_node.md |  4 +-
 .../pd_colocated_mooncake_multi_instance.md   |  0
 .../pd_disaggregation_mooncake_multi_node.md  |  0
 .../pd_disaggregation_mooncake_single_node.md |  0
 docs/source/tutorials/{ => features}/ray.md   |  0
 .../suffix_speculative_decoding.md            |  0
 docs/source/tutorials/{ => hardwares}/310p.md |  0
 docs/source/tutorials/hardwares/index.md      |  9 +++++
 .../tutorials/{ => models}/DeepSeek-R1.md     | 14 +++----
 .../tutorials/{ => models}/DeepSeek-V3.1.md   | 14 +++----
 .../tutorials/{ => models}/DeepSeek-V3.2.md   | 14 +++----
 docs/source/tutorials/{ => models}/GLM4.x.md  | 10 ++---
 .../{ => models}/Kimi-K2-Thinking.md          |  0
 .../tutorials/{ => models}/PaddleOCR-VL.md    |  2 +-
 .../tutorials/{ => models}/Qwen-VL-Dense.md   |  8 ++--
 .../tutorials/{ => models}/Qwen2.5-7B.md      |  8 ++--
 .../tutorials/{ => models}/Qwen2.5-Omni.md    | 10 ++---
 .../tutorials/{ => models}/Qwen3-235B-A22B.md | 18 ++++-----
 .../tutorials/{ => models}/Qwen3-30B-A3B.md   |  0
 .../tutorials/{ => models}/Qwen3-32B-W4A4.md  |  0
 .../tutorials/{ => models}/Qwen3-8B-W4A8.md   |  0
 .../{ => models}/Qwen3-Coder-30B-A3B.md       | 10 ++---
 .../tutorials/{ => models}/Qwen3-Dense.md     | 12 +++---
 .../tutorials/{ => models}/Qwen3-Next.md      |  8 ++--
 .../Qwen3-Omni-30B-A3B-Thinking.md            |  4 +-
 .../Qwen3-VL-235B-A22B-Instruct.md            | 18 ++++-----
 .../{ => models}/Qwen3-VL-30B-A3B-Instruct.md |  4 +-
 .../{ => models}/Qwen3-VL-Embedding.md        |  6 +--
 .../{ => models}/Qwen3-VL-Reranker.md         |  6 +--
 .../tutorials/{ => models}/Qwen3_embedding.md |  6 +--
 .../tutorials/{ => models}/Qwen3_reranker.md  |  6 +--
 docs/source/tutorials/{ => models}/index.md   | 24 ++----------
 .../user_guide/feature_guide/quantization.md  |  2 -
 .../support_matrix/supported_models.md        | 38 +++++++++----------
 39 files changed, 159 insertions(+), 151 deletions(-)
 create mode 100644 docs/source/tutorials/features/index.md
 rename docs/source/tutorials/{ => features}/long_sequence_context_parallel_multi_node.md (94%)
 rename docs/source/tutorials/{ => features}/long_sequence_context_parallel_single_node.md (97%)
 rename docs/source/tutorials/{ => features}/pd_colocated_mooncake_multi_instance.md (100%)
 rename docs/source/tutorials/{ => features}/pd_disaggregation_mooncake_multi_node.md (100%)
 rename docs/source/tutorials/{ => features}/pd_disaggregation_mooncake_single_node.md (100%)
 rename docs/source/tutorials/{ => features}/ray.md (100%)
 rename docs/source/tutorials/{ => features}/suffix_speculative_decoding.md (100%)
 rename docs/source/tutorials/{ => hardwares}/310p.md (100%)
 create mode 100644 docs/source/tutorials/hardwares/index.md
 rename docs/source/tutorials/{ => models}/DeepSeek-R1.md (93%)
 rename docs/source/tutorials/{ => models}/DeepSeek-V3.1.md (97%)
 rename docs/source/tutorials/{ => models}/DeepSeek-V3.2.md (97%)
 rename docs/source/tutorials/{ => models}/GLM4.x.md (91%)
 rename docs/source/tutorials/{ => models}/Kimi-K2-Thinking.md (100%)
 rename docs/source/tutorials/{ => models}/PaddleOCR-VL.md (98%)
 rename docs/source/tutorials/{ => models}/Qwen-VL-Dense.md (97%)
 rename docs/source/tutorials/{ => models}/Qwen2.5-7B.md (91%)
 rename docs/source/tutorials/{ => models}/Qwen2.5-Omni.md (92%)
 rename docs/source/tutorials/{ => models}/Qwen3-235B-A22B.md (96%)
 rename docs/source/tutorials/{ => models}/Qwen3-30B-A3B.md (100%)
 rename docs/source/tutorials/{ => models}/Qwen3-32B-W4A4.md (100%)
 rename docs/source/tutorials/{ => models}/Qwen3-8B-W4A8.md (100%)
 rename docs/source/tutorials/{ => models}/Qwen3-Coder-30B-A3B.md (86%)
 rename docs/source/tutorials/{ => models}/Qwen3-Dense.md (97%)
 rename docs/source/tutorials/{ => models}/Qwen3-Next.md (92%)
 rename docs/source/tutorials/{ => models}/Qwen3-Omni-30B-A3B-Thinking.md (98%)
 rename docs/source/tutorials/{ => models}/Qwen3-VL-235B-A22B-Instruct.md (93%)
 rename docs/source/tutorials/{ => models}/Qwen3-VL-30B-A3B-Instruct.md (97%)
 rename docs/source/tutorials/{ => models}/Qwen3-VL-Embedding.md (96%)
 rename docs/source/tutorials/{ => models}/Qwen3-VL-Reranker.md (98%)
 rename docs/source/tutorials/{ => models}/Qwen3_embedding.md (96%)
 rename docs/source/tutorials/{ => models}/Qwen3_reranker.md (97%)
 rename docs/source/tutorials/{ => models}/index.md (54%)

diff --git a/docs/source/index.md b/docs/source/index.md
index 36776e71659..a8a2d28d41e 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -35,7 +35,9 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
 :maxdepth: 1
 quick_start
 installation
-tutorials/index.md
+tutorials/models/index
+tutorials/features/index
+tutorials/hardwares/index
 faqs
 :::
 
diff --git a/docs/source/installation.md b/docs/source/installation.md
index 5b0cc2cf0f7..d316608e219 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -136,7 +136,7 @@ pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/si
 
 ```bash
 # For torch-npu dev version or x86 machine
-pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/ https://mirrors.huaweicloud.com/ascend/repos/pypi"
+pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/"
 ```
 
 Then you can install `vllm` and `vllm-ascend` from **pre-built wheel**:
@@ -187,12 +187,12 @@ Supported images as following.
 
 | image name | Hardware | OS |
 |-|-|-|
-| image-tag | Atlas A2 | Ubuntu |
-| image-tag-openeuler | Atlas A2 | openEuler |
-| image-tag-a3 | Atlas A3 | Ubuntu |
-| image-tag-a3-openeuler | Atlas A3 | openEuler |
-| image-tag-310p | Atlas 300I | Ubuntu |
-| image-tag-310p-openeuler | Atlas 300I | openEuler |
+| `vllm-ascend:<image-tag>` | Atlas A2 | Ubuntu |
+| `vllm-ascend:<image-tag>-openeuler` | Atlas A2 | openEuler |
+| `vllm-ascend:<image-tag>-a3` | Atlas A3 | Ubuntu |
+| `vllm-ascend:<image-tag>-a3-openeuler` | Atlas A3 | openEuler |
+| `vllm-ascend:<image-tag>-310p` | Atlas 300I | Ubuntu |
+| `vllm-ascend:<image-tag>-310p-openeuler` | Atlas 300I | openEuler |
 
 :::{dropdown} Click here to see "Build from Dockerfile"
 or build IMAGE from **source code**:
@@ -258,7 +258,7 @@ prompts = [
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 # Create an LLM.
-llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
+llm = LLM(model="Qwen/Qwen3-0.6B")
 
 # Generate texts from the prompts.
 outputs = llm.generate(prompts, sampling_params)
@@ -277,7 +277,7 @@ python example.py
 If you encounter a connection error with Hugging Face (e.g., `We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files.`), run the following commands to use ModelScope as an alternative:
 
 ```bash
-export VLLM_USE_MODELSCOPE = true
+export VLLM_USE_MODELSCOPE=true
 pip install modelscope
 python example.py
 ```
@@ -292,7 +292,7 @@ INFO 02-18 08:49:58 __init__.py:34] set environment variable VLLM_PLUGINS to con
 INFO 02-18 08:49:58 __init__.py:42] plugin ascend loaded.
 INFO 02-18 08:49:58 __init__.py:174] Platform plugin ascend is activated
 INFO 02-18 08:50:12 config.py:526] This model supports multiple tasks: {'embed', 'classify', 'generate', 'score', 'reward'}. Defaulting to 'generate'.
-INFO 02-18 08:50:12 llm_engine.py:232] Initializing a V0 LLM engine (v0.7.1) with config: model='./Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='./Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=npu, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=./Qwen2.5-0.5B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False,
+INFO 02-18 08:50:12 llm_engine.py:232] Initializing a V0 LLM engine (v0.7.1) with config: model='./Qwen3-0.6B', speculative_config=None, tokenizer='./Qwen3-0.6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=npu, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=./Qwen3-0.6B, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False,
 Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
 Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  5.86it/s]
 Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  5.85it/s]
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index 97133676f79..4553108a014 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -114,7 +114,7 @@ prompts = [
 ]
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 # The first run will take about 3-5 mins (10 MB/s) to download models
-llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
+llm = LLM(model="Qwen/Qwen3-0.6B")
 
 outputs = llm.generate(prompts, sampling_params)
 
@@ -130,13 +130,13 @@ for output in outputs:
 
 vLLM can also be deployed as a server that implements the OpenAI API protocol. Run
 the following command to start the vLLM server with the
-[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) model:
+[Qwen/Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B) model:
 
 <!-- tests/e2e/doctest/001-quickstart-test.sh should be considered updating as well -->
 
 ```bash
 # Deploy vLLM server (The first run will take about 3-5 mins (10 MB/s) to download models)
-vllm serve Qwen/Qwen2.5-0.5B-Instruct &
+vllm serve Qwen/Qwen3-0.6B &
 ```
 
 If you see a log as below:
@@ -166,7 +166,7 @@ You can also query the model with input prompts:
 curl http://localhost:8000/v1/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "model": "Qwen/Qwen2.5-0.5B-Instruct",
+        "model": "Qwen/Qwen3-0.6B",
         "prompt": "Beijing is a",
         "max_completion_tokens": 5,
         "temperature": 0
diff --git a/docs/source/tutorials/features/index.md b/docs/source/tutorials/features/index.md
new file mode 100644
index 00000000000..b285fd2464b
--- /dev/null
+++ b/docs/source/tutorials/features/index.md
@@ -0,0 +1,15 @@
+# Feature Tutorials
+
+This section provides tutorials for different features of vLLM Ascend.
+
+:::{toctree}
+:caption: Feature Tutorials
+:maxdepth: 1
+pd_colocated_mooncake_multi_instance
+pd_disaggregation_mooncake_single_node
+pd_disaggregation_mooncake_multi_node
+long_sequence_context_parallel_single_node
+long_sequence_context_parallel_multi_node
+suffix_speculative_decoding
+ray
+:::
diff --git a/docs/source/tutorials/long_sequence_context_parallel_multi_node.md b/docs/source/tutorials/features/long_sequence_context_parallel_multi_node.md
similarity index 94%
rename from docs/source/tutorials/long_sequence_context_parallel_multi_node.md
rename to docs/source/tutorials/features/long_sequence_context_parallel_multi_node.md
index 3cbe67489f1..eb179d8e7fa 100644
--- a/docs/source/tutorials/long_sequence_context_parallel_multi_node.md
+++ b/docs/source/tutorials/features/long_sequence_context_parallel_multi_node.md
@@ -20,13 +20,13 @@ It is recommended to download the model weight to the shared directory of multip
 
 ### Verify Multi-node Communication
 
-Refer to [verify multi-node communication environment](../installation.md#verify-multi-node-communication) to verify multi-node communication.
+Refer to [verify multi-node communication environment](../../installation.md#verify-multi-node-communication) to verify multi-node communication.
 
 ### Installation
 
 You can use our official docker image to run `DeepSeek-V3.1` directly.
 
-Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker).
+Select an image based on your machine type and start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker).
 
 ```{code-block} bash
    :substitutions:
@@ -331,7 +331,7 @@ Here are two accuracy evaluation methods.
 
 ### Using AISBench
 
-1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details.
+1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
 2. After execution, you can get the result, here is the result of `DeepSeek-V3.1-w8a8` for reference only.
 
@@ -343,7 +343,7 @@ Here are two accuracy evaluation methods.
 
 ### Using AISBench
 
-Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
+Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
 
 ### Using vLLM Benchmark
 
diff --git a/docs/source/tutorials/long_sequence_context_parallel_single_node.md b/docs/source/tutorials/features/long_sequence_context_parallel_single_node.md
similarity index 97%
rename from docs/source/tutorials/long_sequence_context_parallel_single_node.md
rename to docs/source/tutorials/features/long_sequence_context_parallel_single_node.md
index cc49fa4248e..ef84706f0f6 100644
--- a/docs/source/tutorials/long_sequence_context_parallel_single_node.md
+++ b/docs/source/tutorials/features/long_sequence_context_parallel_single_node.md
@@ -139,7 +139,7 @@ Here are two accuracy evaluation methods.
 
 ### Using AISBench
 
-1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details.
+1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
 2. After execution, you can get the result, here is the result of `Qwen3-235B-A22B-w8a8` for reference only.
 
@@ -151,7 +151,7 @@ Here are two accuracy evaluation methods.
 
 ### Using AISBench
 
-Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
+Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
 
 ### Using vLLM Benchmark
 
diff --git a/docs/source/tutorials/pd_colocated_mooncake_multi_instance.md b/docs/source/tutorials/features/pd_colocated_mooncake_multi_instance.md
similarity index 100%
rename from docs/source/tutorials/pd_colocated_mooncake_multi_instance.md
rename to docs/source/tutorials/features/pd_colocated_mooncake_multi_instance.md
diff --git a/docs/source/tutorials/pd_disaggregation_mooncake_multi_node.md b/docs/source/tutorials/features/pd_disaggregation_mooncake_multi_node.md
similarity index 100%
rename from docs/source/tutorials/pd_disaggregation_mooncake_multi_node.md
rename to docs/source/tutorials/features/pd_disaggregation_mooncake_multi_node.md
diff --git a/docs/source/tutorials/pd_disaggregation_mooncake_single_node.md b/docs/source/tutorials/features/pd_disaggregation_mooncake_single_node.md
similarity index 100%
rename from docs/source/tutorials/pd_disaggregation_mooncake_single_node.md
rename to docs/source/tutorials/features/pd_disaggregation_mooncake_single_node.md
diff --git a/docs/source/tutorials/ray.md b/docs/source/tutorials/features/ray.md
similarity index 100%
rename from docs/source/tutorials/ray.md
rename to docs/source/tutorials/features/ray.md
diff --git a/docs/source/tutorials/suffix_speculative_decoding.md b/docs/source/tutorials/features/suffix_speculative_decoding.md
similarity index 100%
rename from docs/source/tutorials/suffix_speculative_decoding.md
rename to docs/source/tutorials/features/suffix_speculative_decoding.md
diff --git a/docs/source/tutorials/310p.md b/docs/source/tutorials/hardwares/310p.md
similarity index 100%
rename from docs/source/tutorials/310p.md
rename to docs/source/tutorials/hardwares/310p.md
diff --git a/docs/source/tutorials/hardwares/index.md b/docs/source/tutorials/hardwares/index.md
new file mode 100644
index 00000000000..23f4740b079
--- /dev/null
+++ b/docs/source/tutorials/hardwares/index.md
@@ -0,0 +1,9 @@
+# Hardware Tutorials
+
+This section provides tutorials for the different hardware platforms supported by vLLM Ascend.
+
+:::{toctree}
+:caption: Hardware Tutorials
+:maxdepth: 1
+310p
+:::
diff --git a/docs/source/tutorials/DeepSeek-R1.md b/docs/source/tutorials/models/DeepSeek-R1.md
similarity index 93%
rename from docs/source/tutorials/DeepSeek-R1.md
rename to docs/source/tutorials/models/DeepSeek-R1.md
index 5371ec8c791..34e9ee00b60 100644
--- a/docs/source/tutorials/DeepSeek-R1.md
+++ b/docs/source/tutorials/models/DeepSeek-R1.md
@@ -7,9 +7,9 @@ This article takes the `DeepSeek-R1-W8A8` version as an example to introduce the
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
-Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration.
+Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration.
 
 ## Environment Preparation
 
@@ -21,13 +21,13 @@ It is recommended to download the model weight to the shared directory of multip
 
 ### Verify Multi-node Communication(Optional)
 
-If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../installation.md#verify-multi-node-communication).
+If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../../installation.md#verify-multi-node-communication).
 
 ### Installation
 
 You can use our official docker image to run `DeepSeek-R1-W8A8` directly.
 
-Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker).
+Select an image based on your machine type and start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker).
 
 ```{code-block} bash
    :substitutions:
@@ -254,7 +254,7 @@ Here are two accuracy evaluation methods.
 
 ### Using AISBench
 
-1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details.
+1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
 2. After execution, you can get the result, here is the result of `DeepSeek-R1-W8A8` in `vllm-ascend:0.11.0rc2` for reference only.
 
@@ -267,7 +267,7 @@ Here are two accuracy evaluation methods.
 
 As an example, take the `gsm8k` dataset as a test dataset, and run accuracy evaluation of `DeepSeek-R1-W8A8` in online mode.
 
-1. Refer to [Using lm_eval](../developer_guide/evaluation/using_lm_eval.md) for `lm_eval` installation.
+1. Refer to [Using lm_eval](../../developer_guide/evaluation/using_lm_eval.md) for `lm_eval` installation.
 
 2. Run `lm_eval` to execute the accuracy evaluation.
 
@@ -285,7 +285,7 @@ lm_eval \
 
 ### Using AISBench
 
-Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
+Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
 
 ### Using vLLM Benchmark
 
diff --git a/docs/source/tutorials/DeepSeek-V3.1.md b/docs/source/tutorials/models/DeepSeek-V3.1.md
similarity index 97%
rename from docs/source/tutorials/DeepSeek-V3.1.md
rename to docs/source/tutorials/models/DeepSeek-V3.1.md
index acac9c17263..3e067a667bb 100644
--- a/docs/source/tutorials/DeepSeek-V3.1.md
+++ b/docs/source/tutorials/models/DeepSeek-V3.1.md
@@ -16,9 +16,9 @@ This document will show the main verification steps of the model, including supp
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
-Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration.
+Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration.
 
 ## Environment Preparation
 
@@ -34,13 +34,13 @@ It is recommended to download the model weight to the shared directory of multip
 
 ### Verify Multi-node Communication(Optional)
 
-If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../installation.md#verify-multi-node-communication).
+If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../../installation.md#verify-multi-node-communication).
 
 ### Installation
 
 You can use our official docker image to run `DeepSeek-V3.1` directly.
 
-Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker).
+Select an image based on your machine type and start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker).
 
 ```{code-block} bash
    :substitutions:
@@ -252,7 +252,7 @@ vllm serve /weights/DeepSeek-V3.1-w8a8-mtp-QuaRot \
 
 ### Prefill-Decode Disaggregation
 
-We recommend using Mooncake for deployment: [Mooncake](./pd_disaggregation_mooncake_multi_node.md).
+We recommend using Mooncake for deployment: [Mooncake](../features/pd_disaggregation_mooncake_multi_node.md).
 
 Take Atlas 800 A3 (64G × 16) for example, we recommend to deploy 2P1D (4 nodes) rather than 1P1D (2 nodes), because there is no enough NPU memory to serve high concurrency in 1P1D case.
 
@@ -672,7 +672,7 @@ Here are two accuracy evaluation methods.
 
 ### Using AISBench
 
-1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details.
+1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
 2. After execution, you can get the result, here is the result of `DeepSeek-V3.1-w8a8-mtp-QuaRot` in `vllm-ascend:0.11.0rc1` for reference only.
 
@@ -689,7 +689,7 @@ Not test yet.
 
 ### Using AISBench
 
-Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
+Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
 
 The performance result is:  
 
diff --git a/docs/source/tutorials/DeepSeek-V3.2.md b/docs/source/tutorials/models/DeepSeek-V3.2.md
similarity index 97%
rename from docs/source/tutorials/DeepSeek-V3.2.md
rename to docs/source/tutorials/models/DeepSeek-V3.2.md
index 528176f3ae9..9435aec4769 100644
--- a/docs/source/tutorials/DeepSeek-V3.2.md
+++ b/docs/source/tutorials/models/DeepSeek-V3.2.md
@@ -8,9 +8,9 @@ This document will show the main verification steps of the model, including supp
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
-Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration.
+Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration.
 
 ## Environment Preparation
 
@@ -25,7 +25,7 @@ It is recommended to download the model weight to the shared directory of multip
 
 ### Verify Multi-node Communication(Optional)
 
-If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../installation.md#verify-multi-node-communication).
+If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../../installation.md#verify-multi-node-communication).
 
 ### Installation
 
@@ -116,7 +116,7 @@ docker run --rm \
 
 In addition, if you don't want to use the docker image as above, you can also build all from source:
 
-- Install `vllm-ascend` from source, refer to [installation](../installation.md).
+- Install `vllm-ascend` from source, refer to [installation](../../installation.md).
 
 If you want to deploy multi-node environment, you need to set up environment on each node.
 
@@ -851,7 +851,7 @@ Here are two accuracy evaluation methods.
 
 ### Using AISBench
 
-1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details.
+1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
 2. After execution, you can get the result.
 
@@ -859,7 +859,7 @@ Here are two accuracy evaluation methods.
 
 As an example, take the `gsm8k` dataset as a test dataset, and run accuracy evaluation of `DeepSeek-V3.2-W8A8` in online mode.
 
-1. Refer to [Using lm_eval](../developer_guide/evaluation/using_lm_eval.md) for `lm_eval` installation.
+1. Refer to [Using lm_eval](../../developer_guide/evaluation/using_lm_eval.md) for `lm_eval` installation.
 
 2. Run `lm_eval` to execute the accuracy evaluation.
 
@@ -877,7 +877,7 @@ lm_eval \
 
 ### Using AISBench
 
-Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
+Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
 
 The performance result is:  
 
diff --git a/docs/source/tutorials/GLM4.x.md b/docs/source/tutorials/models/GLM4.x.md
similarity index 91%
rename from docs/source/tutorials/GLM4.x.md
rename to docs/source/tutorials/models/GLM4.x.md
index 5a6d3178e2c..6cb96c90f16 100644
--- a/docs/source/tutorials/GLM4.x.md
+++ b/docs/source/tutorials/models/GLM4.x.md
@@ -10,9 +10,9 @@ This document will show the main verification steps of the model, including supp
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
-Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration.
+Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration.
 
 ## Environment Preparation
 
@@ -31,7 +31,7 @@ It is recommended to download the model weight to the shared directory of multip
 
 You can use our official docker image to run `GLM-4.x` directly.
 
-Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker).
+Select an image based on your machine type and start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker).
 
 ```{code-block} bash
    :substitutions:
@@ -121,7 +121,7 @@ Here are two accuracy evaluation methods.
 
 ### Using AISBench
 
-1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details.
+1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
 2. After execution, you can get the result, here is the result of `GLM4.6` in `vllm-ascend:main` (after `vllm-ascend:0.13.0rc1`) for reference only.
 
@@ -138,7 +138,7 @@ Not test yet.
 
 ### Using AISBench
 
-Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
+Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
 
 ### Using vLLM Benchmark
 
diff --git a/docs/source/tutorials/Kimi-K2-Thinking.md b/docs/source/tutorials/models/Kimi-K2-Thinking.md
similarity index 100%
rename from docs/source/tutorials/Kimi-K2-Thinking.md
rename to docs/source/tutorials/models/Kimi-K2-Thinking.md
diff --git a/docs/source/tutorials/PaddleOCR-VL.md b/docs/source/tutorials/models/PaddleOCR-VL.md
similarity index 98%
rename from docs/source/tutorials/PaddleOCR-VL.md
rename to docs/source/tutorials/models/PaddleOCR-VL.md
index 8ba020a7578..e73424bcf1f 100644
--- a/docs/source/tutorials/PaddleOCR-VL.md
+++ b/docs/source/tutorials/models/PaddleOCR-VL.md
@@ -24,7 +24,7 @@ It is recommended to download the model weights to a local directory (e.g., `./P
 
 You can use our official docker image to run `PaddleOCR-VL` directly.
 
-Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker).
+Select an image based on your machine type and start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker).
 
 ```{code-block} bash
    :substitutions:
diff --git a/docs/source/tutorials/Qwen-VL-Dense.md b/docs/source/tutorials/models/Qwen-VL-Dense.md
similarity index 97%
rename from docs/source/tutorials/Qwen-VL-Dense.md
rename to docs/source/tutorials/models/Qwen-VL-Dense.md
index 55f71a1259d..6426b796330 100644
--- a/docs/source/tutorials/Qwen-VL-Dense.md
+++ b/docs/source/tutorials/models/Qwen-VL-Dense.md
@@ -10,9 +10,9 @@ This tutorial uses the vLLM-Ascend `v0.11.0rc3-a3` version for demonstration, sh
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
-Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration.
+Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration.
 
 ## Environment Preparation
 
@@ -484,7 +484,7 @@ You can refer to the [monitoring configuration](https://github.com/vllm-project/
 
 As an example, take the `mmmu_val` dataset as a test dataset, and run accuracy evaluation of `Qwen3-VL-8B-Instruct` in offline mode.
 
-1. Refer to [Using lm_eval](../developer_guide/evaluation/using_lm_eval.md) for more details on `lm_eval` installation.
+1. Refer to [Using lm_eval](../../developer_guide/evaluation/using_lm_eval.md) for more details on `lm_eval` installation.
 
 ```shell
 pip install lm_eval
@@ -515,7 +515,7 @@ lm_eval \
 
 As an example, take the `mmmu_val` dataset as a test dataset, and run accuracy evaluation of `Qwen2.5-VL-32B-Instruct` in offline mode.
 
-1. Refer to [Using lm_eval](../developer_guide/evaluation/using_lm_eval.md) for more details on `lm_eval` installation.
+1. Refer to [Using lm_eval](../../developer_guide/evaluation/using_lm_eval.md) for more details on `lm_eval` installation.
 
 ```shell
 pip install lm_eval
diff --git a/docs/source/tutorials/Qwen2.5-7B.md b/docs/source/tutorials/models/Qwen2.5-7B.md
similarity index 91%
rename from docs/source/tutorials/Qwen2.5-7B.md
rename to docs/source/tutorials/models/Qwen2.5-7B.md
index be92c1cdc6e..1ecdc765099 100644
--- a/docs/source/tutorials/Qwen2.5-7B.md
+++ b/docs/source/tutorials/models/Qwen2.5-7B.md
@@ -10,9 +10,9 @@ The `Qwen2.5-7B-Instruct` model was supported since `vllm-ascend:v0.9.0`.
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
-Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration.
+Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration.
 
 ## Environment Preparation
 
@@ -138,7 +138,7 @@ A valid response (e.g., `"Beijing is a vibrant and historic capital city"`) indi
 
 ### Using AISBench
 
-Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details.
+Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
 Results and logs are saved to `benchmark/outputs/default/`. A sample accuracy report is shown below:
 
@@ -150,7 +150,7 @@ Results and logs are saved to `benchmark/outputs/default/`. A sample accuracy re
 
 ### Using AISBench
 
-Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
+Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
 
 ### Using vLLM Benchmark
 
diff --git a/docs/source/tutorials/Qwen2.5-Omni.md b/docs/source/tutorials/models/Qwen2.5-Omni.md
similarity index 92%
rename from docs/source/tutorials/Qwen2.5-Omni.md
rename to docs/source/tutorials/models/Qwen2.5-Omni.md
index ff5eecaa7f6..55436958021 100644
--- a/docs/source/tutorials/Qwen2.5-Omni.md
+++ b/docs/source/tutorials/models/Qwen2.5-Omni.md
@@ -8,9 +8,9 @@ The `Qwen2.5-Omni` model was supported since `vllm-ascend:v0.11.0rc0`. This docu
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
-Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration.
+Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration.
 
 ## Environment Preparation
 
@@ -25,7 +25,7 @@ Following examples use the 7B version by default.
 
 You can use our official docker image to run `Qwen2.5-Omni` directly.
 
-Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker).
+Select an image based on your machine type and start the docker image on your node. For details, refer to [using docker](../../installation.md#set-up-using-docker).
 
 ```{code-block} bash
    :substitutions:
@@ -174,7 +174,7 @@ Qwen2.5-Omni on vllm-ascend has been test on AISBench.
 
 ### Using AISBench
 
-1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details.
+1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
 2. After execution, you can get the result, here is the result of `Qwen2.5-Omni-7B` with `vllm-ascend:0.11.0rc0` for reference only.
 
@@ -187,7 +187,7 @@ Qwen2.5-Omni on vllm-ascend has been test on AISBench.
 
 ### Using AISBench
 
-Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
+Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
 
 ### Using vLLM Benchmark
 
diff --git a/docs/source/tutorials/Qwen3-235B-A22B.md b/docs/source/tutorials/models/Qwen3-235B-A22B.md
similarity index 96%
rename from docs/source/tutorials/Qwen3-235B-A22B.md
rename to docs/source/tutorials/models/Qwen3-235B-A22B.md
index 3a2c0a9f738..5ab6f8e62b0 100644
--- a/docs/source/tutorials/Qwen3-235B-A22B.md
+++ b/docs/source/tutorials/models/Qwen3-235B-A22B.md
@@ -10,9 +10,9 @@ The `Qwen3-235B-A22B` model is first supported in `vllm-ascend:v0.8.4rc2`.
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
-Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration.
+Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration.
 
 ## Environment Preparation
 
@@ -25,7 +25,7 @@ It is recommended to download the model weight to the shared directory of multip
 
 ### Verify Multi-node Communication(Optional)
 
-If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../installation.md#verify-multi-node-communication).
+If you want to deploy a multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../../installation.md#verify-multi-node-communication).
 
 ### Installation
 
@@ -34,7 +34,7 @@ If you want to deploy multi-node environment, you need to verify multi-node comm
 
 For example, using images `quay.io/ascend/vllm-ascend:v0.11.0rc2`(for Atlas 800 A2) and `quay.io/ascend/vllm-ascend:v0.11.0rc2-a3`(for Atlas 800 A3).
 
-Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker).
+Select an image based on your machine type and start the docker image on your node. For details, refer to [using docker](../../installation.md#set-up-using-docker).
 
 ```{code-block} bash
   :substitutions:
@@ -76,7 +76,7 @@ Select an image based on your machine type and start the docker image on your no
 
 You can build all from source.
 
-- Install `vllm-ascend`, refer to [set up using python](../installation.md#set-up-using-python).
+- Install `vllm-ascend`, refer to [set up using python](../../installation.md#set-up-using-python).
 
 ::::
 :::::
@@ -253,11 +253,11 @@ INFO:     Application startup complete.
 
 ### Multi-node Deployment with Ray
 
-- refer to [Ray Distributed (Qwen/Qwen3-235B-A22B)](./ray.md).
+- Refer to [Ray Distributed (Qwen/Qwen3-235B-A22B)](../features/ray.md).
 
 ### Prefill-Decode Disaggregation
 
-- refer to [Prefill-Decode Disaggregation Mooncake Verification (Qwen)](./pd_disaggregation_mooncake_multi_node.md)
+- Refer to [Prefill-Decode Disaggregation Mooncake Verification (Qwen)](../features/pd_disaggregation_mooncake_multi_node.md).
 
 ## Functional Verification
 
@@ -280,7 +280,7 @@ Here are two accuracy evaluation methods.
 
 ### Using AISBench
 
-1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details.
+1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
 2. After execution, you can get the result, here is the result of `Qwen3-235B-A22B-w8a8` in `vllm-ascend:0.11.0rc0` for reference only.
 
@@ -292,7 +292,7 @@ Here are two accuracy evaluation methods.
 
 ### Using AISBench
 
-Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
+Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
 
 ### Using vLLM Benchmark
 
diff --git a/docs/source/tutorials/Qwen3-30B-A3B.md b/docs/source/tutorials/models/Qwen3-30B-A3B.md
similarity index 100%
rename from docs/source/tutorials/Qwen3-30B-A3B.md
rename to docs/source/tutorials/models/Qwen3-30B-A3B.md
diff --git a/docs/source/tutorials/Qwen3-32B-W4A4.md b/docs/source/tutorials/models/Qwen3-32B-W4A4.md
similarity index 100%
rename from docs/source/tutorials/Qwen3-32B-W4A4.md
rename to docs/source/tutorials/models/Qwen3-32B-W4A4.md
diff --git a/docs/source/tutorials/Qwen3-8B-W4A8.md b/docs/source/tutorials/models/Qwen3-8B-W4A8.md
similarity index 100%
rename from docs/source/tutorials/Qwen3-8B-W4A8.md
rename to docs/source/tutorials/models/Qwen3-8B-W4A8.md
diff --git a/docs/source/tutorials/Qwen3-Coder-30B-A3B.md b/docs/source/tutorials/models/Qwen3-Coder-30B-A3B.md
similarity index 86%
rename from docs/source/tutorials/Qwen3-Coder-30B-A3B.md
rename to docs/source/tutorials/models/Qwen3-Coder-30B-A3B.md
index 03fb0085b72..8a627f5899e 100644
--- a/docs/source/tutorials/Qwen3-Coder-30B-A3B.md
+++ b/docs/source/tutorials/models/Qwen3-Coder-30B-A3B.md
@@ -8,9 +8,9 @@ This document will show the main verification steps of the model, including supp
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
-Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration.
+Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration.
 
 ## Environment Preparation
 
@@ -52,7 +52,7 @@ docker run --rm \
 
 In addition, if you don't want to use the docker image as above, you can also build all from source:
 
-- Install `vllm-ascend` from source, refer to [installation](../installation.md).
+- Install `vllm-ascend` from source, refer to [installation](../../installation.md).
 
 ## Deployment
 
@@ -90,7 +90,7 @@ curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/jso
 
 ### Using AISBench
 
-1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details.
+1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
 2. After execution, you can get the result, here is the result of `Qwen3-Coder-30B-A3B-Instruct` in `vllm-ascend:0.11.0rc0` for reference only.
 
@@ -102,4 +102,4 @@ curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/jso
 
 ### Using AISBench
 
-Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
+Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
diff --git a/docs/source/tutorials/Qwen3-Dense.md b/docs/source/tutorials/models/Qwen3-Dense.md
similarity index 97%
rename from docs/source/tutorials/Qwen3-Dense.md
rename to docs/source/tutorials/models/Qwen3-Dense.md
index 413d125d37d..70814d7a39d 100644
--- a/docs/source/tutorials/Qwen3-Dense.md
+++ b/docs/source/tutorials/models/Qwen3-Dense.md
@@ -16,9 +16,9 @@ This example requires version **v0.11.0rc2**. Earlier versions may lack certain
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
-Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration.
+Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration.
 
 ## Environment Preparation
 
@@ -38,7 +38,7 @@ It is recommended to download the model weight to the shared directory of multip
 
 ### Verify Multi-node Communication(Optional)
 
-If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../installation.md#verify-multi-node-communication).
+If you want to deploy a multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../../installation.md#verify-multi-node-communication).
 
 ### Installation
 
@@ -97,7 +97,7 @@ In the [Run docker container](./Qwen3-Dense.md#run-docker-container), detailed e
 
 In addition, if you don't want to use the docker image as above, you can also build all from source:
 
-- Install `vllm-ascend` from source, refer to [installation](../installation.md).
+- Install `vllm-ascend` from source, refer to [installation](../../installation.md).
 
 If you want to deploy multi-node environment, you need to set up environment on each node.
 
@@ -269,7 +269,7 @@ Here is one accuracy evaluation methods.
 
 ### Using AISBench
 
-1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details.
+1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
 2. After execution, you can get the result, here is the result of `Qwen3-32B-W8A8` in `vllm-ascend:0.11.0rc2` for reference only.
 
@@ -283,7 +283,7 @@ Here is one accuracy evaluation methods.
 
 ### Using AISBench
 
-Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
+Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
 
 ### Using vLLM Benchmark
 
diff --git a/docs/source/tutorials/Qwen3-Next.md b/docs/source/tutorials/models/Qwen3-Next.md
similarity index 92%
rename from docs/source/tutorials/Qwen3-Next.md
rename to docs/source/tutorials/models/Qwen3-Next.md
index 3c92f56518d..d0f5becbc56 100644
--- a/docs/source/tutorials/Qwen3-Next.md
+++ b/docs/source/tutorials/models/Qwen3-Next.md
@@ -10,9 +10,9 @@ The `Qwen3-Next` model is first supported in `vllm-ascend:v0.10.2rc1`.
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
-Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration.
+Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration.
 
 ## Weight Preparation
 
@@ -134,7 +134,7 @@ Prompt: 'Who are you?', Generated text: ' What do you know about me?\n\nHello! I
 
 ### Using AISBench
 
-1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details.
+1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
 2. After execution, you can get the result, here is the result of `Qwen3-Next-80B-A3B-Instruct` in `vllm-ascend:0.13.0rc1` for reference only.
 
@@ -146,7 +146,7 @@ Prompt: 'Who are you?', Generated text: ' What do you know about me?\n\nHello! I
 
 ### Using AISBench
 
-Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
+Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
 
 ### Using vLLM Benchmark
 
diff --git a/docs/source/tutorials/Qwen3-Omni-30B-A3B-Thinking.md b/docs/source/tutorials/models/Qwen3-Omni-30B-A3B-Thinking.md
similarity index 98%
rename from docs/source/tutorials/Qwen3-Omni-30B-A3B-Thinking.md
rename to docs/source/tutorials/models/Qwen3-Omni-30B-A3B-Thinking.md
index 578d91a79e2..c2be1f3a13f 100644
--- a/docs/source/tutorials/Qwen3-Omni-30B-A3B-Thinking.md
+++ b/docs/source/tutorials/models/Qwen3-Omni-30B-A3B-Thinking.md
@@ -26,7 +26,7 @@ It is recommended to download the model weight to the shared directory of multip
 
 You can use our official docker image to run Qwen3-Omni-30B-A3B-Thinking directly
 
-Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker).
+Select an image based on your machine type and start the docker image on your node. For details, refer to [using docker](../../installation.md#set-up-using-docker).
 
 ```{code-block} bash
   :substitutions:
@@ -63,7 +63,7 @@ docker run --rm \
 
 You can build all from source.
 
-- Install `vllm-ascend`, refer to [set up using python](../installation.md#set-up-using-python).
+- Install `vllm-ascend`, refer to [set up using python](../../installation.md#set-up-using-python).
 
 ::::
 :::::
diff --git a/docs/source/tutorials/Qwen3-VL-235B-A22B-Instruct.md b/docs/source/tutorials/models/Qwen3-VL-235B-A22B-Instruct.md
similarity index 93%
rename from docs/source/tutorials/Qwen3-VL-235B-A22B-Instruct.md
rename to docs/source/tutorials/models/Qwen3-VL-235B-A22B-Instruct.md
index 55a5d28077f..8107315d8ee 100644
--- a/docs/source/tutorials/Qwen3-VL-235B-A22B-Instruct.md
+++ b/docs/source/tutorials/models/Qwen3-VL-235B-A22B-Instruct.md
@@ -10,9 +10,9 @@ This tutorial uses the vLLM-Ascend `v0.11.0rc2` version for demonstration, sho
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
-Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration.
+Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration.
 
 ## Environment Preparation
 
@@ -24,7 +24,7 @@ It is recommended to download the model weight to the shared directory of multip
 
 ### Verify Multi-node Communication(Optional)
 
-If you want to deploy multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../installation.md#verify-multi-node-communication).
+If you want to deploy a multi-node environment, you need to verify multi-node communication according to [verify multi-node communication environment](../../installation.md#verify-multi-node-communication).
 
 ### Installation
 
@@ -33,7 +33,7 @@ If you want to deploy multi-node environment, you need to verify multi-node comm
 
 For example, using images `quay.io/ascend/vllm-ascend:v0.11.0rc2`(for Atlas 800 A2) and `quay.io/ascend/vllm-ascend:v0.11.0rc2-a3`(for Atlas 800 A3).
 
-Select an image based on your machine type and start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker).
+Select an image based on your machine type and start the docker image on your node. For details, refer to [using docker](../../installation.md#set-up-using-docker).
 
 ```{code-block} bash
   :substitutions:
@@ -76,7 +76,7 @@ Select an image based on your machine type and start the docker image on your no
 
 You can build all from source.
 
-- Install `vllm-ascend`, refer to [set up using python](../installation.md#set-up-using-python).
+- Install `vllm-ascend`, refer to [set up using python](../../installation.md#set-up-using-python).
 
 ::::
 :::::
@@ -209,11 +209,11 @@ INFO:     Application startup complete.
 
 ### Multi-node Deployment with Ray
 
-- refer to [Ray Distributed (Qwen/Qwen3-235B-A22B)](./ray.md).
+- Refer to [Ray Distributed (Qwen/Qwen3-235B-A22B)](../features/ray.md).
 
 ### Prefill-Decode Disaggregation
 
-- refer to [Prefill-Decode Disaggregation Mooncake Verification](./pd_disaggregation_mooncake_multi_node.md)
+- Refer to [Prefill-Decode Disaggregation Mooncake Verification](../features/pd_disaggregation_mooncake_multi_node.md).
 
 ## Functional Verification
 
@@ -240,7 +240,7 @@ Here are two accuracy evaluation methods.
 
 ### Using AISBench
 
-1. Refer to [Using AISBench](../developer_guide/evaluation/using_ais_bench.md) for details.
+1. Refer to [Using AISBench](../../developer_guide/evaluation/using_ais_bench.md) for details.
 
 2. After execution, you can get the result, here is the result of `Qwen3-VL-235B-A22B-Instruct` in `vllm-ascend:0.11.0rc2` for reference only.
 
@@ -252,7 +252,7 @@ Here are two accuracy evaluation methods.
 
 ### Using AISBench
 
-Refer to [Using AISBench for performance evaluation](../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
+Refer to [Using AISBench for performance evaluation](../../developer_guide/evaluation/using_ais_bench.md#execute-performance-evaluation) for details.
 
 ### Using vLLM Benchmark
 
diff --git a/docs/source/tutorials/Qwen3-VL-30B-A3B-Instruct.md b/docs/source/tutorials/models/Qwen3-VL-30B-A3B-Instruct.md
similarity index 97%
rename from docs/source/tutorials/Qwen3-VL-30B-A3B-Instruct.md
rename to docs/source/tutorials/models/Qwen3-VL-30B-A3B-Instruct.md
index d1368d8460b..28c52eb2a6f 100644
--- a/docs/source/tutorials/Qwen3-VL-30B-A3B-Instruct.md
+++ b/docs/source/tutorials/models/Qwen3-VL-30B-A3B-Instruct.md
@@ -8,8 +8,8 @@ This document will show the main verification steps of the `Qwen3-VL-30B-A3B-Ins
 
 ## Supported Features
 
-- Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
-- Refer to [feature guide](../user_guide/feature_guide/index.md) to get the feature's configuration.
+- Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+- Refer to [feature guide](../../user_guide/feature_guide/index.md) to get the feature's configuration.
 
 ## Environment Preparation
 
diff --git a/docs/source/tutorials/Qwen3-VL-Embedding.md b/docs/source/tutorials/models/Qwen3-VL-Embedding.md
similarity index 96%
rename from docs/source/tutorials/Qwen3-VL-Embedding.md
rename to docs/source/tutorials/models/Qwen3-VL-Embedding.md
index d39aed9ce5e..a6694fc90f7 100644
--- a/docs/source/tutorials/Qwen3-VL-Embedding.md
+++ b/docs/source/tutorials/models/Qwen3-VL-Embedding.md
@@ -6,7 +6,7 @@ The Qwen3-VL-Embedding and Qwen3-VL-Reranker model series are the latest additio
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
 ## Environment Preparation
 
@@ -21,11 +21,11 @@ It is recommended to download the model weight to the shared directory of multip
 
 You can use our official docker image to run `Qwen3-VL-Embedding` series models.
 
-- Start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker).
+- Start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker).
 
 If you don't want to use the docker image as above, you can also build all from source:
 
-- Install `vllm-ascend` from source, refer to [installation](../installation.md).
+- Install `vllm-ascend` from source, refer to [installation](../../installation.md).
 
 ## Deployment
 
diff --git a/docs/source/tutorials/Qwen3-VL-Reranker.md b/docs/source/tutorials/models/Qwen3-VL-Reranker.md
similarity index 98%
rename from docs/source/tutorials/Qwen3-VL-Reranker.md
rename to docs/source/tutorials/models/Qwen3-VL-Reranker.md
index 740e1a1ca6e..deba9104f27 100644
--- a/docs/source/tutorials/Qwen3-VL-Reranker.md
+++ b/docs/source/tutorials/models/Qwen3-VL-Reranker.md
@@ -6,7 +6,7 @@ The Qwen3-VL-Embedding and Qwen3-VL-Reranker model series are the latest additio
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
 ## Environment Preparation
 
@@ -21,11 +21,11 @@ It is recommended to download the model weight to the shared directory of multip
 
 You can use our official docker image to run `Qwen3-VL-Reranker` series models.
 
-- Start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker).
+- Start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker).
 
 If you don't want to use the docker image as above, you can also build all from source:
 
-- Install `vllm-ascend` from source, refer to [installation](../installation.md).
+- Install `vllm-ascend` from source, refer to [installation](../../installation.md).
 
 ## Deployment
 
diff --git a/docs/source/tutorials/Qwen3_embedding.md b/docs/source/tutorials/models/Qwen3_embedding.md
similarity index 96%
rename from docs/source/tutorials/Qwen3_embedding.md
rename to docs/source/tutorials/models/Qwen3_embedding.md
index d2369ba934e..7e490e7aa2e 100644
--- a/docs/source/tutorials/Qwen3_embedding.md
+++ b/docs/source/tutorials/models/Qwen3_embedding.md
@@ -6,7 +6,7 @@ The Qwen3 Embedding model series is the latest proprietary model of the Qwen fam
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
 ## Environment Preparation
 
@@ -22,11 +22,11 @@ It is recommended to download the model weight to the shared directory of multip
 
 You can use our official docker image to run `Qwen3-Embedding` series models.
 
-- Start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker).
+- Start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker).
 
 if you don't want to use the docker image as above, you can also build all from source:
 
-- Install `vllm-ascend` from source, refer to [installation](../installation.md).
+- Install `vllm-ascend` from source, refer to [installation](../../installation.md).
 
 ## Deployment
 
diff --git a/docs/source/tutorials/Qwen3_reranker.md b/docs/source/tutorials/models/Qwen3_reranker.md
similarity index 97%
rename from docs/source/tutorials/Qwen3_reranker.md
rename to docs/source/tutorials/models/Qwen3_reranker.md
index d8ef6a15a59..94c1c8b60fe 100644
--- a/docs/source/tutorials/Qwen3_reranker.md
+++ b/docs/source/tutorials/models/Qwen3_reranker.md
@@ -6,7 +6,7 @@ The Qwen3 Reranker model series is the latest proprietary model of the Qwen fami
 
 ## Supported Features
 
-Refer to [supported features](../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
+Refer to [supported features](../../user_guide/support_matrix/supported_models.md) to get the model's supported feature matrix.
 
 ## Environment Preparation
 
@@ -22,11 +22,11 @@ It is recommended to download the model weight to the shared directory of multip
 
 You can use our official docker image to run `Qwen3-Reranker` series models.
 
-- Start the docker image on your node, refer to [using docker](../installation.md#set-up-using-docker).
+- Start the docker image on your node, refer to [using docker](../../installation.md#set-up-using-docker).
 
 if you don't want to use the docker image as above, you can also build all from source:
 
-- Install `vllm-ascend` from source, refer to [installation](../installation.md).
+- Install `vllm-ascend` from source, refer to [installation](../../installation.md).
 
 ## Deployment
 
diff --git a/docs/source/tutorials/index.md b/docs/source/tutorials/models/index.md
similarity index 54%
rename from docs/source/tutorials/index.md
rename to docs/source/tutorials/models/index.md
index ea344fde5b0..78181feda27 100644
--- a/docs/source/tutorials/index.md
+++ b/docs/source/tutorials/models/index.md
@@ -1,7 +1,9 @@
-# Tutorials
+# Model Tutorials
+
+This section provides tutorials for running different models on vLLM Ascend.
 
 :::{toctree}
-:caption: Models
+:caption: Model Tutorials
 :maxdepth: 1
 Qwen2.5-Omni.md
 Qwen2.5-7B.md
@@ -27,21 +29,3 @@ GLM4.x.md
 Kimi-K2-Thinking.md
 PaddleOCR-VL.md
 :::
-
-:::{toctree}
-:caption: Features
-:maxdepth: 1
-pd_colocated_mooncake_multi_instance.md
-pd_disaggregation_mooncake_single_node.md
-pd_disaggregation_mooncake_multi_node.md
-long_sequence_context_parallel_single_node.md
-long_sequence_context_parallel_multi_node.md
-suffix_speculative_decoding.md
-ray
-:::
-
-:::{toctree}
-:caption: Hardware
-:maxdepth: 1
-310p.md
-:::
diff --git a/docs/source/user_guide/feature_guide/quantization.md b/docs/source/user_guide/feature_guide/quantization.md
index 90a2cf901b6..ecf665882de 100644
--- a/docs/source/user_guide/feature_guide/quantization.md
+++ b/docs/source/user_guide/feature_guide/quantization.md
@@ -155,8 +155,6 @@ python -m vllm.entrypoints.api_server \
     --quantization ascend
 ```
 
-The above commands are for reference only. For more details, consult the [official guide](../../tutorials/index.md).
-
 ## References
 
 - [ModelSlim Documentation](https://gitcode.com/Ascend/msit/blob/master/msmodelslim/README.md)
diff --git a/docs/source/user_guide/support_matrix/supported_models.md b/docs/source/user_guide/support_matrix/supported_models.md
index 3dbd9d66f1f..1f70c5cb2f5 100644
--- a/docs/source/user_guide/support_matrix/supported_models.md
+++ b/docs/source/user_guide/support_matrix/supported_models.md
@@ -16,16 +16,16 @@ Get the latest info here: <https://github.com/vllm-project/vllm-ascend/issues/16
 
 | Model                         | Support   | Note                                                                 | BF16 | Supported Hardware | W8A8 | Chunked Prefill | Automatic Prefix Cache | LoRA | Speculative Decoding | Async Scheduling | Tensor Parallel | Pipeline Parallel | Expert Parallel | Data Parallel | Prefill-decode Disaggregation | Piecewise AclGraph | Fullgraph AclGraph | max-model-len | MLP Weight Prefetch | Doc |
 |-------------------------------|-----------|----------------------------------------------------------------------|------|--------------------|------|-----------------|------------------------|------|----------------------|------------------|-----------------|-------------------|-----------------|---------------|-------------------------------|--------------------|--------------------|---------------|---------------------|-----|
-| DeepSeek V3/3.1               | ✅        |                                                                      | ✅ | A2/A3 | ✅ | ✅ | ✅ || ✅ || ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 240k || [DeepSeek-V3.1](../../tutorials/DeepSeek-V3.1.md) |
-| DeepSeek V3.2                 | ✅        |                                                                      | ✅ | A2/A3 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 160k | ✅ | [DeepSeek-V3.2](../../tutorials/DeepSeek-V3.2.md) |
-| DeepSeek R1                   | ✅        |                                                                      | ✅ | A2/A3 | ✅ | ✅ | ✅ || ✅ || ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 128k || [DeepSeek-R1](../../tutorials/DeepSeek-R1.md) |
-| Qwen3                         | ✅        |                                                                      | ✅ | A2/A3 | ✅ | ✅ | ✅ ||| ✅ | ✅ ||| ✅ || ✅ | ✅ | 128k | ✅ | [Qwen3-Dense](../../tutorials/Qwen3-Dense.md) |
-| Qwen3-Coder                   | ✅        |                                                                      | ✅ | A2/A3 ||✅|✅|✅|||✅|✅|✅|✅||||||[Qwen3-Coder-30B-A3B tutorial](../../tutorials/Qwen3-Coder-30B-A3B.md)|
-| Qwen3-Moe                     | ✅        |                                                                      | ✅ | A2/A3 | ✅ | ✅ | ✅ ||| ✅ | ✅ || ✅ | ✅ | ✅ | ✅ | ✅ | 256k || [Qwen3-235B-A22B](../../tutorials/Qwen3-235B-A22B.md) |
-| Qwen3-Next                    | ✅        |                                                                      | ✅ | A2/A3 | ✅ |||||| ✅ ||| ✅ || ✅ | ✅ ||| [Qwen3-Next](../../tutorials/Qwen3-Next.md) |
-| Qwen2.5                       | ✅        |                                                                      | ✅ | A2/A3 | ✅ | ✅ | ✅ |||| ✅ ||| ✅ |||||| [Qwen2.5-7B](../../tutorials/Qwen2.5-7B.md) |
-| GLM-4.x                       | ✅        |                                                                      || A2/A3 |✅|✅|✅||✅|✅|✅|||✅||✅|✅|128k||[GLM-4.x](../../tutorials/GLM4.x.md)|
-| Kimi-K2-Thinking              | ✅        |                                                                      || A2/A3 |||||||||||||||| [Kimi-K2-Thinking](../../tutorials/Kimi-K2-Thinking.md) |
+| DeepSeek V3/3.1               | ✅        |                                                                      | ✅ | A2/A3 | ✅ | ✅ | ✅ || ✅ || ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 240k || [DeepSeek-V3.1](../../tutorials/models/DeepSeek-V3.1.md) |
+| DeepSeek V3.2                 | ✅        |                                                                      | ✅ | A2/A3 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 160k | ✅ | [DeepSeek-V3.2](../../tutorials/models/DeepSeek-V3.2.md) |
+| DeepSeek R1                   | ✅        |                                                                      | ✅ | A2/A3 | ✅ | ✅ | ✅ || ✅ || ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 128k || [DeepSeek-R1](../../tutorials/models/DeepSeek-R1.md) |
+| Qwen3                         | ✅        |                                                                      | ✅ | A2/A3 | ✅ | ✅ | ✅ ||| ✅ | ✅ ||| ✅ || ✅ | ✅ | 128k | ✅ | [Qwen3-Dense](../../tutorials/models/Qwen3-Dense.md) |
+| Qwen3-Coder                   | ✅        |                                                                      | ✅ | A2/A3 ||✅|✅|✅|||✅|✅|✅|✅||||||[Qwen3-Coder-30B-A3B tutorial](../../tutorials/models/Qwen3-Coder-30B-A3B.md)|
+| Qwen3-Moe                     | ✅        |                                                                      | ✅ | A2/A3 | ✅ | ✅ | ✅ ||| ✅ | ✅ || ✅ | ✅ | ✅ | ✅ | ✅ | 256k || [Qwen3-235B-A22B](../../tutorials/models/Qwen3-235B-A22B.md) |
+| Qwen3-Next                    | ✅        |                                                                      | ✅ | A2/A3 | ✅ |||||| ✅ ||| ✅ || ✅ | ✅ ||| [Qwen3-Next](../../tutorials/models/Qwen3-Next.md) |
+| Qwen2.5                       | ✅        |                                                                      | ✅ | A2/A3 | ✅ | ✅ | ✅ |||| ✅ ||| ✅ |||||| [Qwen2.5-7B](../../tutorials/models/Qwen2.5-7B.md) |
+| GLM-4.x                       | ✅        |                                                                      || A2/A3 |✅|✅|✅||✅|✅|✅|||✅||✅|✅|128k||[GLM-4.x](../../tutorials/models/GLM4.x.md)|
+| Kimi-K2-Thinking              | ✅        |                                                                      || A2/A3 |||||||||||||||| [Kimi-K2-Thinking](../../tutorials/models/Kimi-K2-Thinking.md) |
 
 #### Extended Compatible Models
 
@@ -60,10 +60,10 @@ Get the latest info here: <https://github.com/vllm-project/vllm-ascend/issues/16
 
 | Model                         | Support   | Note                                                                 |    Supported Hardware    |  Doc |
 |-------------------------------|-----------|----------------------------------------------------------------------|--------------------------|------|
-| Qwen3-Embedding               | ✅        |                                                                      |         A2/A3            | [Qwen3_embedding](../../tutorials/Qwen3_embedding.md)|
-| Qwen3-VL-Embedding               | ✅        |                                                                      |         A2/A3            | [Qwen3-VL-Embedding](../../tutorials/Qwen3-VL-Embedding.md)|
-| Qwen3-Reranker                | ✅        |                                                                      |         A2/A3            | [Qwen3_reranker](../../tutorials/Qwen3_reranker.md)|
-| Qwen3-VL-Reranker                | ✅        |                                                                      |         A2/A3            | [Qwen3-VL-Reranker](../../tutorials/Qwen3-VL-Reranker.md)|
+| Qwen3-Embedding               | ✅        |                                                                      |         A2/A3            | [Qwen3_embedding](../../tutorials/models/Qwen3_embedding.md)|
+| Qwen3-VL-Embedding               | ✅        |                                                                      |         A2/A3            | [Qwen3-VL-Embedding](../../tutorials/models/Qwen3-VL-Embedding.md)|
+| Qwen3-Reranker                | ✅        |                                                                      |         A2/A3            | [Qwen3_reranker](../../tutorials/models/Qwen3_reranker.md)|
+| Qwen3-VL-Reranker                | ✅        |                                                                      |         A2/A3            | [Qwen3-VL-Reranker](../../tutorials/models/Qwen3-VL-Reranker.md)|
 | Molmo                         | ✅        | [1942](https://github.com/vllm-project/vllm-ascend/issues/1942)      |         A2/A3            |      |
 | XLM-RoBERTa-based             | ✅        |                                                                      |         A2/A3            |      |
 | Bert                          | ✅        |                                                                      |         A2/A3            |      |
@@ -76,11 +76,11 @@ Get the latest info here: <https://github.com/vllm-project/vllm-ascend/issues/16
 
 | Model                          | Support       | Note                                                                 | BF16 | Supported Hardware | W8A8 | Chunked Prefill | Automatic Prefix Cache | LoRA | Speculative Decoding | Async Scheduling | Tensor Parallel | Pipeline Parallel | Expert Parallel | Data Parallel | Prefill-decode Disaggregation | Piecewise AclGraph | Fullgraph AclGraph | max-model-len | MLP Weight Prefetch | Doc |
 |--------------------------------|---------------|----------------------------------------------------------------------|------|--------------------|------|-----------------|------------------------|------|----------------------|------------------|-----------------|-------------------|-----------------|---------------|-------------------------------|--------------------|--------------------|---------------|---------------------|-----|
-| Qwen2.5-VL                     | ✅            |                                                                      | ✅ | A2/A3 | ✅ | ✅ | ✅ ||| ✅ | ✅ |||| ✅ | ✅ | ✅ | 30k || [Qwen-VL-Dense](../../tutorials/Qwen-VL-Dense.md) |
-| Qwen3-VL                       | ✅            |                                                                      ||A2/A3|||||||✅|||||✅|✅||| [Qwen-VL-Dense](../../tutorials/Qwen-VL-Dense.md) |
-| Qwen3-VL-MOE                   | ✅            |                                                                      | ✅ | A2/A3||✅|✅|||✅|✅|✅|✅|✅|✅|✅|✅|256k||[Qwen3-VL-MOE](../../tutorials/Qwen3-VL-235B-A22B-Instruct.md)|
-| Qwen3-Omni-30B-A3B-Thinking    | ✅            |                                                                      ||A2/A3|||||||✅||✅|||||||[Qwen3-Omni-30B-A3B-Thinking](../../tutorials/Qwen3-Omni-30B-A3B-Thinking.md)|
-| Qwen2.5-Omni                   | ✅            |                                                                      || A2/A3 |||||||||||||||| [Qwen2.5-Omni](../../tutorials/Qwen2.5-Omni.md) |
+| Qwen2.5-VL                     | ✅            |                                                                      | ✅ | A2/A3 | ✅ | ✅ | ✅ ||| ✅ | ✅ |||| ✅ | ✅ | ✅ | 30k || [Qwen-VL-Dense](../../tutorials/models/Qwen-VL-Dense.md) |
+| Qwen3-VL                       | ✅            |                                                                      ||A2/A3|||||||✅|||||✅|✅||| [Qwen-VL-Dense](../../tutorials/models/Qwen-VL-Dense.md) |
+| Qwen3-VL-MOE                   | ✅            |                                                                      | ✅ | A2/A3||✅|✅|||✅|✅|✅|✅|✅|✅|✅|✅|256k||[Qwen3-VL-MOE](../../tutorials/models/Qwen3-VL-235B-A22B-Instruct.md)|
+| Qwen3-Omni-30B-A3B-Thinking    | ✅            |                                                                      ||A2/A3|||||||✅||✅|||||||[Qwen3-Omni-30B-A3B-Thinking](../../tutorials/models/Qwen3-Omni-30B-A3B-Thinking.md)|
+| Qwen2.5-Omni                   | ✅            |                                                                      || A2/A3 |||||||||||||||| [Qwen2.5-Omni](../../tutorials/models/Qwen2.5-Omni.md) |
 
 #### Extended Compatible Models