vllm-project · wangxiyuan · Dec 10, 2025 · Nov 28, 2025 · Nov 28, 2025 · Nov 29, 2025
@@ -266,6 +266,7 @@ jobs:
         run: |
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
+          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Kimi_K2_Thinking_W4A16
           # pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_TP2_WITH_EP
           # pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_W8A8_WITH_EP
           pytest -sv tests/e2e/multicard/test_data_parallel_tp2.py

@@ -12,6 +12,7 @@ single_npu_qwen3_w4a4
 single_node_pd_disaggregation_mooncake
 multi_npu_qwen3_next
 multi_npu
+multi_npu_kimi-k2-thinking
 multi_npu_moge
 multi_npu_qwen3_moe
 multi_npu_quantization

@@ -0,0 +1,107 @@
+# Multi-NPU (Kimi-K2-Thinking)
+
+## Run with Docker
+
+```{code-block} bash
+   :substitutions:
+# Update the vllm-ascend image
+export IMAGE=m.daocloud.io/quay.io/ascend/vllm-ascend:|vllm_ascend_version|
+export NAME=vllm-ascend
+
+# Run the container using the defined variables
+# Note: If you are running bridge network with docker, please expose available ports for multiple nodes communication in advance
+docker run --rm \
+--name $NAME \
+--net=host \
+--shm-size=1g \
+--device /dev/davinci0 \
+--device /dev/davinci1 \
+--device /dev/davinci2 \
+--device /dev/davinci3 \
+--device /dev/davinci4 \
+--device /dev/davinci5 \
+--device /dev/davinci6 \
+--device /dev/davinci7 \
+--device /dev/davinci8 \
+--device /dev/davinci9 \
+--device /dev/davinci10 \
+--device /dev/davinci11 \
+--device /dev/davinci12 \
+--device /dev/davinci13 \
+--device /dev/davinci14 \
+--device /dev/davinci15 \
+--device /dev/davinci_manager \
+--device /dev/devmm_svm \
+--device /dev/hisi_hdc \
+-v /usr/local/dcmi:/usr/local/dcmi \
+-v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
+-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+-v /etc/ascend_install.info:/etc/ascend_install.info \
+-v /mnt/sfs_turbo/.cache:/home/cache \
+-it $IMAGE bash
+```
+
+## Verify the Quantized Model
+Please be advised to edit the value of `"quantization_config.config_groups.group_0.targets"` from `["Linear"]` into `["MoE"]` in `config.json` of original model downloaded from [Hugging Face](https://huggingface.co/moonshotai/Kimi-K2-Thinking).
+
+```json
+{
+  "quantization_config": {
+    "config_groups": {
+      "group_0": {
+        "targets": [
+          "MoE"
+        ]
+      }
+    }
+  }
+}
+```
+
+Your model files look like:
+
+```bash
+.
+|-- chat_template.jinja
+|-- config.json
+|-- configuration_deepseek.py
+|-- configuration.json
+|-- generation_config.json
+|-- model-00001-of-000062.safetensors
+|-- ...
+|-- model-00062-of-000062.safetensors
+|-- model.safetensors.index.json
+|-- modeling_deepseek.py
+|-- tiktoken.model
+|-- tokenization_kimi.py
+`-- tokenizer_config.json
+```
+
+## Online Inference on Multi-NPU
+
+Run the following script to start the vLLM server on Multi-NPU:
+
+For an Atlas 800 A3 (64G*16) node, tensor-parallel-size should be at least 16.
+
+```bash
+vllm serve Kimi-K2-Thinking \
+--served-model-name kimi-k2-thinking \
+--tensor-parallel-size 16 \
+--enable_expert_parallel \
+--trust-remote-code \
+--no-enable-prefix-caching
+```
+
+Once your server is started, you can query the model with input prompts.
+
+```bash
+curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
+  "model": "kimi-k2-thinking",
+  "messages": [
+    {"role": "user", "content": "Who are you?"}
+  ],
+  "temperature": 1.0
+}'
+```
@@ -49,6 +49,10 @@
     "vllm-ascend/DeepSeek-V3.1-W4A8-puring"
 ]
 
+KIMI_W4A16_MODELS = [
+    "vllm-ascend/Kimi-K2-Thinking-Pruning",
+]
+
 
 def test_models_distributed_QwQ():
     example_prompts = [
@@ -250,3 +254,24 @@ def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(model):
             quantization="ascend",
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+@pytest.mark.parametrize("model", KIMI_W4A16_MODELS)
+def test_models_distributed_Kimi_K2_Thinking_W4A16(model):
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+
+    with VllmRunner(
+            model,
+            max_model_len=8192,
+            dtype="auto",
+            tensor_parallel_size=4,
+            enable_expert_parallel=True,
+            compilation_config={
+                "cudagraph_mode": "FULL_DECODE_ONLY",
+                "cudagraph_capture_sizes": [1],
+            },
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)