Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/pr-test-npu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
if: needs.check-changes.outputs.main_package == 'true'
runs-on: linux-arm64-npu-1
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand Down Expand Up @@ -90,7 +90,7 @@ jobs:
matrix:
part: [0, 1, 2]
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand Down Expand Up @@ -129,7 +129,7 @@ jobs:
if: needs.check-changes.outputs.main_package == 'true'
runs-on: linux-arm64-npu-4
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand Down Expand Up @@ -172,7 +172,7 @@ jobs:
matrix:
part: [0, 1]
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-a3-ubuntu22.04-py3.11
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/release-docker-npu-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
runs-on: ubuntu-22.04-arm
strategy:
matrix:
cann_version: ["8.2.rc1"]
cann_version: ["8.3.rc1"]
device_type: ["910b", "a3"]
steps:
- name: Checkout repository
Expand Down Expand Up @@ -73,6 +73,6 @@ jobs:
push: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
provenance: false
build-args: |
SGLANG_KERNEL_NPU_TAG=20251110
SGLANG_KERNEL_NPU_TAG=20251120
CANN_VERSION=${{ matrix.cann_version }}
DEVICE_TYPE=${{ matrix.device_type }}
4 changes: 2 additions & 2 deletions .github/workflows/release-docker-npu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
runs-on: ubuntu-22.04-arm
strategy:
matrix:
cann_version: ["8.2.rc1"]
cann_version: ["8.3.rc1"]
device_type: ["910b", "a3"]
steps:
- name: Checkout repository
Expand Down Expand Up @@ -70,6 +70,6 @@ jobs:
push: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
provenance: false
build-args: |
SGLANG_KERNEL_NPU_TAG=20251110
SGLANG_KERNEL_NPU_TAG=20251120
CANN_VERSION=${{ matrix.cann_version }}
DEVICE_TYPE=${{ matrix.device_type }}
51 changes: 26 additions & 25 deletions docker/npu.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,25 +1,27 @@
ARG CANN_VERSION=8.2.rc1
ARG CANN_VERSION=8.3.rc1
ARG DEVICE_TYPE=a3
ARG OS=ubuntu22.04
ARG PYTHON_VERSION=py3.11

FROM quay.io/ascend/cann:$CANN_VERSION-$DEVICE_TYPE-$OS-$PYTHON_VERSION

# Update pip & apt sources
ARG DEVICE_TYPE
ARG PIP_INDEX_URL="https://pypi.org/simple/"
ARG APTMIRROR=""
ARG MEMFABRIC_URL=https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl
ARG PYTORCH_VERSION=2.6.0
ARG TORCHVISION_VERSION=0.21.0
ARG PTA_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/ops/torch_npu-2.6.0.post2%2Bgit95d6260-cp311-cp311-linux_aarch64.whl"
ARG VLLM_TAG=v0.8.5
ARG PYTORCH_VERSION="2.8.0"
ARG TORCHVISION_VERSION="0.23.0"
ARG PTA_VERSION="v7.2.0-pytorch${PYTORCH_VERSION}"
ARG PTA_NAME="torch_npu-${PYTORCH_VERSION}-cp311-cp311-manylinux_2_28_aarch64.whl"
ARG PTA_URL="https://gitcode.com/Ascend/pytorch/releases/download/${PTA_VERSION}/${PTA_NAME}"
ARG TRITON_ASCEND_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/triton_ascend-3.2.0%2Bgitb0ea0850-cp311-cp311-linux_aarch64.whl"
ARG BISHENG_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/Ascend-BiSheng-toolkit_aarch64.run"
ARG SGLANG_TAG=main
ARG ASCEND_CANN_PATH=/usr/local/Ascend/ascend-toolkit
ARG SGLANG_KERNEL_NPU_TAG=main

ARG PIP_INSTALL="python3 -m pip install --no-cache-dir"
ARG DEVICE_TYPE

WORKDIR /workspace

# Define environments
Expand Down Expand Up @@ -54,45 +56,44 @@ ENV LANG=en_US.UTF-8
ENV LANGUAGE=en_US:en
ENV LC_ALL=en_US.UTF-8

# Install dependencies
# TODO: install from pypi released memfabric
RUN pip install $MEMFABRIC_URL --no-cache-dir

# Install vLLM
RUN git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG && \
(cd vllm && VLLM_TARGET_DEVICE="empty" pip install -v . --no-cache-dir) && rm -rf vllm
### Install MemFabric
RUN ${PIP_INSTALL} mf-adapter==1.0.0
### Install SGLang Model Gateway
RUN ${PIP_INSTALL} sglang-router


### Install PyTorch and PTA
RUN (${PIP_INSTALL} torch==${PYTORCH_VERSION} torchvision==${TORCHVISION_VERSION} --index-url https://download.pytorch.org/whl/cpu) && \
(wget -O "${PTA_NAME}" "${PTA_URL}" && ${PIP_INSTALL} "./${PTA_NAME}" && rm "./${PTA_NAME}")


# TODO: install from pypi released triton-ascend
RUN pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu --no-cache-dir \
&& wget ${PTA_URL} && pip install "./torch_npu-2.6.0.post2+git95d6260-cp311-cp311-linux_aarch64.whl" --no-cache-dir \
&& python3 -m pip install --no-cache-dir attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11 \
&& pip install ${TRITON_ASCEND_URL} --no-cache-dir
RUN ${PIP_INSTALL} attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11 && \
${PIP_INSTALL} ${TRITON_ASCEND_URL}

# Install SGLang
RUN git clone https://github.com/sgl-project/sglang --branch $SGLANG_TAG && \
(cd sglang/python && rm -rf pyproject.toml && mv pyproject_other.toml pyproject.toml && pip install -v .[srt_npu] --no-cache-dir) && \
(cd sglang/python && rm -rf pyproject.toml && mv pyproject_other.toml pyproject.toml && ${PIP_INSTALL} -v .[srt_npu]) && \
rm -rf sglang

# Install SGLang Model Gateway
RUN pip install sglang-router --no-cache-dir

# Install Deep-ep
# pin wheel to 0.45.1 ref: https://github.com/pypa/wheel/issues/662
RUN pip install wheel==0.45.1 && git clone --branch $SGLANG_KERNEL_NPU_TAG https://github.com/sgl-project/sgl-kernel-npu.git \
RUN ${PIP_INSTALL} wheel==0.45.1 && git clone --branch $SGLANG_KERNEL_NPU_TAG https://github.com/sgl-project/sgl-kernel-npu.git \
&& export LD_LIBRARY_PATH=${ASCEND_CANN_PATH}/latest/runtime/lib64/stub:$LD_LIBRARY_PATH && \
source ${ASCEND_CANN_PATH}/set_env.sh && \
cd sgl-kernel-npu && \
bash build.sh \
&& pip install output/deep_ep*.whl output/sgl_kernel_npu*.whl --no-cache-dir \
&& ${PIP_INSTALL} output/deep_ep*.whl output/sgl_kernel_npu*.whl \
&& cd .. && rm -rf sgl-kernel-npu \
&& cd "$(pip show deep-ep | awk '/^Location:/ {print $2}')" && ln -s deep_ep/deep_ep_cpp*.so
&& cd "$(python3 -m pip show deep-ep | awk '/^Location:/ {print $2}')" && ln -s deep_ep/deep_ep_cpp*.so

# Install CustomOps
RUN wget https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/ops/CANN-custom_ops-8.2.0.0-$DEVICE_TYPE-linux.aarch64.run && \
chmod a+x ./CANN-custom_ops-8.2.0.0-$DEVICE_TYPE-linux.aarch64.run && \
./CANN-custom_ops-8.2.0.0-$DEVICE_TYPE-linux.aarch64.run --quiet --install-path=/usr/local/Ascend/ascend-toolkit/latest/opp && \
wget https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/ops/custom_ops-1.0.$DEVICE_TYPE-cp311-cp311-linux_aarch64.whl && \
pip install ./custom_ops-1.0.$DEVICE_TYPE-cp311-cp311-linux_aarch64.whl
${PIP_INSTALL} ./custom_ops-1.0.$DEVICE_TYPE-cp311-cp311-linux_aarch64.whl

# Install Bisheng
RUN wget ${BISHENG_URL} && chmod a+x Ascend-BiSheng-toolkit_aarch64.run && ./Ascend-BiSheng-toolkit_aarch64.run --install && rm Ascend-BiSheng-toolkit_aarch64.run
Expand Down
31 changes: 7 additions & 24 deletions docs/platforms/ascend_npu.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,41 +48,23 @@ conda activate sglang_npu

#### MemFabric Adaptor

_TODO: MemFabric is still a working project yet open sourced til August/September, 2025. We will release it as prebuilt wheel package for now._

_Notice: Prebuilt wheel package is based on `aarch64`, please leave an issue [here at sglang](https://github.com/sgl-project/sglang/issues) to let us know the requests for `amd64` build._
_TODO: MemFabric is still a work in progress and is not expected to be open sourced until the end of 2025. Until then, we release it as a prebuilt wheel package._

MemFabric Adaptor is a drop-in replacement for the Mooncake Transfer Engine that enables KV cache transfer on Ascend NPU clusters.

```shell
MF_WHL_NAME="mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl"
MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${MF_WHL_NAME}"
wget -O "${MF_WHL_NAME}" "${MEMFABRIC_URL}" && pip install "./${MF_WHL_NAME}"
pip install mf-adapter==1.0.0
```

#### Pytorch and Pytorch Framework Adaptor on Ascend

Only `torch==2.6.0` is supported currently due to NPUgraph and Triton-on-Ascend's limitation, however a more generalized version will be release by the end of September, 2025.

```shell
PYTORCH_VERSION=2.6.0
TORCHVISION_VERSION=0.21.0
PYTORCH_VERSION="2.8.0"
TORCHVISION_VERSION="0.23.0"
pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu

PTA_VERSION="v7.1.0.1-pytorch2.6.0"
PTA_NAME="torch_npu-2.6.0.post1-cp311-cp311-manylinux_2_28_aarch64.whl"
PTA_URL="https://gitee.com/ascend/pytorch/releases/download/${PTA_VERSION}/${PTA_WHL_NAME}"
wget -O "${PTA_NAME}" "${PTA_URL}" && pip install "./${PTA_NAME}"
```

#### vLLM

vLLM is still a major prerequisite on Ascend NPU. Because of `torch==2.6.0` limitation, only vLLM v0.8.5 is supported.

```shell
VLLM_TAG=v0.8.5
git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG
(cd vllm && VLLM_TARGET_DEVICE="empty" pip install -v -e .)
PTA_VERSION="2.8.0"
pip install torch-npu==$PTA_VERSION
```

#### Triton on Ascend
Expand All @@ -103,6 +85,7 @@ git clone -b v0.5.5.post3 https://github.com/sgl-project/sglang.git
cd sglang

pip install --upgrade pip
rm -vf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml
pip install -e python[srt_npu]
```

Expand Down
1 change: 1 addition & 0 deletions python/pyproject_other.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ runtime_common = [
"datasets",
"einops",
"fastapi",
"gguf",
"hf_transfer",
"huggingface_hub",
"interegular",
Expand Down
8 changes: 7 additions & 1 deletion python/sglang/check_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,9 +311,15 @@ def get_topology(self):
class NPUEnv(BaseEnv):
"""Environment checker for Ascend NPU"""

EXTRA_PACKAGE_LIST = [
"torch_npu",
"sgl-kernel-npu",
"deep_ep",
]

def __init__(self):
super().__init__()
self.package_list = ["torch_npu", "sgl-kernel-npu"] + self.package_list
self.package_list.extend(NPUEnv.EXTRA_PACKAGE_LIST)

def get_info(self):
cuda_info = {"NPU available": torch.npu.is_available()}
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/srt/layers/attention/ascend_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ def forward_extend(
num_key_value_heads=layer.tp_k_head_num,
input_layout="BSND", # todo, TND not supports q_heads!=k_heads
atten_mask=self.fia_mask.unsqueeze(0),
sparse_mode=3,
sparse_mode=3 if q_len != 1 else 0,
scale=layer.scaling,
next_tokens=0,
)[0]
Expand Down
4 changes: 2 additions & 2 deletions python/sglang/test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1245,8 +1245,8 @@ def run_bench_offline_throughput(model, other_args):

try:
stdout, stderr = process.communicate()
output = stdout.decode()
error = stderr.decode()
output = stdout.decode(errors="backslashreplace")
error = stderr.decode(errors="backslashreplace")
print(f"Output: {output}", flush=True)
print(f"Error: {error}", flush=True)

Expand Down
36 changes: 14 additions & 22 deletions scripts/ci/npu_ci_install_dependency.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
set -euo pipefail

PIP_INSTALL="pip install --no-cache-dir"
PIP_INSTALL="python3 -m pip install --no-cache-dir"
DEVICE_TYPE=$1


Expand All @@ -19,29 +19,23 @@ apt update -y && apt install -y \
ccache \
ca-certificates
update-ca-certificates
python3 -m ${PIP_INSTALL} --upgrade pip
${PIP_INSTALL} --upgrade pip
# Pin wheel to 0.45.1, REF: https://github.com/pypa/wheel/issues/662
${PIP_INSTALL} wheel==0.45.1


### Download MemFabricV2
MF_WHL_NAME="mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl"
MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${MF_WHL_NAME}"
wget -O "${MF_WHL_NAME}" "${MEMFABRIC_URL}" && ${PIP_INSTALL} "./${MF_WHL_NAME}"


### Install vLLM
VLLM_TAG=v0.8.5
git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG
(cd vllm && VLLM_TARGET_DEVICE="empty" ${PIP_INSTALL} -v -e .)
### Install MemFabric
${PIP_INSTALL} mf-adapter==1.0.0


### Install PyTorch and PTA
PYTORCH_VERSION=2.6.0
TORCHVISION_VERSION=0.21.0
${PIP_INSTALL} torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu
PYTORCH_VERSION="2.8.0"
TORCHVISION_VERSION="0.23.0"
${PIP_INSTALL} torch==${PYTORCH_VERSION} torchvision==${TORCHVISION_VERSION} --index-url https://download.pytorch.org/whl/cpu

PTA_VERSION="v7.1.0.1-pytorch2.6.0"
PTA_NAME="torch_npu-2.6.0.post2+git95d6260-cp311-cp311-linux_aarch64.whl"
PTA_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/ops/torch_npu-2.6.0.post2%2Bgit95d6260-cp311-cp311-linux_aarch64.whl"
PTA_VERSION="v7.2.0-pytorch${PYTORCH_VERSION}"
PTA_NAME="torch_npu-${PYTORCH_VERSION}-cp311-cp311-manylinux_2_28_aarch64.whl"
PTA_URL="https://gitcode.com/Ascend/pytorch/releases/download/${PTA_VERSION}/${PTA_NAME}"
wget -O "${PTA_NAME}" "${PTA_URL}" && ${PIP_INSTALL} "./${PTA_NAME}"


Expand All @@ -59,11 +53,9 @@ wget -O "${BISHENG_NAME}" "${BISHENG_URL}" && chmod a+x "${BISHENG_NAME}" && "./


### Install sgl-kernel-npu
SGL_KERNEL_NPU_TAG="20251110"
SGL_KERNEL_NPU_TAG="20251120"
git clone --depth 1 https://github.com/sgl-project/sgl-kernel-npu.git --branch ${SGL_KERNEL_NPU_TAG}
# pin wheel to 0.45.1 ref: https://github.com/pypa/wheel/issues/662
pip install wheel==0.45.1
(cd sgl-kernel-npu && bash ./build.sh && pip install output/deep_ep*.whl output/sgl_kernel_npu*.whl && cd "$(pip show deep-ep | grep -E '^Location:' | awk '{print $2}')" && ln -s deep_ep/deep_ep_cpp*.so)
(cd sgl-kernel-npu && bash ./build.sh && ${PIP_INSTALL} output/deep_ep*.whl output/sgl_kernel_npu*.whl && cd "$(python3 -m pip show deep-ep | grep -E '^Location:' | awk '{print $2}')" && ln -s deep_ep/deep_ep_cpp*.so)


### Install CustomOps (TODO: to be removed once merged into sgl-kernel-npu)
Expand Down
Loading