diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml index b28be34d71b8..d47a3961531f 100644 --- a/.github/workflows/pr-test-npu.yml +++ b/.github/workflows/pr-test-npu.yml @@ -47,7 +47,7 @@ jobs: if: needs.check-changes.outputs.main_package == 'true' runs-on: linux-arm64-npu-1 container: - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 steps: - name: Checkout code uses: actions/checkout@v4 @@ -90,7 +90,7 @@ jobs: matrix: part: [0, 1, 2] container: - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 steps: - name: Checkout code uses: actions/checkout@v4 @@ -129,7 +129,7 @@ jobs: if: needs.check-changes.outputs.main_package == 'true' runs-on: linux-arm64-npu-4 container: - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 steps: - name: Checkout code uses: actions/checkout@v4 @@ -172,7 +172,7 @@ jobs: matrix: part: [0, 1] container: - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-a3-ubuntu22.04-py3.11 steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/.github/workflows/release-docker-npu-nightly.yml b/.github/workflows/release-docker-npu-nightly.yml index 765b45336248..1ede19a35589 100644 --- a/.github/workflows/release-docker-npu-nightly.yml +++ b/.github/workflows/release-docker-npu-nightly.yml @@ -19,7 +19,7 @@ jobs: runs-on: ubuntu-22.04-arm strategy: matrix: - cann_version: ["8.2.rc1"] + cann_version: ["8.3.rc1"] 
device_type: ["910b", "a3"] steps: - name: Checkout repository @@ -73,6 +73,6 @@ jobs: push: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }} provenance: false build-args: | - SGLANG_KERNEL_NPU_TAG=20251110 + SGLANG_KERNEL_NPU_TAG=20251120 CANN_VERSION=${{ matrix.cann_version }} DEVICE_TYPE=${{ matrix.device_type }} diff --git a/.github/workflows/release-docker-npu.yml b/.github/workflows/release-docker-npu.yml index 9afe3dfd6dc5..2b2506a28c63 100644 --- a/.github/workflows/release-docker-npu.yml +++ b/.github/workflows/release-docker-npu.yml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-22.04-arm strategy: matrix: - cann_version: ["8.2.rc1"] + cann_version: ["8.3.rc1"] device_type: ["910b", "a3"] steps: - name: Checkout repository @@ -70,6 +70,6 @@ jobs: push: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }} provenance: false build-args: | - SGLANG_KERNEL_NPU_TAG=20251110 + SGLANG_KERNEL_NPU_TAG=20251120 CANN_VERSION=${{ matrix.cann_version }} DEVICE_TYPE=${{ matrix.device_type }} diff --git a/docker/npu.Dockerfile b/docker/npu.Dockerfile index 48d0f354e496..21a8f7edffb7 100644 --- a/docker/npu.Dockerfile +++ b/docker/npu.Dockerfile @@ -1,4 +1,4 @@ -ARG CANN_VERSION=8.2.rc1 +ARG CANN_VERSION=8.3.rc1 ARG DEVICE_TYPE=a3 ARG OS=ubuntu22.04 ARG PYTHON_VERSION=py3.11 @@ -6,20 +6,22 @@ ARG PYTHON_VERSION=py3.11 FROM quay.io/ascend/cann:$CANN_VERSION-$DEVICE_TYPE-$OS-$PYTHON_VERSION # Update pip & apt sources -ARG DEVICE_TYPE ARG PIP_INDEX_URL="https://pypi.org/simple/" ARG APTMIRROR="" -ARG MEMFABRIC_URL=https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl -ARG PYTORCH_VERSION=2.6.0 -ARG TORCHVISION_VERSION=0.21.0 -ARG PTA_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/ops/torch_npu-2.6.0.post2%2Bgit95d6260-cp311-cp311-linux_aarch64.whl" -ARG VLLM_TAG=v0.8.5 +ARG PYTORCH_VERSION="2.8.0" +ARG TORCHVISION_VERSION="0.23.0" +ARG 
PTA_VERSION="v7.2.0-pytorch${PYTORCH_VERSION}" +ARG PTA_NAME="torch_npu-${PYTORCH_VERSION}-cp311-cp311-manylinux_2_28_aarch64.whl" +ARG PTA_URL="https://gitcode.com/Ascend/pytorch/releases/download/${PTA_VERSION}/${PTA_NAME}" ARG TRITON_ASCEND_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/triton_ascend-3.2.0%2Bgitb0ea0850-cp311-cp311-linux_aarch64.whl" ARG BISHENG_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/Ascend-BiSheng-toolkit_aarch64.run" ARG SGLANG_TAG=main ARG ASCEND_CANN_PATH=/usr/local/Ascend/ascend-toolkit ARG SGLANG_KERNEL_NPU_TAG=main +ARG PIP_INSTALL="python3 -m pip install --no-cache-dir" +ARG DEVICE_TYPE + WORKDIR /workspace # Define environments @@ -54,45 +56,44 @@ ENV LANG=en_US.UTF-8 ENV LANGUAGE=en_US:en ENV LC_ALL=en_US.UTF-8 -# Install dependencies -# TODO: install from pypi released memfabric -RUN pip install $MEMFABRIC_URL --no-cache-dir -# Install vLLM -RUN git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG && \ - (cd vllm && VLLM_TARGET_DEVICE="empty" pip install -v . 
--no-cache-dir) && rm -rf vllm +### Install MemFabric +RUN ${PIP_INSTALL} mf-adapter==1.0.0 +### Install SGLang Model Gateway +RUN ${PIP_INSTALL} sglang-router + + +### Install PyTorch and PTA +RUN (${PIP_INSTALL} torch==${PYTORCH_VERSION} torchvision==${TORCHVISION_VERSION} --index-url https://download.pytorch.org/whl/cpu) && \ + (wget -O "${PTA_NAME}" "${PTA_URL}" && ${PIP_INSTALL} "./${PTA_NAME}" && rm "./${PTA_NAME}") + # TODO: install from pypi released triton-ascend -RUN pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu --no-cache-dir \ - && wget ${PTA_URL} && pip install "./torch_npu-2.6.0.post2+git95d6260-cp311-cp311-linux_aarch64.whl" --no-cache-dir \ - && python3 -m pip install --no-cache-dir attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11 \ - && pip install ${TRITON_ASCEND_URL} --no-cache-dir +RUN ${PIP_INSTALL} attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11 && \ + ${PIP_INSTALL} ${TRITON_ASCEND_URL} # Install SGLang RUN git clone https://github.com/sgl-project/sglang --branch $SGLANG_TAG && \ - (cd sglang/python && rm -rf pyproject.toml && mv pyproject_other.toml pyproject.toml && pip install -v .[srt_npu] --no-cache-dir) && \ + (cd sglang/python && rm -rf pyproject.toml && mv pyproject_other.toml pyproject.toml && ${PIP_INSTALL} -v .[srt_npu]) && \ rm -rf sglang -# Install SGLang Model Gateway -RUN pip install sglang-router --no-cache-dir - # Install Deep-ep # pin wheel to 0.45.1 ref: https://github.com/pypa/wheel/issues/662 -RUN pip install wheel==0.45.1 && git clone --branch $SGLANG_KERNEL_NPU_TAG https://github.com/sgl-project/sgl-kernel-npu.git \ +RUN ${PIP_INSTALL} wheel==0.45.1 && git clone --branch $SGLANG_KERNEL_NPU_TAG https://github.com/sgl-project/sgl-kernel-npu.git \ && export 
LD_LIBRARY_PATH=${ASCEND_CANN_PATH}/latest/runtime/lib64/stub:$LD_LIBRARY_PATH && \ source ${ASCEND_CANN_PATH}/set_env.sh && \ cd sgl-kernel-npu && \ bash build.sh \ - && pip install output/deep_ep*.whl output/sgl_kernel_npu*.whl --no-cache-dir \ + && ${PIP_INSTALL} output/deep_ep*.whl output/sgl_kernel_npu*.whl \ && cd .. && rm -rf sgl-kernel-npu \ - && cd "$(pip show deep-ep | awk '/^Location:/ {print $2}')" && ln -s deep_ep/deep_ep_cpp*.so + && cd "$(python3 -m pip show deep-ep | awk '/^Location:/ {print $2}')" && ln -s deep_ep/deep_ep_cpp*.so # Install CustomOps RUN wget https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/ops/CANN-custom_ops-8.2.0.0-$DEVICE_TYPE-linux.aarch64.run && \ chmod a+x ./CANN-custom_ops-8.2.0.0-$DEVICE_TYPE-linux.aarch64.run && \ ./CANN-custom_ops-8.2.0.0-$DEVICE_TYPE-linux.aarch64.run --quiet --install-path=/usr/local/Ascend/ascend-toolkit/latest/opp && \ wget https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/ops/custom_ops-1.0.$DEVICE_TYPE-cp311-cp311-linux_aarch64.whl && \ - pip install ./custom_ops-1.0.$DEVICE_TYPE-cp311-cp311-linux_aarch64.whl + ${PIP_INSTALL} ./custom_ops-1.0.$DEVICE_TYPE-cp311-cp311-linux_aarch64.whl # Install Bisheng RUN wget ${BISHENG_URL} && chmod a+x Ascend-BiSheng-toolkit_aarch64.run && ./Ascend-BiSheng-toolkit_aarch64.run --install && rm Ascend-BiSheng-toolkit_aarch64.run diff --git a/docs/platforms/ascend_npu.md b/docs/platforms/ascend_npu.md index 9525af5af205..6a3f9ad27e67 100644 --- a/docs/platforms/ascend_npu.md +++ b/docs/platforms/ascend_npu.md @@ -48,41 +48,23 @@ conda activate sglang_npu #### MemFabric Adaptor -_TODO: MemFabric is still a working project yet open sourced til August/September, 2025. 
We will release it as prebuilt wheel package for now._ - -_Notice: Prebuilt wheel package is based on `aarch64`, please leave an issue [here at sglang](https://github.com/sgl-project/sglang/issues) to let us know the requests for `amd64` build._ +_TODO: MemFabric is still a work in progress and will not be open-sourced until the end of 2025. We will release it as a prebuilt wheel package for now._ MemFabric Adaptor is a drop-in replacement of Mooncake Transfer Engine that enables KV cache transfer on Ascend NPU clusters. ```shell -MF_WHL_NAME="mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl" -MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${MF_WHL_NAME}" -wget -O "${MF_WHL_NAME}" "${MEMFABRIC_URL}" && pip install "./${MF_WHL_NAME}" +pip install mf-adapter==1.0.0 ``` #### Pytorch and Pytorch Framework Adaptor on Ascend -Only `torch==2.6.0` is supported currently due to NPUgraph and Triton-on-Ascend's limitation, however a more generalized version will be release by the end of September, 2025. ```shell -PYTORCH_VERSION=2.6.0 -TORCHVISION_VERSION=0.21.0 +PYTORCH_VERSION="2.8.0" +TORCHVISION_VERSION="0.23.0" pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu -PTA_VERSION="v7.1.0.1-pytorch2.6.0" -PTA_NAME="torch_npu-2.6.0.post1-cp311-cp311-manylinux_2_28_aarch64.whl" -PTA_URL="https://gitee.com/ascend/pytorch/releases/download/${PTA_VERSION}/${PTA_WHL_NAME}" -wget -O "${PTA_NAME}" "${PTA_URL}" && pip install "./${PTA_NAME}" ``` #### vLLM -vLLM is still a major prerequisite on Ascend NPU. Because of `torch==2.6.0` limitation, only vLLM v0.8.5 is supported. ```shell -VLLM_TAG=v0.8.5 -git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG -(cd vllm && VLLM_TARGET_DEVICE="empty" pip install -v -e .)
+PTA_VERSION="2.8.0" +pip install torch-npu==$PTA_VERSION ``` #### Triton on Ascend @@ -103,6 +85,7 @@ git clone -b v0.5.5.post3 https://github.com/sgl-project/sglang.git cd sglang pip install --upgrade pip +rm -vf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml pip install -e python[srt_npu] ``` diff --git a/python/pyproject_other.toml b/python/pyproject_other.toml index 3f0b32341ad9..c191a752b41a 100755 --- a/python/pyproject_other.toml +++ b/python/pyproject_other.toml @@ -27,6 +27,7 @@ runtime_common = [ "datasets", "einops", "fastapi", + "gguf", "hf_transfer", "huggingface_hub", "interegular", diff --git a/python/sglang/check_env.py b/python/sglang/check_env.py index e737da0dc4e2..18fa94afadb2 100644 --- a/python/sglang/check_env.py +++ b/python/sglang/check_env.py @@ -311,9 +311,15 @@ def get_topology(self): class NPUEnv(BaseEnv): """Environment checker for Ascend NPU""" + EXTRA_PACKAGE_LIST = [ + "torch_npu", + "sgl-kernel-npu", + "deep_ep", + ] + def __init__(self): super().__init__() - self.package_list = ["torch_npu", "sgl-kernel-npu"] + self.package_list + self.package_list.extend(NPUEnv.EXTRA_PACKAGE_LIST) def get_info(self): cuda_info = {"NPU available": torch.npu.is_available()} diff --git a/python/sglang/srt/layers/attention/ascend_backend.py b/python/sglang/srt/layers/attention/ascend_backend.py index 498cff07b185..54668423e4af 100644 --- a/python/sglang/srt/layers/attention/ascend_backend.py +++ b/python/sglang/srt/layers/attention/ascend_backend.py @@ -376,7 +376,7 @@ def forward_extend( num_key_value_heads=layer.tp_k_head_num, input_layout="BSND", # todo, TND not supports q_heads!=k_heads atten_mask=self.fia_mask.unsqueeze(0), - sparse_mode=3, + sparse_mode=3 if q_len != 1 else 0, scale=layer.scaling, next_tokens=0, )[0] diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 12e6c0fad5e5..98b2e8a4daba 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py 
@@ -1245,8 +1245,8 @@ def run_bench_offline_throughput(model, other_args): try: stdout, stderr = process.communicate() - output = stdout.decode() - error = stderr.decode() + output = stdout.decode(errors="backslashreplace") + error = stderr.decode(errors="backslashreplace") print(f"Output: {output}", flush=True) print(f"Error: {error}", flush=True) diff --git a/scripts/ci/npu_ci_install_dependency.sh b/scripts/ci/npu_ci_install_dependency.sh index 145085f6cfd2..ce092ed5b35e 100755 --- a/scripts/ci/npu_ci_install_dependency.sh +++ b/scripts/ci/npu_ci_install_dependency.sh @@ -1,7 +1,7 @@ #!/bin/bash set -euo pipefail -PIP_INSTALL="pip install --no-cache-dir" +PIP_INSTALL="python3 -m pip install --no-cache-dir" DEVICE_TYPE=$1 @@ -19,29 +19,23 @@ apt update -y && apt install -y \ ccache \ ca-certificates update-ca-certificates -python3 -m ${PIP_INSTALL} --upgrade pip +${PIP_INSTALL} --upgrade pip +# Pin wheel to 0.45.1, REF: https://github.com/pypa/wheel/issues/662 +${PIP_INSTALL} wheel==0.45.1 -### Download MemFabricV2 -MF_WHL_NAME="mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl" -MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/${MF_WHL_NAME}" -wget -O "${MF_WHL_NAME}" "${MEMFABRIC_URL}" && ${PIP_INSTALL} "./${MF_WHL_NAME}" - - -### Install vLLM -VLLM_TAG=v0.8.5 -git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG -(cd vllm && VLLM_TARGET_DEVICE="empty" ${PIP_INSTALL} -v -e .) 
+### Install MemFabric +${PIP_INSTALL} mf-adapter==1.0.0 ### Install PyTorch and PTA -PYTORCH_VERSION=2.6.0 -TORCHVISION_VERSION=0.21.0 -${PIP_INSTALL} torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu +PYTORCH_VERSION="2.8.0" +TORCHVISION_VERSION="0.23.0" +${PIP_INSTALL} torch==${PYTORCH_VERSION} torchvision==${TORCHVISION_VERSION} --index-url https://download.pytorch.org/whl/cpu -PTA_VERSION="v7.1.0.1-pytorch2.6.0" -PTA_NAME="torch_npu-2.6.0.post2+git95d6260-cp311-cp311-linux_aarch64.whl" -PTA_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/ops/torch_npu-2.6.0.post2%2Bgit95d6260-cp311-cp311-linux_aarch64.whl" +PTA_VERSION="v7.2.0-pytorch${PYTORCH_VERSION}" +PTA_NAME="torch_npu-${PYTORCH_VERSION}-cp311-cp311-manylinux_2_28_aarch64.whl" +PTA_URL="https://gitcode.com/Ascend/pytorch/releases/download/${PTA_VERSION}/${PTA_NAME}" wget -O "${PTA_NAME}" "${PTA_URL}" && ${PIP_INSTALL} "./${PTA_NAME}" @@ -59,11 +53,9 @@ wget -O "${BISHENG_NAME}" "${BISHENG_URL}" && chmod a+x "${BISHENG_NAME}" && "./ ### Install sgl-kernel-npu -SGL_KERNEL_NPU_TAG="20251110" +SGL_KERNEL_NPU_TAG="20251120" git clone --depth 1 https://github.com/sgl-project/sgl-kernel-npu.git --branch ${SGL_KERNEL_NPU_TAG} -# pin wheel to 0.45.1 ref: https://github.com/pypa/wheel/issues/662 -pip install wheel==0.45.1 -(cd sgl-kernel-npu && bash ./build.sh && pip install output/deep_ep*.whl output/sgl_kernel_npu*.whl && cd "$(pip show deep-ep | grep -E '^Location:' | awk '{print $2}')" && ln -s deep_ep/deep_ep_cpp*.so) +(cd sgl-kernel-npu && bash ./build.sh && ${PIP_INSTALL} output/deep_ep*.whl output/sgl_kernel_npu*.whl && cd "$(python3 -m pip show deep-ep | grep -E '^Location:' | awk '{print $2}')" && ln -s deep_ep/deep_ep_cpp*.so) ### Install CustomOps (TODO: to be removed once merged into sgl-kernel-npu)