30 changes: 30 additions & 0 deletions .github/workflows/ascend-build-and-test.yml
@@ -0,0 +1,30 @@
+name: Ascend-Build-And-Test
+
+on:
+  push:
+    branches: [ "triton_v3.2.x" ]
+  pull_request:
+    branches: [ "triton_v3.2.x" ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  ascend-build-and-test:
+    runs-on: ascend
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: FlagTree Build on Ascend
+        shell: bash
+        run: |
+          source ~/env.sh
+          cd python
+          MAX_JOBS=32 python3.9 -m pip install . --no-build-isolation
+
+      - name: FlagTree Test on Ascend
+        shell: bash
+        run: |
+          python3.9 ../third_party/ascend/test/tutorials/01-vector-add.py
4 changes: 2 additions & 2 deletions .github/workflows/code-format-check.yml
@@ -2,9 +2,9 @@ name: Code-Format-Check

 on:
   push:
-    branches: [ "main" ]
+    branches: [ "main", "triton_v3.2.x" ]
   pull_request:
-    branches: [ "main" ]
+    branches: [ "main", "triton_v3.2.x" ]

 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
40 changes: 35 additions & 5 deletions .github/workflows/nv-build-and-test.yml
@@ -1,10 +1,12 @@
 name: NV-Build-And-Test

 on:
+  schedule:
+    - cron: '0 21 * * *'
   push:
-    branches: [ "main" ]
+    branches: [ "main", "triton_v3.2.x", "triton_v3.3.x" ]
   pull_request:
-    branches: [ "main" ]
+    branches: [ "main", "triton_v3.2.x", "triton_v3.3.x" ]

 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -17,14 +19,42 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4

-      - name: FlagTree Build on NVIDIA-A100
+      - name: Detect Target Branch
+        shell: bash
+        run: |
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            TARGET_BRANCH="${{ github.base_ref }}"
+          else
+            TARGET_BRANCH="${{ github.ref_name }}"
+          fi
+          echo "TARGET_BRANCH=$TARGET_BRANCH" >> $GITHUB_ENV
+          echo "TARGET_BRANCH=$TARGET_BRANCH"
+
+      - name: FlagTree Build (Main branch)
+        if: ${{ env.TARGET_BRANCH == 'main' }}
         shell: bash
         run: |
           source ~/env.sh
           cd python
-          MAX_JOBS=20 pip3.11 install . --no-build-isolation
+          MAX_JOBS=32 pip3.11 install . --no-build-isolation
+
+      - name: FlagTree Build (triton_v3.2.x branch)
+        if: ${{ env.TARGET_BRANCH == 'triton_v3.2.x' }}
+        shell: bash
+        run: |
+          source ~/env-3.2.sh
+          cd python
+          MAX_JOBS=32 pip3.11 install . --no-build-isolation
+
+      - name: FlagTree Build (triton_v3.3.x branch)
+        if: ${{ env.TARGET_BRANCH == 'triton_v3.3.x' }}
+        shell: bash
+        run: |
+          source ~/env-3.3.sh
+          cd python
+          MAX_JOBS=32 pip3.11 install . --no-build-isolation

-      - name: FlagTree Test on NVIDIA-A100
+      - name: FlagTree Test
         shell: bash
         run: |
           pytest -s python/test/unit
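Note: the `Detect Target Branch` step works by appending `TARGET_BRANCH=...` to `$GITHUB_ENV`, which is what makes the value visible to the `if:` conditions on the build steps that follow. On scheduled runs, `github.ref_name` resolves to the repository's default branch, so the nightly cron takes the `main` build path. A minimal plain-bash sketch of the same selection logic, with hypothetical stand-ins for the values GitHub Actions injects:

```shell
# Local simulation of the step above; the three values are hypothetical
# stand-ins for what GitHub Actions injects at runtime.
GITHUB_EVENT_NAME="pull_request"   # or "push" / "schedule"
GITHUB_BASE_REF="triton_v3.2.x"    # PR target branch (empty outside PRs)
GITHUB_REF_NAME="my-feature"       # branch that triggered the run

if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
  TARGET_BRANCH="$GITHUB_BASE_REF"
else
  TARGET_BRANCH="$GITHUB_REF_NAME"
fi
echo "TARGET_BRANCH=$TARGET_BRANCH"   # pull_request -> triton_v3.2.x
```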
76 changes: 0 additions & 76 deletions .github/workflows/wheels_v2.yml

This file was deleted.

15 changes: 2 additions & 13 deletions README.md
@@ -53,21 +53,10 @@ python3 -m pip install . --no-build-isolation -v
 ```
 ```shell
 # ascend
-# manually download LLVM
-cd ${YOUR_LLVM_DOWNLOAD_DIR}
-# if the output of `uname -a` is x64 or x86_64
-wget https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-b5cc222d-ubuntu-x64.tar.gz
-tar -zxvf llvm-b5cc222d-ubuntu-x64.tar.gz
-export LLVM_BUILD_DIR=${YOUR_LLVM_DOWNLOAD_DIR}/llvm-b5cc222d-ubuntu-x64
-# if the output of `uname -a` is aarch64
+# Recommended: Use the Dockerfile flagtree/dockerfiles/Dockerfile-ubuntu20.04-python3.9-ascend
+mkdir -p ~/.flagtree/ascend; cd ~/.flagtree/ascend
-wget https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-b5cc222d-ubuntu-arm64.tar.gz
-tar -zxvf llvm-b5cc222d-ubuntu-arm64.tar.gz
-export LLVM_BUILD_DIR=${YOUR_LLVM_DOWNLOAD_DIR}/llvm-b5cc222d-ubuntu-arm64
 # build
 cd ${YOUR_CODE_DIR}/flagtree/python
-export LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include
-export LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib
-export LLVM_SYSPATH=$LLVM_BUILD_DIR
 export FLAGTREE_BACKEND=ascend
 python3 -m pip install . --no-build-isolation -v
 ```
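With this change the README no longer asks users to fetch LLVM by hand for the ascend backend; `python/setup_helper.py` (further down in this diff) downloads the arm64 LLVM build itself and exports the `LLVM_*` variables. Reading `pre_hock=lambda: check_env('LLVM_SYSPATH')` there suggests a pre-downloaded toolchain can still be supplied; a hedged sketch, assuming `check_env` skips the automatic download when the variable is already set:

```shell
# Assumed manual override (unverified): if LLVM_SYSPATH is already exported,
# setup_helper.py's pre_hock check presumably skips the automatic download.
export LLVM_SYSPATH=${YOUR_LLVM_DOWNLOAD_DIR}/llvm-b5cc222d-ubuntu-arm64
cd ${YOUR_CODE_DIR}/flagtree/python
export FLAGTREE_BACKEND=ascend
python3 -m pip install . --no-build-isolation -v
```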
15 changes: 2 additions & 13 deletions README_cn.md
@@ -53,21 +53,10 @@ python3 -m pip install . --no-build-isolation -v
 ```
 ```shell
 # ascend
-# download LLVM manually
-cd ${YOUR_LLVM_DOWNLOAD_DIR}
-# if the output of `uname -a` is x64 or x86_64
-wget https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-b5cc222d-ubuntu-x64.tar.gz
-tar -zxvf llvm-b5cc222d-ubuntu-x64.tar.gz
-export LLVM_BUILD_DIR=${YOUR_LLVM_DOWNLOAD_DIR}/llvm-b5cc222d-ubuntu-x64
-# if the output of `uname -a` is aarch64
+# Recommended: use the image flagtree/dockerfiles/Dockerfile-ubuntu20.04-python3.9-ascend
+mkdir -p ~/.flagtree/ascend; cd ~/.flagtree/ascend
-wget https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-b5cc222d-ubuntu-arm64.tar.gz
-tar -zxvf llvm-b5cc222d-ubuntu-arm64.tar.gz
-export LLVM_BUILD_DIR=${YOUR_LLVM_DOWNLOAD_DIR}/llvm-b5cc222d-ubuntu-arm64
 # build and install
 cd ${YOUR_CODE_DIR}/flagtree/python
-export LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include
-export LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib
-export LLVM_SYSPATH=$LLVM_BUILD_DIR
 export FLAGTREE_BACKEND=ascend
 python3 -m pip install . --no-build-isolation -v
 ```
31 changes: 31 additions & 0 deletions dockerfiles/Dockerfile-ubuntu20.04-python3.9-ascend
@@ -0,0 +1,31 @@
+FROM swr.cn-south-1.myhuaweicloud.com/ascendhub/ascend-pytorch:24.0.0-A1-2.1.0-ubuntu20.04
+
+RUN apt-get update && \
+    apt-get install zip unzip git vim zstd libzstd-dev && \
+    apt-get install zlib1g zlib1g-dev libxml2 libxml2-dev && \
+    apt-get install clang lld
+
+RUN pip3 install -U pip && \
+    pip3 install numpy && \
+    pip3 install decorator && \
+    pip3 install sympy==1.4 && \
+    pip3 install cffi==1.12.3 && \
+    pip3 install pyyaml && \
+    pip3 install pathlib2 && \
+    pip3 install protobuf attrs attr && \
+    pip3 install scipy && \
+    pip3 install requests psutil absl-py && \
+    pip3 install ninja cmake wheel pybind11 && \
+    pip3 install setuptools==75.1.0 && \
+    pip3 install attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 && \
+    pip3 install pytest==8.3.2 pytest-xdist==3.6.1 pyyaml torch==2.3.1 torchvision==0.18.1 torch-npu==2.3.1.post2 && \
+    pip3 install scikit-build==0.18.1 scikit_build_core==0.11.1 && \
+    pip3 install pre-commit torch_npu==2.6.0rc1 && \
+    rm -rf /root/.cache/pip
+
+ENV LD_LIBRARY_PATH=/usr/lib/aarch64-linux-gnu/hdf5/serial:$LD_LIBRARY_PATH
+
+RUN if [ ! -d "/lib64" ]; \
+    then \
+        mkdir /lib64 && ln -sf /lib/ld-linux-aarch64.so.1 /lib64/ld-linux-aarch64.so.1; \
+    fi
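The README changes above point at this image as the recommended ascend build environment. A minimal sketch of building and entering it (the `flagtree-ascend:dev` tag is an arbitrary choice; flags for mapping the host's NPU driver and devices into the container are host-specific and omitted here):

```shell
# Build the image from the repo root, then open a shell in it.
docker build -f dockerfiles/Dockerfile-ubuntu20.04-python3.9-ascend -t flagtree-ascend:dev .
docker run -it --rm flagtree-ascend:dev bash
```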
15 changes: 7 additions & 8 deletions python/setup_helper.py
@@ -39,7 +39,6 @@ class FlagTreeBackend:
 }

 set_llvm_env = lambda path: set_env({
-    'LLVM_BUILD_DIR': path,
     'LLVM_INCLUDE_DIRS': Path(path) / "include",
     'LLVM_LIBRARY_DIR': Path(path) / "lib",
     'LLVM_SYSPATH': path,
@@ -388,7 +387,7 @@ def check_env(env_val):
file="iluvatar-llvm18-x86_64",
condition=("iluvatar" == flagtree_backend),
url="https://github.com/FlagTree/flagtree/releases/download/v0.1.0-build-deps/iluvatar-llvm18-x86_64.tar.gz",
pre_hock=lambda: check_env('LLVM_BUILD_DIR'),
pre_hock=lambda: check_env('LLVM_SYSPATH'),
post_hock=set_llvm_env,
)

@@ -397,7 +396,7 @@ def check_env(env_val):
file="XTDK-llvm18-ubuntu2004_x86_64",
condition=("xpu" == flagtree_backend),
url="https://github.com/FlagTree/flagtree/releases/download/v0.1.0-build-deps/XTDK-llvm18-ubuntu2004_x86_64.tar",
pre_hock=lambda: check_env('LLVM_BUILD_DIR'),
pre_hock=lambda: check_env('LLVM_SYSPATH'),
post_hock=set_llvm_env,
)

@@ -408,10 +407,10 @@ def check_env(env_val):
 cache.store(
     files=("clang", "xpu-xxd", "xpu3-crt.xpu", "xpu-kernel.t", "ld.lld", "llvm-readelf", "llvm-objdump",
            "llvm-objcopy"), condition=("xpu" == flagtree_backend),
-    copy_src_path=f"{os.environ.get('LLVM_BUILD_DIR','')}/bin", copy_dst_path="third_party/xpu/backend/xpu3/bin")
+    copy_src_path=f"{os.environ.get('LLVM_SYSPATH','')}/bin", copy_dst_path="third_party/xpu/backend/xpu3/bin")

 cache.store(files=("libclang_rt.builtins-xpu3.a", "libclang_rt.builtins-xpu3s.a"),
-            condition=("xpu" == flagtree_backend), copy_src_path=f"{os.environ.get('LLVM_BUILD_DIR','')}/lib/linux",
+            condition=("xpu" == flagtree_backend), copy_src_path=f"{os.environ.get('LLVM_SYSPATH','')}/lib/linux",
             copy_dst_path="third_party/xpu/backend/xpu3/lib/linux")

 cache.store(files=("include", "so"), condition=("xpu" == flagtree_backend),
@@ -423,15 +422,15 @@
condition=("mthreads" == flagtree_backend),
url=
"https://github.com/FlagTree/flagtree/releases/download/v0.1.0-build-deps/mthreads-llvm19-glibc2.34-glibcxx3.4.30-x64.tar.gz",
pre_hock=lambda: check_env('LLVM_BUILD_DIR'),
pre_hock=lambda: check_env('LLVM_SYSPATH'),
post_hock=set_llvm_env,
)

# ascend
cache.store(
file="ascend-llvm-b5cc222d-ubuntu-x64.tar.gz",
file="ascend-llvm-b5cc222d-ubuntu-arm64",
condition=("ascend" == flagtree_backend),
url="https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-b5cc222d-ubuntu-x64.tar.gz",
url="https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-b5cc222d-ubuntu-arm64.tar.gz",
pre_hock=lambda: check_env('LLVM_SYSPATH'),
post_hock=set_llvm_env,
)
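After this change, `LLVM_SYSPATH` is the single environment variable the build keys off: `set_llvm_env` derives the include and library paths from it, and `LLVM_BUILD_DIR` is intentionally no longer set. The manual equivalent of what `set_llvm_env(path)` exports, per the dictionary keys visible in the first hunk of this file:

```shell
# Manual equivalent of set_llvm_env(path); /path/to/llvm stands in for the
# extracted LLVM directory that setup_helper.py downloads.
export LLVM_SYSPATH=/path/to/llvm
export LLVM_INCLUDE_DIRS=$LLVM_SYSPATH/include
export LLVM_LIBRARY_DIR=$LLVM_SYSPATH/lib
```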
80 changes: 80 additions & 0 deletions third_party/ascend/python/tutorials/01-vector-add.py
@@ -0,0 +1,80 @@
"""
Vector Addition
===============

In this tutorial, you will write a simple vector addition using Triton.

In doing so, you will learn about:

* The basic programming model of Triton.

* The `triton.jit` decorator, which is used to define Triton kernels.

* The best practices for validating and benchmarking your custom ops against native reference implementations.

"""

# %%
# Compute Kernel
# --------------

import torch
import torch_npu

import triton
import triton.language as tl


@triton.jit
def add_kernel(x_ptr, # *Pointer* to first input vector.
y_ptr, # *Pointer* to second input vector.
output_ptr, # *Pointer* to output vector.
n_elements, # Size of the vector.
BLOCK_SIZE: tl.constexpr, # Number of elements each program should process.
# NOTE: `constexpr` so it can be used as a shape value.
):
# There are multiple 'programs' processing different data. We identify which program
# we are here:
pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0.
# This program will process inputs that are offset from the initial data.
# For instance, if you had a vector of length 256 and block_size of 64, the programs
# would each access the elements [0:64, 64:128, 128:192, 192:256].
# Note that offsets is a list of pointers:
block_start = pid * BLOCK_SIZE
offsets = block_start + tl.arange(0, BLOCK_SIZE)
# Create a mask to guard memory operations against out-of-bounds accesses.
mask = offsets < n_elements
# Load x and y from DRAM, masking out any extra elements in case the input is not a
# multiple of the block size.
x = tl.load(x_ptr + offsets, mask=mask)
y = tl.load(y_ptr + offsets, mask=mask)
output = x + y
# Write x + y back to DRAM.
tl.store(output_ptr + offsets, output, mask=mask)


# %%
# Let's also declare a helper function to (1) allocate the `z` tensor
# and (2) enqueue the above kernel with appropriate grid/block sizes:


def add(x: torch.Tensor, y: torch.Tensor):
output = torch.empty_like(x)
n_elements = output.numel()
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )
add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
return output


# %%
# We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:
torch.manual_seed(0)
size = 98432
x = torch.rand(size, device='npu')
y = torch.rand(size, device='npu')
output_torch = x + y
output_triton = add(x, y)
print(output_torch)
print(output_triton)
print(f'The maximum difference between torch and triton is '
f'{torch.max(torch.abs(output_torch - output_triton))}')
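To try the new tutorial on an Ascend machine after installing FlagTree, something like the following should work. Hedged: `~/env.sh` and the `python3.9` interpreter are taken from the Ascend CI workflow above and will differ per installation; note also that the workflow invokes the script under `third_party/ascend/test/tutorials/`, while this file is added under `third_party/ascend/python/tutorials/`.

```shell
# Run from the repo root; env.sh provides the CANN/driver environment,
# mirroring the Ascend CI job above.
source ~/env.sh
python3.9 third_party/ascend/python/tutorials/01-vector-add.py
```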