30 changes: 30 additions & 0 deletions .github/workflows/ascend-build-and-test.yml
@@ -0,0 +1,30 @@
+name: Ascend-Build-And-Test
+
+on:
+  push:
+    branches: [ "triton_v3.2.x" ]
+  pull_request:
+    branches: [ "triton_v3.2.x" ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  ascend-build-and-test:
+    runs-on: ascend
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: FlagTree Build on Ascend
+        shell: bash
+        run: |
+          source ~/env.sh
+          cd python
+          MAX_JOBS=32 python3.9 -m pip install . --no-build-isolation
+
+      - name: FlagTree Test on Ascend
+        shell: bash
+        run: |
+          python3.9 ../third_party/ascend/test/tutorials/01-vector-add.py
4 changes: 2 additions & 2 deletions .github/workflows/code-format-check.yml
@@ -2,9 +2,9 @@ name: Code-Format-Check

 on:
   push:
-    branches: [ "main" ]
+    branches: [ "main", "triton_v3.2.x" ]
   pull_request:
-    branches: [ "main" ]
+    branches: [ "main", "triton_v3.2.x" ]

 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
40 changes: 35 additions & 5 deletions .github/workflows/nv-build-and-test.yml
@@ -1,10 +1,12 @@
 name: NV-Build-And-Test

 on:
+  schedule:
+    - cron: '0 21 * * *'
   push:
-    branches: [ "main" ]
+    branches: [ "main", "triton_v3.2.x", "triton_v3.3.x" ]
   pull_request:
-    branches: [ "main" ]
+    branches: [ "main", "triton_v3.2.x", "triton_v3.3.x" ]

 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -17,14 +19,42 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4

-      - name: FlagTree Build on NVIDIA-A100
+      - name: Detect Target Branch
+        shell: bash
+        run: |
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            TARGET_BRANCH="${{ github.base_ref }}"
+          else
+            TARGET_BRANCH="${{ github.ref_name }}"
+          fi
+          echo "TARGET_BRANCH=$TARGET_BRANCH" >> $GITHUB_ENV
+          echo "TARGET_BRANCH=$TARGET_BRANCH"
+
+      - name: FlagTree Build (Main branch)
+        if: ${{ env.TARGET_BRANCH == 'main' }}
         shell: bash
         run: |
           source ~/env.sh
           cd python
-          MAX_JOBS=20 pip3.11 install . --no-build-isolation
+          MAX_JOBS=32 pip3.11 install . --no-build-isolation
+
+      - name: FlagTree Build (triton_v3.2.x branch)
+        if: ${{ env.TARGET_BRANCH == 'triton_v3.2.x' }}
+        shell: bash
+        run: |
+          source ~/env-3.2.sh
+          cd python
+          MAX_JOBS=32 pip3.11 install . --no-build-isolation
+
+      - name: FlagTree Build (triton_v3.3.x branch)
+        if: ${{ env.TARGET_BRANCH == 'triton_v3.3.x' }}
+        shell: bash
+        run: |
+          source ~/env-3.3.sh
+          cd python
+          MAX_JOBS=32 pip3.11 install . --no-build-isolation

-      - name: FlagTree Test on NVIDIA-A100
+      - name: FlagTree Test
         shell: bash
         run: |
           pytest -s python/test/unit
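Note: the `Detect Target Branch` step works by appending `TARGET_BRANCH=...` to `$GITHUB_ENV`, which is what makes the value visible to the `if:` conditions on the build steps that follow. On scheduled runs, `github.ref_name` resolves to the repository's default branch, so the nightly cron takes the `main` build path. A minimal plain-bash sketch of the same selection logic, with hypothetical stand-ins for the values GitHub Actions injects:

```shell
# Local simulation of the step above; the three values are hypothetical
# stand-ins for what GitHub Actions injects at runtime.
GITHUB_EVENT_NAME="pull_request"   # or "push" / "schedule"
GITHUB_BASE_REF="triton_v3.2.x"    # PR target branch (empty outside PRs)
GITHUB_REF_NAME="my-feature"       # branch that triggered the run

if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
  TARGET_BRANCH="$GITHUB_BASE_REF"
else
  TARGET_BRANCH="$GITHUB_REF_NAME"
fi
echo "TARGET_BRANCH=$TARGET_BRANCH"   # pull_request -> triton_v3.2.x
```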
76 changes: 0 additions & 76 deletions .github/workflows/wheels_v2.yml

This file was deleted.

15 changes: 2 additions & 13 deletions README.md
@@ -53,21 +53,10 @@ python3 -m pip install . --no-build-isolation -v
 ```
 ```shell
 # ascend
-# manually download LLVM
-cd ${YOUR_LLVM_DOWNLOAD_DIR}
-# if the output of `uname -a` is x64 or x86_64
-wget https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-b5cc222d-ubuntu-x64.tar.gz
-tar -zxvf llvm-b5cc222d-ubuntu-x64.tar.gz
-export LLVM_BUILD_DIR=${YOUR_LLVM_DOWNLOAD_DIR}/llvm-b5cc222d-ubuntu-x64
-# if the output of `uname -a` is aarch64
+# Recommended: Use the Dockerfile flagtree/dockerfiles/Dockerfile-ubuntu20.04-python3.9-ascend
+mkdir -p ~/.flagtree/ascend; cd ~/.flagtree/ascend
-wget https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-b5cc222d-ubuntu-arm64.tar.gz
-tar -zxvf llvm-b5cc222d-ubuntu-arm64.tar.gz
-export LLVM_BUILD_DIR=${YOUR_LLVM_DOWNLOAD_DIR}/llvm-b5cc222d-ubuntu-arm64
 # build
 cd ${YOUR_CODE_DIR}/flagtree/python
-export LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include
-export LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib
-export LLVM_SYSPATH=$LLVM_BUILD_DIR
 export FLAGTREE_BACKEND=ascend
 python3 -m pip install . --no-build-isolation -v
 ```
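With this change the README no longer asks users to fetch LLVM by hand for the ascend backend; `python/setup_helper.py` (further down in this diff) downloads the arm64 LLVM build itself and exports the `LLVM_*` variables. Reading `pre_hock=lambda: check_env('LLVM_SYSPATH')` there suggests a pre-downloaded toolchain can still be supplied; a hedged sketch, assuming `check_env` skips the automatic download when the variable is already set:

```shell
# Assumed manual override (unverified): if LLVM_SYSPATH is already exported,
# setup_helper.py's pre_hock check presumably skips the automatic download.
export LLVM_SYSPATH=${YOUR_LLVM_DOWNLOAD_DIR}/llvm-b5cc222d-ubuntu-arm64
cd ${YOUR_CODE_DIR}/flagtree/python
export FLAGTREE_BACKEND=ascend
python3 -m pip install . --no-build-isolation -v
```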
15 changes: 2 additions & 13 deletions README_cn.md
@@ -53,21 +53,10 @@ python3 -m pip install . --no-build-isolation -v
 ```
 ```shell
 # ascend
-# download LLVM manually
-cd ${YOUR_LLVM_DOWNLOAD_DIR}
-# if the output of `uname -a` is x64 or x86_64
-wget https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-b5cc222d-ubuntu-x64.tar.gz
-tar -zxvf llvm-b5cc222d-ubuntu-x64.tar.gz
-export LLVM_BUILD_DIR=${YOUR_LLVM_DOWNLOAD_DIR}/llvm-b5cc222d-ubuntu-x64
-# if the output of `uname -a` is aarch64
+# Recommended: use the image flagtree/dockerfiles/Dockerfile-ubuntu20.04-python3.9-ascend
+mkdir -p ~/.flagtree/ascend; cd ~/.flagtree/ascend
-wget https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-b5cc222d-ubuntu-arm64.tar.gz
-tar -zxvf llvm-b5cc222d-ubuntu-arm64.tar.gz
-export LLVM_BUILD_DIR=${YOUR_LLVM_DOWNLOAD_DIR}/llvm-b5cc222d-ubuntu-arm64
 # build and install
 cd ${YOUR_CODE_DIR}/flagtree/python
-export LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include
-export LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib
-export LLVM_SYSPATH=$LLVM_BUILD_DIR
 export FLAGTREE_BACKEND=ascend
 python3 -m pip install . --no-build-isolation -v
 ```
31 changes: 31 additions & 0 deletions dockerfiles/Dockerfile-ubuntu20.04-python3.9-ascend
@@ -0,0 +1,31 @@
+FROM swr.cn-south-1.myhuaweicloud.com/ascendhub/ascend-pytorch:24.0.0-A1-2.1.0-ubuntu20.04
+
+RUN apt-get update && \
+    apt-get install zip unzip git vim zstd libzstd-dev && \
+    apt-get install zlib1g zlib1g-dev libxml2 libxml2-dev && \
+    apt-get install clang lld
+
+RUN pip3 install -U pip && \
+    pip3 install numpy && \
+    pip3 install decorator && \
+    pip3 install sympy==1.4 && \
+    pip3 install cffi==1.12.3 && \
+    pip3 install pyyaml && \
+    pip3 install pathlib2 && \
+    pip3 install protobuf attrs attr && \
+    pip3 install scipy && \
+    pip3 install requests psutil absl-py && \
+    pip3 install ninja cmake wheel pybind11 && \
+    pip3 install setuptools==75.1.0 && \
+    pip3 install attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 && \
+    pip3 install pytest==8.3.2 pytest-xdist==3.6.1 pyyaml torch==2.3.1 torchvision==0.18.1 torch-npu==2.3.1.post2 && \
+    pip3 install scikit-build==0.18.1 scikit_build_core==0.11.1 && \
+    pip3 install pre-commit torch_npu==2.6.0rc1 && \
+    rm -rf /root/.cache/pip
+
+ENV LD_LIBRARY_PATH=/usr/lib/aarch64-linux-gnu/hdf5/serial:$LD_LIBRARY_PATH
+
+RUN if [ ! -d "/lib64" ]; \
+    then \
+        mkdir /lib64 && ln -sf /lib/ld-linux-aarch64.so.1 /lib64/ld-linux-aarch64.so.1; \
+    fi
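The README changes above point at this image as the recommended ascend build environment. A minimal sketch of building and entering it (the `flagtree-ascend:dev` tag is an arbitrary choice; flags for mapping the host's NPU driver and devices into the container are host-specific and omitted here):

```shell
# Build the image from the repo root, then open a shell in it.
docker build -f dockerfiles/Dockerfile-ubuntu20.04-python3.9-ascend -t flagtree-ascend:dev .
docker run -it --rm flagtree-ascend:dev bash
```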
15 changes: 7 additions & 8 deletions python/setup_helper.py
@@ -39,7 +39,6 @@ class FlagTreeBackend:
 }

 set_llvm_env = lambda path: set_env({
-    'LLVM_BUILD_DIR': path,
     'LLVM_INCLUDE_DIRS': Path(path) / "include",
     'LLVM_LIBRARY_DIR': Path(path) / "lib",
     'LLVM_SYSPATH': path,
@@ -388,7 +387,7 @@ def check_env(env_val):
file="iluvatar-llvm18-x86_64",
condition=("iluvatar" == flagtree_backend),
url="https://github.com/FlagTree/flagtree/releases/download/v0.1.0-build-deps/iluvatar-llvm18-x86_64.tar.gz",
pre_hock=lambda: check_env('LLVM_BUILD_DIR'),
pre_hock=lambda: check_env('LLVM_SYSPATH'),
post_hock=set_llvm_env,
)

@@ -397,7 +396,7 @@ def check_env(env_val):
file="XTDK-llvm18-ubuntu2004_x86_64",
condition=("xpu" == flagtree_backend),
url="https://github.com/FlagTree/flagtree/releases/download/v0.1.0-build-deps/XTDK-llvm18-ubuntu2004_x86_64.tar",
pre_hock=lambda: check_env('LLVM_BUILD_DIR'),
pre_hock=lambda: check_env('LLVM_SYSPATH'),
post_hock=set_llvm_env,
)

@@ -408,10 +407,10 @@ def check_env(env_val):
 cache.store(
     files=("clang", "xpu-xxd", "xpu3-crt.xpu", "xpu-kernel.t", "ld.lld", "llvm-readelf", "llvm-objdump",
            "llvm-objcopy"), condition=("xpu" == flagtree_backend),
-    copy_src_path=f"{os.environ.get('LLVM_BUILD_DIR','')}/bin", copy_dst_path="third_party/xpu/backend/xpu3/bin")
+    copy_src_path=f"{os.environ.get('LLVM_SYSPATH','')}/bin", copy_dst_path="third_party/xpu/backend/xpu3/bin")

 cache.store(files=("libclang_rt.builtins-xpu3.a", "libclang_rt.builtins-xpu3s.a"),
-            condition=("xpu" == flagtree_backend), copy_src_path=f"{os.environ.get('LLVM_BUILD_DIR','')}/lib/linux",
+            condition=("xpu" == flagtree_backend), copy_src_path=f"{os.environ.get('LLVM_SYSPATH','')}/lib/linux",
             copy_dst_path="third_party/xpu/backend/xpu3/lib/linux")

 cache.store(files=("include", "so"), condition=("xpu" == flagtree_backend),
@@ -423,15 +422,15 @@
condition=("mthreads" == flagtree_backend),
url=
"https://github.com/FlagTree/flagtree/releases/download/v0.1.0-build-deps/mthreads-llvm19-glibc2.34-glibcxx3.4.30-x64.tar.gz",
pre_hock=lambda: check_env('LLVM_BUILD_DIR'),
pre_hock=lambda: check_env('LLVM_SYSPATH'),
post_hock=set_llvm_env,
)

# ascend
cache.store(
file="ascend-llvm-b5cc222d-ubuntu-x64.tar.gz",
file="ascend-llvm-b5cc222d-ubuntu-arm64",
condition=("ascend" == flagtree_backend),
url="https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-b5cc222d-ubuntu-x64.tar.gz",
url="https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-b5cc222d-ubuntu-arm64.tar.gz",
pre_hock=lambda: check_env('LLVM_SYSPATH'),
post_hock=set_llvm_env,
)
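After this change, `LLVM_SYSPATH` is the single environment variable the build keys off: `set_llvm_env` derives the include and library paths from it, and `LLVM_BUILD_DIR` is intentionally no longer set. The manual equivalent of what `set_llvm_env(path)` exports, per the dictionary keys visible in the first hunk of this file:

```shell
# Manual equivalent of set_llvm_env(path); /path/to/llvm stands in for the
# extracted LLVM directory that setup_helper.py downloads.
export LLVM_SYSPATH=/path/to/llvm
export LLVM_INCLUDE_DIRS=$LLVM_SYSPATH/include
export LLVM_LIBRARY_DIR=$LLVM_SYSPATH/lib
```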
80 changes: 80 additions & 0 deletions third_party/ascend/python/tutorials/01-vector-add.py
@@ -0,0 +1,80 @@
"""
Vector Addition
===============

In this tutorial, you will write a simple vector addition using Triton.

In doing so, you will learn about:

* The basic programming model of Triton.

* The `triton.jit` decorator, which is used to define Triton kernels.

* The best practices for validating and benchmarking your custom ops against native reference implementations.

"""

# %%
# Compute Kernel
# --------------

import torch
import torch_npu

import triton
import triton.language as tl


@triton.jit
def add_kernel(x_ptr, # *Pointer* to first input vector.
y_ptr, # *Pointer* to second input vector.
output_ptr, # *Pointer* to output vector.
n_elements, # Size of the vector.
BLOCK_SIZE: tl.constexpr, # Number of elements each program should process.
# NOTE: `constexpr` so it can be used as a shape value.
):
# There are multiple 'programs' processing different data. We identify which program
# we are here:
pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0.
# This program will process inputs that are offset from the initial data.
# For instance, if you had a vector of length 256 and block_size of 64, the programs
# would each access the elements [0:64, 64:128, 128:192, 192:256].
# Note that offsets is a list of pointers:
block_start = pid * BLOCK_SIZE
offsets = block_start + tl.arange(0, BLOCK_SIZE)
# Create a mask to guard memory operations against out-of-bounds accesses.
mask = offsets < n_elements
# Load x and y from DRAM, masking out any extra elements in case the input is not a
# multiple of the block size.
x = tl.load(x_ptr + offsets, mask=mask)
y = tl.load(y_ptr + offsets, mask=mask)
output = x + y
# Write x + y back to DRAM.
tl.store(output_ptr + offsets, output, mask=mask)


# %%
# Let's also declare a helper function to (1) allocate the `z` tensor
# and (2) enqueue the above kernel with appropriate grid/block sizes:


def add(x: torch.Tensor, y: torch.Tensor):
output = torch.empty_like(x)
n_elements = output.numel()
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )
add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
return output


# %%
# We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:
torch.manual_seed(0)
size = 98432
x = torch.rand(size, device='npu')
y = torch.rand(size, device='npu')
output_torch = x + y
output_triton = add(x, y)
print(output_torch)
print(output_triton)
print(f'The maximum difference between torch and triton is '
f'{torch.max(torch.abs(output_torch - output_triton))}')
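To try the new tutorial on an Ascend machine after installing FlagTree, something like the following should work. Hedged: `~/env.sh` and the `python3.9` interpreter are taken from the Ascend CI workflow above and will differ per installation; note also that the workflow invokes the script under `third_party/ascend/test/tutorials/`, while this file is added under `third_party/ascend/python/tutorials/`.

```shell
# Run from the repo root; env.sh provides the CANN/driver environment,
# mirroring the Ascend CI job above.
source ~/env.sh
python3.9 third_party/ascend/python/tutorials/01-vector-add.py
```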