Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
2546d33
CI: try downloading pre-built Triton wheel before building from source
mengfei-jiang Apr 16, 2026
8c7d89e
CI: add rocm version to pre-built Triton wheel filename
mengfei-jiang Apr 16, 2026
c842dfe
CI: URL-encode '+' as '%2B' in Triton wheel download URL
mengfei-jiang Apr 16, 2026
81cf12f
CI: fix Triton wheel download URL path
mengfei-jiang Apr 16, 2026
cb5f77d
Use requirements-triton.txt to install triton from AMD PyPI
mengfei-jiang Apr 23, 2026
24c29e3
add torch compile test for triton
mengfei-jiang Apr 27, 2026
a17bc2b
Fix black formatting in torch_compile tests
mengfei-jiang Apr 28, 2026
33af62c
Fix fused_mul_add bf16 tolerance and add fullgraph=True
mengfei-jiang Apr 28, 2026
439e166
Fix black formatting for fullgraph=True line wrap
mengfei-jiang Apr 28, 2026
5b3666a
Use fp16/bf16 dtypes, add dynamo.reset, relax bf16 tolerance
mengfei-jiang Apr 28, 2026
c455ab6
Replace manual triton install with amd-triton from PyPI
mengfei-jiang Apr 29, 2026
fbd23e6
Remove PIP_EXTRA_INDEX_URL for AMD PyPI in CI script
mengfei-jiang Apr 29, 2026
318d563
Wrap amd-triton uninstall in try/except to avoid setup failure
mengfei-jiang Apr 29, 2026
f114343
Revert to requirements-triton.txt approach with amd-triton from PyPI
mengfei-jiang Apr 29, 2026
365a46b
Auto-detect ROCm version and install amd-triton from AMD PyPI
mengfei-jiang Apr 29, 2026
6033274
Add amd-triton install to CI workflows and fix PEP 517 build failure
mengfei-jiang Apr 29, 2026
d7fee4b
Extract triton install logic into shared install_triton.sh script
mengfei-jiang Apr 29, 2026
2301ed0
Use install_triton.sh in setup.py instead of inline logic
mengfei-jiang Apr 29, 2026
6bdadaa
Deduplicate _get_compiled helper, add torch_compile FILE_TIMES, and i…
mengfei-jiang Apr 29, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/scripts/build_aiter_triton.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ pip install --upgrade "pybind11>=3.0.1"
pip install --upgrade "ninja>=1.11.1"
pip install tabulate
pip install -e .
./.github/scripts/install_triton.sh

# Read BUILD_TRITON env var, default to 1. If 1, install Triton; if 0, skip installation.
BUILD_TRITON=${BUILD_TRITON:-1}
Expand Down
14 changes: 14 additions & 0 deletions .github/scripts/install_triton.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# Install the amd-triton wheel from AMD's PyPI index, selecting the index
# that matches the ROCm version installed on this machine.
set -e

# Remove every Triton distribution that could shadow or conflict with
# amd-triton; '|| true' keeps 'set -e' from aborting when none are installed.
pip uninstall -y triton pytorch-triton pytorch-triton-rocm triton-rocm amd-triton || true

# Fallback index used when the ROCm version cannot be detected below.
TRITON_INDEX_URL="https://pypi.amd.com/triton/rocm-7.0.0/simple/"
# Detect the installed ROCm version from the rocm-core Debian package
# (awk keeps the version column of the 'ii' installed-state row).
# NOTE(review): assumes a Debian-based image with dpkg — confirm for all CI images.
ROCM_VERSION=$(dpkg -l rocm-core 2>/dev/null | awk '/^ii/{print $3}')
if [[ -n "$ROCM_VERSION" ]]; then
# Keep only major.minor; the AMD index paths always end in a ".0" patch level.
ROCM_MAJOR_MINOR=$(echo "$ROCM_VERSION" | cut -d. -f1,2)
TRITON_INDEX_URL="https://pypi.amd.com/triton/rocm-${ROCM_MAJOR_MINOR}.0/simple/"
fi

echo "Installing amd-triton from $TRITON_INDEX_URL"
# --extra-index-url keeps the default PyPI available for dependencies.
pip install --extra-index-url "$TRITON_INDEX_URL" amd-triton
10 changes: 10 additions & 0 deletions .github/scripts/split_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,16 @@ elif [[ "$TEST_TYPE" == "triton" ]]; then
FILE_TIMES[op_tests/triton_tests/gemm/basic/test_gemm_a8w8_per_token_scale.py]=17
FILE_TIMES[op_tests/triton_tests/quant/test_fused_fp8_quant.py]=17
FILE_TIMES[op_tests/triton_tests/test_gather_kv_b_proj.py]=16
FILE_TIMES[op_tests/triton_tests/torch_compile/test_compile_gemm_a16w16.py]=19
FILE_TIMES[op_tests/triton_tests/torch_compile/test_compile_activation.py]=11
FILE_TIMES[op_tests/triton_tests/torch_compile/test_compile_moe_routing.py]=10
FILE_TIMES[op_tests/triton_tests/torch_compile/test_compile_rope.py]=9
FILE_TIMES[op_tests/triton_tests/torch_compile/test_compile_softmax.py]=8
FILE_TIMES[op_tests/triton_tests/torch_compile/test_compile_fused_mul_add.py]=7
FILE_TIMES[op_tests/triton_tests/torch_compile/test_compile_quant_per_tensor.py]=7
FILE_TIMES[op_tests/triton_tests/torch_compile/test_compile_quant_per_token.py]=7
FILE_TIMES[op_tests/triton_tests/torch_compile/test_compile_rmsnorm.py]=7
FILE_TIMES[op_tests/triton_tests/torch_compile/test_compile_topk.py]=5
FILE_TIMES[op_tests/triton_tests/attention/test_extend_attention.py]=7
FILE_TIMES[op_tests/triton_tests/fusions/test_fused_qk_concat.py]=7
FILE_TIMES[op_tests/triton_tests/gemm/basic/test_gemm_a8w8_blockscale.py]=7
Expand Down
10 changes: 10 additions & 0 deletions .github/workflows/aiter-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,11 @@ jobs:
pip show amd-aiter
"

- name: Install amd-triton
run: |
docker exec -w /workspace aiter_test \
bash -c "./.github/scripts/install_triton.sh && pip show amd-triton"

- name: Show Aiter version
run: |
set -ex
Expand Down Expand Up @@ -564,6 +569,11 @@ jobs:
pip show amd-aiter
"

- name: Install amd-triton
run: |
docker exec -w /workspace aiter_test \
bash -c "./.github/scripts/install_triton.sh && pip show amd-triton"

- name: Show Aiter version
run: |
set -ex
Expand Down
4 changes: 3 additions & 1 deletion .github/workflows/atom-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,9 @@ jobs:
cd /app/aiter-test && \\
git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\
git submodule sync && git submodule update --init --recursive && \\
MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 pip install -e .
MAX_JOBS=64 PREBUILD_KERNELS=0 GPU_ARCHS=gfx950 pip install -e . && \\
./.github/scripts/install_triton.sh
RUN echo "=== amd-triton version ===" && pip show amd-triton || true
RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true
EOF

Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/sglang_downstream.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@ jobs:
git submodule sync --recursive
git submodule update --init --recursive
pip install -e .
./.github/scripts/install_triton.sh
pip show amd-triton || true
pip show amd-aiter || pip show aiter
"

Expand Down
90 changes: 4 additions & 86 deletions .github/workflows/triton-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,82 +56,12 @@ jobs:
path: triton_shard_*.list
retention-days: 7

# Build Triton wheel once, shared by all shard jobs via artifact
build-triton:
if: ${{ (!github.event.pull_request || github.event.pull_request.draft == false) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') }}
name: Build Triton Wheel
runs-on: linux-aiter-mi35x-1
needs: [check-signal]
env:
DOCKER_IMAGE: "rocm/pytorch:latest"
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 1

- name: Docker login
if: ${{ github.event_name != 'pull_request' || !github.event.pull_request.head.repo.fork }}
env:
DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
run: |
for attempt in 1 2 3; do
if echo "$DOCKER_PASSWORD" | docker login -u rocmshared --password-stdin; then
echo "Docker login succeeded on attempt ${attempt}"
exit 0
fi
echo "Docker login attempt ${attempt} failed"
if [ "${attempt}" != 3 ]; then
sleep 10
fi
done
echo "Docker login failed after 3 attempts, continuing anyway"
exit 0

- name: Build Triton wheel in Docker
run: |
set -ex
mkdir -p triton-wheels

if [ -f "/etc/podinfo/gha-render-devices" ]; then
DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
else
DEVICE_FLAG="--device /dev/dri"
fi

docker run --rm \
--device=/dev/kfd $DEVICE_FLAG \
--shm-size=16G \
--group-add $(getent group render | cut -d: -f3) \
--group-add $(getent group video | cut -d: -f3) \
-v "${{ github.workspace }}:/workspace" \
-w /workspace \
${{ env.DOCKER_IMAGE }} \
bash -c '
set -ex
pip config set global.default-timeout 60
pip config set global.retries 10
TRITON_COMMIT=${TRITON_COMMIT:-756afc06}
git clone https://github.com/triton-lang/triton
cd triton
git checkout "$TRITON_COMMIT"
pip install -r python/requirements.txt
MAX_JOBS=64 pip wheel --no-deps -w /workspace/triton-wheels .
'

- name: Upload Triton wheel
uses: actions/upload-artifact@v4
with:
name: triton-wheel
path: triton-wheels/*.whl
retention-days: 7

# Step 2: MI35X matrix jobs
triton:
if: ${{ (!github.event.pull_request || github.event.pull_request.draft == false) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') }}
name: Triton Tests (MI35X) / Shard ${{ matrix.shard }}
runs-on: linux-aiter-mi35x-1
needs: [split_triton_tests, build-triton, check-signal]
needs: [split_triton_tests, check-signal]
strategy:
fail-fast: false
matrix:
Expand All @@ -152,12 +82,6 @@ jobs:
with:
name: triton_shards

- name: Download Triton wheel
uses: actions/download-artifact@v4
with:
name: triton-wheel
path: triton-wheels

- name: List test shard files
run: |
ls -l triton_shard_*.list
Expand Down Expand Up @@ -218,7 +142,7 @@ jobs:
set -ex
echo "Setting up Aiter and Triton..."
docker exec \
-e TRITON_WHEEL_DIR=/workspace/triton-wheels \
-e BUILD_TRITON=0 \
-w /workspace \
triton_test \
./.github/scripts/build_aiter_triton.sh
Expand Down Expand Up @@ -279,7 +203,7 @@ jobs:
if: ${{ (!github.event.pull_request || github.event.pull_request.draft == false) && (github.event_name != 'pull_request' || github.event.action != 'labeled' || github.event.label.name == 'ci:triton-300x') && (github.ref == 'refs/heads/main' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'ci:triton-300x'))) }}
name: Triton Tests (MI300X) / Shard ${{ matrix.shard }}
runs-on: linux-aiter-mi300x-1
needs: [split_triton_tests, build-triton, check-signal]
needs: [split_triton_tests, check-signal]
strategy:
fail-fast: false
matrix:
Expand All @@ -300,12 +224,6 @@ jobs:
with:
name: triton_shards

- name: Download Triton wheel
uses: actions/download-artifact@v4
with:
name: triton-wheel
path: triton-wheels

- name: List test shard files
run: |
ls -l triton_shard_*.list
Expand Down Expand Up @@ -366,7 +284,7 @@ jobs:
set -ex
echo "Setting up Aiter and Triton..."
docker exec \
-e TRITON_WHEEL_DIR=/workspace/triton-wheels \
-e BUILD_TRITON=0 \
-w /workspace \
triton_test \
./.github/scripts/build_aiter_triton.sh
Expand Down
4 changes: 3 additions & 1 deletion .github/workflows/vllm_benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,9 @@ jobs:
cd /aiter && \\
git checkout ${{ env.GITHUB_COMMIT_SHA }} && \\
git submodule sync && git submodule update --init --recursive && \\
pip install -e .
pip install -e . && \\
./.github/scripts/install_triton.sh
RUN echo "=== amd-triton version ===" && pip show amd-triton || true

RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true

Expand Down
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,16 @@ Or install all optional dependencies at once:
pip install -r requirements.txt
```

### Triton

AITER includes Triton-based operators that require amd-triton ([ROCm 7.0](https://pypi.amd.com/triton/rocm-7.0.0/simple/), [ROCm 7.1](https://pypi.amd.com/triton/rocm-7.1.0/simple/), [ROCm 7.2](https://pypi.amd.com/triton/rocm-7.2.0/simple/)), with the correct version selected based on your ROCm installation.

If you install with `python3 setup.py develop`, amd-triton is installed automatically. If you use `pip install -e .`, run the install script manually:

```bash
./.github/scripts/install_triton.sh
```

### Opus — Lightweight C++ Template for Kernel Development

[Opus](csrc/include/opus/) is a single-header C++ template library (`opus.hpp`) for writing HIP kernels on AMD GPUs — vectorized load/store, layout abstractions, and MFMA wrappers with a strong focus on **build time optimization** (up to 61x faster than standard torch extension builds). See the [Opus README](csrc/include/opus/README.md) and [`op_tests/opus/`](op_tests/opus/) for details.
Expand Down
10 changes: 10 additions & 0 deletions op_tests/triton_tests/torch_compile/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

import torch


def _get_compiled(fn):
return torch.compile(
fn, backend="inductor", fullgraph=True, options={"max_autotune": True}
)
47 changes: 47 additions & 0 deletions op_tests/triton_tests/torch_compile/test_compile_activation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

import pytest
import torch
import torch.nn.functional as F

from . import _get_compiled


def torch_silu_mul(x):
    """Eager reference: SiLU(first half) * second half along the last dim."""
    split = x.shape[-1] // 2
    gate = x[..., :split]
    up = x[..., split:]
    return F.silu(gate) * up


def torch_gelu_mul(x):
    """Eager reference: GELU(first half) * second half along the last dim."""
    split = x.shape[-1] // 2
    gate = x[..., :split]
    up = x[..., split:]
    return F.gelu(gate) * up


@pytest.mark.parametrize("activation", ["silu", "gelu"])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("M, N", [(64, 256), (128, 512), (256, 1024)])
def test_compile_activation(M, N, dtype, activation):
    """torch.compile output must match eager for silu_mul / gelu_mul."""
    torch.manual_seed(42)
    torch.cuda.empty_cache()
    # Start each parametrized case from a clean Dynamo compile cache.
    torch._dynamo.reset()
    x = torch.randn(M, N, device="cuda", dtype=dtype)

    reference_fns = {"silu": torch_silu_mul, "gelu": torch_gelu_mul}
    act_fn = reference_fns[activation]
    out_eager = act_fn(x)

    out_compiled = _get_compiled(act_fn)(x)
    torch.cuda.synchronize()

    assert not torch.isnan(out_compiled).any(), "torch.compile produced NaN"
    # bf16 has far fewer mantissa bits, so it gets a much looser tolerance.
    if dtype == torch.bfloat16:
        atol, rtol = 0.1, 0.1
    else:
        atol, rtol = 1e-3, 1e-3
    torch.testing.assert_close(out_compiled, out_eager, atol=atol, rtol=rtol)


if __name__ == "__main__":
    # Allow running this file directly: python test_compile_activation.py
    pytest.main([__file__, "-v"])
44 changes: 44 additions & 0 deletions op_tests/triton_tests/torch_compile/test_compile_fused_mul_add.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

import pytest
import torch

from . import _get_compiled


def torch_fused_mul_add(x, a, b):
    """Eager reference for fused multiply-add: ``x * a + b``."""
    product = x * a
    return product + b


@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("M, N", [(128, 256), (256, 512), (512, 1024)])
@pytest.mark.parametrize("scalar_ab", [False, True])
def test_compile_fused_mul_add(M, N, dtype, scalar_ab):
    """torch.compile output must match eager for x * a + b.

    Covers both Python-scalar and tensor multiplier/addend operands.
    """
    torch.manual_seed(42)
    torch.cuda.empty_cache()
    # Start each parametrized case from a clean Dynamo compile cache.
    torch._dynamo.reset()
    x = torch.randn(M, N, device="cuda", dtype=dtype)

    if scalar_ab:
        a = 2.0
        b = 0.5
    else:
        a = torch.randn(M, N, device="cuda", dtype=dtype)
        b = torch.randn(M, N, device="cuda", dtype=dtype)

    out_eager = torch_fused_mul_add(x, a, b)

    def wrapped(t, mul, add):
        return torch_fused_mul_add(t, mul, add)

    out_compiled = _get_compiled(wrapped)(x, a, b)
    torch.cuda.synchronize()

    assert not torch.isnan(out_compiled).any(), "torch.compile produced NaN"
    # bf16 has far fewer mantissa bits, so it gets a much looser tolerance.
    if dtype == torch.bfloat16:
        atol, rtol = 0.1, 0.1
    else:
        atol, rtol = 1e-3, 1e-3
    torch.testing.assert_close(out_compiled, out_eager, atol=atol, rtol=rtol)


if __name__ == "__main__":
    # Allow running this file directly: python test_compile_fused_mul_add.py
    pytest.main([__file__, "-v"])
Loading
Loading