Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,354 changes: 1,354 additions & 0 deletions .github/workflows/nightly-test-amd-aiter-latest.yml

Large diffs are not rendered by default.

157 changes: 87 additions & 70 deletions .github/workflows/nightly-test-amd-rocm720.yml

Large diffs are not rendered by default.

95 changes: 59 additions & 36 deletions .github/workflows/nightly-test-amd.yml

Large diffs are not rendered by default.

76 changes: 44 additions & 32 deletions .github/workflows/pr-test-amd-rocm720.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@ on:
required: false
type: string
default: ""
aiter_ref:
description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
required: false
type: string
default: ''
continue_on_error:
description: 'Continue on error (do not fail the workflow on test failures)'
required: false
type: boolean
default: true
workflow_call:
inputs:
ref:
Expand All @@ -49,6 +59,19 @@ on:
required: false
type: boolean
default: false
aiter_ref:
description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
required: false
type: string
default: ''
continue_on_error:
description: 'Continue on error (do not fail the workflow on test failures)'
required: false
type: boolean
default: true

env:
AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }}

concurrency:
# Include pr_head_sha in group for /rerun-stage dispatches to avoid collisions with main branch runs
Expand Down Expand Up @@ -146,8 +169,7 @@ jobs:

- name: Install dependencies
run: |
bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build

bash scripts/ci/amd/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 14
run: |
Expand Down Expand Up @@ -193,8 +215,7 @@ jobs:

- name: Install dependencies
run: |
bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build

bash scripts/ci/amd/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 20
run: |
Expand Down Expand Up @@ -236,12 +257,11 @@ jobs:

- name: Install dependencies
run: |
bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build

bash scripts/ci/amd/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 10
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-amd --continue-on-error
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-amd ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

jit-kernel-unit-test-amd:
needs: [check-changes]
Expand Down Expand Up @@ -275,8 +295,7 @@ jobs:

- name: Install dependencies
run: |
bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build

bash scripts/ci/amd/amd_ci_install_dependency.sh
- name: Run JIT kernel unit tests
timeout-minutes: 10
run: |
Expand Down Expand Up @@ -315,12 +334,11 @@ jobs:
GITHUB_WORKSPACE: ${{ github.workspace }}

- name: Install dependencies
run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build

run: bash scripts/ci/amd/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 30
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 13 --timeout-per-file 1800 --continue-on-error
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 13 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

stage-b-test-small-1-gpu-amd-nondeterministic:
needs: [check-changes]
Expand Down Expand Up @@ -354,12 +372,11 @@ jobs:
GITHUB_WORKSPACE: ${{ github.workspace }}

- name: Install dependencies
run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build

run: bash scripts/ci/amd/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 30
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 --continue-on-error
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

stage-b-test-small-1-gpu-amd-mi35x:
needs: [check-changes]
Expand Down Expand Up @@ -393,12 +410,11 @@ jobs:
GITHUB_WORKSPACE: ${{ github.workspace }}

- name: Install dependencies
run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build

run: bash scripts/ci/amd/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 30
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-mi35x --continue-on-error
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-mi35x ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

stage-b-test-large-1-gpu-amd:
needs: [check-changes]
Expand Down Expand Up @@ -433,12 +449,11 @@ jobs:
GITHUB_WORKSPACE: ${{ github.workspace }}

- name: Install dependencies
run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build

run: bash scripts/ci/amd/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 30
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 --continue-on-error
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

stage-b-test-large-2-gpu-amd:
needs: [check-changes]
Expand Down Expand Up @@ -473,12 +488,11 @@ jobs:
GITHUB_WORKSPACE: ${{ github.workspace }}

- name: Install dependencies
run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build

run: bash scripts/ci/amd/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 30
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 --continue-on-error
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

multimodal-gen-test-1-gpu-amd:
needs: [check-changes]
Expand Down Expand Up @@ -523,7 +537,7 @@ jobs:

- name: Install dependencies
run: |
bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build diffusion
bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion
docker exec ci_sglang pip install amdsmi

- name: Setup kernel caches
Expand Down Expand Up @@ -653,7 +667,7 @@ jobs:

- name: Install dependencies
run: |
bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build diffusion
bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion
docker exec ci_sglang pip install amdsmi

- name: Setup kernel caches
Expand Down Expand Up @@ -774,8 +788,7 @@ jobs:
GITHUB_WORKSPACE: ${{ github.workspace }}

- name: Install dependencies
run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build

run: bash scripts/ci/amd/amd_ci_install_dependency.sh
- name: Test RCCL multi-GPU communication
timeout-minutes: 5
run: |
Expand All @@ -785,7 +798,7 @@ jobs:
- name: Run test
timeout-minutes: 60
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600 --continue-on-error
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

stage-c-test-large-8-gpu-amd-mi35x:
needs: [check-changes]
Expand Down Expand Up @@ -820,12 +833,11 @@ jobs:
GITHUB_WORKSPACE: ${{ github.workspace }}

- name: Install dependencies
run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build

run: bash scripts/ci/amd/amd_ci_install_dependency.sh
- name: Run test
timeout-minutes: 60
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --timeout-per-file 3600 --continue-on-error
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

pr-test-amd-finish:
needs:
Expand Down
41 changes: 32 additions & 9 deletions .github/workflows/pr-test-amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,16 @@ on:
required: false
type: string
default: ""
aiter_ref:
description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
required: false
type: string
default: ''
continue_on_error:
description: 'Continue on error (do not fail the workflow on test failures)'
required: false
type: boolean
default: false
workflow_call:
inputs:
ref:
Expand All @@ -46,6 +56,19 @@ on:
required: false
type: boolean
default: false
aiter_ref:
description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
required: false
type: string
default: ''
continue_on_error:
description: 'Continue on error (do not fail the workflow on test failures)'
required: false
type: boolean
default: false

env:
AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }}

concurrency:
# Include pr_head_sha in group for /rerun-stage dispatches to avoid collisions with main branch runs
Expand Down Expand Up @@ -238,7 +261,7 @@ jobs:
- name: Run test
timeout-minutes: 10
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-amd
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-amd ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

jit-kernel-unit-test-amd:
needs: [check-changes]
Expand Down Expand Up @@ -317,7 +340,7 @@ jobs:
- name: Run test
timeout-minutes: 30
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 1800
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

stage-b-test-small-1-gpu-amd-nondeterministic:
needs: [check-changes, stage-a-test-1-amd]
Expand Down Expand Up @@ -356,7 +379,7 @@ jobs:
- name: Run test
timeout-minutes: 30
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 --continue-on-error
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

stage-b-test-small-1-gpu-amd-mi35x:
needs: [check-changes, stage-a-test-1-amd]
Expand Down Expand Up @@ -395,7 +418,7 @@ jobs:
- name: Run test
timeout-minutes: 30
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-mi35x
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-mi35x ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

stage-b-test-large-1-gpu-amd:
needs: [check-changes, stage-a-test-1-amd]
Expand Down Expand Up @@ -435,7 +458,7 @@ jobs:
- name: Run test
timeout-minutes: 30
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

stage-b-test-large-2-gpu-amd:
needs: [check-changes, stage-a-test-1-amd]
Expand Down Expand Up @@ -475,7 +498,7 @@ jobs:
- name: Run test
timeout-minutes: 30
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

multimodal-gen-test-1-gpu-amd:
needs: [check-changes]
Expand Down Expand Up @@ -762,7 +785,7 @@ jobs:
- name: Run test
timeout-minutes: 60
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

stage-c-test-large-8-gpu-amd-mi35x:
needs: [check-changes, call-gate, stage-b-test-small-1-gpu-amd, stage-b-test-large-2-gpu-amd]
Expand Down Expand Up @@ -802,7 +825,7 @@ jobs:
- name: Run test
timeout-minutes: 60
run: |
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --timeout-per-file 3600
bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

# =============================================== Disaggregation ====================================================
stage-b-test-large-8-gpu-35x-disaggregation-amd:
Expand Down Expand Up @@ -914,7 +937,7 @@ jobs:
run: |
bash scripts/ci/amd/amd_ci_exec.sh \
-e SGLANG_TEST_RDMA_DEVICE="${{ env.SGLANG_TEST_RDMA_DEVICE }}" \
-w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-8-gpu-35x-disaggregation-amd --timeout-per-file 1800
-w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-8-gpu-35x-disaggregation-amd --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

pr-test-amd-finish:
needs:
Expand Down
Loading
Loading