Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
433f2ef
add logic for event driven runs
cquil11 Dec 10, 2025
dd4682b
testing pt 1
cquil11 Dec 10, 2025
7d6e052
raise error if yaml diff in perf changelog is not valid
cquil11 Dec 11, 2025
ce49098
remove unused imports in process_changelog.py
cquil11 Dec 11, 2025
e6f6fe9
config data key fix
cquil11 Dec 11, 2025
b87eedd
raise error if test-config subprocess fails to run
cquil11 Dec 11, 2025
ba0b115
backfill changelog
cquil11 Dec 11, 2025
747bc2d
backfill changelog pt 2
cquil11 Dec 11, 2025
ca24b8e
backfill changelog pt 3
cquil11 Dec 11, 2025
954ebd6
backfill changelog pt 4
cquil11 Dec 11, 2025
ee346b3
backfill changelog pt 5
cquil11 Dec 11, 2025
ab6f948
backfill changelog pt 6
cquil11 Dec 11, 2025
27074d2
add always() condition to upload changelog metadata
cquil11 Dec 12, 2025
763b394
backfill changelog pt 7 (test)
cquil11 Dec 12, 2025
d0b2de7
backfill changelog pt 8 (revert test)
cquil11 Dec 12, 2025
41341ad
backfill changelog pt 9
cquil11 Dec 12, 2025
f131962
backfill changelog pt 11
cquil11 Dec 12, 2025
dfeba21
change if condition for jobs in run sweep workflow
cquil11 Dec 12, 2025
fd07f40
debugging run sweep workflow
cquil11 Dec 12, 2025
228e0a2
debugging run sweep workflow pt 2
cquil11 Dec 12, 2025
cb2cc8a
debugging run sweep workflow pt 3 (revert)
cquil11 Dec 12, 2025
055b324
debugging run sweep workflow pt 4
cquil11 Dec 12, 2025
ae65551
debugging run sweep workflow pt 5
cquil11 Dec 12, 2025
667d2e1
debugging run sweep workflow pt 6
cquil11 Dec 12, 2025
ef3ba6b
debugging run sweep workflow pt 7
cquil11 Dec 12, 2025
fae8278
add always() condition to upload changelog metadata (add back, this g…
cquil11 Dec 12, 2025
2018ad3
add bmk prefix to results
cquil11 Dec 15, 2025
5e0c779
backfill changelog official
cquil11 Dec 15, 2025
8d8ffa1
for concurrency group, use more unique sha
cquil11 Dec 15, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .github/workflows/full-sweep-1k1k-scheduler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ name: "Full Sweep Scheduler - 1k1k"

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"

jobs:
get-dsr1-configs:
Expand Down
2 changes: 0 additions & 2 deletions .github/workflows/full-sweep-1k8k-scheduler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ name: "Full Sweep Scheduler - 1k8k"

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"

jobs:
get-dsr1-configs:
Expand Down
2 changes: 0 additions & 2 deletions .github/workflows/full-sweep-8k1k-scheduler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ name: "Full Sweep Scheduler - 8k1k"

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"

jobs:
get-dsr1-configs:
Expand Down
233 changes: 233 additions & 0 deletions .github/workflows/run-sweep.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
name: "Run Sweep"
run-name: Run Sweep - ${{ github.event.pull_request.title || github.ref_name }}

# Pull requests share one group per PR number, so a newer commit on the same PR
# cancels the sweep it supersedes. Push events carry no PR number and fall back
# to the commit SHA: keying them on github.ref would place every push to main
# in a single group and let a later push cancel an earlier commit's sweep.
concurrency:
  group: sweep-${{ github.event.pull_request.number || github.sha }}
  cancel-in-progress: true

# Event-driven triggers for the sweep.
on:
  # Pushes to main that touch the performance changelog.
  push:
    branches:
      - main
    paths:
      - "perf-changelog.yaml"
  # Pull requests targeting main, on the activity types listed below.
  pull_request:
    branches:
      - main
    types:
      - ready_for_review
      - synchronize
      - labeled
Comment thread
cquil11 marked this conversation as resolved.
# Path filter: only changes that touch perf-changelog.yaml match this trigger.
paths:
- "perf-changelog.yaml"

jobs:
setup:
  runs-on: ubuntu-latest
  # Runs for non-draft pull requests that carry the 'sweep-enabled' label, or
  # for any non-pull_request event whose head commit message does not contain
  # '[skip-sweep]'.
  if: >-
    (github.event_name == 'pull_request' && !github.event.pull_request.draft && contains(github.event.pull_request.labels.*.name, 'sweep-enabled')) ||
    (github.event_name != 'pull_request' && !contains(github.event.head_commit.message, '[skip-sweep]'))
  outputs:
    # Whatever utils/process_changelog.py printed; downstream jobs parse it
    # with fromJson().
    search-space-config: ${{ steps.setup.outputs.search-space-config }}
  steps:
    - name: Checkout code
      uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3  # v6.0.0
      with:
        # Full history; presumably needed so both refs handed to
        # process_changelog.py can be resolved - TODO confirm.
        fetch-depth: 0

    # Picks the base/head refs for the triggering event (PR: base branch vs.
    # PR head SHA; otherwise: before/after SHAs of the push), runs
    # process_changelog.py on them and publishes its stdout as the
    # `search-space-config` step output.
    - id: setup
      run: |
        pip install pydantic

        if [ "${{ github.event_name }}" == "pull_request" ]; then
          BASE_REF="origin/${{ github.base_ref }}"
          HEAD_REF="${{ github.event.pull_request.head.sha }}"
        else
          BASE_REF="${{ github.event.before }}"
          HEAD_REF="${{ github.event.after }}"
        fi

        CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/process_changelog.py \
          --changelog-file ${GITHUB_WORKSPACE}/perf-changelog.yaml \
          --base-ref "$BASE_REF" \
          --head-ref "$HEAD_REF")

        echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT

sweep-multi-node-1k1k:
  needs: setup
  # The setup output is a JSON *string*: it has to go through fromJson() before
  # .multi_node can be read. Indexing the raw string never produces '[]', which
  # left this gate permanently open. Skip the job when the list is empty or
  # absent, because a matrix built from an empty list makes the job fail.
  if: >-
    ${{
    toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k']) != '[]' &&
    toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k']) != 'null'
    }}
Comment thread
chunfangamd marked this conversation as resolved.
Outdated
# Calls the multi-node benchmark template workflow stored in this repository.
uses: ./.github/workflows/benchmark-multinode-tmpl.yml
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P0 Badge Parse sweep outputs before gating jobs

The sweep jobs gate on needs.setup.outputs.search-space-config.multi_node[...], but a job output is always a string, so it must be parsed with fromJson before its properties can be read. (The hyphens in the output name are not the problem; they are valid in dot syntax.) As written, the property lookup on the raw string evaluates to null, so the comparison with '[]' is always true and the gate never skips anything: whenever run-sweep.yml is triggered, every sweep job starts, and any job whose config list is empty then fails on its empty matrix.

Useful? React with 👍 / 👎.

name: multi-node 1k1k /
strategy:
  # Keep the remaining matrix entries running when one of them fails.
  fail-fast: false
  matrix:
    # One matrix entry per element of the parsed multi_node['1k1k'] list.
    config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k'] }}
secrets: inherit
# Anchored as `multi-node-inputs` so other jobs can alias the same mapping.
with: &multi-node-inputs
  # Top-level fields of the matrix entry.
  isl: ${{ matrix.config.isl }}
  osl: ${{ matrix.config.osl }}
  max-model-len: ${{ matrix.config.max-model-len }}
  runner: ${{ matrix.config.runner }}
  image: ${{ matrix.config.image }}
  model: ${{ matrix.config.model }}
  model-prefix: ${{ matrix.config.model-prefix }}
  framework: ${{ matrix.config.framework }}
  precision: ${{ matrix.config.precision }}
  exp-name: ${{ matrix.config.exp-name }}
  # `conc` is re-serialised to a JSON string before being passed on.
  conc-list: ${{ toJson(matrix.config.conc) }}
  spec-decoding: ${{ matrix.config.spec-decoding }}
  disagg: ${{ matrix.config.disagg }}

  # Fields taken from the entry's `prefill` sub-object.
  prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
  prefill-tp: ${{ matrix.config.prefill.tp }}
  prefill-ep: ${{ matrix.config.prefill.ep }}
  prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
  prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}

  # Fields taken from the entry's `decode` sub-object.
  decode-num-worker: ${{ matrix.config.decode.num-worker }}
  decode-tp: ${{ matrix.config.decode.tp }}
  decode-ep: ${{ matrix.config.decode.ep }}
  decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
  decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}

sweep-multi-node-1k8k:
  needs: setup
  # The setup output is a JSON *string*: it has to go through fromJson() before
  # .multi_node can be read. Indexing the raw string never produces '[]', which
  # left this gate permanently open. Skip the job when the list is empty or
  # absent, because a matrix built from an empty list makes the job fail.
  if: >-
    ${{
    toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k8k']) != '[]' &&
    toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k8k']) != 'null'
    }}
  uses: ./.github/workflows/benchmark-multinode-tmpl.yml
  name: multi-node 1k8k /
  strategy:
    # Keep the remaining matrix entries running when one of them fails.
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['1k8k'] }}
  secrets: inherit
  # Same input mapping as the job that defines the `multi-node-inputs` anchor.
  with: *multi-node-inputs

sweep-multi-node-8k1k:
  needs: setup
  # The setup output is a JSON *string*: it has to go through fromJson() before
  # .multi_node can be read. Indexing the raw string never produces '[]', which
  # left this gate permanently open. Skip the job when the list is empty or
  # absent, because a matrix built from an empty list makes the job fail.
  if: >-
    ${{
    toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != '[]' &&
    toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != 'null'
    }}
  uses: ./.github/workflows/benchmark-multinode-tmpl.yml
  name: multi-node 8k1k /
  strategy:
    # Keep the remaining matrix entries running when one of them fails.
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k'] }}
  secrets: inherit
  # Same input mapping as the job that defines the `multi-node-inputs` anchor.
  with: *multi-node-inputs

sweep-single-node-1k1k:
  needs: setup
  # The setup output is a JSON *string*: it has to go through fromJson() before
  # .single_node can be read. Indexing the raw string never produces '[]',
  # which left this gate permanently open. Skip the job when the list is empty
  # or absent, because a matrix built from an empty list makes the job fail.
  if: >-
    ${{
    toJson(fromJson(needs.setup.outputs.search-space-config).single_node['1k1k']) != '[]' &&
    toJson(fromJson(needs.setup.outputs.search-space-config).single_node['1k1k']) != 'null'
    }}
  uses: ./.github/workflows/benchmark-tmpl.yml
  name: single-node 1k1k /
  strategy:
    # Keep the remaining matrix entries running when one of them fails.
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['1k1k'] }}
  secrets: inherit
  # Anchored as `single-node-inputs` so other jobs can alias the same mapping.
  with: &single-node-inputs
    exp-name: ${{ matrix.config.exp-name }}
    isl: ${{ matrix.config.isl }}
    osl: ${{ matrix.config.osl }}
    max-model-len: ${{ matrix.config.max-model-len }}
    runner: ${{ matrix.config.runner }}
    image: ${{ matrix.config.image }}
    model: ${{ matrix.config.model }}
    model-prefix: ${{ matrix.config.model-prefix }}
    framework: ${{ matrix.config.framework }}
    precision: ${{ matrix.config.precision }}
    tp: ${{ matrix.config.tp }}
    ep: ${{ matrix.config.ep }}
    dp-attn: ${{ matrix.config.dp-attn }}
    conc: ${{ matrix.config.conc }}
    spec-decoding: ${{ matrix.config.spec-decoding }}
    disagg: ${{ matrix.config.disagg }}

sweep-single-node-1k8k:
  needs: setup
  # The setup output is a JSON *string*: it has to go through fromJson() before
  # .single_node can be read. Indexing the raw string never produces '[]',
  # which left this gate permanently open. Skip the job when the list is empty
  # or absent, because a matrix built from an empty list makes the job fail.
  if: >-
    ${{
    toJson(fromJson(needs.setup.outputs.search-space-config).single_node['1k8k']) != '[]' &&
    toJson(fromJson(needs.setup.outputs.search-space-config).single_node['1k8k']) != 'null'
    }}
  uses: ./.github/workflows/benchmark-tmpl.yml
  name: single-node 1k8k /
  strategy:
    # Keep the remaining matrix entries running when one of them fails.
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['1k8k'] }}
  secrets: inherit
  # Same input mapping as the job that defines the `single-node-inputs` anchor.
  with: *single-node-inputs

sweep-single-node-8k1k:
  needs: setup
  # The setup output is a JSON *string*: it has to go through fromJson() before
  # .single_node can be read. Indexing the raw string never produces '[]',
  # which left this gate permanently open. Skip the job when the list is empty
  # or absent, because a matrix built from an empty list makes the job fail.
  if: >-
    ${{
    toJson(fromJson(needs.setup.outputs.search-space-config).single_node['8k1k']) != '[]' &&
    toJson(fromJson(needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null'
    }}
  uses: ./.github/workflows/benchmark-tmpl.yml
  name: single-node 8k1k /
  strategy:
    # Keep the remaining matrix entries running when one of them fails.
    fail-fast: false
    matrix:
      config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['8k1k'] }}
  secrets: inherit
  # Same input mapping as the job that defines the `single-node-inputs` anchor.
  with: *single-node-inputs

collect-results:
  # Waits for every sweep job as well as setup.
  needs:
    - sweep-single-node-1k1k
    - sweep-single-node-1k8k
    - sweep-single-node-8k1k
    - sweep-multi-node-1k1k
    - sweep-multi-node-1k8k
    - sweep-multi-node-8k1k
    - setup
  # always() lets this job run whatever the outcome of the sweep jobs; it is
  # only suppressed when setup itself was skipped.
  if: ${{ always() && needs.setup.result != 'skipped' }}
  uses: ./.github/workflows/collect-results.yml
  secrets: inherit

upload-changelog-metadata:
  needs: [setup, collect-results]
  # always() is required here: without a status-check function GitHub applies
  # an implicit success(), which skips this job as soon as any job upstream of
  # collect-results failed or was skipped - and sweep jobs are skipped whenever
  # their config list is empty. The metadata only depends on setup's output,
  # so setup succeeding is the one real precondition.
  if: ${{ always() && needs.setup.result == 'success' }}
  runs-on: ubuntu-latest
  steps:
    # Pull the `changelog_metadata` member out of the setup output. The JSON
    # is handed over through the environment rather than interpolated into
    # the script text.
    - name: Extract and save changelog metadata
      env:
        CONFIG_JSON: ${{ needs.setup.outputs.search-space-config }}
      run: |
        echo "$CONFIG_JSON" | jq '.changelog_metadata' > changelog_metadata.json

    - name: Upload changelog artifact
      uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4  # v5.0.0
      with:
        name: changelog-metadata
        path: changelog_metadata.json

calc-success-rate:
  needs: collect-results
  # Runs regardless of upstream failures, unless collect-results was skipped.
  if: ${{ always() && needs.collect-results.result != 'skipped' }}
  runs-on: ubuntu-latest

  env:
    # Directory the result artifacts are downloaded into.
    RESULTS_DIR: "results/"
    # Base name (without extension) of the stats file uploaded at the end.
    STATS_FILENAME: "run_stats"
    GITHUB_TOKEN: ${{ secrets.REPO_PAT }}

  steps:
    - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3  # v6.0.0
      with:
        token: ${{ secrets.REPO_PAT }}
        fetch-depth: 0

    # Only artifacts whose name matches results_* are fetched.
    - name: Download results artifacts
      uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53  # v6.0.0
      with:
        path: ${{ env.RESULTS_DIR }}
        pattern: results_*

    - name: Install python dependencies
      run: pip install PyGithub

    - name: Calculate success rate
      run: python3 utils/calc_success_rate.py $STATS_FILENAME

    # Publishes <STATS_FILENAME>.json as the `run-stats` artifact.
    - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4  # v5.0.0
      with:
        name: "run-stats"
        path: ${{ env.STATS_FILENAME }}.json
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Changelog entry: the config key(s) it applies to (the `*` looks like a
# wildcard - confirm against utils/process_changelog.py) followed by a
# description of what changed.
# NOTE(review): indentation was lost in this view, so it cannot be told here
# whether `PR:` is a sibling key of `description` or the last line of the
# description block scalar - check the committed file before editing.
- config-keys:
- gptoss-fp4-*-trt
description: |
- Upgrade GPT-OSS TRT images from 'release:1.1.0rc2.post2' to '1.2.0rc0.post1'
- Add NCCL_GRAPH_REGISTER=0 to benchmarks/gptoss_fp4_b200_trt_slurm.sh
- Change kv_cache_config.dtype from 'auto' to 'fp8' in benchmarks/gptoss_fp4_b200_trt_slurm.sh
- Remove MOE_BACKEND=CUTLASS, now just defaults to TRTLLM
PR: https://github.com/InferenceMAX/InferenceMAX/pull/110

4 changes: 4 additions & 0 deletions utils/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Path constants, written relative to the repository root.

# Master configuration files (AMD and NVIDIA).
MASTER_CONFIGS = [
    ".github/configs/amd-master.yaml",
    ".github/configs/nvidia-master.yaml",
]

# Runner configuration file.
RUNNER_CONFIG = ".github/configs/runners.yaml"

# Script that generates the sweep configurations.
GENERATE_SWEEPS_PY_SCRIPT = "utils/matrix_logic/generate_sweep_configs.py"
Loading
Loading