Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/build_windows_pytorch_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,8 @@ jobs:
env:
TARGET: ${{ inputs.amdgpu_family }}
PLATFORM: "windows"
run: python ./build_tools/github_actions/configure_target_run.py
run: python ./build_tools/github_actions/configure_target_run.py \
--test-project-name=pytorch

test_pytorch_wheels:
name: Test | ${{ inputs.amdgpu_family }} | ${{ needs.generate_target_to_run.outputs.test_runs_on }}
Expand Down
3 changes: 3 additions & 0 deletions build_tools/github_actions/amdgpu_family_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@
- test-runs-on-alternate-weight: (optional) Probability (0.0-1.0) of selecting the alternate runner.
- test-runs-on-multi-gpu: (optional) GitHub runner label for multi-GPU tests for this architecture
- benchmark-runs-on: (optional) GitHub runner label for benchmarks for this architecture
- pytorch-ci-test-runs-on: (optional) GitHub runner label for PyTorch wheel tests only; when set,
the workflow should pass `--test-project-name=pytorch` to configure_target_run.py to use this label instead of test-runs-on
- test-runs-on-kernel: (optional) dict of kernel-specific runner labels, keyed by kernel type (e.g. "oem")
- family: (required) AMD GPU family name, used for test selection and artifact fetching
- fetch-gfx-targets: (required) list of gfx targets to fetch split test artifacts for (e.g. ["gfx942", "gfx942:xnack+"])
Expand Down Expand Up @@ -120,6 +122,7 @@
},
"windows": {
"test-runs-on": "windows-gfx1151-gpu-rocm",
Comment thread
zichguan-amd marked this conversation as resolved.
"pytorch-ci-test-runs-on": "windows-strix-halo-gpu-rocm-128gb",
Comment on lines 124 to +125
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@geomin12 / @amd-shiraz / @amd-justchen

What's our spread of test runners for this windows-gfx1151-gpu-rocm label?

We should either:

  1. Have all machines using the same specific runner label have the same system configuration
  2. Have all workflows requesting the same generic runner label pass tests

Right now windows-gfx1151-gpu-rocm seems to include runners with multiple different system configurations and the tests are not passing.

Copy link
Copy Markdown
Contributor

@amd-justchen amd-justchen Apr 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think they are still including all of the machines from 16, 32, 64, 128gb of total RAM. There was a point where I started adding runner labels for minimum amount of RAM for tests to select. Plumbing needs to be in place for that though, @geomin12 thoughts?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From my experience there are 128gb models and 64gb models, all configured to the maximum carveout sizes (96gb and 48gb iirc).

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any update here? Does the runner label exist?
Code-wise the PR looks good now, but I do not know what the runner status is.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The label windows-strix-halo-gpu-rocm-128gb exists and currently has 6 runners. Do we want something else?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@geomin12 / @amd-shiraz / @amd-justchen
any opinion? from my side it looks good to merge.

# TODO(#2754): Add new benchmark-runs-on runner for benchmarks
"benchmark-runs-on": "windows-gfx1151-gpu-rocm",
"family": "gfx1151",
Expand Down
64 changes: 58 additions & 6 deletions build_tools/github_actions/configure_target_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,45 @@
* 'TARGET': A GPU family like 'gfx95X-dcgpu' or 'gfx1151', corresponding
to a release index.
* 'PLATFORM': "linux" or "windows"

Command-line:
* `--test-project-name`: When set to `pytorch`, use `pytorch-ci-test-runs-on`
instead of the `test-runs-on` label. Workflows that need specific runners for
PyTorch testing should pass this explicitly.
"""

import argparse
import os
from amdgpu_family_matrix import get_all_families_for_trigger_types

from github_actions_api import *

# Maps a test project name (the value of `--test-project-name`) to the matrix
# key that holds that project's dedicated test runner label.
test_project_runs_on_label = {
    "pytorch": "pytorch-ci-test-runs-on",
}


def validate_test_project_name(project_name: str) -> str:
    """Validate the test project name.

    Args:
        project_name: Raw value supplied for ``--test-project-name``.

    Returns:
        ``""`` for empty input (use default ``test-runs-on`` in the matrix),
        otherwise ``project_name`` unchanged when it is a key of
        ``test_project_runs_on_label``.

    Raises:
        argparse.ArgumentTypeError: If the name is non-empty and unknown. The
            message lists the supported names so the error is actionable.
    """
    if not project_name:
        return ""

    if project_name in test_project_runs_on_label:
        return project_name

    # Sorted so the message is stable regardless of dict insertion order.
    supported = ", ".join(sorted(test_project_runs_on_label))
    raise argparse.ArgumentTypeError(
        f"Project '{project_name}' does not have a dedicated test runner label. "
        f"Supported projects: {supported}."
    )


def get_runner_label(target: str, platform: str) -> str:
def get_runner_label(target: str, platform: str, *, test_project_name: str = "") -> str:
print(f"Searching for a runner for target '{target}' on platform '{platform}'")
if test_project_name:
print(f"Using test project name: '{test_project_name}'")
amdgpu_family_info_matrix = get_all_families_for_trigger_types(
["presubmit", "postsubmit"]
)
Expand All @@ -41,8 +70,15 @@ def get_runner_label(target: str, platform: str) -> str:
)
continue

# If there is a test machine available for this target, run on it.
test_runs_on_machine = platform_for_key.get("test-runs-on")
# Optional per-project matrix key (e.g. pytorch-ci-test-runs-on); missing
# or empty dedicated label falls back to test-runs-on.
if test_project_name:
test_runs_on_machine = platform_for_key.get(
test_project_runs_on_label[test_project_name]
) or platform_for_key.get("test-runs-on")
else:
test_runs_on_machine = platform_for_key.get("test-runs-on")

if test_runs_on_machine:
print(f" Found runner: '{test_runs_on_machine}'")
return test_runs_on_machine
Expand Down Expand Up @@ -83,8 +119,10 @@ def get_upload_label(target: str, platform: str) -> str:
return ""


def main(target: str, platform: str):
runner_label = get_runner_label(target, platform)
def main(target: str, platform: str, *, test_project_name: str = ""):
runner_label = get_runner_label(
target, platform, test_project_name=test_project_name
)
if runner_label:
gha_set_output({"test-runs-on": runner_label})
upload_label = get_upload_label(target, platform)
Expand All @@ -93,6 +131,20 @@ def main(target: str, platform: str):


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--test-project-name",
default="",
type=validate_test_project_name,
help=(
"Request project specific test runner label. e.g. 'pytorch' for `pytorch-ci-test-runs-on` label."
),
)
args = parser.parse_args()
target = os.getenv("TARGET", "")
platform = os.getenv("PLATFORM", "")
main(target=target, platform=platform)
main(
target=target,
platform=platform,
test_project_name=args.test_project_name,
)
13 changes: 13 additions & 0 deletions build_tools/github_actions/tests/configure_target_run_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,19 @@ def test_windows_gfx115x(self):
runner_label = configure_target_run.get_runner_label("gfx1151", "windows")
Comment thread
zichguan-amd marked this conversation as resolved.
self.assertEqual(runner_label, "windows-gfx1151-gpu-rocm")

def test_windows_gfx1151_pytorch_ci_test_runner(self):
    # Requesting the pytorch test project on Windows gfx1151 is expected to
    # yield the dedicated PyTorch runner label rather than the generic one.
    self.assertEqual(
        configure_target_run.get_runner_label(
            "gfx1151", "windows", test_project_name="pytorch"
        ),
        "windows-strix-halo-gpu-rocm-128gb",
    )

def test_linux_gfx1151_pytorch_ci_test_runner(self):
    # When pytorch-ci-test-runs-on is not set for the platform, the lookup
    # should fall back to the default test runner label.
    self.assertEqual(
        configure_target_run.get_runner_label(
            "gfx1151", "linux", test_project_name="pytorch"
        ),
        "linux-gfx1151-gpu-rocm",
    )

def test_windows_gfx120X_all(self):
runner_label = configure_target_run.get_runner_label("gfx120X-all", "windows")
# No runner label yet.
Expand Down
Loading