From 372864bdc65ec51056aee26f4756cc0af07f91e5 Mon Sep 17 00:00:00 2001 From: zichguan-amd Date: Thu, 16 Apr 2026 14:47:05 -0400 Subject: [PATCH 1/2] Use 128GB runners for gfx1151 pytorch CI on Windows Signed-off-by: zichguan-amd --- .../github_actions/amdgpu_family_matrix.py | 3 +++ .../github_actions/configure_target_run.py | 24 +++++++++++++++++-- .../tests/configure_target_run_test.py | 7 ++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/build_tools/github_actions/amdgpu_family_matrix.py b/build_tools/github_actions/amdgpu_family_matrix.py index c8f00f8e3fd..094ead3bbc5 100644 --- a/build_tools/github_actions/amdgpu_family_matrix.py +++ b/build_tools/github_actions/amdgpu_family_matrix.py @@ -57,6 +57,8 @@ - test-runs-on-alternate-weight: (optional) Probability (0.0-1.0) of selecting the alternate runner. - test-runs-on-multi-gpu: (optional) GitHub runner label for multi-GPU tests for this architecture - benchmark-runs-on: (optional) GitHub runner label for benchmarks for this architecture +- pytorch-ci-test-runs-on: (optional) GitHub runner label for PyTorch wheel tests only; when set, + portable PyTorch test jobs use this label instead of test-runs-on (other workflows keep test-runs-on) - test-runs-on-kernel: (optional) dict of kernel-specific runner labels, keyed by kernel type (e.g. "oem") - family: (required) AMD GPU family name, used for test selection and artifact fetching - fetch-gfx-targets: (required) list of gfx targets to fetch split test artifacts for (e.g. 
["gfx942", "gfx942:xnack+"]) @@ -120,6 +122,7 @@ }, "windows": { "test-runs-on": "windows-gfx1151-gpu-rocm", + "pytorch-ci-test-runs-on": "windows-strix-halo-gpu-rocm-128gb", # TODO(#2754): Add new benchmark-runs-on runner for benchmarks "benchmark-runs-on": "windows-gfx1151-gpu-rocm", "family": "gfx1151", diff --git a/build_tools/github_actions/configure_target_run.py b/build_tools/github_actions/configure_target_run.py index ddac03c59f9..11366b50c63 100644 --- a/build_tools/github_actions/configure_target_run.py +++ b/build_tools/github_actions/configure_target_run.py @@ -15,6 +15,15 @@ from github_actions_api import * +def is_pytorch_wheel_workflow() -> bool: + """True when this process runs from a *pytorch_wheels*.yml GitHub Actions workflow. + + Matches the workflow file path in ``GITHUB_WORKFLOW_REF`` (stable) + """ + ref = os.getenv("GITHUB_WORKFLOW_REF", "") + return "pytorch_wheels" in ref.replace("\\", "/").lower() + + def get_runner_label(target: str, platform: str) -> str: print(f"Searching for a runner for target '{target}' on platform '{platform}'") amdgpu_family_info_matrix = get_all_families_for_trigger_types( @@ -41,8 +50,19 @@ def get_runner_label(target: str, platform: str) -> str: ) continue - # If there is a test machine available for this target, run on it. - test_runs_on_machine = platform_for_key.get("test-runs-on") + # `pytorch-ci-test-runs-on` is used only for Windows gfx1151 when the workflow + # is a `*pytorch_wheels*.yml` job; all other families use `test-runs-on`. 
+ use_pytorch_ci_windows_gfx1151 = ( + is_pytorch_wheel_workflow() + and platform == "windows" + and family_for_platform == "gfx1151" + ) + if use_pytorch_ci_windows_gfx1151: + test_runs_on_machine = platform_for_key.get( + "pytorch-ci-test-runs-on" + ) or platform_for_key.get("test-runs-on") + else: + test_runs_on_machine = platform_for_key.get("test-runs-on") if test_runs_on_machine: print(f" Found runner: '{test_runs_on_machine}'") return test_runs_on_machine diff --git a/build_tools/github_actions/tests/configure_target_run_test.py b/build_tools/github_actions/tests/configure_target_run_test.py index bb79c7073eb..0c1f62cc743 100644 --- a/build_tools/github_actions/tests/configure_target_run_test.py +++ b/build_tools/github_actions/tests/configure_target_run_test.py @@ -5,6 +5,7 @@ import os import sys import unittest +from unittest.mock import patch sys.path.insert(0, os.fspath(Path(__file__).parent.parent)) import configure_target_run @@ -29,6 +30,12 @@ def test_windows_gfx115x(self): runner_label = configure_target_run.get_runner_label("gfx1151", "windows") self.assertEqual(runner_label, "windows-gfx1151-gpu-rocm") + def test_windows_gfx1151_pytorch_ci_runner(self): + ref = "refs/heads/main/.github/workflows/build_windows_pytorch_wheels.yml" + with patch.dict(os.environ, {"GITHUB_WORKFLOW_REF": ref}, clear=False): + runner_label = configure_target_run.get_runner_label("gfx1151", "windows") + self.assertEqual(runner_label, "windows-strix-halo-gpu-rocm-128gb") + def test_windows_gfx120X_all(self): runner_label = configure_target_run.get_runner_label("gfx120X-all", "windows") # No runner label yet. 
From 14971ff60276dd27be2c411b93b40dcc7087ad75 Mon Sep 17 00:00:00 2001 From: zichguan-amd Date: Wed, 29 Apr 2026 14:17:29 -0400 Subject: [PATCH 2/2] Use workflow provided argument to determine runner label Signed-off-by: zichguan-amd --- .../build_windows_pytorch_wheels.yml | 3 +- .../github_actions/amdgpu_family_matrix.py | 2 +- .../github_actions/configure_target_run.py | 68 ++++++++++++++----- .../tests/configure_target_run_test.py | 16 +++-- 4 files changed, 64 insertions(+), 25 deletions(-) diff --git a/.github/workflows/build_windows_pytorch_wheels.yml b/.github/workflows/build_windows_pytorch_wheels.yml index b2a143eac24..ddfb1810bc3 100644 --- a/.github/workflows/build_windows_pytorch_wheels.yml +++ b/.github/workflows/build_windows_pytorch_wheels.yml @@ -367,7 +367,8 @@ jobs: env: TARGET: ${{ inputs.amdgpu_family }} PLATFORM: "windows" - run: python ./build_tools/github_actions/configure_target_run.py + run: python ./build_tools/github_actions/configure_target_run.py \ + --test-project-name=pytorch test_pytorch_wheels: name: Test | ${{ inputs.amdgpu_family }} | ${{ needs.generate_target_to_run.outputs.test_runs_on }} diff --git a/build_tools/github_actions/amdgpu_family_matrix.py b/build_tools/github_actions/amdgpu_family_matrix.py index 094ead3bbc5..82c8cc30ed9 100644 --- a/build_tools/github_actions/amdgpu_family_matrix.py +++ b/build_tools/github_actions/amdgpu_family_matrix.py @@ -58,7 +58,7 @@ - test-runs-on-multi-gpu: (optional) GitHub runner label for multi-GPU tests for this architecture - benchmark-runs-on: (optional) GitHub runner label for benchmarks for this architecture - pytorch-ci-test-runs-on: (optional) GitHub runner label for PyTorch wheel tests only; when set, - portable PyTorch test jobs use this label instead of test-runs-on (other workflows keep test-runs-on) + the workflow should pass `--test-project-name=pytorch` to configure_target_run.py to use this label instead of test-runs-on - test-runs-on-kernel: (optional) dict of 
kernel-specific runner labels, keyed by kernel type (e.g. "oem") - family: (required) AMD GPU family name, used for test selection and artifact fetching - fetch-gfx-targets: (required) list of gfx targets to fetch split test artifacts for (e.g. ["gfx942", "gfx942:xnack+"]) diff --git a/build_tools/github_actions/configure_target_run.py b/build_tools/github_actions/configure_target_run.py index 11366b50c63..11cbfb42809 100644 --- a/build_tools/github_actions/configure_target_run.py +++ b/build_tools/github_actions/configure_target_run.py @@ -7,25 +7,45 @@ * 'TARGET': A GPU family like 'gfx95X-dcgpu' or 'gfx1151', corresponding to a release index. * 'PLATFORM': "linux" or "windows" + +Command-line: + * `--test-project-name`: When set to `pytorch`, use `pytorch-ci-test-runs-on` + instead of `test-runs-on` label. Workflows that need specific runners for + PyTorch testing should pass this explicitly. """ +import argparse import os from amdgpu_family_matrix import get_all_families_for_trigger_types from github_actions_api import * +test_project_runs_on_label = { + "pytorch": "pytorch-ci-test-runs-on", +} + -def is_pytorch_wheel_workflow() -> bool: - """True when this process runs from a *pytorch_wheels*.yml GitHub Actions workflow. +def validate_test_project_name(project_name: str) -> str: + """Validate the test project name. - Matches the workflow file path in ``GITHUB_WORKFLOW_REF`` (stable) + Empty input returns ``""`` (use default ``test-runs-on`` in the matrix). + Unknown names raise ``argparse.ArgumentTypeError``. """ - ref = os.getenv("GITHUB_WORKFLOW_REF", "") - return "pytorch_wheels" in ref.replace("\\", "/").lower() + if not project_name: + return "" + + if project_name in test_project_runs_on_label: + return project_name + + raise argparse.ArgumentTypeError( + f"Project '{project_name}' does not have a dedicated test runner label." 
+ ) -def get_runner_label(target: str, platform: str) -> str: +def get_runner_label(target: str, platform: str, *, test_project_name: str = "") -> str: print(f"Searching for a runner for target '{target}' on platform '{platform}'") + if test_project_name: + print(f"Using test project name: '{test_project_name}'") amdgpu_family_info_matrix = get_all_families_for_trigger_types( ["presubmit", "postsubmit"] ) @@ -50,19 +70,15 @@ def get_runner_label(target: str, platform: str) -> str: ) continue - # `pytorch-ci-test-runs-on` is used only for Windows gfx1151 when the workflow - # is a `*pytorch_wheels*.yml` job; all other families use `test-runs-on`. - use_pytorch_ci_windows_gfx1151 = ( - is_pytorch_wheel_workflow() - and platform == "windows" - and family_for_platform == "gfx1151" - ) - if use_pytorch_ci_windows_gfx1151: + # Optional per-project matrix key (e.g. pytorch-ci-test-runs-on); missing + # or empty dedicated label falls back to test-runs-on. + if test_project_name: test_runs_on_machine = platform_for_key.get( - "pytorch-ci-test-runs-on" + test_project_runs_on_label[test_project_name] ) or platform_for_key.get("test-runs-on") else: test_runs_on_machine = platform_for_key.get("test-runs-on") + if test_runs_on_machine: print(f" Found runner: '{test_runs_on_machine}'") return test_runs_on_machine @@ -103,8 +119,10 @@ def get_upload_label(target: str, platform: str) -> str: return "" -def main(target: str, platform: str): - runner_label = get_runner_label(target, platform) +def main(target: str, platform: str, *, test_project_name: str = ""): + runner_label = get_runner_label( + target, platform, test_project_name=test_project_name + ) if runner_label: gha_set_output({"test-runs-on": runner_label}) upload_label = get_upload_label(target, platform) @@ -113,6 +131,20 @@ def main(target: str, platform: str): if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--test-project-name", + default="", + type=validate_test_project_name, 
+ help=( + "Request project specific test runner label. e.g. 'pytorch' for `pytorch-ci-test-runs-on` label." + ), + ) + args = parser.parse_args() target = os.getenv("TARGET", "") platform = os.getenv("PLATFORM", "") - main(target=target, platform=platform) + main( + target=target, + platform=platform, + test_project_name=args.test_project_name, + ) diff --git a/build_tools/github_actions/tests/configure_target_run_test.py b/build_tools/github_actions/tests/configure_target_run_test.py index 0c1f62cc743..770ea068e4f 100644 --- a/build_tools/github_actions/tests/configure_target_run_test.py +++ b/build_tools/github_actions/tests/configure_target_run_test.py @@ -5,7 +5,6 @@ import os import sys import unittest -from unittest.mock import patch sys.path.insert(0, os.fspath(Path(__file__).parent.parent)) import configure_target_run @@ -30,12 +29,19 @@ def test_windows_gfx115x(self): runner_label = configure_target_run.get_runner_label("gfx1151", "windows") self.assertEqual(runner_label, "windows-gfx1151-gpu-rocm") - def test_windows_gfx1151_pytorch_ci_runner(self): - ref = "refs/heads/main/.github/workflows/build_windows_pytorch_wheels.yml" - with patch.dict(os.environ, {"GITHUB_WORKFLOW_REF": ref}, clear=False): - runner_label = configure_target_run.get_runner_label("gfx1151", "windows") + def test_windows_gfx1151_pytorch_ci_test_runner(self): + runner_label = configure_target_run.get_runner_label( + "gfx1151", "windows", test_project_name="pytorch" + ) self.assertEqual(runner_label, "windows-strix-halo-gpu-rocm-128gb") + def test_linux_gfx1151_pytorch_ci_test_runner(self): + # fallback to default test runner label when pytorch-ci-test-runs-on is not set + runner_label = configure_target_run.get_runner_label( + "gfx1151", "linux", test_project_name="pytorch" + ) + self.assertEqual(runner_label, "linux-gfx1151-gpu-rocm") + def test_windows_gfx120X_all(self): runner_label = configure_target_run.get_runner_label("gfx120X-all", "windows") # No runner label yet.