diff --git a/.github/workflows/build_windows_pytorch_wheels.yml b/.github/workflows/build_windows_pytorch_wheels.yml index b2a143eac24..ddfb1810bc3 100644 --- a/.github/workflows/build_windows_pytorch_wheels.yml +++ b/.github/workflows/build_windows_pytorch_wheels.yml @@ -367,7 +367,7 @@ jobs: env: TARGET: ${{ inputs.amdgpu_family }} PLATFORM: "windows" - run: python ./build_tools/github_actions/configure_target_run.py + run: python ./build_tools/github_actions/configure_target_run.py --test-project-name=pytorch test_pytorch_wheels: name: Test | ${{ inputs.amdgpu_family }} | ${{ needs.generate_target_to_run.outputs.test_runs_on }} diff --git a/build_tools/github_actions/amdgpu_family_matrix.py b/build_tools/github_actions/amdgpu_family_matrix.py index c8f00f8e3fd..82c8cc30ed9 100644 --- a/build_tools/github_actions/amdgpu_family_matrix.py +++ b/build_tools/github_actions/amdgpu_family_matrix.py @@ -57,6 +57,8 @@ - test-runs-on-alternate-weight: (optional) Probability (0.0-1.0) of selecting the alternate runner. - test-runs-on-multi-gpu: (optional) GitHub runner label for multi-GPU tests for this architecture - benchmark-runs-on: (optional) GitHub runner label for benchmarks for this architecture +- pytorch-ci-test-runs-on: (optional) GitHub runner label for PyTorch wheel tests only; when set, + the workflow should pass `--test-project-name=pytorch` to configure_target_run.py to use this label instead of test-runs-on - test-runs-on-kernel: (optional) dict of kernel-specific runner labels, keyed by kernel type (e.g. "oem") - family: (required) AMD GPU family name, used for test selection and artifact fetching - fetch-gfx-targets: (required) list of gfx targets to fetch split test artifacts for (e.g. 
["gfx942", "gfx942:xnack+"]) @@ -120,6 +122,7 @@ }, "windows": { "test-runs-on": "windows-gfx1151-gpu-rocm", + "pytorch-ci-test-runs-on": "windows-strix-halo-gpu-rocm-128gb", # TODO(#2754): Add new benchmark-runs-on runner for benchmarks "benchmark-runs-on": "windows-gfx1151-gpu-rocm", "family": "gfx1151", diff --git a/build_tools/github_actions/configure_target_run.py b/build_tools/github_actions/configure_target_run.py index ddac03c59f9..11cbfb42809 100644 --- a/build_tools/github_actions/configure_target_run.py +++ b/build_tools/github_actions/configure_target_run.py @@ -7,16 +7,45 @@ * 'TARGET': A GPU family like 'gfx95X-dcgpu' or 'gfx1151', corresponding to a release index. * 'PLATFORM': "linux" or "windows" + +Command-line: + * `--test-project-name`: When set to `pytorch`, use `pytorch-ci-test-runs-on` + instead of `test-runs-on` label. Workflows that need specific runners for + PyTorch testing should pass this explicitly. """ +import argparse import os from amdgpu_family_matrix import get_all_families_for_trigger_types from github_actions_api import * +test_project_runs_on_label = { + "pytorch": "pytorch-ci-test-runs-on", +} + + +def validate_test_project_name(project_name: str) -> str: + """Validate the test project name. + + Empty input returns ``""`` (use default ``test-runs-on`` in the matrix). + Unknown names raise ``argparse.ArgumentTypeError``. + """ + if not project_name: + return "" + + if project_name in test_project_runs_on_label: + return project_name + + raise argparse.ArgumentTypeError( + f"Project '{project_name}' does not have a dedicated test runner label." 
+ ) + -def get_runner_label(target: str, platform: str) -> str: +def get_runner_label(target: str, platform: str, *, test_project_name: str = "") -> str: print(f"Searching for a runner for target '{target}' on platform '{platform}'") + if test_project_name: + print(f"Using test project name: '{test_project_name}'") amdgpu_family_info_matrix = get_all_families_for_trigger_types( ["presubmit", "postsubmit"] ) @@ -41,8 +70,15 @@ def get_runner_label(target: str, platform: str) -> str: ) continue - # If there is a test machine available for this target, run on it. - test_runs_on_machine = platform_for_key.get("test-runs-on") + # Optional per-project matrix key (e.g. pytorch-ci-test-runs-on); missing + # or empty dedicated label falls back to test-runs-on. + if test_project_name: + test_runs_on_machine = platform_for_key.get( + test_project_runs_on_label[test_project_name] + ) or platform_for_key.get("test-runs-on") + else: + test_runs_on_machine = platform_for_key.get("test-runs-on") + if test_runs_on_machine: print(f" Found runner: '{test_runs_on_machine}'") return test_runs_on_machine @@ -83,8 +119,10 @@ def get_upload_label(target: str, platform: str) -> str: return "" -def main(target: str, platform: str): - runner_label = get_runner_label(target, platform) +def main(target: str, platform: str, *, test_project_name: str = ""): + runner_label = get_runner_label( + target, platform, test_project_name=test_project_name + ) if runner_label: gha_set_output({"test-runs-on": runner_label}) upload_label = get_upload_label(target, platform) @@ -93,6 +131,20 @@ def main(target: str, platform: str): if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--test-project-name", + default="", + type=validate_test_project_name, + help=( + "Request project specific test runner label. e.g. 'pytorch' for `pytorch-ci-test-runs-on` label." 
+ ), + ) + args = parser.parse_args() target = os.getenv("TARGET", "") platform = os.getenv("PLATFORM", "") - main(target=target, platform=platform) + main( + target=target, + platform=platform, + test_project_name=args.test_project_name, + ) diff --git a/build_tools/github_actions/tests/configure_target_run_test.py b/build_tools/github_actions/tests/configure_target_run_test.py index bb79c7073eb..770ea068e4f 100644 --- a/build_tools/github_actions/tests/configure_target_run_test.py +++ b/build_tools/github_actions/tests/configure_target_run_test.py @@ -29,6 +29,19 @@ def test_windows_gfx115x(self): runner_label = configure_target_run.get_runner_label("gfx1151", "windows") self.assertEqual(runner_label, "windows-gfx1151-gpu-rocm") + def test_windows_gfx1151_pytorch_ci_test_runner(self): + runner_label = configure_target_run.get_runner_label( + "gfx1151", "windows", test_project_name="pytorch" + ) + self.assertEqual(runner_label, "windows-strix-halo-gpu-rocm-128gb") + + def test_linux_gfx1151_pytorch_ci_test_runner(self): + # fallback to default test runner label when pytorch-ci-test-runs-on is not set + runner_label = configure_target_run.get_runner_label( + "gfx1151", "linux", test_project_name="pytorch" + ) + self.assertEqual(runner_label, "linux-gfx1151-gpu-rocm") + def test_windows_gfx120X_all(self): runner_label = configure_target_run.get_runner_label("gfx120X-all", "windows") # No runner label yet.