From ba3f530e1020eae1fb1ad519a2a4b325e431b707 Mon Sep 17 00:00:00 2001 From: geomin12 Date: Tue, 28 Apr 2026 08:52:36 -0700 Subject: [PATCH 1/7] Add weighted build runner selection (90% Azure, 10% AWS) Add logic to configure_ci.py and configure_multi_arch_ci.py to select build runners using a weighted distribution: - Default builds: 90% azure-linux-scale-rocm, 10% aws-linux-scale-rocm - Sanitizer builds (asan/tsan): 100% azure-linux-scale-rocm-heavy-ramdisk Changes: - Add BUILD_RUNNER_LABELS config and select_build_runner() function to amdgpu_family_matrix.py - Add build_runs_on field to BuildConfig in configure_multi_arch_ci.py - Add build-runs-on field to matrix output in configure_ci.py - Pass build_runs_on through workflow chain to build stage jobs - Update multi_arch_build_portable_linux_artifacts.yml to use build_runs_on when provided, with fallback to existing logic Co-Authored-By: Claude Opus 4.6 --- .../multi_arch_build_portable_linux.yml | 14 ++++++ ...ti_arch_build_portable_linux_artifacts.yml | 7 ++- .../workflows/multi_arch_build_windows.yml | 4 ++ .github/workflows/multi_arch_ci_linux.yml | 1 + .github/workflows/multi_arch_ci_windows.yml | 1 + .../github_actions/amdgpu_family_matrix.py | 44 +++++++++++++++++++ build_tools/github_actions/configure_ci.py | 7 +++ .../github_actions/configure_multi_arch_ci.py | 7 +++ 8 files changed, 84 insertions(+), 1 deletion(-) diff --git a/.github/workflows/multi_arch_build_portable_linux.yml b/.github/workflows/multi_arch_build_portable_linux.yml index 7baee1a09f7..10dbf3799c0 100644 --- a/.github/workflows/multi_arch_build_portable_linux.yml +++ b/.github/workflows/multi_arch_build_portable_linux.yml @@ -42,6 +42,10 @@ on: type: string default: "" description: "Comma-separated build stages to skip (artifacts already copied by the orchestrator)" + build_runs_on: + type: string + default: "" + description: "Build runner label (selected by configure script with weighted distribution)" release_type: description: 
'Release type: "" for CI, or "dev", "nightly", "prerelease".' type: string @@ -74,6 +78,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -96,6 +101,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -128,6 +134,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -154,6 +161,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -176,6 +184,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -198,6 +207,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} 
build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -220,6 +230,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -242,6 +253,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -271,6 +283,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -293,6 +306,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} diff --git a/.github/workflows/multi_arch_build_portable_linux_artifacts.yml b/.github/workflows/multi_arch_build_portable_linux_artifacts.yml index 04da22a090f..1cd3132a859 100644 --- a/.github/workflows/multi_arch_build_portable_linux_artifacts.yml +++ b/.github/workflows/multi_arch_build_portable_linux_artifacts.yml @@ -53,6 +53,10 @@ on: description: 
'Release type: "" for CI, or "dev", "nightly", "prerelease". Controls artifact bucket and IAM role.' type: string default: "" + build_runs_on: + description: "Build runner label (selected by configure script with weighted distribution)" + type: string + default: "" repository: description: "Repository to checkout. Defaults to github.repository." type: string @@ -65,7 +69,8 @@ on: jobs: build_stage: name: ${{ inputs.stage_display_name }} - runs-on: ${{ contains(inputs.build_variant_cmake_preset, 'san') && 'azure-linux-scale-rocm-heavy-ramdisk' || 'azure-linux-scale-rocm' }} + # Use build_runs_on if provided, otherwise fall back to hardcoded logic for sanitizer vs default + runs-on: ${{ inputs.build_runs_on != '' && inputs.build_runs_on || (contains(inputs.build_variant_cmake_preset, 'san') && 'azure-linux-scale-rocm-heavy-ramdisk' || 'azure-linux-scale-rocm') }} timeout-minutes: ${{ inputs.timeout_minutes }} permissions: id-token: write diff --git a/.github/workflows/multi_arch_build_windows.yml b/.github/workflows/multi_arch_build_windows.yml index a20809c53ea..92163248ab5 100644 --- a/.github/workflows/multi_arch_build_windows.yml +++ b/.github/workflows/multi_arch_build_windows.yml @@ -38,6 +38,10 @@ on: type: string default: "" description: "Comma-separated build stages to skip (artifacts already copied by the orchestrator)" + build_runs_on: + type: string + default: "" + description: "Build runner label (not currently used for Windows, reserved for future use)" rocm_package_version: type: string release_type: diff --git a/.github/workflows/multi_arch_ci_linux.yml b/.github/workflows/multi_arch_ci_linux.yml index eaecc4cb8e7..12198b6a4e1 100644 --- a/.github/workflows/multi_arch_ci_linux.yml +++ b/.github/workflows/multi_arch_ci_linux.yml @@ -78,6 +78,7 @@ jobs: build_variant_suffix: ${{ fromJSON(inputs.build_config).build_variant_suffix }} expect_failure: ${{ fromJSON(inputs.build_config).expect_failure }} prebuilt_stages: ${{ 
fromJSON(inputs.build_config).prebuilt_stages }} + build_runs_on: ${{ fromJSON(inputs.build_config).build_runs_on }} rocm_package_version: ${{ inputs.rocm_package_version }} release_type: ${{ inputs.release_type }} permissions: diff --git a/.github/workflows/multi_arch_ci_windows.yml b/.github/workflows/multi_arch_ci_windows.yml index 56cb3505f9e..8910951b5cc 100644 --- a/.github/workflows/multi_arch_ci_windows.yml +++ b/.github/workflows/multi_arch_ci_windows.yml @@ -83,6 +83,7 @@ jobs: build_variant_suffix: ${{ fromJSON(inputs.build_config).build_variant_suffix }} expect_failure: ${{ fromJSON(inputs.build_config).expect_failure }} prebuilt_stages: ${{ fromJSON(inputs.build_config).prebuilt_stages }} + build_runs_on: ${{ fromJSON(inputs.build_config).build_runs_on }} rocm_package_version: ${{ inputs.rocm_package_version }} release_type: ${{ inputs.release_type }} permissions: diff --git a/build_tools/github_actions/amdgpu_family_matrix.py b/build_tools/github_actions/amdgpu_family_matrix.py index 448c9a6998f..5565139210e 100644 --- a/build_tools/github_actions/amdgpu_family_matrix.py +++ b/build_tools/github_actions/amdgpu_family_matrix.py @@ -45,6 +45,50 @@ def select_weighted_label(labels_config: list[dict], context_name: str) -> str: return selected["label"] +# Build runner configuration for Linux builds +# Uses weighted distribution: 90% Azure, 10% AWS +# Sanitizer builds (asan/tsan) use ramdisk variants (100% Azure, no AWS yet) +BUILD_RUNNER_LABELS = { + "linux": { + "default": [ + {"label": "azure-linux-scale-rocm", "weight": 0.90}, + {"label": "aws-linux-scale-rocm", "weight": 0.10}, + ], + "sanitizer": [ + {"label": "azure-linux-scale-rocm-heavy-ramdisk", "weight": 1.0}, + ], + }, +} + + +def select_build_runner(platform: str, build_variant: str) -> str: + """Select a build runner label based on platform and build variant. 
+ + Args: + platform: The platform to build for (e.g., "linux", "windows") + build_variant: The build variant (e.g., "release", "asan", "tsan") + + Returns: + A runner label string for the build job + """ + if platform not in BUILD_RUNNER_LABELS: + # Platform not configured for weighted selection, return default + print(f" No build runner config for platform {platform}, using default") + return "" + + platform_config = BUILD_RUNNER_LABELS[platform] + + # Use sanitizer runners for asan/tsan builds + if "san" in build_variant: + labels_config = platform_config.get("sanitizer", platform_config["default"]) + context_name = f"build-runner ({platform}, {build_variant})" + else: + labels_config = platform_config["default"] + context_name = f"build-runner ({platform})" + + return select_weighted_label(labels_config, context_name) + + all_build_variants = { "linux": { "release": { diff --git a/build_tools/github_actions/configure_ci.py b/build_tools/github_actions/configure_ci.py index a30645c913b..5c5752d02d9 100755 --- a/build_tools/github_actions/configure_ci.py +++ b/build_tools/github_actions/configure_ci.py @@ -57,6 +57,7 @@ from amdgpu_family_matrix import ( all_build_variants, get_all_families_for_trigger_types, + select_build_runner, select_weighted_label, ) from fetch_test_configurations import test_matrix, functional_matrix @@ -453,6 +454,12 @@ def matrix_generator( ): matrix_row["test-runs-on"] = matrix_row["test-runs-on-sandbox"] + # Select build runner using weighted distribution (90% Azure, 10% AWS) + # Sanitizer builds use ramdisk variants + matrix_row["build-runs-on"] = select_build_runner( + platform, base_args.get("build_variant", "release") + ) + matrix_output.append(matrix_row) print(f"Generated build matrix: {str(matrix_output)}") diff --git a/build_tools/github_actions/configure_multi_arch_ci.py b/build_tools/github_actions/configure_multi_arch_ci.py index 5e58c00dc45..f5be863e73f 100755 --- a/build_tools/github_actions/configure_multi_arch_ci.py +++ 
b/build_tools/github_actions/configure_multi_arch_ci.py @@ -58,6 +58,7 @@ from amdgpu_family_matrix import ( all_build_variants, get_all_families_for_trigger_types, + select_build_runner, select_weighted_label, ) from configure_ci_path_filters import ( @@ -417,6 +418,8 @@ class BuildConfig: build_variant_cmake_preset: str expect_failure: bool build_pytorch: bool + # Build runner label for this platform/variant combination + build_runs_on: str = "" # Prebuilt stage configuration — set by configure() from JobDecisions. prebuilt_stages: list[str] = field(default_factory=list) baseline_run_id: str = "" @@ -885,6 +888,9 @@ def _expand_build_config_for_platform( expect_pytorch_failure = variant_config.get("expect_pytorch_failure", False) suffix = variant_config.get("build_variant_suffix", "") + # Select build runner using weighted distribution + build_runs_on = select_build_runner(platform, ci_inputs.build_variant) + return BuildConfig( per_family_info=per_family_info, dist_amdgpu_families=";".join(family_names), @@ -894,6 +900,7 @@ def _expand_build_config_for_platform( build_variant_cmake_preset=variant_config["build_variant_cmake_preset"], expect_failure=expect_failure, build_pytorch=not expect_failure and not expect_pytorch_failure, + build_runs_on=build_runs_on, prebuilt_stages=prebuilt_stages or [], baseline_run_id=baseline_run_id, ) From 8d3cbb053a4f2558e59ffa6d1fd5c6286b99a4b2 Mon Sep 17 00:00:00 2001 From: geomin12 Date: Tue, 28 Apr 2026 08:59:07 -0700 Subject: [PATCH 2/7] updating inputs --- .../workflows/multi_arch_build_portable_linux_artifacts.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/multi_arch_build_portable_linux_artifacts.yml b/.github/workflows/multi_arch_build_portable_linux_artifacts.yml index 1cd3132a859..a197fb50c53 100644 --- a/.github/workflows/multi_arch_build_portable_linux_artifacts.yml +++ b/.github/workflows/multi_arch_build_portable_linux_artifacts.yml @@ -69,8 +69,7 @@ on: jobs: build_stage: 
name: ${{ inputs.stage_display_name }} - # Use build_runs_on if provided, otherwise fall back to hardcoded logic for sanitizer vs default - runs-on: ${{ inputs.build_runs_on != '' && inputs.build_runs_on || (contains(inputs.build_variant_cmake_preset, 'san') && 'azure-linux-scale-rocm-heavy-ramdisk' || 'azure-linux-scale-rocm') }} + runs-on: ${{ inputs.build_runs_on }} timeout-minutes: ${{ inputs.timeout_minutes }} permissions: id-token: write From 14c44f4d838398a9dc12376aecdc741d874091e7 Mon Sep 17 00:00:00 2001 From: geomin12 Date: Tue, 28 Apr 2026 09:05:56 -0700 Subject: [PATCH 3/7] Add tests for build runner selection logic Add TestBuildRunnerSelection with 2 deterministic tests: - Test weighted selection (90% Azure, 10% AWS) for default builds - Test sanitizer builds always use Azure ramdisk runner Co-Authored-By: Claude Opus 4.6 --- .../tests/configure_multi_arch_ci_test.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/build_tools/github_actions/tests/configure_multi_arch_ci_test.py b/build_tools/github_actions/tests/configure_multi_arch_ci_test.py index a4543e06335..40604c509c1 100644 --- a/build_tools/github_actions/tests/configure_multi_arch_ci_test.py +++ b/build_tools/github_actions/tests/configure_multi_arch_ci_test.py @@ -1134,5 +1134,44 @@ def test_families_without_multi_label_use_primary_only(self): self.assertEqual(gfx103x_info["test-runs-on"], "linux-gfx1030-gpu-rocm") +# --------------------------------------------------------------------------- +# Build runner selection +# --------------------------------------------------------------------------- + + +class TestBuildRunnerSelection(unittest.TestCase): + """Test weighted random selection of build runners (Azure vs AWS).""" + + def test_select_build_runner_weighted_selection(self): + """Test weighted selection: Azure (90%) vs AWS (10%) for default builds.""" + from amdgpu_family_matrix import select_build_runner + + # Random < 0.9 should select Azure + with 
patch("random.random", return_value=0.5): + self.assertEqual( + select_build_runner("linux", "release"), "azure-linux-scale-rocm" + ) + + # Random >= 0.9 should select AWS + with patch("random.random", return_value=0.95): + self.assertEqual( + select_build_runner("linux", "release"), "aws-linux-scale-rocm" + ) + + def test_select_build_runner_sanitizer_uses_ramdisk(self): + """Sanitizer builds (asan/tsan) should always use Azure ramdisk runner.""" + from amdgpu_family_matrix import select_build_runner + + with patch("random.random", return_value=0.99): + self.assertEqual( + select_build_runner("linux", "asan"), + "azure-linux-scale-rocm-heavy-ramdisk", + ) + self.assertEqual( + select_build_runner("linux", "tsan"), + "azure-linux-scale-rocm-heavy-ramdisk", + ) + + if __name__ == "__main__": unittest.main() From af78afd5ddc431df742ea42cd2868cad71d8aaed Mon Sep 17 00:00:00 2001 From: geomin12 Date: Tue, 28 Apr 2026 09:41:06 -0700 Subject: [PATCH 4/7] precommit --- build_tools/github_actions/amdgpu_family_matrix.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/build_tools/github_actions/amdgpu_family_matrix.py b/build_tools/github_actions/amdgpu_family_matrix.py index 5565139210e..5e79d2d4614 100644 --- a/build_tools/github_actions/amdgpu_family_matrix.py +++ b/build_tools/github_actions/amdgpu_family_matrix.py @@ -58,6 +58,11 @@ def select_weighted_label(labels_config: list[dict], context_name: str) -> str: {"label": "azure-linux-scale-rocm-heavy-ramdisk", "weight": 1.0}, ], }, + "windows": { + "default": [ + {"label": "azure-windows-scale-rocm", "weight": 1.0}, + ], + }, } From 6fb777fff5f88957156bcb1970600e60d9d892ab Mon Sep 17 00:00:00 2001 From: geomin12 Date: Tue, 28 Apr 2026 09:42:10 -0700 Subject: [PATCH 5/7] adding default --- .github/workflows/multi_arch_build_portable_linux.yml | 2 +- .github/workflows/multi_arch_build_portable_linux_artifacts.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/multi_arch_build_portable_linux.yml b/.github/workflows/multi_arch_build_portable_linux.yml index 10dbf3799c0..e47ba963fd3 100644 --- a/.github/workflows/multi_arch_build_portable_linux.yml +++ b/.github/workflows/multi_arch_build_portable_linux.yml @@ -44,7 +44,7 @@ on: description: "Comma-separated build stages to skip (artifacts already copied by the orchestrator)" build_runs_on: type: string - default: "" + default: "azure-linux-scale-rocm" description: "Build runner label (selected by configure script with weighted distribution)" release_type: description: 'Release type: "" for CI, or "dev", "nightly", "prerelease".' diff --git a/.github/workflows/multi_arch_build_portable_linux_artifacts.yml b/.github/workflows/multi_arch_build_portable_linux_artifacts.yml index a197fb50c53..6bcf3da548d 100644 --- a/.github/workflows/multi_arch_build_portable_linux_artifacts.yml +++ b/.github/workflows/multi_arch_build_portable_linux_artifacts.yml @@ -56,7 +56,7 @@ on: build_runs_on: description: "Build runner label (selected by configure script with weighted distribution)" type: string - default: "" + default: "azure-linux-scale-rocm" repository: description: "Repository to checkout. Defaults to github.repository." 
type: string From b1af0777a1f3a53080ef9425fda0949bbdb7360a Mon Sep 17 00:00:00 2001 From: geomin12 Date: Tue, 28 Apr 2026 09:42:54 -0700 Subject: [PATCH 6/7] adding windows tests --- .../github_actions/tests/configure_multi_arch_ci_test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/build_tools/github_actions/tests/configure_multi_arch_ci_test.py b/build_tools/github_actions/tests/configure_multi_arch_ci_test.py index 40604c509c1..02969a475da 100644 --- a/build_tools/github_actions/tests/configure_multi_arch_ci_test.py +++ b/build_tools/github_actions/tests/configure_multi_arch_ci_test.py @@ -1158,6 +1158,12 @@ def test_select_build_runner_weighted_selection(self): select_build_runner("linux", "release"), "aws-linux-scale-rocm" ) + # Windows has only one configured runner, so it is selected regardless of the random value + with patch("random.random", return_value=0.95): + self.assertEqual( + select_build_runner("windows", "release"), "azure-windows-scale-rocm" + ) + def test_select_build_runner_sanitizer_uses_ramdisk(self): """Sanitizer builds (asan/tsan) should always use Azure ramdisk runner.""" from amdgpu_family_matrix import select_build_runner From 8f9c6c278036345843cd79b686d5fd19fc4f11eb Mon Sep 17 00:00:00 2001 From: geomin12 Date: Tue, 28 Apr 2026 10:45:54 -0700 Subject: [PATCH 7/7] condensing comment --- build_tools/github_actions/amdgpu_family_matrix.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/build_tools/github_actions/amdgpu_family_matrix.py b/build_tools/github_actions/amdgpu_family_matrix.py index 5e79d2d4614..0ac01d18e1d 100644 --- a/build_tools/github_actions/amdgpu_family_matrix.py +++ b/build_tools/github_actions/amdgpu_family_matrix.py @@ -67,15 +67,7 @@ def select_weighted_label(labels_config: list[dict], context_name: str) -> str: def select_build_runner(platform: str, build_variant: str) -> str: - """Select a build runner label based on platform and build variant.
- - Args: - platform: The platform to build for (e.g., "linux", "windows") - build_variant: The build variant (e.g., "release", "asan", "tsan") - - Returns: - A runner label string for the build job - """ + """Select a build runner label based on platform and build variant.""" if platform not in BUILD_RUNNER_LABELS: # Platform not configured for weighted selection, return default print(f" No build runner config for platform {platform}, using default")