diff --git a/.github/workflows/multi_arch_build_portable_linux.yml b/.github/workflows/multi_arch_build_portable_linux.yml index 7baee1a09f7..e47ba963fd3 100644 --- a/.github/workflows/multi_arch_build_portable_linux.yml +++ b/.github/workflows/multi_arch_build_portable_linux.yml @@ -42,6 +42,10 @@ on: type: string default: "" description: "Comma-separated build stages to skip (artifacts already copied by the orchestrator)" + build_runs_on: + type: string + default: "azure-linux-scale-rocm" + description: "Build runner label (selected by configure script with weighted distribution)" release_type: description: 'Release type: "" for CI, or "dev", "nightly", "prerelease".' type: string @@ -74,6 +78,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -96,6 +101,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -128,6 +134,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -154,6 +161,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ 
inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -176,6 +184,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -198,6 +207,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -220,6 +230,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -242,6 +253,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -271,6 +283,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -293,6 +306,7 @@ jobs: dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }} 
rocm_package_version: ${{ inputs.rocm_package_version }} build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }} + build_runs_on: ${{ inputs.build_runs_on }} release_type: ${{ inputs.release_type }} repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} diff --git a/.github/workflows/multi_arch_build_portable_linux_artifacts.yml b/.github/workflows/multi_arch_build_portable_linux_artifacts.yml index 04da22a090f..6bcf3da548d 100644 --- a/.github/workflows/multi_arch_build_portable_linux_artifacts.yml +++ b/.github/workflows/multi_arch_build_portable_linux_artifacts.yml @@ -53,6 +53,10 @@ on: description: 'Release type: "" for CI, or "dev", "nightly", "prerelease". Controls artifact bucket and IAM role.' type: string default: "" + build_runs_on: + description: "Build runner label (selected by configure script with weighted distribution). Empty falls back to a runner chosen from the cmake preset." + type: string + default: "" repository: description: "Repository to checkout. Defaults to github.repository." type: string @@ -65,7 +69,7 @@ on: jobs: build_stage: name: ${{ inputs.stage_display_name }} - runs-on: ${{ contains(inputs.build_variant_cmake_preset, 'san') && 'azure-linux-scale-rocm-heavy-ramdisk' || 'azure-linux-scale-rocm' }} + runs-on: ${{ inputs.build_runs_on || (contains(inputs.build_variant_cmake_preset, 'san') && 'azure-linux-scale-rocm-heavy-ramdisk' || 'azure-linux-scale-rocm') }} timeout-minutes: ${{ inputs.timeout_minutes }} permissions: id-token: write diff --git a/.github/workflows/multi_arch_build_windows.yml b/.github/workflows/multi_arch_build_windows.yml index a20809c53ea..92163248ab5 100644 --- a/.github/workflows/multi_arch_build_windows.yml +++ b/.github/workflows/multi_arch_build_windows.yml @@ -38,6 +38,10 @@ on: type: string default: "" description: "Comma-separated build stages to skip (artifacts already copied by the orchestrator)" + build_runs_on: + type: string + default: "" + description: "Build runner label (not currently used for Windows, reserved for future use)" rocm_package_version: type: string release_type: diff --git a/.github/workflows/multi_arch_ci_linux.yml 
b/.github/workflows/multi_arch_ci_linux.yml index eaecc4cb8e7..12198b6a4e1 100644 --- a/.github/workflows/multi_arch_ci_linux.yml +++ b/.github/workflows/multi_arch_ci_linux.yml @@ -78,6 +78,7 @@ jobs: build_variant_suffix: ${{ fromJSON(inputs.build_config).build_variant_suffix }} expect_failure: ${{ fromJSON(inputs.build_config).expect_failure }} prebuilt_stages: ${{ fromJSON(inputs.build_config).prebuilt_stages }} + build_runs_on: ${{ fromJSON(inputs.build_config).build_runs_on }} rocm_package_version: ${{ inputs.rocm_package_version }} release_type: ${{ inputs.release_type }} permissions: diff --git a/.github/workflows/multi_arch_ci_windows.yml b/.github/workflows/multi_arch_ci_windows.yml index 56cb3505f9e..8910951b5cc 100644 --- a/.github/workflows/multi_arch_ci_windows.yml +++ b/.github/workflows/multi_arch_ci_windows.yml @@ -83,6 +83,7 @@ jobs: build_variant_suffix: ${{ fromJSON(inputs.build_config).build_variant_suffix }} expect_failure: ${{ fromJSON(inputs.build_config).expect_failure }} prebuilt_stages: ${{ fromJSON(inputs.build_config).prebuilt_stages }} + build_runs_on: ${{ fromJSON(inputs.build_config).build_runs_on }} rocm_package_version: ${{ inputs.rocm_package_version }} release_type: ${{ inputs.release_type }} permissions: diff --git a/build_tools/github_actions/amdgpu_family_matrix.py b/build_tools/github_actions/amdgpu_family_matrix.py index 448c9a6998f..0ac01d18e1d 100644 --- a/build_tools/github_actions/amdgpu_family_matrix.py +++ b/build_tools/github_actions/amdgpu_family_matrix.py @@ -45,6 +45,47 @@ def select_weighted_label(labels_config: list[dict], context_name: str) -> str: return selected["label"] +# Build runner configuration for Linux builds +# Uses weighted distribution: 90% Azure, 10% AWS +# Sanitizer builds (asan/tsan) use ramdisk variants (100% Azure, no AWS yet) +BUILD_RUNNER_LABELS = { + "linux": { + "default": [ + {"label": "azure-linux-scale-rocm", "weight": 0.90}, + {"label": "aws-linux-scale-rocm", "weight": 0.10}, + ], + 
"sanitizer": [ + {"label": "azure-linux-scale-rocm-heavy-ramdisk", "weight": 1.0}, + ], + }, + "windows": { + "default": [ + {"label": "azure-windows-scale-rocm", "weight": 1.0}, + ], + }, +} + + +def select_build_runner(platform: str, build_variant: str) -> str: + """Select a build runner label based on platform and build variant.""" + if platform not in BUILD_RUNNER_LABELS: + # Platform not configured for weighted selection, return default + print(f" No build runner config for platform {platform}, using default") + return "" + + platform_config = BUILD_RUNNER_LABELS[platform] + + # Use sanitizer runners for asan/tsan builds + if "san" in build_variant: + labels_config = platform_config.get("sanitizer", platform_config["default"]) + context_name = f"build-runner ({platform}, {build_variant})" + else: + labels_config = platform_config["default"] + context_name = f"build-runner ({platform})" + + return select_weighted_label(labels_config, context_name) + + all_build_variants = { "linux": { "release": { diff --git a/build_tools/github_actions/configure_ci.py b/build_tools/github_actions/configure_ci.py index a30645c913b..5c5752d02d9 100755 --- a/build_tools/github_actions/configure_ci.py +++ b/build_tools/github_actions/configure_ci.py @@ -57,6 +57,7 @@ from amdgpu_family_matrix import ( all_build_variants, get_all_families_for_trigger_types, + select_build_runner, select_weighted_label, ) from fetch_test_configurations import test_matrix, functional_matrix @@ -453,6 +454,12 @@ def matrix_generator( ): matrix_row["test-runs-on"] = matrix_row["test-runs-on-sandbox"] + # Select build runner using weighted distribution (90% Azure, 10% AWS) + # Sanitizer builds use ramdisk variants + matrix_row["build-runs-on"] = select_build_runner( + platform, base_args.get("build_variant", "release") + ) + matrix_output.append(matrix_row) print(f"Generated build matrix: {str(matrix_output)}") diff --git a/build_tools/github_actions/configure_multi_arch_ci.py 
b/build_tools/github_actions/configure_multi_arch_ci.py index 5e58c00dc45..f5be863e73f 100755 --- a/build_tools/github_actions/configure_multi_arch_ci.py +++ b/build_tools/github_actions/configure_multi_arch_ci.py @@ -58,6 +58,7 @@ from amdgpu_family_matrix import ( all_build_variants, get_all_families_for_trigger_types, + select_build_runner, select_weighted_label, ) from configure_ci_path_filters import ( @@ -417,6 +418,8 @@ class BuildConfig: build_variant_cmake_preset: str expect_failure: bool build_pytorch: bool + # Build runner label for this platform/variant combination + build_runs_on: str = "" # Prebuilt stage configuration — set by configure() from JobDecisions. prebuilt_stages: list[str] = field(default_factory=list) baseline_run_id: str = "" @@ -885,6 +888,9 @@ def _expand_build_config_for_platform( expect_pytorch_failure = variant_config.get("expect_pytorch_failure", False) suffix = variant_config.get("build_variant_suffix", "") + # Select build runner using weighted distribution + build_runs_on = select_build_runner(platform, ci_inputs.build_variant) + return BuildConfig( per_family_info=per_family_info, dist_amdgpu_families=";".join(family_names), @@ -894,6 +900,7 @@ def _expand_build_config_for_platform( build_variant_cmake_preset=variant_config["build_variant_cmake_preset"], expect_failure=expect_failure, build_pytorch=not expect_failure and not expect_pytorch_failure, + build_runs_on=build_runs_on, prebuilt_stages=prebuilt_stages or [], baseline_run_id=baseline_run_id, ) diff --git a/build_tools/github_actions/tests/configure_multi_arch_ci_test.py b/build_tools/github_actions/tests/configure_multi_arch_ci_test.py index a4543e06335..02969a475da 100644 --- a/build_tools/github_actions/tests/configure_multi_arch_ci_test.py +++ b/build_tools/github_actions/tests/configure_multi_arch_ci_test.py @@ -1134,5 +1134,50 @@ def test_families_without_multi_label_use_primary_only(self): self.assertEqual(gfx103x_info["test-runs-on"], "linux-gfx1030-gpu-rocm") +# 
--------------------------------------------------------------------------- +# Build runner selection +# --------------------------------------------------------------------------- + + +class TestBuildRunnerSelection(unittest.TestCase): + """Test weighted random selection of build runners (Azure vs AWS).""" + + def test_select_build_runner_weighted_selection(self): + """Test weighted selection: Azure (90%) vs AWS (10%) for default builds.""" + from amdgpu_family_matrix import select_build_runner + + # Random < 0.9 should select Azure + with patch("random.random", return_value=0.5): + self.assertEqual( + select_build_runner("linux", "release"), "azure-linux-scale-rocm" + ) + + # Random >= 0.9 should select AWS + with patch("random.random", return_value=0.95): + self.assertEqual( + select_build_runner("linux", "release"), "aws-linux-scale-rocm" + ) + + # Windows has a single configured runner, so it is selected for any roll + with patch("random.random", return_value=0.95): + self.assertEqual( + select_build_runner("windows", "release"), "azure-windows-scale-rocm" + ) + + def test_select_build_runner_sanitizer_uses_ramdisk(self): + """Sanitizer builds (asan/tsan) should always use Azure ramdisk runner.""" + from amdgpu_family_matrix import select_build_runner + + with patch("random.random", return_value=0.99): + self.assertEqual( + select_build_runner("linux", "asan"), + "azure-linux-scale-rocm-heavy-ramdisk", + ) + self.assertEqual( + select_build_runner("linux", "tsan"), + "azure-linux-scale-rocm-heavy-ramdisk", + ) + + if __name__ == "__main__": unittest.main()