Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/workflows/multi_arch_build_portable_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ on:
type: string
default: ""
description: "Comma-separated build stages to skip (artifacts already copied by the orchestrator)"
build_runs_on:
type: string
default: "azure-linux-scale-rocm"
description: "Build runner label (selected by configure script with weighted distribution)"
release_type:
description: 'Release type: "" for CI, or "dev", "nightly", "prerelease".'
type: string
Expand Down Expand Up @@ -74,6 +78,7 @@ jobs:
dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }}
rocm_package_version: ${{ inputs.rocm_package_version }}
build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }}
build_runs_on: ${{ inputs.build_runs_on }}
release_type: ${{ inputs.release_type }}
repository: ${{ inputs.repository }}
ref: ${{ inputs.ref }}
Expand All @@ -96,6 +101,7 @@ jobs:
dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }}
rocm_package_version: ${{ inputs.rocm_package_version }}
build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }}
build_runs_on: ${{ inputs.build_runs_on }}
release_type: ${{ inputs.release_type }}
repository: ${{ inputs.repository }}
ref: ${{ inputs.ref }}
Expand Down Expand Up @@ -128,6 +134,7 @@ jobs:
dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }}
rocm_package_version: ${{ inputs.rocm_package_version }}
build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }}
build_runs_on: ${{ inputs.build_runs_on }}
release_type: ${{ inputs.release_type }}
repository: ${{ inputs.repository }}
ref: ${{ inputs.ref }}
Expand All @@ -154,6 +161,7 @@ jobs:
dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }}
rocm_package_version: ${{ inputs.rocm_package_version }}
build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }}
build_runs_on: ${{ inputs.build_runs_on }}
release_type: ${{ inputs.release_type }}
repository: ${{ inputs.repository }}
ref: ${{ inputs.ref }}
Expand All @@ -176,6 +184,7 @@ jobs:
dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }}
rocm_package_version: ${{ inputs.rocm_package_version }}
build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }}
build_runs_on: ${{ inputs.build_runs_on }}
release_type: ${{ inputs.release_type }}
repository: ${{ inputs.repository }}
ref: ${{ inputs.ref }}
Expand All @@ -198,6 +207,7 @@ jobs:
dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }}
rocm_package_version: ${{ inputs.rocm_package_version }}
build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }}
build_runs_on: ${{ inputs.build_runs_on }}
release_type: ${{ inputs.release_type }}
repository: ${{ inputs.repository }}
ref: ${{ inputs.ref }}
Expand All @@ -220,6 +230,7 @@ jobs:
dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }}
rocm_package_version: ${{ inputs.rocm_package_version }}
build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }}
build_runs_on: ${{ inputs.build_runs_on }}
release_type: ${{ inputs.release_type }}
repository: ${{ inputs.repository }}
ref: ${{ inputs.ref }}
Expand All @@ -242,6 +253,7 @@ jobs:
dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }}
rocm_package_version: ${{ inputs.rocm_package_version }}
build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }}
build_runs_on: ${{ inputs.build_runs_on }}
release_type: ${{ inputs.release_type }}
repository: ${{ inputs.repository }}
ref: ${{ inputs.ref }}
Expand Down Expand Up @@ -271,6 +283,7 @@ jobs:
dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }}
rocm_package_version: ${{ inputs.rocm_package_version }}
build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }}
build_runs_on: ${{ inputs.build_runs_on }}
release_type: ${{ inputs.release_type }}
repository: ${{ inputs.repository }}
ref: ${{ inputs.ref }}
Expand All @@ -293,6 +306,7 @@ jobs:
dist_amdgpu_families: ${{ inputs.dist_amdgpu_families }}
rocm_package_version: ${{ inputs.rocm_package_version }}
build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }}
build_runs_on: ${{ inputs.build_runs_on }}
release_type: ${{ inputs.release_type }}
repository: ${{ inputs.repository }}
ref: ${{ inputs.ref }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ on:
description: 'Release type: "" for CI, or "dev", "nightly", "prerelease". Controls artifact bucket and IAM role.'
type: string
default: ""
build_runs_on:
description: "Build runner label (selected by configure script with weighted distribution)"
type: string
default: "azure-linux-scale-rocm"
repository:
description: "Repository to checkout. Defaults to github.repository."
type: string
Expand All @@ -65,7 +69,7 @@ on:
jobs:
build_stage:
name: ${{ inputs.stage_display_name }}
runs-on: ${{ contains(inputs.build_variant_cmake_preset, 'san') && 'azure-linux-scale-rocm-heavy-ramdisk' || 'azure-linux-scale-rocm' }}
runs-on: ${{ inputs.build_runs_on }}
timeout-minutes: ${{ inputs.timeout_minutes }}
permissions:
id-token: write
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/multi_arch_build_windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ on:
type: string
default: ""
description: "Comma-separated build stages to skip (artifacts already copied by the orchestrator)"
build_runs_on:
type: string
default: ""
description: "Build runner label (not currently used for Windows, reserved for future use)"
rocm_package_version:
type: string
release_type:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/multi_arch_ci_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ jobs:
build_variant_suffix: ${{ fromJSON(inputs.build_config).build_variant_suffix }}
expect_failure: ${{ fromJSON(inputs.build_config).expect_failure }}
prebuilt_stages: ${{ fromJSON(inputs.build_config).prebuilt_stages }}
build_runs_on: ${{ fromJSON(inputs.build_config).build_runs_on }}
rocm_package_version: ${{ inputs.rocm_package_version }}
release_type: ${{ inputs.release_type }}
permissions:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/multi_arch_ci_windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ jobs:
build_variant_suffix: ${{ fromJSON(inputs.build_config).build_variant_suffix }}
expect_failure: ${{ fromJSON(inputs.build_config).expect_failure }}
prebuilt_stages: ${{ fromJSON(inputs.build_config).prebuilt_stages }}
build_runs_on: ${{ fromJSON(inputs.build_config).build_runs_on }}
rocm_package_version: ${{ inputs.rocm_package_version }}
release_type: ${{ inputs.release_type }}
permissions:
Expand Down
41 changes: 41 additions & 0 deletions build_tools/github_actions/amdgpu_family_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,47 @@ def select_weighted_label(labels_config: list[dict], context_name: str) -> str:
return selected["label"]


# Build runner configuration, keyed first by platform and then by build
# category ("default" or "sanitizer"). Each category is a list of
# {"label": ..., "weight": ...} entries that select_build_runner() passes to
# select_weighted_label().
#
# - linux/default:   weighted split, 0.90 Azure and 0.10 AWS.
# - linux/sanitizer: asan/tsan builds use the Azure heavy-ramdisk runner only
#                    (100% Azure, no AWS yet).
# - windows/default: a single Azure label (weight 1.0). No "sanitizer" entry is
#                    defined, so select_build_runner() falls back to "default".
BUILD_RUNNER_LABELS = {
    "linux": {
        "default": [
            {"label": "azure-linux-scale-rocm", "weight": 0.90},
            {"label": "aws-linux-scale-rocm", "weight": 0.10},
        ],
        "sanitizer": [
            {"label": "azure-linux-scale-rocm-heavy-ramdisk", "weight": 1.0},
        ],
    },
    "windows": {
        "default": [
            {"label": "azure-windows-scale-rocm", "weight": 1.0},
        ],
    },
}


def select_build_runner(platform: str, build_variant: str) -> str:
    """Select a build runner label based on platform and build variant.

    Args:
        platform: Key into BUILD_RUNNER_LABELS (e.g. "linux", "windows").
        build_variant: Build variant name. Any variant whose name contains
            "san" (compared case-insensitively, e.g. "asan", "tsan") is
            treated as a sanitizer build.

    Returns:
        The label chosen by select_weighted_label() from the matching
        category, or "" when the platform has no entry in
        BUILD_RUNNER_LABELS.
    """
    if platform not in BUILD_RUNNER_LABELS:
        # Platform not configured for weighted selection: return an empty
        # label and leave the choice of runner to the caller.
        print(f" No build runner config for platform {platform}, using default")
        return ""

    platform_config = BUILD_RUNNER_LABELS[platform]

    # Match case-insensitively so this behaves like the workflow expression it
    # replaces, contains(build_variant_cmake_preset, 'san'), which ignores
    # case in GitHub Actions.
    is_sanitizer = "san" in build_variant.lower()

    if is_sanitizer:
        # Platforms without a dedicated "sanitizer" category fall back to
        # their "default" labels.
        labels_config = platform_config.get("sanitizer", platform_config["default"])
        context_name = f"build-runner ({platform}, {build_variant})"
    else:
        labels_config = platform_config["default"]
        context_name = f"build-runner ({platform})"

    return select_weighted_label(labels_config, context_name)


all_build_variants = {
"linux": {
"release": {
Expand Down
7 changes: 7 additions & 0 deletions build_tools/github_actions/configure_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
from amdgpu_family_matrix import (
all_build_variants,
get_all_families_for_trigger_types,
select_build_runner,
select_weighted_label,
)
from fetch_test_configurations import test_matrix, functional_matrix
Expand Down Expand Up @@ -453,6 +454,12 @@ def matrix_generator(
):
matrix_row["test-runs-on"] = matrix_row["test-runs-on-sandbox"]

# Select build runner using weighted distribution (90% Azure, 10% AWS)
# Sanitizer builds use ramdisk variants
matrix_row["build-runs-on"] = select_build_runner(
platform, base_args.get("build_variant", "release")
)

matrix_output.append(matrix_row)

print(f"Generated build matrix: {str(matrix_output)}")
Expand Down
7 changes: 7 additions & 0 deletions build_tools/github_actions/configure_multi_arch_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
from amdgpu_family_matrix import (
all_build_variants,
get_all_families_for_trigger_types,
select_build_runner,
select_weighted_label,
)
from configure_ci_path_filters import (
Expand Down Expand Up @@ -417,6 +418,8 @@ class BuildConfig:
build_variant_cmake_preset: str
expect_failure: bool
build_pytorch: bool
# Build runner label for this platform/variant combination
build_runs_on: str = ""
# Prebuilt stage configuration — set by configure() from JobDecisions.
prebuilt_stages: list[str] = field(default_factory=list)
baseline_run_id: str = ""
Expand Down Expand Up @@ -885,6 +888,9 @@ def _expand_build_config_for_platform(
expect_pytorch_failure = variant_config.get("expect_pytorch_failure", False)
suffix = variant_config.get("build_variant_suffix", "")

# Select build runner using weighted distribution
build_runs_on = select_build_runner(platform, ci_inputs.build_variant)

return BuildConfig(
per_family_info=per_family_info,
dist_amdgpu_families=";".join(family_names),
Expand All @@ -894,6 +900,7 @@ def _expand_build_config_for_platform(
build_variant_cmake_preset=variant_config["build_variant_cmake_preset"],
expect_failure=expect_failure,
build_pytorch=not expect_failure and not expect_pytorch_failure,
build_runs_on=build_runs_on,
prebuilt_stages=prebuilt_stages or [],
baseline_run_id=baseline_run_id,
)
Expand Down
45 changes: 45 additions & 0 deletions build_tools/github_actions/tests/configure_multi_arch_ci_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1134,5 +1134,50 @@ def test_families_without_multi_label_use_primary_only(self):
self.assertEqual(gfx103x_info["test-runs-on"], "linux-gfx1030-gpu-rocm")


# ---------------------------------------------------------------------------
# Build runner selection
# ---------------------------------------------------------------------------


class TestBuildRunnerSelection(unittest.TestCase):
    """Test select_build_runner() label selection per platform and variant."""

    def test_select_build_runner_weighted_selection(self):
        """Test weighted selection: Azure (90%) vs AWS (10%) for default builds."""
        from amdgpu_family_matrix import select_build_runner

        # Random < 0.9 should select Azure
        with patch("random.random", return_value=0.5):
            self.assertEqual(
                select_build_runner("linux", "release"), "azure-linux-scale-rocm"
            )

        # Random >= 0.9 should select AWS
        with patch("random.random", return_value=0.95):
            self.assertEqual(
                select_build_runner("linux", "release"), "aws-linux-scale-rocm"
            )

    def test_select_build_runner_windows_always_azure(self):
        """Windows has a single configured label, so the draw must not matter."""
        from amdgpu_family_matrix import select_build_runner

        # Kept separate from the Linux split test so a failure here is
        # attributed to the Windows configuration, not the 90/10 weighting.
        for draw in (0.05, 0.95):
            with self.subTest(draw=draw), patch("random.random", return_value=draw):
                self.assertEqual(
                    select_build_runner("windows", "release"),
                    "azure-windows-scale-rocm",
                )

    def test_select_build_runner_sanitizer_uses_ramdisk(self):
        """Sanitizer builds (asan/tsan) should always use Azure ramdisk runner."""
        from amdgpu_family_matrix import select_build_runner

        # A high draw would pick AWS from the default pool; the sanitizer pool
        # has only the ramdisk label, so the result must not change.
        with patch("random.random", return_value=0.99):
            self.assertEqual(
                select_build_runner("linux", "asan"),
                "azure-linux-scale-rocm-heavy-ramdisk",
            )
            self.assertEqual(
                select_build_runner("linux", "tsan"),
                "azure-linux-scale-rocm-heavy-ramdisk",
            )


if __name__ == "__main__":
unittest.main()
Loading