Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions build_tools/github_actions/amdgpu_family_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@
"""
amdgpu_family_info_matrix dictionary fields:
- test-runs-on: (required) GitHub runner label for this architecture
- test-runs-on-alternate: (optional) Alternate runner label for load balancing across runner pools
- test-runs-on-alternate-weight: (optional) Probability (0.0-1.0) of selecting the alternate runner. Defaults to 0.5 when omitted.
- test-runs-on-multi-gpu: (optional) GitHub runner label for multi-GPU tests for this architecture
- benchmark-runs-on: (optional) GitHub runner label for benchmarks for this architecture
- test-runs-on-kernel: (optional) dict of kernel-specific runner labels, keyed by kernel type (e.g. "oem")
Expand All @@ -68,7 +70,11 @@
amdgpu_family_info_matrix_presubmit = {
"gfx94x": {
"linux": {
# TODO: Remove the alternate label and its weight once we have a dedicated set of machines.
# While mi325 is being brought back up, we use a dual-label configuration to distribute load.
"test-runs-on": "linux-gfx942-1gpu-ossci-rocm",
"test-runs-on-alternate": "linux-gfx942-1gpu-ccs-ossci-rocm",
"test-runs-on-alternate-weight": 0.35, # 35% chance of using alternate
# TODO(#3433): Remove sandbox label once ASAN tests are passing
"test-runs-on-sandbox": "rocm-asan-mi325-sandbox",
"test-runs-on-multi-gpu": "linux-gfx942-8gpu-ossci-rocm",
Expand Down
20 changes: 20 additions & 0 deletions build_tools/github_actions/configure_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
import json
import os
from pathlib import Path
import random
import sys
from typing import Iterable, List, Optional
import string
Expand Down Expand Up @@ -405,6 +406,25 @@ def matrix_generator(
artifact_group += f"-{build_variant_suffix}"
matrix_row["artifact_group"] = artifact_group

# Handle dual-label configuration with weighted random selection.
# Some families (e.g. gfx94x) have multiple runner labels available.
if "test-runs-on-alternate" in platform_info:
alternate_label = platform_info["test-runs-on-alternate"]
alternate_weight = platform_info.get(
"test-runs-on-alternate-weight", 0.5
)
if random.random() < alternate_weight:
matrix_row["test-runs-on"] = alternate_label
print(
f" {target_name}: selected alternate runner (weight={alternate_weight}): "
f"{alternate_label}"
)
else:
print(
f" {target_name}: selected primary runner (weight={1-alternate_weight}): "
f"{matrix_row['test-runs-on']}"
)

# We retrieve labels from both PR and workflow_dispatch to customize the build and test jobs
label_options = []
label_options.extend(get_pr_labels(base_args))
Expand Down
19 changes: 19 additions & 0 deletions build_tools/github_actions/configure_multi_arch_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import enum
import json
import os
import random
from dataclasses import asdict, dataclass, field, fields


Expand Down Expand Up @@ -760,6 +761,24 @@ def _expand_build_config_for_platform(

# Determine test runner label.
test_runs_on = platform_info["test-runs-on"]

# Handle dual-label configuration with weighted random selection.
# Some families (e.g. gfx94x) have multiple runner labels available.
if "test-runs-on-alternate" in platform_info:
alternate_label = platform_info["test-runs-on-alternate"]
alternate_weight = platform_info.get("test-runs-on-alternate-weight", 0.5)
if random.random() < alternate_weight:
test_runs_on = alternate_label
print(
f" {family_name}: selected alternate runner (weight={alternate_weight}): "
f"{test_runs_on}"
)
else:
print(
f" {family_name}: selected primary runner (weight={1-alternate_weight}): "
f"{test_runs_on}"
)

# When a test_runner:<kernel> label is set, use the
# kernel-specific runner if available, otherwise disable testing for
# this family (the default runner may not have the right kernel).
Expand Down
81 changes: 81 additions & 0 deletions build_tools/github_actions/tests/configure_ci_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,87 @@ def test_sandbox_test_runner_with_asan(self):
entry = linux_target_output[0]
self.assertEqual(entry["test-runs-on"], "rocm-asan-mi325-sandbox")

###########################################################################
# Tests for dual-label runner selection

def test_gfx94x_alternate_label_selected_when_random_below_weight(self):
    """When random() < weight, alternate label should be selected.

    ``random.random`` is patched to a fixed 0.1, which is below the 0.35
    alternate weight this test expects for gfx94x on linux, so the matrix
    row must carry the alternate (CCS) runner label.
    """
    base_args = {"build_variant": "release"}
    build_families = {"amdgpu_families": "gfx94X"}

    # Mock random.random() to return 0.1 (< 0.35 weight)
    with patch("random.random", return_value=0.1):
        linux_target_output, _ = configure_ci.matrix_generator(
            is_pull_request=True,
            is_workflow_dispatch=False,
            is_push=False,
            is_schedule=False,
            base_args=base_args,
            families=build_families,
            platform="linux",
        )

    # Find the gfx94X entry
    gfx94x_entries = [
        e for e in linux_target_output if e["family"] == "gfx94X-dcgpu"
    ]
    self.assertEqual(len(gfx94x_entries), 1, "Expected exactly one gfx94X entry")
    entry = gfx94x_entries[0]
    # Should select the alternate CCS label
    self.assertEqual(entry["test-runs-on"], "linux-gfx942-1gpu-ccs-ossci-rocm")

def test_gfx94x_primary_label_selected_when_random_above_weight(self):
    """Keep the primary runner label when the draw is not below the weight."""
    generator_kwargs = dict(
        is_pull_request=True,
        is_workflow_dispatch=False,
        is_push=False,
        is_schedule=False,
        base_args={"build_variant": "release"},
        families={"amdgpu_families": "gfx94X"},
        platform="linux",
    )

    # A fixed draw of 0.5 is >= the 0.35 alternate weight, so no swap happens.
    with patch("random.random", return_value=0.5):
        linux_rows, _ = configure_ci.matrix_generator(**generator_kwargs)

    # Narrow the output down to the single gfx94X row.
    matching_rows = [row for row in linux_rows if row["family"] == "gfx94X-dcgpu"]
    self.assertEqual(len(matching_rows), 1, "Expected exactly one gfx94X entry")
    (row,) = matching_rows

    # The primary label must be left in place.
    self.assertEqual(row["test-runs-on"], "linux-gfx942-1gpu-ossci-rocm")

def test_families_without_alternate_always_use_primary(self):
    """Families with no alternate label keep their primary runner label.

    NOTE(review): the label assertion only runs when a gfx103X entry is
    present in the generated matrix, so this test passes vacuously if none
    is produced -- confirm whether presence should be asserted as well.
    """
    generator_kwargs = dict(
        is_pull_request=False,
        is_workflow_dispatch=False,
        is_push=False,
        is_schedule=True,
        base_args={"build_variant": "release"},
        families={"amdgpu_families": "gfx103X"},
        platform="linux",
    )

    # Generate the matrix several times; random is not patched here, so every
    # pass has to yield the same primary label.
    for _attempt in range(5):
        linux_rows, _ = configure_ci.matrix_generator(**generator_kwargs)

        matching_rows = [
            row for row in linux_rows if row["family"] == "gfx103X-dgpu"
        ]
        # Only the first matching row (if any) is checked.
        for row in matching_rows[:1]:
            self.assertEqual(row["test-runs-on"], "linux-gfx1030-gpu-rocm")


if __name__ == "__main__":
unittest.main()
124 changes: 124 additions & 0 deletions build_tools/github_actions/tests/configure_multi_arch_ci_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -897,5 +897,129 @@ def test_windows_workflow_uses_all_fields(self):
)


# ---------------------------------------------------------------------------
# Dual-label runner selection
# ---------------------------------------------------------------------------


class TestDualLabelRunnerSelection(unittest.TestCase):
"""Test weighted random selection of dual-label runner configurations."""

def test_gfx94x_has_dual_label_config(self):
"""Verify gfx94x has the dual-label configuration."""
from amdgpu_family_matrix import get_all_families_for_trigger_types
Comment on lines +900 to +910
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This small bit of logic doesn't need its own test class, it should just go in TestExpandBuildConfigs. I'm moving it there in #4500


all_families = get_all_families_for_trigger_types(["presubmit"])
self.assertIn("gfx94x", all_families)

gfx94x_linux = all_families["gfx94x"].get("linux", {})
self.assertIn("test-runs-on", gfx94x_linux)
self.assertIn("test-runs-on-alternate", gfx94x_linux)
self.assertIn("test-runs-on-alternate-weight", gfx94x_linux)

# Verify the expected labels
self.assertEqual(gfx94x_linux["test-runs-on"], "linux-gfx942-1gpu-ossci-rocm")
self.assertEqual(
gfx94x_linux["test-runs-on-alternate"],
"linux-gfx942-1gpu-ccs-ossci-rocm",
)

def test_alternate_label_selected_when_random_below_weight(self):
    """When random() < weight, alternate label should be selected.

    ``random.random`` is patched to a fixed 0.1, which is below the 0.35
    alternate weight this test expects for gfx94x on linux.
    """
    ci_inputs = cm.CIInputs(
        run_id="12345",
        event_name="pull_request",
        commit_ref="feature",
        base_ref="HEAD^",
        build_variant="release",
    )
    targets = cm.TargetSelection(linux_families=["gfx94x"])

    # Mock random.random() to return 0.1 (< 0.35 weight)
    with patch("random.random", return_value=0.1):
        builds = cm.expand_build_configs(targets, ci_inputs)

    self.assertIsNotNone(builds.linux)
    # Check that the alternate label was selected
    gfx94x_info = builds.linux.per_family_info[0]
    self.assertEqual(
        gfx94x_info["test-runs-on"], "linux-gfx942-1gpu-ccs-ossci-rocm"
    )

def test_primary_label_selected_when_random_above_weight(self):
    """When random() >= weight, primary label should be selected.

    ``random.random`` is patched to a fixed 0.5, which is not below the 0.35
    alternate weight this test expects for gfx94x on linux.
    """
    ci_inputs = cm.CIInputs(
        run_id="12345",
        event_name="pull_request",
        commit_ref="feature",
        base_ref="HEAD^",
        build_variant="release",
    )
    targets = cm.TargetSelection(linux_families=["gfx94x"])

    # Mock random.random() to return 0.5 (>= 0.35 weight)
    with patch("random.random", return_value=0.5):
        builds = cm.expand_build_configs(targets, ci_inputs)

    self.assertIsNotNone(builds.linux)
    # Check that the primary label was selected
    gfx94x_info = builds.linux.per_family_info[0]
    self.assertEqual(gfx94x_info["test-runs-on"], "linux-gfx942-1gpu-ossci-rocm")

def test_distribution_approximates_weight(self):
    """A fixed draw of 0.5 selects the primary label.

    Despite the name, no distribution is sampled: ``random.random`` is
    patched to a single value, so exactly one deterministic selection is
    checked. 0.5 is not below the 0.35 alternate weight this test expects
    for gfx94x on linux, so the primary label must be kept.
    """
    ci_inputs = cm.CIInputs(
        run_id="12345",
        event_name="pull_request",
        commit_ref="feature",
        base_ref="HEAD^",
        build_variant="release",
    )
    targets = cm.TargetSelection(linux_families=["gfx94x"])

    with patch("random.random", return_value=0.5):
        builds = cm.expand_build_configs(targets, ci_inputs)

    # Fail with a clear message instead of an AttributeError if no linux
    # build config was produced.
    self.assertIsNotNone(builds.linux)
    gfx94x_info = builds.linux.per_family_info[0]
    self.assertEqual(
        gfx94x_info["test-runs-on"], "linux-gfx942-1gpu-ossci-rocm"
    )

def test_distribution_approximates_alternative_weight(self):
    """A fixed draw of 0.2 selects the alternate label.

    Despite the name, no distribution is sampled: ``random.random`` is
    patched to a single value, so exactly one deterministic selection is
    checked. 0.2 is below the 0.35 alternate weight this test expects for
    gfx94x on linux, so the alternate (CCS) label must be chosen.
    """
    ci_inputs = cm.CIInputs(
        run_id="12345",
        event_name="pull_request",
        commit_ref="feature",
        base_ref="HEAD^",
        build_variant="release",
    )
    targets = cm.TargetSelection(linux_families=["gfx94x"])

    with patch("random.random", return_value=0.2):
        builds = cm.expand_build_configs(targets, ci_inputs)

    # Fail with a clear message instead of an AttributeError if no linux
    # build config was produced.
    self.assertIsNotNone(builds.linux)
    gfx94x_info = builds.linux.per_family_info[0]
    self.assertEqual(
        gfx94x_info["test-runs-on"], "linux-gfx942-1gpu-ccs-ossci-rocm"
    )

def test_families_without_alternate_use_primary_only(self):
    """A family with no alternate label resolves to its primary runner label.

    NOTE(review): the label assertion is skipped whenever no linux build or
    no per-family info is produced, so this test can pass vacuously --
    confirm whether presence should be asserted as well.
    """
    ci_inputs = cm.CIInputs(
        run_id="12345",
        event_name="schedule",
        commit_ref="main",
        base_ref="HEAD^1",
        build_variant="release",
    )
    # gfx103x is expected to carry only a primary label (no alternate).
    targets = cm.TargetSelection(linux_families=["gfx103x"])

    # Expand repeatedly; random is not patched here, so every pass has to
    # yield the same label.
    for _attempt in range(10):
        builds = cm.expand_build_configs(targets, ci_inputs)
        if not (builds.linux and builds.linux.per_family_info):
            continue
        first_family = builds.linux.per_family_info[0]
        self.assertEqual(first_family["test-runs-on"], "linux-gfx1030-gpu-rocm")


if __name__ == "__main__":
unittest.main()
Loading