diff --git a/.github/workflows/setup.yml b/.github/workflows/setup.yml index 1645eb6039f..5c8eecd30f9 100644 --- a/.github/workflows/setup.yml +++ b/.github/workflows/setup.yml @@ -76,6 +76,8 @@ jobs: - name: Configuring CI options id: configure env: + # TheRock is the source of truth for runner labels; disable S3 overrides + THEROCK_DISABLE_RUNNER_OVERRIDES: "true" INPUT_LINUX_AMDGPU_FAMILIES: ${{ github.event.inputs.linux_amdgpu_families }} LINUX_TEST_LABELS: ${{ github.event.inputs.linux_test_labels }} LINUX_USE_PREBUILT_ARTIFACTS: ${{ github.event.inputs.linux_use_prebuilt_artifacts }} diff --git a/build_tools/github_actions/amdgpu_family_matrix.py b/build_tools/github_actions/amdgpu_family_matrix.py index 550ce45903a..f60587796fc 100644 --- a/build_tools/github_actions/amdgpu_family_matrix.py +++ b/build_tools/github_actions/amdgpu_family_matrix.py @@ -4,6 +4,9 @@ """ This AMD GPU Family Matrix is the "source of truth" for GitHub workflows. +Runner labels can be dynamically overridden via S3 without requiring PRs. +See gpu_runner_s3_config.py for details. + * Each entry determines which families and test runners are available to use * Each group determines which entries run by default on workflow triggers @@ -328,7 +331,14 @@ def get_all_families_for_trigger_types(trigger_types): """ Returns a combined family matrix for the specified trigger types. trigger_types: list of strings, e.g. ['presubmit', 'postsubmit', 'nightly'] + + The returned matrix has S3-based runner overrides applied, allowing + dynamic runner configuration without requiring PRs.
""" + import copy + + from gpu_runner_s3_config import apply_overrides + result = {} matrix_map = { "presubmit": amdgpu_family_info_matrix_presubmit, @@ -339,6 +349,8 @@ def get_all_families_for_trigger_types(trigger_types): for trigger_type in trigger_types: if trigger_type in matrix_map: for family_name, family_config in matrix_map[trigger_type].items(): - result[family_name] = family_config + # Deep copy to avoid mutating original static matrices + result[family_name] = copy.deepcopy(family_config) - return result + # Apply S3 overrides for dynamic runner configuration + return apply_overrides(result) diff --git a/build_tools/github_actions/gpu_runner_s3_config.py b/build_tools/github_actions/gpu_runner_s3_config.py new file mode 100644 index 00000000000..bc186bab373 --- /dev/null +++ b/build_tools/github_actions/gpu_runner_s3_config.py @@ -0,0 +1,141 @@ +# Copyright Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + +""" +Fetches runner label overrides from S3. +Falls back gracefully if S3 is unreachable. + +This module enables dynamic runner configuration without requiring PRs to TheRock. +Overrides are stored in a public S3 bucket and fetched at runtime during CI configuration. 
+
+Environment variables:
+- THEROCK_RUNNER_OVERRIDE_URL: Custom URL for override file (for testing)
+- THEROCK_DISABLE_RUNNER_OVERRIDES: Set to "true" to skip fetching (for local dev/debugging)
+"""
+
+import copy
+import json
+import os
+from urllib.error import HTTPError, URLError
+from urllib.request import Request, urlopen
+
+from github_actions_utils import str2bool
+
+# Public HTTPS URL (no auth needed for reads)
+DEFAULT_OVERRIDE_URL = (
+    "https://therock-ci-config.s3.amazonaws.com/therock-runner-config.json"
+)
+
+# Module-level cache (one fetch per process)
+_cached_overrides: dict | None = None
+_fetch_attempted: bool = False
+
+
+def _get_override_url() -> str:
+    """Get the URL for runner overrides, allowing override via environment variable."""
+    return os.environ.get("THEROCK_RUNNER_OVERRIDE_URL", DEFAULT_OVERRIDE_URL)
+
+
+def _is_disabled() -> bool:
+    """Check if runner overrides are disabled via environment variable."""
+    return str2bool(os.environ.get("THEROCK_DISABLE_RUNNER_OVERRIDES", "false"))
+
+
+def fetch_overrides() -> dict:
+    """Fetch overrides from S3. Returns empty dict on failure.
+
+    The first attempt, successful or not, is cached for the rest of the process.
+
+    Returns:
+        Dict mapping family keys to platform overrides, e.g.:
+        {
+            "gfx94x": {
+                "linux": {
+                    "test-runs-on": "linux-mi325-1gpu-ossci-rocm",
+                    ...
+                }
+            }
+        }
+    """
+    global _cached_overrides, _fetch_attempted
+
+    if _is_disabled():
+        return {}
+
+    if _fetch_attempted:
+        return _cached_overrides or {}
+
+    _fetch_attempted = True
+    override_url = _get_override_url()
+
+    try:
+        req = Request(override_url, headers={"User-Agent": "TheRock-CI"})
+        with urlopen(req, timeout=5) as resp:
+            data = json.loads(resp.read().decode("utf-8"))
+        overrides = data.get("overrides", {}) if isinstance(data, dict) else None
+        if not isinstance(overrides, dict):
+            raise ValueError("expected a JSON object with an 'overrides' object")
+    # ValueError covers bad JSON/UTF-8 and the payload-shape check above.
+    except (URLError, HTTPError, TimeoutError, OSError, ValueError) as e:
+        print(f"Warning: Failed to fetch runner overrides from {override_url}: {e}")
+        return {}
+
+    _cached_overrides = overrides
+    print(f"Loaded runner overrides from {override_url}")
+    return _cached_overrides
+
+
+def apply_overrides(family_matrix: dict) -> dict:
+    """Apply S3 overrides to a family matrix.
+
+    Only platform entries already present in family_matrix are updated. Returns
+    family_matrix itself when there are no overrides, else an updated deep copy.
+    """
+    overrides = fetch_overrides()
+    if not overrides:
+        return family_matrix
+
+    # Deep copy to avoid mutating the original matrix
+    result = copy.deepcopy(family_matrix)
+
+    for family_key, family_overrides in overrides.items():
+        if family_key not in result or not isinstance(family_overrides, dict):
+            continue
+
+        for platform, platform_overrides in family_overrides.items():
+            target = result[family_key].get(platform)
+            if isinstance(target, dict) and isinstance(platform_overrides, dict):
+                # Merge overrides into existing config (sparse merge)
+                target.update(platform_overrides)
+
+    return result
+
+
+def reset_cache() -> None:
+    """Reset the module-level cache. Useful for testing."""
+    global _cached_overrides, _fetch_attempted
+    _cached_overrides = None
+    _fetch_attempted = False
+
+
+def generate_overrides_json() -> str:
+    """Generate therock-runner-config.json content from amdgpu_family_matrix.py."""
+    from amdgpu_family_matrix import (
+        amdgpu_family_info_matrix_nightly,
+        amdgpu_family_info_matrix_postsubmit,
+        amdgpu_family_info_matrix_presubmit,
+    )
+
+    overrides = {}
+    matrices = [
+        amdgpu_family_info_matrix_presubmit,
+        amdgpu_family_info_matrix_postsubmit,
+        amdgpu_family_info_matrix_nightly,
+    ]
+    for matrix in matrices:
+        for family_key, platforms in matrix.items():
+            family_entry = overrides.setdefault(family_key, {})
+            for platform, config in platforms.items():
+                family_entry[platform] = dict(config)
+
+    return json.dumps({"overrides": overrides}, indent=2, sort_keys=True)
diff --git a/build_tools/github_actions/tests/gpu_runner_s3_config_test.py b/build_tools/github_actions/tests/gpu_runner_s3_config_test.py
new file mode 100644
index 00000000000..4ba1d773437
--- /dev/null
+++ b/build_tools/github_actions/tests/gpu_runner_s3_config_test.py
@@ -0,0 +1,85 @@
+# Copyright Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Unit tests for gpu_runner_s3_config.py."""
+
+import json
+import os
+from pathlib import Path
+import sys
+import unittest
+from unittest.mock import MagicMock, patch
+
+sys.path.insert(0, os.fspath(Path(__file__).parent.parent))
+import gpu_runner_s3_config
+
+
+class TestRunnerOverrides(unittest.TestCase):
+    """Tests for gpu_runner_s3_config module."""
+
+    def setUp(self):
+        # Snapshot os.environ so every variable a test touches is restored.
+        env_patch = patch.dict(os.environ)
+        env_patch.start()
+        self.addCleanup(env_patch.stop)
+        self.addCleanup(gpu_runner_s3_config.reset_cache)
+        gpu_runner_s3_config.reset_cache()
+        os.environ.pop("THEROCK_RUNNER_OVERRIDE_URL", None)
+        os.environ.pop("THEROCK_DISABLE_RUNNER_OVERRIDES", None)
+
+    def _mock_urlopen(self, mock, data):
+        """Helper to set up urlopen mock with given data."""
+        resp = MagicMock()
+        resp.read.return_value = json.dumps(data).encode("utf-8")
+        resp.__enter__ = MagicMock(return_value=resp)
+        resp.__exit__ = MagicMock(return_value=False)
+        mock.return_value = resp
+
+    @patch("gpu_runner_s3_config.urlopen")
+    def test_fetch_errors_return_empty(self, mock_urlopen):
+        """Test that network/parse errors return empty dict."""
+        from urllib.error import URLError
+
+        mock_urlopen.side_effect = URLError("Connection refused")
+        self.assertEqual(gpu_runner_s3_config.fetch_overrides(), {})
+
+    def test_fetch_disabled(self):
+        """Test fetch skipped when disabled."""
+        os.environ["THEROCK_DISABLE_RUNNER_OVERRIDES"] = "true"
+        with patch("gpu_runner_s3_config.urlopen") as mock:
+            self.assertEqual(gpu_runner_s3_config.fetch_overrides(), {})
+            mock.assert_not_called()
+
+    @patch("gpu_runner_s3_config.fetch_overrides")
+    def test_apply_sparse_merge(self, mock_fetch):
+        """Test overrides are sparsely merged without mutating original."""
+        mock_fetch.return_value = {"gfx94x": {"linux": {"test-runs-on": "new-runner"}}}
+        original = {
+            "gfx94x": {
+                "linux": {"test-runs-on": "old-runner", "family": "gfx94X-dcgpu"}
+            }
+        }
+
+        result = gpu_runner_s3_config.apply_overrides(original)
+
+        # Override applied
+        self.assertEqual(result["gfx94x"]["linux"]["test-runs-on"], "new-runner")
+        # Other fields preserved
+        self.assertEqual(result["gfx94x"]["linux"]["family"], "gfx94X-dcgpu")
+        # Original unchanged
+        self.assertEqual(original["gfx94x"]["linux"]["test-runs-on"], "old-runner")
+
+    @patch("gpu_runner_s3_config.fetch_overrides")
+    def test_apply_ignores_unknown_families(self, mock_fetch):
+        """Test unknown families/platforms in overrides are ignored."""
+        mock_fetch.return_value = {"unknown": {"linux": {"test-runs-on": "x"}}}
+        original = {"gfx94x": {"linux": {"test-runs-on": "runner"}}}
+
+        result = gpu_runner_s3_config.apply_overrides(original)
+
+        self.assertNotIn("unknown", result)
+        self.assertEqual(result["gfx94x"]["linux"]["test-runs-on"], "runner")
+
+
+if __name__ == "__main__":
+    unittest.main()