ROCm · geomin12 · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026
diff --git a/.github/workflows/setup.yml b/.github/workflows/setup.yml
@@ -76,6 +76,8 @@ jobs:
       - name: Configuring CI options
         id: configure
         env:
+          # TheRock is the source of truth for runner labels; disable S3 overrides
+          THEROCK_DISABLE_RUNNER_OVERRIDES: "true"
           INPUT_LINUX_AMDGPU_FAMILIES: ${{ github.event.inputs.linux_amdgpu_families }}
           LINUX_TEST_LABELS: ${{ github.event.inputs.linux_test_labels }}
           LINUX_USE_PREBUILT_ARTIFACTS: ${{ github.event.inputs.linux_use_prebuilt_artifacts }}

diff --git a/build_tools/github_actions/amdgpu_family_matrix.py b/build_tools/github_actions/amdgpu_family_matrix.py
@@ -4,6 +4,9 @@
 """
 This AMD GPU Family Matrix is the "source of truth" for GitHub workflows.
 
+Runner labels can be dynamically overridden via S3 without requiring PRs.
+See gpu_runner_s3_config.py for details.
+
 * Each entry determines which families and test runners are available to use
 * Each group determines which entries run by default on workflow triggers
 
@@ -328,7 +331,14 @@ def get_all_families_for_trigger_types(trigger_types):
     """
     Returns a combined family matrix for the specified trigger types.
     trigger_types: list of strings, e.g. ['presubmit', 'postsubmit', 'nightly']
+
+    The returned matrix has S3-based runner overrides applied, allowing
+    dynamic runner configuration without requiring PRs.
     """
+    import copy
+
+    from gpu_runner_s3_config import apply_overrides
+
     result = {}
     matrix_map = {
         "presubmit": amdgpu_family_info_matrix_presubmit,
@@ -339,6 +349,8 @@ def get_all_families_for_trigger_types(trigger_types):
     for trigger_type in trigger_types:
         if trigger_type in matrix_map:
             for family_name, family_config in matrix_map[trigger_type].items():
-                result[family_name] = family_config
+                # Deep copy to avoid mutating original static matrices
+                result[family_name] = copy.deepcopy(family_config)
 
-    return result
+    # Apply S3 overrides for dynamic runner configuration
+    return apply_overrides(result)
diff --git a/build_tools/github_actions/gpu_runner_s3_config.py b/build_tools/github_actions/gpu_runner_s3_config.py
@@ -0,0 +1,141 @@
+# Copyright Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""
+Fetches runner label overrides from S3.
+Falls back gracefully if S3 is unreachable.
+
+This module enables dynamic runner configuration without requiring PRs to TheRock.
+Overrides are stored in a public S3 bucket and fetched at runtime during CI configuration.
+
+Environment variables:
+- THEROCK_RUNNER_OVERRIDE_URL: Custom URL for override file (for testing)
+- THEROCK_DISABLE_RUNNER_OVERRIDES: Set to a value str2bool treats as true (e.g. "true") to skip fetching (for local dev/debugging)
+"""
+
+import copy
+import json
+import os
+from urllib.error import HTTPError, URLError
+from urllib.request import Request, urlopen
+
+from github_actions_utils import str2bool
+
+# Default override location: public HTTPS URL (no auth needed for reads).
+DEFAULT_OVERRIDE_URL = (
+    "https://therock-ci-config.s3.amazonaws.com/therock-runner-config.json"
+)
+
+# Module-level cache (one fetch attempt per process; see reset_cache()).
+_cached_overrides: dict | None = None  # Overrides from a successful fetch.
+_fetch_attempted: bool = False  # True once a fetch was tried, even if it failed.
+
+
+def _get_override_url() -> str:
+    """Return THEROCK_RUNNER_OVERRIDE_URL when set, otherwise DEFAULT_OVERRIDE_URL."""
+    return os.getenv("THEROCK_RUNNER_OVERRIDE_URL", default=DEFAULT_OVERRIDE_URL)
+
+
+def _is_disabled() -> bool:
+    """Return the str2bool() reading of THEROCK_DISABLE_RUNNER_OVERRIDES (default "false")."""
+    return str2bool(os.getenv("THEROCK_DISABLE_RUNNER_OVERRIDES", default="false"))
+
+
+def fetch_overrides() -> dict:
+    """Fetch runner overrides from the override URL, at most once per process.
+
+    The payload is expected to be a JSON object whose "overrides" member maps
+    family keys to per-platform settings, e.g.
+    {"gfx94x": {"linux": {"test-runs-on": "linux-mi325-1gpu-ossci-rocm"}}}.
+
+    Returns:
+        The "overrides" mapping, or an empty dict when overrides are disabled,
+        the fetch or decode fails, or the payload does not have that shape.
+    """
+    global _cached_overrides, _fetch_attempted
+
+    if _is_disabled():
+        return {}
+    if _fetch_attempted:
+        return _cached_overrides or {}
+    _fetch_attempted = True  # A failed attempt is not retried in this process.
+    override_url = _get_override_url()
+    try:
+        req = Request(override_url, headers={"User-Agent": "TheRock-CI"})
+        with urlopen(req, timeout=5) as resp:
+            data = json.loads(resp.read().decode("utf-8"))
+    # OSError covers URLError/HTTPError/TimeoutError; ValueError covers
+    # json.JSONDecodeError and the UnicodeDecodeError of a non-UTF-8 body.
+    except (OSError, ValueError) as e:
+        print(f"Warning: Failed to fetch runner overrides from {override_url}: {e}")
+        return {}
+
+    overrides = data.get("overrides", {}) if isinstance(data, dict) else None
+    if not isinstance(overrides, dict):
+        print(f"Warning: Ignoring malformed runner overrides from {override_url}")
+        return {}
+    _cached_overrides = overrides
+    print(f"Loaded runner overrides from {override_url}")
+    return _cached_overrides
+
+
+def apply_overrides(family_matrix: dict) -> dict:
+    """Return family_matrix with the fetched runner overrides merged in.
+
+    Overrides are merged key-by-key into platform configs that already exist
+    in family_matrix; unknown families/platforms and non-dict entries are
+    skipped. The input is never mutated: it is returned unchanged when there
+    is nothing usable to apply, otherwise a deep copy is modified and returned.
+    """
+    overrides = fetch_overrides()
+
+    # A malformed payload (e.g. "overrides" holding a list) must not crash CI.
+    if not overrides or not isinstance(overrides, dict):
+        return family_matrix
+
+    result = copy.deepcopy(family_matrix)
+
+    for family_key, family_overrides in overrides.items():
+        target = result.get(family_key)
+        if not isinstance(target, dict) or not isinstance(family_overrides, dict):
+            continue
+
+        for platform, platform_overrides in family_overrides.items():
+            current = target.get(platform)
+            if isinstance(current, dict) and isinstance(platform_overrides, dict):
+                # Sparse merge: only the keys present in the override change.
+                current.update(platform_overrides)
+
+    return result
+
+
+def reset_cache() -> None:
+    """Clear the cached overrides and the fetch-attempted flag (for tests)."""
+    global _cached_overrides, _fetch_attempted
+    _fetch_attempted = False
+    _cached_overrides = None
+
+
+def generate_overrides_json() -> str:
+    """Serialize the static family matrices as an {"overrides": ...} JSON document.
+
+    Matrices are merged in presubmit, postsubmit, nightly order, so a later
+    matrix replaces an earlier one's config for the same family and platform.
+    """
+    from amdgpu_family_matrix import (
+        amdgpu_family_info_matrix_nightly,
+        amdgpu_family_info_matrix_postsubmit,
+        amdgpu_family_info_matrix_presubmit,
+    )
+
+    merged = {}
+    for source_matrix in (
+        amdgpu_family_info_matrix_presubmit,
+        amdgpu_family_info_matrix_postsubmit,
+        amdgpu_family_info_matrix_nightly,
+    ):
+        for family, platform_configs in source_matrix.items():
+            family_entry = merged.setdefault(family, {})
+            for platform_name, platform_config in platform_configs.items():
+                family_entry[platform_name] = dict(platform_config)
+    return json.dumps({"overrides": merged}, indent=2, sort_keys=True)
diff --git a/build_tools/github_actions/tests/gpu_runner_s3_config_test.py b/build_tools/github_actions/tests/gpu_runner_s3_config_test.py
@@ -0,0 +1,85 @@
+# Copyright Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+"""Unit tests for gpu_runner_s3_config.py."""
+
+import json
+import os
+from pathlib import Path
+import sys
+import unittest
+from unittest.mock import MagicMock, patch
+
+sys.path.insert(0, os.fspath(Path(__file__).parent.parent))
+import gpu_runner_s3_config
+
+
+class TestRunnerOverrides(unittest.TestCase):
+    """Tests for gpu_runner_s3_config; each test runs with a clean cache and env."""
+
+    def setUp(self):
+        gpu_runner_s3_config.reset_cache()
+        os.environ.pop("THEROCK_RUNNER_OVERRIDE_URL", None)
+        os.environ.pop("THEROCK_DISABLE_RUNNER_OVERRIDES", None)
+
+    def tearDown(self):
+        gpu_runner_s3_config.reset_cache()
+        os.environ.pop("THEROCK_RUNNER_OVERRIDE_URL", None)
+        os.environ.pop("THEROCK_DISABLE_RUNNER_OVERRIDES", None)
+
+    def _mock_urlopen(self, mock, data):
+        """Make the urlopen mock return a context manager yielding data as JSON."""
+        resp = MagicMock()
+        resp.read.return_value = json.dumps(data).encode("utf-8")
+        resp.__enter__ = MagicMock(return_value=resp)
+        resp.__exit__ = MagicMock(return_value=False)
+        mock.return_value = resp
+
+    @patch("gpu_runner_s3_config.urlopen")
+    def test_fetch_errors_return_empty(self, mock_urlopen):
+        """Test that a network error from urlopen yields an empty dict."""
+        from urllib.error import URLError
+
+        mock_urlopen.side_effect = URLError("Connection refused")
+        self.assertEqual(gpu_runner_s3_config.fetch_overrides(), {})
+
+    def test_fetch_disabled(self):
+        """Test fetch skipped when disabled."""
+        os.environ["THEROCK_DISABLE_RUNNER_OVERRIDES"] = "true"
+        with patch("gpu_runner_s3_config.urlopen") as mock:
+            self.assertEqual(gpu_runner_s3_config.fetch_overrides(), {})
+            mock.assert_not_called()
+
+    @patch("gpu_runner_s3_config.fetch_overrides")
+    def test_apply_sparse_merge(self, mock_fetch):
+        """Test overrides are sparsely merged without mutating original."""
+        mock_fetch.return_value = {"gfx94x": {"linux": {"test-runs-on": "new-runner"}}}
+        original = {
+            "gfx94x": {
+                "linux": {"test-runs-on": "old-runner", "family": "gfx94X-dcgpu"}
+            }
+        }
+
+        result = gpu_runner_s3_config.apply_overrides(original)
+
+        # Override applied
+        self.assertEqual(result["gfx94x"]["linux"]["test-runs-on"], "new-runner")
+        # Other fields preserved
+        self.assertEqual(result["gfx94x"]["linux"]["family"], "gfx94X-dcgpu")
+        # Original unchanged
+        self.assertEqual(original["gfx94x"]["linux"]["test-runs-on"], "old-runner")
+
+    @patch("gpu_runner_s3_config.fetch_overrides")
+    def test_apply_ignores_unknown_families(self, mock_fetch):
+        """Test that overrides for families absent from the matrix are ignored."""
+        mock_fetch.return_value = {"unknown": {"linux": {"test-runs-on": "x"}}}
+        original = {"gfx94x": {"linux": {"test-runs-on": "runner"}}}
+
+        result = gpu_runner_s3_config.apply_overrides(original)
+
+        self.assertNotIn("unknown", result)
+        self.assertEqual(result["gfx94x"]["linux"]["test-runs-on"], "runner")
+
+
+if __name__ == "__main__":
+    unittest.main()