Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/setup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ jobs:
- name: Configuring CI options
id: configure
env:
# TheRock is the source of truth for runner labels; disable S3 overrides
THEROCK_DISABLE_RUNNER_OVERRIDES: "true"
INPUT_LINUX_AMDGPU_FAMILIES: ${{ github.event.inputs.linux_amdgpu_families }}
LINUX_TEST_LABELS: ${{ github.event.inputs.linux_test_labels }}
LINUX_USE_PREBUILT_ARTIFACTS: ${{ github.event.inputs.linux_use_prebuilt_artifacts }}
Expand Down
16 changes: 14 additions & 2 deletions build_tools/github_actions/amdgpu_family_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
"""
This AMD GPU Family Matrix is the "source of truth" for GitHub workflows.

Runner labels can be dynamically overridden via S3 without requiring PRs.
See runner_overrides.py for details.

* Each entry determines which families and test runners are available to use
* Each group determines which entries run by default on workflow triggers

Expand Down Expand Up @@ -328,7 +331,14 @@ def get_all_families_for_trigger_types(trigger_types):
"""
Returns a combined family matrix for the specified trigger types.
trigger_types: list of strings, e.g. ['presubmit', 'postsubmit', 'nightly']

The returned matrix has S3-based runner overrides applied, allowing
dynamic runner configuration without requiring PRs.
"""
import copy

from gpu_runner_s3_config import apply_overrides

result = {}
matrix_map = {
"presubmit": amdgpu_family_info_matrix_presubmit,
Expand All @@ -339,6 +349,8 @@ def get_all_families_for_trigger_types(trigger_types):
for trigger_type in trigger_types:
if trigger_type in matrix_map:
for family_name, family_config in matrix_map[trigger_type].items():
result[family_name] = family_config
# Deep copy to avoid mutating original static matrices
result[family_name] = copy.deepcopy(family_config)

return result
# Apply S3 overrides for dynamic runner configuration
return apply_overrides(result)
141 changes: 141 additions & 0 deletions build_tools/github_actions/gpu_runner_s3_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# Copyright Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT

"""
Fetches runner label overrides from S3.
Falls back gracefully if S3 is unreachable.

This module enables dynamic runner configuration without requiring PRs to TheRock.
Overrides are stored in a public S3 bucket and fetched at runtime during CI configuration.

Environment variables:
- THEROCK_RUNNER_OVERRIDE_URL: Custom URL for override file (for testing)
- THEROCK_DISABLE_RUNNER_OVERRIDES: Set to "1" to skip fetching (for local dev/debugging)
"""

import copy
import json
import os
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

from github_actions_utils import str2bool

# Public HTTPS URL (no auth needed for reads)
DEFAULT_OVERRIDE_URL = (
"https://therock-ci-config.s3.amazonaws.com/therock-runner-config.json"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this in version control somewhere? It might be hard to track changes, or troubleshoot if things go awry otherwise.

If it were committed and tracked in TheRock repo, couldn't it just be requested from there? As an example, currently the pre/post cleanup processes script is sparsely checked out from TheRock repo in test workflows in rocm-systems and rocm-libraries without being pinned. Is there a particular reason to use S3 buckets?

Copy link
Copy Markdown
Contributor Author

@geomin12 geomin12 Mar 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there can be version control on the bucket. the purpose of this is to:

  • TheRock being source of truth (when we onboard new machines, we test on TheRock first then land, then make update to s3)
  • If we unpinned SHA for this script in monorepos, machines may go wacky as all workflows use these machines immediately (we want TheRock to test first - similar to gfx110X windows, we found issues!)
  • Once machines are tested on TheRock, we can just use this generate_overrides_json function to copy exactly and paste into s3

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the context!

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is the right way. If you trigger a workflow, how do you trace back which therock-runner-config.json version have been pulled? This is most likely not reproducible. An alternative could be a TheRock-config repo which you checkout without pinning, so you always get the latest version but you know what you got as this was logged. You can get the same with extra logging but a git repo feels way more natural.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that's fair as well, I wanted some seamless way to update runner names without much PR friction

However, an extra repo of TheRock-config would end up being the same scenario of TheRock with rocm-libraries/rocm-systems (as we want to test via TheRock first, then propogate to rocm-libraries/systems).

I'm okay with keeping the current format of amdgpu_family_matrix.py with additional PRs in other places, let me know! I can close this PR out

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also note that the data schema used for this config is far from trivial, and has no built in versioning (which you could get from a format like protobuf, flatbuffers, a version field in JSON, etc.). Using a completely unversioned file without any pinning is just asking for forwards/backwards-compatibility issues as the infrastructure code that reads from the file is changed (and it is changing already today - there's a rewrite in https://github.com/ROCm/TheRock/blob/main/build_tools/github_actions/new_amdgpu_family_matrix.py and @HereThereBeDragons has more changes coming).

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the concern is wanting to test in TheRock without propagating immediately to rocm-systems and rocm-libraries, why not create a production branch in TheRock repo, that r-s and r-l can point to for checkout. This way it's controllable in TheRock, yet you could allow whatever gpu matrix setup be the latest in TheRock main branch.

https://github.com/ROCm/rocm-libraries/blob/47d3429cea5a05d25ac54cb2eede19a983e501f2/.github/workflows/therock-test-component.yml#L54-L72

This is how I implemented the ability for rocm-systems and rocm-libraries to always take the latest build_tools from TheRock (and only this folder with sparse checkout) since no ref is specified. The next checkout is the original pinned checkout used for the remainder of the job.

A live example of logs showing the latest ref being checked out if unspecified per @marbre:
https://github.com/ROCm/rocm-libraries/actions/runs/23351169868/job/67941706099#step:2:65
During the step Fetch 'build_tools' from repository

Determining the default branch
  Retrieving the default branch name
  Default branch 'main'
Fetching the repository
  "C:\Program Files\Git\cmd\git.exe" -c protocol.version=2 fetch --no-tags --prune --no-recurse-submodules --filter=blob:none --depth=1 origin +refs/heads/main:refs/remotes/origin/main
  From https://github.com/ROCm/TheRock
   * [new branch]      main       -> origin/main
Determining the checkout info
Setting up sparse checkout
  "C:\Program Files\Git\cmd\git.exe" sparse-checkout set build_tools
Checking out the ref
  "C:\Program Files\Git\cmd\git.exe" checkout --progress --force -B main refs/remotes/origin/main
  Switched to a new branch 'main'
  branch 'main' set up to track 'origin/main'.
"C:\Program Files\Git\cmd\git.exe" log -1 --format=%H
095cfd35cc0c87d277c884757402768d76c7bb3e

Versus the step Checkout Repository

Fetching the repository
  "C:\Program Files\Git\cmd\git.exe" -c protocol.version=2 fetch --no-tags --prune --no-recurse-submodules --depth=1 origin 1408a826dc2c3446a0295c0ef0c71c4d92f0f4f9
  From https://github.com/ROCm/TheRock
   * branch            1408a826dc2c3446a0295c0ef0c71c4d92f0f4f9 -> FETCH_HEAD
Determining the checkout info
"C:\Program Files\Git\cmd\git.exe" sparse-checkout disable
"C:\Program Files\Git\cmd\git.exe" config --local --unset-all extensions.worktreeConfig
Checking out the ref
  "C:\Program Files\Git\cmd\git.exe" checkout --progress --force 1408a826dc2c3446a0295c0ef0c71c4d92f0f4f9
  Warning: you are leaving 1 commit behind, not connected to
  any of your branches:
  
    f9cf9bb [ci] fixing hipsparse gtest filter (#4053)
  
  If you want to keep it by creating a new branch, this may be a good time
  to do so with:
  
   git branch <new-branch-name> f9cf9bb
  
  HEAD is now at 1408a82 [ci] disabling mi325 test runners (#3969)
"C:\Program Files\Git\cmd\git.exe" log -1 --format=%H
1408a826dc2c3446a0295c0ef0c71c4d92f0f4f9

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's brittle too - we can't rename or move that cleanup_processes.ps1 file now without breaking CI in that repository. It's a small file, just copy it into the repository that needs it?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's definitely a limitation as there were moments where I did want to rename the script. I think at the time the trade off was for faster iteration since every repo would be running tests on runners shared by all repos. Would have to consider alternate implementations in the future.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We pin the versions of dependencies precisely so we don't get stuck being unable to update either repository safely like that. It might not break today, but any unpinned dependency like that is just waiting to break in the future.

)

# Module-level cache (one fetch per process)
_cached_overrides: dict | None = None
_fetch_attempted: bool = False


def _get_override_url() -> str:
"""Get the URL for runner overrides, allowing override via environment variable."""
return os.environ.get("THEROCK_RUNNER_OVERRIDE_URL", DEFAULT_OVERRIDE_URL)


def _is_disabled() -> bool:
"""Check if runner overrides are disabled via environment variable."""
return str2bool(os.environ.get("THEROCK_DISABLE_RUNNER_OVERRIDES", "false"))


def fetch_overrides() -> dict:
"""Fetch overrides from S3. Returns empty dict on failure.

Returns:
Dict mapping family keys to platform overrides, e.g.:
{
"gfx94x": {
"linux": {
"test-runs-on": "linux-mi325-1gpu-ossci-rocm",
...
}
}
}
"""
global _cached_overrides, _fetch_attempted

if _is_disabled():
return {}

if _fetch_attempted:
return _cached_overrides or {}

_fetch_attempted = True

override_url = _get_override_url()

try:
req = Request(override_url, headers={"User-Agent": "TheRock-CI"})
with urlopen(req, timeout=5) as resp:
data = json.loads(resp.read().decode("utf-8"))
_cached_overrides = data.get("overrides", {})
print(f"Loaded runner overrides from {override_url}")
return _cached_overrides
except (URLError, HTTPError, json.JSONDecodeError, TimeoutError, OSError) as e:
print(f"Warning: Failed to fetch runner overrides from {override_url}: {e}")
return {}


def apply_overrides(family_matrix: dict) -> dict:
"""Apply S3 overrides to a family matrix."""
overrides = fetch_overrides()

if not overrides:
return family_matrix

# Deep copy to avoid mutating the original matrix
result = copy.deepcopy(family_matrix)

for family_key, family_overrides in overrides.items():
if family_key not in result:
continue

if not isinstance(family_overrides, dict):
continue

for platform, platform_overrides in family_overrides.items():
if platform not in result[family_key]:
continue

if not isinstance(platform_overrides, dict):
continue

# Merge overrides into existing config (sparse merge)
result[family_key][platform].update(platform_overrides)

return result


def reset_cache() -> None:
"""Reset the module-level cache. Useful for testing."""
global _cached_overrides, _fetch_attempted
_cached_overrides = None
_fetch_attempted = False


def generate_overrides_json() -> str:
"""Generate runner-overrides.json content from amdgpu_family_matrix.py."""
from amdgpu_family_matrix import (
amdgpu_family_info_matrix_nightly,
amdgpu_family_info_matrix_postsubmit,
amdgpu_family_info_matrix_presubmit,
)

overrides = {}
matrices = [
amdgpu_family_info_matrix_presubmit,
amdgpu_family_info_matrix_postsubmit,
amdgpu_family_info_matrix_nightly,
]

for matrix in matrices:
for family_key, platforms in matrix.items():
if family_key not in overrides:
overrides[family_key] = {}
for platform, config in platforms.items():
overrides[family_key][platform] = dict(config)

return json.dumps({"overrides": overrides}, indent=2, sort_keys=True)
85 changes: 85 additions & 0 deletions build_tools/github_actions/tests/gpu_runner_s3_config_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Copyright Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT

"""Unit tests for gpu_runner_s3_config.py."""

import json
import os
from pathlib import Path
import sys
import unittest
from unittest.mock import MagicMock, patch

sys.path.insert(0, os.fspath(Path(__file__).parent.parent))
import gpu_runner_s3_config


class TestRunnerOverrides(unittest.TestCase):
"""Tests for gpu_runner_s3_config module."""

def setUp(self):
gpu_runner_s3_config.reset_cache()
os.environ.pop("THEROCK_RUNNER_OVERRIDE_URL", None)
os.environ.pop("THEROCK_DISABLE_gpu_runner_s3_config", None)

def tearDown(self):
gpu_runner_s3_config.reset_cache()
os.environ.pop("THEROCK_RUNNER_OVERRIDE_URL", None)
os.environ.pop("THEROCK_DISABLE_gpu_runner_s3_config", None)

def _mock_urlopen(self, mock, data):
"""Helper to set up urlopen mock with given data."""
resp = MagicMock()
resp.read.return_value = json.dumps(data).encode("utf-8")
resp.__enter__ = MagicMock(return_value=resp)
resp.__exit__ = MagicMock(return_value=False)
mock.return_value = resp

@patch("gpu_runner_s3_config.urlopen")
def test_fetch_errors_return_empty(self, mock_urlopen):
"""Test that network/parse errors return empty dict."""
from urllib.error import URLError

mock_urlopen.side_effect = URLError("Connection refused")
self.assertEqual(gpu_runner_s3_config.fetch_overrides(), {})

def test_fetch_disabled(self):
"""Test fetch skipped when disabled."""
os.environ["THEROCK_DISABLE_RUNNER_OVERRIDES"] = "true"
with patch("gpu_runner_s3_config.urlopen") as mock:
self.assertEqual(gpu_runner_s3_config.fetch_overrides(), {})
mock.assert_not_called()

@patch("gpu_runner_s3_config.fetch_overrides")
def test_apply_sparse_merge(self, mock_fetch):
"""Test overrides are sparsely merged without mutating original."""
mock_fetch.return_value = {"gfx94x": {"linux": {"test-runs-on": "new-runner"}}}
original = {
"gfx94x": {
"linux": {"test-runs-on": "old-runner", "family": "gfx94X-dcgpu"}
}
}

result = gpu_runner_s3_config.apply_overrides(original)

# Override applied
self.assertEqual(result["gfx94x"]["linux"]["test-runs-on"], "new-runner")
# Other fields preserved
self.assertEqual(result["gfx94x"]["linux"]["family"], "gfx94X-dcgpu")
# Original unchanged
self.assertEqual(original["gfx94x"]["linux"]["test-runs-on"], "old-runner")

@patch("gpu_runner_s3_config.fetch_overrides")
def test_apply_ignores_unknown_families(self, mock_fetch):
"""Test unknown families/platforms in overrides are ignored."""
mock_fetch.return_value = {"unknown": {"linux": {"test-runs-on": "x"}}}
original = {"gfx94x": {"linux": {"test-runs-on": "runner"}}}

result = gpu_runner_s3_config.apply_overrides(original)

self.assertNotIn("unknown", result)
self.assertEqual(result["gfx94x"]["linux"]["test-runs-on"], "runner")


if __name__ == "__main__":
unittest.main()
Loading