Skip to content
Closed
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
5e38611
Add validation tests for `amd-smi` CLI output
Mar 6, 2026
ae69d07
Merge branch 'main' of https://github.com/ROCm/TheRock into users/hri…
Mar 6, 2026
7edbb8d
Refactor amd-smi path resolution and update function signatures for c…
Mar 6, 2026
bac9e22
Add logging for amd-smi command execution and output
Mar 6, 2026
e7bebb2
Add parameterized test cases for `amd-smi` output modes
Mar 6, 2026
9c80073
Update logging level for amd-smi command execution in test_amdsmi_cli.py
Mar 6, 2026
30c2a1d
command enhance
Mar 6, 2026
7a9f5db
Refactor assertions for clarity and consistency in test_amdsmi_cli.py
Mar 6, 2026
1588c09
Merge branch 'main' of https://github.com/ROCm/TheRock into users/hri…
Mar 9, 2026
3f23ba4
Update fetch_test_configurations.py to use --base-only for amdsmi_cli…
Mar 11, 2026
daca4b5
Merge branch 'main' of https://github.com/ROCm/TheRock into users/hri…
Mar 11, 2026
b002cc5
Merge branch 'main' of https://github.com/ROCm/TheRock into users/hri…
Mar 11, 2026
443bca2
Add AMDSMI tests and update sanity check workflow
Mar 13, 2026
4775e83
Refactor error handling in JSON and CSV parsing to improve readability
Mar 13, 2026
58c355e
Refactor code for improved readability in conftest.py and test_rocm_s…
Mar 13, 2026
e45209f
Add missing newline in test_rocm_sanity.py and fix marker formatting …
Mar 13, 2026
8ccca35
Fix formatting of markers in conftest.py
Mar 13, 2026
31f476b
Remove timeout parameter from pytest commands in test_sanity_check.yml
Mar 13, 2026
f39a95e
Add timeout parameter to pytest commands in test_sanity_check.yml
Mar 13, 2026
c65ba70
Merge origin/main: keep local test_sanity_check.yml (preserve sanity …
Mar 13, 2026
f2eec65
Refactor amd-smi CLI tests
Mar 13, 2026
8bfba37
clean up
Mar 13, 2026
69cc284
Refactor code for improved readability and consistency in test scripts
Mar 13, 2026
053d2bb
Enhance logging in _run_amd_smi function to include return code and o…
Mar 13, 2026
d3aa5f6
Remove amdsmi_cli tests and related configurations; refactor amd-smi …
Mar 14, 2026
3a678d0
Merge branch 'main' of https://github.com/ROCm/TheRock into users/hri…
Mar 14, 2026
a794b26
Refactor test_sanity.py by removing the _run_pytest function and dire…
Mar 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
208 changes: 208 additions & 0 deletions tests/test_rocm_sanity.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT

import csv
import json
from pathlib import Path
from pytest_check import check
import logging
Expand All @@ -25,6 +27,117 @@
from github_actions_utils import is_asan


def _amd_smi_path() -> Path:
    """Locate the ``amd-smi`` executable under ``THEROCK_BIN_DIR``.

    Returns:
        The resolved path ``<THEROCK_BIN_DIR>/amd-smi``.

    The calling test is failed through ``pytest.fail`` when the environment
    variable is unset or empty, when the file does not exist, or when it is
    not executable by the current process.
    """
    bin_dir = os.getenv("THEROCK_BIN_DIR")
    if not bin_dir:
        pytest.fail("THEROCK_BIN_DIR not set; failing amd-smi CLI tests")

    # Expand "~" and normalise before appending the binary name.
    resolved_bin_dir = Path(bin_dir).expanduser().resolve()
    candidate = resolved_bin_dir / "amd-smi"

    if not candidate.exists():
        pytest.fail(f"amd-smi not found at {candidate}")
    if not os.access(candidate, os.X_OK):
        pytest.fail(f"amd-smi is not executable: {candidate}")
    return candidate


def _run_amd_smi(subcommands: list[str]) -> tuple[int, str, str]:
    """Run ``amd-smi`` with the given arguments and log what happened.

    Args:
        subcommands: Arguments appended after the ``amd-smi`` executable path.

    Returns:
        A ``(returncode, stdout, stderr)`` tuple; both streams are captured
        as text.
    """
    cmd = [str(_amd_smi_path()), *subcommands]
    logger.info("Running amd-smi: %s", cmd)

    proc = subprocess.run(cmd, capture_output=True, text=True)
    logger.info("amd-smi returncode=%s", proc.returncode)

    succeeded = proc.returncode == 0

    # stdout is informational on success and part of the error report on failure.
    if proc.stdout:
        log_stdout = logger.info if succeeded else logger.error
        log_stdout("amd-smi stdout:\n%s", proc.stdout)

    # Anything on stderr is logged as an error; on success it is also flagged
    # as unexpected.
    if proc.stderr:
        if succeeded:
            logger.error("amd-smi stderr (unexpected on success):\n%s", proc.stderr)
        else:
            logger.error("amd-smi stderr:\n%s", proc.stderr)

    return proc.returncode, proc.stdout, proc.stderr


def _parse_gpu_blocks(text_output: str) -> list[str]:
    """Split human-readable output into one text block per GPU.

    A block starts at any line containing ``GPU: <n>`` or ``GPU <n>:`` and
    runs until the line before the next such header (or the end of the text).
    Lines that appear before the first header are discarded.

    Args:
        text_output: Raw text to split.

    Returns:
        The blocks in order of appearance, each with its lines joined by
        ``"\\n"``; an empty list when no header line is found.
    """
    header_patterns = (r"GPU:\s+(\d+)", r"GPU\s+(\d+):")
    grouped_lines: list[list[str]] = []

    for line in text_output.splitlines():
        if any(re.search(pattern, line) for pattern in header_patterns):
            # A header always opens a fresh block.
            grouped_lines.append([line])
        elif grouped_lines:
            # Body line: belongs to the most recently opened block.
            grouped_lines[-1].append(line)

    return ["\n".join(lines) for lines in grouped_lines]


def _validate_human_readable_gpu_block(human_readable_gpu_block_text: str) -> list[str]:
    """Report which required fields are absent from one human-readable GPU block.

    ``BDF`` and ``UUID`` must be followed, on the same line, by a non-blank
    value. ``KFD_ID``, ``NODE_ID`` and ``PARTITION_ID`` must be followed, on
    the same line, by at least one digit.

    Args:
        human_readable_gpu_block_text: Text of a single GPU block.

    Returns:
        Names of the missing or empty fields, in the order BDF, UUID, KFD_ID,
        NODE_ID, PARTITION_ID. Empty when every field is present.
    """
    # Only horizontal whitespace may separate the colon from the value.
    # "\s*" would also consume the newline, so an empty "BDF:" line followed
    # by any other line would wrongly count as having a value.
    required_fields = (
        ("BDF", r"\S"),
        ("UUID", r"\S"),
        ("KFD_ID", r"\d+"),
        ("NODE_ID", r"\d+"),
        ("PARTITION_ID", r"\d+"),
    )
    return [
        field_name
        for field_name, value_pattern in required_fields
        if not re.search(
            rf"{field_name}:[ \t]*{value_pattern}", human_readable_gpu_block_text
        )
    ]


def _validate_json(gpu_obj: dict) -> list[str]:
    """Report which required keys are absent or wrongly typed in a JSON GPU entry.

    ``gpu``, ``kfd_id``, ``node_id`` and ``partition_id`` must be integers;
    ``bdf`` and ``uuid`` must be strings. Booleans are rejected for the
    integer fields even though ``bool`` is a subclass of ``int`` in Python,
    so a JSON ``true``/``false`` is not mistaken for an index or id.

    Args:
        gpu_obj: One element of the decoded JSON array. Anything that is not
            a ``dict`` is treated as having every field missing rather than
            raising.

    Returns:
        Names of the missing or mistyped keys, in the order gpu, bdf, uuid,
        kfd_id, node_id, partition_id. Empty when the entry is valid.
    """
    expected_types = (
        ("gpu", int),
        ("bdf", str),
        ("uuid", str),
        ("kfd_id", int),
        ("node_id", int),
        ("partition_id", int),
    )

    # The caller passes arbitrary decoded JSON; a scalar or list element has
    # none of the required keys.
    if not isinstance(gpu_obj, dict):
        return [key for key, _ in expected_types]

    missing_fields: list[str] = []
    for key, expected_type in expected_types:
        # An absent key yields None, which matches neither int nor str.
        value = gpu_obj.get(key)
        if isinstance(value, bool) or not isinstance(value, expected_type):
            missing_fields.append(key)
    return missing_fields


def _validate_csv_row(csv_row: dict) -> list[str]:
    """Report which required columns are absent or invalid in one CSV row.

    ``gpu``, ``kfd_id``, ``node_id`` and ``partition_id`` must be present and
    convertible by ``int()`` to a value that is not negative. ``gpu_bdf`` and
    ``gpu_uuid`` must be present and truthy.

    Args:
        csv_row: Mapping of column name to cell value for a single row.

    Returns:
        Names of the failing columns, in the order gpu, gpu_bdf, gpu_uuid,
        kfd_id, node_id, partition_id. Empty when the row is valid.
    """

    def _lacks_non_negative_int(column: str) -> bool:
        # Any conversion failure (bad text, None, ...) counts as missing.
        try:
            return column not in csv_row or int(csv_row.get(column, "")) < 0
        except Exception:
            return True

    def _lacks_text(column: str) -> bool:
        return not csv_row.get(column)

    column_checks = (
        ("gpu", _lacks_non_negative_int),
        ("gpu_bdf", _lacks_text),
        ("gpu_uuid", _lacks_text),
        ("kfd_id", _lacks_non_negative_int),
        ("node_id", _lacks_non_negative_int),
        ("partition_id", _lacks_non_negative_int),
    )
    return [column for column, is_missing in column_checks if is_missing(column)]


def is_windows():
    """Return True when ``platform.system()`` reports Windows (case-insensitive)."""
    system_name = platform.system()
    return system_name.lower() == "windows"

Expand Down Expand Up @@ -229,3 +342,98 @@ def test_amdsmi_suite(self):
print(f"[amdsmitst-summary] {line}")

check.equal(process.returncode, 0)

@pytest.mark.skipif(is_windows(), reason="amd-smi CLI not supported on Windows")
@pytest.mark.skipif(
    AMDGPU_FAMILIES == "gfx1151", reason="Linux gfx1151 does not support amdsmi yet"
)
@pytest.mark.parametrize(
    "mod_args",
    [
        ([], None),
        (["--json"], None),
        (["--csv"], None),
        (["--file"], "human"),
        (["--json", "--file"], "json"),
        (["--csv", "--file"], "csv"),
    ],
    ids=[
        "human-stdout",
        "json-stdout",
        "csv-stdout",
        "human-file",
        "json-file",
        "csv-file",
    ],
)
def test_amd_smi_list(self, mod_args, tmp_path: Path) -> None:
    """Run ``amd-smi list`` in each output mode and validate every GPU entry.

    ``mod_args`` is a ``(flags, expected_mode)`` pair. When ``flags`` holds
    ``--file`` the output is redirected to a file under ``tmp_path`` and read
    back; otherwise stdout is validated. The content is then checked as JSON,
    CSV or human-readable text.
    """
    flags, declared_mode = mod_args

    # The bare "--file" marker is swapped for "--file <path>" placed last.
    writes_to_file = "--file" in flags
    out_path = tmp_path / "amdsmi_out.txt" if writes_to_file else None
    cli_args = ["list"]
    cli_args += (flag for flag in flags if flag != "--file")
    if writes_to_file:
        cli_args += ["--file", str(out_path)]

    rc, out, err = _run_amd_smi(cli_args)
    assert rc == 0, f"amd-smi failed rc={rc} stderr={err} stdout={out}"

    if out_path is None:
        payload = out
    else:
        assert out.strip() == "", f"Expected no stdout with --file, got: {out}"
        assert out_path.exists(), "Expected output file to be created"
        payload = out_path.read_text(encoding="utf-8", errors="replace")

    # An explicitly declared mode wins; otherwise infer it from the flags,
    # with JSON taking precedence over CSV.
    if declared_mode is not None:
        mode = declared_mode
    elif "--json" in flags:
        mode = "json"
    elif "--csv" in flags:
        mode = "csv"
    else:
        mode = "human"

    if mode == "json":
        try:
            entries = json.loads(payload)
        except Exception as e:
            pytest.fail(f"Failed to parse JSON output: {e}\nContent:\n{payload}")
        assert isinstance(entries, list) and entries, "Expected non-empty JSON array"
        for index, entry in enumerate(entries):
            missing = _validate_json(entry)
            assert not missing, f"JSON GPU entry {index} missing fields: {missing}"
        return

    if mode == "csv":
        try:
            rows = list(csv.DictReader(payload.splitlines()))
        except Exception as e:
            pytest.fail(f"Failed to parse CSV output: {e}\nContent:\n{payload}")
        assert rows, "Expected at least one CSV row"
        for index, row in enumerate(rows):
            missing = _validate_csv_row(row)
            assert not missing, f"CSV row {index} missing fields: {missing}"
        return

    # Any other mode is validated as human-readable text.
    blocks = _parse_gpu_blocks(payload)
    assert blocks, "No GPU blocks found in amd-smi human output"
    for index, block_text in enumerate(blocks):
        missing = _validate_human_readable_gpu_block(block_text)
        assert (
            not missing
        ), f"Human-readable GPU block {index} missing fields: {missing}\nBlock:\n{block_text}"
Loading