From 5e38611c50e6ab0faf188c7a1821f234ac65ec35 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Fri, 6 Mar 2026 12:13:55 +0530 Subject: [PATCH 01/21] Add validation tests for `amd-smi` CLI output --- .../fetch_test_configurations.py | 11 + .../test_amdsmi_cli.py | 268 ++++++++++++++++++ 2 files changed, 279 insertions(+) create mode 100644 build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py diff --git a/build_tools/github_actions/fetch_test_configurations.py b/build_tools/github_actions/fetch_test_configurations.py index dab223872e6..65e49a27721 100644 --- a/build_tools/github_actions/fetch_test_configurations.py +++ b/build_tools/github_actions/fetch_test_configurations.py @@ -153,6 +153,17 @@ def _get_script_path(script_name: str) -> str: "windows": 1, }, }, + + "amdsmi_cli": { + "job_name": "amdsmi_cli", + "fetch_artifact_args": "--tests", + "timeout_minutes": 15, + "test_script": f"pytest {_get_script_path('test_amdsmi_cli.py')} -s", + "platform": ["linux"], + "total_shards_dict": { + "linux": 1, + }, + }, "hipcub": { "job_name": "hipcub", "fetch_artifact_args": "--prim --tests", diff --git a/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py b/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py new file mode 100644 index 00000000000..518d25562aa --- /dev/null +++ b/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python3 +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + +""" +validation of the `amd-smi` CLI output. + +This test expects `THEROCK_BIN_DIR` to point to the TheRock `bin/` directory +containing the `amd-smi` binary (CI sets this via the setup action). +""" + +import os +import re +import json +import csv +import subprocess +from pathlib import Path + +import pytest + + +def _amd_smi_path() -> Path: + """Return the path to the `amd-smi` binary from `THEROCK_BIN_DIR`. + + Skips the test via pytest if `THEROCK_BIN_DIR` is not set. Asserts that + the expected `amd-smi` binary exists at the resolved path. + + Args: + None + + Returns: + pathlib.Path: Path to the `amd-smi` binary. + """ + th = os.getenv("THEROCK_BIN_DIR") + if not th: + pytest.skip("THEROCK_BIN_DIR not set; skipping amdsmi tests") + p = Path(th) / "amd-smi" + assert p.exists(), f"amd-smi not found at {p}" + return p + + +def _run_amd_smi(amd_smi: Path, args: list[str]) -> tuple[int, str, str]: + """Run `amd-smi list` with the given `args` and return (rc, stdout, stderr). + + The function invokes the binary via subprocess.run and captures text + output for assertions in the tests. + + Args: + amd_smi (pathlib.Path): Path to the `amd-smi` binary. + args (list[str]): Arguments to pass after `amd-smi list`. + + Returns: + tuple[int, str, str]: Return code, stdout text, stderr text. + """ + cmd = [str(amd_smi), "list"] + args + proc = subprocess.run(cmd, capture_output=True, text=True) + return proc.returncode, proc.stdout, proc.stderr + + +def _parse_gpu_blocks(output: str) -> list[str]: + """Parse human-readable `amd-smi list` output into GPU text blocks. + + Returns a list where each element is the multiline block describing a + single GPU. The parser looks for lines that start GPU markers like + "GPU: " or "GPU :" and groups subsequent lines until the next + GPU marker. + + Args: + output (str): The human-readable stdout from `amd-smi list`. + + Returns: + list[str]: List of multiline GPU description blocks. + """ + blocks = [] + current = None + for line in output.splitlines(): + if re.search(r"GPU:\s+(\d+)", line) or re.search(r"GPU\s+(\d+):", line): + if current is not None: + blocks.append("\n".join(current)) + current = [line] + continue + if current is not None: + current.append(line) + if current is not None: + blocks.append("\n".join(current)) + return blocks + + +def _validate_human_block(block_text: str) -> list[str]: + """Validate a single human-readable GPU block. + + Returns a list of missing field names (empty if all required fields + appear). The function checks for BDF, UUID, KFD_ID, NODE_ID and + PARTITION_ID in the block_text. + + Args: + block_text (str): Multiline text block describing a single GPU. + + Returns: + list[str]: Missing field names (empty if validation passes). + """ + missing = [] + if not re.search(r"\s*BDF:\s*.+", block_text): + missing.append("BDF") + if not re.search(r"\s*UUID:\s*.+", block_text): + missing.append("UUID") + if not re.search(r"\s*KFD_ID:\s*\d+", block_text): + missing.append("KFD_ID") + if not re.search(r"\s*NODE_ID:\s*\d+", block_text): + missing.append("NODE_ID") + if not re.search(r"\s*PARTITION_ID:\s*\d+", block_text): + missing.append("PARTITION_ID") + return missing + + +def _validate_json(obj: dict) -> list[str]: + """Validate a JSON GPU entry from `amd-smi --json`. + + Returns a list of missing or incorrectly-typed fields. Expected fields + include `gpu` (int), `bdf` (str), `uuid` (str), `kfd_id` (int), + `node_id` (int) and `partition_id` (int). + + Args: + obj (dict): Parsed JSON object representing a GPU entry. + + Returns: + list[str]: Missing or invalid field names. + """ + missing = [] + # required keys mapping + if "gpu" not in obj or not isinstance(obj.get("gpu"), int): + missing.append("gpu") + if "bdf" not in obj or not isinstance(obj.get("bdf"), str): + missing.append("bdf") + if "uuid" not in obj or not isinstance(obj.get("uuid"), str): + missing.append("uuid") + if "kfd_id" not in obj or not isinstance(obj.get("kfd_id"), int): + missing.append("kfd_id") + if "node_id" not in obj or not isinstance(obj.get("node_id"), int): + missing.append("node_id") + if "partition_id" not in obj or not isinstance(obj.get("partition_id"), int): + missing.append("partition_id") + return missing + + +def _validate_csv_row(row: dict) -> list[str]: + """Validate a CSV row parsed from `amd-smi --csv` output. + + Expected header names are: `gpu,gpu_bdf,gpu_uuid,kfd_id,node_id,partition_id`. + Returns a list of missing or invalid fields. + + Args: + row (dict): Mapping of CSV headers to values as returned by + `csv.DictReader`. + + Returns: + list[str]: Missing or invalid field names. + """ + # expected header names: gpu,gpu_bdf,gpu_uuid,kfd_id,node_id,partition_id + missing = [] + try: + if "gpu" not in row or int(row.get("gpu", "")) < 0: + missing.append("gpu") + except Exception: + missing.append("gpu") + if not row.get("gpu_bdf"): + missing.append("gpu_bdf") + if not row.get("gpu_uuid"): + missing.append("gpu_uuid") + try: + if "kfd_id" not in row or int(row.get("kfd_id", "")) < 0: + missing.append("kfd_id") + except Exception: + missing.append("kfd_id") + try: + if "node_id" not in row or int(row.get("node_id", "")) < 0: + missing.append("node_id") + except Exception: + missing.append("node_id") + try: + if "partition_id" not in row or int(row.get("partition_id", "")) < 0: + missing.append("partition_id") + except Exception: + missing.append("partition_id") + return missing + + +@pytest.mark.parametrize( + "mod_args", + [ + ([], None), # human readable on stdout + (["--json"], None), + (["--csv"], None), + (["--file"], "human"), + (["--json", "--file"], "json"), + (["--csv", "--file"], "csv"), + ], +) +def test_amd_smi_list(mod_args, tmp_path): + """End-to-end test of `amd-smi list` covering output modes. + + The test runs `amd-smi list` with multiple modifier combinations (human, + JSON, CSV, and file-output variants), parses the output and validates + required fields for each GPU entry. + + Args: + mod_args (tuple[list[str], Optional[str]]): Parameterized tuple where + the first element is a list of modifier args and the second + element indicates the expected parsed mode when `--file` is + used. + tmp_path (pathlib.Path): pytest temporary directory fixture. + + Returns: + None + """ + args, expected_mode = mod_args + amd_smi = _amd_smi_path() + + file_path = None + run_args = list(args) + if "--file" in run_args: + # supply output file + file_path = tmp_path / "amdsmi_out.txt" + run_args = [a for a in run_args if a != "--file"] + run_args.extend(["--file", str(file_path)]) + + rc, out, err = _run_amd_smi(amd_smi, run_args) + assert rc == 0, f"amd-smi failed rc={rc} stderr={err} stdout={out}" + + # If file was requested, stdout should be empty + if file_path is not None: + assert out.strip() == "", f"Expected no stdout when using --file, got: {out}" + assert file_path.exists(), "Expected output file to be created" + content = file_path.read_text(encoding="utf-8", errors="replace") + else: + content = out + + # Validate based on mode + if expected_mode == "json" or ("--json" in args and expected_mode is None): + # JSON array expected + try: + data = json.loads(content) + except Exception as e: + pytest.fail(f"Failed to parse JSON output: {e}\nContent:\n{content}") + assert isinstance(data, list) and data, "Expected non-empty JSON array" + for idx, obj in enumerate(data): + missing = _validate_json(obj) + assert not missing, f"JSON GPU entry {idx} missing fields: {missing}" + + elif expected_mode == "csv" or ("--csv" in args and expected_mode is None): + # CSV expected + try: + reader = csv.DictReader(content.splitlines()) + rows = list(reader) + except Exception as e: + pytest.fail(f"Failed to parse CSV output: {e}\nContent:\n{content}") + assert rows, "Expected at least one CSV row" + for idx, row in enumerate(rows): + missing = _validate_csv_row(row) + assert not missing, f"CSV row {idx} missing fields: {missing}" + + else: + # human readable output + blocks = _parse_gpu_blocks(content) + assert blocks, "No GPU blocks found in amd-smi human output" + for idx, block_text in enumerate(blocks): + missing = _validate_human_block(block_text) + assert not missing, f"Human GPU block {idx} missing fields: {missing}\nBlock:\n{block_text}" From 7edbb8d427a5f6ccebbc09328f54f64723283bf0 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Fri, 6 Mar 2026 17:52:20 +0530 Subject: [PATCH 02/21] Refactor amd-smi path resolution and update function signatures for clarity --- .../test_amdsmi_cli.py | 223 +++++++++--------- 1 file changed, 114 insertions(+), 109 deletions(-) diff --git a/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py b/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py index 518d25562aa..ce61861f00c 100644 --- a/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py +++ b/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py @@ -31,33 +31,38 @@ def _amd_smi_path() -> Path: Returns: pathlib.Path: Path to the `amd-smi` binary. """ - th = os.getenv("THEROCK_BIN_DIR") - if not th: + therock_bin_dir_env = os.getenv("THEROCK_BIN_DIR") + if not therock_bin_dir_env: pytest.skip("THEROCK_BIN_DIR not set; skipping amdsmi tests") - p = Path(th) / "amd-smi" - assert p.exists(), f"amd-smi not found at {p}" - return p + # Resolve the path to an absolute canonical path to avoid cwd-dependent + # failures (e.g., if a prior step changes directory). Also check that the + # binary exists and is executable. + amd_smi_bin_path = (Path(therock_bin_dir_env).expanduser().resolve()) / "amd-smi" + assert amd_smi_bin_path.exists(), f"amd-smi not found at {amd_smi_bin_path}" + assert os.access(amd_smi_bin_path, os.X_OK), f"amd-smi is not executable: {amd_smi_bin_path}" + return amd_smi_bin_path -def _run_amd_smi(amd_smi: Path, args: list[str]) -> tuple[int, str, str]: - """Run `amd-smi list` with the given `args` and return (rc, stdout, stderr). + +def _run_amd_smi(amd_smi_path: Path, modifiers: list[str]) -> tuple[int, str, str]: + """Run `amd-smi list` with the given `modifiers` and return (rc, stdout, stderr). The function invokes the binary via subprocess.run and captures text output for assertions in the tests. Args: - amd_smi (pathlib.Path): Path to the `amd-smi` binary. - args (list[str]): Arguments to pass after `amd-smi list`. + amd_smi_path (pathlib.Path): Path to the `amd-smi` binary. + modifiers (list[str]): Arguments to pass after `amd-smi list`. Returns: tuple[int, str, str]: Return code, stdout text, stderr text. """ - cmd = [str(amd_smi), "list"] + args + cmd = [str(amd_smi_path), "list"] + modifiers proc = subprocess.run(cmd, capture_output=True, text=True) return proc.returncode, proc.stdout, proc.stderr -def _parse_gpu_blocks(output: str) -> list[str]: +def _parse_gpu_blocks(text_output: str) -> list[str]: """Parse human-readable `amd-smi list` output into GPU text blocks. Returns a list where each element is the multiline block describing a @@ -71,22 +76,22 @@ def _parse_gpu_blocks(output: str) -> list[str]: Returns: list[str]: List of multiline GPU description blocks. """ - blocks = [] - current = None - for line in output.splitlines(): + gpu_blocks = [] + current_block_lines = None + for line in text_output.splitlines(): if re.search(r"GPU:\s+(\d+)", line) or re.search(r"GPU\s+(\d+):", line): - if current is not None: - blocks.append("\n".join(current)) - current = [line] + if current_block_lines is not None: + gpu_blocks.append("\n".join(current_block_lines)) + current_block_lines = [line] continue - if current is not None: - current.append(line) - if current is not None: - blocks.append("\n".join(current)) - return blocks + if current_block_lines is not None: + current_block_lines.append(line) + if current_block_lines is not None: + gpu_blocks.append("\n".join(current_block_lines)) + return gpu_blocks -def _validate_human_block(block_text: str) -> list[str]: +def _validate_human_readable_gpu_block(human_readable_gpu_block_text: str) -> list[str]: """Validate a single human-readable GPU block. Returns a list of missing field names (empty if all required fields @@ -94,26 +99,26 @@ def _validate_human_block(block_text: str) -> list[str]: PARTITION_ID in the block_text. Args: - block_text (str): Multiline text block describing a single GPU. + human_readable_gpu_block_text (str): Multiline text block describing a single GPU. Returns: list[str]: Missing field names (empty if validation passes). """ - missing = [] - if not re.search(r"\s*BDF:\s*.+", block_text): - missing.append("BDF") - if not re.search(r"\s*UUID:\s*.+", block_text): - missing.append("UUID") - if not re.search(r"\s*KFD_ID:\s*\d+", block_text): - missing.append("KFD_ID") - if not re.search(r"\s*NODE_ID:\s*\d+", block_text): - missing.append("NODE_ID") - if not re.search(r"\s*PARTITION_ID:\s*\d+", block_text): - missing.append("PARTITION_ID") - return missing - - -def _validate_json(obj: dict) -> list[str]: + missing_fields = [] + if not re.search(r"\s*BDF:\s*.+", human_readable_gpu_block_text): + missing_fields.append("BDF") + if not re.search(r"\s*UUID:\s*.+", human_readable_gpu_block_text): + missing_fields.append("UUID") + if not re.search(r"\s*KFD_ID:\s*\d+", human_readable_gpu_block_text): + missing_fields.append("KFD_ID") + if not re.search(r"\s*NODE_ID:\s*\d+", human_readable_gpu_block_text): + missing_fields.append("NODE_ID") + if not re.search(r"\s*PARTITION_ID:\s*\d+", human_readable_gpu_block_text): + missing_fields.append("PARTITION_ID") + return missing_fields + + +def _validate_json(gpu_obj: dict) -> list[str]: """Validate a JSON GPU entry from `amd-smi --json`. Returns a list of missing or incorrectly-typed fields. Expected fields @@ -126,24 +131,24 @@ def _validate_json(obj: dict) -> list[str]: Returns: list[str]: Missing or invalid field names. """ - missing = [] + missing_fields = [] # required keys mapping - if "gpu" not in obj or not isinstance(obj.get("gpu"), int): - missing.append("gpu") - if "bdf" not in obj or not isinstance(obj.get("bdf"), str): - missing.append("bdf") - if "uuid" not in obj or not isinstance(obj.get("uuid"), str): - missing.append("uuid") - if "kfd_id" not in obj or not isinstance(obj.get("kfd_id"), int): - missing.append("kfd_id") - if "node_id" not in obj or not isinstance(obj.get("node_id"), int): - missing.append("node_id") - if "partition_id" not in obj or not isinstance(obj.get("partition_id"), int): - missing.append("partition_id") - return missing - - -def _validate_csv_row(row: dict) -> list[str]: + if "gpu" not in gpu_obj or not isinstance(gpu_obj.get("gpu"), int): + missing_fields.append("gpu") + if "bdf" not in gpu_obj or not isinstance(gpu_obj.get("bdf"), str): + missing_fields.append("bdf") + if "uuid" not in gpu_obj or not isinstance(gpu_obj.get("uuid"), str): + missing_fields.append("uuid") + if "kfd_id" not in gpu_obj or not isinstance(gpu_obj.get("kfd_id"), int): + missing_fields.append("kfd_id") + if "node_id" not in gpu_obj or not isinstance(gpu_obj.get("node_id"), int): + missing_fields.append("node_id") + if "partition_id" not in gpu_obj or not isinstance(gpu_obj.get("partition_id"), int): + missing_fields.append("partition_id") + return missing_fields + + +def _validate_csv_row(csv_row: dict) -> list[str]: """Validate a CSV row parsed from `amd-smi --csv` output. Expected header names are: `gpu,gpu_bdf,gpu_uuid,kfd_id,node_id,partition_id`. @@ -157,32 +162,32 @@ def _validate_csv_row(row: dict) -> list[str]: list[str]: Missing or invalid field names. """ # expected header names: gpu,gpu_bdf,gpu_uuid,kfd_id,node_id,partition_id - missing = [] + missing_fields = [] try: - if "gpu" not in row or int(row.get("gpu", "")) < 0: - missing.append("gpu") + if "gpu" not in csv_row or int(csv_row.get("gpu", "")) < 0: + missing_fields.append("gpu") except Exception: - missing.append("gpu") - if not row.get("gpu_bdf"): - missing.append("gpu_bdf") - if not row.get("gpu_uuid"): - missing.append("gpu_uuid") + missing_fields.append("gpu") + if not csv_row.get("gpu_bdf"): + missing_fields.append("gpu_bdf") + if not csv_row.get("gpu_uuid"): + missing_fields.append("gpu_uuid") try: - if "kfd_id" not in row or int(row.get("kfd_id", "")) < 0: - missing.append("kfd_id") + if "kfd_id" not in csv_row or int(csv_row.get("kfd_id", "")) < 0: + missing_fields.append("kfd_id") except Exception: - missing.append("kfd_id") + missing_fields.append("kfd_id") try: - if "node_id" not in row or int(row.get("node_id", "")) < 0: - missing.append("node_id") + if "node_id" not in csv_row or int(csv_row.get("node_id", "")) < 0: + missing_fields.append("node_id") except Exception: - missing.append("node_id") + missing_fields.append("node_id") try: - if "partition_id" not in row or int(row.get("partition_id", "")) < 0: - missing.append("partition_id") + if "partition_id" not in csv_row or int(csv_row.get("partition_id", "")) < 0: + missing_fields.append("partition_id") except Exception: - missing.append("partition_id") - return missing + missing_fields.append("partition_id") + return missing_fields @pytest.mark.parametrize( @@ -213,56 +218,56 @@ def test_amd_smi_list(mod_args, tmp_path): Returns: None """ - args, expected_mode = mod_args - amd_smi = _amd_smi_path() + modifiers, expected_output_mode = mod_args + amd_smi_bin = _amd_smi_path() - file_path = None - run_args = list(args) - if "--file" in run_args: + output_file_path = None + invocation_args = list(modifiers) + if "--file" in invocation_args: # supply output file - file_path = tmp_path / "amdsmi_out.txt" - run_args = [a for a in run_args if a != "--file"] - run_args.extend(["--file", str(file_path)]) + output_file_path = tmp_path / "amdsmi_out.txt" + invocation_args = [a for a in invocation_args if a != "--file"] + invocation_args.extend(["--file", str(output_file_path)]) - rc, out, err = _run_amd_smi(amd_smi, run_args) - assert rc == 0, f"amd-smi failed rc={rc} stderr={err} stdout={out}" + return_code, stdout_text, stderr_text = _run_amd_smi(amd_smi_bin, invocation_args) + assert return_code == 0, f"amd-smi failed rc={return_code} stderr={stderr_text} stdout={stdout_text}" # If file was requested, stdout should be empty - if file_path is not None: - assert out.strip() == "", f"Expected no stdout when using --file, got: {out}" - assert file_path.exists(), "Expected output file to be created" - content = file_path.read_text(encoding="utf-8", errors="replace") + if output_file_path is not None: + assert stdout_text.strip() == "", f"Expected no stdout when using --file, got: {stdout_text}" + assert output_file_path.exists(), "Expected output file to be created" + content_text = output_file_path.read_text(encoding="utf-8", errors="replace") else: - content = out + content_text = stdout_text # Validate based on mode - if expected_mode == "json" or ("--json" in args and expected_mode is None): + if expected_output_mode == "json" or ("--json" in modifiers and expected_output_mode is None): # JSON array expected try: - data = json.loads(content) + json_data = json.loads(content_text) except Exception as e: - pytest.fail(f"Failed to parse JSON output: {e}\nContent:\n{content}") - assert isinstance(data, list) and data, "Expected non-empty JSON array" - for idx, obj in enumerate(data): - missing = _validate_json(obj) - assert not missing, f"JSON GPU entry {idx} missing fields: {missing}" + pytest.fail(f"Failed to parse JSON output: {e}\nContent:\n{content_text}") + assert isinstance(json_data, list) and json_data, "Expected non-empty JSON array" + for index, gpu_obj in enumerate(json_data): + missing_fields = _validate_json(gpu_obj) + assert not missing_fields, f"JSON GPU entry {index} missing fields: {missing_fields}" - elif expected_mode == "csv" or ("--csv" in args and expected_mode is None): + elif expected_output_mode == "csv" or ("--csv" in modifiers and expected_output_mode is None): # CSV expected try: - reader = csv.DictReader(content.splitlines()) - rows = list(reader) + csv_reader = csv.DictReader(content_text.splitlines()) + csv_rows = list(csv_reader) except Exception as e: - pytest.fail(f"Failed to parse CSV output: {e}\nContent:\n{content}") - assert rows, "Expected at least one CSV row" - for idx, row in enumerate(rows): - missing = _validate_csv_row(row) - assert not missing, f"CSV row {idx} missing fields: {missing}" + pytest.fail(f"Failed to parse CSV output: {e}\nContent:\n{content_text}") + assert csv_rows, "Expected at least one CSV row" + for index, csv_row in enumerate(csv_rows): + missing_fields = _validate_csv_row(csv_row) + assert not missing_fields, f"CSV row {index} missing fields: {missing_fields}" else: # human readable output - blocks = _parse_gpu_blocks(content) - assert blocks, "No GPU blocks found in amd-smi human output" - for idx, block_text in enumerate(blocks): - missing = _validate_human_block(block_text) - assert not missing, f"Human GPU block {idx} missing fields: {missing}\nBlock:\n{block_text}" + gpu_blocks = _parse_gpu_blocks(content_text) + assert gpu_blocks, "No GPU blocks found in amd-smi human output" + for index, human_readable_gpu_block in enumerate(gpu_blocks): + missing_fields = _validate_human_readable_gpu_block(human_readable_gpu_block) + assert not missing_fields, f"Human-readable GPU block {index} missing fields: {missing_fields}\nBlock:\n{human_readable_gpu_block}" From bac9e22449ec8f7f8d8e72f2fdd6286da3aa4a4d Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Fri, 6 Mar 2026 18:27:11 +0530 Subject: [PATCH 03/21] Add logging for amd-smi command execution and output --- .../test_executable_scripts/test_amdsmi_cli.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py b/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py index ce61861f00c..589d22efa00 100644 --- a/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py +++ b/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py @@ -15,10 +15,14 @@ import csv import subprocess from pathlib import Path +import logging import pytest +logger = logging.getLogger(__name__) + + def _amd_smi_path() -> Path: """Return the path to the `amd-smi` binary from `THEROCK_BIN_DIR`. @@ -58,7 +62,11 @@ def _run_amd_smi(amd_smi_path: Path, modifiers: list[str]) -> tuple[int, str, st tuple[int, str, str]: Return code, stdout text, stderr text. """ cmd = [str(amd_smi_path), "list"] + modifiers + logger.debug("Running amd-smi: %s", cmd) proc = subprocess.run(cmd, capture_output=True, text=True) + logger.debug("amd-smi returncode=%s", proc.returncode) + logger.debug("amd-smi stdout:\n%s", proc.stdout) + logger.debug("amd-smi stderr:\n%s", proc.stderr) return proc.returncode, proc.stdout, proc.stderr From e7bebb2056b440caeaeae82b4fc08113158b5477 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Fri, 6 Mar 2026 18:38:44 +0530 Subject: [PATCH 04/21] Add parameterized test cases for `amd-smi` output modes --- .../test_executable_scripts/test_amdsmi_cli.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py b/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py index 589d22efa00..d1a4880718d 100644 --- a/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py +++ b/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py @@ -208,6 +208,14 @@ def _validate_csv_row(csv_row: dict) -> list[str]: (["--json", "--file"], "json"), (["--csv", "--file"], "csv"), ], + ids=[ + "human-stdout", + "json-stdout", + "csv-stdout", + "human-file", + "json-file", + "csv-file", + ], ) def test_amd_smi_list(mod_args, tmp_path): """End-to-end test of `amd-smi list` covering output modes. From 9c80073aef24e4a108b8f4af1e5bee7a78c8b2a0 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Fri, 6 Mar 2026 22:51:42 +0530 Subject: [PATCH 05/21] Update logging level for amd-smi command execution in test_amdsmi_cli.py --- build_tools/github_actions/fetch_test_configurations.py | 1 - .../test_executable_scripts/test_amdsmi_cli.py | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/build_tools/github_actions/fetch_test_configurations.py b/build_tools/github_actions/fetch_test_configurations.py index b25ecfce421..5d6feca049d 100644 --- a/build_tools/github_actions/fetch_test_configurations.py +++ b/build_tools/github_actions/fetch_test_configurations.py @@ -156,7 +156,6 @@ def _get_script_path(script_name: str) -> str: "windows": 1, }, }, - "amdsmi_cli": { "job_name": "amdsmi_cli", "fetch_artifact_args": "--tests", diff --git a/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py b/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py index d1a4880718d..f2255d7d968 100644 --- a/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py +++ b/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py @@ -62,11 +62,11 @@ def _run_amd_smi(amd_smi_path: Path, modifiers: list[str]) -> tuple[int, str, st tuple[int, str, str]: Return code, stdout text, stderr text. """ cmd = [str(amd_smi_path), "list"] + modifiers - logger.debug("Running amd-smi: %s", cmd) + logger.info("Running amd-smi: %s", cmd) proc = subprocess.run(cmd, capture_output=True, text=True) - logger.debug("amd-smi returncode=%s", proc.returncode) - logger.debug("amd-smi stdout:\n%s", proc.stdout) - logger.debug("amd-smi stderr:\n%s", proc.stderr) + logger.info("amd-smi returncode=%s", proc.returncode) + logger.info("amd-smi stdout:\n%s", proc.stdout) + logger.info("amd-smi stderr:\n%s", proc.stderr) return proc.returncode, proc.stdout, proc.stderr From 30c2a1d3a3af82d0d48bc3228ba81221f00a0e30 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Fri, 6 Mar 2026 23:03:41 +0530 Subject: [PATCH 06/21] command enhance --- build_tools/github_actions/fetch_test_configurations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/github_actions/fetch_test_configurations.py b/build_tools/github_actions/fetch_test_configurations.py index 5d6feca049d..e9a8fd8e7dc 100644 --- a/build_tools/github_actions/fetch_test_configurations.py +++ b/build_tools/github_actions/fetch_test_configurations.py @@ -160,7 +160,7 @@ def _get_script_path(script_name: str) -> str: "job_name": "amdsmi_cli", "fetch_artifact_args": "--tests", "timeout_minutes": 15, - "test_script": f"pytest {_get_script_path('test_amdsmi_cli.py')} -s", + "test_script": f"pytest {_get_script_path('test_amdsmi_cli.py')} -o log_cli=true --log-cli-level=INFO", "platform": ["linux"], "total_shards_dict": { "linux": 1, From 7a9f5db857a79ebfa7b4aad865ec242aa5891af0 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Fri, 6 Mar 2026 23:37:59 +0530 Subject: [PATCH 07/21] Refactor assertions for clarity and consistency in test_amdsmi_cli.py --- .../test_amdsmi_cli.py | 44 ++++++++++++++----- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py b/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py index f2255d7d968..8e81918658a 100644 --- a/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py +++ b/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py @@ -44,7 +44,9 @@ def _amd_smi_path() -> Path: # binary exists and is executable. amd_smi_bin_path = (Path(therock_bin_dir_env).expanduser().resolve()) / "amd-smi" assert amd_smi_bin_path.exists(), f"amd-smi not found at {amd_smi_bin_path}" - assert os.access(amd_smi_bin_path, os.X_OK), f"amd-smi is not executable: {amd_smi_bin_path}" + assert os.access( + amd_smi_bin_path, os.X_OK + ), f"amd-smi is not executable: {amd_smi_bin_path}" return amd_smi_bin_path @@ -151,7 +153,9 @@ def _validate_json(gpu_obj: dict) -> list[str]: missing_fields.append("kfd_id") if "node_id" not in gpu_obj or not isinstance(gpu_obj.get("node_id"), int): missing_fields.append("node_id") - if "partition_id" not in gpu_obj or not isinstance(gpu_obj.get("partition_id"), int): + if "partition_id" not in gpu_obj or not isinstance( + gpu_obj.get("partition_id"), int + ): missing_fields.append("partition_id") return missing_fields @@ -246,29 +250,41 @@ def test_amd_smi_list(mod_args, tmp_path): invocation_args.extend(["--file", str(output_file_path)]) return_code, stdout_text, stderr_text = _run_amd_smi(amd_smi_bin, invocation_args) - assert return_code == 0, f"amd-smi failed rc={return_code} stderr={stderr_text} stdout={stdout_text}" + assert ( + return_code == 0 + ), f"amd-smi failed rc={return_code} stderr={stderr_text} stdout={stdout_text}" # If file was requested, stdout should be empty if output_file_path is not None: - assert stdout_text.strip() == "", f"Expected no stdout when using --file, got: {stdout_text}" + assert ( + stdout_text.strip() == "" + ), f"Expected no stdout when using --file, got: {stdout_text}" assert output_file_path.exists(), "Expected output file to be created" content_text = output_file_path.read_text(encoding="utf-8", errors="replace") else: content_text = stdout_text # Validate based on mode - if expected_output_mode == "json" or ("--json" in modifiers and expected_output_mode is None): + if expected_output_mode == "json" or ( + "--json" in modifiers and expected_output_mode is None + ): # JSON array expected try: json_data = json.loads(content_text) except Exception as e: pytest.fail(f"Failed to parse JSON output: {e}\nContent:\n{content_text}") - assert isinstance(json_data, list) and json_data, "Expected non-empty JSON array" + assert ( + isinstance(json_data, list) and json_data + ), "Expected non-empty JSON array" for index, gpu_obj in enumerate(json_data): missing_fields = _validate_json(gpu_obj) - assert not missing_fields, f"JSON GPU entry {index} missing fields: {missing_fields}" + assert ( + not missing_fields + ), f"JSON GPU entry {index} missing fields: {missing_fields}" - elif expected_output_mode == "csv" or ("--csv" in modifiers and expected_output_mode is None): + elif expected_output_mode == "csv" or ( + "--csv" in modifiers and expected_output_mode is None + ): # CSV expected try: csv_reader = csv.DictReader(content_text.splitlines()) @@ -278,12 +294,18 @@ def test_amd_smi_list(mod_args, tmp_path): assert csv_rows, "Expected at least one CSV row" for index, csv_row in enumerate(csv_rows): missing_fields = _validate_csv_row(csv_row) - assert not missing_fields, f"CSV row {index} missing fields: {missing_fields}" + assert ( + not missing_fields + ), f"CSV row {index} missing fields: {missing_fields}" else: # human readable output gpu_blocks = _parse_gpu_blocks(content_text) assert gpu_blocks, "No GPU blocks found in amd-smi human output" for index, human_readable_gpu_block in enumerate(gpu_blocks): - missing_fields = _validate_human_readable_gpu_block(human_readable_gpu_block) - assert not missing_fields, f"Human-readable GPU block {index} missing fields: {missing_fields}\nBlock:\n{human_readable_gpu_block}" + missing_fields = _validate_human_readable_gpu_block( + human_readable_gpu_block + ) + assert ( + not missing_fields + ), f"Human-readable GPU block {index} missing fields: {missing_fields}\nBlock:\n{human_readable_gpu_block}" From 3f23ba4c75e6a50dba5e5937a434c846ddb83f60 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Wed, 11 Mar 2026 10:57:14 +0530 Subject: [PATCH 08/21] Update fetch_test_configurations.py to use --base-only for amdsmi_cli and refactor test_amdsmi_cli.py for improved clarity and functionality --- .../fetch_test_configurations.py | 2 +- .../test_amdsmi_cli.py | 39 ++++--------------- 2 files changed, 8 insertions(+), 33 deletions(-) diff --git a/build_tools/github_actions/fetch_test_configurations.py b/build_tools/github_actions/fetch_test_configurations.py index e9a8fd8e7dc..8f325f0923c 100644 --- a/build_tools/github_actions/fetch_test_configurations.py +++ b/build_tools/github_actions/fetch_test_configurations.py @@ -158,7 +158,7 @@ def _get_script_path(script_name: str) -> str: }, "amdsmi_cli": { "job_name": "amdsmi_cli", - "fetch_artifact_args": "--tests", + "fetch_artifact_args": "--base-only", "timeout_minutes": 15, "test_script": f"pytest {_get_script_path('test_amdsmi_cli.py')} -o log_cli=true --log-cli-level=INFO", "platform": ["linux"], diff --git a/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py b/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py index 8e81918658a..86abdf28137 100644 --- a/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py +++ b/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py @@ -24,24 +24,10 @@ def _amd_smi_path() -> Path: - """Return the path to the `amd-smi` binary from `THEROCK_BIN_DIR`. - - Skips the test via pytest if `THEROCK_BIN_DIR` is not set. Asserts that - the expected `amd-smi` binary exists at the resolved path. - - Args: - None - - Returns: - pathlib.Path: Path to the `amd-smi` binary. - """ therock_bin_dir_env = os.getenv("THEROCK_BIN_DIR") if not therock_bin_dir_env: pytest.skip("THEROCK_BIN_DIR not set; skipping amdsmi tests") - # Resolve the path to an absolute canonical path to avoid cwd-dependent - # failures (e.g., if a prior step changes directory). Also check that the - # binary exists and is executable. amd_smi_bin_path = (Path(therock_bin_dir_env).expanduser().resolve()) / "amd-smi" assert amd_smi_bin_path.exists(), f"amd-smi not found at {amd_smi_bin_path}" assert os.access( @@ -50,20 +36,9 @@ def _amd_smi_path() -> Path: return amd_smi_bin_path -def _run_amd_smi(amd_smi_path: Path, modifiers: list[str]) -> tuple[int, str, str]: - """Run `amd-smi list` with the given `modifiers` and return (rc, stdout, stderr). - - The function invokes the binary via subprocess.run and captures text - output for assertions in the tests. - - Args: - amd_smi_path (pathlib.Path): Path to the `amd-smi` binary. - modifiers (list[str]): Arguments to pass after `amd-smi list`. - - Returns: - tuple[int, str, str]: Return code, stdout text, stderr text. - """ - cmd = [str(amd_smi_path), "list"] + modifiers +def _run_amd_smi(subcommands: list[str]) -> tuple[int, str, str]: + amd_smi_bin = _amd_smi_path() + cmd = [str(amd_smi_bin)] + list(subcommands) logger.info("Running amd-smi: %s", cmd) proc = subprocess.run(cmd, capture_output=True, text=True) logger.info("amd-smi returncode=%s", proc.returncode) @@ -73,7 +48,7 @@ def _run_amd_smi(amd_smi_path: Path, modifiers: list[str]) -> tuple[int, str, st def _parse_gpu_blocks(text_output: str) -> list[str]: - """Parse human-readable `amd-smi list` output into GPU text blocks. + """Parse human-readable `amd-smi` output into GPU text blocks. Returns a list where each element is the multiline block describing a single GPU. The parser looks for lines that start GPU markers like @@ -81,7 +56,7 @@ def _parse_gpu_blocks(text_output: str) -> list[str]: GPU marker. Args: - output (str): The human-readable stdout from `amd-smi list`. + output (str): The human-readable stdout from `amd-smi`. Returns: list[str]: List of multiline GPU description blocks. @@ -239,7 +214,6 @@ def test_amd_smi_list(mod_args, tmp_path): None """ modifiers, expected_output_mode = mod_args - amd_smi_bin = _amd_smi_path() output_file_path = None invocation_args = list(modifiers) @@ -249,7 +223,8 @@ def test_amd_smi_list(mod_args, tmp_path): invocation_args = [a for a in invocation_args if a != "--file"] invocation_args.extend(["--file", str(output_file_path)]) - return_code, stdout_text, stderr_text = _run_amd_smi(amd_smi_bin, invocation_args) + # subcommands: run `amd-smi list` with the invocation args + return_code, stdout_text, stderr_text = _run_amd_smi(["list"] + invocation_args) assert ( return_code == 0 ), f"amd-smi failed rc={return_code} stderr={stderr_text} stdout={stdout_text}" From 443bca2285cc50e8d2a8719cf05d77968721efa4 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Fri, 13 Mar 2026 18:14:11 +0530 Subject: [PATCH 09/21] Add AMDSMI tests and update sanity check workflow - Introduced new markers for AMDSMI tests in conftest.py. - Enhanced test_sanity_check.yml to include new input for handling AMDSMI default-unblocking tests. - Updated test execution logic in test_rocm_sanity.py to validate AMDSMI CLI output and added new test cases. - Removed obsolete test_amdsmi_cli.py file. --- .github/workflows/test_sanity_check.yml | 26 +- .../test_amdsmi_cli.py | 286 ------------------ conftest.py | 9 + tests/test_rocm_sanity.py | 215 ++++++++++++- 4 files changed, 244 insertions(+), 292 deletions(-) delete mode 100644 build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py create mode 100644 conftest.py diff --git a/.github/workflows/test_sanity_check.yml b/.github/workflows/test_sanity_check.yml index ee60887beb3..3d66655f22a 100644 --- a/.github/workflows/test_sanity_check.yml +++ b/.github/workflows/test_sanity_check.yml @@ -21,6 +21,10 @@ on: type: string platform: type: string + amdsmi_tests_default_unblocking_for_sanity_blocking: + type: boolean + description: 'If true, treat amdsmi default-unblocking tests as blockers (do not continue on error)' + default: false workflow_call: inputs: artifact_group: @@ -38,6 +42,10 @@ on: type: string platform: type: string + amdsmi_tests_default_unblocking_for_sanity_blocking: + type: boolean + description: 'If true, treat amdsmi default-unblocking tests as blockers (do not continue on error)' + default: false push: branches: - ADHOCBUILD @@ -138,15 +146,27 @@ jobs: run: | python ./build_tools/print_driver_gpu_info.py - - name: Run ROCm Sanity Tests - timeout-minutes: 5 + - name: Run ROCm Blocking Sanity Tests + timeout-minutes: 10 + env: + # Enable verbose logging, see + # https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/debugging.html + AMD_LOG_LEVEL: 4 + ROCM_KPACK_DEBUG: "1" + run: | + pytest tests/ -m "not amdsmi_tests_default_unblocking_for_sanity" --log-cli-level=info --timeout=300 + + - name: Run ROCm AMDSMI Default-Unblocking-For-Sanity Tests + id: amdsmi_tests_default_unblocking_for_sanity_tests + continue-on-error: ${{ inputs.amdsmi_tests_default_unblocking_for_sanity_blocking == false }} + timeout-minutes: 10 env: # Enable verbose logging, see # https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/debugging.html AMD_LOG_LEVEL: 4 ROCM_KPACK_DEBUG: "1" run: | - pytest tests/ --log-cli-level=info --timeout=300 + pytest tests/ -m "amdsmi_tests_default_unblocking_for_sanity" --log-cli-level=info --timeout=300 - name: Post-job cleanup processes on Windows if: ${{ always() && runner.os == 'Windows' }} diff --git a/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py b/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py deleted file mode 100644 index 86abdf28137..00000000000 --- a/build_tools/github_actions/test_executable_scripts/test_amdsmi_cli.py +++ /dev/null @@ -1,286 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Advanced Micro Devices, Inc. -# SPDX-License-Identifier: MIT - -""" -validation of the `amd-smi` CLI output. - -This test expects `THEROCK_BIN_DIR` to point to the TheRock `bin/` directory -containing the `amd-smi` binary (CI sets this via the setup action). -""" - -import os -import re -import json -import csv -import subprocess -from pathlib import Path -import logging - -import pytest - - -logger = logging.getLogger(__name__) - - -def _amd_smi_path() -> Path: - therock_bin_dir_env = os.getenv("THEROCK_BIN_DIR") - if not therock_bin_dir_env: - pytest.skip("THEROCK_BIN_DIR not set; skipping amdsmi tests") - - amd_smi_bin_path = (Path(therock_bin_dir_env).expanduser().resolve()) / "amd-smi" - assert amd_smi_bin_path.exists(), f"amd-smi not found at {amd_smi_bin_path}" - assert os.access( - amd_smi_bin_path, os.X_OK - ), f"amd-smi is not executable: {amd_smi_bin_path}" - return amd_smi_bin_path - - -def _run_amd_smi(subcommands: list[str]) -> tuple[int, str, str]: - amd_smi_bin = _amd_smi_path() - cmd = [str(amd_smi_bin)] + list(subcommands) - logger.info("Running amd-smi: %s", cmd) - proc = subprocess.run(cmd, capture_output=True, text=True) - logger.info("amd-smi returncode=%s", proc.returncode) - logger.info("amd-smi stdout:\n%s", proc.stdout) - logger.info("amd-smi stderr:\n%s", proc.stderr) - return proc.returncode, proc.stdout, proc.stderr - - -def _parse_gpu_blocks(text_output: str) -> list[str]: - """Parse human-readable `amd-smi` output into GPU text blocks. - - Returns a list where each element is the multiline block describing a - single GPU. The parser looks for lines that start GPU markers like - "GPU: " or "GPU :" and groups subsequent lines until the next - GPU marker. - - Args: - output (str): The human-readable stdout from `amd-smi`. - - Returns: - list[str]: List of multiline GPU description blocks. - """ - gpu_blocks = [] - current_block_lines = None - for line in text_output.splitlines(): - if re.search(r"GPU:\s+(\d+)", line) or re.search(r"GPU\s+(\d+):", line): - if current_block_lines is not None: - gpu_blocks.append("\n".join(current_block_lines)) - current_block_lines = [line] - continue - if current_block_lines is not None: - current_block_lines.append(line) - if current_block_lines is not None: - gpu_blocks.append("\n".join(current_block_lines)) - return gpu_blocks - - -def _validate_human_readable_gpu_block(human_readable_gpu_block_text: str) -> list[str]: - """Validate a single human-readable GPU block. - - Returns a list of missing field names (empty if all required fields - appear). The function checks for BDF, UUID, KFD_ID, NODE_ID and - PARTITION_ID in the block_text. - - Args: - human_readable_gpu_block_text (str): Multiline text block describing a single GPU. - - Returns: - list[str]: Missing field names (empty if validation passes). - """ - missing_fields = [] - if not re.search(r"\s*BDF:\s*.+", human_readable_gpu_block_text): - missing_fields.append("BDF") - if not re.search(r"\s*UUID:\s*.+", human_readable_gpu_block_text): - missing_fields.append("UUID") - if not re.search(r"\s*KFD_ID:\s*\d+", human_readable_gpu_block_text): - missing_fields.append("KFD_ID") - if not re.search(r"\s*NODE_ID:\s*\d+", human_readable_gpu_block_text): - missing_fields.append("NODE_ID") - if not re.search(r"\s*PARTITION_ID:\s*\d+", human_readable_gpu_block_text): - missing_fields.append("PARTITION_ID") - return missing_fields - - -def _validate_json(gpu_obj: dict) -> list[str]: - """Validate a JSON GPU entry from `amd-smi --json`. - - Returns a list of missing or incorrectly-typed fields. Expected fields - include `gpu` (int), `bdf` (str), `uuid` (str), `kfd_id` (int), - `node_id` (int) and `partition_id` (int). - - Args: - obj (dict): Parsed JSON object representing a GPU entry. - - Returns: - list[str]: Missing or invalid field names. - """ - missing_fields = [] - # required keys mapping - if "gpu" not in gpu_obj or not isinstance(gpu_obj.get("gpu"), int): - missing_fields.append("gpu") - if "bdf" not in gpu_obj or not isinstance(gpu_obj.get("bdf"), str): - missing_fields.append("bdf") - if "uuid" not in gpu_obj or not isinstance(gpu_obj.get("uuid"), str): - missing_fields.append("uuid") - if "kfd_id" not in gpu_obj or not isinstance(gpu_obj.get("kfd_id"), int): - missing_fields.append("kfd_id") - if "node_id" not in gpu_obj or not isinstance(gpu_obj.get("node_id"), int): - missing_fields.append("node_id") - if "partition_id" not in gpu_obj or not isinstance( - gpu_obj.get("partition_id"), int - ): - missing_fields.append("partition_id") - return missing_fields - - -def _validate_csv_row(csv_row: dict) -> list[str]: - """Validate a CSV row parsed from `amd-smi --csv` output. - - Expected header names are: `gpu,gpu_bdf,gpu_uuid,kfd_id,node_id,partition_id`. - Returns a list of missing or invalid fields. - - Args: - row (dict): Mapping of CSV headers to values as returned by - `csv.DictReader`. - - Returns: - list[str]: Missing or invalid field names. - """ - # expected header names: gpu,gpu_bdf,gpu_uuid,kfd_id,node_id,partition_id - missing_fields = [] - try: - if "gpu" not in csv_row or int(csv_row.get("gpu", "")) < 0: - missing_fields.append("gpu") - except Exception: - missing_fields.append("gpu") - if not csv_row.get("gpu_bdf"): - missing_fields.append("gpu_bdf") - if not csv_row.get("gpu_uuid"): - missing_fields.append("gpu_uuid") - try: - if "kfd_id" not in csv_row or int(csv_row.get("kfd_id", "")) < 0: - missing_fields.append("kfd_id") - except Exception: - missing_fields.append("kfd_id") - try: - if "node_id" not in csv_row or int(csv_row.get("node_id", "")) < 0: - missing_fields.append("node_id") - except Exception: - missing_fields.append("node_id") - try: - if "partition_id" not in csv_row or int(csv_row.get("partition_id", "")) < 0: - missing_fields.append("partition_id") - except Exception: - missing_fields.append("partition_id") - return missing_fields - - -@pytest.mark.parametrize( - "mod_args", - [ - ([], None), # human readable on stdout - (["--json"], None), - (["--csv"], None), - (["--file"], "human"), - (["--json", "--file"], "json"), - (["--csv", "--file"], "csv"), - ], - ids=[ - "human-stdout", - "json-stdout", - "csv-stdout", - "human-file", - "json-file", - "csv-file", - ], -) -def test_amd_smi_list(mod_args, tmp_path): - """End-to-end test of `amd-smi list` covering output modes. - - The test runs `amd-smi list` with multiple modifier combinations (human, - JSON, CSV, and file-output variants), parses the output and validates - required fields for each GPU entry. - - Args: - mod_args (tuple[list[str], Optional[str]]): Parameterized tuple where - the first element is a list of modifier args and the second - element indicates the expected parsed mode when `--file` is - used. - tmp_path (pathlib.Path): pytest temporary directory fixture. - - Returns: - None - """ - modifiers, expected_output_mode = mod_args - - output_file_path = None - invocation_args = list(modifiers) - if "--file" in invocation_args: - # supply output file - output_file_path = tmp_path / "amdsmi_out.txt" - invocation_args = [a for a in invocation_args if a != "--file"] - invocation_args.extend(["--file", str(output_file_path)]) - - # subcommands: run `amd-smi list` with the invocation args - return_code, stdout_text, stderr_text = _run_amd_smi(["list"] + invocation_args) - assert ( - return_code == 0 - ), f"amd-smi failed rc={return_code} stderr={stderr_text} stdout={stdout_text}" - - # If file was requested, stdout should be empty - if output_file_path is not None: - assert ( - stdout_text.strip() == "" - ), f"Expected no stdout when using --file, got: {stdout_text}" - assert output_file_path.exists(), "Expected output file to be created" - content_text = output_file_path.read_text(encoding="utf-8", errors="replace") - else: - content_text = stdout_text - - # Validate based on mode - if expected_output_mode == "json" or ( - "--json" in modifiers and expected_output_mode is None - ): - # JSON array expected - try: - json_data = json.loads(content_text) - except Exception as e: - pytest.fail(f"Failed to parse JSON output: {e}\nContent:\n{content_text}") - assert ( - isinstance(json_data, list) and json_data - ), "Expected non-empty JSON array" - for index, gpu_obj in enumerate(json_data): - missing_fields = _validate_json(gpu_obj) - assert ( - not missing_fields - ), f"JSON GPU entry {index} missing fields: {missing_fields}" - - elif expected_output_mode == "csv" or ( - "--csv" in modifiers and expected_output_mode is None - ): - # CSV expected - try: - csv_reader = csv.DictReader(content_text.splitlines()) - csv_rows = list(csv_reader) - except Exception as e: - pytest.fail(f"Failed to parse CSV output: {e}\nContent:\n{content_text}") - assert csv_rows, "Expected at least one CSV row" - for index, csv_row in enumerate(csv_rows): - missing_fields = _validate_csv_row(csv_row) - assert ( - not missing_fields - ), f"CSV row {index} missing fields: {missing_fields}" - - else: - # human readable output - gpu_blocks = _parse_gpu_blocks(content_text) - assert gpu_blocks, "No GPU blocks found in amd-smi human output" - for index, human_readable_gpu_block in enumerate(gpu_blocks): - missing_fields = _validate_human_readable_gpu_block( - human_readable_gpu_block - ) - assert ( - not missing_fields - ), f"Human-readable GPU block {index} missing fields: {missing_fields}\nBlock:\n{human_readable_gpu_block}" diff --git a/conftest.py b/conftest.py new file mode 100644 index 00000000000..918b04e5e36 --- /dev/null +++ b/conftest.py @@ -0,0 +1,9 @@ +def pytest_configure(config): + config.addinivalue_line( + "markers", + "amdsmi_tests_default_unblocking_for_sanity: marks tests as default-unblocking for amdsmi sanity (amdsmi_tests_default_unblocking_for_sanity)" + ) + config.addinivalue_line( + "markers", + "amd_smi: marks tests that exercise the amd-smi CLI" + ) diff --git a/tests/test_rocm_sanity.py b/tests/test_rocm_sanity.py index 233ec8d3abd..eb788e9be14 100644 --- a/tests/test_rocm_sanity.py +++ b/tests/test_rocm_sanity.py @@ -1,6 +1,7 @@ # Copyright Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT - +import json +import csv from pathlib import Path from pytest_check import check import logging @@ -15,9 +16,7 @@ THIS_DIR = Path(__file__).resolve().parent logger = logging.getLogger(__name__) - THEROCK_BIN_DIR = Path(os.getenv("THEROCK_BIN_DIR")).resolve() - AMDGPU_FAMILIES = os.getenv("AMDGPU_FAMILIES") # Importing is_asan from github_actions_utils.py @@ -25,6 +24,108 @@ from github_actions_utils import is_asan +def _amd_smi_path() -> Path: + therock_bin_dir_env = os.getenv("THEROCK_BIN_DIR") + if not therock_bin_dir_env: + pytest.fail("THEROCK_BIN_DIR not set; failing amdsmi tests") + + amd_smi_bin_path = (Path(therock_bin_dir_env).expanduser().resolve()) / "amd-smi" + if not amd_smi_bin_path.exists(): + pytest.fail(f"amd-smi not found at {amd_smi_bin_path}") + if not os.access(amd_smi_bin_path, os.X_OK): + pytest.fail(f"amd-smi is not executable: {amd_smi_bin_path}") + return amd_smi_bin_path + + +def _run_amd_smi(subcommands: list[str]) -> tuple[int, str, str]: + amd_smi_bin = _amd_smi_path() + cmd = [str(amd_smi_bin)] + list(subcommands) + logger.info("Running amd-smi: %s", cmd) + proc = subprocess.run(cmd, capture_output=True, text=True) + logger.info("amd-smi returncode=%s", proc.returncode) + logger.info("amd-smi stdout:\n%s", proc.stdout) + logger.info("amd-smi stderr:\n%s", proc.stderr) + return proc.returncode, proc.stdout, proc.stderr + + +def _parse_gpu_blocks(text_output: str) -> list[str]: + gpu_blocks = [] + current_block_lines = None + for line in text_output.splitlines(): + if re.search(r"GPU:\s+(\d+)", line) or re.search(r"GPU\s+(\d+):", line): + if current_block_lines is not None: + gpu_blocks.append("\n".join(current_block_lines)) + current_block_lines = [line] + continue + if current_block_lines is not None: + current_block_lines.append(line) + if current_block_lines is not None: + gpu_blocks.append("\n".join(current_block_lines)) + return gpu_blocks + + +def _validate_human_readable_gpu_block(human_readable_gpu_block_text: str) -> list[str]: + missing_fields = [] + if not re.search(r"\s*BDF:\s*.+", human_readable_gpu_block_text): + missing_fields.append("BDF") + if not re.search(r"\s*UUID:\s*.+", human_readable_gpu_block_text): + missing_fields.append("UUID") + if not re.search(r"\s*KFD_ID:\s*\d+", human_readable_gpu_block_text): + missing_fields.append("KFD_ID") + if not re.search(r"\s*NODE_ID:\s*\d+", human_readable_gpu_block_text): + missing_fields.append("NODE_ID") + if not re.search(r"\s*PARTITION_ID:\s*\d+", human_readable_gpu_block_text): + missing_fields.append("PARTITION_ID") + return missing_fields + + +def _validate_json(gpu_obj: dict) -> list[str]: + missing_fields = [] + if "gpu" not in gpu_obj or not isinstance(gpu_obj.get("gpu"), int): + missing_fields.append("gpu") + if "bdf" not in gpu_obj or not isinstance(gpu_obj.get("bdf"), str): + missing_fields.append("bdf") + if "uuid" not in gpu_obj or not isinstance(gpu_obj.get("uuid"), str): + missing_fields.append("uuid") + if "kfd_id" not in gpu_obj or not isinstance(gpu_obj.get("kfd_id"), int): + missing_fields.append("kfd_id") + if "node_id" not in gpu_obj or not isinstance(gpu_obj.get("node_id"), int): + missing_fields.append("node_id") + if "partition_id" not in gpu_obj or not isinstance( + gpu_obj.get("partition_id"), int + ): + missing_fields.append("partition_id") + return missing_fields + + +def _validate_csv_row(csv_row: dict) -> list[str]: + missing_fields = [] + try: + if "gpu" not in csv_row or int(csv_row.get("gpu", "")) < 0: + missing_fields.append("gpu") + except Exception: + missing_fields.append("gpu") + if not csv_row.get("gpu_bdf"): + missing_fields.append("gpu_bdf") + if not csv_row.get("gpu_uuid"): + missing_fields.append("gpu_uuid") + try: + if "kfd_id" not in csv_row or int(csv_row.get("kfd_id", "")) < 0: + missing_fields.append("kfd_id") + except Exception: + missing_fields.append("kfd_id") + try: + if "node_id" not in csv_row or int(csv_row.get("node_id", "")) < 0: + missing_fields.append("node_id") + except Exception: + missing_fields.append("node_id") + try: + if "partition_id" not in csv_row or int(csv_row.get("partition_id", "")) < 0: + missing_fields.append("partition_id") + except Exception: + missing_fields.append("partition_id") + return missing_fields + def is_windows(): return "windows" == platform.system().lower() @@ -229,3 +330,111 @@ def test_amdsmi_suite(self): print(f"[amdsmitst-summary] {line}") check.equal(process.returncode, 0) + + @pytest.mark.skipif(is_windows(), reason="amd-smi CLI not supported on Windows") + @pytest.mark.skipif( + AMDGPU_FAMILIES == "gfx1151", reason="Linux gfx1151 does not support amdsmi yet" + ) + @pytest.mark.amd_smi + def test_amd_smi_blocks(self): + """Blocking check: `amd-smi list` prints GPU blocks and they are non-empty. + + This is a lightweight blocking gate: it only asserts that GPU blocks + exist and contain some text, without validating specific fields. + """ + return_code, stdout_text, stderr_text = _run_amd_smi(["list"]) + assert ( + return_code == 0 + ), f"amd-smi failed rc={return_code} stderr={stderr_text} stdout={stdout_text}" + + gpu_blocks = _parse_gpu_blocks(stdout_text) + assert gpu_blocks, "No GPU blocks found in amd-smi output" + for index, block in enumerate(gpu_blocks): + assert block.strip(), f"GPU block {index} is empty" + + @pytest.mark.amd_smi + @pytest.mark.amdsmi_tests_default_unblocking_for_sanity + @pytest.mark.parametrize( + "mod_args", + [ + ([], None), # human readable on stdout + (["--json"], None), + (["--csv"], None), + (["--file"], "human"), + (["--json", "--file"], "json"), + (["--csv", "--file"], "csv"), + ], + ids=[ + "human-stdout", + "json-stdout", + "csv-stdout", + "human-file", + "json-file", + "csv-file", + ], + ) + def test_amd_smi_list(self, mod_args, tmp_path): + modifiers, expected_output_mode = mod_args + + output_file_path = None + invocation_args = list(modifiers) + if "--file" in invocation_args: + output_file_path = tmp_path / "amdsmi_out.txt" + invocation_args = [a for a in invocation_args if a != "--file"] + invocation_args.extend(["--file", str(output_file_path)]) + + return_code, stdout_text, stderr_text = _run_amd_smi(["list"] + invocation_args) + assert ( + return_code == 0 + ), f"amd-smi failed rc={return_code} stderr={stderr_text} stdout={stdout_text}" + + if output_file_path is not None: + assert ( + stdout_text.strip() == "" + ), f"Expected no stdout when using --file, got: {stdout_text}" + assert output_file_path.exists(), "Expected output file to be created" + content_text = output_file_path.read_text(encoding="utf-8", errors="replace") + else: + content_text = stdout_text + + if expected_output_mode == "json" or ( + "--json" in modifiers and expected_output_mode is None + ): + try: + json_data = json.loads(content_text) + except Exception as e: + pytest.fail(f"Failed to parse JSON output: {e}\nContent:\n{content_text}") + assert ( + isinstance(json_data, list) and json_data + ), "Expected non-empty JSON array" + for index, gpu_obj in enumerate(json_data): + missing_fields = _validate_json(gpu_obj) + assert ( + not missing_fields + ), f"JSON GPU entry {index} missing fields: {missing_fields}" + + elif expected_output_mode == "csv" or ( + "--csv" in modifiers and expected_output_mode is None + ): + try: + csv_reader = csv.DictReader(content_text.splitlines()) + csv_rows = list(csv_reader) + except Exception as e: + pytest.fail(f"Failed to parse CSV output: {e}\nContent:\n{content_text}") + assert csv_rows, "Expected at least one CSV row" + for index, csv_row in enumerate(csv_rows): + missing_fields = _validate_csv_row(csv_row) + assert ( + not missing_fields + ), f"CSV row {index} missing fields: {missing_fields}" + + else: + gpu_blocks = _parse_gpu_blocks(content_text) + assert gpu_blocks, "No GPU blocks found in amd-smi human output" + for index, human_readable_gpu_block in enumerate(gpu_blocks): + missing_fields = _validate_human_readable_gpu_block( + human_readable_gpu_block + ) + assert ( + not missing_fields + ), f"Human-readable GPU block {index} missing fields: {missing_fields}\nBlock:\n{human_readable_gpu_block}" From 4775e8397fd9b5a736415ce3c5829396aef47376 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Fri, 13 Mar 2026 18:19:32 +0530 Subject: [PATCH 10/21] Refactor error handling in JSON and CSV parsing to improve readability --- conftest.py | 6 ++---- tests/test_rocm_sanity.py | 8 ++++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/conftest.py b/conftest.py index 918b04e5e36..ec241064ca2 100644 --- a/conftest.py +++ b/conftest.py @@ -1,9 +1,7 @@ def pytest_configure(config): config.addinivalue_line( - "markers", - "amdsmi_tests_default_unblocking_for_sanity: marks tests as default-unblocking for amdsmi sanity (amdsmi_tests_default_unblocking_for_sanity)" + "markers", "amdsmi_tests_default_unblocking_for_sanity: marks tests as default-unblocking for amdsmi sanity (amdsmi_tests_default_unblocking_for_sanity)" ) config.addinivalue_line( - "markers", - "amd_smi: marks tests that exercise the amd-smi CLI" + "markers", "amd_smi: marks tests that exercise the amd-smi CLI" ) diff --git a/tests/test_rocm_sanity.py b/tests/test_rocm_sanity.py index eb788e9be14..92300900e46 100644 --- a/tests/test_rocm_sanity.py +++ b/tests/test_rocm_sanity.py @@ -403,7 +403,9 @@ def test_amd_smi_list(self, mod_args, tmp_path): try: json_data = json.loads(content_text) except Exception as e: - pytest.fail(f"Failed to parse JSON output: {e}\nContent:\n{content_text}") + pytest.fail( + f"Failed to parse JSON output: {e}\nContent:\n{content_text}" + ) assert ( isinstance(json_data, list) and json_data ), "Expected non-empty JSON array" @@ -420,7 +422,9 @@ def test_amd_smi_list(self, mod_args, tmp_path): csv_reader = csv.DictReader(content_text.splitlines()) csv_rows = list(csv_reader) except Exception as e: - pytest.fail(f"Failed to parse CSV output: {e}\nContent:\n{content_text}") + pytest.fail( + f"Failed to parse CSV output: {e}\nContent:\n{content_text}" + ) assert csv_rows, "Expected at least one CSV row" for index, csv_row in enumerate(csv_rows): missing_fields = _validate_csv_row(csv_row) From 58c355ea1674dd8c5751396c8e702da3654c0e9d Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Fri, 13 Mar 2026 18:23:09 +0530 Subject: [PATCH 11/21] Refactor code for improved readability in conftest.py and test_rocm_sanity.py --- conftest.py | 3 ++- tests/test_rocm_sanity.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/conftest.py b/conftest.py index ec241064ca2..810541c123c 100644 --- a/conftest.py +++ b/conftest.py @@ -1,6 +1,7 @@ def pytest_configure(config): config.addinivalue_line( - "markers", "amdsmi_tests_default_unblocking_for_sanity: marks tests as default-unblocking for amdsmi sanity (amdsmi_tests_default_unblocking_for_sanity)" + "markers", + "amdsmi_tests_default_unblocking_for_sanity: marks tests as default-unblocking for amdsmi sanity (amdsmi_tests_default_unblocking_for_sanity)" ) config.addinivalue_line( "markers", "amd_smi: marks tests that exercise the amd-smi CLI" diff --git a/tests/test_rocm_sanity.py b/tests/test_rocm_sanity.py index 92300900e46..63f4a87b99a 100644 --- a/tests/test_rocm_sanity.py +++ b/tests/test_rocm_sanity.py @@ -393,7 +393,9 @@ def test_amd_smi_list(self, mod_args, tmp_path): stdout_text.strip() == "" ), f"Expected no stdout when using --file, got: {stdout_text}" assert output_file_path.exists(), "Expected output file to be created" - content_text = output_file_path.read_text(encoding="utf-8", errors="replace") + content_text = output_file_path.read_text( + encoding="utf-8", errors="replace" + ) else: content_text = stdout_text From e45209f1e8ababc975dad5f0d4ccdb048f325987 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Fri, 13 Mar 2026 18:26:18 +0530 Subject: [PATCH 12/21] Add missing newline in test_rocm_sanity.py and fix marker formatting in conftest.py --- conftest.py | 2 +- tests/test_rocm_sanity.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index 810541c123c..c6cdf7a9109 100644 --- a/conftest.py +++ b/conftest.py @@ -1,7 +1,7 @@ def pytest_configure(config): config.addinivalue_line( "markers", - "amdsmi_tests_default_unblocking_for_sanity: marks tests as default-unblocking for amdsmi sanity (amdsmi_tests_default_unblocking_for_sanity)" + "amdsmi_tests_default_unblocking_for_sanity: marks tests as default-unblocking for amdsmi sanity (amdsmi_tests_default_unblocking_for_sanity)", ) config.addinivalue_line( "markers", "amd_smi: marks tests that exercise the amd-smi CLI" diff --git a/tests/test_rocm_sanity.py b/tests/test_rocm_sanity.py index 63f4a87b99a..36a4f946fc5 100644 --- a/tests/test_rocm_sanity.py +++ b/tests/test_rocm_sanity.py @@ -126,6 +126,7 @@ def _validate_csv_row(csv_row: dict) -> list[str]: missing_fields.append("partition_id") return missing_fields + def is_windows(): return "windows" == platform.system().lower() From 8ccca35b942ada777dae80f52db8c3241fa9abc7 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Fri, 13 Mar 2026 18:28:44 +0530 Subject: [PATCH 13/21] Fix formatting of markers in conftest.py --- conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index c6cdf7a9109..60d88e647a7 100644 --- a/conftest.py +++ b/conftest.py @@ -1,6 +1,6 @@ def pytest_configure(config): config.addinivalue_line( - "markers", + "markers", "amdsmi_tests_default_unblocking_for_sanity: marks tests as default-unblocking for amdsmi sanity (amdsmi_tests_default_unblocking_for_sanity)", ) config.addinivalue_line( From 31f476b47aa2d376a5c058ecc2a84ceedf346832 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Fri, 13 Mar 2026 18:31:43 +0530 Subject: [PATCH 14/21] Remove timeout parameter from pytest commands in test_sanity_check.yml --- .github/workflows/test_sanity_check.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_sanity_check.yml b/.github/workflows/test_sanity_check.yml index 3d66655f22a..57ad349dcd0 100644 --- a/.github/workflows/test_sanity_check.yml +++ b/.github/workflows/test_sanity_check.yml @@ -154,7 +154,7 @@ jobs: AMD_LOG_LEVEL: 4 ROCM_KPACK_DEBUG: "1" run: | - pytest tests/ -m "not amdsmi_tests_default_unblocking_for_sanity" --log-cli-level=info --timeout=300 + pytest tests/ -m "not amdsmi_tests_default_unblocking_for_sanity" --log-cli-level=info - name: Run ROCm AMDSMI Default-Unblocking-For-Sanity Tests id: amdsmi_tests_default_unblocking_for_sanity_tests @@ -166,7 +166,7 @@ jobs: AMD_LOG_LEVEL: 4 ROCM_KPACK_DEBUG: "1" run: | - pytest tests/ -m "amdsmi_tests_default_unblocking_for_sanity" --log-cli-level=info --timeout=300 + pytest tests/ -m "amdsmi_tests_default_unblocking_for_sanity" --log-cli-level=info - name: Post-job cleanup processes on Windows if: ${{ always() && runner.os == 'Windows' }} From f39a95e88027bdebe2f7b27f6f1b0fb1da64cd30 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Fri, 13 Mar 2026 18:36:43 +0530 Subject: [PATCH 15/21] Add timeout parameter to pytest commands in test_sanity_check.yml --- .github/workflows/test_sanity_check.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_sanity_check.yml b/.github/workflows/test_sanity_check.yml index 57ad349dcd0..708a8e23d29 100644 --- a/.github/workflows/test_sanity_check.yml +++ b/.github/workflows/test_sanity_check.yml @@ -154,7 +154,7 @@ jobs: AMD_LOG_LEVEL: 4 ROCM_KPACK_DEBUG: "1" run: | - pytest tests/ -m "not amdsmi_tests_default_unblocking_for_sanity" --log-cli-level=info + pytest tests/ -m "not amdsmi_tests_default_unblocking_for_sanity" --log-cli-level=info --timeout=600 - name: Run ROCm AMDSMI Default-Unblocking-For-Sanity Tests id: amdsmi_tests_default_unblocking_for_sanity_tests @@ -166,7 +166,7 @@ jobs: AMD_LOG_LEVEL: 4 ROCM_KPACK_DEBUG: "1" run: | - pytest tests/ -m "amdsmi_tests_default_unblocking_for_sanity" --log-cli-level=info + pytest tests/ -m "amdsmi_tests_default_unblocking_for_sanity" --log-cli-level=info --timeout=600 - name: Post-job cleanup processes on Windows if: ${{ always() && runner.os == 'Windows' }} From f2eec65f1353a47fb9603ef2c59aeeaee1e63751 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Sat, 14 Mar 2026 00:55:34 +0530 Subject: [PATCH 16/21] Refactor amd-smi CLI tests --- .github/workflows/test_sanity_check.yml | 175 -------------- .../fetch_test_configurations.py | 2 +- .../test_executable_scripts/test_sanity.py | 11 +- conftest.py | 9 +- tests/test_amdsmi_cli.py | 218 +++++++++++++++++ tests/test_rocm_sanity.py | 219 ------------------ 6 files changed, 234 insertions(+), 400 deletions(-) delete mode 100644 .github/workflows/test_sanity_check.yml create mode 100644 tests/test_amdsmi_cli.py diff --git a/.github/workflows/test_sanity_check.yml b/.github/workflows/test_sanity_check.yml deleted file mode 100644 index 708a8e23d29..00000000000 --- a/.github/workflows/test_sanity_check.yml +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright Advanced Micro Devices, Inc. -# SPDX-License-Identifier: MIT - -name: TheRock Sanity Check - -on: - workflow_dispatch: - inputs: - artifact_group: - type: string - artifact_run_id: - type: string - default: "" - amdgpu_families: - type: string - default: "" - amdgpu_targets: - type: string - default: "" - test_runs_on: - type: string - platform: - type: string - amdsmi_tests_default_unblocking_for_sanity_blocking: - type: boolean - description: 'If true, treat amdsmi default-unblocking tests as blockers (do not continue on error)' - default: false - workflow_call: - inputs: - artifact_group: - type: string - artifact_run_id: - type: string - default: "" - amdgpu_families: - type: string - default: "" - amdgpu_targets: - type: string - default: "" - test_runs_on: - type: string - platform: - type: string - amdsmi_tests_default_unblocking_for_sanity_blocking: - type: boolean - description: 'If true, treat amdsmi default-unblocking tests as blockers (do not continue on error)' - default: false - push: - branches: - - ADHOCBUILD - -permissions: - contents: read - -jobs: - test_sanity_check: - name: "Sanity ROCM Test (${{ inputs.amdgpu_families }})" - runs-on: ${{ inputs.test_runs_on }} - # Running docker with cap-add and -v /lib/modiles, by recommendation of Github: https://rocm.docs.amd.com/projects/amdsmi/en/amd-staging/how-to/setup-docker-container.html - container: - image: ${{ inputs.platform == 'linux' && 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98' || null }} - # --ulimit memlock=-1:-1 - Prevents memory allocation issues with ROCm inside container - # --security-opt seccomp=unconfined - enables memory mapping, and is recommended for containers running in HPC environments - # --env-file /etc/podinfo/gha-gpu-isolation-settings - Required for GPU isolation on OSSCI MIXXX runners - # --user 0:0 - Running as root, by recommendation of GitHub: https://docs.github.com/en/actions/reference/workflows-and-actions/dockerfile-support#user - options: --ipc host - --group-add video - --device /dev/kfd - --device /dev/dri - --group-add 992 - --group-add 110 - --cap-add SYS_MODULE - -v /lib/modules:/lib/modules - --ulimit memlock=-1:-1 - --security-opt seccomp=unconfined - --env-file /etc/podinfo/gha-gpu-isolation-settings - --user 0:0 - defaults: - run: - shell: bash - env: - VENV_DIR: ${{ github.workspace }}/.venv - ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}" - OUTPUT_ARTIFACTS_DIR: ${{ github.workspace }}/build - THEROCK_BIN_DIR: ${{ github.workspace }}/build/bin - AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} - AMDGPU_TARGETS: ${{ inputs.amdgpu_targets }} - ARTIFACT_GROUP: ${{ inputs.artifact_group }} - steps: - - name: "Fetch 'build_tools' from repository" - if: ${{ runner.os == 'Windows' }} - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - sparse-checkout: build_tools - path: prejob - - - name: Pre-job cleanup processes on Windows - if: ${{ runner.os == 'Windows' }} - timeout-minutes: 5 - shell: powershell - run: . '${{ github.workspace }}\prejob\build_tools\github_actions\cleanup_processes.ps1' - - - name: Checkout Repository - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - repository: "ROCm/TheRock" - - - name: Pre-job cleanup Docker containers on Linux - if: ${{ runner.os == 'Linux' }} - timeout-minutes: 5 - shell: bash - run: | - # Remove any stopped containers - docker container prune -f || true - # Remove dangling networks - docker network prune -f || true - - - name: Run setup test environment workflow - timeout-minutes: 15 - uses: './.github/actions/setup_test_environment' - with: - ARTIFACT_GROUP: ${{ inputs.artifact_group }} - AMDGPU_TARGETS: ${{ inputs.amdgpu_targets }} - ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }} - OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }} - VENV_DIR: ${{ env.VENV_DIR }} - FETCH_ARTIFACT_ARGS: "--base-only" - IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} - - # The sanity checks run tools like 'offload-arch' which may search for - # DLLs on multiple search paths (PATH, CWD, system32, etc.). - # For typical "installs" of ROCm, the rocm/bin/ dir can be expected to be - # added to PATH, so we do that here. If we don't do this, DLLs on test - # runners in system32 may be picked up instead and the tests may not be - # representative, see https://github.com/ROCm/TheRock/issues/2019 and - # https://github.com/ROCm/TheRock/pull/3230#issuecomment-3844854922. - - name: Set PATH and HIP_CLANG_PATH for windows - if: ${{ runner.os == 'Windows' }} - run: | - echo "HIP_CLANG_PATH=${OUTPUT_ARTIFACTS_DIR}\lib\llvm\bin" >> $GITHUB_ENV - echo "${OUTPUT_ARTIFACTS_DIR}\bin" >> $GITHUB_PATH - - - name: Driver / GPU sanity check - timeout-minutes: 3 - run: | - python ./build_tools/print_driver_gpu_info.py - - - name: Run ROCm Blocking Sanity Tests - timeout-minutes: 10 - env: - # Enable verbose logging, see - # https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/debugging.html - AMD_LOG_LEVEL: 4 - ROCM_KPACK_DEBUG: "1" - run: | - pytest tests/ -m "not amdsmi_tests_default_unblocking_for_sanity" --log-cli-level=info --timeout=600 - - - name: Run ROCm AMDSMI Default-Unblocking-For-Sanity Tests - id: amdsmi_tests_default_unblocking_for_sanity_tests - continue-on-error: ${{ inputs.amdsmi_tests_default_unblocking_for_sanity_blocking == false }} - timeout-minutes: 10 - env: - # Enable verbose logging, see - # https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/debugging.html - AMD_LOG_LEVEL: 4 - ROCM_KPACK_DEBUG: "1" - run: | - pytest tests/ -m "amdsmi_tests_default_unblocking_for_sanity" --log-cli-level=info --timeout=600 - - - name: Post-job cleanup processes on Windows - if: ${{ always() && runner.os == 'Windows' }} - timeout-minutes: 5 - shell: powershell - run: . '${{ github.workspace }}\build_tools\github_actions\cleanup_processes.ps1' diff --git a/build_tools/github_actions/fetch_test_configurations.py b/build_tools/github_actions/fetch_test_configurations.py index bcb6d65be84..dd002c05f02 100644 --- a/build_tools/github_actions/fetch_test_configurations.py +++ b/build_tools/github_actions/fetch_test_configurations.py @@ -183,7 +183,7 @@ def _get_script_path(script_name: str) -> str: "job_name": "amdsmi_cli", "fetch_artifact_args": "--base-only", "timeout_minutes": 15, - "test_script": f"pytest {_get_script_path('test_amdsmi_cli.py')} -o log_cli=true --log-cli-level=INFO", + "test_script": "pytest tests/test_amdsmi_cli.py -m not_sanity -o log_cli=true --log-cli-level=INFO", "platform": ["linux"], "total_shards_dict": { "linux": 1, diff --git a/build_tools/github_actions/test_executable_scripts/test_sanity.py b/build_tools/github_actions/test_executable_scripts/test_sanity.py index e54a0e20607..eb154db1785 100644 --- a/build_tools/github_actions/test_executable_scripts/test_sanity.py +++ b/build_tools/github_actions/test_executable_scripts/test_sanity.py @@ -12,6 +12,11 @@ SCRIPT_DIR = Path(__file__).resolve().parent THEROCK_DIR = SCRIPT_DIR.parent.parent.parent + +def _run_pytest(cmd: list[str], *, cwd: Path, env: dict[str, str], check: bool) -> subprocess.CompletedProcess[str]: + logging.info("++ Exec [%s]$ %s", cwd, " ".join(cmd)) + return subprocess.run(cmd, cwd=cwd, env=env, check=check, text=True) + env = os.environ.copy() # Enable verbose ROCm logging, see # https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/debugging.html @@ -39,6 +44,6 @@ "--timeout=300", ] -logging.info(f"++ Exec [{THEROCK_DIR}]$ {' '.join(cmd)}") - -subprocess.run(cmd, cwd=THEROCK_DIR, env=env, check=True) +# Default sanity behavior: run everything except tests marked as not_sanity. +phase_cmd = cmd + ["-m", "not not_sanity"] +_run_pytest(phase_cmd, cwd=THEROCK_DIR, env=env, check=True) diff --git a/conftest.py b/conftest.py index 60d88e647a7..a2aeb752037 100644 --- a/conftest.py +++ b/conftest.py @@ -1,8 +1,13 @@ def pytest_configure(config): config.addinivalue_line( "markers", - "amdsmi_tests_default_unblocking_for_sanity: marks tests as default-unblocking for amdsmi sanity (amdsmi_tests_default_unblocking_for_sanity)", + "not_sanity: marks tests that must not run in sanity gating", ) config.addinivalue_line( - "markers", "amd_smi: marks tests that exercise the amd-smi CLI" + "markers", + "amd_smi: marks tests that exercise amd-smi", + ) + config.addinivalue_line( + "markers", + "amd_smi_cli: marks amd-smi CLI tests", ) diff --git a/tests/test_amdsmi_cli.py b/tests/test_amdsmi_cli.py new file mode 100644 index 00000000000..61a78a55130 --- /dev/null +++ b/tests/test_amdsmi_cli.py @@ -0,0 +1,218 @@ +# Copyright Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + +"""amd-smi CLI tests.""" + +import csv +import json +import logging +import os +import platform +import re +import subprocess +from pathlib import Path + +import pytest + +logger = logging.getLogger(__name__) + + +def is_windows() -> bool: + return platform.system().lower() == "windows" + + +def _amd_smi_path() -> Path: + therock_bin_dir_env = os.getenv("THEROCK_BIN_DIR") + if not therock_bin_dir_env: + pytest.fail("THEROCK_BIN_DIR not set; failing amd-smi CLI tests") + + amd_smi_bin_path = (Path(therock_bin_dir_env).expanduser().resolve()) / "amd-smi" + if not amd_smi_bin_path.exists(): + pytest.fail(f"amd-smi not found at {amd_smi_bin_path}") + if not os.access(amd_smi_bin_path, os.X_OK): + pytest.fail(f"amd-smi is not executable: {amd_smi_bin_path}") + return amd_smi_bin_path + + +def _run_amd_smi(subcommands: list[str]) -> tuple[int, str, str]: + amd_smi_bin = _amd_smi_path() + cmd = [str(amd_smi_bin)] + list(subcommands) + logger.info("Running amd-smi: %s", cmd) + proc = subprocess.run(cmd, capture_output=True, text=True) + return proc.returncode, proc.stdout, proc.stderr + + +def _parse_gpu_blocks(text_output: str) -> list[str]: + gpu_blocks: list[str] = [] + current_block_lines: list[str] | None = None + for line in text_output.splitlines(): + if re.search(r"GPU:\s+(\d+)", line) or re.search(r"GPU\s+(\d+):", line): + if current_block_lines is not None: + gpu_blocks.append("\n".join(current_block_lines)) + current_block_lines = [line] + continue + if current_block_lines is not None: + current_block_lines.append(line) + if current_block_lines is not None: + gpu_blocks.append("\n".join(current_block_lines)) + return gpu_blocks + + +def _validate_human_readable_gpu_block(human_readable_gpu_block_text: str) -> list[str]: + missing_fields: list[str] = [] + if not re.search(r"\s*BDF:\s*.+", human_readable_gpu_block_text): + missing_fields.append("BDF") + if not re.search(r"\s*UUID:\s*.+", human_readable_gpu_block_text): + missing_fields.append("UUID") + if not re.search(r"\s*KFD_ID:\s*\d+", human_readable_gpu_block_text): + missing_fields.append("KFD_ID") + if not re.search(r"\s*NODE_ID:\s*\d+", human_readable_gpu_block_text): + missing_fields.append("NODE_ID") + if not re.search(r"\s*PARTITION_ID:\s*\d+", human_readable_gpu_block_text): + missing_fields.append("PARTITION_ID") + return missing_fields + + +def _validate_json(gpu_obj: dict) -> list[str]: + missing_fields: list[str] = [] + if "gpu" not in gpu_obj or not isinstance(gpu_obj.get("gpu"), int): + missing_fields.append("gpu") + if "bdf" not in gpu_obj or not isinstance(gpu_obj.get("bdf"), str): + missing_fields.append("bdf") + if "uuid" not in gpu_obj or not isinstance(gpu_obj.get("uuid"), str): + missing_fields.append("uuid") + if "kfd_id" not in gpu_obj or not isinstance(gpu_obj.get("kfd_id"), int): + missing_fields.append("kfd_id") + if "node_id" not in gpu_obj or not isinstance(gpu_obj.get("node_id"), int): + missing_fields.append("node_id") + if "partition_id" not in gpu_obj or not isinstance(gpu_obj.get("partition_id"), int): + missing_fields.append("partition_id") + return missing_fields + + +def _validate_csv_row(csv_row: dict) -> list[str]: + missing_fields: list[str] = [] + try: + if "gpu" not in csv_row or int(csv_row.get("gpu", "")) < 0: + missing_fields.append("gpu") + except Exception: + missing_fields.append("gpu") + if not csv_row.get("gpu_bdf"): + missing_fields.append("gpu_bdf") + if not csv_row.get("gpu_uuid"): + missing_fields.append("gpu_uuid") + try: + if "kfd_id" not in csv_row or int(csv_row.get("kfd_id", "")) < 0: + missing_fields.append("kfd_id") + except Exception: + missing_fields.append("kfd_id") + try: + if "node_id" not in csv_row or int(csv_row.get("node_id", "")) < 0: + missing_fields.append("node_id") + except Exception: + missing_fields.append("node_id") + try: + if "partition_id" not in csv_row or int(csv_row.get("partition_id", "")) < 0: + missing_fields.append("partition_id") + except Exception: + missing_fields.append("partition_id") + return missing_fields + + +AMDGPU_FAMILIES = os.getenv("AMDGPU_FAMILIES") + + +# Module-wide: these are amd-smi CLI tests. +pytestmark = [pytest.mark.amd_smi, pytest.mark.amd_smi_cli] + + +@pytest.mark.skipif(is_windows(), reason="amd-smi CLI not supported on Windows") +@pytest.mark.skipif( + AMDGPU_FAMILIES == "gfx1151", reason="Linux gfx1151 does not support amdsmi yet" +) +def test_amd_smi_blocks() -> None: + """Sanity-gating check: amd-smi list succeeds and reports at least one GPU.""" + return_code, stdout_text, stderr_text = _run_amd_smi(["list"]) + assert ( + return_code == 0 + ), f"amd-smi failed rc={return_code} stderr={stderr_text} stdout={stdout_text}" + + gpu_blocks = _parse_gpu_blocks(stdout_text) + assert gpu_blocks, "No GPU blocks found in amd-smi output" + + +@pytest.mark.not_sanity +@pytest.mark.skipif(is_windows(), reason="amd-smi CLI not supported on Windows") +@pytest.mark.skipif( + AMDGPU_FAMILIES == "gfx1151", reason="Linux gfx1151 does not support amdsmi yet" +) +@pytest.mark.parametrize( + "mod_args", + [ + ([], None), + (["--json"], None), + (["--csv"], None), + (["--file"], "human"), + (["--json", "--file"], "json"), + (["--csv", "--file"], "csv"), + ], + ids=[ + "human-stdout", + "json-stdout", + "csv-stdout", + "human-file", + "json-file", + "csv-file", + ], +) +def test_amd_smi_list(mod_args, tmp_path: Path) -> None: + modifiers, expected_output_mode = mod_args + + output_file_path: Path | None = None + invocation_args = list(modifiers) + if "--file" in invocation_args: + output_file_path = tmp_path / "amdsmi_out.txt" + invocation_args = [a for a in invocation_args if a != "--file"] + invocation_args.extend(["--file", str(output_file_path)]) + + return_code, stdout_text, stderr_text = _run_amd_smi(["list"] + invocation_args) + assert ( + return_code == 0 + ), f"amd-smi failed rc={return_code} stderr={stderr_text} stdout={stdout_text}" + + if output_file_path is not None: + assert stdout_text.strip() == "", f"Expected no stdout with --file, got: {stdout_text}" + assert output_file_path.exists(), "Expected output file to be created" + content_text = output_file_path.read_text(encoding="utf-8", errors="replace") + else: + content_text = stdout_text + + if expected_output_mode == "json" or ("--json" in modifiers and expected_output_mode is None): + try: + json_data = json.loads(content_text) + except Exception as e: + pytest.fail(f"Failed to parse JSON output: {e}\nContent:\n{content_text}") + assert isinstance(json_data, list) and json_data, "Expected non-empty JSON array" + for index, gpu_obj in enumerate(json_data): + missing_fields = _validate_json(gpu_obj) + assert not missing_fields, f"JSON GPU entry {index} missing fields: {missing_fields}" + + elif expected_output_mode == "csv" or ("--csv" in modifiers and expected_output_mode is None): + try: + csv_reader = csv.DictReader(content_text.splitlines()) + csv_rows = list(csv_reader) + except Exception as e: + pytest.fail(f"Failed to parse CSV output: {e}\nContent:\n{content_text}") + assert csv_rows, "Expected at least one CSV row" + for index, csv_row in enumerate(csv_rows): + missing_fields = _validate_csv_row(csv_row) + assert not missing_fields, f"CSV row {index} missing fields: {missing_fields}" + + else: + gpu_blocks = _parse_gpu_blocks(content_text) + assert gpu_blocks, "No GPU blocks found in amd-smi human output" + for index, human_readable_gpu_block in enumerate(gpu_blocks): + missing_fields = _validate_human_readable_gpu_block(human_readable_gpu_block) + assert ( + not missing_fields + ), f"Human-readable GPU block {index} missing fields: {missing_fields}\nBlock:\n{human_readable_gpu_block}" diff --git a/tests/test_rocm_sanity.py b/tests/test_rocm_sanity.py index 36a4f946fc5..47c41f03e55 100644 --- a/tests/test_rocm_sanity.py +++ b/tests/test_rocm_sanity.py @@ -1,7 +1,5 @@ # Copyright Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import json -import csv from pathlib import Path from pytest_check import check import logging @@ -24,109 +22,6 @@ from github_actions_utils import is_asan -def _amd_smi_path() -> Path: - therock_bin_dir_env = os.getenv("THEROCK_BIN_DIR") - if not therock_bin_dir_env: - pytest.fail("THEROCK_BIN_DIR not set; failing amdsmi tests") - - amd_smi_bin_path = (Path(therock_bin_dir_env).expanduser().resolve()) / "amd-smi" - if not amd_smi_bin_path.exists(): - pytest.fail(f"amd-smi not found at {amd_smi_bin_path}") - if not os.access(amd_smi_bin_path, os.X_OK): - pytest.fail(f"amd-smi is not executable: {amd_smi_bin_path}") - return amd_smi_bin_path - - -def _run_amd_smi(subcommands: list[str]) -> tuple[int, str, str]: - amd_smi_bin = _amd_smi_path() - cmd = [str(amd_smi_bin)] + list(subcommands) - logger.info("Running amd-smi: %s", cmd) - proc = subprocess.run(cmd, capture_output=True, text=True) - logger.info("amd-smi returncode=%s", proc.returncode) - logger.info("amd-smi stdout:\n%s", proc.stdout) - logger.info("amd-smi stderr:\n%s", proc.stderr) - return proc.returncode, proc.stdout, proc.stderr - - -def _parse_gpu_blocks(text_output: str) -> list[str]: - gpu_blocks = [] - current_block_lines = None - for line in text_output.splitlines(): - if re.search(r"GPU:\s+(\d+)", line) or re.search(r"GPU\s+(\d+):", line): - if current_block_lines is not None: - gpu_blocks.append("\n".join(current_block_lines)) - current_block_lines = [line] - continue - if current_block_lines is not None: - current_block_lines.append(line) - if current_block_lines is not None: - gpu_blocks.append("\n".join(current_block_lines)) - return gpu_blocks - - -def _validate_human_readable_gpu_block(human_readable_gpu_block_text: str) -> list[str]: - missing_fields = [] - if not re.search(r"\s*BDF:\s*.+", human_readable_gpu_block_text): - missing_fields.append("BDF") - if not re.search(r"\s*UUID:\s*.+", human_readable_gpu_block_text): - missing_fields.append("UUID") - if not re.search(r"\s*KFD_ID:\s*\d+", human_readable_gpu_block_text): - missing_fields.append("KFD_ID") - if not re.search(r"\s*NODE_ID:\s*\d+", human_readable_gpu_block_text): - missing_fields.append("NODE_ID") - if not re.search(r"\s*PARTITION_ID:\s*\d+", human_readable_gpu_block_text): - missing_fields.append("PARTITION_ID") - return missing_fields - - -def _validate_json(gpu_obj: dict) -> list[str]: - missing_fields = [] - if "gpu" not in gpu_obj or not isinstance(gpu_obj.get("gpu"), int): - missing_fields.append("gpu") - if "bdf" not in gpu_obj or not isinstance(gpu_obj.get("bdf"), str): - missing_fields.append("bdf") - if "uuid" not in gpu_obj or not isinstance(gpu_obj.get("uuid"), str): - missing_fields.append("uuid") - if "kfd_id" not in gpu_obj or not isinstance(gpu_obj.get("kfd_id"), int): - missing_fields.append("kfd_id") - if "node_id" not in gpu_obj or not isinstance(gpu_obj.get("node_id"), int): - missing_fields.append("node_id") - if "partition_id" not in gpu_obj or not isinstance( - gpu_obj.get("partition_id"), int - ): - missing_fields.append("partition_id") - return missing_fields - - -def _validate_csv_row(csv_row: dict) -> list[str]: - missing_fields = [] - try: - if "gpu" not in csv_row or int(csv_row.get("gpu", "")) < 0: - missing_fields.append("gpu") - except Exception: - missing_fields.append("gpu") - if not csv_row.get("gpu_bdf"): - missing_fields.append("gpu_bdf") - if not csv_row.get("gpu_uuid"): - missing_fields.append("gpu_uuid") - try: - if "kfd_id" not in csv_row or int(csv_row.get("kfd_id", "")) < 0: - missing_fields.append("kfd_id") - except Exception: - missing_fields.append("kfd_id") - try: - if "node_id" not in csv_row or int(csv_row.get("node_id", "")) < 0: - missing_fields.append("node_id") - except Exception: - missing_fields.append("node_id") - try: - if "partition_id" not in csv_row or int(csv_row.get("partition_id", "")) < 0: - missing_fields.append("partition_id") - except Exception: - missing_fields.append("partition_id") - return missing_fields - - def is_windows(): return "windows" == platform.system().lower() @@ -331,117 +226,3 @@ def test_amdsmi_suite(self): print(f"[amdsmitst-summary] {line}") check.equal(process.returncode, 0) - - @pytest.mark.skipif(is_windows(), reason="amd-smi CLI not supported on Windows") - @pytest.mark.skipif( - AMDGPU_FAMILIES == "gfx1151", reason="Linux gfx1151 does not support amdsmi yet" - ) - @pytest.mark.amd_smi - def test_amd_smi_blocks(self): - """Blocking check: `amd-smi list` prints GPU blocks and they are non-empty. - - This is a lightweight blocking gate: it only asserts that GPU blocks - exist and contain some text, without validating specific fields. - """ - return_code, stdout_text, stderr_text = _run_amd_smi(["list"]) - assert ( - return_code == 0 - ), f"amd-smi failed rc={return_code} stderr={stderr_text} stdout={stdout_text}" - - gpu_blocks = _parse_gpu_blocks(stdout_text) - assert gpu_blocks, "No GPU blocks found in amd-smi output" - for index, block in enumerate(gpu_blocks): - assert block.strip(), f"GPU block {index} is empty" - - @pytest.mark.amd_smi - @pytest.mark.amdsmi_tests_default_unblocking_for_sanity - @pytest.mark.parametrize( - "mod_args", - [ - ([], None), # human readable on stdout - (["--json"], None), - (["--csv"], None), - (["--file"], "human"), - (["--json", "--file"], "json"), - (["--csv", "--file"], "csv"), - ], - ids=[ - "human-stdout", - "json-stdout", - "csv-stdout", - "human-file", - "json-file", - "csv-file", - ], - ) - def test_amd_smi_list(self, mod_args, tmp_path): - modifiers, expected_output_mode = mod_args - - output_file_path = None - invocation_args = list(modifiers) - if "--file" in invocation_args: - output_file_path = tmp_path / "amdsmi_out.txt" - invocation_args = [a for a in invocation_args if a != "--file"] - invocation_args.extend(["--file", str(output_file_path)]) - - return_code, stdout_text, stderr_text = _run_amd_smi(["list"] + invocation_args) - assert ( - return_code == 0 - ), f"amd-smi failed rc={return_code} stderr={stderr_text} stdout={stdout_text}" - - if output_file_path is not None: - assert ( - stdout_text.strip() == "" - ), f"Expected no stdout when using --file, got: {stdout_text}" - assert output_file_path.exists(), "Expected output file to be created" - content_text = output_file_path.read_text( - encoding="utf-8", errors="replace" - ) - else: - content_text = stdout_text - - if expected_output_mode == "json" or ( - "--json" in modifiers and expected_output_mode is None - ): - try: - json_data = json.loads(content_text) - except Exception as e: - pytest.fail( - f"Failed to parse JSON output: {e}\nContent:\n{content_text}" - ) - assert ( - isinstance(json_data, list) and json_data - ), "Expected non-empty JSON array" - for index, gpu_obj in enumerate(json_data): - missing_fields = _validate_json(gpu_obj) - assert ( - not missing_fields - ), f"JSON GPU entry {index} missing fields: {missing_fields}" - - elif expected_output_mode == "csv" or ( - "--csv" in modifiers and expected_output_mode is None - ): - try: - csv_reader = csv.DictReader(content_text.splitlines()) - csv_rows = list(csv_reader) - except Exception as e: - pytest.fail( - f"Failed to parse CSV output: {e}\nContent:\n{content_text}" - ) - assert csv_rows, "Expected at least one CSV row" - for index, csv_row in enumerate(csv_rows): - missing_fields = _validate_csv_row(csv_row) - assert ( - not missing_fields - ), f"CSV row {index} missing fields: {missing_fields}" - - else: - gpu_blocks = _parse_gpu_blocks(content_text) - assert gpu_blocks, "No GPU blocks found in amd-smi human output" - for index, human_readable_gpu_block in enumerate(gpu_blocks): - missing_fields = _validate_human_readable_gpu_block( - human_readable_gpu_block - ) - assert ( - not missing_fields - ), f"Human-readable GPU block {index} missing fields: {missing_fields}\nBlock:\n{human_readable_gpu_block}" From 8bfba375f46553133796fde40e7fa73fb18eab69 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Sat, 14 Mar 2026 01:01:36 +0530 Subject: [PATCH 17/21] clean up --- .../github_actions/fetch_test_configurations.py | 11 ----------- tests/test_rocm_sanity.py | 3 +++ 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/build_tools/github_actions/fetch_test_configurations.py b/build_tools/github_actions/fetch_test_configurations.py index dd002c05f02..4f17329db38 100644 --- a/build_tools/github_actions/fetch_test_configurations.py +++ b/build_tools/github_actions/fetch_test_configurations.py @@ -168,17 +168,6 @@ def _get_script_path(script_name: str) -> str: "windows": 2, }, }, - "rocprofiler_systems": { - "job_name": "rocprofiler_systems", - "fetch_artifact_args": "--rocprofiler-systems --rocprofiler-sdk --tests", - "timeout_minutes": 15, - "test_script": f"python {_get_script_path('test_rocprofiler_systems.py')}", - "platform": ["linux"], - "total_shards_dict": { - "linux": 1, - "windows": 1, - }, - }, "amdsmi_cli": { "job_name": "amdsmi_cli", "fetch_artifact_args": "--base-only", diff --git a/tests/test_rocm_sanity.py b/tests/test_rocm_sanity.py index 47c41f03e55..233ec8d3abd 100644 --- a/tests/test_rocm_sanity.py +++ b/tests/test_rocm_sanity.py @@ -1,5 +1,6 @@ # Copyright Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT + from pathlib import Path from pytest_check import check import logging @@ -14,7 +15,9 @@ THIS_DIR = Path(__file__).resolve().parent logger = logging.getLogger(__name__) + THEROCK_BIN_DIR = Path(os.getenv("THEROCK_BIN_DIR")).resolve() + AMDGPU_FAMILIES = os.getenv("AMDGPU_FAMILIES") # Importing is_asan from github_actions_utils.py From 69cc2848173be5da0d5ddc1583823b9f8af6d049 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Sat, 14 Mar 2026 01:06:09 +0530 Subject: [PATCH 18/21] Refactor code for improved readability and consistency in test scripts --- .../test_executable_scripts/test_sanity.py | 5 ++- tests/test_amdsmi_cli.py | 32 ++++++++++++++----- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/build_tools/github_actions/test_executable_scripts/test_sanity.py b/build_tools/github_actions/test_executable_scripts/test_sanity.py index eb154db1785..823b29c7ec7 100644 --- a/build_tools/github_actions/test_executable_scripts/test_sanity.py +++ b/build_tools/github_actions/test_executable_scripts/test_sanity.py @@ -13,10 +13,13 @@ THEROCK_DIR = SCRIPT_DIR.parent.parent.parent -def _run_pytest(cmd: list[str], *, cwd: Path, env: dict[str, str], check: bool) -> subprocess.CompletedProcess[str]: +def _run_pytest( + cmd: list[str], *, cwd: Path, env: dict[str, str], check: bool +) -> subprocess.CompletedProcess[str]: logging.info("++ Exec [%s]$ %s", cwd, " ".join(cmd)) return subprocess.run(cmd, cwd=cwd, env=env, check=check, text=True) + env = os.environ.copy() # Enable verbose ROCm logging, see # https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/debugging.html diff --git a/tests/test_amdsmi_cli.py b/tests/test_amdsmi_cli.py index 61a78a55130..e176073f41e 100644 --- a/tests/test_amdsmi_cli.py +++ b/tests/test_amdsmi_cli.py @@ -85,7 +85,9 @@ def _validate_json(gpu_obj: dict) -> list[str]: missing_fields.append("kfd_id") if "node_id" not in gpu_obj or not isinstance(gpu_obj.get("node_id"), int): missing_fields.append("node_id") - if "partition_id" not in gpu_obj or not isinstance(gpu_obj.get("partition_id"), int): + if "partition_id" not in gpu_obj or not isinstance( + gpu_obj.get("partition_id"), int + ): missing_fields.append("partition_id") return missing_fields @@ -181,23 +183,33 @@ def test_amd_smi_list(mod_args, tmp_path: Path) -> None: ), f"amd-smi failed rc={return_code} stderr={stderr_text} stdout={stdout_text}" if output_file_path is not None: - assert stdout_text.strip() == "", f"Expected no stdout with --file, got: {stdout_text}" + assert ( + stdout_text.strip() == "" + ), f"Expected no stdout with --file, got: {stdout_text}" assert output_file_path.exists(), "Expected output file to be created" content_text = output_file_path.read_text(encoding="utf-8", errors="replace") else: content_text = stdout_text - if expected_output_mode == "json" or ("--json" in modifiers and expected_output_mode is None): + if expected_output_mode == "json" or ( + "--json" in modifiers and expected_output_mode is None + ): try: json_data = json.loads(content_text) except Exception as e: pytest.fail(f"Failed to parse JSON output: {e}\nContent:\n{content_text}") - assert isinstance(json_data, list) and json_data, "Expected non-empty JSON array" + assert ( + isinstance(json_data, list) and json_data + ), "Expected non-empty JSON array" for index, gpu_obj in enumerate(json_data): missing_fields = _validate_json(gpu_obj) - assert not missing_fields, f"JSON GPU entry {index} missing fields: {missing_fields}" + assert ( + not missing_fields + ), f"JSON GPU entry {index} missing fields: {missing_fields}" - elif expected_output_mode == "csv" or ("--csv" in modifiers and expected_output_mode is None): + elif expected_output_mode == "csv" or ( + "--csv" in modifiers and expected_output_mode is None + ): try: csv_reader = csv.DictReader(content_text.splitlines()) csv_rows = list(csv_reader) @@ -206,13 +218,17 @@ def test_amd_smi_list(mod_args, tmp_path: Path) -> None: assert csv_rows, "Expected at least one CSV row" for index, csv_row in enumerate(csv_rows): missing_fields = _validate_csv_row(csv_row) - assert not missing_fields, f"CSV row {index} missing fields: {missing_fields}" + assert ( + not missing_fields + ), f"CSV row {index} missing fields: {missing_fields}" else: gpu_blocks = _parse_gpu_blocks(content_text) assert gpu_blocks, "No GPU blocks found in amd-smi human output" for index, human_readable_gpu_block in enumerate(gpu_blocks): - missing_fields = _validate_human_readable_gpu_block(human_readable_gpu_block) + missing_fields = _validate_human_readable_gpu_block( + human_readable_gpu_block + ) assert ( not missing_fields ), f"Human-readable GPU block {index} missing fields: {missing_fields}\nBlock:\n{human_readable_gpu_block}" From 053d2bb9b3d0541016c2aefb9d5d27caae97cc6c Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Sat, 14 Mar 2026 01:33:41 +0530 Subject: [PATCH 19/21] Enhance logging in _run_amd_smi function to include return code and output --- tests/test_amdsmi_cli.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_amdsmi_cli.py b/tests/test_amdsmi_cli.py index e176073f41e..31eadd22e7c 100644 --- a/tests/test_amdsmi_cli.py +++ b/tests/test_amdsmi_cli.py @@ -39,6 +39,11 @@ def _run_amd_smi(subcommands: list[str]) -> tuple[int, str, str]: cmd = [str(amd_smi_bin)] + list(subcommands) logger.info("Running amd-smi: %s", cmd) proc = subprocess.run(cmd, capture_output=True, text=True) + logger.info("amd-smi returncode=%s", proc.returncode) + if proc.stdout: + logger.info("amd-smi stdout:\n%s", proc.stdout) + if proc.stderr: + logger.info("amd-smi stderr:\n%s", proc.stderr) return proc.returncode, proc.stdout, proc.stderr From d3aa5f61be9820d98f5f2e8bb36aa9cce53dddcf Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Sat, 14 Mar 2026 11:19:37 +0530 Subject: [PATCH 20/21] Remove amdsmi_cli tests and related configurations; refactor amd-smi handling in ROCm sanity tests --- .../fetch_test_configurations.py | 10 - conftest.py | 13 - tests/test_amdsmi_cli.py | 239 ------------------ tests/test_rocm_sanity.py | 208 +++++++++++++++ 4 files changed, 208 insertions(+), 262 deletions(-) delete mode 100644 conftest.py delete mode 100644 tests/test_amdsmi_cli.py diff --git a/build_tools/github_actions/fetch_test_configurations.py b/build_tools/github_actions/fetch_test_configurations.py index 4f17329db38..b82a1a219b9 100644 --- a/build_tools/github_actions/fetch_test_configurations.py +++ b/build_tools/github_actions/fetch_test_configurations.py @@ -168,16 +168,6 @@ def _get_script_path(script_name: str) -> str: "windows": 2, }, }, - "amdsmi_cli": { - "job_name": "amdsmi_cli", - "fetch_artifact_args": "--base-only", - "timeout_minutes": 15, - "test_script": "pytest tests/test_amdsmi_cli.py -m not_sanity -o log_cli=true --log-cli-level=INFO", - "platform": ["linux"], - "total_shards_dict": { - "linux": 1, - }, - }, "hipcub": { "job_name": "hipcub", "fetch_artifact_args": "--prim --tests", diff --git a/conftest.py b/conftest.py deleted file mode 100644 index a2aeb752037..00000000000 --- a/conftest.py +++ /dev/null @@ -1,13 +0,0 @@ -def pytest_configure(config): - config.addinivalue_line( - "markers", - "not_sanity: marks tests that must not run in sanity gating", - ) - config.addinivalue_line( - "markers", - "amd_smi: marks tests that exercise amd-smi", - ) - config.addinivalue_line( - "markers", - "amd_smi_cli: marks amd-smi CLI tests", - ) diff --git a/tests/test_amdsmi_cli.py b/tests/test_amdsmi_cli.py deleted file mode 100644 index 31eadd22e7c..00000000000 --- a/tests/test_amdsmi_cli.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright Advanced Micro Devices, Inc. -# SPDX-License-Identifier: MIT - -"""amd-smi CLI tests.""" - -import csv -import json -import logging -import os -import platform -import re -import subprocess -from pathlib import Path - -import pytest - -logger = logging.getLogger(__name__) - - -def is_windows() -> bool: - return platform.system().lower() == "windows" - - -def _amd_smi_path() -> Path: - therock_bin_dir_env = os.getenv("THEROCK_BIN_DIR") - if not therock_bin_dir_env: - pytest.fail("THEROCK_BIN_DIR not set; failing amd-smi CLI tests") - - amd_smi_bin_path = (Path(therock_bin_dir_env).expanduser().resolve()) / "amd-smi" - if not amd_smi_bin_path.exists(): - pytest.fail(f"amd-smi not found at {amd_smi_bin_path}") - if not os.access(amd_smi_bin_path, os.X_OK): - pytest.fail(f"amd-smi is not executable: {amd_smi_bin_path}") - return amd_smi_bin_path - - -def _run_amd_smi(subcommands: list[str]) -> tuple[int, str, str]: - amd_smi_bin = _amd_smi_path() - cmd = [str(amd_smi_bin)] + list(subcommands) - logger.info("Running amd-smi: %s", cmd) - proc = subprocess.run(cmd, capture_output=True, text=True) - logger.info("amd-smi returncode=%s", proc.returncode) - if proc.stdout: - logger.info("amd-smi stdout:\n%s", proc.stdout) - if proc.stderr: - logger.info("amd-smi stderr:\n%s", proc.stderr) - return proc.returncode, proc.stdout, proc.stderr - - -def _parse_gpu_blocks(text_output: str) -> list[str]: - gpu_blocks: list[str] = [] - current_block_lines: list[str] | None = None - for line in text_output.splitlines(): - if re.search(r"GPU:\s+(\d+)", line) or re.search(r"GPU\s+(\d+):", line): - if current_block_lines is not None: - gpu_blocks.append("\n".join(current_block_lines)) - current_block_lines = [line] - continue - if current_block_lines is not None: - current_block_lines.append(line) - if current_block_lines is not None: - gpu_blocks.append("\n".join(current_block_lines)) - return gpu_blocks - - -def _validate_human_readable_gpu_block(human_readable_gpu_block_text: str) -> list[str]: - missing_fields: list[str] = [] - if not re.search(r"\s*BDF:\s*.+", human_readable_gpu_block_text): - missing_fields.append("BDF") - if not re.search(r"\s*UUID:\s*.+", human_readable_gpu_block_text): - missing_fields.append("UUID") - if not re.search(r"\s*KFD_ID:\s*\d+", human_readable_gpu_block_text): - missing_fields.append("KFD_ID") - if not re.search(r"\s*NODE_ID:\s*\d+", human_readable_gpu_block_text): - missing_fields.append("NODE_ID") - if not re.search(r"\s*PARTITION_ID:\s*\d+", human_readable_gpu_block_text): - missing_fields.append("PARTITION_ID") - return missing_fields - - -def _validate_json(gpu_obj: dict) -> list[str]: - missing_fields: list[str] = [] - if "gpu" not in gpu_obj or not isinstance(gpu_obj.get("gpu"), int): - missing_fields.append("gpu") - if "bdf" not in gpu_obj or not isinstance(gpu_obj.get("bdf"), str): - missing_fields.append("bdf") - if "uuid" not in gpu_obj or not isinstance(gpu_obj.get("uuid"), str): - missing_fields.append("uuid") - if "kfd_id" not in gpu_obj or not isinstance(gpu_obj.get("kfd_id"), int): - missing_fields.append("kfd_id") - if "node_id" not in gpu_obj or not isinstance(gpu_obj.get("node_id"), int): - missing_fields.append("node_id") - if "partition_id" not in gpu_obj or not isinstance( - gpu_obj.get("partition_id"), int - ): - missing_fields.append("partition_id") - return missing_fields - - -def _validate_csv_row(csv_row: dict) -> list[str]: - missing_fields: list[str] = [] - try: - if "gpu" not in csv_row or int(csv_row.get("gpu", "")) < 0: - missing_fields.append("gpu") - except Exception: - missing_fields.append("gpu") - if not csv_row.get("gpu_bdf"): - missing_fields.append("gpu_bdf") - if not csv_row.get("gpu_uuid"): - missing_fields.append("gpu_uuid") - try: - if "kfd_id" not in csv_row or int(csv_row.get("kfd_id", "")) < 0: - missing_fields.append("kfd_id") - except Exception: - missing_fields.append("kfd_id") - try: - if "node_id" not in csv_row or int(csv_row.get("node_id", "")) < 0: - missing_fields.append("node_id") - except Exception: - missing_fields.append("node_id") - try: - if "partition_id" not in csv_row or int(csv_row.get("partition_id", "")) < 0: - missing_fields.append("partition_id") - except Exception: - missing_fields.append("partition_id") - return missing_fields - - -AMDGPU_FAMILIES = os.getenv("AMDGPU_FAMILIES") - - -# Module-wide: these are amd-smi CLI tests. -pytestmark = [pytest.mark.amd_smi, pytest.mark.amd_smi_cli] - - -@pytest.mark.skipif(is_windows(), reason="amd-smi CLI not supported on Windows") -@pytest.mark.skipif( - AMDGPU_FAMILIES == "gfx1151", reason="Linux gfx1151 does not support amdsmi yet" -) -def test_amd_smi_blocks() -> None: - """Sanity-gating check: amd-smi list succeeds and reports at least one GPU.""" - return_code, stdout_text, stderr_text = _run_amd_smi(["list"]) - assert ( - return_code == 0 - ), f"amd-smi failed rc={return_code} stderr={stderr_text} stdout={stdout_text}" - - gpu_blocks = _parse_gpu_blocks(stdout_text) - assert gpu_blocks, "No GPU blocks found in amd-smi output" - - -@pytest.mark.not_sanity -@pytest.mark.skipif(is_windows(), reason="amd-smi CLI not supported on Windows") -@pytest.mark.skipif( - AMDGPU_FAMILIES == "gfx1151", reason="Linux gfx1151 does not support amdsmi yet" -) -@pytest.mark.parametrize( - "mod_args", - [ - ([], None), - (["--json"], None), - (["--csv"], None), - (["--file"], "human"), - (["--json", "--file"], "json"), - (["--csv", "--file"], "csv"), - ], - ids=[ - "human-stdout", - "json-stdout", - "csv-stdout", - "human-file", - "json-file", - "csv-file", - ], -) -def test_amd_smi_list(mod_args, tmp_path: Path) -> None: - modifiers, expected_output_mode = mod_args - - output_file_path: Path | None = None - invocation_args = list(modifiers) - if "--file" in invocation_args: - output_file_path = tmp_path / "amdsmi_out.txt" - invocation_args = [a for a in invocation_args if a != "--file"] - invocation_args.extend(["--file", str(output_file_path)]) - - return_code, stdout_text, stderr_text = _run_amd_smi(["list"] + invocation_args) - assert ( - return_code == 0 - ), f"amd-smi failed rc={return_code} stderr={stderr_text} stdout={stdout_text}" - - if output_file_path is not None: - assert ( - stdout_text.strip() == "" - ), f"Expected no stdout with --file, got: {stdout_text}" - assert output_file_path.exists(), "Expected output file to be created" - content_text = output_file_path.read_text(encoding="utf-8", errors="replace") - else: - content_text = stdout_text - - if expected_output_mode == "json" or ( - "--json" in modifiers and expected_output_mode is None - ): - try: - json_data = json.loads(content_text) - except Exception as e: - pytest.fail(f"Failed to parse JSON output: {e}\nContent:\n{content_text}") - assert ( - isinstance(json_data, list) and json_data - ), "Expected non-empty JSON array" - for index, gpu_obj in enumerate(json_data): - missing_fields = _validate_json(gpu_obj) - assert ( - not missing_fields - ), f"JSON GPU entry {index} missing fields: {missing_fields}" - - elif expected_output_mode == "csv" or ( - "--csv" in modifiers and expected_output_mode is None - ): - try: - csv_reader = csv.DictReader(content_text.splitlines()) - csv_rows = list(csv_reader) - except Exception as e: - pytest.fail(f"Failed to parse CSV output: {e}\nContent:\n{content_text}") - assert csv_rows, "Expected at least one CSV row" - for index, csv_row in enumerate(csv_rows): - missing_fields = _validate_csv_row(csv_row) - assert ( - not missing_fields - ), f"CSV row {index} missing fields: {missing_fields}" - - else: - gpu_blocks = _parse_gpu_blocks(content_text) - assert gpu_blocks, "No GPU blocks found in amd-smi human output" - for index, human_readable_gpu_block in enumerate(gpu_blocks): - missing_fields = _validate_human_readable_gpu_block( - human_readable_gpu_block - ) - assert ( - not missing_fields - ), f"Human-readable GPU block {index} missing fields: {missing_fields}\nBlock:\n{human_readable_gpu_block}" diff --git a/tests/test_rocm_sanity.py b/tests/test_rocm_sanity.py index 233ec8d3abd..7a1bdf57d52 100644 --- a/tests/test_rocm_sanity.py +++ b/tests/test_rocm_sanity.py @@ -1,6 +1,8 @@ # Copyright Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT +import csv +import json from pathlib import Path from pytest_check import check import logging @@ -25,6 +27,117 @@ from github_actions_utils import is_asan +def _amd_smi_path() -> Path: + therock_bin_dir_env = os.getenv("THEROCK_BIN_DIR") + if not therock_bin_dir_env: + pytest.fail("THEROCK_BIN_DIR not set; failing amd-smi CLI tests") + + amd_smi_bin_path = (Path(therock_bin_dir_env).expanduser().resolve()) / "amd-smi" + if not amd_smi_bin_path.exists(): + pytest.fail(f"amd-smi not found at {amd_smi_bin_path}") + if not os.access(amd_smi_bin_path, os.X_OK): + pytest.fail(f"amd-smi is not executable: {amd_smi_bin_path}") + return amd_smi_bin_path + + +def _run_amd_smi(subcommands: list[str]) -> tuple[int, str, str]: + amd_smi_bin = _amd_smi_path() + cmd = [str(amd_smi_bin)] + list(subcommands) + logger.info("Running amd-smi: %s", cmd) + proc = subprocess.run(cmd, capture_output=True, text=True) + logger.info("amd-smi returncode=%s", proc.returncode) + if proc.returncode != 0: + if proc.stdout: + logger.error("amd-smi stdout:\n%s", proc.stdout) + if proc.stderr: + logger.error("amd-smi stderr:\n%s", proc.stderr) + else: + if proc.stdout: + logger.info("amd-smi stdout:\n%s", proc.stdout) + if proc.stderr: + logger.error("amd-smi stderr (unexpected on success):\n%s", proc.stderr) + return proc.returncode, proc.stdout, proc.stderr + + +def _parse_gpu_blocks(text_output: str) -> list[str]: + gpu_blocks: list[str] = [] + current_block_lines: list[str] | None = None + for line in text_output.splitlines(): + if re.search(r"GPU:\s+(\d+)", line) or re.search(r"GPU\s+(\d+):", line): + if current_block_lines is not None: + gpu_blocks.append("\n".join(current_block_lines)) + current_block_lines = [line] + continue + if current_block_lines is not None: + current_block_lines.append(line) + if current_block_lines is not None: + gpu_blocks.append("\n".join(current_block_lines)) + return gpu_blocks + + +def _validate_human_readable_gpu_block(human_readable_gpu_block_text: str) -> list[str]: + missing_fields: list[str] = [] + if not re.search(r"\s*BDF:\s*.+", human_readable_gpu_block_text): + missing_fields.append("BDF") + if not re.search(r"\s*UUID:\s*.+", human_readable_gpu_block_text): + missing_fields.append("UUID") + if not re.search(r"\s*KFD_ID:\s*\d+", human_readable_gpu_block_text): + missing_fields.append("KFD_ID") + if not re.search(r"\s*NODE_ID:\s*\d+", human_readable_gpu_block_text): + missing_fields.append("NODE_ID") + if not re.search(r"\s*PARTITION_ID:\s*\d+", human_readable_gpu_block_text): + missing_fields.append("PARTITION_ID") + return missing_fields + + +def _validate_json(gpu_obj: dict) -> list[str]: + missing_fields: list[str] = [] + if "gpu" not in gpu_obj or not isinstance(gpu_obj.get("gpu"), int): + missing_fields.append("gpu") + if "bdf" not in gpu_obj or not isinstance(gpu_obj.get("bdf"), str): + missing_fields.append("bdf") + if "uuid" not in gpu_obj or not isinstance(gpu_obj.get("uuid"), str): + missing_fields.append("uuid") + if "kfd_id" not in gpu_obj or not isinstance(gpu_obj.get("kfd_id"), int): + missing_fields.append("kfd_id") + if "node_id" not in gpu_obj or not isinstance(gpu_obj.get("node_id"), int): + missing_fields.append("node_id") + if "partition_id" not in gpu_obj or not isinstance( + gpu_obj.get("partition_id"), int + ): + missing_fields.append("partition_id") + return missing_fields + + +def _validate_csv_row(csv_row: dict) -> list[str]: + missing_fields: list[str] = [] + try: + if "gpu" not in csv_row or int(csv_row.get("gpu", "")) < 0: + missing_fields.append("gpu") + except Exception: + missing_fields.append("gpu") + if not csv_row.get("gpu_bdf"): + missing_fields.append("gpu_bdf") + if not csv_row.get("gpu_uuid"): + missing_fields.append("gpu_uuid") + try: + if "kfd_id" not in csv_row or int(csv_row.get("kfd_id", "")) < 0: + missing_fields.append("kfd_id") + except Exception: + missing_fields.append("kfd_id") + try: + if "node_id" not in csv_row or int(csv_row.get("node_id", "")) < 0: + missing_fields.append("node_id") + except Exception: + missing_fields.append("node_id") + try: + if "partition_id" not in csv_row or int(csv_row.get("partition_id", "")) < 0: + missing_fields.append("partition_id") + except Exception: + missing_fields.append("partition_id") + return missing_fields + + def is_windows(): return "windows" == platform.system().lower() @@ -229,3 +342,98 @@ def test_amdsmi_suite(self): print(f"[amdsmitst-summary] {line}") check.equal(process.returncode, 0) + + @pytest.mark.skipif(is_windows(), reason="amd-smi CLI not supported on Windows") + @pytest.mark.skipif( + AMDGPU_FAMILIES == "gfx1151", reason="Linux gfx1151 does not support amdsmi yet" + ) + @pytest.mark.parametrize( + "mod_args", + [ + ([], None), + (["--json"], None), + (["--csv"], None), + (["--file"], "human"), + (["--json", "--file"], "json"), + (["--csv", "--file"], "csv"), + ], + ids=[ + "human-stdout", + "json-stdout", + "csv-stdout", + "human-file", + "json-file", + "csv-file", + ], + ) + def test_amd_smi_list(self, mod_args, tmp_path: Path) -> None: + modifiers, expected_output_mode = mod_args + + output_file_path: Path | None = None + invocation_args = list(modifiers) + if "--file" in invocation_args: + output_file_path = tmp_path / "amdsmi_out.txt" + invocation_args = [a for a in invocation_args if a != "--file"] + invocation_args.extend(["--file", str(output_file_path)]) + + return_code, stdout_text, stderr_text = _run_amd_smi(["list"] + invocation_args) + assert ( + return_code == 0 + ), f"amd-smi failed rc={return_code} stderr={stderr_text} stdout={stdout_text}" + + if output_file_path is not None: + assert ( + stdout_text.strip() == "" + ), f"Expected no stdout with --file, got: {stdout_text}" + assert output_file_path.exists(), "Expected output file to be created" + content_text = output_file_path.read_text( + encoding="utf-8", errors="replace" + ) + else: + content_text = stdout_text + + if expected_output_mode == "json" or ( + "--json" in modifiers and expected_output_mode is None + ): + try: + json_data = json.loads(content_text) + except Exception as e: + pytest.fail( + f"Failed to parse JSON output: {e}\nContent:\n{content_text}" + ) + assert ( + isinstance(json_data, list) and json_data + ), "Expected non-empty JSON array" + for index, gpu_obj in enumerate(json_data): + missing_fields = _validate_json(gpu_obj) + assert ( + not missing_fields + ), f"JSON GPU entry {index} missing fields: {missing_fields}" + + elif expected_output_mode == "csv" or ( + "--csv" in modifiers and expected_output_mode is None + ): + try: + csv_reader = csv.DictReader(content_text.splitlines()) + csv_rows = list(csv_reader) + except Exception as e: + pytest.fail( + f"Failed to parse CSV output: {e}\nContent:\n{content_text}" + ) + assert csv_rows, "Expected at least one CSV row" + for index, csv_row in enumerate(csv_rows): + missing_fields = _validate_csv_row(csv_row) + assert ( + not missing_fields + ), f"CSV row {index} missing fields: {missing_fields}" + + else: + gpu_blocks = _parse_gpu_blocks(content_text) + assert gpu_blocks, "No GPU blocks found in amd-smi human output" + for index, human_readable_gpu_block in enumerate(gpu_blocks): + missing_fields = _validate_human_readable_gpu_block( + human_readable_gpu_block + ) + assert ( + not missing_fields + ), f"Human-readable GPU block {index} missing fields: {missing_fields}\nBlock:\n{human_readable_gpu_block}" From a794b26c62d0265c60be89eccbe4aebe7475d0a1 Mon Sep 17 00:00:00 2001 From: HRISHIKESH THULA Date: Sat, 14 Mar 2026 11:33:01 +0530 Subject: [PATCH 21/21] Refactor test_sanity.py by removing the _run_pytest function and directly executing pytest command --- .../test_executable_scripts/test_sanity.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/build_tools/github_actions/test_executable_scripts/test_sanity.py b/build_tools/github_actions/test_executable_scripts/test_sanity.py index 823b29c7ec7..e54a0e20607 100644 --- a/build_tools/github_actions/test_executable_scripts/test_sanity.py +++ b/build_tools/github_actions/test_executable_scripts/test_sanity.py @@ -12,14 +12,6 @@ SCRIPT_DIR = Path(__file__).resolve().parent THEROCK_DIR = SCRIPT_DIR.parent.parent.parent - -def _run_pytest( - cmd: list[str], *, cwd: Path, env: dict[str, str], check: bool -) -> subprocess.CompletedProcess[str]: - logging.info("++ Exec [%s]$ %s", cwd, " ".join(cmd)) - return subprocess.run(cmd, cwd=cwd, env=env, check=check, text=True) - - env = os.environ.copy() # Enable verbose ROCm logging, see # https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/debugging.html @@ -47,6 +39,6 @@ def _run_pytest( "--timeout=300", ] -# Default sanity behavior: run everything except tests marked as not_sanity. -phase_cmd = cmd + ["-m", "not not_sanity"] -_run_pytest(phase_cmd, cwd=THEROCK_DIR, env=env, check=True) +logging.info(f"++ Exec [{THEROCK_DIR}]$ {' '.join(cmd)}") + +subprocess.run(cmd, cwd=THEROCK_DIR, env=env, check=True)