From fab87654cbe795e7079eb1b669aa6e4271db5219 Mon Sep 17 00:00:00 2001 From: Leorize Date: Tue, 10 Jun 2025 14:12:35 -0500 Subject: [PATCH 1/4] Only enumerate ROCm-capable AMD GPUs Discover AMD graphics devices using AMDKFD topology instead of enumerating the PCIe bus. This interface exposes a lot more information about potential devices, allowing RamaLama to filter out unsupported devices. Currently, devices older than GFX9 are filtered, as they are no longer supported by ROCm. Signed-off-by: Leorize --- ramalama/common.py | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/ramalama/common.py b/ramalama/common.py index 55cf47bb1..e21bb1c6c 100644 --- a/ramalama/common.py +++ b/ramalama/common.py @@ -447,12 +447,39 @@ def check_ascend(): def check_rocm_amd(): gpu_num = 0 gpu_bytes = 0 - for i, fp in enumerate(sorted(glob.glob('/sys/bus/pci/devices/*/mem_info_vram_total'))): - with open(fp, 'r') as file: - content = int(file.read()) - if content > 1073741824 and content > gpu_bytes: - gpu_bytes = content - gpu_num = i + + def parse_props(path): + with open(path) as file: + return {key: int(value) for key, _, value in (line.partition(' ') for line in file)} + + def kfd_gpus(): + for np in sorted(glob.glob('/sys/devices/virtual/kfd/kfd/topology/nodes/*')): + props = parse_props(np + '/properties') + + # Skip CPUs + if props['gfx_target_version'] == 0: + continue + + yield np, props + + for i, (np, props) in enumerate(kfd_gpus()): + # Radeon GPUs older than gfx900 are not supported by ROCm (e.g. Polaris) + if props['gfx_target_version'] < 90000: + continue + + mem_banks_count = int(props['mem_banks_count']) + mem_bytes = 0 + for bank in range(mem_banks_count): + bank_props = parse_props(np + f'/mem_banks/{bank}/properties') + # See /usr/include/linux/kfd_sysfs.h for possible heap types + # + # Count public and private framebuffer memory as VRAM + if bank_props['heap_type'] in [1, 2]: + mem_bytes += int(bank_props['size_in_bytes']) + + if mem_bytes > 1073741824 and mem_bytes > gpu_bytes: + gpu_bytes = mem_bytes + gpu_num = i if gpu_bytes: os.environ["HIP_VISIBLE_DEVICES"] = str(gpu_num) From ecb9fb086f1c67608692ecea9d692a57d5fc7fec Mon Sep 17 00:00:00 2001 From: Leorize Date: Tue, 10 Jun 2025 15:13:37 -0500 Subject: [PATCH 2/4] Extract amdkfd utilities to its own module Signed-off-by: Leorize --- ramalama/amdkfd.py | 19 +++++++++++++++++++ ramalama/common.py | 20 +++----------------- 2 files changed, 22 insertions(+), 17 deletions(-) create mode 100644 ramalama/amdkfd.py diff --git a/ramalama/amdkfd.py b/ramalama/amdkfd.py new file mode 100644 index 000000000..33066f591 --- /dev/null +++ b/ramalama/amdkfd.py @@ -0,0 +1,19 @@ +"""utilities for working with AMDKFD driver""" + +import glob + +def parse_props(path): + """Returns a dict corresponding to a KFD properties file""" + with open(path) as file: + return {key: int(value) for key, _, value in (line.partition(' ') for line in file)} + +def gpus(): + """Yields GPU nodes within KFD topology and their properties""" + for np in sorted(glob.glob('/sys/devices/virtual/kfd/kfd/topology/nodes/*')): + props = parse_props(np + '/properties') + + # Skip CPUs + if props['gfx_target_version'] == 0: + continue + + yield np, props diff --git a/ramalama/common.py b/ramalama/common.py index e21bb1c6c..a171731e1 100644 --- a/ramalama/common.py +++ b/ramalama/common.py @@ -16,6 +16,7 @@ import urllib.error from typing import List +import ramalama.amdkfd as amdkfd import ramalama.console as console from ramalama.http_client import HttpClient from ramalama.logger import logger @@ -447,22 +448,7 @@ def check_ascend(): def check_rocm_amd(): gpu_num = 0 gpu_bytes = 0 - - def parse_props(path): - with open(path) as file: - return {key: int(value) for key, _, value in (line.partition(' ') for line in file)} - - def kfd_gpus(): - for np in sorted(glob.glob('/sys/devices/virtual/kfd/kfd/topology/nodes/*')): - props = parse_props(np + '/properties') - - # Skip CPUs - if props['gfx_target_version'] == 0: - continue - - yield np, props - - for i, (np, props) in enumerate(kfd_gpus()): + for i, (np, props) in enumerate(amdkfd.gpus()): # Radeon GPUs older than gfx900 are not supported by ROCm (e.g. Polaris) if props['gfx_target_version'] < 90000: continue @@ -470,7 +456,7 @@ def kfd_gpus(): mem_banks_count = int(props['mem_banks_count']) mem_bytes = 0 for bank in range(mem_banks_count): - bank_props = parse_props(np + f'/mem_banks/{bank}/properties') + bank_props = amdkfd.parse_props(np + f'/mem_banks/{bank}/properties') # See /usr/include/linux/kfd_sysfs.h for possible heap types # # Count public and private framebuffer memory as VRAM From 93e36ac24e248359c825bbaaf56d142b9875f0e4 Mon Sep 17 00:00:00 2001 From: Leorize Date: Tue, 10 Jun 2025 15:16:40 -0500 Subject: [PATCH 3/4] Extract VRAM minimum into a constant Signed-off-by: Leorize --- ramalama/common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ramalama/common.py b/ramalama/common.py index a171731e1..a1d80e695 100644 --- a/ramalama/common.py +++ b/ramalama/common.py @@ -36,6 +36,8 @@ DEFAULT_IMAGE = "quay.io/ramalama/ramalama" +MIN_VRAM_BYTES = 1073741824 # 1GiB + _engine = -1 # -1 means cached variable not set yet _nvidia = -1 # -1 means cached variable not set yet @@ -463,7 +465,7 @@ def check_rocm_amd(): if bank_props['heap_type'] in [1, 2]: mem_bytes += int(bank_props['size_in_bytes']) - if mem_bytes > 1073741824 and mem_bytes > gpu_bytes: + if mem_bytes > MIN_VRAM_BYTES and mem_bytes > gpu_bytes: gpu_bytes = mem_bytes gpu_num = i From db4a7d24af46cf8c222732497290f224f90279af Mon Sep 17 00:00:00 2001 From: Leorize Date: Tue, 10 Jun 2025 15:19:53 -0500 Subject: [PATCH 4/4] Apply formatting fixes Signed-off-by: Leorize --- ramalama/amdkfd.py | 2 ++ ramalama/common.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ramalama/amdkfd.py b/ramalama/amdkfd.py index 33066f591..c69919256 100644 --- a/ramalama/amdkfd.py +++ b/ramalama/amdkfd.py @@ -2,11 +2,13 @@ import glob + def parse_props(path): """Returns a dict corresponding to a KFD properties file""" with open(path) as file: return {key: int(value) for key, _, value in (line.partition(' ') for line in file)} + def gpus(): """Yields GPU nodes within KFD topology and their properties""" for np in sorted(glob.glob('/sys/devices/virtual/kfd/kfd/topology/nodes/*')): diff --git a/ramalama/common.py b/ramalama/common.py index a1d80e695..c5a2eb205 100644 --- a/ramalama/common.py +++ b/ramalama/common.py @@ -36,7 +36,7 @@ DEFAULT_IMAGE = "quay.io/ramalama/ramalama" -MIN_VRAM_BYTES = 1073741824 # 1GiB +MIN_VRAM_BYTES = 1073741824 # 1GiB _engine = -1 # -1 means cached variable not set yet