# Reconstructed from a whitespace-collapsed patch that touches two files:
#   ramalama/amdkfd.py  (new file)
#   ramalama/common.py  (changed regions only)
"""Utilities for working with the AMDKFD driver, plus the ROCm GPU check that uses them."""

import glob
import os

# ---------------------------------------------------------------------------
# ramalama/amdkfd.py
# ---------------------------------------------------------------------------

# Directory holding one sub-directory per KFD topology node.
KFD_TOPOLOGY_NODES = '/sys/devices/virtual/kfd/kfd/topology/nodes'


def parse_props(path):
    """Return a dict corresponding to a KFD properties file.

    Every non-blank line is read as ``<name> <integer>``; the name becomes the
    key and the integer the value. Blank lines are ignored.

    Raises:
        OSError: if the file cannot be opened or read.
        ValueError: if a line's value is missing or is not an integer.
    """
    props = {}
    with open(path) as file:
        for line in file:
            key, _, value = line.strip().partition(' ')
            if not key:
                # Blank line: nothing to parse (int('') would raise).
                continue
            props[key] = int(value)
    return props


def _node_sort_key(node_path):
    """Order node directories by numeric name; non-numeric names sort last."""
    name = os.path.basename(node_path)
    if name.isdecimal():
        return (0, int(name), name)
    return (1, 0, name)


def gpus(nodes_dir=KFD_TOPOLOGY_NODES):
    """Yield ``(node_path, props)`` for each GPU node within the KFD topology.

    Args:
        nodes_dir: directory containing the topology node directories.
            Defaults to the KFD sysfs location.

    Nodes are visited in numeric order of their directory name (so node 2
    comes before node 10); a plain string sort would interleave them and
    shift the enumeration index that callers derive from this generator.

    Nodes whose ``gfx_target_version`` is 0 (CPUs) or absent are skipped.
    """
    pattern = os.path.join(glob.escape(nodes_dir), '*')
    for np in sorted(glob.glob(pattern), key=_node_sort_key):
        props = parse_props(os.path.join(np, 'properties'))

        # Skip CPUs (and any node that does not report a gfx target version)
        if props.get('gfx_target_version', 0) == 0:
            continue

        yield np, props


# ---------------------------------------------------------------------------
# ramalama/common.py (regions changed by the patch)
# ---------------------------------------------------------------------------

MIN_VRAM_BYTES = 1073741824  # 1GiB

# KFD memory-bank heap types counted as VRAM: public (1) and private (2)
# framebuffer memory. See /usr/include/linux/kfd_sysfs.h for possible heap types.
_VRAM_HEAP_TYPES = (1, 2)


def check_rocm_amd():
    """Pick the ROCm-capable AMD GPU with the most VRAM and expose it to HIP.

    Walks the KFD topology, sums the framebuffer memory banks of every GPU
    that ROCm supports, and, if some GPU has more than ``MIN_VRAM_BYTES`` of
    VRAM, sets ``HIP_VISIBLE_DEVICES`` to the enumeration index of the largest.
    This replaces the earlier scan of
    ``/sys/bus/pci/devices/*/mem_info_vram_total``.
    """
    # Imported here so this listing stays importable without the package;
    # the patch adds this import at the top of common.py.
    import ramalama.amdkfd as amdkfd

    gpu_num = 0
    gpu_bytes = 0
    for i, (np, props) in enumerate(amdkfd.gpus()):
        # Radeon GPUs older than gfx900 are not supported by ROCm (e.g. Polaris)
        if props['gfx_target_version'] < 90000:
            continue

        # parse_props already yields ints, so no further conversion is needed.
        mem_bytes = 0
        for bank in range(props['mem_banks_count']):
            bank_props = amdkfd.parse_props(f'{np}/mem_banks/{bank}/properties')
            if bank_props['heap_type'] in _VRAM_HEAP_TYPES:
                mem_bytes += bank_props['size_in_bytes']

        if mem_bytes > MIN_VRAM_BYTES and mem_bytes > gpu_bytes:
            gpu_bytes = mem_bytes
            gpu_num = i

    if gpu_bytes:
        os.environ["HIP_VISIBLE_DEVICES"] = str(gpu_num)
    # NOTE(review): the visible patch hunk ends here; any remaining lines of
    # this function lie outside the diff context and are not reproduced.