Skip to content

Commit

Permalink
arch: support rocm-smi for get_gpu_info
Browse files Browse the repository at this point in the history
  • Loading branch information
mloubout committed Nov 10, 2023
1 parent 586ee8f commit 39cb6c6
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 2 deletions.
72 changes: 72 additions & 0 deletions devito/arch/archinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import re
import os
import sys
import json

from devito.logger import warning
from devito.tools import as_tuple, all_equal, memoized_func
Expand Down Expand Up @@ -249,6 +250,77 @@ def cbk(deviceid=0):
except OSError:
pass

# *** Second try: `rocm-smi`, clearly only works with AMD cards
try:
gpu_infos = {}

# Base gpu info
info_cmd = ['rocm-smi', '--showproductname']
proc = Popen(info_cmd, stdout=PIPE, stderr=DEVNULL)
raw_info = str(proc.stdout.read())

lines = raw_info.replace('\\n', '\n').replace('b\'', '').replace('\\t', '')
lines = lines.splitlines()

for line in lines:
if 'GPU' in line:
# Product
pattern = r'GPU\[(\d+)\].*?Card series:\s*(.*?)\s*$'
match1 = re.match(pattern, line)

if match1:
gid = match1.group(1)
gpu_infos.setdefault(gid, dict())
gpu_infos[gid]['physicalid'] = gid
gpu_infos[gid]['product'] = match1.group(2)

# Model
pattern = r'GPU\[(\d+)\].*?Card model:\s*(.*?)\s*$'
match2 = re.match(pattern, line)

if match2:
gid = match2.group(1)
gpu_infos.setdefault(gid, dict())
gpu_infos[gid]['physicalid'] = match2.group(1)
gpu_infos[gid]['model'] = match2.group(2)

gpu_info = homogenise_gpus(list(gpu_infos.values()))

# Also attach callbacks to retrieve instantaneous memory info
info_cmd = ['rocm-smi', '--showmeminfo', 'vram', '--json']
proc = Popen(info_cmd, stdout=PIPE, stderr=DEVNULL)
raw_info = str(proc.stdout.read())
lines = raw_info.replace('\\n', '').replace('b\'', '').replace('\'', '')
info = json.loads(lines)

for i in ['total', 'free', 'used']:
def make_cbk(i):
    """
    Build a callback returning one instantaneous memory figure for a device.

    Parameters
    ----------
    i : str
        The quantity the callback reports: 'total', 'free' or 'used'.

    Returns
    -------
    callable
        ``cbk(deviceid=0)``, which returns the requested figure as an int,
        or None if it cannot be extracted from ``info``.
    """
    def cbk(deviceid=0):
        """
        Return the memory figure selected by ``i`` for card ``deviceid``.

        Reads ``info`` from the enclosing scope, i.e. the parsed JSON output
        of ``rocm-smi --showmeminfo vram --json``.
        """
        try:
            card = info['card%s' % deviceid]

            # Should only contain Used and total. Checked explicitly rather
            # than with `assert`, which is stripped when running with `-O`
            if len(card) != 2:
                return None

            # The entry whose key mentions 'Used' is the used memory; the
            # other one is the total memory
            used = [int(v) for k, v in card.items() if 'Used' in k][0]
            total = [int(v) for k, v in card.items() if 'Used' not in k][0]
            free = total - used

            return {'total': total, 'free': free, 'used': used}[i]
        except (KeyError, IndexError, TypeError, ValueError, AttributeError):
            # We shouldn't really end up here, unless rocm-smi changes
            # the output format (though we still have tests in place that
            # will catch this)
            return None

    return cbk

gpu_info['mem.%s' % i] = make_cbk(i)

gpu_infos['architecture'] = 'AMD'
return gpu_info

except OSError:
pass

# *** Third try: `lshw`
try:
info_cmd = ['lshw', '-C', 'video']
Expand Down
4 changes: 2 additions & 2 deletions tests/test_gpu_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ class TestGPUInfo(object):

def test_get_gpu_info(self):
info = get_gpu_info()
known = ['nvidia', 'tesla', 'geforce', 'quadro', 'unspecified']
known = ['nvidia', 'tesla', 'geforce', 'quadro', 'amd', 'unspecified']
try:
assert info['architecture'].lower() in known
except KeyError:
# There might be more than one GPU, but for now we don't care
# as we're not really exploiting this info yet...
pass
pytest.xfail("Unsupported platform for get_gpu_info")


class TestCodeGeneration(object):
Expand Down

0 comments on commit 39cb6c6

Please sign in to comment.