Skip to content

Commit

Permalink
arch: support rocm-smi for get_gpu_info
Browse files Browse the repository at this point in the history
  • Loading branch information
mloubout committed Nov 10, 2023
1 parent 9a89f7c commit 94e7c92
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 4 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pytest-core-nompi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ jobs:
- name: pytest-ubuntu-py39-gcc9-omp
python-version: '3.9'
os: ubuntu-20.04
arch: "gcc-9"
arch: "custom"
language: "openmp"
sympy: "1.9"

Expand Down Expand Up @@ -140,7 +140,7 @@ jobs:
id: set-run

- name: Install ${{ matrix.arch }} compiler
if: "runner.os == 'linux' && !contains(matrix.name, 'docker')"
if: "runner.os == 'linux' && !contains(matrix.name, 'docker') && matrix.arch !='custom' "
run : |
sudo apt-get install -y ${{ matrix.arch }}
Expand Down
72 changes: 72 additions & 0 deletions devito/arch/archinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import re
import os
import sys
import json

from devito.logger import warning
from devito.tools import as_tuple, all_equal, memoized_func
Expand Down Expand Up @@ -249,6 +250,77 @@ def cbk(deviceid=0):
except OSError:
pass

# *** Second try: `rocm-smi`, clearly only works with AMD cards
try:
gpu_infos = {}

# Base gpu info
info_cmd = ['rocm-smi', '--showproductname']
proc = Popen(info_cmd, stdout=PIPE, stderr=DEVNULL)
raw_info = str(proc.stdout.read())

lines = raw_info.replace('\\n', '\n').replace('b\'', '').replace('\\t', '')
lines = lines.splitlines()

for line in lines:
if 'GPU' in line:
# Product
pattern = r'GPU\[(\d+)\].*?Card series:\s*(.*?)\s*$'
match1 = re.match(pattern, line)

if match1:
gid = match1.group(1)
gpu_infos.setdefault(gid, dict())
gpu_infos[gid]['physicalid'] = gid
gpu_infos[gid]['product'] = match1.group(2)

# Model
pattern = r'GPU\[(\d+)\].*?Card model:\s*(.*?)\s*$'
match2 = re.match(pattern, line)

if match2:
gid = match2.group(1)
gpu_infos.setdefault(gid, dict())
gpu_infos[gid]['physicalid'] = match2.group(1)
gpu_infos[gid]['model'] = match2.group(2)

gpu_info = homogenise_gpus(list(gpu_infos.values()))

# Also attach callbacks to retrieve instantaneous memory info
info_cmd = ['rocm-smi', '--showmeminfo', 'vram', '--json']
proc = Popen(info_cmd, stdout=PIPE, stderr=DEVNULL)
raw_info = str(proc.stdout.read())
lines = raw_info.replace('\\n', '').replace('b\'', '').replace('\'', '')
info = json.loads(lines)

for i in ['total', 'free', 'used']:
def make_cbk(i):
    """
    Build a callback returning one memory figure of a device.

    Parameters
    ----------
    i : str
        The figure to report: 'total', 'free' or 'used'.

    Returns
    -------
    callable
        ``cbk(deviceid=0)``, which looks up ``'card<deviceid>'`` in the
        enclosing ``info`` mapping and returns the requested figure as an
        int, or None if the entry is missing or has an unexpected layout.
    """
    def cbk(deviceid=0):
        try:
            card = info['card%s' % deviceid]
            # Should only contain Used and total. This is checked explicitly
            # rather than via `assert`, which is stripped under `python -O`
            if len(card) != 2:
                return None
            used = [int(v) for k, v in card.items() if 'Used' in k][0]
            total = [int(v) for k, v in card.items() if 'Used' not in k][0]
            free = total - used
            return {'total': total, 'free': free, 'used': used}[i]
        except (KeyError, IndexError, TypeError, ValueError, AttributeError):
            # We shouldn't really end up here, unless rocm-smi changes
            # the output format
            return None

    return cbk

gpu_info['mem.%s' % i] = make_cbk(i)

gpu_infos['architecture'] = 'AMD'
return gpu_info

except OSError:
pass

# *** Third try: `lshw`
try:
info_cmd = ['lshw', '-C', 'video']
Expand Down
2 changes: 2 additions & 0 deletions devito/mpi/distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from devito.types.utils import DimensionTuple


__all__ = ['CustomTopology']

# Do not prematurely initialize MPI
# This allows launching a Devito program from within another Python program
# that has *already* initialized MPI
Expand Down
17 changes: 15 additions & 2 deletions tests/test_gpu_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,26 @@ class TestGPUInfo(object):

def test_get_gpu_info(self):
# Check that the architecture reported by get_gpu_info() is a known name
info = get_gpu_info()
# NOTE(review): this is a diff rendering -- the first `known` looks like the
# removed line and the second (which adds 'amd') like its replacement
known = ['nvidia', 'tesla', 'geforce', 'quadro', 'unspecified']
known = ['nvidia', 'tesla', 'geforce', 'quadro', 'amd', 'unspecified']
try:
assert info['architecture'].lower() in known
except KeyError:
# There might be more than one GPU, but for now we don't care
# as we're not really exploiting this info yet...
# NOTE(review): `pass` looks like the removed diff line; the `xfail` call
# below is its replacement for the missing-'architecture' case
pass
pytest.xfail("Unsupported platform for get_gpu_info")

def custom_compiler(self):
    """
    Build and run a trivial Operator with the 'custom' compiler selected.
    """
    # NOTE(review): the name lacks the `test_` prefix, so default pytest
    # collection presumably skips this method -- confirm whether intended
    grid = Grid(shape=(4, 4))

    u = TimeFunction(name='u', grid=grid)

    eqn = Eq(u.forward, u + 1)

    with switchconfig(compiler='custom'):
        # Only build the Operator here; it is run exactly once, below.
        # Binding `Operator(eqn)()` would store the result of the call
        # instead of the Operator itself
        op = Operator(eqn)
        # Check jit-compilation and correct execution
        op.apply(time_M=10)
        assert np.all(u.data[1] == 11)


class TestCodeGeneration(object):
Expand Down

0 comments on commit 94e7c92

Please sign in to comment.