From d0002b2eeadb00d044f4601472d38cc6ee45a912 Mon Sep 17 00:00:00 2001 From: mloubout Date: Tue, 7 Nov 2023 11:17:35 -0500 Subject: [PATCH] arch: support rocm-smi for get_gpu_info --- .github/workflows/pytest-core-nompi.yml | 6 +-- devito/arch/archinfo.py | 72 +++++++++++++++++++++++++ devito/arch/compiler.py | 7 ++- devito/mpi/distributed.py | 2 + tests/test_gpu_common.py | 17 +++++- 5 files changed, 96 insertions(+), 8 deletions(-) diff --git a/.github/workflows/pytest-core-nompi.yml b/.github/workflows/pytest-core-nompi.yml index 1cbf3d510d..af0127b053 100644 --- a/.github/workflows/pytest-core-nompi.yml +++ b/.github/workflows/pytest-core-nompi.yml @@ -80,7 +80,7 @@ jobs: - name: pytest-ubuntu-py39-gcc9-omp python-version: '3.9' os: ubuntu-20.04 - arch: "gcc-9" + arch: "custom" language: "openmp" sympy: "1.9" @@ -140,7 +140,7 @@ jobs: id: set-run - name: Install ${{ matrix.arch }} compiler - if: "runner.os == 'linux' && !contains(matrix.name, 'docker')" + if: "runner.os == 'linux' && !contains(matrix.name, 'docker') && matrix.arch !='custom' " run : | sudo apt-get install -y ${{ matrix.arch }} @@ -166,8 +166,6 @@ jobs: - name: Test with pytest run: | - ${{ env.RUN_CMD }} ${{ matrix.arch }} --version - ${{ env.RUN_CMD }} python3 --version ${{ env.RUN_CMD }} pytest -k "${{ matrix.test-set }}" -m "not parallel" --cov --cov-config=.coveragerc --cov-report=xml ${{ env.TESTS }} - name: Upload coverage to Codecov diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py index 0a5ac0588d..2b7fdb4b67 100644 --- a/devito/arch/archinfo.py +++ b/devito/arch/archinfo.py @@ -10,6 +10,7 @@ import re import os import sys +import json from devito.logger import warning from devito.tools import as_tuple, all_equal, memoized_func @@ -249,6 +250,77 @@ def cbk(deviceid=0): except OSError: pass + # *** Second try: `rocm-smi`, clearly only works with AMD cards + try: + gpu_infos = {} + + # Base gpu info + info_cmd = ['rocm-smi', '--showproductname'] + proc = Popen(info_cmd, 
stdout=PIPE, stderr=DEVNULL) + raw_info = str(proc.stdout.read()) + + lines = raw_info.replace('\\n', '\n').replace('b\'', '').replace('\\t', '') + lines = lines.splitlines() + + for line in lines: + if 'GPU' in line: + # Product + pattern = r'GPU\[(\d+)\].*?Card series:\s*(.*?)\s*$' + match1 = re.match(pattern, line) + + if match1: + gid = match1.group(1) + gpu_infos.setdefault(gid, dict()) + gpu_infos[gid]['physicalid'] = gid + gpu_infos[gid]['product'] = match1.group(2) + + # Model + pattern = r'GPU\[(\d+)\].*?Card model:\s*(.*?)\s*$' + match2 = re.match(pattern, line) + + if match2: + gid = match2.group(1) + gpu_infos.setdefault(gid, dict()) + gpu_infos[gid]['physicalid'] = match2.group(1) + gpu_infos[gid]['model'] = match2.group(2) + + gpu_info = homogenise_gpus(list(gpu_infos.values())) + + # Also attach callbacks to retrieve instantaneous memory info + info_cmd = ['rocm-smi', '--showmeminfo', 'vram', '--json'] + proc = Popen(info_cmd, stdout=PIPE, stderr=DEVNULL) + raw_info = str(proc.stdout.read()) + lines = raw_info.replace('\\n', '').replace('b\'', '').replace('\'', '') + info = json.loads(lines) + + for i in ['total', 'free', 'used']: + def make_cbk(i): + def cbk(deviceid=0): + try: + # Should only contain Used and total + assert len(info['card%s' % deviceid]) == 2 + used = [int(v) for k, v in info['card%s' % deviceid].items() + if 'Used' in k][0] + total = [int(v) for k, v in info['card%s' % deviceid].items() + if 'Used' not in k][0] + free = total - used + return {'total': total, 'free': free, 'used': used}[i] + except: + # We shouldn't really end up here, unless rocm-smi changes + # the output format (though we still have tests in place that + # will catch this) + return None + + return cbk + + gpu_info['mem.%s' % i] = make_cbk(i) + + gpu_infos['architecture'] = 'AMD' + return gpu_info + + except OSError: + pass + # *** Second try: `lshw` try: info_cmd = ['lshw', '-C', 'video'] diff --git a/devito/arch/compiler.py b/devito/arch/compiler.py index 
7ec1d3da5b..08f509c5fe 100644 --- a/devito/arch/compiler.py +++ b/devito/arch/compiler.py @@ -862,8 +862,8 @@ def __new__(cls, *args, **kwargs): obj = super().__new__(cls) # Keep base to initialize accordingly - obj._base = _base - obj._cpp = _base._cpp + obj._base = kwargs.pop('base', _base) + obj._cpp = obj._base._cpp return obj @@ -891,6 +891,9 @@ def __lookup_cmds__(self): self.MPICC = environ.get('MPICC', self.MPICC) self.MPICXX = environ.get('MPICXX', self.MPICXX) + def __new_with__(self, **kwargs): + return super().__new_with__(base=self._base, **kwargs) + compiler_registry = { 'custom': CustomCompiler, diff --git a/devito/mpi/distributed.py b/devito/mpi/distributed.py index 464a62a15a..9c8bcd0c8e 100644 --- a/devito/mpi/distributed.py +++ b/devito/mpi/distributed.py @@ -17,6 +17,8 @@ from devito.types.utils import DimensionTuple +__all__ = ['CustomTopology'] + # Do not prematurely initialize MPI # This allows launching a Devito program from within another Python program # that has *already* initialized MPI diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index d3164722f2..c6594305fc 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -25,13 +25,26 @@ class TestGPUInfo(object): def test_get_gpu_info(self): info = get_gpu_info() - known = ['nvidia', 'tesla', 'geforce', 'quadro', 'unspecified'] + known = ['nvidia', 'tesla', 'geforce', 'quadro', 'amd', 'unspecified'] try: assert info['architecture'].lower() in known except KeyError: # There might be than one GPUs, but for now we don't care # as we're not really exploiting this info yet... 
- pass + pytest.xfail("Unsupported platform for get_gpu_info") + + def custom_compiler(self): + grid = Grid(shape=(4, 4)) + + u = TimeFunction(name='u', grid=grid) + + eqn = Eq(u.forward, u + 1) + + with switchconfig(compiler='custom'): + op = Operator(eqn)() + # Check jit-compilation and correct execution + op.apply(time_M=10) + assert np.all(u.data[1] == 11) class TestCodeGeneration(object):