Merge pull request #1828 from devitocodes/admit-cuda-2

compiler: Augment code generation capabilities for CUDA/HIP/SYCL support
devitocodes · Feb 14, 2022 · 41ee245 · 41ee245
2 parents e2321f4 + 220fe2a
commit 41ee245
Show file tree

Hide file tree

Showing 53 changed files with 1,427 additions and 733 deletions.
diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py
@@ -13,6 +13,7 @@
 from devito.tools import as_tuple, all_equal, memoized_func
 
 __all__ = ['platform_registry', 'get_cpu_info', 'get_gpu_info', 'get_nvidia_cc',
+           'check_cuda_runtime',
            'Platform', 'Cpu64', 'Intel64', 'Amd', 'Arm', 'Power', 'Device',
            'NvidiaDevice', 'AmdDevice',
            'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'SKX', 'KNL', 'KNL7210',  # Intel
@@ -354,6 +355,33 @@ def get_nvidia_cc():
         return 10*cc_major.value + cc_minor.value
 
 
+@memoized_func
+def check_cuda_runtime():
+    """
+    Warn if the NVidia driver looks older than the CUDA runtime.
+
+    The CUDA runtime library is loaded via ctypes, trying the Linux, macOS
+    and Windows library names in turn. The versions reported by
+    `cudaDriverGetVersion` and `cudaRuntimeGetVersion` are then compared; a
+    warning is emitted if the driver version is lower than the runtime
+    version, or if the check cannot be carried out at all (library not
+    found, or either query failing). Nothing is returned.
+    """
+    libnames = ('libcudart.so', 'libcudart.dylib', 'cudart.dll')
+    for libname in libnames:
+        try:
+            cuda = ctypes.CDLL(libname)
+        except OSError:
+            continue
+        else:
+            break
+    else:
+        # None of the candidate libraries could be loaded, so `cuda` is
+        # unbound: return now, or the version queries below would fail
+        # with an UnboundLocalError
+        warning("Unable to check compatibility of NVidia driver and runtime")
+        return
+
+    driver_version = ctypes.c_int()
+    runtime_version = ctypes.c_int()
+
+    # A return value of 0 is treated as success for both queries
+    if cuda.cudaDriverGetVersion(ctypes.byref(driver_version)) == 0 and \
+       cuda.cudaRuntimeGetVersion(ctypes.byref(runtime_version)) == 0:
+        driver = driver_version.value
+        runtime = runtime_version.value
+        if driver < runtime:
+            warning("The NVidia driver (v%d) on this system may not be compatible "
+                    "with the CUDA runtime (v%d)" % (driver, runtime))
+    else:
+        warning("Unable to check compatibility of NVidia driver and runtime")
+
+
 @memoized_func
 def lscpu():
     try:

diff --git a/devito/arch/compiler.py b/devito/arch/compiler.py
@@ -12,7 +12,8 @@
 from codepy.jit import compile_from_string
 from codepy.toolchain import GCCToolchain
 
-from devito.arch import AMDGPUX, NVIDIAX, M1, SKX, POWER8, POWER9, get_nvidia_cc
+from devito.arch import (AMDGPUX, NVIDIAX, M1, SKX, POWER8, POWER9, get_nvidia_cc,
+                         check_cuda_runtime)
 from devito.exceptions import CompilationError
 from devito.logger import debug, warning, error
 from devito.parameters import configuration
@@ -495,10 +496,16 @@ def __init__(self, *args, **kwargs):
         self.cflags.remove('-std=c99')
         self.cflags.remove('-Wall')
         self.cflags.remove('-fPIC')
-        self.cflags += ['-std=c++11', '-Xcompiler', '-fPIC']
+        self.cflags += ['-std=c++14', '-Xcompiler', '-fPIC']
 
         self.src_ext = 'cu'
 
+        # NOTE: not sure where we should place this. It definitely needs
+        # to be executed once to warn the user in case there's a CUDA/driver
+        # mismatch that would cause the program to run, but likely producing
+        # garbage, since the CUDA kernel behaviour would be undefined
+        check_cuda_runtime()
+
     def __lookup_cmds__(self):
         self.CC = 'nvcc'
         self.CXX = 'nvcc'

diff --git a/devito/core/cpu.py b/devito/core/cpu.py
@@ -1,6 +1,6 @@
 from functools import partial
 
-from devito.core.operator import CoreOperator, CustomOperator
+from devito.core.operator import CoreOperator, CustomOperator, ParTile
 from devito.exceptions import InvalidOperator
 from devito.passes.equations import collect_derivatives
 from devito.passes.clusters import (Lift, blocking, buffering, cire, cse,
@@ -23,6 +23,17 @@ class Cpu64OperatorMixin(object):
     3 => "blocks", "sub-blocks", and "sub-sub-blocks", ...
     """
 
+    BLOCK_EAGER = True
+    """
+    Apply loop blocking as early as possible, and in particular prior to CIRE.
+    """
+
+    BLOCK_RELAX = False
+    """
+    If set to True, bypass the compiler heuristics that prevent loop blocking in
+    situations where the performance impact might be detrimental.
+    """
+
     CIRE_MINGAIN = 10
     """
     Minimum operation count reduction for a redundant expression to be optimized
@@ -84,7 +95,11 @@ def _normalize_kwargs(cls, **kwargs):
         # Blocking
         o['blockinner'] = oo.pop('blockinner', False)
         o['blocklevels'] = oo.pop('blocklevels', cls.BLOCK_LEVELS)
+        o['blockeager'] = oo.pop('blockeager', cls.BLOCK_EAGER)
+        o['blocklazy'] = oo.pop('blocklazy', not o['blockeager'])
+        o['blockrelax'] = oo.pop('blockrelax', cls.BLOCK_RELAX)
         o['skewing'] = oo.pop('skewing', False)
+        o['par-tile'] = ParTile(oo.pop('par-tile', False), default=16)
 
         # CIRE
         o['min-storage'] = oo.pop('min-storage', False)
@@ -172,7 +187,8 @@ def _specialize_clusters(cls, clusters, **kwargs):
         clusters = Lift().process(clusters)
 
         # Blocking to improve data locality
-        clusters = blocking(clusters, options)
+        if options['blockeager']:
+            clusters = blocking(clusters, sregistry, options)
 
         # Reduce flops
         clusters = extract_increments(clusters, sregistry)
@@ -186,6 +202,10 @@ def _specialize_clusters(cls, clusters, **kwargs):
         # Reduce flops
         clusters = cse(clusters, sregistry)
 
+        # Blocking to improve data locality
+        if options['blocklazy']:
+            clusters = blocking(clusters, sregistry, options)
+
         return clusters
 
     @classmethod
@@ -228,6 +248,8 @@ class Cpu64FsgOperator(Cpu64AdvOperator):
     Operator with performance optimizations tailored "For small grids" ("Fsg").
     """
 
+    BLOCK_EAGER = False
+
     @classmethod
     def _normalize_kwargs(cls, **kwargs):
         kwargs = super()._normalize_kwargs(**kwargs)
@@ -238,40 +260,6 @@ def _normalize_kwargs(cls, **kwargs):
 
         return kwargs
 
-    @classmethod
-    @timed_pass(name='specializing.Clusters')
-    def _specialize_clusters(cls, clusters, **kwargs):
-        options = kwargs['options']
-        platform = kwargs['platform']
-        sregistry = kwargs['sregistry']
-
-        # Optimize MultiSubDomains
-        clusters = optimize_msds(clusters)
-
-        # Toposort+Fusion (the former to expose more fusion opportunities)
-        clusters = fuse(clusters, toposort=True)
-
-        # Hoist and optimize Dimension-invariant sub-expressions
-        clusters = cire(clusters, 'invariants', sregistry, options, platform)
-        clusters = Lift().process(clusters)
-
-        # Reduce flops (potential arithmetic alterations)
-        clusters = extract_increments(clusters, sregistry)
-        clusters = cire(clusters, 'sops', sregistry, options, platform)
-        clusters = factorize(clusters)
-        clusters = optimize_pows(clusters)
-
-        # The previous passes may have created fusion opportunities
-        clusters = fuse(clusters)
-
-        # Reduce flops (no arithmetic alterations)
-        clusters = cse(clusters, sregistry)
-
-        # Blocking to improve data locality
-        clusters = blocking(clusters, options)
-
-        return clusters
-
 
 class Cpu64CustomOperator(Cpu64OperatorMixin, CustomOperator):
 
@@ -299,7 +287,7 @@ def callback(f):
 
         return {
             'buffering': lambda i: buffering(i, callback, sregistry, options),
-            'blocking': lambda i: blocking(i, options),
+            'blocking': lambda i: blocking(i, sregistry, options),
             'factorize': factorize,
             'fission': fission,
             'fuse': lambda i: fuse(i, options=options),

diff --git a/devito/core/gpu.py b/devito/core/gpu.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from devito.core.operator import CoreOperator, CustomOperator
+from devito.core.operator import CoreOperator, CustomOperator, ParTile
 from devito.exceptions import InvalidOperator
 from devito.passes.equations import collect_derivatives
 from devito.passes.clusters import (Lift, Streaming, Tasker, blocking, buffering,
@@ -26,6 +26,17 @@ class DeviceOperatorMixin(object):
     3 => "blocks", "sub-blocks", and "sub-sub-blocks", ...
     """
 
+    BLOCK_EAGER = True
+    """
+    Apply loop blocking as early as possible, and in particular prior to CIRE.
+    """
+
+    BLOCK_RELAX = False
+    """
+    If set to True, bypass the compiler heuristics that prevent loop blocking in
+    situations where the performance impact might be detrimental.
+    """
+
     CIRE_MINGAIN = 10
     """
     Minimum operation count reduction for a redundant expression to be optimized
@@ -67,6 +78,9 @@ def _normalize_kwargs(cls, **kwargs):
         # Blocking
         o['blockinner'] = oo.pop('blockinner', True)
         o['blocklevels'] = oo.pop('blocklevels', cls.BLOCK_LEVELS)
+        o['blockeager'] = oo.pop('blockeager', cls.BLOCK_EAGER)
+        o['blocklazy'] = oo.pop('blocklazy', not o['blockeager'])
+        o['blockrelax'] = oo.pop('blockrelax', cls.BLOCK_RELAX)
         o['skewing'] = oo.pop('skewing', False)
 
         # CIRE
@@ -78,7 +92,7 @@ def _normalize_kwargs(cls, **kwargs):
         o['cire-schedule'] = oo.pop('cire-schedule', cls.CIRE_SCHEDULE)
 
         # GPU parallelism
-        o['par-tile'] = oo.pop('par-tile', False)  # Control tile parallelism
+        o['par-tile'] = ParTile(oo.pop('par-tile', False), default=(32, 4))
         o['par-collapse-ncores'] = 1  # Always collapse (meaningful if `par-tile=False`)
         o['par-collapse-work'] = 1  # Always collapse (meaningful if `par-tile=False`)
         o['par-chunk-nonaffine'] = oo.pop('par-chunk-nonaffine', cls.PAR_CHUNK_NONAFFINE)
@@ -161,8 +175,9 @@ def _specialize_clusters(cls, clusters, **kwargs):
         clusters = cire(clusters, 'invariants', sregistry, options, platform)
         clusters = Lift().process(clusters)
 
-        # Loop tiling
-        clusters = blocking(clusters, options)
+        # Blocking to define thread blocks
+        if options['blockeager']:
+            clusters = blocking(clusters, sregistry, options)
 
         # Reduce flops
         clusters = extract_increments(clusters, sregistry)
@@ -176,6 +191,10 @@ def _specialize_clusters(cls, clusters, **kwargs):
         # Reduce flops
         clusters = cse(clusters, sregistry)
 
+        # Blocking to define thread blocks
+        if options['blocklazy']:
+            clusters = blocking(clusters, sregistry, options)
+
         return clusters
 
     @classmethod
@@ -245,7 +264,7 @@ def callback(f):
 
         return {
             'buffering': lambda i: buffering(i, callback, sregistry, options),
-            'blocking': lambda i: blocking(i, options),
+            'blocking': lambda i: blocking(i, sregistry, options),
             'tasking': Tasker(runs_on_host).process,
             'streaming': Streaming(reads_if_on_host).process,
             'factorize': factorize,

diff --git a/devito/core/operator.py b/devito/core/operator.py
@@ -1,12 +1,16 @@
+from collections.abc import Iterable
+
 from devito.core.autotuning import autotune
 from devito.exceptions import InvalidOperator
 from devito.logger import warning
 from devito.parameters import configuration
 from devito.operator import Operator
-from devito.tools import as_tuple, timed_pass
+from devito.tools import as_tuple, is_integer, timed_pass
 from devito.types import NThreads
 
-__all__ = ['CoreOperator', 'CustomOperator']
+__all__ = ['CoreOperator', 'CustomOperator',
+           # Optimization options
+           'ParTile']
 
 
 class BasicOperator(Operator):
@@ -208,3 +212,68 @@ def _specialize_iet(cls, graph, **kwargs):
             passes_mapper['linearize'](graph)
 
         return graph
+
+
+# Wrappers for optimization options
+
+
+class OptOption(object):
+
+    """
+    Base class for the wrappers around optimization options.
+    """
+
+
+class ParTileArg(tuple):
+
+    """
+    A tuple of values carrying two extra attributes, `shm` and `tag`.
+
+    Parameters
+    ----------
+    items : iterable
+        The values stored in the tuple.
+    shm : optional
+        Stored as-is in the `shm` attribute. Defaults to 0.
+    tag : optional
+        Stored as-is in the `tag` attribute. Defaults to None.
+    """
+
+    def __new__(cls, items, shm=0, tag=None):
+        # Attributes must be attached in `__new__`, since the tuple content
+        # is fixed at creation time
+        instance = super().__new__(cls, items)
+        instance.shm, instance.tag = shm, tag
+        return instance
+
+
+class ParTile(tuple, OptOption):
+
+    """
+    Normalize a `par-tile` option value into a tuple of ParTileArgs.
+
+    Parameters
+    ----------
+    items : bool or tuple
+        If falsy, no object is created and None is returned. If True,
+        `default` is used. If a tuple, one of the following forms:
+
+            * (32, 4, 8)
+            * ((32, 4, 8),)
+            * ((32, 4, 8), 1) or ((32, 4, 8), 1, 'tag')
+            * ((32, 4, 8), (32, 4, 4))
+            * (((32, 4, 8), 1), ((32, 4, 4), 2)), optionally with a
+              trailing tag in each entry
+
+        In the forms carrying extra entries, the second and third values
+        are forwarded to ParTileArg as `shm` and `tag` respectively.
+    default : int or tuple, optional
+        The values to be used when `items` is True.
+
+    Raises
+    ------
+    ValueError
+        If `items` is True but no `default` is given, if the first entry of
+        `items` is an empty iterable or neither an integer nor an iterable,
+        or if `items` is neither a bool nor a tuple.
+    """
+
+    def __new__(cls, items, default=None):
+        if not items:
+            # Option disabled (e.g., False, None, or an empty tuple)
+            return None
+        elif isinstance(items, bool):
+            if not default:
+                raise ValueError("Expected `default` value, got None")
+            items = (ParTileArg(as_tuple(default)),)
+        elif isinstance(items, tuple):
+            # Normalize to tuple of ParTileArgs. Note that `items` cannot be
+            # empty at this point, since falsy values were handled above
+
+            x = items[0]
+            if is_integer(x):
+                # E.g., (32, 4, 8)
+                items = (ParTileArg(items),)
+
+            elif isinstance(x, Iterable):
+                if not x:
+                    raise ValueError("Expected at least one value")
+
+                if len(items) == 1:
+                    # E.g., ((32, 4, 8),)
+                    items = (ParTileArg(x),)
+                elif is_integer(items[1]):
+                    # E.g., ((32, 4, 8), 1)
+                    # E.g., ((32, 4, 8), 1, 'tag')
+                    items = (ParTileArg(*items),)
+                else:
+                    try:
+                        # E.g., (((32, 4, 8), 1), ((32, 4, 4), 2))
+                        # E.g., (((32, 4, 8), 1, 'tag0'), ((32, 4, 4), 2, 'tag1'))
+                        items = tuple(ParTileArg(*i) for i in items)
+                    except TypeError:
+                        # E.g., ((32, 4, 8), (32, 4, 4))
+                        # Unpacking a plain tuple of integers into ParTileArg
+                        # fails, so all entries are rebuilt without unpacking
+                        items = tuple(ParTileArg(i) for i in items)
+            else:
+                raise ValueError("Expected int or tuple, got %s instead" % type(x))
+        else:
+            raise ValueError("Expected bool or tuple, got %s instead" % type(items))
+
+        return super().__new__(cls, items)