Skip to content

Commit

Permalink
Merge pull request #1828 from devitocodes/admit-cuda-2
Browse files Browse the repository at this point in the history
compiler: Augment code generation capabilities for CUDA/HIP/SYCL support
  • Loading branch information
FabioLuporini authored Feb 14, 2022
2 parents e2321f4 + 220fe2a commit 41ee245
Show file tree
Hide file tree
Showing 53 changed files with 1,427 additions and 733 deletions.
28 changes: 28 additions & 0 deletions devito/arch/archinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from devito.tools import as_tuple, all_equal, memoized_func

__all__ = ['platform_registry', 'get_cpu_info', 'get_gpu_info', 'get_nvidia_cc',
'check_cuda_runtime',
'Platform', 'Cpu64', 'Intel64', 'Amd', 'Arm', 'Power', 'Device',
'NvidiaDevice', 'AmdDevice',
'INTEL64', 'SNB', 'IVB', 'HSW', 'BDW', 'SKX', 'KNL', 'KNL7210', # Intel
Expand Down Expand Up @@ -354,6 +355,33 @@ def get_nvidia_cc():
return 10*cc_major.value + cc_minor.value


@memoized_func
def check_cuda_runtime():
    """
    Emit a warning if the NVidia driver looks older than the CUDA runtime.

    The CUDA runtime shared library is searched under a few candidate names
    and loaded through ctypes. The driver and runtime versions are then
    queried and compared. A warning is also emitted, and nothing else is
    done, if the library cannot be loaded or if either query fails.

    Returns
    -------
    None
    """
    libnames = ('libcudart.so', 'libcudart.dylib', 'cudart.dll')
    for libname in libnames:
        try:
            cuda = ctypes.CDLL(libname)
        except OSError:
            continue
        else:
            break
    else:
        # None of the candidate libraries could be loaded, so `cuda` is
        # unbound: bail out instead of falling through to the queries below
        warning("Unable to check compatibility of NVidia driver and runtime")
        return

    driver_version = ctypes.c_int()
    runtime_version = ctypes.c_int()

    # Both calls are compared against 0, taken here as the success status
    if cuda.cudaDriverGetVersion(ctypes.byref(driver_version)) == 0 and \
       cuda.cudaRuntimeGetVersion(ctypes.byref(runtime_version)) == 0:
        driver = driver_version.value
        runtime = runtime_version.value
        if driver < runtime:
            warning("The NVidia driver (v%d) on this system may not be compatible "
                    "with the CUDA runtime (v%d)" % (driver, runtime))
    else:
        warning("Unable to check compatibility of NVidia driver and runtime")


@memoized_func
def lscpu():
try:
Expand Down
11 changes: 9 additions & 2 deletions devito/arch/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
from codepy.jit import compile_from_string
from codepy.toolchain import GCCToolchain

from devito.arch import AMDGPUX, NVIDIAX, M1, SKX, POWER8, POWER9, get_nvidia_cc
from devito.arch import (AMDGPUX, NVIDIAX, M1, SKX, POWER8, POWER9, get_nvidia_cc,
check_cuda_runtime)
from devito.exceptions import CompilationError
from devito.logger import debug, warning, error
from devito.parameters import configuration
Expand Down Expand Up @@ -495,10 +496,16 @@ def __init__(self, *args, **kwargs):
self.cflags.remove('-std=c99')
self.cflags.remove('-Wall')
self.cflags.remove('-fPIC')
self.cflags += ['-std=c++11', '-Xcompiler', '-fPIC']
self.cflags += ['-std=c++14', '-Xcompiler', '-fPIC']

self.src_ext = 'cu'

# NOTE: not sure where we should place this. It definitely needs
# to be executed once to warn the user in case there's a CUDA/driver
# mismatch that would cause the program to run, but likely producing
# garbage, since the CUDA kernel behaviour would be undefined
check_cuda_runtime()

def __lookup_cmds__(self):
self.CC = 'nvcc'
self.CXX = 'nvcc'
Expand Down
62 changes: 25 additions & 37 deletions devito/core/cpu.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from functools import partial

from devito.core.operator import CoreOperator, CustomOperator
from devito.core.operator import CoreOperator, CustomOperator, ParTile
from devito.exceptions import InvalidOperator
from devito.passes.equations import collect_derivatives
from devito.passes.clusters import (Lift, blocking, buffering, cire, cse,
Expand All @@ -23,6 +23,17 @@ class Cpu64OperatorMixin(object):
3 => "blocks", "sub-blocks", and "sub-sub-blocks", ...
"""

BLOCK_EAGER = True
"""
Apply loop blocking as early as possible, and in particular prior to CIRE.
"""

BLOCK_RELAX = False
"""
If set to True, bypass the compiler heuristics that prevent loop blocking in
situations where the performance impact might be detrimental.
"""

CIRE_MINGAIN = 10
"""
Minimum operation count reduction for a redundant expression to be optimized
Expand Down Expand Up @@ -84,7 +95,11 @@ def _normalize_kwargs(cls, **kwargs):
# Blocking
o['blockinner'] = oo.pop('blockinner', False)
o['blocklevels'] = oo.pop('blocklevels', cls.BLOCK_LEVELS)
o['blockeager'] = oo.pop('blockeager', cls.BLOCK_EAGER)
o['blocklazy'] = oo.pop('blocklazy', not o['blockeager'])
o['blockrelax'] = oo.pop('blockrelax', cls.BLOCK_RELAX)
o['skewing'] = oo.pop('skewing', False)
o['par-tile'] = ParTile(oo.pop('par-tile', False), default=16)

# CIRE
o['min-storage'] = oo.pop('min-storage', False)
Expand Down Expand Up @@ -172,7 +187,8 @@ def _specialize_clusters(cls, clusters, **kwargs):
clusters = Lift().process(clusters)

# Blocking to improve data locality
clusters = blocking(clusters, options)
if options['blockeager']:
clusters = blocking(clusters, sregistry, options)

# Reduce flops
clusters = extract_increments(clusters, sregistry)
Expand All @@ -186,6 +202,10 @@ def _specialize_clusters(cls, clusters, **kwargs):
# Reduce flops
clusters = cse(clusters, sregistry)

# Blocking to improve data locality
if options['blocklazy']:
clusters = blocking(clusters, sregistry, options)

return clusters

@classmethod
Expand Down Expand Up @@ -228,6 +248,8 @@ class Cpu64FsgOperator(Cpu64AdvOperator):
Operator with performance optimizations tailored "For small grids" ("Fsg").
"""

BLOCK_EAGER = False

@classmethod
def _normalize_kwargs(cls, **kwargs):
kwargs = super()._normalize_kwargs(**kwargs)
Expand All @@ -238,40 +260,6 @@ def _normalize_kwargs(cls, **kwargs):

return kwargs

@classmethod
@timed_pass(name='specializing.Clusters')
def _specialize_clusters(cls, clusters, **kwargs):
# Apply the Cluster-level optimization passes below, in this exact order,
# and return the resulting clusters. `kwargs` must provide the 'options',
# 'platform' and 'sregistry' entries, which are forwarded to the passes.
# Note that in this pipeline loop blocking is the very last pass, i.e. it
# runs after all of the flop-reducing passes.
options = kwargs['options']
platform = kwargs['platform']
sregistry = kwargs['sregistry']

# Optimize MultiSubDomains
clusters = optimize_msds(clusters)

# Toposort+Fusion (the former to expose more fusion opportunities)
clusters = fuse(clusters, toposort=True)

# Hoist and optimize Dimension-invariant sub-expressions
clusters = cire(clusters, 'invariants', sregistry, options, platform)
clusters = Lift().process(clusters)

# Reduce flops (potential arithmetic alterations)
clusters = extract_increments(clusters, sregistry)
clusters = cire(clusters, 'sops', sregistry, options, platform)
clusters = factorize(clusters)
clusters = optimize_pows(clusters)

# The previous passes may have created fusion opportunities
clusters = fuse(clusters)

# Reduce flops (no arithmetic alterations)
clusters = cse(clusters, sregistry)

# Blocking to improve data locality
clusters = blocking(clusters, options)

return clusters


class Cpu64CustomOperator(Cpu64OperatorMixin, CustomOperator):

Expand Down Expand Up @@ -299,7 +287,7 @@ def callback(f):

return {
'buffering': lambda i: buffering(i, callback, sregistry, options),
'blocking': lambda i: blocking(i, options),
'blocking': lambda i: blocking(i, sregistry, options),
'factorize': factorize,
'fission': fission,
'fuse': lambda i: fuse(i, options=options),
Expand Down
29 changes: 24 additions & 5 deletions devito/core/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import numpy as np

from devito.core.operator import CoreOperator, CustomOperator
from devito.core.operator import CoreOperator, CustomOperator, ParTile
from devito.exceptions import InvalidOperator
from devito.passes.equations import collect_derivatives
from devito.passes.clusters import (Lift, Streaming, Tasker, blocking, buffering,
Expand All @@ -26,6 +26,17 @@ class DeviceOperatorMixin(object):
3 => "blocks", "sub-blocks", and "sub-sub-blocks", ...
"""

BLOCK_EAGER = True
"""
Apply loop blocking as early as possible, and in particular prior to CIRE.
"""

BLOCK_RELAX = False
"""
If set to True, bypass the compiler heuristics that prevent loop blocking in
situations where the performance impact might be detrimental.
"""

CIRE_MINGAIN = 10
"""
Minimum operation count reduction for a redundant expression to be optimized
Expand Down Expand Up @@ -67,6 +78,9 @@ def _normalize_kwargs(cls, **kwargs):
# Blocking
o['blockinner'] = oo.pop('blockinner', True)
o['blocklevels'] = oo.pop('blocklevels', cls.BLOCK_LEVELS)
o['blockeager'] = oo.pop('blockeager', cls.BLOCK_EAGER)
o['blocklazy'] = oo.pop('blocklazy', not o['blockeager'])
o['blockrelax'] = oo.pop('blockrelax', cls.BLOCK_RELAX)
o['skewing'] = oo.pop('skewing', False)

# CIRE
Expand All @@ -78,7 +92,7 @@ def _normalize_kwargs(cls, **kwargs):
o['cire-schedule'] = oo.pop('cire-schedule', cls.CIRE_SCHEDULE)

# GPU parallelism
o['par-tile'] = oo.pop('par-tile', False) # Control tile parallelism
o['par-tile'] = ParTile(oo.pop('par-tile', False), default=(32, 4))
o['par-collapse-ncores'] = 1 # Always collapse (meaningful if `par-tile=False`)
o['par-collapse-work'] = 1 # Always collapse (meaningful if `par-tile=False`)
o['par-chunk-nonaffine'] = oo.pop('par-chunk-nonaffine', cls.PAR_CHUNK_NONAFFINE)
Expand Down Expand Up @@ -161,8 +175,9 @@ def _specialize_clusters(cls, clusters, **kwargs):
clusters = cire(clusters, 'invariants', sregistry, options, platform)
clusters = Lift().process(clusters)

# Loop tiling
clusters = blocking(clusters, options)
# Blocking to define thread blocks
if options['blockeager']:
clusters = blocking(clusters, sregistry, options)

# Reduce flops
clusters = extract_increments(clusters, sregistry)
Expand All @@ -176,6 +191,10 @@ def _specialize_clusters(cls, clusters, **kwargs):
# Reduce flops
clusters = cse(clusters, sregistry)

# Blocking to define thread blocks
if options['blocklazy']:
clusters = blocking(clusters, sregistry, options)

return clusters

@classmethod
Expand Down Expand Up @@ -245,7 +264,7 @@ def callback(f):

return {
'buffering': lambda i: buffering(i, callback, sregistry, options),
'blocking': lambda i: blocking(i, options),
'blocking': lambda i: blocking(i, sregistry, options),
'tasking': Tasker(runs_on_host).process,
'streaming': Streaming(reads_if_on_host).process,
'factorize': factorize,
Expand Down
73 changes: 71 additions & 2 deletions devito/core/operator.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
from collections.abc import Iterable

from devito.core.autotuning import autotune
from devito.exceptions import InvalidOperator
from devito.logger import warning
from devito.parameters import configuration
from devito.operator import Operator
from devito.tools import as_tuple, timed_pass
from devito.tools import as_tuple, is_integer, timed_pass
from devito.types import NThreads

__all__ = ['CoreOperator', 'CustomOperator']
__all__ = ['CoreOperator', 'CustomOperator',
# Optimization options
'ParTile']


class BasicOperator(Operator):
Expand Down Expand Up @@ -208,3 +212,68 @@ def _specialize_iet(cls, graph, **kwargs):
passes_mapper['linearize'](graph)

return graph


# Wrappers for optimization options


class OptOption(object):

    """
    Marker base class for the wrappers of optimization options.
    """


class ParTileArg(tuple):

    """
    A tuple of items carrying two extra attributes, `shm` and `tag`.

    Parameters
    ----------
    items : iterable
        The values stored in the tuple.
    shm : optional
        Stored as the `shm` attribute. Defaults to 0.
    tag : optional
        Stored as the `tag` attribute. Defaults to None.
    """

    def __new__(cls, items, shm=0, tag=None):
        instance = super().__new__(cls, items)
        # The metadata travels along with the tuple as plain attributes
        instance.shm, instance.tag = shm, tag
        return instance


class ParTile(tuple, OptOption):

    """
    Normalized form of a `par-tile` option value: a tuple of ParTileArgs.

    Parameters
    ----------
    items : bool or tuple
        The user-provided value. Any falsy value (e.g., False, None, an empty
        tuple) makes the constructor return None rather than a ParTile.
        True selects `default`. A tuple may be one of the following:

            * ``(32, 4, 8)``: a single tile;
            * ``((32, 4, 8),)``: a single tile;
            * ``((32, 4, 8), 1)`` or ``((32, 4, 8), 1, 'tag')``: a single
              tile followed by the extra ParTileArg arguments;
            * ``((32, 4, 8), (32, 4, 4))``: several tiles;
            * ``(((32, 4, 8), 1), ((32, 4, 4), 2))``, optionally with a
              trailing tag in each entry: several tiles, each with its own
              extra ParTileArg arguments.
    default : optional
        The tile used when `items` is True; it is passed through `as_tuple`.

    Raises
    ------
    ValueError
        If `items` is True but no `default` is given, if `items` is neither
        a bool nor a tuple, or if its first entry is empty or of an
        unsupported type.
    """

    def __new__(cls, items, default=None):
        if not items:
            # Option disabled; note that this also catches the empty tuple
            return None
        elif isinstance(items, bool):
            if not default:
                raise ValueError("Expected `default` value, got None")
            items = (ParTileArg(as_tuple(default)),)
        elif isinstance(items, tuple):
            # Normalize to tuple of ParTileArgs. `items` is non-empty here,
            # since the falsy case was handled above

            x = items[0]
            if is_integer(x):
                # E.g., (32, 4, 8)
                items = (ParTileArg(items),)

            elif isinstance(x, Iterable):
                if not x:
                    raise ValueError("Expected at least one value")

                if len(items) == 1:
                    # E.g., ((32, 4, 8),)
                    items = (ParTileArg(x),)
                elif is_integer(items[1]):
                    # E.g., ((32, 4, 8), 1)
                    # E.g., ((32, 4, 8), 1, 'tag')
                    items = (ParTileArg(*items),)
                else:
                    try:
                        # E.g., (((32, 4, 8), 1), ((32, 4, 4), 2))
                        # E.g., (((32, 4, 8), 1, 'tag0'), ((32, 4, 4), 2, 'tag1'))
                        items = tuple(ParTileArg(*i) for i in items)
                    except TypeError:
                        # E.g., ((32, 4, 8), (32, 4, 4))
                        # Unpacking a plain tile hands an integer to the
                        # ParTileArg constructor, which raises TypeError
                        items = tuple(ParTileArg(i) for i in items)
            else:
                raise ValueError("Expected int or tuple, got %s instead" % type(x))
        else:
            raise ValueError("Expected bool or tuple, got %s instead" % type(items))

        return super().__new__(cls, items)
Loading

0 comments on commit 41ee245

Please sign in to comment.