Skip to content

Commit

Permalink
Merge pull request #2138 from devitocodes/fix-udx2-MPI
Browse files Browse the repository at this point in the history
compiler: Misc compiler fixes and improvements -- part II
  • Loading branch information
FabioLuporini authored Jun 7, 2023
2 parents 0678627 + 9636366 commit aedc3b9
Show file tree
Hide file tree
Showing 24 changed files with 1,002 additions and 765 deletions.
12 changes: 7 additions & 5 deletions devito/arch/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,7 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

platform = kwargs.pop('platform', configuration['platform'])

# Graviton flag
if platform is GRAVITON:
self.cflags += ['-mcpu=neoverse-n1']
Expand Down Expand Up @@ -493,13 +494,13 @@ class AOMPCompiler(Compiler):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

language = kwargs.pop('language', configuration['language'])
platform = kwargs.pop('platform', configuration['platform'])

self.cflags += ['-Wno-unused-result', '-Wno-unused-variable']
if not configuration['safe-math']:
self.cflags.append('-ffast-math')

language = kwargs.pop('language', configuration['language'])
platform = kwargs.pop('platform', configuration['platform'])

if platform is NVIDIAX:
self.cflags.remove('-std=c99')
elif platform is AMDGPUX:
Expand Down Expand Up @@ -685,6 +686,7 @@ def __init__(self, *args, **kwargs):

platform = kwargs.pop('platform', configuration['platform'])
language = kwargs.pop('language', configuration['language'])

self.cflags.append("-xHost")

if configuration['safe-math']:
Expand Down Expand Up @@ -730,10 +732,10 @@ class IntelKNLCompiler(IntelCompiler):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

self.cflags.append('-xMIC-AVX512')

language = kwargs.pop('language', configuration['language'])

self.cflags.append('-xMIC-AVX512')

if language != 'openmp':
warning("Running on Intel KNL without OpenMP is highly discouraged")

Expand Down
5 changes: 5 additions & 0 deletions devito/ir/clusters/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,11 @@ def callback(self, clusters, prefix, seen=None):
# be rescheduled after `c` upon topological sorting
points.update(a.access for a in c.scope.accesses if a.is_write)

# Sort for determinism
# NOTE: not sorting might impact code generation. The order of
# the args is important because that's what search functions honor!
points = sorted(points, key=str)

rhs = HaloTouch(*points, halo_scheme=halo_scheme)

# Insert only if not redundant, to avoid useless pollution
Expand Down
2 changes: 1 addition & 1 deletion devito/ir/iet/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def iet_build(stree):
body = HaloSpot(queues.pop(i), i.halo_scheme)

elif i.is_Sync:
body = SyncSpot(i.sync_ops, body=queues.pop(i))
body = SyncSpot(i.sync_ops, body=queues.pop(i, None))

queues.setdefault(i.parent, []).append(body)

Expand Down
23 changes: 17 additions & 6 deletions devito/ir/iet/visitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@


__all__ = ['FindApplications', 'FindNodes', 'FindSections', 'FindSymbols',
'MapExprStmts', 'MapNodes', 'IsPerfectIteration', 'printAST', 'CGen',
'CInterface', 'Transformer', 'Uxreplace']
'MapExprStmts', 'MapHaloSpots', 'MapNodes', 'IsPerfectIteration',
'printAST', 'CGen', 'CInterface', 'Transformer', 'Uxreplace']


class Visitor(GenericVisitor):
Expand Down Expand Up @@ -737,14 +737,17 @@ def visit_Conditional(self, o, ret=None, queue=None):
return ret


class MapExprStmts(FindSections):
class MapKind(FindSections):

"""
Construct a mapper from ExprStmts, i.e. expression statements such as Calls
and Expressions, to their enclosing block (e.g., Iteration, Block).
Base class to construct mappers from Nodes of given type to their enclosing
scope of Nodes.
"""

def visit_ExprStmt(self, o, ret=None, queue=None):
# NOTE: Ideally, we would use a metaclass that dynamically constructs mappers
# for the kind supplied by the caller, but it'd be overkill at the moment

def visit_dummy(self, o, ret=None, queue=None):
if ret is None:
ret = self.default_retval()
ret[o] = as_tuple(queue)
Expand All @@ -754,6 +757,14 @@ def visit_ExprStmt(self, o, ret=None, queue=None):
visit_Block = FindSections.visit_Iteration


class MapExprStmts(MapKind):

    """
    Construct a mapper from ExprStmts, i.e. expression statements such as Calls
    and Expressions, to their enclosing block (e.g., Iteration, Block).
    """

    # Reuse the generic recording handler of MapKind for ExprStmt nodes
    visit_ExprStmt = MapKind.visit_dummy


class MapHaloSpots(MapKind):

    """
    Construct a mapper from HaloSpots to their enclosing scope of Nodes.
    """

    # Reuse the generic recording handler of MapKind for HaloSpot nodes
    visit_HaloSpot = MapKind.visit_dummy


class MapNodes(Visitor):

@classmethod
Expand Down
30 changes: 25 additions & 5 deletions devito/ir/stree/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
from devito.ir.stree.tree import (ScheduleTree, NodeIteration, NodeConditional,
NodeSync, NodeExprs, NodeSection, NodeHalo)
from devito.ir.support import (SEQUENTIAL, Any, Interval, IterationInterval,
IterationSpace, normalize_properties, normalize_syncs)
IterationSpace, WaitLock, normalize_properties,
normalize_syncs)
from devito.mpi.halo_scheme import HaloScheme
from devito.tools import Bunch, DefaultOrderedDict

Expand Down Expand Up @@ -176,15 +177,34 @@ def reuse_partial_subtree(c0, c1, d=None):


def reuse_whole_subtree(c0, c1, d=None):
return (c0.guards.get(d) == c1.guards.get(d) and
c0.syncs.get(d) == c1.syncs.get(d))
if not reuse_partial_subtree(c0, c1, d):
return False

syncs0 = c0.syncs.get(d, [])
syncs1 = c1.syncs.get(d, [])

if syncs0 == syncs1:
return True
elif not syncs0 and all(isinstance(s, WaitLock) for s in syncs1):
return True

return False


def augment_partial_subtree(cluster, tip, mapper, it=None):
d = it.dim

if d in cluster.syncs:
tip = NodeSync(cluster.syncs[d], tip)
try:
syncs = cluster.syncs[d]
if all(isinstance(s, WaitLock) for s in syncs):
# Unlike all other SyncOps, a WaitLock "floats" in the stree, in that
# it doesn't need to wrap any subtree. Thus, a WaitLock acts like
# a barrier to what follows inside `d`
NodeSync(syncs, tip)
else:
tip = NodeSync(syncs, tip)
except KeyError:
pass

mapper[it].bottom = tip

Expand Down
49 changes: 49 additions & 0 deletions devito/mpi/halo_scheme.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from devito import configuration
from devito.data import CORE, OWNED, LEFT, CENTER, RIGHT
from devito.ir.support import Forward, Scope
from devito.symbolics.manipulation import _uxreplace_registry
from devito.tools import (Reconstructable, Tag, as_tuple, filter_ordered, flatten,
frozendict, is_integer)
from devito.types import Grid
Expand Down Expand Up @@ -583,3 +584,51 @@ def __eq__(self, other):
return isinstance(other, HaloTouch) and self.halo_scheme == other.halo_scheme

func = Reconstructable._rebuild


def _uxreplace_dispatch_haloscheme(hs0, rule):
    """
    Apply the substitution `rule` to the halo scheme `hs0`.

    For each `(f, hse0)` pair in `hs0.fmapper`, the entries of `rule` are
    scanned in order and the first one that matches `f` is applied; the
    remaining entries are not considered for that `f`.

    Parameters
    ----------
    hs0 : the object whose `fmapper` is traversed (registered below for
        HaloScheme arguments of a HaloTouch).
    rule : mapping from the objects to be replaced to their replacements.

    Returns
    -------
    tuple
        `(hs, changed)`, where `hs` is `hs0` with every matched `f` dropped
        and its replacement added, and `changed` is True if at least one
        replacement took place (in which case `hs` was rebuilt via
        `drop`/`add`), False otherwise (in which case `hs` is `hs0` itself).
    """
    changed = False
    hs = hs0
    for f, hse0 in hs0.fmapper.items():
        # Is it an attempt to replace `f`?
        for i, v in rule.items():
            if i is f:
                # Yes!
                # Direct replacement: the entry attached to `f` is carried
                # over to `g` unchanged
                g = v
                hse = hse0

            elif i.is_Indexed and i.function is f and v.is_Indexed:
                # Yes, but through an Indexed, hence the `loc_indices` may now
                # differ; let's infer them from the context
                g = v.function

                loc_indices = {}
                loc_dirs = {}
                for d0, loc_index in hse0.loc_indices.items():
                    if i.indices[d0] == loc_index:
                        # They indeed do change
                        d1 = g.indices[d0]
                        loc_indices[d1] = v.indices[d0]
                        loc_dirs[d1] = hse0.loc_dirs[d0]

                # All of the original `loc_indices` must have been matched by
                # `i`, otherwise `i` is not the access this entry refers to
                if len(loc_indices) != len(hse0.loc_indices):
                    # Nope, let's try with the next Indexed, if any
                    continue

                hse = HaloSchemeEntry(frozendict(loc_indices),
                                      frozendict(loc_dirs),
                                      hse0.halos, hse0.dims)

            else:
                # `i` is unrelated to `f`
                continue

            # Only reached when one of the two branches above produced `g`
            # and `hse`
            hs = hs.drop(f).add(g, hse)
            changed |= True

            # At most one replacement per `f`
            break

    return hs, changed


_uxreplace_registry.register(HaloTouch,
{HaloScheme: _uxreplace_dispatch_haloscheme})
68 changes: 43 additions & 25 deletions devito/mpi/routines.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from devito.symbolics import (Byref, CondNe, FieldFromPointer, FieldFromComposite,
IndexedPointer, Macro, cast_mapper, subs_op_args)
from devito.tools import (as_mapper, dtype_to_mpitype, dtype_len, dtype_to_ctype,
flatten, generator, split)
flatten, generator, is_integer, split)
from devito.types import (Array, Bundle, Dimension, Eq, Symbol, LocalObject,
CompositeObject, CustomDimension)

Expand Down Expand Up @@ -338,35 +338,37 @@ def _make_all(self, f, hse, msg):

def _make_copy(self, f, hse, key, swap=False):
dims = [d.root for d in f.dimensions if d not in hse.loc_indices]
ofs = [Symbol(name='o%s' % d.root, is_const=True) for d in f.dimensions]

f_offsets = []
f_indices = []
for d, h in zip(f.dimensions, f._size_nodomain.left):
offset = Symbol(name='o%s' % d.root, is_const=True)
f_offsets.append(offset)
offset_nohalo = offset - h
f_indices.append(offset_nohalo + (d.root if d not in hse.loc_indices else 0))
bshape = [Symbol(name='b%s' % d.symbolic_size) for d in dims]
bdims = [CustomDimension(name=d.name, parent=d, symbolic_size=s)
for d, s in zip(dims, bshape)]

eqns = []
eqns.extend([Eq(d.symbolic_min, 0) for d in dims])
eqns.extend([Eq(d.symbolic_max, d.symbolic_size - 1) for d in dims])
eqns.extend([Eq(d.symbolic_min, 0) for d in bdims])
eqns.extend([Eq(d.symbolic_max, d.symbolic_size - 1) for d in bdims])

vd = CustomDimension(name='vd', symbolic_size=f.ncomp)
buf = Array(name='buf', dimensions=[vd] + bdims, dtype=f.c0.dtype,
padding=0)

mapper = dict(zip(dims, bdims))
findices = [o - h + mapper.get(d.root, 0)
for d, o, h in zip(f.dimensions, ofs, f._size_nodomain.left)]

bdims = [CustomDimension(name='vd', symbolic_size=f.ncomp)] + dims
buf = Array(name='buf', dimensions=bdims, dtype=f.c0.dtype, padding=0)
if swap is False:
swap = lambda i, j: (i, j)
name = 'gather%s' % key
else:
swap = lambda i, j: (j, i)
name = 'scatter%s' % key
for i, c in enumerate(f.components):
eqns.append(Eq(*swap(buf[[i] + dims], c[f_indices])))
eqns.append(Eq(*swap(buf[[i] + bdims], c[findices])))

# Compile `eqns` into an IET via recursive compilation
irs, _ = self.rcompile(eqns)

shape = [d.symbolic_size for d in dims]
parameters = [buf] + shape + list(f.components) + f_offsets
parameters = [buf] + bshape + list(f.components) + ofs

return CopyBuffer(name, irs.uiet, parameters)

Expand Down Expand Up @@ -1156,7 +1158,19 @@ def halos(self):
def npeers(self):
return len(self._halos)

def _arg_defaults(self, allocator, alias):
def _as_number(self, v, args):
    """
    Convert `v` into a Python int.

    If `v` passes `is_integer`, it is cast directly. Otherwise `v` is
    treated as symbolic: `args` is substituted into it through `v.subs(args)`
    and the outcome is cast to int. The symbolic path is guarded by sanity
    checks asserting that the Msg target is an Array and that `args` was
    actually supplied.
    """
    if not is_integer(v):
        # A non-integer is only expected when the Msg is for an Array
        assert self.target.c0.is_Array
        assert args is not None
        return int(v.subs(args))

    return int(v)

def _arg_defaults(self, allocator, alias, args=None):
# Lazy initialization of `allocator` is necessary as the `allocator`
# type isn't really known until an Operator is constructed
self._allocator = allocator
Expand All @@ -1165,14 +1179,14 @@ def _arg_defaults(self, allocator, alias):
for i, halo in enumerate(self.halos):
entry = self.value[i]

# Buffer size for this peer
# Buffer shape for this peer
shape = []
for dim, side in zip(*halo):
try:
shape.append(getattr(f._size_owned[dim], side.name))
except AttributeError:
assert side is CENTER
shape.append(f._size_domain[dim])
shape.append(self._as_number(f._size_domain[dim], args))
entry.sizes = (c_int*len(shape))(*shape)

# Allocate the send/recv buffers
Expand All @@ -1181,8 +1195,8 @@ def _arg_defaults(self, allocator, alias):
entry.bufg, bufg_memfree_args = allocator._alloc_C_libcall(size, ctype)
entry.bufs, bufs_memfree_args = allocator._alloc_C_libcall(size, ctype)

# The `memfree_args` will be used to deallocate the buffer upon returning
# from C-land
# The `memfree_args` will be used to deallocate the buffer upon
# returning from C-land
self._memfree_args.extend([bufg_memfree_args, bufs_memfree_args])

return {self.name: self.value}
Expand All @@ -1198,7 +1212,7 @@ def _arg_values(self, args=None, **kwargs):
else:
alias = f

return self._arg_defaults(args.allocator, alias=alias)
return self._arg_defaults(args.allocator, alias=alias, args=args)

def _arg_apply(self, *args, **kwargs):
self._C_memfree()
Expand All @@ -1218,30 +1232,34 @@ class MPIMsgEnriched(MPIMsg):
(_C_field_to, c_int)
]

def _arg_defaults(self, allocator, alias=None):
super()._arg_defaults(allocator, alias)
def _arg_defaults(self, allocator, alias=None, args=None):
super()._arg_defaults(allocator, alias, args=args)

f = alias or self.target.c0
neighborhood = f.grid.distributor.neighborhood

for i, halo in enumerate(self.halos):
entry = self.value[i]

# `torank` peer + gather offsets
entry.torank = neighborhood[halo.side]
ofsg = []
for dim, side in zip(*halo):
try:
ofsg.append(getattr(f._offset_owned[dim], side.name))
v = getattr(f._offset_owned[dim], side.name)
ofsg.append(self._as_number(v, args))
except AttributeError:
assert side is CENTER
ofsg.append(f._offset_owned[dim].left)
entry.ofsg = (c_int*len(ofsg))(*ofsg)

# `fromrank` peer + scatter offsets
entry.fromrank = neighborhood[tuple(i.flip() for i in halo.side)]
ofss = []
for dim, side in zip(*halo):
try:
ofss.append(getattr(f._offset_halo[dim], side.flip().name))
v = getattr(f._offset_halo[dim], side.flip().name)
ofss.append(self._as_number(v, args))
except AttributeError:
assert side is CENTER
# Note `_offset_owned`, and not `_offset_halo`, is *not* a bug
Expand Down
Loading

0 comments on commit aedc3b9

Please sign in to comment.