Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

compiler: Fix OpenMP reductions in tandem with linearize=True #2117

Merged
merged 5 commits into from
May 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion devito/core/autotuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,12 @@ def finalize_time_bounds(stepper, at_args, args, mode):
def calculate_nblocks(tree, blockable):
block_indices = [n for n, i in enumerate(tree) if i.dim in blockable]
index = block_indices[0]
collapsed = tree[index:index + (tree[index].ncollapsed or index+1)]
try:
ncollapsed = tree[index].ncollapsed
except AttributeError:
# Not using OpenMP
ncollapsed = 0
collapsed = tree[index:index + (ncollapsed or index+1)]
blocked = [i.dim for i in collapsed if i.dim in blockable]
remainders = [(d.root.symbolic_max-d.root.symbolic_min+1) % d.step for d in blocked]
niters = [d.root.symbolic_max - i for d, i in zip(blocked, remainders)]
Expand Down
64 changes: 3 additions & 61 deletions devito/ir/iet/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from devito.data import FULL
from devito.ir.equations import DummyEq, OpInc, OpMin, OpMax
from devito.ir.support import (INBOUND, SEQUENTIAL, PARALLEL, PARALLEL_IF_ATOMIC,
PARALLEL_IF_PVT, VECTORIZED, AFFINE, COLLAPSED,
Property, Forward, detect_io)
PARALLEL_IF_PVT, VECTORIZED, AFFINE, Property,
Forward, detect_io)
from devito.symbolics import ListInitializer, CallFromPointer, ccode
from devito.tools import (Signer, as_tuple, filter_ordered, filter_sorted, flatten,
ctypes_to_cstr)
Expand Down Expand Up @@ -557,13 +557,6 @@ def is_Vectorized(self):
def is_Inbound(self):
return INBOUND in self.properties

@property
def ncollapsed(self):
for i in self.properties:
if i.name == 'collapsed':
return i.val
return 0

@property
def symbolic_bounds(self):
"""A 2-tuple representing the symbolic bounds [min, max] of the Iteration."""
Expand Down Expand Up @@ -1165,58 +1158,7 @@ class ParallelIteration(Iteration):
Implement a parallel for-loop.
"""

def __init__(self, *args, **kwargs):
pragmas, kwargs, properties = self._make_header(**kwargs)
super().__init__(*args, pragmas=pragmas, properties=properties, **kwargs)

@classmethod
def _make_header(cls, **kwargs):
construct = cls._make_construct(**kwargs)
clauses = cls._make_clauses(**kwargs)
header = c.Pragma(' '.join([construct] + clauses))

# Extract the Iteration Properties
properties = cls._process_properties(**kwargs)

# Drop the unrecognised or unused kwargs
kwargs = cls._process_kwargs(**kwargs)

return (header,), kwargs, properties

@classmethod
def _make_construct(cls, **kwargs):
# To be overridden by subclasses
raise NotImplementedError

@classmethod
def _make_clauses(cls, **kwargs):
return []

@classmethod
def _process_properties(cls, **kwargs):
properties = as_tuple(kwargs.get('properties'))
properties += (COLLAPSED(kwargs.get('ncollapse', 1)),)

return properties

@classmethod
def _process_kwargs(cls, **kwargs):
kwargs.pop('pragmas', None)
kwargs.pop('properties', None)

# Recognised clauses
kwargs.pop('ncollapse', None)
kwargs.pop('reduction', None)

return kwargs

@cached_property
def collapsed(self):
ret = [self]
for i in range(self.ncollapsed - 1):
ret.append(ret[i].nodes[0])
assert all(i.is_Iteration for i in ret)
return tuple(ret)
pass


class ParallelTree(List):
Expand Down
2 changes: 1 addition & 1 deletion devito/ir/support/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from .utils import * # noqa
from .vector import * # noqa
from .basic import * # noqa
from .space import * # noqa
from .guards import * # noqa
from .syncs import * # noqa
from .utils import * # noqa
from .properties import * # noqa
from .symregistry import * # noqa
3 changes: 0 additions & 3 deletions devito/ir/support/properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@ def __init__(self, name, val=None):
equals to 0 (i.e., the distance vector is '=').
"""

COLLAPSED = lambda i: Property('collapsed', i)
"""Collapsed Dimensions."""

VECTORIZED = Property('vector-dim')
"""A SIMD-vectorized Dimension."""

Expand Down
6 changes: 1 addition & 5 deletions devito/ir/support/syncs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,12 @@

from devito.data import FULL
from devito.tools import Pickable, filter_ordered
from devito.types import DimensionTuple
from .utils import IMask

__all__ = ['WaitLock', 'ReleaseLock', 'WithLock', 'FetchUpdate', 'PrefetchUpdate',
'normalize_syncs']


class IMask(DimensionTuple):
pass


class SyncOp(Pickable):

__rargs__ = ('handle', 'target')
Expand Down
16 changes: 13 additions & 3 deletions devito/ir/support/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
from devito.symbolics import (CallFromPointer, retrieve_indexed, retrieve_terminals,
uxreplace)
from devito.tools import DefaultOrderedDict, as_tuple, flatten, filter_sorted, split
from devito.types import Dimension, Indirection, ModuloDimension, StencilDimension
from devito.types import (Dimension, DimensionTuple, Indirection, ModuloDimension,
StencilDimension)

__all__ = ['AccessMode', 'Stencil', 'detect_accesses', 'detect_io', 'pull_dims',
'shift_back', 'sdims_min', 'sdims_max']
__all__ = ['AccessMode', 'Stencil', 'IMask', 'detect_accesses', 'detect_io',
'pull_dims', 'shift_back', 'sdims_min', 'sdims_max']


class AccessMode(object):
Expand Down Expand Up @@ -103,6 +104,15 @@ def union(cls, *dicts):
return output


class IMask(DimensionTuple):

    """A mapper from Dimensions to data points or ranges."""


def detect_accesses(exprs):
"""
Return a mapper `M : F -> S`, where F are Functions appearing in `exprs`
Expand Down
35 changes: 10 additions & 25 deletions devito/passes/iet/languages/openacc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,41 +3,39 @@

from devito.arch import AMDGPUX, NVIDIAX
from devito.ir import (Call, DeviceCall, DummyExpr, EntryFunction, List, Block,
ParallelIteration, ParallelTree, Pragma, Return,
FindSymbols, make_callable)
ParallelTree, Pragma, Return, FindSymbols, make_callable)
from devito.passes import is_on_device
from devito.passes.iet.definitions import DeviceAwareDataManager
from devito.passes.iet.engine import iet_pass
from devito.passes.iet.orchestration import Orchestrator
from devito.passes.iet.parpragma import (PragmaDeviceAwareTransformer, PragmaLangBB,
PragmaTransfer)
PragmaIteration, PragmaTransfer)
from devito.passes.iet.languages.C import CBB
from devito.passes.iet.languages.openmp import OmpRegion, OmpIteration
from devito.passes.iet.languages.utils import make_clause_reduction
from devito.symbolics import FieldFromPointer, Macro, cast_mapper
from devito.tools import filter_ordered
from devito.types import DeviceMap, Symbol

__all__ = ['DeviceAccizer', 'DeviceAccDataManager', 'AccOrchestrator']


class DeviceAccIteration(ParallelIteration):
class DeviceAccIteration(PragmaIteration):

@classmethod
def _make_construct(cls, **kwargs):
return 'acc parallel loop'

@classmethod
def _make_clauses(cls, ncollapse=None, reduction=None, tile=None, **kwargs):
def _make_clauses(cls, ncollapsed=None, reduction=None, tile=None, **kwargs):
clauses = []

if ncollapse:
clauses.append('collapse(%d)' % (ncollapse or 1))
elif tile:
if tile:
clauses.append('tile(%s)' % ','.join(str(i) for i in tile))
elif ncollapsed:
clauses.append('collapse(%d)' % (ncollapsed or 1))

if reduction:
clauses.append(make_clause_reduction(reduction))
clauses.append(cls._make_clause_reduction_from_imask(reduction))

indexeds = FindSymbols('indexeds').visit(kwargs['nodes'])
deviceptrs = filter_ordered(i.name for i in indexeds if i.function._mem_local)
Expand All @@ -55,20 +53,6 @@ def _make_clauses(cls, ncollapse=None, reduction=None, tile=None, **kwargs):

return clauses

@classmethod
def _process_kwargs(cls, **kwargs):
kwargs = super()._process_kwargs(**kwargs)

kwargs.pop('gpu_fit', None)

kwargs.pop('schedule', None)
kwargs.pop('parallel', None)
kwargs.pop('chunk_size', None)
kwargs.pop('nthreads', None)
kwargs.pop('tile', None)

return kwargs


class AccBB(PragmaLangBB):

Expand Down Expand Up @@ -191,7 +175,8 @@ def _make_partree(self, candidates, nthreads=None):
else:
tile = tile[:ncollapsable + 1]

body = self.DeviceIteration(gpu_fit=self.gpu_fit, tile=tile, **root.args)
body = self.DeviceIteration(gpu_fit=self.gpu_fit, tile=tile,
ncollapsed=ncollapsable, **root.args)
partree = ParallelTree([], body, nthreads=nthreads)

return root, partree
Expand Down
25 changes: 6 additions & 19 deletions devito/passes/iet/languages/openmp.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,14 @@
from devito.arch import AMDGPUX, NVIDIAX, INTELGPUX
from devito.arch.compiler import GNUCompiler
from devito.ir import (Call, Conditional, DeviceCall, List, Prodder,
ParallelIteration, ParallelBlock, PointerCast, While,
FindSymbols)
ParallelBlock, PointerCast, While, FindSymbols)
from devito.passes.iet.definitions import DataManager, DeviceAwareDataManager
from devito.passes.iet.langbase import LangBB
from devito.passes.iet.orchestration import Orchestrator
from devito.passes.iet.parpragma import (PragmaSimdTransformer, PragmaShmTransformer,
PragmaDeviceAwareTransformer, PragmaLangBB,
PragmaTransfer)
PragmaIteration, PragmaTransfer)
from devito.passes.iet.languages.C import CBB
from devito.passes.iet.languages.utils import make_clause_reduction
from devito.symbolics import CondEq, DefFunction
from devito.tools import filter_ordered

Expand All @@ -32,7 +30,7 @@ def _make_header(cls, nthreads, private=None):
return c.Pragma('omp parallel num_threads(%s) %s' % (nthreads.name, private))


class OmpIteration(ParallelIteration):
class OmpIteration(PragmaIteration):

@classmethod
def _make_construct(cls, parallel=False, **kwargs):
Expand All @@ -42,11 +40,11 @@ def _make_construct(cls, parallel=False, **kwargs):
return 'omp for'

@classmethod
def _make_clauses(cls, ncollapse=None, chunk_size=None, nthreads=None,
def _make_clauses(cls, ncollapsed=None, chunk_size=None, nthreads=None,
reduction=None, schedule=None, **kwargs):
clauses = []

clauses.append('collapse(%d)' % (ncollapse or 1))
clauses.append('collapse(%d)' % (ncollapsed or 1))

if chunk_size is not False:
clauses.append('schedule(%s,%s)' % (schedule or 'dynamic',
Expand All @@ -56,21 +54,10 @@ def _make_clauses(cls, ncollapse=None, chunk_size=None, nthreads=None,
clauses.append('num_threads(%s)' % nthreads)

if reduction:
clauses.append(make_clause_reduction(reduction))
clauses.append(cls._make_clause_reduction_from_imask(reduction))

return clauses

@classmethod
def _process_kwargs(cls, **kwargs):
kwargs = super()._process_kwargs(**kwargs)

kwargs.pop('schedule', None)
kwargs.pop('parallel', False)
kwargs.pop('chunk_size', None)
kwargs.pop('nthreads', None)

return kwargs


class DeviceOmpIteration(OmpIteration):

Expand Down
27 changes: 0 additions & 27 deletions devito/passes/iet/languages/utils.py

This file was deleted.

37 changes: 36 additions & 1 deletion devito/passes/iet/linearization.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@

from devito.data import FULL
from devito.ir import (BlankLine, Call, DummyExpr, Dereference, List, PointerCast,
Transfer, FindNodes, FindSymbols, Transformer, Uxreplace)
Transfer, FindNodes, FindSymbols, Transformer, Uxreplace,
IMask)
from devito.passes.iet.engine import iet_pass
from devito.passes.iet.parpragma import PragmaIteration
from devito.symbolics import DefFunction, MacroArgument, ccode
from devito.tools import Bunch, filter_ordered, prod
from devito.types import Array, Bundle, Symbol, FIndexed, Indexed, Wildcard
Expand Down Expand Up @@ -62,6 +64,7 @@ def linearization(iet, lmode=None, tracker=None, **kwargs):
iet = linearize_accesses(iet, key, tracker, **kwargs)
iet = linearize_pointers(iet, key)
iet = linearize_transfers(iet, **kwargs)
iet = linearize_clauses(iet, **kwargs)

# Postprocess headers
headers = [(ccode(define), ccode(expr)) for define, expr in tracker.headers.values()]
Expand Down Expand Up @@ -339,3 +342,35 @@ def linearize_transfers(iet, sregistry=None, **kwargs):
iet = Transformer(mapper).visit(iet)

return iet


def linearize_clauses(iet, **kwargs):
    """
    Linearize the reduction clauses of the PragmaIterations found in `iet`.

    For example, a multi-dimensional clause such as
    `reduction(+:f[0:f_vec->size[1]][0:f_vec->size[2]])` becomes
    `reduction(+:f[0:f_vec->size[1]*f_vec->size[2]])`.

    PragmaIterations carrying no reduction are left untouched. The
    transformed IET is returned.
    """
    subs = {}
    for pragma_iter in FindNodes(PragmaIteration).visit(iet):
        if not pragma_iter.reduction:
            continue

        flattened = []
        for output, imask, op in pragma_iter.reduction:
            function = output.function

            # Support partial reductions: split the mask at its first FULL
            # entry, or past its end if there is no FULL entry at all
            try:
                split = imask.index(FULL)
            except ValueError:
                split = len(imask)

            # Product of the entries before the split (0 if there are none)
            lead = np.prod(imask[:split] or [0])

            # Product of the FULL sizes of the Dimensions from the split on
            extents = [function._C_get_field(FULL, d).size
                       for d in function.dimensions[split:]]
            size = prod(extents)

            flattened.append((output, IMask((lead*size, size)), op))

        subs[pragma_iter] = pragma_iter._rebuild(reduction=flattened)

    return Transformer(subs).visit(iet)
Loading