diff --git a/devito/core/autotuning.py b/devito/core/autotuning.py index 8360b3d0ed..00d5f5069f 100644 --- a/devito/core/autotuning.py +++ b/devito/core/autotuning.py @@ -258,7 +258,12 @@ def finalize_time_bounds(stepper, at_args, args, mode): def calculate_nblocks(tree, blockable): block_indices = [n for n, i in enumerate(tree) if i.dim in blockable] index = block_indices[0] - collapsed = tree[index:index + (tree[index].ncollapsed or index+1)] + try: + ncollapsed = tree[index].ncollapsed + except AttributeError: + # Not using OpenMP + ncollapsed = 0 + collapsed = tree[index:index + (ncollapsed or index+1)] blocked = [i.dim for i in collapsed if i.dim in blockable] remainders = [(d.root.symbolic_max-d.root.symbolic_min+1) % d.step for d in blocked] niters = [d.root.symbolic_max - i for d, i in zip(blocked, remainders)] diff --git a/devito/ir/iet/nodes.py b/devito/ir/iet/nodes.py index 5d5c81f579..aba0d0d98c 100644 --- a/devito/ir/iet/nodes.py +++ b/devito/ir/iet/nodes.py @@ -12,8 +12,8 @@ from devito.data import FULL from devito.ir.equations import DummyEq, OpInc, OpMin, OpMax from devito.ir.support import (INBOUND, SEQUENTIAL, PARALLEL, PARALLEL_IF_ATOMIC, - PARALLEL_IF_PVT, VECTORIZED, AFFINE, COLLAPSED, - Property, Forward, detect_io) + PARALLEL_IF_PVT, VECTORIZED, AFFINE, Property, + Forward, detect_io) from devito.symbolics import ListInitializer, CallFromPointer, ccode from devito.tools import (Signer, as_tuple, filter_ordered, filter_sorted, flatten, ctypes_to_cstr) @@ -557,13 +557,6 @@ def is_Vectorized(self): def is_Inbound(self): return INBOUND in self.properties - @property - def ncollapsed(self): - for i in self.properties: - if i.name == 'collapsed': - return i.val - return 0 - @property def symbolic_bounds(self): """A 2-tuple representing the symbolic bounds [min, max] of the Iteration.""" @@ -1165,58 +1158,7 @@ class ParallelIteration(Iteration): Implement a parallel for-loop. """ - def __init__(self, *args, **kwargs): - pragmas, kwargs, properties = self._make_header(**kwargs) - super().__init__(*args, pragmas=pragmas, properties=properties, **kwargs) - - @classmethod - def _make_header(cls, **kwargs): - construct = cls._make_construct(**kwargs) - clauses = cls._make_clauses(**kwargs) - header = c.Pragma(' '.join([construct] + clauses)) - - # Extract the Iteration Properties - properties = cls._process_properties(**kwargs) - - # Drop the unrecognised or unused kwargs - kwargs = cls._process_kwargs(**kwargs) - - return (header,), kwargs, properties - - @classmethod - def _make_construct(cls, **kwargs): - # To be overridden by subclasses - raise NotImplementedError - - @classmethod - def _make_clauses(cls, **kwargs): - return [] - - @classmethod - def _process_properties(cls, **kwargs): - properties = as_tuple(kwargs.get('properties')) - properties += (COLLAPSED(kwargs.get('ncollapse', 1)),) - - return properties - - @classmethod - def _process_kwargs(cls, **kwargs): - kwargs.pop('pragmas', None) - kwargs.pop('properties', None) - - # Recognised clauses - kwargs.pop('ncollapse', None) - kwargs.pop('reduction', None) - - return kwargs - - @cached_property - def collapsed(self): - ret = [self] - for i in range(self.ncollapsed - 1): - ret.append(ret[i].nodes[0]) - assert all(i.is_Iteration for i in ret) - return tuple(ret) + pass class ParallelTree(List): diff --git a/devito/ir/support/__init__.py b/devito/ir/support/__init__.py index 9ac941e817..2b20e3ab20 100644 --- a/devito/ir/support/__init__.py +++ b/devito/ir/support/__init__.py @@ -1,8 +1,8 @@ +from .utils import * # noqa from .vector import * # noqa from .basic import * # noqa from .space import * # noqa from .guards import * # noqa from .syncs import * # noqa -from .utils import * # noqa from .properties import * # noqa from .symregistry import * # noqa diff --git a/devito/ir/support/properties.py b/devito/ir/support/properties.py index ae9ad44fe9..d0fefc37df 100644 --- a/devito/ir/support/properties.py +++ b/devito/ir/support/properties.py @@ -33,9 +33,6 @@ def __init__(self, name, val=None): equals to 0 (i.e., the distance vector is '='). """ -COLLAPSED = lambda i: Property('collapsed', i) -"""Collapsed Dimensions.""" - VECTORIZED = Property('vector-dim') """A SIMD-vectorized Dimension.""" diff --git a/devito/ir/support/syncs.py b/devito/ir/support/syncs.py index 1b9707c51f..96ab27c637 100644 --- a/devito/ir/support/syncs.py +++ b/devito/ir/support/syncs.py @@ -6,16 +6,12 @@ from devito.data import FULL from devito.tools import Pickable, filter_ordered -from devito.types import DimensionTuple +from .utils import IMask __all__ = ['WaitLock', 'ReleaseLock', 'WithLock', 'FetchUpdate', 'PrefetchUpdate', 'normalize_syncs'] -class IMask(DimensionTuple): - pass - - class SyncOp(Pickable): __rargs__ = ('handle', 'target') diff --git a/devito/ir/support/utils.py b/devito/ir/support/utils.py index e0e0d043fa..e0bcd8d956 100644 --- a/devito/ir/support/utils.py +++ b/devito/ir/support/utils.py @@ -3,10 +3,11 @@ from devito.symbolics import (CallFromPointer, retrieve_indexed, retrieve_terminals, uxreplace) from devito.tools import DefaultOrderedDict, as_tuple, flatten, filter_sorted, split -from devito.types import Dimension, Indirection, ModuloDimension, StencilDimension +from devito.types import (Dimension, DimensionTuple, Indirection, ModuloDimension, + StencilDimension) -__all__ = ['AccessMode', 'Stencil', 'detect_accesses', 'detect_io', 'pull_dims', - 'shift_back', 'sdims_min', 'sdims_max'] +__all__ = ['AccessMode', 'Stencil', 'IMask', 'detect_accesses', 'detect_io', + 'pull_dims', 'shift_back', 'sdims_min', 'sdims_max'] class AccessMode(object): @@ -103,6 +104,15 @@ def union(cls, *dicts): return output +class IMask(DimensionTuple): + + """ + A mapper from Dimensions to data points or ranges. + """ + + pass + + def detect_accesses(exprs): """ Return a mapper `M : F -> S`, where F are Functions appearing in `exprs` diff --git a/devito/passes/iet/languages/openacc.py b/devito/passes/iet/languages/openacc.py index e896aef433..aed8f5c1ac 100644 --- a/devito/passes/iet/languages/openacc.py +++ b/devito/passes/iet/languages/openacc.py @@ -3,17 +3,15 @@ from devito.arch import AMDGPUX, NVIDIAX from devito.ir import (Call, DeviceCall, DummyExpr, EntryFunction, List, Block, - ParallelIteration, ParallelTree, Pragma, Return, - FindSymbols, make_callable) + ParallelTree, Pragma, Return, FindSymbols, make_callable) from devito.passes import is_on_device from devito.passes.iet.definitions import DeviceAwareDataManager from devito.passes.iet.engine import iet_pass from devito.passes.iet.orchestration import Orchestrator from devito.passes.iet.parpragma import (PragmaDeviceAwareTransformer, PragmaLangBB, - PragmaTransfer) + PragmaIteration, PragmaTransfer) from devito.passes.iet.languages.C import CBB from devito.passes.iet.languages.openmp import OmpRegion, OmpIteration -from devito.passes.iet.languages.utils import make_clause_reduction from devito.symbolics import FieldFromPointer, Macro, cast_mapper from devito.tools import filter_ordered from devito.types import DeviceMap, Symbol @@ -21,23 +19,23 @@ __all__ = ['DeviceAccizer', 'DeviceAccDataManager', 'AccOrchestrator'] -class DeviceAccIteration(ParallelIteration): +class DeviceAccIteration(PragmaIteration): @classmethod def _make_construct(cls, **kwargs): return 'acc parallel loop' @classmethod - def _make_clauses(cls, ncollapse=None, reduction=None, tile=None, **kwargs): + def _make_clauses(cls, ncollapsed=None, reduction=None, tile=None, **kwargs): clauses = [] - if ncollapse: - clauses.append('collapse(%d)' % (ncollapse or 1)) - elif tile: + if tile: clauses.append('tile(%s)' % ','.join(str(i) for i in tile)) + elif ncollapsed: + clauses.append('collapse(%d)' % (ncollapsed or 1)) if reduction: - clauses.append(make_clause_reduction(reduction)) + clauses.append(cls._make_clause_reduction_from_imask(reduction)) indexeds = FindSymbols('indexeds').visit(kwargs['nodes']) deviceptrs = filter_ordered(i.name for i in indexeds if i.function._mem_local) @@ -55,20 +53,6 @@ def _make_clauses(cls, ncollapse=None, reduction=None, tile=None, **kwargs): return clauses - @classmethod - def _process_kwargs(cls, **kwargs): - kwargs = super()._process_kwargs(**kwargs) - - kwargs.pop('gpu_fit', None) - - kwargs.pop('schedule', None) - kwargs.pop('parallel', None) - kwargs.pop('chunk_size', None) - kwargs.pop('nthreads', None) - kwargs.pop('tile', None) - - return kwargs - class AccBB(PragmaLangBB): @@ -191,7 +175,8 @@ def _make_partree(self, candidates, nthreads=None): else: tile = tile[:ncollapsable + 1] - body = self.DeviceIteration(gpu_fit=self.gpu_fit, tile=tile, **root.args) + body = self.DeviceIteration(gpu_fit=self.gpu_fit, tile=tile, + ncollapsed=ncollapsable, **root.args) partree = ParallelTree([], body, nthreads=nthreads) return root, partree diff --git a/devito/passes/iet/languages/openmp.py b/devito/passes/iet/languages/openmp.py index 9d0edd65eb..985c41915c 100644 --- a/devito/passes/iet/languages/openmp.py +++ b/devito/passes/iet/languages/openmp.py @@ -6,16 +6,14 @@ from devito.arch import AMDGPUX, NVIDIAX, INTELGPUX from devito.arch.compiler import GNUCompiler from devito.ir import (Call, Conditional, DeviceCall, List, Prodder, - ParallelIteration, ParallelBlock, PointerCast, While, - FindSymbols) + ParallelBlock, PointerCast, While, FindSymbols) from devito.passes.iet.definitions import DataManager, DeviceAwareDataManager from devito.passes.iet.langbase import LangBB from devito.passes.iet.orchestration import Orchestrator from devito.passes.iet.parpragma import (PragmaSimdTransformer, PragmaShmTransformer, PragmaDeviceAwareTransformer, PragmaLangBB, - PragmaTransfer) + PragmaIteration, PragmaTransfer) from devito.passes.iet.languages.C import CBB -from devito.passes.iet.languages.utils import make_clause_reduction from devito.symbolics import CondEq, DefFunction from devito.tools import filter_ordered @@ -32,7 +30,7 @@ def _make_header(cls, nthreads, private=None): return c.Pragma('omp parallel num_threads(%s) %s' % (nthreads.name, private)) -class OmpIteration(ParallelIteration): +class OmpIteration(PragmaIteration): @classmethod def _make_construct(cls, parallel=False, **kwargs): @@ -42,11 +40,11 @@ def _make_construct(cls, parallel=False, **kwargs): return 'omp for' @classmethod - def _make_clauses(cls, ncollapse=None, chunk_size=None, nthreads=None, + def _make_clauses(cls, ncollapsed=None, chunk_size=None, nthreads=None, reduction=None, schedule=None, **kwargs): clauses = [] - clauses.append('collapse(%d)' % (ncollapse or 1)) + clauses.append('collapse(%d)' % (ncollapsed or 1)) if chunk_size is not False: clauses.append('schedule(%s,%s)' % (schedule or 'dynamic', @@ -56,21 +54,10 @@ def _make_clauses(cls, ncollapse=None, chunk_size=None, nthreads=None, clauses.append('num_threads(%s)' % nthreads) if reduction: - clauses.append(make_clause_reduction(reduction)) + clauses.append(cls._make_clause_reduction_from_imask(reduction)) return clauses - @classmethod - def _process_kwargs(cls, **kwargs): - kwargs = super()._process_kwargs(**kwargs) - - kwargs.pop('schedule', None) - kwargs.pop('parallel', False) - kwargs.pop('chunk_size', None) - kwargs.pop('nthreads', None) - - return kwargs - class DeviceOmpIteration(OmpIteration): diff --git a/devito/passes/iet/languages/utils.py b/devito/passes/iet/languages/utils.py deleted file mode 100644 index 2bf5c5ca53..0000000000 --- a/devito/passes/iet/languages/utils.py +++ /dev/null @@ -1,27 +0,0 @@ -from devito.data import FULL - -__all__ = ['make_clause_reduction'] - - -def make_clause_reduction(reductions): - """ - Build a string representing a reduction clause given a list of - 2-tuples `(symbol, ir.Operation)`. - """ - args = [] - for i, r in reductions: - if i.is_Indexed: - f = i.function - bounds = [] - for k, d in zip(i.indices, f.dimensions): - if k.is_Number: - bounds.append('[%s]' % k) - else: - # Languages such as OpenMP and OpenACC expect a range - # as input to a reduction clause, such as - # `reduction(+:f[0:f_vec->size[1]])` - bounds.append('[0:%s]' % f._C_get_field(FULL, d).size) - args.append('%s%s' % (i.name, ''.join(bounds))) - else: - args.append(str(i)) - return 'reduction(%s:%s)' % (r.name, ','.join(args)) diff --git a/devito/passes/iet/linearization.py b/devito/passes/iet/linearization.py index c5e1790bac..d861408bff 100644 --- a/devito/passes/iet/linearization.py +++ b/devito/passes/iet/linearization.py @@ -7,8 +7,10 @@ from devito.data import FULL from devito.ir import (BlankLine, Call, DummyExpr, Dereference, List, PointerCast, - Transfer, FindNodes, FindSymbols, Transformer, Uxreplace) + Transfer, FindNodes, FindSymbols, Transformer, Uxreplace, + IMask) from devito.passes.iet.engine import iet_pass +from devito.passes.iet.parpragma import PragmaIteration from devito.symbolics import DefFunction, MacroArgument, ccode from devito.tools import Bunch, filter_ordered, prod from devito.types import Array, Bundle, Symbol, FIndexed, Indexed, Wildcard @@ -62,6 +64,7 @@ def linearization(iet, lmode=None, tracker=None, **kwargs): iet = linearize_accesses(iet, key, tracker, **kwargs) iet = linearize_pointers(iet, key) iet = linearize_transfers(iet, **kwargs) + iet = linearize_clauses(iet, **kwargs) # Postprocess headers headers = [(ccode(define), ccode(expr)) for define, expr in tracker.headers.values()] @@ -339,3 +342,35 @@ def linearize_transfers(iet, sregistry=None, **kwargs): iet = Transformer(mapper).visit(iet) return iet + + +def linearize_clauses(iet, **kwargs): + iters = FindNodes(PragmaIteration).visit(iet) + mapper = {} + for i in iters: + # Linearize reduction clauses, e.g.: + # `reduction(+:f[0:f_vec->size[1][0:f_vec->size[2]]])` + # -> + # `reduction(+:f[0:f_vec->size[1]*f_vec->size[2]]) + if not i.reduction: + continue + reductions = [] + for output, imask, op in i.reduction: + f = output.function + + # Support partial reductions + try: + idx = imask.index(FULL) + except ValueError: + idx = len(imask) + + m = np.prod(imask[:idx] or [0]) + size = prod([f._C_get_field(FULL, d).size for d in f.dimensions[idx:]]) + + reductions.append((output, IMask((m*size, size)), op)) + + mapper[i] = i._rebuild(reduction=reductions) + + iet = Transformer(mapper).visit(iet) + + return iet diff --git a/devito/passes/iet/parpragma.py b/devito/passes/iet/parpragma.py index b2a334440f..dd78437381 100644 --- a/devito/passes/iet/parpragma.py +++ b/devito/passes/iet/parpragma.py @@ -4,15 +4,16 @@ from sympy import And, Max, true from devito.data import FULL -from devito.ir import (Conditional, DummyEq, Dereference, Expression, ExpressionBundle, - FindSymbols, FindNodes, ParallelTree, Pragma, Prodder, Transfer, - List, Transformer, IsPerfectIteration, OpInc, filter_iterations, - retrieve_iteration_tree, VECTORIZED) +from devito.ir import (Conditional, DummyEq, Dereference, Expression, + ExpressionBundle, FindSymbols, FindNodes, ParallelIteration, + ParallelTree, Pragma, Prodder, Transfer, List, Transformer, + IsPerfectIteration, OpInc, filter_iterations, + retrieve_iteration_tree, IMask, VECTORIZED) from devito.passes.iet.engine import iet_pass from devito.passes.iet.langbase import (LangBB, LangTransformer, DeviceAwareMixin, make_sections_from_imask) from devito.symbolics import INT, ccode -from devito.tools import as_tuple, flatten, prod +from devito.tools import as_tuple, flatten, is_integer, prod from devito.types import Symbol __all__ = ['PragmaSimdTransformer', 'PragmaShmTransformer', @@ -118,6 +119,76 @@ def make_simd(self, iet): return iet, {} +class PragmaIteration(ParallelIteration): + + def __init__(self, *args, parallel=None, schedule=None, chunk_size=None, + nthreads=None, ncollapsed=None, reduction=None, tile=None, + gpu_fit=None, **kwargs): + + construct = self._make_construct(parallel=parallel) + clauses = self._make_clauses( + ncollapsed=ncollapsed, chunk_size=chunk_size, nthreads=nthreads, + reduction=reduction, schedule=schedule, tile=tile, gpu_fit=gpu_fit, + **kwargs + ) + pragma = c.Pragma(' '.join([construct] + clauses)) + kwargs['pragmas'] = pragma + + super().__init__(*args, **kwargs) + + self.parallel = parallel + self.schedule = schedule + self.chunk_size = chunk_size + self.nthreads = nthreads + self.ncollapsed = ncollapsed + self.reduction = reduction + self.tile = tile + self.gpu_fit = gpu_fit + + @classmethod + def _make_construct(cls, **kwargs): + # To be overridden by subclasses + raise NotImplementedError + + @classmethod + def _make_clauses(cls, **kwargs): + return [] + + @classmethod + def _make_clause_reduction_from_imask(cls, reductions): + """ + Build a string representing of a reduction clause given a list of + 2-tuples `(symbol, ir.Operation)`. + """ + args = [] + for i, imask, r in reductions: + if i.is_Indexed: + f = i.function + bounds = [] + for k, d in zip(imask, f.dimensions): + if is_integer(k): + bounds.append('[%s]' % k) + elif k is FULL: + # Lower FULL Dimensions into a range spanning the entire + # Dimension space, e.g. `reduction(+:f[0:f_vec->size[1]])` + bounds.append('[0:%s]' % f._C_get_field(FULL, d).size) + else: + assert isinstance(k, tuple) and len(k) == 2 + bounds.append('[%s:%s]' % k) + args.append('%s%s' % (i.name, ''.join(bounds))) + else: + args.append(str(i)) + return 'reduction(%s:%s)' % (r.name, ','.join(args)) + + @cached_property + def collapsed(self): + ret = [self] + for i in range(self.ncollapsed - 1): + ret.append(ret[i].nodes[0]) + assert all(i.is_Iteration for i in ret) + return tuple(ret) + + class PragmaShmTransformer(PragmaSimdTransformer): """ @@ -246,16 +317,22 @@ def _make_reductions(self, partree): return partree exprs = [i for i in FindNodes(Expression).visit(partree) if i.is_reduction] - reductions = [(i.output, i.operation) for i in exprs] - test0 = all(not i.is_Indexed for i, _ in reductions) + reductions = [] + for e in exprs: + f = e.write + items = [i if i.is_Number else FULL for i in e.output.indices] + imask = IMask(*items, getters=f.dimensions) + reductions.append((e.output, imask, e.operation)) + + test0 = all(not i.is_Indexed for i, _, _ in reductions) test1 = (self._support_array_reduction(self.compiler) and all(i.is_Affine for i in partree.collapsed)) if test0 or test1: # Implement reduction mapper = {partree.root: partree.root._rebuild(reduction=reductions)} - elif all(i is OpInc for _, i in reductions): + elif all(i is OpInc for _, _, i in reductions): # Use atomic increments mapper = {i: i._rebuild(pragmas=self.lang['atomic']) for i in exprs} else: @@ -275,7 +352,7 @@ def _make_partree(self, candidates, nthreads=None): # Get the collapsable Iterations root, collapsable = self._select_candidates(candidates) - ncollapse = 1 + len(collapsable) + ncollapsed = 1 + len(collapsable) # Prepare to build a ParallelTree if all(i.is_Affine for i in candidates): @@ -288,12 +365,12 @@ def _make_partree(self, candidates, nthreads=None): if nthreads is None: # pragma ... for ... schedule(..., 1) nthreads = self.nthreads - body = self.HostIteration(schedule=schedule, ncollapse=ncollapse, + body = self.HostIteration(schedule=schedule, ncollapsed=ncollapsed, **root.args) else: # pragma ... parallel for ... schedule(..., 1) body = self.HostIteration(schedule=schedule, parallel=True, - ncollapse=ncollapse, nthreads=nthreads, + ncollapsed=ncollapsed, nthreads=nthreads, **root.args) prefix = [] else: @@ -301,7 +378,7 @@ def _make_partree(self, candidates, nthreads=None): assert nthreads is None nthreads = self.nthreads_nonaffine chunk_size = Symbol(name='chunk_size') - body = self.HostIteration(ncollapse=ncollapse, chunk_size=chunk_size, + body = self.HostIteration(ncollapsed=ncollapsed, chunk_size=chunk_size, **root.args) niters = prod([root.symbolic_size] + [j.symbolic_size for j in collapsable]) @@ -510,7 +587,7 @@ def _make_partree(self, candidates, nthreads=None): if self._is_offloadable(root): body = self.DeviceIteration(gpu_fit=self.gpu_fit, - ncollapse=len(collapsable) + 1, + ncollapsed=len(collapsable) + 1, **root.args) partree = ParallelTree([], body, nthreads=nthreads) diff --git a/tests/test_linearize.py b/tests/test_linearize.py index 6cf0f6ce3b..8919df3b44 100644 --- a/tests/test_linearize.py +++ b/tests/test_linearize.py @@ -3,10 +3,10 @@ import scipy.sparse from devito import (Grid, Function, TimeFunction, SparseTimeFunction, Operator, Eq, - MatrixSparseTimeFunction, sin) + Inc, MatrixSparseTimeFunction, sin) from devito.ir import Call, Callable, DummyExpr, Expression, FindNodes, SymbolRegistry from devito.passes import Graph, linearize -from devito.types import Array, Bundle +from devito.types import Array, Bundle, DefaultDimension def test_basic(): @@ -553,3 +553,41 @@ def test_bundle(): y_stride0 = foo.body.body[2].write assert y_stride0.name == 'y_stride0' assert y_stride0 in bar.parameters + + +def test_inc_w_default_dims(): + grid = Grid(shape=(5, 6)) + k = DefaultDimension(name="k", default_value=7) + x, y = grid.dimensions + + f = Function(name="f", grid=grid, dimensions=(x, y, k), + shape=grid.shape + (k._default_value,)) + g = Function(name="g", grid=grid) + + f.data[:] = 1 + + eq = Inc(g, f) + + op = Operator(eq, opt=('advanced', {'linearize': True})) + + # NOTE: Eventually we compare the numerical output, but truly the most + # import check is implicit to op1.apply, and it's the fact that op1 + # actually jit-compiles successfully, with the openmp reduction clause + # getting "linearized" just like everything else in the Operator + op.apply() + + assert np.all(g.data == 7) + + # Similar, but now reducing into a specific item along one Dimension + g.data[:] = 0. + + eq = Inc(g[3, y], f) + + op = Operator(eq, opt=('advanced', {'linearize': True})) + + op.apply() + + assert np.all(g.data[:3] == 0) + assert f.shape[0]*k._default_value == 35 + assert np.all(g.data[3] == f.shape[0]*k._default_value) + assert np.all(g.data[4:] == 0)