From a73e49723ff4e1fa02c159c7a13f4142a8b376ed Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 31 Oct 2023 09:40:50 +0000 Subject: [PATCH 01/18] compiler: Enhance rcompile --- devito/core/gpu.py | 12 +++++++----- devito/operator/operator.py | 6 +++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 7aa24e492f..eb0d0d0dce 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -102,16 +102,18 @@ def _normalize_gpu_fit(cls, **kwargs): return cls.GPU_FIT @classmethod - def _rcompile_wrapper(cls, **kwargs): - options = kwargs['options'] + def _rcompile_wrapper(cls, **kwargs0): + options = kwargs0['options'] - def wrapper(expressions, kwargs=kwargs, mode='default'): + def wrapper(expressions, mode='default', **kwargs1): if mode == 'host': - kwargs = { + kwargs = {**{ 'platform': 'cpu64', 'language': 'C' if options['par-disabled'] else 'openmp', 'compiler': 'custom', - } + }, **kwargs1} + else: + kwargs = {**kwargs0, **kwargs1} return rcompile(expressions, kwargs) return wrapper diff --git a/devito/operator/operator.py b/devito/operator/operator.py index 0fadc5dc47..4476d07bd3 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -266,9 +266,9 @@ def _lower(cls, expressions, **kwargs): return IRs(expressions, clusters, stree, uiet, iet), byproduct @classmethod - def _rcompile_wrapper(cls, **kwargs): - def wrapper(expressions, kwargs=kwargs): - return rcompile(expressions, kwargs) + def _rcompile_wrapper(cls, **kwargs0): + def wrapper(expressions, **kwargs1): + return rcompile(expressions, {**kwargs0, **kwargs1}) return wrapper @classmethod From 50239832035327699bc3380b4a78d8ba1df02a28 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 31 Oct 2023 13:17:28 +0000 Subject: [PATCH 02/18] compiler: Add IET type KernelLaunch --- devito/ir/iet/efunc.py | 32 ++++++++++++++++++++++++++++++-- devito/ir/iet/visitors.py | 21 +++++++++++++++++++++ tests/test_iet.py | 16 +++++++++++++++- 3 files changed, 66 insertions(+), 3 deletions(-) diff --git a/devito/ir/iet/efunc.py b/devito/ir/iet/efunc.py index e55cce67ee..ae835126da 100644 --- a/devito/ir/iet/efunc.py +++ b/devito/ir/iet/efunc.py @@ -7,7 +7,7 @@ __all__ = ['ElementalFunction', 'ElementalCall', 'make_efunc', 'make_callable', 'EntryFunction', 'AsyncCallable', 'AsyncCall', 'ThreadCallable', - 'DeviceFunction', 'DeviceCall'] + 'DeviceFunction', 'DeviceCall', 'KernelLaunch'] # ElementalFunction machinery @@ -157,7 +157,7 @@ def __init__(self, name, body, retval='void', parameters=None, prefix='__global_ class DeviceCall(Call): """ - A call to an external function executed asynchronously on a device. + A call to a function executed asynchronously on a device. """ def __init__(self, name, arguments=None, **kwargs): @@ -175,3 +175,31 @@ def __init__(self, name, arguments=None, **kwargs): processed.append(a) super().__init__(name, arguments=processed, **kwargs) + + +class KernelLaunch(DeviceCall): + + """ + A call to an asynchronous device kernel. + """ + + def __init__(self, name, grid, block, shm=0, stream=None, + arguments=None, writes=None): + super().__init__(name, arguments=arguments, writes=writes) + + # Kernel launch arguments + self.grid = grid + self.block = block + self.shm = shm + self.stream = stream + + def __repr__(self): + return 'Launch[%s]<<<(%s)>>>' % (self.name, + ','.join(str(i.name) for i in self.writes)) + + @cached_property + def functions(self): + launch_args = (self.grid, self.block,) + if self.stream is not None: + launch_args += (self.stream.function,) + return super().functions + launch_args diff --git a/devito/ir/iet/visitors.py b/devito/ir/iet/visitors.py index 07400e7c7f..2657b58a24 100644 --- a/devito/ir/iet/visitors.py +++ b/devito/ir/iet/visitors.py @@ -576,6 +576,19 @@ def visit_HaloSpot(self, o): body = flatten(self._visit(i) for i in o.children) return c.Collection(body) + def visit_KernelLaunch(self, o): + arguments = self._args_call(o.arguments) + arguments = ','.join(arguments) + + launch_args = [o.grid, o.block] + if o.shm is not None: + launch_args.append(o.shm) + if o.stream is not None: + launch_args.append(o.stream) + launch_config = ','.join(str(i) for i in launch_args) + + return c.Statement('%s<<<%s>>>(%s)' % (o.name, launch_config, arguments)) + # Operator-handle machinery def _operator_includes(self, o): @@ -1202,6 +1215,14 @@ def visit_HaloSpot(self, o): visit_ThreadedProdder = visit_Call + def visit_KernelLaunch(self, o): + arguments = [uxreplace(i, self.mapper) for i in o.arguments] + grid = self.mapper.get(o.grid, o.grid) + block = self.mapper.get(o.block, o.block) + stream = self.mapper.get(o.stream, o.stream) + return o._rebuild(grid=grid, block=block, stream=stream, + arguments=arguments) + # Utils diff --git a/tests/test_iet.py b/tests/test_iet.py index eb6ddcc7d7..d148dc1539 100644 --- a/tests/test_iet.py +++ b/tests/test_iet.py @@ -7,7 +7,7 @@ from devito import (Eq, Grid, Function, TimeFunction, Operator, Dimension, # noqa switchconfig) from devito.ir.iet import (Call, Callable, Conditional, DummyExpr, Iteration, List, - Lambda, ElementalFunction, CGen, FindSymbols, + KernelLaunch, Lambda, ElementalFunction, CGen, FindSymbols, filter_iterations, make_efunc, retrieve_iteration_tree) from devito.ir import SymbolRegistry from devito.passes.iet.engine import Graph @@ -330,6 +330,20 @@ def test_templates(): }""" +def test_kernel_launch(): + grid = Grid(shape=(10, 10)) + + u = Function(name='u', grid=grid) + + class Dim3(LocalObject): + dtype = type('dim3', (c_void_p,), {}) + + kl = KernelLaunch('mykernel', Dim3('mygrid'), Dim3('myblock'), + arguments=(u.indexed,)) + + assert str(kl) == 'mykernel<<>>(d_u);' + + def test_codegen_quality0(): grid = Grid(shape=(4, 4, 4)) _, y, z = grid.dimensions From 864cddcf9468e9988426a8062902ff19bf0db48f Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Tue, 31 Oct 2023 14:18:09 +0000 Subject: [PATCH 03/18] compiler: Increase granularity of abstract_object --- devito/passes/iet/engine.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index 09ea56c3d9..a94b92def5 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -256,9 +256,10 @@ def abstract_objects(objects, sregistry=None): # Precedence rules make it possible to reconstruct objects that depend on # higher priority objects priority = { - DiscreteFunction: 1, - AbstractIncrDimension: 2, - BlockDimension: 3, + Array: 1, + DiscreteFunction: 2, + AbstractIncrDimension: 3, + BlockDimension: 4, } def key(i): From 98250060f307d20460e926c1f32f7e3997fdc5ab Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 1 Nov 2023 10:47:26 +0000 Subject: [PATCH 04/18] compiler: Improve Cast to perform automatic simplification --- devito/symbolics/extended_sympy.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/devito/symbolics/extended_sympy.py b/devito/symbolics/extended_sympy.py index 6ad84a4bf5..301d2892dd 100644 --- a/devito/symbolics/extended_sympy.py +++ b/devito/symbolics/extended_sympy.py @@ -375,6 +375,17 @@ class Cast(UnaryOp): __rkwargs__ = ('stars',) def __new__(cls, base, stars=None, **kwargs): + # Attempt simplifcation + # E.g., `FLOAT(32) -> 32.0` of type `sympy.Float` + try: + return sympify(eval(cls._base_typ)(base)) + except (NameError, SyntaxError): + # E.g., `_base_typ` is "char" or "unsigned long" + pass + except TypeError: + # `base` ain't a number + pass + obj = super().__new__(cls, base) obj._stars = stars return obj From 18f3098b4b291b13dae15204ef8584af6ba6cb39 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 1 Nov 2023 10:54:07 +0000 Subject: [PATCH 05/18] tests: Unpickle won't unpick factorization --- tests/test_pickle.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_pickle.py b/tests/test_pickle.py index 26b6ea7f63..03188cd17a 100644 --- a/tests/test_pickle.py +++ b/tests/test_pickle.py @@ -571,6 +571,17 @@ def test_operator_timefunction_w_preallocation(self, pickle): new_op.apply(time_m=1, time_M=1, f=f) assert np.all(f.data[2] == 2) + def test_collected_coeffs(self, pickle): + grid = Grid(shape=(8, 8, 8)) + f = TimeFunction(name='f', grid=grid, space_order=4) + + op = Operator(Eq(f.forward, f.dx2 + 1)) + + pkl_op = pickle.dumps(op) + new_op = pickle.loads(pkl_op) + + assert str(op) == str(new_op) + def test_elemental(self, pickle): """ Tests that elemental functions don't get reconstructed differently. From ecf07c495dccde87b00b575d18f0a0aa1d328a93 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Thu, 2 Nov 2023 09:19:02 +0000 Subject: [PATCH 06/18] compiler: Simplify asyncs lowering --- devito/ir/iet/efunc.py | 12 ++++++++-- devito/passes/iet/asynchrony.py | 42 +++++++++++++++------------------ devito/passes/iet/engine.py | 9 +++++-- devito/types/parallel.py | 4 ++++ 4 files changed, 40 insertions(+), 27 deletions(-) diff --git a/devito/ir/iet/efunc.py b/devito/ir/iet/efunc.py index ae835126da..1911f24227 100644 --- a/devito/ir/iet/efunc.py +++ b/devito/ir/iet/efunc.py @@ -137,8 +137,16 @@ class ThreadCallable(Callable): A Callable executed asynchronously by a thread. """ - def __init__(self, name, body, parameters=None, prefix='static'): - super().__init__(name, body, 'void*', parameters=parameters, prefix=prefix) + def __init__(self, name, body, parameters): + super().__init__(name, body, 'void*', parameters=parameters, prefix='static') + + # Sanity checks + # By construction, the first unpack statement of a ThreadCallable must + # be the PointerCast that makes `sdata` available in the local scope + assert len(body.unpacks) > 0 + v = body.unpacks[0] + assert v.is_PointerCast + self.sdata = v.function # DeviceFunction machinery diff --git a/devito/passes/iet/asynchrony.py b/devito/passes/iet/asynchrony.py index 1350091aca..d1d502fe9e 100644 --- a/devito/passes/iet/asynchrony.py +++ b/devito/passes/iet/asynchrony.py @@ -11,7 +11,7 @@ from devito.passes.iet.engine import iet_pass from devito.symbolics import (CondEq, CondNe, FieldFromComposite, FieldFromPointer, Null) -from devito.tools import DefaultOrderedDict, Bunch, split +from devito.tools import Bunch, split from devito.types import (Lock, Pointer, PThreadArray, QueueID, SharedData, Symbol, VolatileInt) @@ -19,31 +19,26 @@ def pthreadify(graph, **kwargs): - track = DefaultOrderedDict(lambda: Bunch(threads=None, sdata=None)) + lower_async_callables(graph, root=graph.root, **kwargs) - lower_async_callables(graph, track=track, root=graph.root, **kwargs) + track = {i.name: i.sdata for i in graph.efuncs.values() + if isinstance(i, ThreadCallable)} lower_async_calls(graph, track=track, **kwargs) + DataManager(**kwargs).place_definitions(graph) @iet_pass -def lower_async_callables(iet, track=None, root=None, sregistry=None): +def lower_async_callables(iet, root=None, sregistry=None): if not isinstance(iet, AsyncCallable): return iet, {} - n = len(track) - # Determine the max number of threads that can run this `iet` in parallel locks = [i for i in iet.parameters if isinstance(i, Lock)] npthreads = min([i.size for i in locks], default=1) if npthreads > 1: npthreads = sregistry.make_npthreads(npthreads) - # PthreadArray -- the symbol representing an array of pthreads, which will - # execute the AsyncCallable asynchronously - threads = track[iet.name].threads = PThreadArray(name='threads', - npthreads=npthreads) - # The `cfields` are the constant fields, that is the fields whose value # definitely never changes across different executions of `ìet`; the # `ncfields` are instead the non-constant fields, that is the fields whose @@ -58,11 +53,13 @@ def lower_async_callables(iet, track=None, root=None, sregistry=None): # SharedData -- that is the data structure that will be used by the # main thread to pass information down to the child thread(s) - sdata = track[iet.name].sdata = SharedData(name='sdata', - npthreads=threads.size, - cfields=cfields, - ncfields=ncfields, - pname='tsdata%d' % n) + sdata = SharedData( + name='sdata', + npthreads=npthreads, + cfields=cfields, + ncfields=ncfields, + pname=sregistry.make_name(prefix='tsdata') + ) sbase = sdata.symbolic_base # Prepend the SharedData fields available upon thread activation @@ -114,9 +111,7 @@ def lower_async_calls(iet, track=None, sregistry=None): continue assert n.name in track - b = track[n.name] - - sdata = b.sdata + sdata = track[n.name] sbase = sdata.symbolic_base name = sregistry.make_name(prefix='init_%s' % sdata.name) body = [DummyExpr(FieldFromPointer(i._C_name, sbase), i._C_symbol) @@ -131,12 +126,13 @@ def lower_async_calls(iet, track=None, sregistry=None): finalization = [] mapper = {} for n in FindNodes(AsyncCall).visit(iet): - # Create `sdata` and `threads` objects for `n` - b = track[n.name] + # Bind the abstract `sdata` to `n` name = sregistry.make_name(prefix='sdata') - sdata = b.sdata._rebuild(name=name) + sdata = track[n.name]._rebuild(name=name) + + # The pthreads that will execute the AsyncCallable asynchronously name = sregistry.make_name(prefix='threads') - threads = b.threads._rebuild(name=name) + threads = PThreadArray(name=name, npthreads=sdata.npthreads) # Call to `sdata` initialization Callable sbase = sdata.symbolic_base diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index a94b92def5..98362369bf 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -84,8 +84,8 @@ def apply(self, func, **kwargs): continue # Minimize code size by abstracting semantically identical efuncs - efunc, efuncs = reuse_efuncs(efunc, metadata.get('efuncs', []), - self.sregistry) + efuncs = metadata.get('efuncs', []) + efunc, efuncs = reuse_efuncs(efunc, efuncs, self.sregistry) self.efuncs[i] = efunc self.efuncs.update(OrderedDict([(i.name, i) for i in efuncs])) @@ -94,6 +94,11 @@ def apply(self, func, **kwargs): # introduced or removed objects self.efuncs = update_args(efunc, self.efuncs, dag) + # There could be two semantically different efuncs working with + # semantically identical yet syntactically different compiler-generated + # types + #TODO + # Uniqueness self.includes = filter_ordered(self.includes) self.headers = filter_ordered(self.headers, key=str) diff --git a/devito/types/parallel.py b/devito/types/parallel.py index 472ee9e781..d96482e0cf 100644 --- a/devito/types/parallel.py +++ b/devito/types/parallel.py @@ -125,6 +125,10 @@ def dim(self): assert len(self.dimensions) == 1 return self.dimensions[0] + @property + def npthreads(self): + return self.dim.symbolic_size + @property def index(self): if self.size == 1: From 4af722f2227587d0acb5c80361f4860501d90ff2 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 6 Nov 2023 10:53:41 +0000 Subject: [PATCH 07/18] compiler: Add UCHAR and UCHARP for unsigned char casts --- devito/symbolics/extended_sympy.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/devito/symbolics/extended_sympy.py b/devito/symbolics/extended_sympy.py index 301d2892dd..43a72d5753 100644 --- a/devito/symbolics/extended_sympy.py +++ b/devito/symbolics/extended_sympy.py @@ -638,6 +638,10 @@ class USHORT(Cast): _base_typ = 'unsigned short' +class UCHAR(Cast): + _base_typ = 'unsigned char' + + class LONG(Cast): _base_typ = 'long' @@ -654,9 +658,13 @@ class CHARP(CastStar): base = CHAR +class UCHARP(CastStar): + base = UCHAR + + cast_mapper = { np.int8: CHAR, - np.uint8: CHAR, + np.uint8: UCHAR, np.int16: SHORT, # noqa np.uint16: USHORT, # noqa int: INT, # noqa @@ -668,7 +676,7 @@ class CHARP(CastStar): np.float64: DOUBLE, # noqa (np.int8, '*'): CHARP, - (np.uint8, '*'): CHARP, + (np.uint8, '*'): UCHARP, (int, '*'): INTP, # noqa (np.uint16, '*'): INTP, # noqa (np.int16, '*'): INTP, # noqa From 71a3a236ba389acedeed90af573b36e7acb85624 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 6 Nov 2023 14:34:03 +0000 Subject: [PATCH 08/18] compiler: Add CallFromComposite --- devito/symbolics/extended_sympy.py | 24 +++++++++++++++++++----- devito/symbolics/printer.py | 4 ++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/devito/symbolics/extended_sympy.py b/devito/symbolics/extended_sympy.py index 43a72d5753..8d9cbde35d 100644 --- a/devito/symbolics/extended_sympy.py +++ b/devito/symbolics/extended_sympy.py @@ -12,11 +12,12 @@ from devito.finite_differences.elementary import Min, Max from devito.types import Symbol -__all__ = ['CondEq', 'CondNe', 'IntDiv', 'CallFromPointer', 'FieldFromPointer', # noqa - 'FieldFromComposite', 'ListInitializer', 'Byref', 'IndexedPointer', 'Cast', - 'DefFunction', 'InlineIf', 'Keyword', 'String', 'Macro', 'MacroArgument', - 'CustomType', 'Deref', 'INT', 'FLOAT', 'DOUBLE', 'VOID', - 'Null', 'SizeOf', 'rfunc', 'cast_mapper', 'BasicWrapperMixin'] +__all__ = ['CondEq', 'CondNe', 'IntDiv', 'CallFromPointer', # noqa + 'CallFromComposite', 'FieldFromPointer', 'FieldFromComposite', + 'ListInitializer', 'Byref', 'IndexedPointer', 'Cast', 'DefFunction', + 'InlineIf', 'Keyword', 'String', 'Macro', 'MacroArgument', + 'CustomType', 'Deref', 'INT', 'FLOAT', 'DOUBLE', 'VOID', 'Null', + 'SizeOf', 'rfunc', 'cast_mapper', 'BasicWrapperMixin'] class CondEq(sympy.Eq): @@ -221,6 +222,19 @@ def free_symbols(self): __reduce_ex__ = Pickable.__reduce_ex__ +class CallFromComposite(CallFromPointer, Pickable): + + """ + Symbolic representation of the C notation ``composite.call(params)``. + """ + + def __str__(self): + return '%s.%s(%s)' % (self.pointer, self.call, + ", ".join(str(i) for i in as_tuple(self.params))) + + __repr__ = __str__ + + class FieldFromPointer(CallFromPointer, Pickable): """ diff --git a/devito/symbolics/printer.py b/devito/symbolics/printer.py index cf3ba528bc..17c04d8a99 100644 --- a/devito/symbolics/printer.py +++ b/devito/symbolics/printer.py @@ -187,6 +187,10 @@ def _print_CallFromPointer(self, expr): indices = [self._print(i) for i in expr.params] return "%s->%s(%s)" % (expr.pointer, expr.call, ', '.join(indices)) + def _print_CallFromComposite(self, expr): + indices = [self._print(i) for i in expr.params] + return "%s.%s(%s)" % (expr.pointer, expr.call, ', '.join(indices)) + def _print_FieldFromPointer(self, expr): return "%s->%s" % (expr.pointer, expr.field) From 2c6c80ccf5aeb4df5d28e217518f6072ae025485 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 3 Nov 2023 16:15:14 +0000 Subject: [PATCH 09/18] compiler: Enhance automated routine abstraction --- devito/builtins/utils.py | 2 +- devito/ir/iet/efunc.py | 10 +++++ devito/ir/iet/nodes.py | 2 +- devito/operator/operator.py | 3 +- devito/passes/__init__.py | 20 ++++++++++ devito/passes/iet/asynchrony.py | 2 +- devito/passes/iet/definitions.py | 52 +++++--------------------- devito/passes/iet/engine.py | 40 +++++++++++++------- devito/passes/iet/languages/openacc.py | 17 +++++---- devito/types/array.py | 23 +----------- devito/types/basic.py | 25 +++++++++++-- tests/test_gpu_common.py | 36 +++++++++--------- 12 files changed, 122 insertions(+), 110 deletions(-) diff --git a/devito/builtins/utils.py b/devito/builtins/utils.py index 70f590d5de..84c336b15c 100644 --- a/devito/builtins/utils.py +++ b/devito/builtins/utils.py @@ -36,7 +36,7 @@ def __init__(self, *functions, op=dv.mpi.MPI.SUM, dtype=None): def __enter__(self): i = dv.Dimension(name='mri',) self.n = dv.Function(name='n', shape=(1,), dimensions=(i,), - grid=self.grid, dtype=self.dtype) + grid=self.grid, dtype=self.dtype, space='host') self.n.data[0] = 0 return self diff --git a/devito/ir/iet/efunc.py b/devito/ir/iet/efunc.py index 1911f24227..48571701cd 100644 --- a/devito/ir/iet/efunc.py +++ b/devito/ir/iet/efunc.py @@ -106,6 +106,16 @@ def make_callable(name, iet, retval='void', prefix='static'): Utility function to create a Callable from an IET. """ parameters = derive_parameters(iet) + + # TODO: this should be done by `derive_parameters`, and perhaps better, e.g. + # ordering such that TimeFunctions go first, then Functions, etc. However, + # doing it would require updating a *massive* number of tests and notebooks, + # hence for now we limit it here + # NOTE: doing it not just for code aesthetics, but also so that semantically + # identical callables can be abstracted homogeneously irrespective of the + # object names, which dictate the ordering in the callable signature + parameters = sorted(parameters, key=lambda p: str(type(p))) + return Callable(name, iet, retval, parameters=parameters, prefix=prefix) diff --git a/devito/ir/iet/nodes.py b/devito/ir/iet/nodes.py index 793cd6b14d..a3caef47ee 100644 --- a/devito/ir/iet/nodes.py +++ b/devito/ir/iet/nodes.py @@ -21,7 +21,7 @@ Symbol) from devito.types.object import AbstractObject, LocalObject -__all__ = ['Node', 'Block', 'Expression', 'Callable', 'Call', +__all__ = ['Node', 'Block', 'Expression', 'Callable', 'Call', 'ExprStmt', 'Conditional', 'Iteration', 'List', 'Section', 'TimedList', 'Prodder', 'MetaCall', 'PointerCast', 'HaloSpot', 'Definition', 'ExpressionBundle', 'AugmentedExpression', 'Increment', 'Return', 'While', diff --git a/devito/operator/operator.py b/devito/operator/operator.py index 4476d07bd3..3ed9709a1d 100644 --- a/devito/operator/operator.py +++ b/devito/operator/operator.py @@ -447,7 +447,6 @@ def _lower_iet(cls, uiet, profiler=None, **kwargs): * Finalize (e.g., symbol definitions, array casts) """ name = kwargs.get("name", "Kernel") - sregistry = kwargs['sregistry'] # Wrap the IET with an EntryFunction (a special Callable representing # the entry point of the generated library) @@ -455,7 +454,7 @@ def _lower_iet(cls, uiet, profiler=None, **kwargs): iet = EntryFunction(name, uiet, 'int', parameters, ()) # Lower IET to a target-specific IET - graph = Graph(iet, sregistry=sregistry) + graph = Graph(iet, **kwargs) graph = cls._specialize_iet(graph, **kwargs) # Instrument the IET for C-level profiling diff --git a/devito/passes/__init__.py b/devito/passes/__init__.py index 4ea9529f36..66f3edbc98 100644 --- a/devito/passes/__init__.py +++ b/devito/passes/__init__.py @@ -23,6 +23,9 @@ def is_on_device(obj, gpu_fit): except AttributeError: functions = as_tuple(obj) + if any(f._mem_host for f in functions): + return False + fsave = [f for f in functions if isinstance(f, TimeFunction) and is_integer(f.save)] @@ -33,6 +36,23 @@ def is_on_device(obj, gpu_fit): return all(f in gpu_fit for f in fsave) +def needs_transfer(f, gpu_fit): + """ + True if the given object triggers a transfer from/to device memory, + False otherwise. + + Parameters + ---------- + f : Function + The target object. + gpu_fit : list of Function + The Function's which are known to definitely fit in the device memory. This + information is given directly by the user through the compiler option + `gpu-fit` and is propagated down here through the various stages of lowering. + """ + return f._mem_mapped and not f.alias and is_on_device(f, gpu_fit) + + def is_gpu_create(obj, gpu_create): """ True if the given objects are created and not copied in the device memory, diff --git a/devito/passes/iet/asynchrony.py b/devito/passes/iet/asynchrony.py index d1d502fe9e..7fb2abb6dc 100644 --- a/devito/passes/iet/asynchrony.py +++ b/devito/passes/iet/asynchrony.py @@ -11,7 +11,7 @@ from devito.passes.iet.engine import iet_pass from devito.symbolics import (CondEq, CondNe, FieldFromComposite, FieldFromPointer, Null) -from devito.tools import Bunch, split +from devito.tools import split from devito.types import (Lock, Pointer, PThreadArray, QueueID, SharedData, Symbol, VolatileInt) diff --git a/devito/passes/iet/definitions.py b/devito/passes/iet/definitions.py index 8f827da18e..903cb5eb35 100644 --- a/devito/passes/iet/definitions.py +++ b/devito/passes/iet/definitions.py @@ -9,11 +9,10 @@ import numpy as np -from devito.ir import (Block, Call, Definition, DeviceCall, DeviceFunction, - DummyExpr, Return, EntryFunction, FindSymbols, MapExprStmts, - Transformer, make_callable) -from devito.passes import is_on_device, is_gpu_create -from devito.passes.iet.engine import iet_pass, iet_visit +from devito.ir import (Block, Call, Definition, DummyExpr, Return, EntryFunction, + FindSymbols, MapExprStmts, Transformer, make_callable) +from devito.passes import is_gpu_create +from devito.passes.iet.engine import iet_pass from devito.passes.iet.langbase import LangBB from devito.symbolics import (Byref, DefFunction, FieldFromPointer, IndexedPointer, SizeOf, VOID, Keyword, pow_to_mul) @@ -486,33 +485,8 @@ def _map_function_on_high_bw_mem(self, site, obj, storage, devicerm, read_only=F storage.update(obj, site, maps=mmap, unmaps=unmap, efuncs=efunc) - @iet_visit - def derive_transfers(self, iet): - """ - Collect all symbols that cause host-device data transfer, distinguishing - between reads and writes. - """ - - def needs_transfer(f): - return f._mem_mapped and not f.alias and is_on_device(f, self.gpu_fit) - - writes = set() - reads = set() - for i, v in MapExprStmts().visit(iet).items(): - if not any(isinstance(j, self.lang.DeviceIteration) for j in v) and \ - not isinstance(i, DeviceCall) and \ - not isinstance(iet, DeviceFunction): - # Not an offloaded Iteration tree - continue - - writes.update({w for w in i.writes if needs_transfer(w)}) - reads.update({f for f in i.functions - if needs_transfer(f) and f not in writes}) - - return (reads, writes) - @iet_pass - def place_transfers(self, iet, **kwargs): + def place_transfers(self, iet, data_movs=None, **kwargs): """ Create a new IET with host-device data transfers. This requires mapping symbols to the suitable memory spaces. @@ -521,17 +495,12 @@ def place_transfers(self, iet, **kwargs): return iet, {} @singledispatch - def _place_transfers(iet, mapper): + def _place_transfers(iet, data_movs): return iet, {} @_place_transfers.register(EntryFunction) - def _(iet, mapper): - try: - reads, writes = list(zip(*mapper.values())) - except ValueError: - return iet, {} - reads = set(flatten(reads)) - writes = set(flatten(writes)) + def _(iet, data_movs): + reads, writes = data_movs # Special symbol which gives user code control over data deallocations devicerm = DeviceRM() @@ -552,7 +521,7 @@ def _(iet, mapper): return iet, {'efuncs': efuncs} - return _place_transfers(iet, mapper=kwargs['mapper']) + return _place_transfers(iet, data_movs=data_movs) @iet_pass def place_devptr(self, iet, **kwargs): @@ -580,8 +549,7 @@ def process(self, graph): """ Apply the `place_transfers`, `place_definitions` and `place_casts` passes. """ - mapper = self.derive_transfers(graph) - self.place_transfers(graph, mapper=mapper) + self.place_transfers(graph, data_movs=graph.data_movs) self.place_definitions(graph, globs=set()) self.place_devptr(graph) self.place_bundling(graph, writes_input=graph.writes_input) diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index 98362369bf..4c2ea1a2b7 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -1,11 +1,12 @@ from collections import OrderedDict from functools import partial, singledispatch, wraps -from devito.ir.iet import (Call, FindNodes, FindSymbols, MetaCall, Transformer, - EntryFunction, ThreadCallable, Uxreplace, - derive_parameters) +from devito.ir.iet import (Call, ExprStmt, FindNodes, FindSymbols, MetaCall, + Transformer, EntryFunction, ThreadCallable, + Uxreplace, derive_parameters) from devito.ir.support import SymbolRegistry from devito.mpi.distributed import MPINeighborhood +from devito.passes import needs_transfer from devito.tools import DAG, as_tuple, filter_ordered, timed_pass from devito.types import (Array, Bundle, CompositeObject, Lock, IncrDimension, Indirection, Temp) @@ -36,7 +37,7 @@ class Graph(object): The `visit` method collects info about the nodes in the Graph. """ - def __init__(self, iet, sregistry=None): + def __init__(self, iet, options=None, sregistry=None, **kwargs): self.efuncs = OrderedDict([(iet.name, iet)]) self.sregistry = sregistry @@ -45,10 +46,23 @@ def __init__(self, iet, sregistry=None): self.headers = [] self.globals = [] - # Stash immutable information useful for some compiler passes + # Stash immutable information useful for one or more compiler passes + + # All written user-level objects writes = FindSymbols('writes').visit(iet) self.writes_input = frozenset(f for f in writes if f.is_Input) + # All symbols requiring host-device data transfers when running + # on device + self.data_movs = rmovs, wmovs = set(), set() + gpu_fit = (options or {}).get('gpu-fit', ()) + for i in FindNodes(ExprStmt).visit(iet): + wmovs.update({w for w in i.writes + if needs_transfer(w, gpu_fit)}) + for i in FindNodes(ExprStmt).visit(iet): + rmovs.update({f for f in i.functions + if needs_transfer(f, gpu_fit) and f not in wmovs}) + @property def root(self): return self.efuncs[list(self.efuncs).pop(0)] @@ -84,7 +98,7 @@ def apply(self, func, **kwargs): continue # Minimize code size by abstracting semantically identical efuncs - efuncs = metadata.get('efuncs', []) + efuncs = metadata.get('efuncs', []) efunc, efuncs = reuse_efuncs(efunc, efuncs, self.sregistry) self.efuncs[i] = efunc @@ -94,11 +108,6 @@ def apply(self, func, **kwargs): # introduced or removed objects self.efuncs = update_args(efunc, self.efuncs, dag) - # There could be two semantically different efuncs working with - # semantically identical yet syntactically different compiler-generated - # types - #TODO - # Uniqueness self.includes = filter_ordered(self.includes) self.headers = filter_ordered(self.headers, key=str) @@ -270,8 +279,11 @@ def abstract_objects(objects, sregistry=None): def key(i): for cls in sorted(priority, key=priority.get, reverse=True): if isinstance(i, cls): - return priority[cls] - return 0 + v = priority[cls] + break + else: + v = 0 + return (v, str(type(i))) objects = sorted(objects, key=key, reverse=True) @@ -317,7 +329,7 @@ def _(i, mapper, sregistry): else: name = sregistry.make_name(prefix='a') - v = i._rebuild(name=name) + v = i._rebuild(name=name, alias=True) mapper.update({ i: v, diff --git a/devito/passes/iet/languages/openacc.py b/devito/passes/iet/languages/openacc.py index 89cb60252a..186a106211 100644 --- a/devito/passes/iet/languages/openacc.py +++ b/devito/passes/iet/languages/openacc.py @@ -4,7 +4,7 @@ from devito.arch import AMDGPUX, NVIDIAX from devito.ir import (Call, DeviceCall, DummyExpr, EntryFunction, List, Block, ParallelTree, Pragma, Return, FindSymbols, make_callable) -from devito.passes import is_on_device +from devito.passes import needs_transfer, is_on_device from devito.passes.iet.definitions import DeviceAwareDataManager from devito.passes.iet.engine import iet_pass from devito.passes.iet.orchestration import Orchestrator @@ -14,7 +14,7 @@ from devito.passes.iet.languages.openmp import OmpRegion, OmpIteration from devito.symbolics import FieldFromPointer, Macro, cast_mapper from devito.tools import filter_ordered, UnboundTuple -from devito.types import DeviceMap, Symbol +from devito.types import Symbol __all__ = ['DeviceAccizer', 'DeviceAccDataManager', 'AccOrchestrator'] @@ -214,13 +214,13 @@ def place_devptr(self, iet, **kwargs): if not isinstance(iet, EntryFunction): return iet, {} - dmaps = [i for i in FindSymbols('basics').visit(iet) - if isinstance(i, DeviceMap)] + symbols = FindSymbols('basics').visit(iet) + functions = [f for f in symbols if needs_transfer(f, self.gpu_fit)] efuncs = [] calls = [] - for dmap in filter_ordered(dmaps): - f = dmap.function + for f in functions: + dmap = f.dmap hp = f.indexed tdp = Symbol(name="dptr", dtype=np.uint64) @@ -242,7 +242,10 @@ def place_devptr(self, iet, **kwargs): name = self.sregistry.make_name(prefix='map_device_ptr') efuncs.append(make_callable(name, body, retval=hp)) - calls.append(Call(name, f, retobj=dmap)) + if dmap in symbols: + calls.append(Call(name, f, retobj=dmap)) + else: + calls.append(Call(name, f)) body = iet.body._rebuild(maps=iet.body.maps + tuple(calls)) iet = iet._rebuild(body=body) diff --git a/devito/types/array.py b/devito/types/array.py index 539d4337dd..e790a39c52 100644 --- a/devito/types/array.py +++ b/devito/types/array.py @@ -104,11 +104,11 @@ class Array(ArrayBasic): is_Array = True __rkwargs__ = (AbstractFunction.__rkwargs__ + - ('dimensions', 'liveness', 'space', 'scope', 'initvalue')) + ('dimensions', 'liveness', 'scope', 'initvalue')) def __new__(cls, *args, **kwargs): kwargs.update({'options': {'evaluate': False}}) - space = kwargs.get('space', 'local') + space = kwargs.setdefault('space', 'local') if cls is Array and space == 'mapped': return AbstractFunction.__new__(ArrayMapped, *args, **kwargs) @@ -121,9 +121,6 @@ def __init_finalize__(self, *args, **kwargs): self._liveness = kwargs.get('liveness', 'lazy') assert self._liveness in ['eager', 'lazy'] - self._space = kwargs.get('space', 'local') - assert self._space in ['local', 'mapped', 'host'] - self._scope = kwargs.get('scope', 'heap') assert self._scope in ['heap', 'stack', 'static', 'constant', 'shared'] @@ -171,10 +168,6 @@ def __dtype_setup__(cls, **kwargs): def liveness(self): return self._liveness - @property - def space(self): - return self._space - @property def scope(self): return self._scope @@ -191,18 +184,6 @@ def _mem_internal_eager(self): def _mem_internal_lazy(self): return self._liveness == 'lazy' - @property - def _mem_local(self): - return self._space == 'local' - - @property - def _mem_mapped(self): - return self._space == 'mapped' - - @property - def _mem_host(self): - return self._space == 'host' - @property def _mem_stack(self): return self._scope in ('stack', 'shared') diff --git a/devito/types/basic.py b/devito/types/basic.py index 102bfe3a75..c5c0022b00 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -809,7 +809,8 @@ class AbstractFunction(sympy.Function, Basic, Pickable, Evaluable): True if data is allocated as a single, contiguous chunk of memory. """ - __rkwargs__ = ('name', 'dtype', 'grid', 'halo', 'padding', 'alias', 'function') + __rkwargs__ = ('name', 'dtype', 'grid', 'halo', 'padding', 'alias', + 'space', 'function') def __new__(cls, *args, **kwargs): # Preprocess arguments @@ -911,12 +912,18 @@ def __init_finalize__(self, *args, **kwargs): self._distributor = self.__distributor_setup__(**kwargs) # Symbol properties - # "Aliasing" another DiscreteFunction means that `self` logically + + # "Aliasing" another AbstractFunction means that `self` logically # represents another object. For example, `self` might be used as the # formal parameter of a routine generated by the compiler, where the # routines is applied to several actual DiscreteFunctions self._alias = kwargs.get('alias', False) + # The memory space of the AbstractFunction + # See `_mem_{local,mapped,host}.__doc__` for more info + self._space = kwargs.get('space', 'mapped') + assert self._space in ['local', 'mapped', 'host'] + @classmethod def __args_setup__(cls, *args, **kwargs): """ @@ -1138,6 +1145,10 @@ def is_const(self): def alias(self): return self._alias + @property + def space(self): + return self._space + @property def _C_name(self): return "%s_vec" % self.name @@ -1146,9 +1157,17 @@ def _C_name(self): def _C_symbol(self): return BoundSymbol(name=self._C_name, dtype=self.dtype, function=self.function) + @property + def _mem_local(self): + return self._space == 'local' + @property def _mem_mapped(self): - return not self._mem_local + return self._space == 'mapped' + + @property + def _mem_host(self): + return self._space == 'host' def _make_pointer(self): """Generate a symbolic pointer to self.""" diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index c6594305fc..ad4f3f3b84 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -257,7 +257,7 @@ def test_tasking_unfused_two_locks(self): # Check generated code assert len(retrieve_iteration_tree(op)) == 3 - assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 2 + assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 3 sections = FindNodes(Section).visit(op) assert len(sections) == 4 assert (str(sections[1].body[0].body[0].body[0].body[0]) == @@ -440,7 +440,7 @@ def test_streaming_basic(self, opt, ntmps): op = Operator(eqn, opt=opt) # Check generated code - assert len(op._func_table) == 5 + assert len(op._func_table) == 6 assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == ntmps op.apply(time_M=nt-2) @@ -449,8 +449,8 @@ def test_streaming_basic(self, opt, ntmps): assert np.all(u.data[1] == 36) @pytest.mark.parametrize('opt,ntmps,nfuncs', [ - (('buffering', 'streaming', 'orchestrate'), 8, 5), - (('buffering', 'streaming', 'fuse', 'orchestrate', {'fuse-tasks': True}), 6, 5), + (('buffering', 'streaming', 'orchestrate'), 10, 6), + (('buffering', 'streaming', 'fuse', 'orchestrate', {'fuse-tasks': True}), 7, 6), ]) def test_streaming_two_buffers(self, opt, ntmps, nfuncs): nt = 10 @@ -605,7 +605,7 @@ def test_streaming_multi_input(self, opt, ntmps): op1 = Operator(eqn, opt=opt) # Check generated code - assert len(op1._func_table) == 5 + assert len(op1._func_table) == 6 assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == ntmps op0.apply(time_M=nt-2, dt=0.1) @@ -692,7 +692,7 @@ def test_streaming_postponed_deletion(self, opt, ntmps): op1 = Operator(eqns, opt=opt) # Check generated code - assert len(op1._func_table) == 5 + assert len(op1._func_table) == 6 assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == ntmps op0.apply(time_M=nt-1) @@ -761,7 +761,7 @@ def test_composite_buffering_tasking_multi_output(self): assert len(retrieve_iteration_tree(op1)) == 8 assert len(retrieve_iteration_tree(op2)) == 5 symbols = FindSymbols().visit(op1) - assert len([i for i in symbols if isinstance(i, Lock)]) == 2 + assert len([i for i in symbols if isinstance(i, Lock)]) == 3 threads = [i for i in symbols if isinstance(i, PThreadArray)] assert len(threads) == 2 assert threads[0].size.size == async_degree @@ -775,7 +775,7 @@ def test_composite_buffering_tasking_multi_output(self): # It is true that the usave and vsave eqns are separated in two different # loop nests, but they eventually get mapped to the same pair of efuncs, # since devito attempts to maximize code reuse - assert len(op1._func_table) == 4 + assert len(op1._func_table) == 5 # Check output op0.apply(time_M=nt-1) @@ -944,7 +944,7 @@ def test_save_multi_output(self): # The `usave` and `vsave` eqns are in separate tasks, but the tasks # are identical, so they get mapped to the same efuncs (init + copy) # There also are two extra functions to allocate and free arrays - assert len(op._func_table) == 4 + assert len(op._func_table) == 5 op.apply(time_M=nt-1) @@ -1000,7 +1000,7 @@ def test_save_w_nonaffine_time(self): # We just check the generated code here assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 - assert len(op._func_table) == 4 + assert len(op._func_table) == 5 def test_save_w_subdims(self): nt = 10 @@ -1057,7 +1057,7 @@ def test_streaming_w_shifting(self, opt, ntmps): op = Operator(eqns, opt=opt) # Check generated code - assert len(op._func_table) == 5 + assert len(op._func_table) == 6 assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == ntmps # From time_m=15 to time_M=35 with a factor=5 -- it means that, thanks @@ -1112,12 +1112,12 @@ def test_streaming_complete(self): {'fuse-tasks': True})) # Check generated code - assert len(op1._func_table) == 8 - assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == 6 - assert len(op2._func_table) == 8 - assert len([i for i in FindSymbols().visit(op2) if i.is_Array]) == 6 - assert len(op3._func_table) == 6 - assert len([i for i in FindSymbols().visit(op3) if i.is_Array]) == 6 + assert len(op1._func_table) == 9 + assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == 7 + assert len(op2._func_table) == 9 + assert len([i for i in FindSymbols().visit(op2) if i.is_Array]) == 7 + assert len(op3._func_table) == 7 + assert len([i for i in FindSymbols().visit(op3) if i.is_Array]) == 7 op0.apply(time_m=15, time_M=35, save_shift=0) op1.apply(time_m=15, time_M=35, save_shift=0, u=u1) @@ -1280,7 +1280,7 @@ def test_gpu_create_backward(self): assert 'create(u' in str(op) elif language == 'openmp': assert 'map(alloc: u' in str(op) - assert 'init0(u_vec' in str(op) + assert 'init0' in op._func_table op.apply(time_M=nt - 2) From 65bc3effdaaeb0502818a188fbc8c482f0cc5c29 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 6 Nov 2023 15:38:48 +0000 Subject: [PATCH 10/18] compiler: Avoid useless alignments --- devito/ir/iet/visitors.py | 14 ++++++-------- devito/types/parallel.py | 6 ++++++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/devito/ir/iet/visitors.py b/devito/ir/iet/visitors.py index 2657b58a24..efe139e954 100644 --- a/devito/ir/iet/visitors.py +++ b/devito/ir/iet/visitors.py @@ -375,7 +375,7 @@ def visit_PointerCast(self, o): else: rshape = '*' lvalue = c.Value(i._C_typedata, '*%s' % v) - if o.alignment: + if o.alignment and f._data_alignment: lvalue = c.AlignedAttribute(f._data_alignment, lvalue) # rvalue @@ -406,15 +406,13 @@ def visit_Dereference(self, o): shape = ''.join("[%s]" % ccode(i) for i in a0.symbolic_shape[1:]) rvalue = '(%s (*)%s) %s[%s]' % (i._C_typedata, shape, a1.name, a1.dim.name) - lvalue = c.AlignedAttribute( - a0._data_alignment, - c.Value(i._C_typedata, '(*restrict %s)%s' % (a0.name, shape)) - ) + lvalue = c.Value(i._C_typedata, + '(*restrict %s)%s' % (a0.name, shape)) else: rvalue = '(%s *) %s[%s]' % (i._C_typedata, a1.name, a1.dim.name) - lvalue = c.AlignedAttribute( - a0._data_alignment, c.Value(i._C_typedata, '*restrict %s' % a0.name) - ) + lvalue = c.Value(i._C_typedata, '*restrict %s' % a0.name) + if a0._data_alignment: + lvalue = c.AlignedAttribute(a0._data_alignment, lvalue) else: rvalue = '%s->%s' % (a1.name, a0._C_name) lvalue = self._gen_value(a0, 0) diff --git a/devito/types/parallel.py b/devito/types/parallel.py index d96482e0cf..7a55125f57 100644 --- a/devito/types/parallel.py +++ b/devito/types/parallel.py @@ -111,6 +111,9 @@ def nthreads(self): class ThreadArray(ArrayObject): + # Not a performance-sensitive object + _data_alignment = False + @classmethod def __indices_setup__(cls, **kwargs): try: @@ -208,6 +211,9 @@ class Lock(Array): is_volatile = True + # Not a performance-sensitive object + _data_alignment = False + def __init_finalize__(self, *args, **kwargs): kwargs.setdefault('scope', 'stack') From 56f485ecfe525785ed5d147b3a27028263664db4 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 6 Nov 2023 15:53:18 +0000 Subject: [PATCH 11/18] compiler: Add blank lines where appropriate --- devito/ir/iet/visitors.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/devito/ir/iet/visitors.py b/devito/ir/iet/visitors.py index efe139e954..33fe49d166 100644 --- a/devito/ir/iet/visitors.py +++ b/devito/ir/iet/visitors.py @@ -324,7 +324,9 @@ def _blankline_logic(self, children): prev is ExpressionBundle and all(i.dim.is_Stencil for i in g)): rebuilt.extend(g) - elif prev in candidates and k in candidates: + elif (prev in candidates and k in candidates) or \ + (prev is not None and k is Section) or \ + prev is Section: rebuilt.append(BlankLine) rebuilt.extend(g) else: From cbe9448e305133771b888b3830652d0b2ffe73e3 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 6 Nov 2023 17:11:28 +0000 Subject: [PATCH 12/18] compiler: Trim Section code gen --- FAQ.md | 11 +- devito/ir/iet/nodes.py | 14 +- devito/ir/iet/visitors.py | 10 +- examples/cfd/01_convection.ipynb | 22 +-- examples/cfd/01_convection_revisited.ipynb | 22 +-- examples/compiler/01_data_regions.ipynb | 30 ++-- examples/compiler/03_iet-A.ipynb | 18 +-- examples/mpi/overview.ipynb | 20 +-- examples/performance/00_overview.ipynb | 144 +++++++----------- examples/performance/01_gpu.ipynb | 10 +- .../sa_01_iso_implementation1.ipynb | 8 - examples/userapi/01_dsl.ipynb | 10 +- examples/userapi/02_apply.ipynb | 4 +- .../userapi/05_conditional_dimension.ipynb | 36 ++--- tests/test_linearize.py | 4 +- 15 files changed, 131 insertions(+), 232 deletions(-) diff --git a/FAQ.md b/FAQ.md index d5ca595505..4b919483d5 100644 --- a/FAQ.md +++ b/FAQ.md @@ -84,19 +84,18 @@ int Kernel(struct dataobj *restrict f_vec, struct dataobj *restrict g_vec, const { float (*restrict f) __attribute__ ((aligned (64))) = (float (*)) f_vec->data; float (*restrict g) __attribute__ ((aligned (64))) = (float (*)) g_vec->data; + /* Flush denormal numbers to zero in hardware */ _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); - struct timeval start_section0, end_section0; - gettimeofday(&start_section0, NULL); - /* Begin section0 */ + + START(section0) for (int x = x_m; x <= x_M; x += 1) { g[x + 8] = (3.57142857e-3F*(f[x + 4] - f[x + 12]) + 3.80952381e-2F*(-f[x + 5] + f[x + 11]) + 2.0e-1F*(f[x + 6] - f[x + 10]) + 8.0e-1F*(-f[x + 7] + f[x + 9]))/h_x; } - /* End section0 */ - gettimeofday(&end_section0, NULL); - timers->section0 += (double)(end_section0.tv_sec-start_section0.tv_sec)+(double)(end_section0.tv_usec-start_section0.tv_usec)/1000000; + STOP(section0,timers) + return 0; } ``` diff --git a/devito/ir/iet/nodes.py b/devito/ir/iet/nodes.py index a3caef47ee..b9e5cfb0cb 100644 --- a/devito/ir/iet/nodes.py +++ b/devito/ir/iet/nodes.py @@ -850,20 +850,20 @@ def __init__(self, timer, lname, body): self._name = lname self._timer = timer - super().__init__(header=c.Line('START_TIMER(%s)' % lname), + super().__init__(header=c.Line('START(%s)' % lname), body=body, - footer=c.Line('STOP_TIMER(%s,%s)' % (lname, timer.name))) + footer=c.Line('STOP(%s,%s)' % (lname, timer.name))) @classmethod def _start_timer_header(cls): - return ('START_TIMER(S)', ('struct timeval start_ ## S , end_ ## S ; ' - 'gettimeofday(&start_ ## S , NULL);')) + return ('START(S)', ('struct timeval start_ ## S , end_ ## S ; ' + 'gettimeofday(&start_ ## S , NULL);')) @classmethod def _stop_timer_header(cls): - return ('STOP_TIMER(S,T)', ('gettimeofday(&end_ ## S, NULL); T->S += (double)' - '(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)' - '(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;')) + return ('STOP(S,T)', ('gettimeofday(&end_ ## S, NULL); T->S += (double)' + '(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)' + '(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;')) @property def name(self): diff --git a/devito/ir/iet/visitors.py b/devito/ir/iet/visitors.py index 33fe49d166..e9749ad769 100644 --- a/devito/ir/iet/visitors.py +++ b/devito/ir/iet/visitors.py @@ -326,7 +326,7 @@ def _blankline_logic(self, children): rebuilt.extend(g) elif (prev in candidates and k in candidates) or \ (prev is not None and k is Section) or \ - prev is Section: + (prev is Section): rebuilt.append(BlankLine) rebuilt.extend(g) else: @@ -430,13 +430,7 @@ def visit_List(self, o): def visit_Section(self, o): body = flatten(self._visit(i) for i in o.children) - if o.is_subsection: - header = [] - footer = [] - else: - header = [c.Comment("Begin %s" % o.name)] - footer = [c.Comment("End %s" % o.name)] - return c.Module(header + body + footer) + return c.Module(body) def visit_Return(self, o): v = 'return' diff --git a/examples/cfd/01_convection.ipynb b/examples/cfd/01_convection.ipynb index 6a877eb125..3d91298007 100644 --- a/examples/cfd/01_convection.ipynb +++ b/examples/cfd/01_convection.ipynb @@ -420,8 +420,8 @@ "output_type": "stream", "text": [ "#define _POSIX_C_SOURCE 200809L\n", - "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", - "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", + "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "\n", "#include \"stdlib.h\"\n", "#include \"math.h\"\n", @@ -453,8 +453,7 @@ "\n", " for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", " {\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", " for (int y = y_m; y <= y_M; y += 1)\n", @@ -462,30 +461,25 @@ " u[t1][x + 1][y + 1] = dt*(-(-u[t0][x][y + 1]/h_x + u[t0][x + 1][y + 1]/h_x) - (-u[t0][x + 1][y]/h_y + u[t0][x + 1][y + 1]/h_y) + u[t0][x + 1][y + 1]/dt);\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "\n", - " /* Begin section1 */\n", - " START_TIMER(section1)\n", + " START(section1)\n", " for (int y = y_m; y <= y_M; y += 1)\n", " {\n", " u[t1][1][y + 1] = 1.00000000000000F;\n", "\n", " u[t1][81][y + 1] = 1.00000000000000F;\n", " }\n", - " STOP_TIMER(section1,timers)\n", - " /* End section1 */\n", + " STOP(section1,timers)\n", "\n", - " /* Begin section2 */\n", - " START_TIMER(section2)\n", + " START(section2)\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", " u[t1][x + 1][81] = 1.00000000000000F;\n", "\n", " u[t1][x + 1][1] = 1.00000000000000F;\n", " }\n", - " STOP_TIMER(section2,timers)\n", - " /* End section2 */\n", + " STOP(section2,timers)\n", " }\n", "\n", " return 0;\n", diff --git a/examples/cfd/01_convection_revisited.ipynb b/examples/cfd/01_convection_revisited.ipynb index ad59bc561a..c03e992300 100644 --- a/examples/cfd/01_convection_revisited.ipynb +++ b/examples/cfd/01_convection_revisited.ipynb @@ -395,8 +395,8 @@ "output_type": "stream", "text": [ "#define _POSIX_C_SOURCE 200809L\n", - "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", - "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", + "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "\n", "#include \"stdlib.h\"\n", "#include \"math.h\"\n", @@ -428,8 +428,7 @@ "\n", " for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", " {\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " for (int i0x = i0x_ltkn + x_m; i0x <= -i0x_rtkn + x_M; i0x += 1)\n", " {\n", " for (int i0y = i0y_ltkn + y_m; i0y <= -i0y_rtkn + y_M; i0y += 1)\n", @@ -437,30 +436,25 @@ " u[t1][i0x + 1][i0y + 1] = dt*(-(-u[t0][i0x][i0y + 1]/h_x + u[t0][i0x + 1][i0y + 1]/h_x) - (-u[t0][i0x + 1][i0y]/h_y + u[t0][i0x + 1][i0y + 1]/h_y) + u[t0][i0x + 1][i0y + 1]/dt);\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "\n", - " /* Begin section1 */\n", - " START_TIMER(section1)\n", + " START(section1)\n", " for (int y = y_m; y <= y_M; y += 1)\n", " {\n", " u[t1][1][y + 1] = 1.00000000000000F;\n", "\n", " u[t1][81][y + 1] = 1.00000000000000F;\n", " }\n", - " STOP_TIMER(section1,timers)\n", - " /* End section1 */\n", + " STOP(section1,timers)\n", "\n", - " /* Begin section2 */\n", - " START_TIMER(section2)\n", + " START(section2)\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", " u[t1][x + 1][81] = 1.00000000000000F;\n", "\n", " u[t1][x + 1][1] = 1.00000000000000F;\n", " }\n", - " STOP_TIMER(section2,timers)\n", - " /* End section2 */\n", + " STOP(section2,timers)\n", " }\n", "\n", " return 0;\n", diff --git a/examples/compiler/01_data_regions.ipynb b/examples/compiler/01_data_regions.ipynb index 9d3465f0de..a2b9016dbd 100644 --- a/examples/compiler/01_data_regions.ipynb +++ b/examples/compiler/01_data_regions.ipynb @@ -94,8 +94,8 @@ "output_type": "stream", "text": [ "#define _POSIX_C_SOURCE 200809L\n", - "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", - "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", + "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "\n", "#include \"stdlib.h\"\n", "#include \"math.h\"\n", @@ -125,8 +125,7 @@ "\n", " for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", " {\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", " for (int y = y_m; y <= y_M; y += 1)\n", @@ -134,8 +133,7 @@ " u[t1][x + 1][y + 1] = u[t0][x + 1][y + 1] + 2;\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", " }\n", "\n", " return 0;\n", @@ -325,8 +323,8 @@ "output_type": "stream", "text": [ "#define _POSIX_C_SOURCE 200809L\n", - "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", - "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", + "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "\n", "#include \"stdlib.h\"\n", "#include \"math.h\"\n", @@ -356,8 +354,7 @@ "\n", " for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", " {\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", " for (int y = y_m; y <= y_M; y += 1)\n", @@ -365,8 +362,7 @@ " u_new[t1][x + 3][y + 3] = u_new[t0][x + 3][y + 3] + 2;\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", " }\n", "\n", " return 0;\n", @@ -445,8 +441,8 @@ "output_type": "stream", "text": [ "#define _POSIX_C_SOURCE 200809L\n", - "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", - "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", + "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "\n", "#include \"stdlib.h\"\n", "#include \"math.h\"\n", @@ -476,8 +472,7 @@ "\n", " for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", " {\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", " for (int y = y_m; y <= y_M; y += 1)\n", @@ -485,8 +480,7 @@ " u_pad[t1][x + 2][y + 2] = u_pad[t0][x + 2][y + 2] + 2;\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", " }\n", "\n", " return 0;\n", diff --git a/examples/compiler/03_iet-A.ipynb b/examples/compiler/03_iet-A.ipynb index 5ba4133972..8f8e26be34 100644 --- a/examples/compiler/03_iet-A.ipynb +++ b/examples/compiler/03_iet-A.ipynb @@ -125,8 +125,8 @@ "output_type": "stream", "text": [ "#define _POSIX_C_SOURCE 200809L\n", - "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", - "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", + "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "\n", "#include \"stdlib.h\"\n", "#include \"math.h\"\n", @@ -156,8 +156,7 @@ "\n", " for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", " {\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", " for (int y = y_m; y <= y_M; y += 1)\n", @@ -165,8 +164,7 @@ " u[t1][x + 1][y + 1] = u[t0][x + 1][y + 1] + 1;\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", " }\n", "\n", " return 0;\n", @@ -240,7 +238,7 @@ { "data": { "text/plain": [ - "OrderedSet([('_POSIX_C_SOURCE', '200809L'), ('START_TIMER(S)', 'struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);'), ('STOP_TIMER(S,T)', 'gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;')])" + "OrderedSet([('_POSIX_C_SOURCE', '200809L'), ('START(S)', 'struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);'), ('STOP(S,T)', 'gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;')])" ] }, "execution_count": 8, @@ -327,8 +325,7 @@ "text": [ "for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", "{\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", " for (int y = y_m; y <= y_M; y += 1)\n", @@ -336,8 +333,7 @@ " u[t1][x + 1][y + 1] = u[t0][x + 1][y + 1] + 1;\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "}\n" ] } diff --git a/examples/mpi/overview.ipynb b/examples/mpi/overview.ipynb index 3f535748f8..7b4647814e 100644 --- a/examples/mpi/overview.ipynb +++ b/examples/mpi/overview.ipynb @@ -323,8 +323,8 @@ "data": { "text/plain": [ "[stdout:0] #define _POSIX_C_SOURCE 200809L\n", - "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", - "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", + "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "\n", "#include \"stdlib.h\"\n", "#include \"math.h\"\n", @@ -361,8 +361,7 @@ "\n", " for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", " {\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", " #pragma omp simd aligned(u:64)\n", @@ -371,8 +370,7 @@ " u[t1][x + 2][y + 2] = u[t0][x + 2][y + 2] + 1;\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", " }\n", "\n", " return 0;\n", @@ -417,8 +415,8 @@ "data": { "text/plain": [ "[stdout:0] #define _POSIX_C_SOURCE 200809L\n", - "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", - "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", + "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "\n", "#include \"stdlib.h\"\n", "#include \"math.h\"\n", @@ -468,8 +466,7 @@ "\n", " for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", " {\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " haloupdate0(u_vec,comm,nb,t0);\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", @@ -479,8 +476,7 @@ " u[t1][x + 2][y + 2] = r0*(-u[t0][x + 2][y + 2]) + r0*u[t0][x + 3][y + 2] + 1;\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", " }\n", "\n", " return 0;\n", diff --git a/examples/performance/00_overview.ipynb b/examples/performance/00_overview.ipynb index 8d8a75e922..0037489709 100644 --- a/examples/performance/00_overview.ipynb +++ b/examples/performance/00_overview.ipynb @@ -198,8 +198,7 @@ "text": [ "for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", "{\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " #pragma omp for collapse(3) schedule(dynamic,1)\n", @@ -214,8 +213,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "}\n" ] } @@ -275,8 +273,7 @@ "text": [ "for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", "{\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " #pragma omp for collapse(3) schedule(static,1)\n", @@ -291,8 +288,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "}\n" ] } @@ -331,8 +327,7 @@ "text": [ "for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", "{\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " #pragma omp for collapse(2) schedule(dynamic,1)\n", @@ -353,8 +348,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "}\n" ] } @@ -415,8 +409,7 @@ "text": [ "for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", "{\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " #pragma omp for collapse(3) schedule(dynamic,1)\n", @@ -449,8 +442,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "}\n" ] } @@ -501,8 +493,7 @@ "text": [ "float r1 = 1.0F/h_y;\n", "\n", - "/* Begin section0 */\n", - "START_TIMER(section0)\n", + "START(section0)\n", "#pragma omp parallel num_threads(nthreads)\n", "{\n", " #pragma omp for collapse(3) schedule(static,1)\n", @@ -517,13 +508,11 @@ " }\n", " }\n", "}\n", - "STOP_TIMER(section0,timers)\n", - "/* End section0 */\n", + "STOP(section0,timers)\n", "\n", "for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", "{\n", - " /* Begin section1 */\n", - " START_TIMER(section1)\n", + " START(section1)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " #pragma omp for collapse(3) schedule(dynamic,1)\n", @@ -538,8 +527,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section1,timers)\n", - " /* End section1 */\n", + " STOP(section1,timers)\n", "}\n" ] } @@ -575,7 +563,7 @@ "text": [ "--- \n", "+++ \n", - "@@ -43,7 +43,8 @@\n", + "@@ -42,7 +42,8 @@\n", " {\n", " for (int z = z_m; z <= z_M; z += 1)\n", " {\n", @@ -612,7 +600,7 @@ "text": [ "--- \n", "+++ \n", - "@@ -44,7 +44,7 @@\n", + "@@ -43,7 +43,7 @@\n", " for (int z = z_m; z <= z_M; z += 1)\n", " {\n", " float r0 = 1.0F/h_y;\n", @@ -648,7 +636,7 @@ "text": [ "--- \n", "+++ \n", - "@@ -44,7 +44,7 @@\n", + "@@ -43,7 +43,7 @@\n", " for (int z = z_m; z <= z_M; z += 1)\n", " {\n", " float r0 = 1.0F/h_y;\n", @@ -709,8 +697,7 @@ "text": [ "for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", "{\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " const int tid = omp_get_thread_num();\n", @@ -737,8 +724,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "}\n" ] } @@ -771,8 +757,7 @@ "text": [ "for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", "{\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " #pragma omp for collapse(3) schedule(dynamic,1)\n", @@ -803,8 +788,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "}\n" ] } @@ -833,8 +817,7 @@ "text": [ "for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", "{\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " #pragma omp for collapse(2) schedule(dynamic,1)\n", @@ -876,8 +859,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "}\n" ] } @@ -926,8 +908,7 @@ "text": [ "for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", "{\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " #pragma omp for collapse(3) schedule(dynamic,1)\n", @@ -942,8 +923,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "}\n" ] } @@ -984,8 +964,7 @@ "text": [ "for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", "{\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " const int tid = omp_get_thread_num();\n", @@ -1012,8 +991,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "}\n" ] } @@ -1061,8 +1039,7 @@ "text": [ "for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", "{\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " #pragma omp for collapse(3) schedule(dynamic,1)\n", @@ -1077,8 +1054,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "}\n" ] } @@ -1113,8 +1089,7 @@ "text": [ "for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", "{\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " #pragma omp for collapse(3) schedule(dynamic,1)\n", @@ -1143,8 +1118,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "}\n" ] } @@ -1184,8 +1158,8 @@ "output_type": "stream", "text": [ "#define _POSIX_C_SOURCE 200809L\n", - "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", - "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", + "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "#define MIN(a,b) (((a) < (b)) ? (a) : (b))\n", "\n", "#include \"stdlib.h\"\n", @@ -1237,8 +1211,7 @@ "\n", " float r1 = 1.0F/h_y;\n", "\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " #pragma omp for collapse(2) schedule(static,1)\n", @@ -1254,13 +1227,11 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "\n", " for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", " {\n", - " /* Begin section1 */\n", - " START_TIMER(section1)\n", + " START(section1)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " const int tid = omp_get_thread_num();\n", @@ -1293,8 +1264,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section1,timers)\n", - " /* End section1 */\n", + " STOP(section1,timers)\n", " }\n", "\n", " #pragma omp parallel num_threads(nthreads)\n", @@ -1355,8 +1325,7 @@ "text": [ "float r1 = 1.0F/h_y;\n", "\n", - "/* Begin section0 */\n", - "START_TIMER(section0)\n", + "START(section0)\n", "#pragma omp parallel num_threads(nthreads)\n", "{\n", " #pragma omp for collapse(2) schedule(static,1)\n", @@ -1372,13 +1341,11 @@ " }\n", " }\n", "}\n", - "STOP_TIMER(section0,timers)\n", - "/* End section0 */\n", + "STOP(section0,timers)\n", "\n", "for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", "{\n", - " /* Begin section1 */\n", - " START_TIMER(section1)\n", + " START(section1)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " const int tid = omp_get_thread_num();\n", @@ -1411,8 +1378,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section1,timers)\n", - " /* End section1 */\n", + " STOP(section1,timers)\n", "}\n" ] } @@ -1476,8 +1442,8 @@ "output_type": "stream", "text": [ "#define _POSIX_C_SOURCE 200809L\n", - "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", - "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", + "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "\n", "#include \"stdlib.h\"\n", "#include \"math.h\"\n", @@ -1528,8 +1494,7 @@ "\n", " float r1 = 1.0F/h_y;\n", "\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " #pragma omp for collapse(2) schedule(static,1)\n", @@ -1545,13 +1510,11 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "\n", " for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", " {\n", - " /* Begin section1 */\n", - " START_TIMER(section1)\n", + " START(section1)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " const int tid = omp_get_thread_num();\n", @@ -1578,8 +1541,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section1,timers)\n", - " /* End section1 */\n", + " STOP(section1,timers)\n", " }\n", "\n", " #pragma omp parallel num_threads(nthreads)\n", @@ -1621,8 +1583,8 @@ "output_type": "stream", "text": [ "#define _POSIX_C_SOURCE 200809L\n", - "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", - "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", + "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "#define MIN(a,b) (((a) < (b)) ? (a) : (b))\n", "\n", "#include \"stdlib.h\"\n", @@ -1673,8 +1635,7 @@ " float r1 = 1.0F/h_x;\n", " float r2 = 1.0F/h_y;\n", "\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " #pragma omp for collapse(2) schedule(static,1)\n", @@ -1690,13 +1651,11 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", "\n", " for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", " {\n", - " /* Begin section1 */\n", - " START_TIMER(section1)\n", + " START(section1)\n", " #pragma omp parallel num_threads(nthreads)\n", " {\n", " #pragma omp for collapse(2) schedule(dynamic,1)\n", @@ -1740,8 +1699,7 @@ " }\n", " }\n", " }\n", - " STOP_TIMER(section1,timers)\n", - " /* End section1 */\n", + " STOP(section1,timers)\n", " }\n", "\n", " free(r0_vec);\n", diff --git a/examples/performance/01_gpu.ipynb b/examples/performance/01_gpu.ipynb index 1b4e29ddb9..6103a6659a 100644 --- a/examples/performance/01_gpu.ipynb +++ b/examples/performance/01_gpu.ipynb @@ -254,8 +254,8 @@ "text": [ "#define _POSIX_C_SOURCE 200809L\n", "#define uL0(time,x,y) u[(time)*x_stride0 + (x)*y_stride0 + (y)]\n", - "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", - "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", + "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "\n", "#include \"stdlib.h\"\n", "#include \"math.h\"\n", @@ -305,8 +305,7 @@ "\n", " for (int time = time_m; time <= time_M; time += 1)\n", " {\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " #pragma omp target teams distribute parallel for collapse(2)\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", @@ -316,8 +315,7 @@ " uL0(time + 1, x + 2, y + 2) = dt*(c*(r1*r3 + r1*uL0(time, x + 1, y + 2) + r1*uL0(time, x + 3, y + 2) + r2*r3 + r2*uL0(time, x + 2, y + 1) + r2*uL0(time, x + 2, y + 3)) + r0*uL0(time, x + 2, y + 2));\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", " }\n", "\n", " #pragma omp target update from(u[0:u_vec->size[0]*u_vec->size[1]*u_vec->size[2]])\n", diff --git a/examples/seismic/self_adjoint/sa_01_iso_implementation1.ipynb b/examples/seismic/self_adjoint/sa_01_iso_implementation1.ipynb index fb97791a9c..8a5f775500 100644 --- a/examples/seismic/self_adjoint/sa_01_iso_implementation1.ipynb +++ b/examples/seismic/self_adjoint/sa_01_iso_implementation1.ipynb @@ -349,14 +349,12 @@ " _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);\n", " struct timeval start_section0, end_section0;\n", " gettimeofday(&start_section0, NULL);\n", - " /* Begin section0 */\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", " double r0 = 1.0/h_x;\n", " f2[x + 8] = r0*(6.97544643e-4*(f1[x + 5] - f1[x + 12]) + 9.5703125e-3*(-f1[x + 6] + f1[x + 11]) + 7.97526042e-2*(f1[x + 7] - f1[x + 10]) + 1.19628906*(-f1[x + 8] + f1[x + 9]));\n", " g2[x + 8] = r0*(6.97544643e-4*(g1[x + 4] - g1[x + 11]) + 9.5703125e-3*(-g1[x + 5] + g1[x + 10]) + 7.97526042e-2*(g1[x + 6] - g1[x + 9]) + 1.19628906*(-g1[x + 7] + g1[x + 8]));\n", " }\n", - " /* End section0 */\n", " gettimeofday(&end_section0, NULL);\n", " timers->section0 += (double)(end_section0.tv_sec-start_section0.tv_sec)+(double)(end_section0.tv_usec-start_section0.tv_usec)/1000000;\n", " return 0;\n", @@ -1230,7 +1228,6 @@ " {\n", " struct timeval start_section0, end_section0;\n", " gettimeofday(&start_section0, NULL);\n", - " /* Begin section0 */\n", " for (int x = x_m - 4; x <= x_M + 3; x += 1)\n", " {\n", " #pragma omp simd aligned(u:32)\n", @@ -1250,12 +1247,10 @@ " u[t1][x + 8][z + 8] = r7*u[t2][x + 8][z + 8] + r8*u[t0][x + 8][z + 8] + (4.41*(m[x + 8][z + 8]*m[x + 8][z + 8])*(6.97544642889625e-5*(b[x + 4][z + 8]*r26[x][z + 4] + b[x + 8][z + 4]*r25[x + 4][z] - b[x + 8][z + 11]*r25[x + 4][z + 7] - b[x + 11][z + 8]*r26[x + 7][z + 4]) + 9.5703125007276e-4*(-b[x + 5][z + 8]*r26[x + 1][z + 4] - b[x + 8][z + 5]*r25[x + 4][z + 1] + b[x + 8][z + 10]*r25[x + 4][z + 6] + b[x + 10][z + 8]*r26[x + 6][z + 4]) + 7.97526041715173e-3*(b[x + 6][z + 8]*r26[x + 2][z + 4] + b[x + 8][z + 6]*r25[x + 4][z + 2] - b[x + 8][z + 9]*r25[x + 4][z + 5] - b[x + 9][z + 8]*r26[x + 5][z + 4]) + 1.1962890625e-1*(-b[x + 7][z + 8]*r26[x + 3][z + 4] - b[x + 8][z + 7]*r25[x + 4][z + 3] + b[x + 8][z + 8]*r25[x + 4][z + 4] + b[x + 8][z + 8]*r26[x + 4][z + 4])))/b[x + 8][z + 8];\n", " }\n", " }\n", - " /* End section0 */\n", " gettimeofday(&end_section0, NULL);\n", " timers->section0 += (double)(end_section0.tv_sec-start_section0.tv_sec)+(double)(end_section0.tv_usec-start_section0.tv_usec)/1000000;\n", " struct timeval start_section1, end_section1;\n", " gettimeofday(&start_section1, NULL);\n", - " /* Begin section1 */\n", " for (int p_src = p_src_m; p_src <= p_src_M; p_src += 1)\n", " {\n", " int ii_src_0 = (int)(floor(-1.0e-1*o_x + 1.0e-1*src_coords[p_src][0]));\n", @@ -1285,12 +1280,10 @@ " u[t1][ii_src_3 + 8][ii_src_2 + 8] += r3;\n", " }\n", " }\n", - " /* End section1 */\n", " gettimeofday(&end_section1, NULL);\n", " timers->section1 += (double)(end_section1.tv_sec-start_section1.tv_sec)+(double)(end_section1.tv_usec-start_section1.tv_usec)/1000000;\n", " struct timeval start_section2, end_section2;\n", " gettimeofday(&start_section2, NULL);\n", - " /* Begin section2 */\n", " for (int p_rec = p_rec_m; p_rec <= p_rec_M; p_rec += 1)\n", " {\n", " int ii_rec_0 = (int)(floor(-1.0e-1*o_x + 1.0e-1*rec_coords[p_rec][0]));\n", @@ -1318,7 +1311,6 @@ " }\n", " rec[time][p_rec] = sum;\n", " }\n", - " /* End section2 */\n", " gettimeofday(&end_section2, NULL);\n", " timers->section2 += (double)(end_section2.tv_sec-start_section2.tv_sec)+(double)(end_section2.tv_usec-start_section2.tv_usec)/1000000;\n", " }\n", diff --git a/examples/userapi/01_dsl.ipynb b/examples/userapi/01_dsl.ipynb index af7727af38..65182c0b74 100644 --- a/examples/userapi/01_dsl.ipynb +++ b/examples/userapi/01_dsl.ipynb @@ -657,8 +657,8 @@ "output_type": "stream", "text": [ "#define _POSIX_C_SOURCE 200809L\n", - "#define START_TIMER(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", - "#define STOP_TIMER(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", + "#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);\n", + "#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;\n", "\n", "#include \"stdlib.h\"\n", "#include \"math.h\"\n", @@ -688,8 +688,7 @@ "\n", " for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))\n", " {\n", - " /* Begin section0 */\n", - " START_TIMER(section0)\n", + " START(section0)\n", " for (int x = x_m; x <= x_M; x += 1)\n", " {\n", " for (int y = y_m; y <= y_M; y += 1)\n", @@ -697,8 +696,7 @@ " u[t1][x + 1][y + 1] = dt*(-(-u[t0][x][y + 1]/h_x + u[t0][x + 1][y + 1]/h_x) - (-u[t0][x + 1][y]/h_y + u[t0][x + 1][y + 1]/h_y) + u[t0][x + 1][y + 1]/dt);\n", " }\n", " }\n", - " STOP_TIMER(section0,timers)\n", - " /* End section0 */\n", + " STOP(section0,timers)\n", " }\n", "\n", " return 0;\n", diff --git a/examples/userapi/02_apply.ipynb b/examples/userapi/02_apply.ipynb index aa3a04d8d9..693c4eed08 100644 --- a/examples/userapi/02_apply.ipynb +++ b/examples/userapi/02_apply.ipynb @@ -479,9 +479,7 @@ "source": [ "A `PerformanceSummary` will contain as many entries as \"sections\" in the generated code. Currently, there is no way to automatically tie a compiler-generated section to the user-provided `Eq`s (in general, there can be more than one `Eq` in a section). The only option is to look at the generated code and search for bodies of code wrapped within C comments such as\n", "```\n", - "/* Begin section0 */\n", "\n", - "/* End section0 \\*/\"\n", "```\n", "For example" ] @@ -492,7 +490,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Uncomment me and search for /* Begin section0 */ ... /* End section0 */\n", + "# Uncomment me and search for START(section0) ... STOP(section0) */\n", "# print(op)" ] }, diff --git a/examples/userapi/05_conditional_dimension.ipynb b/examples/userapi/05_conditional_dimension.ipynb index dcc46fee7d..e95e5c1305 100644 --- a/examples/userapi/05_conditional_dimension.ipynb +++ b/examples/userapi/05_conditional_dimension.ipynb @@ -320,8 +320,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "/* Begin section0 */\n", - "START_TIMER(section0)\n", + "START(section0)\n", "for (int x = x_m; x <= x_M; x += 1)\n", "{\n", " #pragma omp simd aligned(f:32)\n", @@ -333,8 +332,7 @@ " }\n", " }\n", "}\n", - "STOP_TIMER(section0,timers)\n", - "/* End section0 */\n" + "STOP(section0,timers)\n", ] }, { @@ -398,8 +396,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "/* Begin section0 */\n", - "START_TIMER(section0)\n", + "START(section0)\n", "for (int x = x_m; x <= x_M; x += 1)\n", "{\n", " #pragma omp simd aligned(f,g:32)\n", @@ -411,8 +408,7 @@ " }\n", " }\n", "}\n", - "STOP_TIMER(section0,timers)\n", - "/* End section0 */\n" + "STOP(section0,timers)\n", ] }, { @@ -492,8 +488,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "/* Begin section0 */\n", - "START_TIMER(section0)\n", + "START(section0)\n", "for (int x = x_m; x <= x_M; x += 1)\n", "{\n", " for (int y = y_m; y <= y_M; y += 1)\n", @@ -504,8 +499,7 @@ " }\n", " }\n", "}\n", - "STOP_TIMER(section0,timers)\n", - "/* End section0 */\n" + "STOP(section0,timers)\n", ] }, { @@ -568,8 +562,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "/* Begin section0 */\n", - "START_TIMER(section0)\n", + "START(section0)\n", "for (int i = i_m; i <= i_M; i += 1)\n", "{\n", " if ((i)%(4) == 0)\n", @@ -577,8 +570,7 @@ " f[i / 4] = g[i];\n", " }\n", "}\n", - "STOP_TIMER(section0,timers)\n", - "/* End section0 */\n", + "STOP(section0,timers)\n", "\n", " Data in g \n", " [ 0. 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11. 12. 13. 14. 15.]\n", @@ -685,8 +677,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "/* Begin section0 */\n", - "START_TIMER(section0)\n", + "START(section0)\n", "for (int x = x_m; x <= x_M; x += 1)\n", "{\n", " for (int y = y_m; y <= y_M; y += 1)\n", @@ -698,8 +689,7 @@ " }\n", " }\n", "}\n", - "STOP_TIMER(section0,timers)\n", - "/* End section0 */\n" + "STOP(section0,timers)\n", ] }, { @@ -761,8 +751,7 @@ "text": [ "int k = -1;\n", "\n", - "/* Begin section0 */\n", - "START_TIMER(section0)\n", + "START(section0)\n", "for (int x = x_m; x <= x_M; x += 1)\n", "{\n", " for (int y = y_m; y <= y_M; y += 1)\n", @@ -775,8 +764,7 @@ " }\n", " }\n", "}\n", - "STOP_TIMER(section0,timers)\n", - "/* End section0 */\n" + "STOP(section0,timers)\n", ] } ], diff --git a/tests/test_linearize.py b/tests/test_linearize.py index efed9f79db..045cb1955a 100644 --- a/tests/test_linearize.py +++ b/tests/test_linearize.py @@ -170,7 +170,7 @@ def test_codegen_quality0(): # Only four access macros necessary, namely `uL0`, `bufL0`, `bufL1` # for the efunc args - # (the other three obviously are _POSIX_C_SOURCE, START_TIMER, STOP_TIMER) + # (the other three obviously are _POSIX_C_SOURCE, START, STOP) assert len(op._headers) == 6 @@ -192,7 +192,7 @@ def test_codegen_quality1(): assert all('const long' not in str(i) for i in exprs[-3:]) # Only two access macros necessary, namely `uL0` and `r1L0` (the other five - # obviously are _POSIX_C_SOURCE, MIN, MAX, START_TIMER, STOP_TIMER) + # obviously are _POSIX_C_SOURCE, MIN, MAX, START, STOP) assert len(op._headers) == 6 From a463e4e56500f58ff549a78ee340ba9ac0f2d605 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 8 Nov 2023 09:07:43 +0000 Subject: [PATCH 13/18] compiler: Extend Uxreplace --- devito/ir/iet/visitors.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/devito/ir/iet/visitors.py b/devito/ir/iet/visitors.py index e9749ad769..f37e53f09b 100644 --- a/devito/ir/iet/visitors.py +++ b/devito/ir/iet/visitors.py @@ -1184,13 +1184,19 @@ def visit_Conditional(self, o): condition = uxreplace(o.condition, self.mapper) then_body = self._visit(o.then_body) else_body = self._visit(o.else_body) - return o._rebuild(condition=condition, then_body=then_body, else_body=else_body) + return o._rebuild(condition=condition, then_body=then_body, + else_body=else_body) def visit_PointerCast(self, o): function = self.mapper.get(o.function, o.function) obj = self.mapper.get(o.obj, o.obj) return o._rebuild(function=function, obj=obj) + def visit_Dereference(self, o): + pointee = self.mapper.get(o.pointee, o.pointee) + pointer = self.mapper.get(o.pointer, o.pointer) + return o._rebuild(pointee=pointee, pointer=pointer) + def visit_Pragma(self, o): arguments = [uxreplace(i, self.mapper) for i in o.arguments] return o._rebuild(arguments=arguments) @@ -1207,6 +1213,11 @@ def visit_HaloSpot(self, o): body = self._visit(o.body) return o._rebuild(halo_scheme=halo_scheme, body=body) + def visit_While(self, o, **kwargs): + condition = uxreplace(o.condition, self.mapper) + body = self._visit(o.body) + return o._rebuild(condition=condition, body=body) + visit_ThreadedProdder = visit_Call def visit_KernelLaunch(self, o): From 1cc6a0a53410d1e4f2b928e4ff7499d92706038c Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Wed, 8 Nov 2023 09:08:27 +0000 Subject: [PATCH 14/18] compiler: Abstract semantically identical compounds --- devito/ir/iet/efunc.py | 20 ++-- devito/ir/iet/nodes.py | 2 +- devito/ir/iet/utils.py | 13 ++- devito/ir/iet/visitors.py | 29 ++++-- devito/mpi/routines.py | 4 +- devito/passes/iet/asynchrony.py | 84 ++++++++++------ devito/passes/iet/engine.py | 150 +++++++++++++++++++++++------ devito/passes/iet/orchestration.py | 6 +- devito/symbolics/extended_sympy.py | 9 +- devito/tools/utils.py | 33 ++++++- devito/types/basic.py | 12 ++- devito/types/object.py | 4 - devito/types/parallel.py | 12 +-- tests/test_gpu_common.py | 84 ++++++++-------- tests/test_pickle.py | 6 +- 15 files changed, 311 insertions(+), 157 deletions(-) diff --git a/devito/ir/iet/efunc.py b/devito/ir/iet/efunc.py index 48571701cd..7c8974b804 100644 --- a/devito/ir/iet/efunc.py +++ b/devito/ir/iet/efunc.py @@ -7,7 +7,7 @@ __all__ = ['ElementalFunction', 'ElementalCall', 'make_efunc', 'make_callable', 'EntryFunction', 'AsyncCallable', 'AsyncCall', 'ThreadCallable', - 'DeviceFunction', 'DeviceCall', 'KernelLaunch'] + 'DeviceFunction', 'DeviceCall', 'KernelLaunch', 'CommCallable'] # ElementalFunction machinery @@ -105,17 +105,7 @@ def make_callable(name, iet, retval='void', prefix='static'): """ Utility function to create a Callable from an IET. """ - parameters = derive_parameters(iet) - - # TODO: this should be done by `derive_parameters`, and perhaps better, e.g. - # ordering such that TimeFunctions go first, then Functions, etc. However, - # doing it would require updating a *massive* number of tests and notebooks, - # hence for now we limit it here - # NOTE: doing it not just for code aesthetics, but also so that semantically - # identical callables can be abstracted homogeneously irrespective of the - # object names, which dictate the ordering in the callable signature - parameters = sorted(parameters, key=lambda p: str(type(p))) - + parameters = derive_parameters(iet, ordering='canonical') return Callable(name, iet, retval, parameters=parameters, prefix=prefix) @@ -221,3 +211,9 @@ def functions(self): if self.stream is not None: launch_args += (self.stream.function,) return super().functions + launch_args + + +# Other relevant Callable subclasses + +class CommCallable(Callable): + pass diff --git a/devito/ir/iet/nodes.py b/devito/ir/iet/nodes.py index b9e5cfb0cb..7b242c6a97 100644 --- a/devito/ir/iet/nodes.py +++ b/devito/ir/iet/nodes.py @@ -722,7 +722,7 @@ def all_parameters(self): @property def functions(self): return tuple(i.function for i in self.all_parameters - if isinstance(i.function, AbstractFunction)) + if isinstance(i.function, (AbstractFunction, AbstractObject))) @property def defines(self): diff --git a/devito/ir/iet/utils.py b/devito/ir/iet/utils.py index 99662ce1da..a473668814 100644 --- a/devito/ir/iet/utils.py +++ b/devito/ir/iet/utils.py @@ -92,11 +92,13 @@ def filter_iterations(tree, key=lambda i: i): return filtered -def derive_parameters(iet, drop_locals=False): +def derive_parameters(iet, drop_locals=False, ordering='default'): """ Derive all input parameters (function call arguments) from an IET by collecting all symbols not defined in the tree itself. """ + assert ordering in ('default', 'canonical') + # Extract all candidate parameters candidates = FindSymbols().visit(iet) @@ -122,6 +124,15 @@ def derive_parameters(iet, drop_locals=False): if drop_locals: parameters = [p for p in parameters if not (p.is_ArrayBasic or p.is_LocalObject)] + # NOTE: This is requested by the caller when the parameters are used to + # construct Callables whose signature only depends on the object types, + # rather than on their name + # TODO: It should maybe be done systematically... but it's gonna change a huge + # amount of tests and examples; plus, it might break compatibility those + # using devito as a library-generator to be embedded within legacy codes + if ordering == 'canonical': + parameters = sorted(parameters, key=lambda p: str(type(p))) + return parameters diff --git a/devito/ir/iet/visitors.py b/devito/ir/iet/visitors.py index f37e53f09b..d98ea6100c 100644 --- a/devito/ir/iet/visitors.py +++ b/devito/ir/iet/visitors.py @@ -19,7 +19,8 @@ from devito.ir.support.space import Backward from devito.symbolics import ListInitializer, ccode, uxreplace from devito.tools import (GenericVisitor, as_tuple, ctypes_to_cstr, filter_ordered, - filter_sorted, flatten, is_external_ctype, c_restrict_void_p) + filter_sorted, flatten, is_external_ctype, + c_restrict_void_p, sorted_priority) from devito.types.basic import AbstractFunction, Basic from devito.types import (ArrayObject, CompositeObject, Dimension, Pointer, IndexedData, DeviceMap) @@ -224,7 +225,7 @@ def _gen_struct_decl(self, obj, masked=()): def _gen_value(self, obj, level=2, masked=()): qualifiers = [v for k, v in self._qualifiers_mapper.items() - if getattr(obj, k, False) and v not in masked] + if getattr(obj.function, k, False) and v not in masked] if (obj._mem_stack or obj._mem_constant) and level == 2: strtype = obj._C_typedata @@ -233,7 +234,8 @@ def _gen_value(self, obj, level=2, masked=()): strtype = ctypes_to_cstr(obj._C_ctype) strshape = '' if isinstance(obj, (AbstractFunction, IndexedData)) and level >= 1: - strtype = '%s%s' % (strtype, self._restrict_keyword) + if not obj._mem_stack: + strtype = '%s%s' % (strtype, self._restrict_keyword) strtype = ' '.join(qualifiers + [strtype]) strname = obj._C_name @@ -632,10 +634,10 @@ def visit_Operator(self, o, mode='all'): # Elemental functions esigns = [] efuncs = [blankline] - for i in o._func_table.values(): - if i.local: - esigns.append(self._gen_signature(i.root)) - efuncs.extend([self._visit(i.root), blankline]) + items = [i.root for i in o._func_table.values() if i.local] + for i in sorted_efuncs(items): + esigns.append(self._gen_signature(i)) + efuncs.extend([self._visit(i), blankline]) # Definitions headers = [c.Define(*i) for i in o._headers] + [blankline] @@ -1279,3 +1281,16 @@ def generate(self): if self.cast: tip = '(%s)%s' % (self.cast, tip) yield tip + + +def sorted_efuncs(efuncs): + from devito.ir.iet.efunc import (CommCallable, DeviceFunction, + ThreadCallable, ElementalFunction) + + priority = { + DeviceFunction: 3, + ThreadCallable: 2, + ElementalFunction: 1, + CommCallable: 1 + } + return sorted_priority(efuncs, priority) diff --git a/devito/mpi/routines.py b/devito/mpi/routines.py index 859de68ffb..dacf64a0f5 100644 --- a/devito/mpi/routines.py +++ b/devito/mpi/routines.py @@ -12,7 +12,7 @@ from devito.ir.iet import (Call, Callable, Conditional, ElementalFunction, Expression, ExpressionBundle, AugmentedExpression, Iteration, List, Prodder, Return, make_efunc, FindNodes, - Transformer, ElementalCall) + Transformer, ElementalCall, CommCallable) from devito.mpi import MPI from devito.symbolics import (Byref, CondNe, FieldFromPointer, FieldFromComposite, IndexedPointer, Macro, cast_mapper, subs_op_args) @@ -1015,7 +1015,7 @@ def _call_poke(self, poke): # Callable sub-hierarchy -class MPICallable(Callable): +class MPICallable(CommCallable): def __init__(self, name, body, parameters): super(MPICallable, self).__init__(name, body, 'void', parameters, ('static',)) diff --git a/devito/passes/iet/asynchrony.py b/devito/passes/iet/asynchrony.py index 7fb2abb6dc..01635945cc 100644 --- a/devito/passes/iet/asynchrony.py +++ b/devito/passes/iet/asynchrony.py @@ -4,9 +4,9 @@ import cgen as c from devito.ir import (AsyncCall, AsyncCallable, BlankLine, Call, Callable, - Conditional, Dereference, DummyExpr, FindNodes, FindSymbols, + Conditional, DummyExpr, FindNodes, FindSymbols, Iteration, List, PointerCast, Return, ThreadCallable, - Transformer, While, maybe_alias) + Transformer, While, make_callable, maybe_alias) from devito.passes.iet.definitions import DataManager from devito.passes.iet.engine import iet_pass from devito.symbolics import (CondEq, CondNe, FieldFromComposite, FieldFromPointer, @@ -60,26 +60,26 @@ def lower_async_callables(iet, root=None, sregistry=None): ncfields=ncfields, pname=sregistry.make_name(prefix='tsdata') ) - sbase = sdata.symbolic_base + sbase = sdata.indexed # Prepend the SharedData fields available upon thread activation - preactions = [DummyExpr(i, FieldFromPointer(i.name, sbase)) for i in ncfields] + preactions = [DummyExpr(i, FieldFromPointer(i.base, sbase)) for i in ncfields] preactions.append(BlankLine) # Append the flag reset postactions = [List(body=[ BlankLine, - DummyExpr(FieldFromPointer(sdata._field_flag, sbase), 1) + DummyExpr(FieldFromPointer(sdata.symbolic_flag, sbase), 1) ])] wrap = List(body=preactions + list(iet.body.body) + postactions) # The thread has work to do when it receives the signal that all locks have # been set to 0 by the main thread - wrap = Conditional(CondEq(FieldFromPointer(sdata._field_flag, sbase), 2), wrap) + wrap = Conditional(CondEq(FieldFromPointer(sdata.symbolic_flag, sbase), 2), wrap) # The thread keeps spinning until the alive flag is set to 0 by the main thread - wrap = While(CondNe(FieldFromPointer(sdata._field_flag, sbase), 0), wrap) + wrap = While(CondNe(FieldFromPointer(sdata.symbolic_flag, sbase), 0), wrap) # pthread functions expect exactly one argument of type void* tparameter = Pointer(name='_%s' % sdata.name) @@ -88,9 +88,11 @@ def lower_async_callables(iet, root=None, sregistry=None): unpacks = [PointerCast(sdata, tparameter), BlankLine] for i in cfields: if i.is_AbstractFunction: - unpacks.append(Dereference(i, sdata)) + unpacks.append( + DummyExpr(i._C_symbol, FieldFromPointer(i._C_symbol, sbase)) + ) else: - unpacks.append(DummyExpr(i, FieldFromPointer(i.name, sbase))) + unpacks.append(DummyExpr(i, FieldFromPointer(i.base, sbase))) body = iet.body._rebuild(body=[wrap, Return(Null)], unpacks=unpacks) iet = ThreadCallable(iet.name, body, tparameter) @@ -112,11 +114,20 @@ def lower_async_calls(iet, track=None, sregistry=None): assert n.name in track sdata = track[n.name] - sbase = sdata.symbolic_base + sbase = sdata.indexed name = sregistry.make_name(prefix='init_%s' % sdata.name) - body = [DummyExpr(FieldFromPointer(i._C_name, sbase), i._C_symbol) - for i in sdata.cfields] - body.extend([BlankLine, DummyExpr(FieldFromPointer(sdata._field_flag, sbase), 1)]) + body = [] + for i in sdata.cfields: + if i.is_AbstractFunction: + body.append( + DummyExpr(FieldFromPointer(i._C_symbol, sbase), i._C_symbol) + ) + else: + body.append(DummyExpr(FieldFromPointer(i.base, sbase), i.base)) + body.extend([ + BlankLine, + DummyExpr(FieldFromPointer(sdata.symbolic_flag, sbase), 1) + ]) parameters = sdata.cfields + (sdata,) efuncs[n.name] = Callable(name, body, 'void', parameters, 'static') @@ -135,7 +146,7 @@ def lower_async_calls(iet, track=None, sregistry=None): threads = PThreadArray(name=name, npthreads=sdata.npthreads) # Call to `sdata` initialization Callable - sbase = sdata.symbolic_base + sbase = sdata.indexed d = threads.index arguments = [] for a in n.arguments: @@ -152,7 +163,7 @@ def lower_async_calls(iet, track=None, sregistry=None): call0 = Call(efuncs[n.name].name, arguments) # Create pthreads - tbase = threads.symbolic_base + tbase = threads.indexed call1 = Call('pthread_create', ( tbase + d, Null, Call(n.name, [], is_indirect=True), sbase + d )) @@ -164,33 +175,34 @@ def lower_async_calls(iet, track=None, sregistry=None): else: callback = lambda body: Iteration(body, d, threads.size - 1) initialization.append(List( - header=c.Comment("Fire up and initialize `%s`" % threads.name), body=callback([call0, call1]) )) # Finalization - finalization.append(List( - header=c.Comment("Wait for completion of `%s`" % threads.name), - body=callback([ - While(CondEq(FieldFromComposite(sdata._field_flag, sdata[d]), 2)), - DummyExpr(FieldFromComposite(sdata._field_flag, sdata[d]), 0), - Call('pthread_join', (threads[d], Null)) - ]) - )) + name = sregistry.make_name(prefix='shutdown') + body = List(body=callback([ + While(CondEq(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 2)), + DummyExpr(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 0), + Call('pthread_join', (threads[d], Null)) + ])) + efunc = efuncs[name] = make_callable(name, body) + finalization.append(Call(name, efunc.parameters)) # Activation if threads.size == 1: d = threads.index - condition = CondNe(FieldFromComposite(sdata._field_flag, sdata[d]), 1) + condition = CondNe(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 1) activation = [While(condition)] else: d = Symbol(name=sregistry.make_name(prefix=threads.index.name)) - condition = CondNe(FieldFromComposite(sdata._field_flag, sdata[d]), 1) + condition = CondNe(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 1) activation = [DummyExpr(d, 0), While(condition, DummyExpr(d, (d + 1) % threads.size))] - activation.extend([DummyExpr(FieldFromComposite(i.name, sdata[d]), i) + activation.extend([DummyExpr(FieldFromComposite(i.base, sdata[d]), i) for i in sdata.ncfields]) - activation.append(DummyExpr(FieldFromComposite(sdata._field_flag, sdata[d]), 2)) + activation.append( + DummyExpr(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 2) + ) activation = List( header=[c.Line(), c.Comment("Activate `%s`" % threads.name)], body=activation, @@ -203,9 +215,19 @@ def lower_async_calls(iet, track=None, sregistry=None): iet = Transformer(mapper).visit(iet) # Inject initialization and finalization - initialization.append(BlankLine) - finalization.insert(0, BlankLine) - body = iet.body._rebuild(body=initialization + list(iet.body.body) + finalization) + initialization = List( + header=c.Comment("Fire up and initialize pthreads"), + body=initialization + [BlankLine] + ) + + finalization = List( + header=c.Comment("Wait for completion of pthreads"), + body=finalization + ) + + body = iet.body._rebuild( + body=[initialization] + list(iet.body.body) + [BlankLine, finalization] + ) iet = iet._rebuild(body=body) else: assert not initialization diff --git a/devito/passes/iet/engine.py b/devito/passes/iet/engine.py index 4c2ea1a2b7..f74dfec110 100644 --- a/devito/passes/iet/engine.py +++ b/devito/passes/iet/engine.py @@ -2,14 +2,16 @@ from functools import partial, singledispatch, wraps from devito.ir.iet import (Call, ExprStmt, FindNodes, FindSymbols, MetaCall, - Transformer, EntryFunction, ThreadCallable, - Uxreplace, derive_parameters) + Transformer, EntryFunction, ThreadCallable, Uxreplace, + derive_parameters) from devito.ir.support import SymbolRegistry from devito.mpi.distributed import MPINeighborhood from devito.passes import needs_transfer -from devito.tools import DAG, as_tuple, filter_ordered, timed_pass +from devito.symbolics import FieldFromComposite, FieldFromPointer +from devito.tools import (DAG, as_mapper, as_tuple, filter_ordered, + sorted_priority, timed_pass) from devito.types import (Array, Bundle, CompositeObject, Lock, IncrDimension, - Indirection, Temp) + Indirection, SharedData, ThreadArray, Temp) from devito.types.args import ArgProvider from devito.types.dense import DiscreteFunction from devito.types.dimension import AbstractIncrDimension, BlockDimension @@ -78,8 +80,9 @@ def apply(self, func, **kwargs): dag = create_call_graph(self.root.name, self.efuncs) # Apply `func` + efuncs = dict(self.efuncs) for i in dag.topological_sort(): - efunc, metadata = func(self.efuncs[i], **kwargs) + efunc, metadata = func(efuncs[i], **kwargs) self.includes.extend(as_tuple(metadata.get('includes'))) self.headers.extend(as_tuple(metadata.get('headers'))) @@ -94,19 +97,24 @@ def apply(self, func, **kwargs): except KeyError: pass - if efunc is self.efuncs[i]: + if efunc is efuncs[i]: continue - # Minimize code size by abstracting semantically identical efuncs - efuncs = metadata.get('efuncs', []) - efunc, efuncs = reuse_efuncs(efunc, efuncs, self.sregistry) + new_efuncs = metadata.get('efuncs', []) - self.efuncs[i] = efunc - self.efuncs.update(OrderedDict([(i.name, i) for i in efuncs])) + efuncs[i] = efunc + efuncs.update(OrderedDict([(i.name, i) for i in new_efuncs])) # Update the parameters / arguments lists since `func` may have # introduced or removed objects - self.efuncs = update_args(efunc, self.efuncs, dag) + efuncs = update_args(efunc, efuncs, dag) + + # Minimize code size + if len(efuncs) > len(self.efuncs): + efuncs = reuse_compounds(efuncs, self.sregistry) + efuncs = reuse_efuncs(self.root, efuncs, self.sregistry) + + self.efuncs = efuncs # Uniqueness self.includes = filter_ordered(self.includes) @@ -185,6 +193,82 @@ def create_call_graph(root, efuncs): return dag +def reuse_compounds(efuncs, sregistry=None): + """ + Generalise `efuncs` so that groups of semantically identical compound types + are replaced with a unique compound type, thus maximizing code reuse. + + For example, given two C structs originating from e.g. a CompositeObject + + struct foo {int a, char v} + struct bar {int g, char e} + + Reduce them to: + + struct foo {int a, char v} + + Which requires replacing all references to `bar` with the new `foo`. Note that + in this case the transformed `foo` is also syntactically identical to the + input `foo`, but this isn't necessarily the case. + """ + mapper = {} + for efunc in efuncs.values(): + local_sregistry = SymbolRegistry() + for i in FindSymbols().visit(efunc): + abstract_compound(i, mapper, local_sregistry) + + key = lambda i: mapper[i]._C_ctype + subs = {} + for v in as_mapper(mapper, key).values(): + if len(v) == 1: + continue + + # Recreate now using a globally unique type name + abstract_compound(v[0], subs, sregistry) + base = subs[v[0]] + + subs.update({i: base._rebuild(name=mapper[i].name) for i in v}) + + # Replace all occurrences in the form of FieldFrom{Composite,Pointer} + mapper = {} + for i0, i1 in subs.items(): + b0, b1 = i0.indexed, i1.indexed + + mapper.update({i0: i1, b0: b1}) + + for f0, f1 in zip(i0.fields, i1.fields): + for cls in (FieldFromComposite, FieldFromPointer): + if f0.is_AbstractFunction: + mapper[cls(f0._C_symbol, b0)] = cls(f1._C_symbol, b1) + else: + mapper[cls(f0.base, b0)] = cls(f1.base, b1) + + if mapper: + efuncs = {i: Uxreplace(mapper).visit(efunc) for i, efunc in efuncs.items()} + + return efuncs + + +@singledispatch +def abstract_compound(i, mapper, sregistry): + """ + Singledispatch-based implementation of type abstraction. + """ + return + + +@abstract_compound.register(SharedData) +def _(i, mapper, sregistry): + pname = sregistry.make_name(prefix="tsd") + + m = abstract_objects(i.fields) + cfields = [m.get(i, i) for i in i.cfields] + ncfields = [m.get(i, i) for i in i.ncfields] + + mapper[i] = i._rebuild(cfields=cfields, ncfields=ncfields, pname=pname, + function=None) + + def reuse_efuncs(root, efuncs, sregistry=None): """ Generalise `efuncs` so that syntactically identical Callables may be dropped, @@ -206,8 +290,6 @@ def reuse_efuncs(root, efuncs, sregistry=None): # assuming that `bar0` and `bar1` are compatible, we first process the # `bar`'s to obtain `[foo0(u(x)): bar0(u), foo1(u(x)): bar0(u)]`, # and finally `foo0(u(x)): bar0(u)` - efuncs = {i.name: i for i in efuncs} - efuncs[root.name] = root dag = create_call_graph(root.name, efuncs) mapper = {} @@ -236,11 +318,12 @@ def reuse_efuncs(root, efuncs, sregistry=None): afunc = afunc._rebuild(name=efunc.name) mapper[key] = (afunc, [efunc]) - root = efuncs.pop(root.name) - processed = [afunc if len(efuncs) > 1 else efuncs.pop() - for afunc, efuncs in mapper.values()] + processed = [afunc if len(v) > 1 else v.pop() for afunc, v in mapper.values()] + + retval = {root.name: efuncs[root.name]} + retval.update({i.name: i for i in processed}) - return root, processed + return retval def abstract_efunc(efunc): @@ -275,17 +358,7 @@ def abstract_objects(objects, sregistry=None): AbstractIncrDimension: 3, BlockDimension: 4, } - - def key(i): - for cls in sorted(priority, key=priority.get, reverse=True): - if isinstance(i, cls): - v = priority[cls] - break - else: - v = 0 - return (v, str(type(i))) - - objects = sorted(objects, key=key, reverse=True) + objects = sorted_priority(objects, priority) # Build abstraction mappings mapper = {} @@ -300,9 +373,6 @@ def key(i): def abstract_object(i, mapper, sregistry): """ Singledispatch-based implementation of object abstraction. - - Singledispatch allows foreign modules to specify their own rules for - object abstraction. """ return @@ -349,6 +419,22 @@ def _(i, mapper, sregistry): mapper[i] = v +@abstract_object.register(ThreadArray) +def _(i, mapper, sregistry): + if isinstance(i, SharedData): + name = sregistry.make_name(prefix='sd') + else: + name = sregistry.make_name(prefix='pta') + + v = i._rebuild(name=name) + + mapper.update({ + i: v, + i.indexed: v.indexed, + i._C_symbol: v._C_symbol, + }) + + @abstract_object.register(MPINeighborhood) def _(i, mapper, sregistry): mapper[i] = i._rebuild() diff --git a/devito/passes/iet/orchestration.py b/devito/passes/iet/orchestration.py index 54f66ec744..024417f2e6 100644 --- a/devito/passes/iet/orchestration.py +++ b/devito/passes/iet/orchestration.py @@ -64,7 +64,7 @@ def _make_withlock(self, iet, sync_ops, layer): # that we're happy for this Callable to be executed asynchronously name = self.sregistry.make_name(prefix=prefix) body = List(body=body) - parameters = derive_parameters(body) + parameters = derive_parameters(body, ordering='canonical') efunc = AsyncCallable(name, body, parameters=parameters) # The corresponding AsyncCall @@ -78,7 +78,7 @@ def _make_fetchupdate(self, iet, sync_ops, layer): # Turn init IET into a Callable name = self.sregistry.make_name(prefix=prefix) body = List(body=body) - parameters = derive_parameters(body) + parameters = derive_parameters(body, ordering='canonical') efunc = Callable(name, body, 'void', parameters, 'static') # Perform initial fetch by the main thread @@ -96,7 +96,7 @@ def _make_prefetchupdate(self, iet, sync_ops, layer): # that we're happy for this Callable to be executed asynchronously name = self.sregistry.make_name(prefix=prefix) body = List(body=body) - parameters = derive_parameters(body) + parameters = derive_parameters(body, ordering='canonical') efunc = AsyncCallable(name, body, parameters=parameters) # The corresponding AsyncCall diff --git a/devito/symbolics/extended_sympy.py b/devito/symbolics/extended_sympy.py index 8d9cbde35d..8616fd76d2 100644 --- a/devito/symbolics/extended_sympy.py +++ b/devito/symbolics/extended_sympy.py @@ -11,6 +11,7 @@ double2, double3, double4, int2, int3, int4) from devito.finite_differences.elementary import Min, Max from devito.types import Symbol +from devito.types.basic import Basic __all__ = ['CondEq', 'CondNe', 'IntDiv', 'CallFromPointer', # noqa 'CallFromComposite', 'FieldFromPointer', 'FieldFromComposite', @@ -165,11 +166,9 @@ def __new__(cls, call, pointer, params=None, **kwargs): pointer = Symbol(pointer) if isinstance(call, str): call = Symbol(call) - elif not isinstance(call, (CallFromPointer, DefFunction, sympy.Symbol)): - # NOTE: we need `sympy.Symbol`, rather than just (devito) `Symbol` - # because otherwise it breaks upon certain reconstructions on SymPy-1.8, - # due to the way `bound_symbols` and `canonical_variables` interact - raise ValueError("`call` must be CallFromPointer, DefFunction, or Symbol") + elif not isinstance(call, Basic): + raise ValueError("`call` must be a `devito.Basic` or a type " + "with compatible interface") _params = [] for p in as_tuple(params): if isinstance(p, str): diff --git a/devito/tools/utils.py b/devito/tools/utils.py index ff8b5608f2..e89a1aaa4e 100644 --- a/devito/tools/utils.py +++ b/devito/tools/utils.py @@ -13,7 +13,7 @@ 'roundm', 'powerset', 'invert', 'flatten', 'single_or', 'filter_ordered', 'as_mapper', 'filter_sorted', 'pprint', 'sweep', 'all_equal', 'as_list', 'indices_to_slices', 'indices_to_sections', 'transitive_closure', - 'humanbytes', 'contains_val'] + 'humanbytes', 'contains_val', 'sorted_priority'] # Some utils run faster with Python>=3.7 @@ -332,3 +332,34 @@ def humanbytes(B): return '%.1f GB' % round(B / GB, 1) elif TB <= B: return '%.2f TB' % round(B / TB, 1) + + +def sorted_priority(items, priority): + """ + Sort items based on their type priority. + + Rules: + + * Each type has an integer priority. + * Types with higher priority precede types with lower priority. + * Types with same priority are sorted based on the type name. + * Types with unknown priority are given 0-priority. + + Parameters + ---------- + items : iterable + The objects to be sorted. + priority : dict + A dictionary from types to integer values. + """ + + def key(i): + for cls in sorted(priority, key=priority.get, reverse=True): + if isinstance(i, cls): + v = priority[cls] + break + else: + v = 0 + return (v, str(type(i))) + + return sorted(items, key=key, reverse=True) diff --git a/devito/types/basic.py b/devito/types/basic.py index c5c0022b00..e1ce7fea30 100644 --- a/devito/types/basic.py +++ b/devito/types/basic.py @@ -270,6 +270,10 @@ class Basic(CodeSymbol): # Some other properties is_PerfKnob = False # Does it impact the Operator performance? + @property + def base(self): + return self + @property def bound_symbols(self): """ @@ -393,10 +397,6 @@ def ndim(self): def symbolic_shape(self): return () - @property - def base(self): - return self - @property def function(self): return self @@ -1006,6 +1006,10 @@ def dimensions(self): """Tuple of Dimensions representing the object indices.""" return self._dimensions + @property + def base(self): + return self.indexed + @property def _eval_deriv(self): return self diff --git a/devito/types/object.py b/devito/types/object.py index 1db973cce9..17c63c1b82 100644 --- a/devito/types/object.py +++ b/devito/types/object.py @@ -74,10 +74,6 @@ def _C_name(self): def _C_ctype(self): return self.dtype - @property - def base(self): - return self - @property def function(self): return self diff --git a/devito/types/parallel.py b/devito/types/parallel.py index 7a55125f57..0b7c3f9271 100644 --- a/devito/types/parallel.py +++ b/devito/types/parallel.py @@ -7,7 +7,7 @@ """ import os -from ctypes import c_void_p +from ctypes import POINTER, c_void_p from cached_property import cached_property import numpy as np @@ -21,8 +21,8 @@ from devito.types.misc import Fence, VolatileInt __all__ = ['NThreads', 'NThreadsNested', 'NThreadsNonaffine', 'NThreadsBase', - 'DeviceID', 'ThreadID', 'Lock', 'PThreadArray', 'SharedData', - 'NPThreads', 'DeviceRM', 'QueueID', 'Barrier', 'TBArray'] + 'DeviceID', 'ThreadID', 'Lock', 'ThreadArray', 'PThreadArray', + 'SharedData', 'NPThreads', 'DeviceRM', 'QueueID', 'Barrier', 'TBArray'] class NThreadsBase(Scalar): @@ -139,14 +139,10 @@ def index(self): else: return self.dim - @cached_property - def symbolic_base(self): - return Symbol(name=self.name, dtype=None) - class PThreadArray(ThreadArray): - dtype = type('pthread_t', (c_void_p,), {}) + dtype = POINTER(type('pthread_t', (c_void_p,), {})) @classmethod def __dtype_setup__(cls, **kwargs): diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index ad4f3f3b84..7297d87a2a 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -220,9 +220,9 @@ def test_tasking_fused(self): assert len(retrieve_iteration_tree(op)) == 3 locks = [i for i in FindSymbols().visit(op) if isinstance(i, Lock)] assert len(locks) == 1 # Only 1 because it's only `tmp` that needs protection - assert len(op._func_table) == 2 + assert len(op._func_table) == 3 exprs = FindNodes(Expression).visit(op._func_table['copy_to_host0'].root) - b = 13 if configuration['language'] == 'openacc' else 12 # No `qid` w/ OMP + b = 17 if configuration['language'] == 'openacc' else 16 # No `qid` w/ OMP assert str(exprs[b]) == 'const int deviceid = sdata->deviceid;' assert str(exprs[b+1]) == 'volatile int time = sdata->time;' assert str(exprs[b+2]) == 'lock0[0] = 1;' @@ -276,9 +276,9 @@ def test_tasking_unfused_two_locks(self): assert str(body.body[0].condition) == 'Ne(sdata1[0].flag, 1)' # Wait-thread assert str(body.body[1]) == 'sdata1[0].time = time;' assert str(body.body[2]) == 'sdata1[0].flag = 2;' - assert len(op._func_table) == 2 + assert len(op._func_table) == 3 exprs = FindNodes(Expression).visit(op._func_table['copy_to_host0'].root) - b = 15 if configuration['language'] == 'openacc' else 14 # No `qid` w/ OMP + b = 18 if configuration['language'] == 'openacc' else 17 # No `qid` w/ OMP assert str(exprs[b]) == 'lock0[0] = 1;' op.apply(time_M=nt-2) @@ -321,9 +321,9 @@ def test_tasking_forcefuse(self): assert str(body.body[0].condition) == 'Ne(sdata0[0].flag, 1)' # Wait-thread assert str(body.body[1]) == 'sdata0[0].time = time;' assert str(body.body[2]) == 'sdata0[0].flag = 2;' - assert len(op._func_table) == 2 + assert len(op._func_table) == 3 exprs = FindNodes(Expression).visit(op._func_table['copy_to_host0'].root) - b = 15 if configuration['language'] == 'openacc' else 14 # No `qid` w/ OMP + b = 21 if configuration['language'] == 'openacc' else 20 # No `qid` w/ OMP assert str(exprs[b]) == 'lock0[0] = 1;' assert str(exprs[b+1]) == 'lock1[0] = 1;' assert exprs[b+2].write is u @@ -376,7 +376,7 @@ def test_tasking_multi_output(self): op1 = Operator(eqns, opt=('tasking', 'orchestrate', {'linearize': False})) # Check generated code - assert len(retrieve_iteration_tree(op1)) == 4 + assert len(retrieve_iteration_tree(op1)) == 3 assert len([i for i in FindSymbols().visit(op1) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op1) assert len(sections) == 2 @@ -386,9 +386,9 @@ def test_tasking_multi_output(self): assert 'lock0[t' in str(sections[1].body[0].body[0].body[1 + i]) # Set-lock assert str(sections[1].body[0].body[0].body[4].body[-1]) ==\ 'sdata0[wi0].flag = 2;' - assert len(op1._func_table) == 2 + assert len(op1._func_table) == 3 exprs = FindNodes(Expression).visit(op1._func_table['copy_to_host0'].root) - b = 18 if configuration['language'] == 'openacc' else 17 # No `qid` w/ OMP + b = 21 if configuration['language'] == 'openacc' else 20 # No `qid` w/ OMP for i in range(3): assert 'lock0[t' in str(exprs[b + i]) assert exprs[b+3].write is usave @@ -413,7 +413,7 @@ def test_tasking_lock_placement(self): op = Operator(eqns, opt=('tasking', 'orchestrate')) # Check generated code -- the wait-lock is expected in section1 - assert len(retrieve_iteration_tree(op)) == 5 + assert len(retrieve_iteration_tree(op)) == 4 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op) assert len(sections) == 3 @@ -440,7 +440,7 @@ def test_streaming_basic(self, opt, ntmps): op = Operator(eqn, opt=opt) # Check generated code - assert len(op._func_table) == 6 + assert len(op._func_table) == 7 assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == ntmps op.apply(time_M=nt-2) @@ -448,11 +448,11 @@ def test_streaming_basic(self, opt, ntmps): assert np.all(u.data[0] == 28) assert np.all(u.data[1] == 36) - @pytest.mark.parametrize('opt,ntmps,nfuncs', [ - (('buffering', 'streaming', 'orchestrate'), 10, 6), - (('buffering', 'streaming', 'fuse', 'orchestrate', {'fuse-tasks': True}), 7, 6), + @pytest.mark.parametrize('opt,ntmps', [ + (('buffering', 'streaming', 'orchestrate'), 10), + (('buffering', 'streaming', 'fuse', 'orchestrate', {'fuse-tasks': True}), 7), ]) - def test_streaming_two_buffers(self, opt, ntmps, nfuncs): + def test_streaming_two_buffers(self, opt, ntmps): nt = 10 grid = Grid(shape=(4, 4)) @@ -469,7 +469,7 @@ def test_streaming_two_buffers(self, opt, ntmps, nfuncs): op = Operator(eqn, opt=opt) # Check generated code - assert len(op._func_table) == nfuncs + assert len(op._func_table) == 7 assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == ntmps op.apply(time_M=nt-2) @@ -605,7 +605,7 @@ def test_streaming_multi_input(self, opt, ntmps): op1 = Operator(eqn, opt=opt) # Check generated code - assert len(op1._func_table) == 6 + assert len(op1._func_table) == 7 assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == ntmps op0.apply(time_M=nt-2, dt=0.1) @@ -692,7 +692,7 @@ def test_streaming_postponed_deletion(self, opt, ntmps): op1 = Operator(eqns, opt=opt) # Check generated code - assert len(op1._func_table) == 6 + assert len(op1._func_table) == 7 assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == ntmps op0.apply(time_M=nt-1) @@ -758,12 +758,12 @@ def test_composite_buffering_tasking_multi_output(self): # Check generated code -- thanks to buffering only expect 1 lock! assert len(retrieve_iteration_tree(op0)) == 2 - assert len(retrieve_iteration_tree(op1)) == 8 - assert len(retrieve_iteration_tree(op2)) == 5 + assert len(retrieve_iteration_tree(op1)) == 6 + assert len(retrieve_iteration_tree(op2)) == 4 symbols = FindSymbols().visit(op1) assert len([i for i in symbols if isinstance(i, Lock)]) == 3 threads = [i for i in symbols if isinstance(i, PThreadArray)] - assert len(threads) == 2 + assert len(threads) == 3 assert threads[0].size.size == async_degree assert threads[1].size.size == async_degree symbols = FindSymbols().visit(op2) @@ -775,7 +775,7 @@ def test_composite_buffering_tasking_multi_output(self): # It is true that the usave and vsave eqns are separated in two different # loop nests, but they eventually get mapped to the same pair of efuncs, # since devito attempts to maximize code reuse - assert len(op1._func_table) == 5 + assert len(op1._func_table) == 6 # Check output op0.apply(time_M=nt-1) @@ -815,11 +815,10 @@ def test_composite_full_0(self): assert len(retrieve_iteration_tree(op0)) == 1 assert len(retrieve_iteration_tree(op1)) == 3 symbols = FindSymbols().visit(op1) - assert len([i for i in symbols if isinstance(i, Lock)]) == 2 + assert len([i for i in symbols if isinstance(i, Lock)]) == 3 threads = [i for i in symbols if isinstance(i, PThreadArray)] - assert len(threads) == 2 - assert threads[0].size == 1 - assert threads[1].size == 1 + assert len(threads) == 3 + assert all(i.size == 1 for i in threads) op0.apply(time_M=nt-1) op1.apply(time_M=nt-1, u=u1, usave=usave1) @@ -851,7 +850,7 @@ def test_composite_full_1(self, opt): op1 = Operator(eqns, opt=opt) # Check generated code - assert len(retrieve_iteration_tree(op1)) == 7 + assert len(retrieve_iteration_tree(op1)) == 5 assert len([i for i in FindSymbols().visit(op1) if isinstance(i, Lock)]) == 2 op0.apply(time_M=nt-2) @@ -879,7 +878,7 @@ def test_tasking_over_compiler_generated(self): # Check generated code for op in [op1, op2]: - assert len(retrieve_iteration_tree(op)) == 5 + assert len(retrieve_iteration_tree(op)) == 4 assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 sections = FindNodes(Section).visit(op) assert len(sections) == 4 @@ -944,7 +943,7 @@ def test_save_multi_output(self): # The `usave` and `vsave` eqns are in separate tasks, but the tasks # are identical, so they get mapped to the same efuncs (init + copy) # There also are two extra functions to allocate and free arrays - assert len(op._func_table) == 5 + assert len(op._func_table) == 6 op.apply(time_M=nt-1) @@ -1000,7 +999,7 @@ def test_save_w_nonaffine_time(self): # We just check the generated code here assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 - assert len(op._func_table) == 5 + assert len(op._func_table) == 6 def test_save_w_subdims(self): nt = 10 @@ -1057,7 +1056,7 @@ def test_streaming_w_shifting(self, opt, ntmps): op = Operator(eqns, opt=opt) # Check generated code - assert len(op._func_table) == 6 + assert len(op._func_table) == 7 assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == ntmps # From time_m=15 to time_M=35 with a factor=5 -- it means that, thanks @@ -1112,11 +1111,11 @@ def test_streaming_complete(self): {'fuse-tasks': True})) # Check generated code - assert len(op1._func_table) == 9 + assert len(op1._func_table) == 11 assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == 7 - assert len(op2._func_table) == 9 + assert len(op2._func_table) == 11 assert len([i for i in FindSymbols().visit(op2) if i.is_Array]) == 7 - assert len(op3._func_table) == 7 + assert len(op3._func_table) == 8 assert len([i for i in FindSymbols().visit(op3) if i.is_Array]) == 7 op0.apply(time_m=15, time_M=35, save_shift=0) @@ -1244,17 +1243,14 @@ def test_gpu_create_forward(self): op = Operator(eqn, opt=('buffering', 'streaming', 'orchestrate', {'gpu-create': u})) - # print(op) - # assert False - - # language = configuration['language'] - # if language == 'openacc': - # assert 'create(u' in str(op) - # elif language == 'openmp': - # assert 'map(alloc: u' in str(op) - # assert 'init0(u_vec' in str(op) + language = configuration['language'] + if language == 'openacc': + assert 'create(u' in str(op) + elif language == 'openmp': + assert 'map(alloc: u' in str(op) + assert 'init0' in str(op) - op.apply(time_M=nt - 2) + op.apply(time_M=nt-2) assert np.all(u.data[0] == 28) assert np.all(u.data[1] == 36) diff --git a/tests/test_pickle.py b/tests/test_pickle.py index 03188cd17a..7ee1adef8e 100644 --- a/tests/test_pickle.py +++ b/tests/test_pickle.py @@ -314,12 +314,14 @@ def test_shared_data(self, pickle): assert sdata.cfields == new_sdata.cfields assert sdata.ncfields == new_sdata.ncfields - ffp = FieldFromPointer(sdata._field_flag, sdata.symbolic_base) + ffp = FieldFromPointer(sdata.symbolic_flag, sdata.indexed) pkl_ffp = pickle.dumps(ffp) new_ffp = pickle.loads(pkl_ffp) - assert ffp == new_ffp + assert ffp.field == new_ffp.field + assert ffp.base.name == new_ffp.base.name + assert ffp.function.fields == new_ffp.function.fields indexed = sdata[0] From 720b50e8d1a89e8c3167a4e188fa9d7ee40cfb0b Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 10 Nov 2023 10:09:38 +0000 Subject: [PATCH 15/18] compiler: Abstract orchestration routines --- devito/passes/iet/asynchrony.py | 13 +++--- devito/passes/iet/orchestration.py | 14 +++--- tests/test_gpu_common.py | 71 +++++++++++++----------------- 3 files changed, 43 insertions(+), 55 deletions(-) diff --git a/devito/passes/iet/asynchrony.py b/devito/passes/iet/asynchrony.py index 01635945cc..29b9cce8a4 100644 --- a/devito/passes/iet/asynchrony.py +++ b/devito/passes/iet/asynchrony.py @@ -12,7 +12,7 @@ from devito.symbolics import (CondEq, CondNe, FieldFromComposite, FieldFromPointer, Null) from devito.tools import split -from devito.types import (Lock, Pointer, PThreadArray, QueueID, SharedData, Symbol, +from devito.types import (Lock, Pointer, PThreadArray, QueueID, SharedData, Temp, VolatileInt) __all__ = ['pthreadify'] @@ -194,7 +194,7 @@ def lower_async_calls(iet, track=None, sregistry=None): condition = CondNe(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 1) activation = [While(condition)] else: - d = Symbol(name=sregistry.make_name(prefix=threads.index.name)) + d = Temp(name=sregistry.make_name(prefix=threads.index.name)) condition = CondNe(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 1) activation = [DummyExpr(d, 0), While(condition, DummyExpr(d, (d + 1) % threads.size))] @@ -203,12 +203,9 @@ def lower_async_calls(iet, track=None, sregistry=None): activation.append( DummyExpr(FieldFromComposite(sdata.symbolic_flag, sdata[d]), 2) ) - activation = List( - header=[c.Line(), c.Comment("Activate `%s`" % threads.name)], - body=activation, - footer=c.Line() - ) - mapper[n] = activation + name = sregistry.make_name(prefix='activate') + efunc = efuncs[name] = make_callable(name, activation) + mapper[n] = Call(name, efunc.parameters) if mapper: # Inject activation diff --git a/devito/passes/iet/orchestration.py b/devito/passes/iet/orchestration.py index 024417f2e6..39fd286f1a 100644 --- a/devito/passes/iet/orchestration.py +++ b/devito/passes/iet/orchestration.py @@ -35,10 +35,9 @@ def __init__(self, sregistry): def _make_waitlock(self, iet, sync_ops, *args): waitloop = List( - header=c.Comment("Wait for `%s` to be copied to the host" % + header=c.Comment("Wait for `%s` to be transferred" % ",".join(s.target.name for s in sync_ops)), body=BusyWait(Or(*[CondEq(s.handle, 0) for s in sync_ops])), - footer=c.Line() ) iet = List(body=(waitloop,) + iet.body) @@ -50,12 +49,13 @@ def _make_releaselock(self, iet, sync_ops, *args): pre.append(BusyWait(Or(*[CondNe(s.handle, 2) for s in sync_ops]))) pre.extend(DummyExpr(s.handle, 0) for s in sync_ops) - iet = List( - header=c.Comment("Release lock(s) as soon as possible"), - body=pre + [iet] - ) + name = self.sregistry.make_name(prefix="release_lock") + parameters = derive_parameters(pre, ordering='canonical') + efunc = Callable(name, pre, 'void', parameters, 'static') - return iet, [] + iet = List(body=[Call(name, efunc.parameters)] + [iet]) + + return iet, [efunc] def _make_withlock(self, iet, sync_ops, layer): body, prefix = withlock(layer, iet, sync_ops, self.lang, self.sregistry) diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index 7297d87a2a..0389a31405 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -188,9 +188,10 @@ def test_tasking_in_isolation(self, opt): assert len(sections) == 3 assert str(sections[0].body[0].body[0].body[0].body[0]) == 'while(lock0[0] == 0);' body = sections[2].body[0].body[0] + body = op._func_table['release_lock0'].root.body assert str(body.body[0].condition) == 'Ne(lock0[0], 2)' assert str(body.body[1]) == 'lock0[0] = 0;' - body = body.body[2] + body = op._func_table['activate0'].root.body assert str(body.body[0].condition) == 'Ne(sdata0[0].flag, 1)' assert str(body.body[1]) == 'sdata0[0].time = time;' assert str(body.body[2]) == 'sdata0[0].flag = 2;' @@ -220,7 +221,7 @@ def test_tasking_fused(self): assert len(retrieve_iteration_tree(op)) == 3 locks = [i for i in FindSymbols().visit(op) if isinstance(i, Lock)] assert len(locks) == 1 # Only 1 because it's only `tmp` that needs protection - assert len(op._func_table) == 3 + assert len(op._func_table) == 5 exprs = FindNodes(Expression).visit(op._func_table['copy_to_host0'].root) b = 17 if configuration['language'] == 'openacc' else 16 # No `qid` w/ OMP assert str(exprs[b]) == 'const int deviceid = sdata->deviceid;' @@ -257,26 +258,15 @@ def test_tasking_unfused_two_locks(self): # Check generated code assert len(retrieve_iteration_tree(op)) == 3 - assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 3 + assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 4 sections = FindNodes(Section).visit(op) assert len(sections) == 4 assert (str(sections[1].body[0].body[0].body[0].body[0]) == 'while(lock0[0] == 0 || lock1[0] == 0);') # Wait-lock body = sections[2].body[0].body[0] - assert str(body.body[0].condition) == 'Ne(lock0[0], 2)' - assert str(body.body[1]) == 'lock0[0] = 0;' # Set-lock - body = body.body[2] - assert str(body.body[0].condition) == 'Ne(sdata0[0].flag, 1)' # Wait-thread - assert str(body.body[1]) == 'sdata0[0].time = time;' - assert str(body.body[2]) == 'sdata0[0].flag = 2;' - body = sections[3].body[0].body[0] - assert str(body.body[0].condition) == 'Ne(lock1[0], 2)' - assert str(body.body[1]) == 'lock1[0] = 0;' # Set-lock - body = body.body[2] - assert str(body.body[0].condition) == 'Ne(sdata1[0].flag, 1)' # Wait-thread - assert str(body.body[1]) == 'sdata1[0].time = time;' - assert str(body.body[2]) == 'sdata1[0].flag = 2;' - assert len(op._func_table) == 3 + assert str(body.body[0]) == 'release_lock0(lock0);' + assert str(body.body[1]) == 'activate0(time,sdata0);' + assert len(op._func_table) == 5 exprs = FindNodes(Expression).visit(op._func_table['copy_to_host0'].root) b = 18 if configuration['language'] == 'openacc' else 17 # No `qid` w/ OMP assert str(exprs[b]) == 'lock0[0] = 1;' @@ -313,15 +303,15 @@ def test_tasking_forcefuse(self): assert len(sections) == 3 assert (str(sections[1].body[0].body[0].body[0].body[0]) == 'while(lock0[0] == 0 || lock1[0] == 0);') # Wait-lock - body = sections[2].body[0].body[0] + body = op._func_table['release_lock0'].root.body assert str(body.body[0].condition) == 'Ne(lock0[0], 2) | Ne(lock1[0], 2)' assert str(body.body[1]) == 'lock0[0] = 0;' # Set-lock assert str(body.body[2]) == 'lock1[0] = 0;' # Set-lock - body = body.body[3] + body = op._func_table['activate0'].root.body assert str(body.body[0].condition) == 'Ne(sdata0[0].flag, 1)' # Wait-thread assert str(body.body[1]) == 'sdata0[0].time = time;' assert str(body.body[2]) == 'sdata0[0].flag = 2;' - assert len(op._func_table) == 3 + assert len(op._func_table) == 5 exprs = FindNodes(Expression).visit(op._func_table['copy_to_host0'].root) b = 21 if configuration['language'] == 'openacc' else 20 # No `qid` w/ OMP assert str(exprs[b]) == 'lock0[0] = 1;' @@ -382,11 +372,12 @@ def test_tasking_multi_output(self): assert len(sections) == 2 assert str(sections[0].body[0].body[0].body[0].body[0]) ==\ 'while(lock0[t2] == 0);' + body = op1._func_table['release_lock0'].root.body for i in range(3): - assert 'lock0[t' in str(sections[1].body[0].body[0].body[1 + i]) # Set-lock - assert str(sections[1].body[0].body[0].body[4].body[-1]) ==\ - 'sdata0[wi0].flag = 2;' - assert len(op1._func_table) == 3 + assert 'lock0[t' in str(body.body[1 + i]) # Set-lock + body = op1._func_table['activate0'].root.body + assert str(body.body[-1]) == 'sdata0[wi0].flag = 2;' + assert len(op1._func_table) == 5 exprs = FindNodes(Expression).visit(op1._func_table['copy_to_host0'].root) b = 21 if configuration['language'] == 'openacc' else 20 # No `qid` w/ OMP for i in range(3): @@ -440,7 +431,7 @@ def test_streaming_basic(self, opt, ntmps): op = Operator(eqn, opt=opt) # Check generated code - assert len(op._func_table) == 7 + assert len(op._func_table) == 9 assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == ntmps op.apply(time_M=nt-2) @@ -449,7 +440,7 @@ def test_streaming_basic(self, opt, ntmps): assert np.all(u.data[1] == 36) @pytest.mark.parametrize('opt,ntmps', [ - (('buffering', 'streaming', 'orchestrate'), 10), + (('buffering', 'streaming', 'orchestrate'), 11), (('buffering', 'streaming', 'fuse', 'orchestrate', {'fuse-tasks': True}), 7), ]) def test_streaming_two_buffers(self, opt, ntmps): @@ -469,7 +460,7 @@ def test_streaming_two_buffers(self, opt, ntmps): op = Operator(eqn, opt=opt) # Check generated code - assert len(op._func_table) == 7 + assert len(op._func_table) == 9 assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == ntmps op.apply(time_M=nt-2) @@ -605,7 +596,7 @@ def test_streaming_multi_input(self, opt, ntmps): op1 = Operator(eqn, opt=opt) # Check generated code - assert len(op1._func_table) == 7 + assert len(op1._func_table) == 9 assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == ntmps op0.apply(time_M=nt-2, dt=0.1) @@ -692,7 +683,7 @@ def test_streaming_postponed_deletion(self, opt, ntmps): op1 = Operator(eqns, opt=opt) # Check generated code - assert len(op1._func_table) == 7 + assert len(op1._func_table) == 9 assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == ntmps op0.apply(time_M=nt-1) @@ -761,7 +752,7 @@ def test_composite_buffering_tasking_multi_output(self): assert len(retrieve_iteration_tree(op1)) == 6 assert len(retrieve_iteration_tree(op2)) == 4 symbols = FindSymbols().visit(op1) - assert len([i for i in symbols if isinstance(i, Lock)]) == 3 + assert len([i for i in symbols if isinstance(i, Lock)]) == 4 threads = [i for i in symbols if isinstance(i, PThreadArray)] assert len(threads) == 3 assert threads[0].size.size == async_degree @@ -775,7 +766,7 @@ def test_composite_buffering_tasking_multi_output(self): # It is true that the usave and vsave eqns are separated in two different # loop nests, but they eventually get mapped to the same pair of efuncs, # since devito attempts to maximize code reuse - assert len(op1._func_table) == 6 + assert len(op1._func_table) == 8 # Check output op0.apply(time_M=nt-1) @@ -815,7 +806,7 @@ def test_composite_full_0(self): assert len(retrieve_iteration_tree(op0)) == 1 assert len(retrieve_iteration_tree(op1)) == 3 symbols = FindSymbols().visit(op1) - assert len([i for i in symbols if isinstance(i, Lock)]) == 3 + assert len([i for i in symbols if isinstance(i, Lock)]) == 4 threads = [i for i in symbols if isinstance(i, PThreadArray)] assert len(threads) == 3 assert all(i.size == 1 for i in threads) @@ -943,7 +934,7 @@ def test_save_multi_output(self): # The `usave` and `vsave` eqns are in separate tasks, but the tasks # are identical, so they get mapped to the same efuncs (init + copy) # There also are two extra functions to allocate and free arrays - assert len(op._func_table) == 6 + assert len(op._func_table) == 8 op.apply(time_M=nt-1) @@ -999,7 +990,7 @@ def test_save_w_nonaffine_time(self): # We just check the generated code here assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 - assert len(op._func_table) == 6 + assert len(op._func_table) == 8 def test_save_w_subdims(self): nt = 10 @@ -1056,7 +1047,7 @@ def test_streaming_w_shifting(self, opt, ntmps): op = Operator(eqns, opt=opt) # Check generated code - assert len(op._func_table) == 7 + assert len(op._func_table) == 9 assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == ntmps # From time_m=15 to time_M=35 with a factor=5 -- it means that, thanks @@ -1111,11 +1102,11 @@ def test_streaming_complete(self): {'fuse-tasks': True})) # Check generated code - assert len(op1._func_table) == 11 - assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == 7 - assert len(op2._func_table) == 11 - assert len([i for i in FindSymbols().visit(op2) if i.is_Array]) == 7 - assert len(op3._func_table) == 8 + assert len(op1._func_table) == 14 + assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == 8 + assert len(op2._func_table) == 14 + assert len([i for i in FindSymbols().visit(op2) if i.is_Array]) == 8 + assert len(op3._func_table) == 10 assert len([i for i in FindSymbols().visit(op3) if i.is_Array]) == 7 op0.apply(time_m=15, time_M=35, save_shift=0) From 4590a421850ca1f9e1433891da9f7f6e9bcdd00f Mon Sep 17 00:00:00 2001 From: FabioLuporini Date: Fri, 10 Nov 2023 12:42:57 +0100 Subject: [PATCH 16/18] examples: Patch notebook --- examples/userapi/05_conditional_dimension.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/userapi/05_conditional_dimension.ipynb b/examples/userapi/05_conditional_dimension.ipynb index e95e5c1305..aee1302afe 100644 --- a/examples/userapi/05_conditional_dimension.ipynb +++ b/examples/userapi/05_conditional_dimension.ipynb @@ -332,7 +332,7 @@ " }\n", " }\n", "}\n", - "STOP(section0,timers)\n", + "STOP(section0,timers)\n" ] }, { @@ -408,7 +408,7 @@ " }\n", " }\n", "}\n", - "STOP(section0,timers)\n", + "STOP(section0,timers)\n" ] }, { @@ -499,7 +499,7 @@ " }\n", " }\n", "}\n", - "STOP(section0,timers)\n", + "STOP(section0,timers)\n" ] }, { @@ -689,7 +689,7 @@ " }\n", " }\n", "}\n", - "STOP(section0,timers)\n", + "STOP(section0,timers)\n" ] }, { @@ -764,7 +764,7 @@ " }\n", " }\n", "}\n", - "STOP(section0,timers)\n", + "STOP(section0,timers)\n" ] } ], From 18fe96f3aaa0d371e73eeda382d5f6459424ec4b Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Fri, 10 Nov 2023 14:55:20 +0000 Subject: [PATCH 17/18] tests: Update expected output for omp offloading --- tests/test_gpu_common.py | 57 +++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py index 0389a31405..0a794effca 100644 --- a/tests/test_gpu_common.py +++ b/tests/test_gpu_common.py @@ -431,7 +431,10 @@ def test_streaming_basic(self, opt, ntmps): op = Operator(eqn, opt=opt) # Check generated code - assert len(op._func_table) == 9 + if configuration['language'] == 'openacc': + assert len(op._func_table) == 9 + else: + assert len(op._func_table) == 8 assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == ntmps op.apply(time_M=nt-2) @@ -460,8 +463,13 @@ def test_streaming_two_buffers(self, opt, ntmps): op = Operator(eqn, opt=opt) # Check generated code - assert len(op._func_table) == 9 - assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == ntmps + arrays = [i for i in FindSymbols().visit(op) if i.is_Array] + if configuration['language'] == 'openacc': + assert len(op._func_table) == 9 + assert len(arrays) == ntmps + else: + assert len(op._func_table) == 8 + assert len(arrays) == ntmps - 1 op.apply(time_M=nt-2) @@ -596,7 +604,10 @@ def test_streaming_multi_input(self, opt, ntmps): op1 = Operator(eqn, opt=opt) # Check generated code - assert len(op1._func_table) == 9 + if configuration['language'] == 'openacc': + assert len(op1._func_table) == 9 + else: + assert len(op1._func_table) == 8 assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == ntmps op0.apply(time_M=nt-2, dt=0.1) @@ -683,7 +694,10 @@ def test_streaming_postponed_deletion(self, opt, ntmps): op1 = Operator(eqns, opt=opt) # Check generated code - assert len(op1._func_table) == 9 + if configuration['language'] == 'openacc': + assert len(op1._func_table) == 9 + else: + assert len(op1._func_table) == 8 assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == ntmps op0.apply(time_M=nt-1) @@ -766,7 +780,10 @@ def test_composite_buffering_tasking_multi_output(self): # It is true that the usave and vsave eqns are separated in two different # loop nests, but they eventually get mapped to the same pair of efuncs, # since devito attempts to maximize code reuse - assert len(op1._func_table) == 8 + if configuration['language'] == 'openacc': + assert len(op1._func_table) == 8 + else: + assert len(op1._func_table) == 7 # Check output op0.apply(time_M=nt-1) @@ -934,7 +951,10 @@ def test_save_multi_output(self): # The `usave` and `vsave` eqns are in separate tasks, but the tasks # are identical, so they get mapped to the same efuncs (init + copy) # There also are two extra functions to allocate and free arrays - assert len(op._func_table) == 8 + if configuration['language'] == 'openacc': + assert len(op._func_table) == 8 + else: + assert len(op._func_table) == 7 op.apply(time_M=nt-1) @@ -990,7 +1010,10 @@ def test_save_w_nonaffine_time(self): # We just check the generated code here assert len([i for i in FindSymbols().visit(op) if isinstance(i, Lock)]) == 1 - assert len(op._func_table) == 8 + if configuration['language'] == 'openacc': + assert len(op._func_table) == 8 + else: + assert len(op._func_table) == 7 def test_save_w_subdims(self): nt = 10 @@ -1047,7 +1070,10 @@ def test_streaming_w_shifting(self, opt, ntmps): op = Operator(eqns, opt=opt) # Check generated code - assert len(op._func_table) == 9 + if configuration['language'] == 'openacc': + assert len(op._func_table) == 9 + else: + assert len(op._func_table) == 8 assert len([i for i in FindSymbols().visit(op) if i.is_Array]) == ntmps # From time_m=15 to time_M=35 with a factor=5 -- it means that, thanks @@ -1102,12 +1128,13 @@ def test_streaming_complete(self): {'fuse-tasks': True})) # Check generated code - assert len(op1._func_table) == 14 - assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == 8 - assert len(op2._func_table) == 14 - assert len([i for i in FindSymbols().visit(op2) if i.is_Array]) == 8 - assert len(op3._func_table) == 10 - assert len([i for i in FindSymbols().visit(op3) if i.is_Array]) == 7 + diff = int(configuration['language'] == 'openmp') + assert len(op1._func_table) == 14 - diff + assert len([i for i in FindSymbols().visit(op1) if i.is_Array]) == 8 - diff + assert len(op2._func_table) == 14 - diff + assert len([i for i in FindSymbols().visit(op2) if i.is_Array]) == 8 - diff + assert len(op3._func_table) == 10 - diff + assert len([i for i in FindSymbols().visit(op3) if i.is_Array]) == 7 - diff op0.apply(time_m=15, time_M=35, save_shift=0) op1.apply(time_m=15, time_M=35, save_shift=0, u=u1) From 4dfc2281d554a926c6f959c0b1daa915ffb37898 Mon Sep 17 00:00:00 2001 From: Fabio Luporini Date: Mon, 13 Nov 2023 15:00:36 +0000 Subject: [PATCH 18/18] tests: Remove obsolete test --- tests/test_gpu_openmp.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py index 29866508d8..ebda431a37 100644 --- a/tests/test_gpu_openmp.py +++ b/tests/test_gpu_openmp.py @@ -222,30 +222,6 @@ def test_array_rw(self): assert len(op.body.unmaps) == 3 assert all('r0' not in str(i) for i in op.body.unmaps) - def test_function_wo(self): - grid = Grid(shape=(3, 3, 3)) - i = Dimension(name='i') - - f = Function(name='f', shape=(1,), dimensions=(i,), grid=grid) - u = TimeFunction(name='u', grid=grid) - - eqns = [Eq(u.forward, u + 1), - Eq(f[0], u[0, 0, 0, 0])] - - op = Operator(eqns, opt='noop', language='openmp') - - assert len(op.body.maps) == 1 - assert op.body.maps[0].pragmas[0].value ==\ - ('omp target enter data map(to: u[0:u_vec->size[0]]' - '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]])') - assert len(op.body.unmaps) == 2 - assert op.body.unmaps[0].pragmas[0].value ==\ - ('omp target update from(u[0:u_vec->size[0]]' - '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]])') - assert op.body.unmaps[1].pragmas[0].value ==\ - ('omp target exit data map(release: u[0:u_vec->size[0]]' - '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]]) if(devicerm)') - def test_timeparallel_reduction(self): grid = Grid(shape=(3, 3, 3)) i = Dimension(name='i')