From 6df00ae94f88258772698a8260c3f8417dc36c4b Mon Sep 17 00:00:00 2001
From: Fabio Luporini
Date: Mon, 12 Feb 2024 09:55:50 +0000
Subject: [PATCH] compiler: Block reductions irrespective of par-tile

---
 devito/core/cpu.py                 |  4 ++-
 devito/core/gpu.py                 |  4 ++-
 devito/core/operator.py            |  4 ++-
 devito/passes/clusters/blocking.py | 44 +++++++++++++++++++++++++-----
 4 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/devito/core/cpu.py b/devito/core/cpu.py
index 956258a91e..561e00e5dd 100644
--- a/devito/core/cpu.py
+++ b/devito/core/cpu.py
@@ -43,7 +43,9 @@ def _normalize_kwargs(cls, **kwargs):
         o['blocklazy'] = oo.pop('blocklazy', not o['blockeager'])
         o['blockrelax'] = oo.pop('blockrelax', cls.BLOCK_RELAX)
         o['skewing'] = oo.pop('skewing', False)
-        o['par-tile'] = ParTile(oo.pop('par-tile', False), default=16)
+        o['par-tile'] = ParTile(oo.pop('par-tile', False), default=16,
+                                sparse=oo.pop('par-tile-sparse', None),
+                                reduce=oo.pop('par-tile-reduce', None))
 
         # CIRE
         o['min-storage'] = oo.pop('min-storage', False)
diff --git a/devito/core/gpu.py b/devito/core/gpu.py
index aaa498ae64..0d3b1969fa 100644
--- a/devito/core/gpu.py
+++ b/devito/core/gpu.py
@@ -65,7 +65,9 @@ def _normalize_kwargs(cls, **kwargs):
         o['cire-schedule'] = oo.pop('cire-schedule', cls.CIRE_SCHEDULE)
 
         # GPU parallelism
-        o['par-tile'] = ParTile(oo.pop('par-tile', False), default=(32, 4, 4))
+        o['par-tile'] = ParTile(oo.pop('par-tile', False), default=(32, 4, 4),
+                                sparse=oo.pop('par-tile-sparse', None),
+                                reduce=oo.pop('par-tile-reduce', None))
         o['par-collapse-ncores'] = 1  # Always collapse (meaningful if `par-tile=False`)
         o['par-collapse-work'] = 1  # Always collapse (meaningful if `par-tile=False`)
         o['par-chunk-nonaffine'] = oo.pop('par-chunk-nonaffine', cls.PAR_CHUNK_NONAFFINE)
diff --git a/devito/core/operator.py b/devito/core/operator.py
index 5d0fb2efd8..5d7f886a74 100644
--- a/devito/core/operator.py
+++ b/devito/core/operator.py
@@ -341,7 +341,7 @@ def __new__(cls, items, rule=None, tag=None):
 
 class ParTile(UnboundedMultiTuple, OptOption):
 
-    def __new__(cls, items, default=None):
+    def __new__(cls, items, default=None, sparse=None, reduce=None):
         if not items:
             return UnboundedMultiTuple()
         elif isinstance(items, bool):
@@ -397,5 +397,7 @@ def __new__(cls, items, default=None):
 
         obj = super().__new__(cls, *items)
         obj.default = as_tuple(default)
+        obj.sparse = as_tuple(sparse)
+        obj.reduce = as_tuple(reduce)
 
         return obj
diff --git a/devito/passes/clusters/blocking.py b/devito/passes/clusters/blocking.py
index 6dda3dfc0b..5ac3539c43 100644
--- a/devito/passes/clusters/blocking.py
+++ b/devito/passes/clusters/blocking.py
@@ -7,8 +7,8 @@
                            IntervalGroup, IterationSpace, Scope)
 from devito.passes import is_on_device
 from devito.symbolics import search, uxreplace, xreplace_indices
-from devito.tools import (UnboundedMultiTuple, as_tuple, filter_ordered,
-                          flatten, is_integer, prod)
+from devito.tools import (UnboundedMultiTuple, UnboundTuple, as_tuple,
+                          filter_ordered, flatten, is_integer, prod)
 from devito.types import BlockDimension
 
 __all__ = ['blocking']
@@ -433,12 +433,14 @@ class BlockSizeGenerator(object):
     """
 
     def __init__(self, par_tile):
-        self.umt = par_tile
         self.tip = -1
 
-        # This is for Clusters that need a small par-tile to avoid under-utilizing
-        # computational resources (e.g., kernels running over iteration spaces that
-        # are relatively small for the underlying architecture)
+        # The default par-tile, as an UnboundedMultiTuple. It will be used
+        # in most cases
+        self.umt = par_tile
+
+        # Special case 1: a smaller par-tile to avoid under-utilizing
+        # computational resources when the iteration spaces are too small
         if (len(par_tile) == 1 and
             (len(par_tile[0]) < len(par_tile.default) or
              prod(par_tile[0]) < prod(par_tile.default))):
@@ -447,6 +449,22 @@ def __init__(self, par_tile):
         else:
             self.umt_small = UnboundedMultiTuple(par_tile.default)
 
+        # Special case 2: par-tiles for iteration spaces that must be fully
+        # blocked for correctness
+        if par_tile.sparse:
+            self.umt_sparse = UnboundTuple(*par_tile.sparse, 1)
+        elif len(par_tile) == 1:
+            self.umt_sparse = UnboundTuple(*par_tile[0], 1)
+        else:
+            self.umt_sparse = UnboundTuple(*par_tile.default, 1)
+
+        if par_tile.reduce:
+            self.umt_reduce = UnboundTuple(*par_tile.reduce, 1)
+        elif len(par_tile) == 1:
+            self.umt_reduce = UnboundTuple(*par_tile[0], 1)
+        else:
+            self.umt_reduce = UnboundTuple(*par_tile.default, 1)
+
     def next(self, prefix, d, clusters):
         # If a whole new set of Dimensions, move the tip -- this means `clusters`
         # at `d` represents a new loop nest or kernel
@@ -454,7 +472,19 @@ def next(self, prefix, d, clusters):
         if not x:
             self.tip += 1
 
-        # TODO: This is for now exceptionally rudimentary
+        # Correctness -- enforce blocking where necessary.
+        # See also issue #276:PRO
+        if any(c.properties.is_parallel_atomic(d) for c in clusters):
+            if any(c.is_sparse for c in clusters):
+                if not x:
+                    self.umt_sparse.iter()
+                return self.umt_sparse.next()
+            else:
+                if not x:
+                    self.umt_reduce.iter()
+                return self.umt_reduce.next()
+
+        # Performance heuristics -- use a smaller par-tile
         if all(c.properties.is_blockable_small(d) for c in clusters):
             if not x:
                 self.umt_small.iter()
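
Usage note: the two new knobs surface through the `opt` options dict, next to
`par-tile` (see the `oo.pop('par-tile-sparse', ...)` and
`oo.pop('par-tile-reduce', ...)` calls above). Below is a minimal sketch of how
they might be passed to an Operator; the grid, equation, and tile shapes are
illustrative placeholders, not part of this patch:

    from devito import Grid, TimeFunction, Eq, Operator

    grid = Grid(shape=(128, 128, 128))
    u = TimeFunction(name='u', grid=grid, space_order=4)

    op = Operator(Eq(u.forward, u.dx + 1),
                  opt=('advanced', {'par-tile': (32, 4, 8),
                                    # Dedicated tile shapes for sparse and
                                    # reduction iteration spaces
                                    'par-tile-sparse': (16, 16),
                                    'par-tile-reduce': (32, 8)}))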
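
On the `UnboundTuple(*..., 1)` construction: the trailing `1` means that, once
the given tile shape is exhausted, any remaining blocked dimension gets a block
size of 1, because an UnboundTuple keeps yielding its last element
indefinitely. A rough sketch of the assumed semantics, inferred from this
patch's use of `iter()`/`next()` and not taken from the actual `devito.tools`
implementation:

    class UnboundTupleSketch:
        """Yields its items in order; once exhausted, repeats the last one."""

        def __init__(self, *items):
            self.items = items
            self.n = 0

        def iter(self):
            # Rewind, e.g. when blocking moves on to a new loop nest
            self.n = 0

        def next(self):
            # Clamp the index so the last item repeats forever
            item = self.items[min(self.n, len(self.items) - 1)]
            self.n += 1
            return item

    # UnboundTupleSketch(32, 4, 4, 1) yields 32, 4, 4, 1, 1, 1, ...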