Skip to content

Commit

Permalink
Merge pull request #2309 from devitocodes/patch-compr-partile
Browse files (browse the repository at this point in the history)
compiler: Block reductions irrespective of par-tile
  • Loading branch information
FabioLuporini authored Feb 12, 2024
2 parents 8c86bc2 + 6df00ae commit a3ea3c1
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 10 deletions.
4 changes: 3 additions & 1 deletion devito/core/cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ def _normalize_kwargs(cls, **kwargs):
o['blocklazy'] = oo.pop('blocklazy', not o['blockeager'])
o['blockrelax'] = oo.pop('blockrelax', cls.BLOCK_RELAX)
o['skewing'] = oo.pop('skewing', False)
o['par-tile'] = ParTile(oo.pop('par-tile', False), default=16)
o['par-tile'] = ParTile(oo.pop('par-tile', False), default=16,
sparse=oo.pop('par-tile-sparse', None),
reduce=oo.pop('par-tile-reduce', None))

# CIRE
o['min-storage'] = oo.pop('min-storage', False)
Expand Down
4 changes: 3 additions & 1 deletion devito/core/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ def _normalize_kwargs(cls, **kwargs):
o['cire-schedule'] = oo.pop('cire-schedule', cls.CIRE_SCHEDULE)

# GPU parallelism
o['par-tile'] = ParTile(oo.pop('par-tile', False), default=(32, 4, 4))
o['par-tile'] = ParTile(oo.pop('par-tile', False), default=(32, 4, 4),
sparse=oo.pop('par-tile-sparse', None),
reduce=oo.pop('par-tile-reduce', None))
o['par-collapse-ncores'] = 1 # Always collapse (meaningful if `par-tile=False`)
o['par-collapse-work'] = 1 # Always collapse (meaningful if `par-tile=False`)
o['par-chunk-nonaffine'] = oo.pop('par-chunk-nonaffine', cls.PAR_CHUNK_NONAFFINE)
Expand Down
4 changes: 3 additions & 1 deletion devito/core/operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ def __new__(cls, items, rule=None, tag=None):

class ParTile(UnboundedMultiTuple, OptOption):

def __new__(cls, items, default=None):
def __new__(cls, items, default=None, sparse=None, reduce=None):
if not items:
return UnboundedMultiTuple()
elif isinstance(items, bool):
Expand Down Expand Up @@ -397,5 +397,7 @@ def __new__(cls, items, default=None):

obj = super().__new__(cls, *items)
obj.default = as_tuple(default)
obj.sparse = as_tuple(sparse)
obj.reduce = as_tuple(reduce)

return obj
44 changes: 37 additions & 7 deletions devito/passes/clusters/blocking.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
IntervalGroup, IterationSpace, Scope)
from devito.passes import is_on_device
from devito.symbolics import search, uxreplace, xreplace_indices
from devito.tools import (UnboundedMultiTuple, as_tuple, filter_ordered,
flatten, is_integer, prod)
from devito.tools import (UnboundedMultiTuple, UnboundTuple, as_tuple,
filter_ordered, flatten, is_integer, prod)
from devito.types import BlockDimension

__all__ = ['blocking']
Expand Down Expand Up @@ -433,12 +433,14 @@ class BlockSizeGenerator(object):
"""

def __init__(self, par_tile):
self.umt = par_tile
self.tip = -1

# This is for Clusters that need a small par-tile to avoid under-utilizing
# computational resources (e.g., kernels running over iteration spaces that
# are relatively small for the underlying architecture)
# The default par-tile, as an UnboundedMultiTuple. It will be used
# for most cases
self.umt = par_tile

# Special case 1: a smaller par-tile to avoid under-utilizing
# computational resources when the iteration spaces are too small
if (len(par_tile) == 1 and
(len(par_tile[0]) < len(par_tile.default) or
prod(par_tile[0]) < prod(par_tile.default))):
Expand All @@ -447,14 +449,42 @@ def __init__(self, par_tile):
else:
self.umt_small = UnboundedMultiTuple(par_tile.default)

# Special case 2: par-tiles for iteration spaces that must be fully
# blocked for correctness
if par_tile.sparse:
self.umt_sparse = UnboundTuple(*par_tile.sparse, 1)
elif len(par_tile) == 1:
self.umt_sparse = UnboundTuple(*par_tile[0], 1)
else:
self.umt_sparse = UnboundTuple(*par_tile.default, 1)

if par_tile.reduce:
self.umt_reduce = UnboundTuple(*par_tile.reduce, 1)
elif len(par_tile) == 1:
self.umt_reduce = UnboundTuple(*par_tile[0], 1)
else:
self.umt_reduce = UnboundTuple(*par_tile.default, 1)

def next(self, prefix, d, clusters):
# If a whole new set of Dimensions, move the tip -- this means `clusters`
# at `d` represents a new loop nest or kernel
x = any(i.dim.is_Block for i in flatten(c.ispace for c in clusters))
if not x:
self.tip += 1

# TODO: This is for now exceptionally rudimentary
# Correctness -- enforce blocking where necessary.
# See also issue #276:PRO
if any(c.properties.is_parallel_atomic(d) for c in clusters):
if any(c.is_sparse for c in clusters):
if not x:
self.umt_sparse.iter()
return self.umt_sparse.next()
else:
if not x:
self.umt_reduce.iter()
return self.umt_reduce.next()

# Performance heuristics -- use a smaller par-tile
if all(c.properties.is_blockable_small(d) for c in clusters):
if not x:
self.umt_small.iter()
Expand Down

0 comments on commit a3ea3c1

Please sign in to comment.