
compiler: Block reductions irrespective of par-tile #2309

Merged 1 commit on Feb 12, 2024
4 changes: 3 additions & 1 deletion devito/core/cpu.py
@@ -43,7 +43,9 @@ def _normalize_kwargs(cls, **kwargs):
o['blocklazy'] = oo.pop('blocklazy', not o['blockeager'])
o['blockrelax'] = oo.pop('blockrelax', cls.BLOCK_RELAX)
o['skewing'] = oo.pop('skewing', False)
-o['par-tile'] = ParTile(oo.pop('par-tile', False), default=16)
+o['par-tile'] = ParTile(oo.pop('par-tile', False), default=16,
+                        sparse=oo.pop('par-tile-sparse', None),
+                        reduce=oo.pop('par-tile-reduce', None))

# CIRE
o['min-storage'] = oo.pop('min-storage', False)
4 changes: 3 additions & 1 deletion devito/core/gpu.py
@@ -65,7 +65,9 @@ def _normalize_kwargs(cls, **kwargs):
o['cire-schedule'] = oo.pop('cire-schedule', cls.CIRE_SCHEDULE)

# GPU parallelism
-o['par-tile'] = ParTile(oo.pop('par-tile', False), default=(32, 4, 4))
+o['par-tile'] = ParTile(oo.pop('par-tile', False), default=(32, 4, 4),
+                        sparse=oo.pop('par-tile-sparse', None),
+                        reduce=oo.pop('par-tile-reduce', None))
o['par-collapse-ncores'] = 1 # Always collapse (meaningful if `par-tile=False`)
o['par-collapse-work'] = 1 # Always collapse (meaningful if `par-tile=False`)
o['par-chunk-nonaffine'] = oo.pop('par-chunk-nonaffine', cls.PAR_CHUNK_NONAFFINE)
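The two new knobs surface as user-level options alongside `par-tile`. A minimal usage sketch, where the grid, equation, and tile values are illustrative rather than taken from the PR:

```python
from devito import Grid, TimeFunction, Eq, Operator

grid = Grid(shape=(128, 128, 128))
u = TimeFunction(name='u', grid=grid, space_order=4)

# The extra keys are popped by _normalize_kwargs and forwarded to ParTile,
# as in the cpu.py/gpu.py hunks above.
op = Operator(Eq(u.forward, u.dx + 1),
              opt=('advanced', {'par-tile': (32, 4, 8),
                                'par-tile-sparse': (16, 4),
                                'par-tile-reduce': (32, 4, 4)}))
```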
4 changes: 3 additions & 1 deletion devito/core/operator.py
@@ -341,7 +341,7 @@ def __new__(cls, items, rule=None, tag=None):

class ParTile(UnboundedMultiTuple, OptOption):

-def __new__(cls, items, default=None):
+def __new__(cls, items, default=None, sparse=None, reduce=None):
if not items:
return UnboundedMultiTuple()
elif isinstance(items, bool):
@@ -397,5 +397,7 @@ def __new__(cls, items, default=None):

obj = super().__new__(cls, *items)
obj.default = as_tuple(default)
+obj.sparse = as_tuple(sparse)
+obj.reduce = as_tuple(reduce)

return obj
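Since `as_tuple(None)` yields an empty tuple, leaving the new options unset gives `ParTile` falsey `sparse`/`reduce` attributes, which is what lets `BlockSizeGenerator` below fall back to `par_tile[0]` or `par_tile.default`. A quick check, assuming `devito.tools.as_tuple` (which the blocking pass already imports):

```python
from devito.tools import as_tuple

assert as_tuple(None) == ()          # unset option -> falsey attribute
assert as_tuple(16) == (16,)         # scalar tile size -> 1-tuple
assert as_tuple((32, 4, 4)) == (32, 4, 4)
```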
44 changes: 37 additions & 7 deletions devito/passes/clusters/blocking.py
@@ -7,8 +7,8 @@
IntervalGroup, IterationSpace, Scope)
from devito.passes import is_on_device
from devito.symbolics import search, uxreplace, xreplace_indices
-from devito.tools import (UnboundedMultiTuple, as_tuple, filter_ordered,
-                          flatten, is_integer, prod)
+from devito.tools import (UnboundedMultiTuple, UnboundTuple, as_tuple,
+                          filter_ordered, flatten, is_integer, prod)
from devito.types import BlockDimension

__all__ = ['blocking']
@@ -433,12 +433,14 @@ class BlockSizeGenerator(object):
"""

def __init__(self, par_tile):
-self.umt = par_tile
self.tip = -1

-# This is for Clusters that need a small par-tile to avoid under-utilizing
-# computational resources (e.g., kernels running over iteration spaces that
-# are relatively small for the underlying architecture)
+# The default par-tile, as an UnboundedMultiTuple. It will be used
+# for most cases
+self.umt = par_tile
+
+# Special case 1: a smaller par-tile to avoid under-utilizing
+# computational resources when the iteration spaces are too small
if (len(par_tile) == 1 and
(len(par_tile[0]) < len(par_tile.default) or
prod(par_tile[0]) < prod(par_tile.default))):
@@ -447,14 +449,42 @@ def __init__(self, par_tile):
else:
self.umt_small = UnboundedMultiTuple(par_tile.default)

+# Special case 2: par-tiles for iteration spaces that must be fully
+# blocked for correctness
+if par_tile.sparse:
+    self.umt_sparse = UnboundTuple(*par_tile.sparse, 1)
+elif len(par_tile) == 1:
+    self.umt_sparse = UnboundTuple(*par_tile[0], 1)
+else:
+    self.umt_sparse = UnboundTuple(*par_tile.default, 1)
+
+if par_tile.reduce:
+    self.umt_reduce = UnboundTuple(*par_tile.reduce, 1)
+elif len(par_tile) == 1:
+    self.umt_reduce = UnboundTuple(*par_tile[0], 1)
+else:
+    self.umt_reduce = UnboundTuple(*par_tile.default, 1)
+
def next(self, prefix, d, clusters):
# If a whole new set of Dimensions, move the tip -- this means `clusters`
# at `d` represents a new loop nest or kernel
x = any(i.dim.is_Block for i in flatten(c.ispace for c in clusters))
if not x:
self.tip += 1

-# TODO: This is for now exceptionally rudimentary
+# Correctness -- enforce blocking where necessary.
+# See also issue #276:PRO
+if any(c.properties.is_parallel_atomic(d) for c in clusters):
+    if any(c.is_sparse for c in clusters):
+        if not x:
[Review thread on the added `if not x:` line]
Contributor: I don't think you need this; it's just a tuple, not a multi-tuple. Same below.
Contributor: Wait, no, you do need it to go back to the first element when there are multiple sparse/reduce clusters. Never mind.
Contributor (author): Correct.
+            self.umt_sparse.iter()
+        return self.umt_sparse.next()
+    else:
+        if not x:
+            self.umt_reduce.iter()
+        return self.umt_reduce.next()
+
+# Performance heuristics -- use a smaller par-tile
if all(c.properties.is_blockable_small(d) for c in clusters):
if not x:
self.umt_small.iter()
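In summary, `BlockSizeGenerator.next` now picks the tile source in order of correctness first (sparse clusters use `umt_sparse`, other parallel-atomic, i.e. reduction, clusters use `umt_reduce`), then performance (`umt_small` for small iteration spaces), with the default `umt` used otherwise. The trailing `1` appended when building `umt_sparse`/`umt_reduce` presumably ensures that any blocked dimension beyond the supplied tile gets a block size of 1. A rough stand-in for that indexing behaviour, not the real `devito.tools.UnboundTuple`, whose API may differ:

```python
class UnboundTupleSketch(tuple):
    """Toy illustration: indexing past the end keeps returning the last
    element instead of raising IndexError."""

    def __getitem__(self, i):
        if isinstance(i, int) and i >= len(self):
            i = len(self) - 1
        return tuple.__getitem__(self, i)


tile = UnboundTupleSketch((32, 4, 4, 1))   # e.g. UnboundTuple(*par_tile.default, 1)
assert tile[1] == 4
assert tile[3] == 1
assert tile[7] == 1   # every deeper dimension falls back to block size 1
```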