Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

compiler: Add skewing pass towards Temporal Blocking #1620

Merged
merged 17 commits into from
Apr 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions devito/core/cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from devito.core.operator import CoreOperator, CustomOperator
from devito.exceptions import InvalidOperator
from devito.passes.equations import buffering, collect_derivatives
from devito.passes.clusters import (Blocking, Lift, cire, cse, eliminate_arrays,
from devito.passes.clusters import (Lift, blocking, cire, cse, eliminate_arrays,
extract_increments, factorize, fuse, optimize_pows)
from devito.passes.iet import (CTarget, OmpTarget, avoid_denormals, mpiize,
optimize_halospots, hoist_prodders, relax_incr_dimensions)
Expand Down Expand Up @@ -82,6 +82,7 @@ def _normalize_kwargs(cls, **kwargs):
# Blocking
o['blockinner'] = oo.pop('blockinner', False)
o['blocklevels'] = oo.pop('blocklevels', cls.BLOCK_LEVELS)
o['skewing'] = oo.pop('skewing', False)
georgebisbas marked this conversation as resolved.
Show resolved Hide resolved

# CIRE
o['min-storage'] = oo.pop('min-storage', False)
Expand Down Expand Up @@ -170,7 +171,7 @@ def _specialize_clusters(cls, clusters, **kwargs):
clusters = Lift().process(clusters)

# Blocking to improve data locality
clusters = Blocking(options).process(clusters)
clusters = blocking(clusters, options)

# Reduce flops (potential arithmetic alterations)
clusters = extract_increments(clusters, sregistry)
Expand Down Expand Up @@ -268,7 +269,7 @@ def _specialize_clusters(cls, clusters, **kwargs):
clusters = cse(clusters, sregistry)

# Blocking to improve data locality
clusters = Blocking(options).process(clusters)
clusters = blocking(clusters, options)

return clusters

Expand Down Expand Up @@ -306,7 +307,7 @@ def _make_clusters_passes_mapper(cls, **kwargs):
sregistry = kwargs['sregistry']

return {
'blocking': Blocking(options).process,
'blocking': lambda i: blocking(i, options),
georgebisbas marked this conversation as resolved.
Show resolved Hide resolved
'factorize': factorize,
'fuse': fuse,
'lift': lambda i: Lift().process(cire(i, 'invariants', sregistry,
Expand Down
4 changes: 2 additions & 2 deletions devito/core/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from devito.core.operator import CoreOperator, CustomOperator
from devito.exceptions import InvalidOperator
from devito.passes.equations import collect_derivatives, buffering
from devito.passes.clusters import (Blocking, Lift, Streaming, Tasker, cire, cse,
from devito.passes.clusters import (Lift, Streaming, Tasker, blocking, cire, cse,
eliminate_arrays, extract_increments, factorize,
fuse, optimize_pows)
from devito.passes.iet import (DeviceOmpTarget, DeviceAccTarget, optimize_halospots,
Expand Down Expand Up @@ -240,7 +240,7 @@ def _make_clusters_passes_mapper(cls, **kwargs):
runs_on_host, reads_if_on_host = make_callbacks(options)

return {
'blocking': Blocking(options).process,
'blocking': lambda i: blocking(i, options),
georgebisbas marked this conversation as resolved.
Show resolved Hide resolved
'tasking': Tasker(runs_on_host).process,
'streaming': Streaming(reads_if_on_host).process,
'factorize': factorize,
Expand Down
21 changes: 16 additions & 5 deletions devito/ir/clusters/analysis.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from devito.ir.clusters.queue import QueueStateful
from devito.ir.support import (SEQUENTIAL, PARALLEL, PARALLEL_INDEP, PARALLEL_IF_ATOMIC,
AFFINE, ROUNDABLE, TILABLE, Forward)
from devito.ir.support import (AFFINE, PARALLEL, PARALLEL_INDEP, PARALLEL_IF_ATOMIC,
ROUNDABLE, SEQUENTIAL, SKEWABLE, TILABLE, Forward)
from devito.tools import as_tuple, flatten, timed_pass

__all__ = ['analyze']
Expand All @@ -14,6 +14,7 @@ def analyze(clusters):
clusters = Parallelism(state).process(clusters)
clusters = Affiness(state).process(clusters)
clusters = Tiling(state).process(clusters)
clusters = Skewing(state).process(clusters)
clusters = Rounding(state).process(clusters)

# Reconstruct Clusters attaching the discovered properties
Expand Down Expand Up @@ -164,9 +165,6 @@ class Tiling(Detector):
Detect the TILABLE Dimensions.
"""

def process(self, elements):
return self._process_fdta(elements, 1)

def _callback(self, clusters, d, prefix):
# A Dimension is TILABLE only if it's PARALLEL and AFFINE
properties = self._fetch_properties(clusters, prefix)
Expand All @@ -192,3 +190,16 @@ def _callback(self, clusters, d, prefix):
return

return TILABLE


class Skewing(Detector):
georgebisbas marked this conversation as resolved.
Show resolved Hide resolved

"""
Detect the SKEWABLE Dimensions.
"""

def _callback(self, clusters, d, prefix):
    """
    Decide whether the Dimension ``d`` gets the SKEWABLE property.

    Returns SKEWABLE if ``d`` is already TILABLE according to the properties
    fetched for ``clusters`` under ``prefix``; returns None otherwise.
    """
    discovered = self._fetch_properties(clusters, prefix)

    # Only Dimensions already marked TILABLE are skewing candidates
    if not ({TILABLE} <= discovered[d]):
        return None

    return SKEWABLE
2 changes: 1 addition & 1 deletion devito/ir/clusters/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ class ClusterGroup(tuple):

Parameters
----------
clusters : list of Clusters
clusters : tuple of Clusters
Input elements.
itintervals : tuple of IterationIntervals, optional
The region of iteration space shared by the ``clusters``.
Expand Down
3 changes: 3 additions & 0 deletions devito/ir/support/properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ def __init__(self, name, val=None):
TILABLE = Property('tilable')
"""A fully parallel Dimension that would benefit from tiling (or "blocking")."""

SKEWABLE = Property('skewable')
"""A fully parallel Dimension that would benefit from wavefront/skewed tiling."""

ROUNDABLE = Property('roundable')
"""
A Dimension whose upper limit may be rounded up to a multiple of the SIMD
Expand Down
165 changes: 139 additions & 26 deletions devito/passes/clusters/blocking.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,45 @@
from collections import Counter

from devito.ir.clusters import Queue
from devito.ir.support import TILABLE, IntervalGroup, IterationSpace
from devito.ir.support import (SEQUENTIAL, SKEWABLE, TILABLE, Interval, IntervalGroup,
IterationSpace)
from devito.symbolics import uxreplace
from devito.tools import timed_pass
from devito.types import IncrDimension

__all__ = ['Blocking']
from devito.symbolics import xreplace_indices

__all__ = ['blocking']


def blocking(clusters, options):
"""
Loop blocking to improve data locality.

Parameters
----------
clusters : tuple of Clusters
Input Clusters, subject of the optimization pass.
options : dict
The optimization options.
* `blockinner` (boolean, False): enable/disable loop blocking along the
innermost loop.
* `blocklevels` (int, 1): 1 => classic loop blocking; 2 for two-level
hierarchical blocking.
* `skewing` (boolean, False): enable/disable loop skewing.

georgebisbas marked this conversation as resolved.
Show resolved Hide resolved
Notes
-----
In case of skewing, if 'blockinner' is enabled, the innermost loop is also skewed.
"""
processed = preprocess(clusters, options)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this was pointed out a few times, "processed" is only at the end, when it needs no more processing... anyway, nitpicking


if options['blocklevels'] > 0:
processed = Blocking(options).process(processed)

if options['skewing']:
processed = Skewing(options).process(processed)

return processed


class Blocking(Queue):
Expand All @@ -24,29 +57,6 @@ def __init__(self, options):
def _make_key_hook(self, cluster, level):
return (tuple(cluster.guards.get(i.dim) for i in cluster.itintervals[:level]),)

@timed_pass(name='blocking')
def process(self, clusters):
# Preprocess: heuristic: drop TILABLE from innermost Dimensions to
# maximize vectorization
processed = []
for c in clusters:
ntilable = len([i for i in c.properties.values() if TILABLE in i])
ntilable -= int(not self.inner)
if ntilable <= 1:
properties = {k: v - {TILABLE} for k, v in c.properties.items()}
processed.append(c.rebuild(properties=properties))
elif not self.inner:
d = c.itintervals[-1].dim
properties = dict(c.properties)
properties[d] = properties[d] - {TILABLE}
processed.append(c.rebuild(properties=properties))
else:
processed.append(c)

processed = super(Blocking, self).process(processed)

return processed

def _process_fdta(self, clusters, level, prefix=None):
# Truncate recursion in case of TILABLE, non-perfect sub-nests, as
# it's an unsupported case
Expand Down Expand Up @@ -88,9 +98,14 @@ def callback(self, clusters, prefix):
exprs = [uxreplace(e, {d: bd}) for e in c.exprs]

# The new Cluster properties
# TILABLE property is dropped after the blocking.
Copy link
Contributor

@FabioLuporini FabioLuporini Apr 7, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

# comments don't take full stop at end

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

after the blocking -> after blocking

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sounds like a useless comment though

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

# SKEWABLE is dropped as well, but only from the new
# block dimensions.
properties = dict(c.properties)
properties.pop(d)
properties.update({bd: c.properties[d] - {TILABLE} for bd in block_dims})
properties.update({bd: c.properties[d] - {SKEWABLE}
georgebisbas marked this conversation as resolved.
Show resolved Hide resolved
for bd in block_dims[:-1]})

processed.append(c.rebuild(exprs=exprs, ispace=ispace,
FabioLuporini marked this conversation as resolved.
Show resolved Hide resolved
properties=properties))
Expand All @@ -103,6 +118,28 @@ def callback(self, clusters, prefix):
return processed


def preprocess(clusters, options):
# Preprocess: heuristic: drop TILABLE from innermost Dimensions to
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this could maybe be turned into a docstring now that preprocess has its own function

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

# maximize vectorization
inner = bool(options['blockinner'])
processed = []
for c in clusters:
ntilable = len([i for i in c.properties.values() if TILABLE in i])
ntilable -= int(not inner)
if ntilable <= 1:
properties = {k: v - {TILABLE} for k, v in c.properties.items()}
processed.append(c.rebuild(properties=properties))
elif not inner:
d = c.itintervals[-1].dim
properties = dict(c.properties)
properties[d] = properties[d] - {TILABLE}
processed.append(c.rebuild(properties=properties))
else:
processed.append(c)

return processed


def decompose(ispace, d, block_dims):
"""
Create a new IterationSpace in which the `d` Interval is decomposed
Expand Down Expand Up @@ -161,3 +198,79 @@ def decompose(ispace, d, block_dims):
directions.update({bd: ispace.directions[d] for bd in block_dims})

return IterationSpace(intervals, sub_iterators, directions)


class Skewing(Queue):

"""
Construct a new sequence of clusters with skewed expressions and iteration spaces.

Notes
-----
This transformation applies loop skewing to derive the
wavefront method of execution of nested loops. Loop skewing is
a simple transformation of loop bounds and is combined with loop
georgebisbas marked this conversation as resolved.
Show resolved Hide resolved
interchanging to generate the wavefront [1]_.
georgebisbas marked this conversation as resolved.
Show resolved Hide resolved

.. [1] Wolfe, Michael. "Loops skewing: The wavefront method revisited."
International Journal of Parallel Programming 15.4 (1986): 279-293.

Examples:

.. code-block:: python

for i = 2, n-1
for j = 2, m-1
a[i,j] = (a[i-1,j] + a[i,j-1] + a[i+1,j] + a[i,j+1]) / 4

to

.. code-block:: python

for i = 2, n-1
for j = 2+i, m-1+i
a[i,j-i] = (a[i-1,j-i] + a[i,j-1-i] + a[i+1,j-i] + a[i,j+1-i]) / 4

"""

def __init__(self, options):
    """
    Parameters
    ----------
    options : dict
        The optimization options. Only `blockinner` is read here; its
        truthiness decides whether the innermost loop is skewed too.
    """
    # There is no dedicated option: `blockinner` doubles as the switch
    # for skewing the innermost loop
    self.skewinner = True if options['blockinner'] else False

    super().__init__()

def callback(self, clusters, prefix):
    """
    Skew the Dimension at the tail of ``prefix`` against the SEQUENTIAL one.

    Parameters
    ----------
    clusters : iterable of Clusters
        The Clusters to be processed.
    prefix : sequence
        Items exposing a ``.dim`` attribute; the last one provides the
        Dimension ``d`` that is a candidate for skewing.

    Returns
    -------
    The input ``clusters``, untouched, as soon as one of them cannot be
    skewed along ``d``; otherwise a list of rebuilt Clusters whose
    expressions and iteration space are skewed.

    Notes
    -----
    The transformation is all-or-nothing: a single non-skewable Cluster
    makes the whole input be returned unchanged.
    """
    if not prefix:
        return clusters

    d = prefix[-1].dim

    processed = []
    for c in clusters:
        if SKEWABLE not in c.properties[d]:
            return clusters

        # The innermost Dimension is skewed only upon request
        if d is c.ispace[-1].dim and not self.skewinner:
            return clusters

        # Exactly one SEQUENTIAL Dimension is needed to skew against. With
        # none there is nothing to skew against (and `set.pop` on the empty
        # set would raise KeyError); with several the choice is ambiguous
        skew_dims = {i.dim for i in c.ispace if SEQUENTIAL in c.properties[i.dim]}
        if len(skew_dims) != 1:
            return clusters
        skew_dim = skew_dims.pop()

        # Since we are here, `d` is skewable and nested under a
        # SEQUENTIAL loop
        # NOTE(review): presumably Interval(dim, lower, upper), i.e. both
        # bounds of `d` get shifted by `skew_dim` -- confirm
        intervals = [Interval(d, skew_dim, skew_dim) if i.dim is d else i
                     for i in c.ispace]
        intervals = IntervalGroup(intervals, relations=c.ispace.relations)
        ispace = IterationSpace(intervals, c.ispace.sub_iterators,
                                c.ispace.directions)

        # Compensate the shifted bounds within the expressions' indices
        exprs = xreplace_indices(c.exprs, {d: d - skew_dim})
        processed.append(c.rebuild(exprs=exprs, ispace=ispace,
                                   properties=c.properties))

    return processed
4 changes: 2 additions & 2 deletions examples/compiler/03_iet-A.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,8 @@
" <Section (section0)>\n",
"\n",
" <TimedList (1, 1, 1)>\n",
" <[affine,parallel,parallel=,tilable] Iteration x::x::(x_m, x_M, 1)>\n",
" <[affine,parallel,parallel=,tilable] Iteration y::y::(y_m, y_M, 1)>\n",
" <[affine,parallel,parallel=,skewable,tilable] Iteration x::x::(x_m, x_M, 1)>\n",
" <[affine,parallel,parallel=,skewable,tilable] Iteration y::y::(y_m, y_M, 1)>\n",
" <ExpressionBundle (1)>\n",
"\n",
" <Expression u[t1, x + 1, y + 1] = u[t0, x + 1, y + 1] + 1>\n",
Expand Down
2 changes: 1 addition & 1 deletion examples/misc/linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def callback_shape(ctx, param, value):

def callback_opts(ctx, param, value):
if value is True:
return ('blocking,simd,openmp', {'blockinner': True})
return ('blocking', 'simd', 'openmp', {'blockinner': True})
else:
return 'noop'

Expand Down
7 changes: 7 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,10 @@ def pytest_runtest_call(item):
if item.get_closest_marker("parallel") and not partest:
# Spawn parallel processes to run test
parallel(item)


# Optimization options/pipelines to be used by the tests concerning spatial
# and/or temporal blocking: plain 'advanced', 'advanced' with skewing enabled,
# and 'advanced' with both skewing and `blockinner` enabled
opts_tiling = ['advanced',
('advanced', {'skewing': True}),
('advanced', {'skewing': True, 'blockinner': True})]
Loading