compiler: Patch race conditions due to storage-related dependencies #1903

Merged: 5 commits, Apr 13, 2022
8 changes: 7 additions & 1 deletion devito/arch/compiler.py
@@ -481,7 +481,13 @@ def __init__(self, *args, **kwargs):
self.cflags.remove('-std=c99')
self.cflags.remove('-O3')
self.cflags.remove('-Wall')
self.cflags += ['-std=c++11', '-acc:gpu', '-gpu=pinned', '-mp']

self.cflags += ['-std=c++11', '-mp']

platform = kwargs.pop('platform', configuration['platform'])
if platform is NVIDIAX:
self.cflags += ['-acc:gpu', '-gpu=pinned']

if not configuration['safe-math']:
self.cflags.append('-fast')
# Default PGI compile for a target is GPU and single threaded host.
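
A quick sketch of the resulting flag sets (the variable names below are illustrative; the values are read off the hunk above):

```python
# Sketch only: what the patched __init__ ends up appending, assuming the rest
# of the base cflags are untouched. Non-NVIDIA targets no longer receive the
# OpenACC/pinned-memory flags.
base_flags = ['-std=c++11', '-mp']                        # always appended
nvidia_flags = base_flags + ['-acc:gpu', '-gpu=pinned']   # only when platform is NVIDIAX
```
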
9 changes: 6 additions & 3 deletions devito/core/cpu.py
@@ -143,14 +143,15 @@ class Cpu64NoopOperator(Cpu64OperatorMixin, CoreOperator):
def _specialize_iet(cls, graph, **kwargs):
options = kwargs['options']
platform = kwargs['platform']
compiler = kwargs['compiler']
Contributor
Note: not for this PR, but in the long run we should gather all of these into a CompilerOptions class, to avoid carrying all these arguments around everywhere.

Contributor Author
that's a fair point, but I actually don't mind supplying the passes with only what's strictly necessary rather than a big batch of things
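
A rough sketch of the bundle suggested above; the class and its fields are hypothetical, not part of this PR or of Devito:

```python
# Hypothetical illustration of the reviewer's suggestion: gather the kwargs
# that every pass currently unpacks by hand into a single immutable object.
from dataclasses import dataclass
from typing import Any


@dataclass(frozen=True)
class CompilerOptions:
    options: dict
    platform: Any    # e.g. a Platform instance
    compiler: Any    # e.g. a Compiler instance
    sregistry: Any   # the symbol registry

    @classmethod
    def from_kwargs(cls, **kwargs):
        return cls(kwargs['options'], kwargs['platform'],
                   kwargs['compiler'], kwargs['sregistry'])
```
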

sregistry = kwargs['sregistry']

# Distributed-memory parallelism
mpiize(graph, sregistry=sregistry, options=options)

# Shared-memory parallelism
if options['openmp']:
parizer = cls._Target.Parizer(sregistry, options, platform)
parizer = cls._Target.Parizer(sregistry, options, platform, compiler)
parizer.make_parallel(graph)
parizer.initialize(graph)

@@ -213,6 +214,7 @@ def _specialize_clusters(cls, clusters, **kwargs):
def _specialize_iet(cls, graph, **kwargs):
options = kwargs['options']
platform = kwargs['platform']
compiler = kwargs['compiler']
sregistry = kwargs['sregistry']

# Flush denormal numbers
@@ -225,7 +227,7 @@ def _specialize_iet(cls, graph, **kwargs):
relax_incr_dimensions(graph)

# Parallelism
parizer = cls._Target.Parizer(sregistry, options, platform)
parizer = cls._Target.Parizer(sregistry, options, platform, compiler)
parizer.make_simd(graph)
parizer.make_parallel(graph)
parizer.initialize(graph)
@@ -304,9 +306,10 @@ def callback(f):
def _make_iet_passes_mapper(cls, **kwargs):
options = kwargs['options']
platform = kwargs['platform']
compiler = kwargs['compiler']
sregistry = kwargs['sregistry']

parizer = cls._Target.Parizer(sregistry, options, platform)
parizer = cls._Target.Parizer(sregistry, options, platform, compiler)

return {
'denormals': avoid_denormals,
9 changes: 6 additions & 3 deletions devito/core/gpu.py
@@ -130,13 +130,14 @@ class DeviceNoopOperator(DeviceOperatorMixin, CoreOperator):
def _specialize_iet(cls, graph, **kwargs):
options = kwargs['options']
platform = kwargs['platform']
compiler = kwargs['compiler']
sregistry = kwargs['sregistry']

# Distributed-memory parallelism
mpiize(graph, sregistry=sregistry, options=options)

# GPU parallelism
parizer = cls._Target.Parizer(sregistry, options, platform)
parizer = cls._Target.Parizer(sregistry, options, platform, compiler)
parizer.make_parallel(graph)
parizer.initialize(graph)

@@ -202,6 +203,7 @@ def _specialize_clusters(cls, clusters, **kwargs):
def _specialize_iet(cls, graph, **kwargs):
options = kwargs['options']
platform = kwargs['platform']
compiler = kwargs['compiler']
sregistry = kwargs['sregistry']

# Distributed-memory parallelism
@@ -211,7 +213,7 @@ def _specialize_iet(cls, graph, **kwargs):
relax_incr_dimensions(graph)

# GPU parallelism
parizer = cls._Target.Parizer(sregistry, options, platform)
parizer = cls._Target.Parizer(sregistry, options, platform, compiler)
parizer.make_parallel(graph)
parizer.initialize(graph)

@@ -282,9 +284,10 @@ def callback(f):
def _make_iet_passes_mapper(cls, **kwargs):
options = kwargs['options']
platform = kwargs['platform']
compiler = kwargs['compiler']
sregistry = kwargs['sregistry']

parizer = cls._Target.Parizer(sregistry, options, platform)
parizer = cls._Target.Parizer(sregistry, options, platform, compiler)
orchestrator = cls._Target.Orchestrator(sregistry)

return {
2 changes: 1 addition & 1 deletion devito/ir/clusters/algorithms.py
@@ -68,7 +68,7 @@ class Schedule(QueueStateful):
scheduled to different loop nests.

* If *all* dependences across two Clusters along a given Dimension are
backward carried depedences, then the IterationSpaces are _lifted_
backward carried dependences, then the IterationSpaces are _lifted_
such that the two Clusters cannot be fused. This is to maximize
the number of parallel Dimensions. Essentially, this is what low-level
compilers call "loop fission" -- only that here it occurs at a much
30 changes: 21 additions & 9 deletions devito/ir/support/basic.py
@@ -100,8 +100,6 @@ def aindices(self):
retval.append(dims.pop())
elif isinstance(i, Dimension):
retval.append(i)
elif q_constant(i):
retval.append(fi)
else:
retval.append(None)
return DimensionTuple(*retval, getters=self.findices)
@@ -262,10 +260,14 @@ def is_regular(self):
# space Dimensions
positions = []
for d in self.aindices:
for n, i in enumerate(self.intervals):
if i.dim._defines & d._defines:
positions.append(n)
break
try:
for n, i in enumerate(self.intervals):
if i.dim._defines & d._defines:
positions.append(n)
break
except AttributeError:
# `d is None` due to e.g. constant access
continue
return positions == sorted(positions)

def __lt__(self, other):
@@ -548,6 +550,15 @@ def is_cross(self):
def is_local(self):
return self.function.is_Symbol

@memoized_meth
def is_const(self, dim):
"""
True if this is a constant dependence, i.e. no Dimensions are involved, False otherwise.
"""
return (self.source.aindices[dim] is None and
self.sink.aindices[dim] is None and
self.distance_mapper[dim] == 0)

@memoized_meth
def is_carried(self, dim=None):
"""Return True if definitely a dimension-carried dependence, False otherwise."""
@@ -623,9 +634,10 @@ def is_storage_related(self, dims=None):
cause the access of the same memory location, False otherwise.
"""
for d in self.findices:
if (d._defines & set(as_tuple(dims)) and
any(i.is_NonlinearDerived for i in d._defines)):
return True
if d._defines & set(as_tuple(dims)):
if any(i.is_NonlinearDerived for i in d._defines) or \
self.is_const(d):
return True
return False
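
For context, a minimal user-level pattern of the kind the new `is_const`/`is_storage_related` logic is meant to flag; this snippet is an assumed illustration, not taken from the PR's tests:

```python
# Assumed illustration: `g` has no time dimension, so the update to `g` inside
# the time loop touches the same storage at every timestep. Along `time` the
# dependence involves no Dimension at all (source and sink aindices are None,
# distance 0), which is exactly what the new `is_const` check detects, and it
# is now classified as storage-related so the scheduler cannot race on `g`.
from devito import Grid, TimeFunction, Function, Eq, Operator

grid = Grid(shape=(16, 16))
u = TimeFunction(name='u', grid=grid)
g = Function(name='g', grid=grid)

op = Operator([Eq(u.forward, u + 1),
               Eq(g, g + u)])
```
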


9 changes: 7 additions & 2 deletions devito/mpi/halo_scheme.py
@@ -373,7 +373,7 @@ def classify(exprs, ispace):
v[(d, LEFT)] = STENCIL
v[(d, RIGHT)] = STENCIL
else:
v[(d, i.aindices[d])] = NONE
v[(d, i[d])] = NONE

# Does `i` actually require a halo exchange?
if not any(hl is STENCIL for hl in v.values()):
@@ -426,7 +426,12 @@ def classify(exprs, ispace):
func = Max
candidates = [i for i in aindices if not is_integer(i)]
candidates = {(i.origin if d.is_Stepping else i) - d: i for i in candidates}
loc_indices[d] = candidates[func(*candidates.keys())]
try:
loc_indices[d] = candidates[func(*candidates.keys())]
except KeyError:
# E.g., `aindices = [0, 1, d+1]` -- it doesn't really matter
# what we put here, so we place 0 as it's the old behaviour
loc_indices[d] = 0

mapper[f] = HaloSchemeEntry(frozendict(loc_indices), frozenset(halos))

2 changes: 1 addition & 1 deletion devito/passes/clusters/asynchrony.py
@@ -91,7 +91,7 @@ def callback(self, clusters, prefix):
else:
# Functions over non-stepping Dimensions need no lock
continue
except KeyError:
except (AttributeError, KeyError):
# Would degenerate to a scalar, but we rather use a lock
# of size 1 for simplicity
ld = CustomDimension(name='ld', symbolic_size=1)
25 changes: 22 additions & 3 deletions devito/passes/clusters/buffering.py
@@ -5,7 +5,8 @@
import numpy as np

from devito.ir import (Cluster, Forward, GuardBound, Interval, IntervalGroup,
IterationSpace, PARALLEL, Queue, Vector, lower_exprs, vmax, vmin)
IterationSpace, PARALLEL, Queue, SEQUENTIAL, Vector,
lower_exprs, normalize_properties, vmax, vmin)
from devito.exceptions import InvalidOperator
from devito.logger import warning
from devito.symbolics import retrieve_function_carriers, uxreplace
@@ -207,7 +208,16 @@ def callback(self, clusters, prefix, cache=None):
expr = lower_exprs(uxreplace(Eq(lhs, rhs), b.subdims_mapper))
ispace = b.written

processed.append(c.rebuild(exprs=expr, ispace=ispace))
# Buffering creates a storage-related dependence along the
# contracted dimensions
properties = dict(c.properties)
for d in b.contraction_mapper:
d = ispace[d].dim # E.g., `time_sub -> time`
properties[d] = normalize_properties(properties[d], {SEQUENTIAL})

processed.append(
c.rebuild(exprs=expr, ispace=ispace, properties=properties)
)

# Substitute buffered Functions with the newly created buffers
exprs = [uxreplace(e, subs) for e in c.exprs]
@@ -233,7 +243,16 @@ def callback(self, clusters, prefix, cache=None):
expr = lower_exprs(uxreplace(Eq(lhs, rhs), b.subdims_mapper))
ispace = b.written

processed.append(c.rebuild(exprs=expr, ispace=ispace))
# Buffering creates a storage-related dependence along the
# contracted dimensions
properties = dict(c.properties)
for d in b.contraction_mapper:
d = ispace[d].dim # E.g., `time_sub -> time`
properties[d] = normalize_properties(properties[d], {SEQUENTIAL})

processed.append(
c.rebuild(exprs=expr, ispace=ispace, properties=properties)
)

return processed
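
The intent of the two hunks above, shown as a toy sketch of the assumed merging semantics (the real `normalize_properties` lives elsewhere in Devito; the function below is only illustrative): once the buffer is introduced, iterations along the contracted dimension reuse the same storage slots, so a `PARALLEL` marking on that dimension must be demoted.

```python
# Toy sketch (assumed semantics, not Devito's actual implementation): merging
# {SEQUENTIAL} into a dimension's properties wins over PARALLEL, because the
# circular buffer makes iterations along that dimension order-dependent.
PARALLEL = 'parallel'
SEQUENTIAL = 'sequential'


def normalize_properties_sketch(current, extra):
    merged = set(current) | set(extra)
    if SEQUENTIAL in merged:
        merged.discard(PARALLEL)
    return merged


assert normalize_properties_sketch({PARALLEL}, {SEQUENTIAL}) == {SEQUENTIAL}
```
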

5 changes: 4 additions & 1 deletion devito/passes/iet/langbase.py
@@ -185,7 +185,7 @@ class LangTransformer(ABC):
The constructs of the target language. To be specialized by a subclass.
"""

def __init__(self, key, sregistry, platform):
def __init__(self, key, sregistry, platform, compiler):
"""
Parameters
----------
@@ -195,13 +195,16 @@ def __init__(self, key, sregistry, platform):
The symbol registry, to access the symbols appearing in an IET.
platform : Platform
The underlying platform.
compiler : Compiler
The underlying JIT compiler.
"""
if key is not None:
self.key = key
else:
self.key = lambda i: False
self.sregistry = sregistry
self.platform = platform
self.compiler = compiler

@iet_pass
def make_parallel(self, iet):
13 changes: 13 additions & 0 deletions devito/passes/iet/languages/openmp.py
@@ -1,7 +1,10 @@
from distutils import version

import cgen as c
from sympy import Not

from devito.arch import AMDGPUX, NVIDIAX, INTELGPUX
from devito.arch.compiler import GNUCompiler
from devito.ir import (Call, Conditional, List, Prodder, ParallelIteration,
ParallelBlock, PointerCast, While, FindSymbols)
from devito.passes.iet.definitions import DataManager, DeviceAwareDataManager
@@ -186,8 +189,18 @@ class SimdOmpizer(PragmaSimdTransformer):


class Ompizer(PragmaShmTransformer):

lang = OmpBB

@classmethod
def _support_array_reduction(cls, compiler):
# Not all backend compilers support array reduction!
# Here are the known unsupported ones:
if isinstance(compiler, GNUCompiler) and \
compiler.version < version.StrictVersion("6.0"):
return False
return True


class DeviceOmpizer(PragmaDeviceAwareTransformer):
lang = DeviceOmpBB
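
For reference, the feature being gated by `_support_array_reduction` above: array-section reductions such as `reduction(+:f[0:n])` are an OpenMP 4.5 construct, and GCC only gained support for them from the 6.x series, which is what the version check encodes. Below is a hedged sketch of how the predicate might be consumed now that the parallelizer also carries the backend compiler; the helper and its call site are illustrative, not Devito's actual code.

```python
# Illustrative only: a possible consumer of the new predicate. The clause
# syntax is standard OpenMP; the helper itself is hypothetical.
def reduction_clause(compiler, name, size, supports_array_reduction):
    if supports_array_reduction(compiler):
        # OpenMP 4.5 array-section reduction, e.g. reduction(+:f[0:n])
        return 'reduction(+:%s[0:%s])' % (name, size)
    # Older GCCs (< 6.0) reject array-section reductions; a caller would have
    # to fall back to, e.g., atomic updates instead of a reduction clause.
    return None


# Hypothetical usage:
# clause = reduction_clause(compiler, 'f', 'n', Ompizer._support_array_reduction)
```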