compiler: Add blockrelax tests and refresh advisor profiling #1929

Merged (2 commits, Jun 7, 2022)
34 changes: 21 additions & 13 deletions benchmarks/user/advisor/run_advisor.py
@@ -31,7 +31,7 @@
 def run_with_advisor(path, output, name, exec_args):
     path = Path(path)
     check(path.is_file(), '%s not found' % path)
-    check(path.suffix == '.py', '%s not a regular Python file' % path)
+    check(path.suffix == '.py', '%s not a Python file' % path)
 
     # Create a directory to store the profiling report
     if name is None:
@@ -49,15 +49,20 @@ def run_with_advisor(path, output, name, exec_args):
     output = Path(output).joinpath(name)
     output.mkdir(parents=True, exist_ok=True)
 
-    # Intel Advisor must be available through either Intel Parallel Studio
-    # or Intel oneAPI (currently tested versions include IPS 2020 Update 2 and
+    # Intel Advisor and the Intel compilers must be available through either Intel
+    # Parallel Studio or Intel oneAPI (currently tested versions include IPS 2020 Update 2 and
     # oneAPI 2021 beta08)
     try:
         ret = check_output(['advixe-cl', '--version']).decode("utf-8")
     except FileNotFoundError:
         check(False, "Error: Couldn't detect `advixe-cl` to run Intel Advisor.")
 
-    # If Advisor is available, so is the Intel compiler
+    try:
+        ret = check_output(['icc', '--version']).decode("utf-8")
+    except FileNotFoundError:
+        check(False, "Error: Couldn't detect Intel Compiler (icc).")
+
+    # All good, the Intel compiler and Advisor are available
     os.environ['DEVITO_ARCH'] = 'intel'
 
     # Tell Devito to instrument the generated code for Advisor
@@ -68,12 +73,13 @@ def run_with_advisor(path, output, name, exec_args):
     if devito_logging is None:
         os.environ['DEVITO_LOGGING'] = 'WARNING'
 
-    with progress('Set up multi-threading environment'):
-        # Roofline analyses only make sense with threading enabled
+    with progress('Setting up multi-threading environment'):
+        # Roofline analyses are recommended with threading enabled
         os.environ['DEVITO_LANGUAGE'] = 'openmp'
 
-        # We must be able to do thread pinning, otherwise any results would be
-        # meaningless. Currently, we only support doing that via numactl
+        # Thread pinning is strongly recommended for reliable results.
+        # This script uses numactl for this purpose. Users may want to set their
+        # own pinning: https://hpc-wiki.info/hpc/Binding/Pinning
        try:
            ret = check_output(['numactl', '--show']).decode("utf-8")
            ret = dict(i.split(':') for i in ret.split('\n') if i)
@@ -94,6 +100,9 @@ def run_with_advisor(path, output, name, exec_args):
     # `stackoverflow.com/questions/17053671/python-how-do-you-stop-numpy-from-multithreading` # noqa
     os.environ['NUMEXPR_NUM_THREADS'] = '1'
 
+    # To build a roofline with Advisor, we need to run two analyses back to
+    # back, `survey` and `tripcounts`.
+
     numactl_cmd = [
         'numactl',
         '--cpunodebind=0'
@@ -109,21 +118,20 @@ def run_with_advisor(path, output, name, exec_args):
         '-run-pass-thru=--no-altstack', # Avoids `https://software.intel.com/en-us/vtune-amplifier-help-error-message-stack-size-is-too-small` # noqa
         '-run-pass-thru=-timestamp=sys', # Avoids 'VTune Amplifier may detect which timer source to use incorrectly on Intel® Xeon® processor E5-XXXX processors (200287361)' # noqa
         '-strategy ldconfig:notrace:notrace', # Avoids `https://software.intel.com/en-us/forums/intel-vtune-amplifier-xe/topic/779309` # noqa
-        '-start-paused', # The generated code will enable/disable Advisor on a loop basis
+        '-start-paused', # The generated code will enable/disable Advisor on a loop basis according to the decorated pragmas # noqa
     ]
     advisor_flops = [
         '--collect=tripcounts',
-        '--enable-cache-simulation', # Switch to '-enable-cache-simulation' for a CARM roofline `https://software.intel.com/content/www/us/en/develop/articles/integrated-roofline-model-with-intel-advisor.html` # noqa
+        '--enable-cache-simulation', # Switch to '-enable-cache-simulation' for a CARM roofline model `https://software.intel.com/content/www/us/en/develop/articles/integrated-roofline-model-with-intel-advisor.html` # noqa
         '--flop',
         '--stacks',
         '--collect=map',
         '-start-paused',
     ]
     py_cmd = [sys.executable, str(path)] + exec_args.split()
 
-    # To build a roofline with Advisor, we need to run two analyses back to
-    # back, `survey` and `tripcounts`. These are preceded by a "pure" python
-    # run to warmup the jit cache
+    # Before collecting `survey` and `tripcounts`, a "pure" Python run is
+    # performed first to warm up the JIT cache
 
     log('Starting Intel Advisor\'s `roofline` analysis for `%s`' % name)
     dt = datetime.datetime.now()
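For orientation, the `survey` and `tripcounts` collections described above chain roughly as follows. This is a minimal sketch rather than the script itself: the flag lists are abridged from the diff, and `project_dir` and `target` are hypothetical placeholders.

import subprocess
import sys

# Hypothetical placeholders, for illustration only
project_dir = '/tmp/advisor-project'
target = ['my_script.py', '--grid', '512']

base = ['numactl', '--cpunodebind=0',
        'advixe-cl', '-project-dir', project_dir, '-start-paused']

# First pass: `survey` collects per-loop timing data
survey = base + ['--collect=survey']
# Second pass: `tripcounts` layers FLOP and trip-count data on top of the survey
tripcounts = base + ['--collect=tripcounts', '--flop', '--stacks']

for analysis in (survey, tripcounts):
    subprocess.run(analysis + ['--', sys.executable] + target, check=True)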
15 changes: 6 additions & 9 deletions devito/core/autotuning.py
@@ -75,16 +75,13 @@ def autotune(operator, args, level, mode):
     # Detect the time-stepping Iteration; shrink its iteration range so that
     # each autotuning run only takes a few iterations
     steppers = {i for i in flatten(trees) if i.dim.is_Time}
-    if len(steppers) == 0:
-        stepper = None
-        timesteps = 1
-    elif len(steppers) == 1:
+    if len(steppers) == 1:
Review discussion on this change:

Contributor: why remove that 0 case?

FabioLuporini (Jun 1, 2022): if CI doesn't fail, either the test suite is flawed or we somewhat changed things over time such that it's never the case. Now, the second case to me is unlikely... one could definitely try auto-tuning without a time loop. Hence, the test suite requires an update!

georgebisbas (author): The 0 case is removed since Devito does not apply loop blocking to non-time-iterative computations:

    # Heuristic: TILABLE not worth it if not within a SEQUENTIAL Dimension

CI passes. Idea: blockrelax should allow blocking non-time-iterative loops, right? It skips the heuristics. Then we need to add some tests for this, e.g. blockrelax on the matrix multiplication example? I need to check.

Contributor: sounds good to me yeah

georgebisbas (author): tests updated with blockrelax in linear algebra. Need to add some tests that check the structure too.
         stepper = steppers.pop()
         timesteps = init_time_bounds(stepper, at_args, args)
         if timesteps is None:
             return args, {}
     else:
-        warning("cannot perform autotuning unless there is one time loop; skipping")
+        warning("cannot perform autotuning with %d time loops; skipping" % len(steppers))
         return args, {}

     # Use a fresh Timer for auto-tuning
@@ -220,7 +217,7 @@ def init_time_bounds(stepper, at_args, args):


 def check_time_bounds(stepper, at_args, args, mode):
-    if mode != 'runtime' or stepper is None:
+    if mode != 'runtime':
         return True
     dim = stepper.dim.root
     if stepper.direction is Backward:
@@ -319,13 +316,13 @@ def generate_block_shapes(blockable, args, level):
     for bs in list(ret):
         handle = []
         for v in options['blocksize-l1']:
-            # To be a valid blocksize, it must be smaller than and divide evenly
-            # the parent's block size
+            # To be a valid block size, it must be smaller than
+            # and divide evenly the parent's block size
             if all(v <= i and i % v == 0 for _, i in bs):
                 ret.append(bs + tuple((d.step, v) for d in level_1))
         ret.remove(bs)
 
-    # Generate level-n (n > 1) block shapes
+    # Generate level-n (n > 2) block shapes
     # TODO -- currently, there's no Operator producing depth>2 hierarchical blocking,
     # so for simplicity we ignore this for the time being
 
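As an aside, the validity rule restated in the comment above (a level-1 block size must be no larger than, and divide evenly, the parent's block size) can be checked in isolation. A standalone sketch in plain Python, independent of the Devito API:

def valid_l1_sizes(parent_size, candidates):
    # Keep candidates that are no larger than, and divide evenly,
    # the parent's block size
    return [v for v in candidates if v <= parent_size and parent_size % v == 0]

# With a parent block size of 32, only 4, 8 and 16 survive
assert valid_l1_sizes(32, [4, 8, 12, 16, 64]) == [4, 8, 16]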
2 changes: 1 addition & 1 deletion examples/misc/linalg.py
@@ -24,7 +24,7 @@ def callback_shape(ctx, param, value):

 def callback_opts(ctx, param, value):
     if value is True:
-        return ('blocking', 'simd', 'openmp', {'blockinner': True})
+        return ('advanced', {'blockinner': True, 'blockrelax': True})
     else:
         return 'noop'
 
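The returned tuple is passed straight to `Operator(..., opt=...)`. With `blockrelax`, blocking applies even to loop nests that carry no time dimension, which is exactly the case exercised by the linear algebra kernels. A minimal self-contained sketch of the idea (mirroring the test added below; the 4x4 shapes are arbitrary):

import numpy as np
from devito import Function, Inc, Operator, dimensions

i, j, k = dimensions('i j k')
A = Function(name='A', shape=(4, 4), dimensions=(i, j))
B = Function(name='B', shape=(4, 4), dimensions=(j, k))
C = Function(name='C', shape=(4, 4), dimensions=(i, k))
A.data[:] = 1
B.data[:] = 1

# `blockrelax` skips the "TILABLE only within SEQUENTIAL" heuristic,
# so this time-free matrix-multiplication nest gets blocked too
op = Operator(Inc(C, A*B), opt=('advanced', {'blockinner': True, 'blockrelax': True}))
op.apply()
assert np.all(C.data == 4)  # each entry accumulates four 1*1 products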
28 changes: 28 additions & 0 deletions tests/test_dle.py
@@ -193,6 +193,34 @@ def test_cache_blocking_structure_optrelax():
     assert iters[1].dim.is_Block
 
 
+@pytest.mark.parametrize('opt, expected', [('noop', ('ijk', 'ikl')),
+                                           (('advanced', {'blockinner': True, 'blockrelax': True}),
+                                            ('i0_blk0ijk', 'i0_blk0ikl'))])
+def test_cache_blocking_structure_optrelax_linalg(opt, expected):
+    mat_shape = (4, 4)
+
+    i, j, k, l = dimensions('i j k l')
+    A = Function(name='A', shape=mat_shape, dimensions=(i, j))
+    B = Function(name='B', shape=mat_shape, dimensions=(j, k))
+    C = Function(name='C', shape=mat_shape, dimensions=(j, k))
+    D = Function(name='D', shape=mat_shape, dimensions=(i, k))
+    E = Function(name='E', shape=mat_shape, dimensions=(k, l))
+    F = Function(name='F', shape=mat_shape, dimensions=(i, l))
+
+    eqs = [Inc(D, A*B + A*C), Inc(F, D*E)]
+
+    A.data[:] = 1
+    B.data[:] = 1
+    C.data[:] = 1
+    E.data[:] = 1
+
+    op0 = Operator(eqs, opt=opt)
+    op0.apply()
+    assert_structure(op0, expected)
+    assert np.linalg.norm(D.data) == 32.0
+    assert np.linalg.norm(F.data) == 128.0
+
+
 @pytest.mark.parametrize('par_tile,expected', [
     (True, ((16, 16, 16), (16, 16, 16))),
     ((32, 4, 4), ((4, 4, 32), (4, 4, 32))),
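To exercise just the new structure test locally, a pytest invocation along these lines should work (the node id is inferred from the test name in the diff):

import pytest

# Select only the new test; all parametrized variants will run
pytest.main(['tests/test_dle.py::test_cache_blocking_structure_optrelax_linalg', '-v'])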