compiler: Add blockrelax tests and refresh advisor profiling #1929

Merged (2 commits, Jun 7, 2022)
34 changes: 21 additions & 13 deletions benchmarks/user/advisor/run_advisor.py
@@ -31,7 +31,7 @@
 def run_with_advisor(path, output, name, exec_args):
     path = Path(path)
     check(path.is_file(), '%s not found' % path)
-    check(path.suffix == '.py', '%s not a regular Python file' % path)
+    check(path.suffix == '.py', '%s not a Python file' % path)
 
     # Create a directory to store the profiling report
     if name is None:
@@ -49,15 +49,20 @@ def run_with_advisor(path, output, name, exec_args):
     output = Path(output).joinpath(name)
     output.mkdir(parents=True, exist_ok=True)
 
-    # Intel Advisor must be available through either Intel Parallel Studio
-    # or Intel oneAPI (currently tested versions include IPS 2020 Update 2 and
+    # Intel Advisor and the Intel compilers must be available through either Intel
+    # Parallel Studio or Intel oneAPI (currently tested versions include IPS 2020 Update 2 and
     # oneAPI 2021 beta08)
     try:
         ret = check_output(['advixe-cl', '--version']).decode("utf-8")
     except FileNotFoundError:
         check(False, "Error: Couldn't detect `advixe-cl` to run Intel Advisor.")
 
-    # If Advisor is available, so is the Intel compiler
+    try:
+        ret = check_output(['icc', '--version']).decode("utf-8")
+    except FileNotFoundError:
+        check(False, "Error: Couldn't detect Intel Compiler (icc).")
+
+    # All good, the Intel compiler and Advisor are available
     os.environ['DEVITO_ARCH'] = 'intel'
 
     # Tell Devito to instrument the generated code for Advisor
@@ -68,12 +73,13 @@ def run_with_advisor(path, output, name, exec_args):
     if devito_logging is None:
         os.environ['DEVITO_LOGGING'] = 'WARNING'
 
-    with progress('Set up multi-threading environment'):
-        # Roofline analyses only make sense with threading enabled
+    with progress('Setting up multi-threading environment'):
+        # Roofline analyses are recommended with threading enabled
         os.environ['DEVITO_LANGUAGE'] = 'openmp'
 
-        # We must be able to do thread pinning, otherwise any results would be
-        # meaningless. Currently, we only support doing that via numactl
+        # Thread pinning is strongly recommended for reliable results.
+        # This script uses numactl for this purpose. Users may want to set their
+        # own pinning: https://hpc-wiki.info/hpc/Binding/Pinning
        try:
            ret = check_output(['numactl', '--show']).decode("utf-8")
            ret = dict(i.split(':') for i in ret.split('\n') if i)
@@ -94,6 +100,9 @@ def run_with_advisor(path, output, name, exec_args):
     # `stackoverflow.com/questions/17053671/python-how-do-you-stop-numpy-from-multithreading` # noqa
     os.environ['NUMEXPR_NUM_THREADS'] = '1'
 
+    # To build a roofline with Advisor, we need to run two analyses back to
+    # back, `survey` and `tripcounts`.
+
     numactl_cmd = [
         'numactl',
         '--cpunodebind=0'
@@ -109,21 +118,20 @@ def run_with_advisor(path, output, name, exec_args):
         '-run-pass-thru=--no-altstack', # Avoids `https://software.intel.com/en-us/vtune-amplifier-help-error-message-stack-size-is-too-small` # noqa
         '-run-pass-thru=-timestamp=sys', # Avoids 'VTune Amplifier may detect which timer source to use incorrectly on Intel® Xeon® processor E5-XXXX processors (200287361)' # noqa
         '-strategy ldconfig:notrace:notrace', # Avoids `https://software.intel.com/en-us/forums/intel-vtune-amplifier-xe/topic/779309` # noqa
-        '-start-paused', # The generated code will enable/disable Advisor on a loop basis
+        '-start-paused', # The generated code will enable/disable Advisor on a loop basis according to the decorated pragmas # noqa
     ]
     advisor_flops = [
         '--collect=tripcounts',
-        '--enable-cache-simulation', # Switch to '-enable-cache-simulation' for a CARM roofline `https://software.intel.com/content/www/us/en/develop/articles/integrated-roofline-model-with-intel-advisor.html` # noqa
+        '--enable-cache-simulation', # Switch to '-enable-cache-simulation' for a CARM roofline model `https://software.intel.com/content/www/us/en/develop/articles/integrated-roofline-model-with-intel-advisor.html` # noqa
         '--flop',
         '--stacks',
         '--collect=map',
         '-start-paused',
     ]
     py_cmd = [sys.executable, str(path)] + exec_args.split()
 
-    # To build a roofline with Advisor, we need to run two analyses back to
-    # back, `survey` and `tripcounts`. These are preceded by a "pure" python
-    # run to warmup the jit cache
+    # Before collecting `survey` and `tripcounts`, a "pure" Python run is
+    # performed first to warm up the JIT cache
 
     log('Starting Intel Advisor\'s `roofline` analysis for `%s`' % name)
     dt = datetime.datetime.now()
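For orientation, the `survey` and `tripcounts` collections described above chain roughly as follows. This is a minimal sketch rather than the script itself: the flag lists are abridged from the diff, and `project_dir` and `target` are hypothetical placeholders.

import subprocess
import sys

# Hypothetical placeholders, for illustration only
project_dir = '/tmp/advisor-project'
target = ['my_script.py', '--grid', '512']

base = ['numactl', '--cpunodebind=0',
        'advixe-cl', '-project-dir', project_dir, '-start-paused']

# First pass: `survey` collects per-loop timing data
survey = base + ['--collect=survey']
# Second pass: `tripcounts` layers FLOP and trip-count data on top of the survey
tripcounts = base + ['--collect=tripcounts', '--flop', '--stacks']

for analysis in (survey, tripcounts):
    subprocess.run(analysis + ['--', sys.executable] + target, check=True)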
15 changes: 6 additions & 9 deletions devito/core/autotuning.py
@@ -75,16 +75,13 @@ def autotune(operator, args, level, mode):
     # Detect the time-stepping Iteration; shrink its iteration range so that
     # each autotuning run only takes a few iterations
     steppers = {i for i in flatten(trees) if i.dim.is_Time}
-    if len(steppers) == 0:
-        stepper = None
-        timesteps = 1
-    elif len(steppers) == 1:
+    if len(steppers) == 1:
Review discussion on this change:

Contributor: why remove that 0 case?

FabioLuporini (Jun 1, 2022): if CI doesn't fail, either the test suite is flawed or we somewhat changed things over time such that it's never the case. Now, the second case to me is unlikely... one could definitely try auto-tuning without a time loop. Hence, the test suite requires an update!

georgebisbas (author): The 0 case is removed since Devito does not apply loop blocking to non-time-iterative computations:

    # Heuristic: TILABLE not worth it if not within a SEQUENTIAL Dimension

CI passes. Idea: blockrelax should allow blocking non-time-iterative loops, right? It skips the heuristics. Then we need to add some tests for this, e.g. blockrelax on the matrix multiplication example? I need to check.

Contributor: sounds good to me yeah

georgebisbas (author): tests updated with blockrelax in linear algebra. Need to add some tests that check the structure too.
         stepper = steppers.pop()
         timesteps = init_time_bounds(stepper, at_args, args)
         if timesteps is None:
             return args, {}
     else:
-        warning("cannot perform autotuning unless there is one time loop; skipping")
+        warning("cannot perform autotuning with %d time loops; skipping" % len(steppers))
         return args, {}

     # Use a fresh Timer for auto-tuning
@@ -220,7 +217,7 @@ def init_time_bounds(stepper, at_args, args):


 def check_time_bounds(stepper, at_args, args, mode):
-    if mode != 'runtime' or stepper is None:
+    if mode != 'runtime':
         return True
     dim = stepper.dim.root
     if stepper.direction is Backward:
@@ -319,13 +316,13 @@ def generate_block_shapes(blockable, args, level):
     for bs in list(ret):
         handle = []
         for v in options['blocksize-l1']:
-            # To be a valid blocksize, it must be smaller than and divide evenly
-            # the parent's block size
+            # To be a valid block size, it must be smaller than
+            # and divide evenly the parent's block size
             if all(v <= i and i % v == 0 for _, i in bs):
                 ret.append(bs + tuple((d.step, v) for d in level_1))
         ret.remove(bs)
 
-    # Generate level-n (n > 1) block shapes
+    # Generate level-n (n > 2) block shapes
     # TODO -- currently, there's no Operator producing depth>2 hierarchical blocking,
     # so for simplicity we ignore this for the time being
 
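As an aside, the validity rule restated in the comment above (a level-1 block size must be no larger than, and divide evenly, the parent's block size) can be checked in isolation. A standalone sketch in plain Python, independent of the Devito API:

def valid_l1_sizes(parent_size, candidates):
    # Keep candidates that are no larger than, and divide evenly,
    # the parent's block size
    return [v for v in candidates if v <= parent_size and parent_size % v == 0]

# With a parent block size of 32, only 4, 8 and 16 survive
assert valid_l1_sizes(32, [4, 8, 12, 16, 64]) == [4, 8, 16]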
2 changes: 1 addition & 1 deletion examples/misc/linalg.py
@@ -24,7 +24,7 @@ def callback_shape(ctx, param, value):

 def callback_opts(ctx, param, value):
     if value is True:
-        return ('blocking', 'simd', 'openmp', {'blockinner': True})
+        return ('advanced', {'blockinner': True, 'blockrelax': True})
     else:
         return 'noop'
 
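The returned tuple is passed straight to `Operator(..., opt=...)`. With `blockrelax`, blocking applies even to loop nests that carry no time dimension, which is exactly the case exercised by the linear algebra kernels. A minimal self-contained sketch of the idea (mirroring the test added below; the 4x4 shapes are arbitrary):

import numpy as np
from devito import Function, Inc, Operator, dimensions

i, j, k = dimensions('i j k')
A = Function(name='A', shape=(4, 4), dimensions=(i, j))
B = Function(name='B', shape=(4, 4), dimensions=(j, k))
C = Function(name='C', shape=(4, 4), dimensions=(i, k))
A.data[:] = 1
B.data[:] = 1

# `blockrelax` skips the "TILABLE only within SEQUENTIAL" heuristic,
# so this time-free matrix-multiplication nest gets blocked too
op = Operator(Inc(C, A*B), opt=('advanced', {'blockinner': True, 'blockrelax': True}))
op.apply()
assert np.all(C.data == 4)  # each entry accumulates four 1*1 products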
28 changes: 28 additions & 0 deletions tests/test_dle.py
@@ -193,6 +193,34 @@ def test_cache_blocking_structure_optrelax():
     assert iters[1].dim.is_Block
 
 
+@pytest.mark.parametrize('opt, expected', [('noop', ('ijk', 'ikl')),
+                                           (('advanced', {'blockinner': True, 'blockrelax': True}),
+                                            ('i0_blk0ijk', 'i0_blk0ikl'))])
+def test_cache_blocking_structure_optrelax_linalg(opt, expected):
+    mat_shape = (4, 4)
+
+    i, j, k, l = dimensions('i j k l')
+    A = Function(name='A', shape=mat_shape, dimensions=(i, j))
+    B = Function(name='B', shape=mat_shape, dimensions=(j, k))
+    C = Function(name='C', shape=mat_shape, dimensions=(j, k))
+    D = Function(name='D', shape=mat_shape, dimensions=(i, k))
+    E = Function(name='E', shape=mat_shape, dimensions=(k, l))
+    F = Function(name='F', shape=mat_shape, dimensions=(i, l))
+
+    eqs = [Inc(D, A*B + A*C), Inc(F, D*E)]
+
+    A.data[:] = 1
+    B.data[:] = 1
+    C.data[:] = 1
+    E.data[:] = 1
+
+    op0 = Operator(eqs, opt=opt)
+    op0.apply()
+    assert_structure(op0, expected)
+    assert np.linalg.norm(D.data) == 32.0
+    assert np.linalg.norm(F.data) == 128.0
+
+
 @pytest.mark.parametrize('par_tile,expected', [
     (True, ((16, 16, 16), (16, 16, 16))),
     ((32, 4, 4), ((4, 4, 32), (4, 4, 32))),
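To exercise just the new structure test locally, a pytest invocation along these lines should work (the node id is inferred from the test name in the diff):

import pytest

# Select only the new test; all parametrized variants will run
pytest.main(['tests/test_dle.py::test_cache_blocking_structure_optrelax_linalg', '-v'])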