Skip to content

Commit

Permalink
compiler: Refresh autotuning
Browse files Browse the repository at this point in the history
  • Loading branch information
georgebisbas committed Jun 1, 2022
1 parent 1fa9db5 commit ac4b61f
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 13 deletions.
10 changes: 5 additions & 5 deletions benchmarks/user/advisor/run_advisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ def run_with_advisor(path, output, name, exec_args):
os.environ['DEVITO_LANGUAGE'] = 'openmp'

# Thread pinning is strongly recommended for reliable results.
# We support thread pinning via numactl
# This script is using numactl for this purpose. Users may want to set their
# own pinning: https://hpc-wiki.info/hpc/Binding/Pinning
try:
ret = check_output(['numactl', '--show']).decode("utf-8")
ret = dict(i.split(':') for i in ret.split('\n') if i)
Expand All @@ -99,7 +100,6 @@ def run_with_advisor(path, output, name, exec_args):
# `stackoverflow.com/questions/17053671/python-how-do-you-stop-numpy-from-multithreading` # noqa
os.environ['NUMEXPR_NUM_THREADS'] = '1'


# To build a roofline with Advisor, we need to run two analyses back to
# back, `survey` and `tripcounts`.

Expand All @@ -118,7 +118,7 @@ def run_with_advisor(path, output, name, exec_args):
'-run-pass-thru=--no-altstack', # Avoids `https://software.intel.com/en-us/vtune-amplifier-help-error-message-stack-size-is-too-small` # noqa
'-run-pass-thru=-timestamp=sys', # Avoids 'VTune Amplifier may detect which timer source to use incorrectly on Intel® Xeon® processor E5-XXXX processors (200287361)' # noqa
'-strategy ldconfig:notrace:notrace', # Avoids `https://software.intel.com/en-us/forums/intel-vtune-amplifier-xe/topic/779309` # noqa
'-start-paused', # The generated code will enable/disable Advisor on a loop basis according to the decorated pragmas
'-start-paused', # The generated code will enable/disable Advisor on a loop basis according to the decorated pragmas # noqa
]
advisor_flops = [
'--collect=tripcounts',
Expand All @@ -130,8 +130,8 @@ def run_with_advisor(path, output, name, exec_args):
]
py_cmd = [sys.executable, str(path)] + exec_args.split()

# Before collecting the `survey` and `tripcounts` a "pure" python run to warmup the jit cache
# is preceded
# Before collecting `survey` and `tripcounts`, a "pure" Python run is
# performed first to warm up the JIT cache

log('Starting Intel Advisor\'s `roofline` analysis for `%s`' % name)
dt = datetime.datetime.now()
Expand Down
14 changes: 6 additions & 8 deletions devito/core/autotuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,16 +75,13 @@ def autotune(operator, args, level, mode):
# Detect the time-stepping Iteration; shrink its iteration range so that
# each autotuning run only takes a few iterations
steppers = {i for i in flatten(trees) if i.dim.is_Time}
if len(steppers) == 0:
stepper = None
timesteps = 1
elif len(steppers) == 1:
if len(steppers) == 1:
stepper = steppers.pop()
timesteps = init_time_bounds(stepper, at_args, args)
if timesteps is None:
return args, {}
else:
warning("cannot perform autotuning unless there is one time loop; skipping")
warning("cannot perform autotuning with %d time loops; skipping" % len(steppers))
return args, {}

# Use a fresh Timer for auto-tuning
Expand Down Expand Up @@ -220,7 +217,7 @@ def init_time_bounds(stepper, at_args, args):


def check_time_bounds(stepper, at_args, args, mode):
if mode != 'runtime' or stepper is None:
if mode != 'runtime':
return True
dim = stepper.dim.root
if stepper.direction is Backward:
Expand Down Expand Up @@ -319,8 +316,9 @@ def generate_block_shapes(blockable, args, level):
for bs in list(ret):
handle = []
for v in options['blocksize-l1']:
# To be a valid blocksize, it must be strictly smaller than
# and divide evenly the parent's block size
                # To be a valid block size, it must be smaller than
                # and divide evenly the parent's block size.
                # Block sizes equal to the parent's block size are not included.
if all(v < i and i % v == 0 for _, i in bs):
ret.append(bs + tuple((d.step, v) for d in level_1))
ret.remove(bs)
Expand Down

0 comments on commit ac4b61f

Please sign in to comment.