From d0e2a62e7068ce39dfc6e707f28289fa2940e223 Mon Sep 17 00:00:00 2001
From: Roman Cattaneo <>
Date: Mon, 6 Jan 2025 11:46:08 +0100
Subject: [PATCH 1/3] DaCeProgress: avoid double assignment of prefix

---
 ndsl/dsl/dace/utils.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/ndsl/dsl/dace/utils.py b/ndsl/dsl/dace/utils.py
index 29e3211d..fa39372a 100644
--- a/ndsl/dsl/dace/utils.py
+++ b/ndsl/dsl/dace/utils.py
@@ -15,15 +15,11 @@
 from ndsl.optional_imports import cupy as cp
 
 
-# ----------------------------------------------------------
-# Rough timer & log for major operations of DaCe build stack
-# ----------------------------------------------------------
 class DaCeProgress:
-    """Timer and log to track build progress"""
+    """Rough timer & log for major operations of DaCe build stack."""
 
     def __init__(self, config: DaceConfig, label: str):
         self.prefix = DaCeProgress.default_prefix(config)
-        self.prefix = f"[{config.get_orchestrate()}]"
         self.label = label
 
     @classmethod

From 63aca7686dc84c96453485e8cdf0991b594a2999 Mon Sep 17 00:00:00 2001
From: Roman Cattaneo <>
Date: Mon, 6 Jan 2025 12:02:12 +0100
Subject: [PATCH 2/3] Fix typos

---
 ndsl/__init__.py       |  2 +-
 ndsl/dsl/dace/utils.py | 38 ++++++++++++++++++++------------------
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/ndsl/__init__.py b/ndsl/__init__.py
index a2f771cd..5ec303de 100644
--- a/ndsl/__init__.py
+++ b/ndsl/__init__.py
@@ -10,7 +10,7 @@
 from .dsl.dace.utils import (
     ArrayReport,
     DaCeProgress,
-    MaxBandwithBenchmarkProgram,
+    MaxBandwidthBenchmarkProgram,
     StorageReport,
 )
 from .dsl.dace.wrapped_halo_exchange import WrappedHaloUpdater
diff --git a/ndsl/dsl/dace/utils.py b/ndsl/dsl/dace/utils.py
index fa39372a..7efd8f24 100644
--- a/ndsl/dsl/dace/utils.py
+++ b/ndsl/dsl/dace/utils.py
@@ -77,7 +77,7 @@ def memory_static_analysis(
     """Analysis an SDFG for memory pressure.
 
     The results split memory by type (dace.StorageType) and account for
-    allocated, unreferenced and top lovel (e.g. top-most SDFG) memory
+    allocated, unreferenced and top level (e.g. top-most SDFG) memory
     """
     # We report all allocation type
     allocations: Dict[dace.StorageType, StorageReport] = {}
@@ -88,7 +88,7 @@ def memory_static_analysis(
         array_size_in_bytes = arr.total_size * arr.dtype.bytes
         ref = _is_ref(sd, aname)
 
-        # Transient in maps (refrence and not referenced)
+        # Transient in maps (reference and not referenced)
         if sd is not sdfg and arr.transient:
             if arr.pool:
                 allocations[arr.storage].in_pooled_in_bytes += array_size_in_bytes
@@ -107,7 +107,7 @@ def memory_static_analysis(
             else:
                 allocations[arr.storage].unreferenced_in_bytes += array_size_in_bytes
 
-        # SDFG-level memory (refrence, not referenced and pooled)
+        # SDFG-level memory (reference, not referenced and pooled)
         elif sd is sdfg:
             if arr.pool:
                 allocations[arr.storage].in_pooled_in_bytes += array_size_in_bytes
@@ -141,14 +141,14 @@ def report_memory_static_analysis(
         alloc_in_mb = float(allocs.referenced_in_bytes / (1024 * 1024))
         unref_alloc_in_mb = float(allocs.unreferenced_in_bytes / (1024 * 1024))
         in_pooled_in_mb = float(allocs.in_pooled_in_bytes / (1024 * 1024))
-        toplvlalloc_in_mb = float(allocs.top_level_in_bytes / (1024 * 1024))
-        if alloc_in_mb or toplvlalloc_in_mb > 0:
+        top_level_alloc_in_mb = float(allocs.top_level_in_bytes / (1024 * 1024))
+        if alloc_in_mb or top_level_alloc_in_mb > 0:
             report += (
                 f"{storage}:\n"
                 f"  Alloc ref {alloc_in_mb:.2f} mb\n"
                 f"  Alloc unref {unref_alloc_in_mb:.2f} mb\n"
                 f"  Pooled {in_pooled_in_mb:.2f} mb\n"
-                f"  Top lvl alloc: {toplvlalloc_in_mb:.2f}mb\n"
+                f"  Top lvl alloc: {top_level_alloc_in_mb:.2f}mb\n"
             )
             if detail_report:
                 report += "\n"
@@ -179,27 +179,29 @@ def memory_static_analysis_from_path(sdfg_path: str, detail_report=False) -> str
 
 
 # ----------------------------------------------------------
-# Theoritical bandwith from SDFG
+# Theoretical bandwidth from SDFG
 # ----------------------------------------------------------
-def copy_defn(q_in: FloatField, q_out: FloatField):
+def copy_kernel(q_in: FloatField, q_out: FloatField):
     with computation(PARALLEL), interval(...):
         q_in = q_out
 
 
-class MaxBandwithBenchmarkProgram:
+class MaxBandwidthBenchmarkProgram:
     def __init__(self, size, backend) -> None:
         from ndsl.dsl.dace.orchestration import DaCeOrchestration, orchestrate
 
-        dconfig = DaceConfig(None, backend, orchestration=DaCeOrchestration.BuildAndRun)
+        dace_config = DaceConfig(
+            None, backend, orchestration=DaCeOrchestration.BuildAndRun
+        )
         c = CompilationConfig(backend=backend)
-        s = StencilConfig(dace_config=dconfig, compilation_config=c)
+        s = StencilConfig(dace_config=dace_config, compilation_config=c)
         self.copy_stencil = FrozenStencil(
-            func=copy_defn,
+            func=copy_kernel,
             origin=(0, 0, 0),
             domain=size,
             stencil_config=s,
         )
-        orchestrate(obj=self, config=dconfig)
+        orchestrate(obj=self, config=dace_config)
 
     def __call__(self, A, B, n: int):
         for i in dace.nounroll(range(n)):
@@ -215,17 +217,17 @@ def kernel_theoretical_timing(
 
     - Performance is memory bound, e.g. arithmetic intensity isn't counted
     - Hardware bandwidth comes from a GT4Py/DaCe test rather than a spec sheet for
-      for higher accuracy. Best is to run a copy_stencils on a full domain
+      higher accuracy. Best is to run a copy_stencil on a full domain
     - Memory pressure is mostly in read/write from global memory, inner scalar & shared
       memory is not counted towards memory movement.
     """
     if not hardware_bw_in_GB_s:
         size = np.array(sdfg.arrays["__g_self__w"].shape)
         print(
-            f"Calculating experimental hardware bandwith on {size}"
+            f"Calculating experimental hardware bandwidth on {size}"
             f" arrays at {Float} precision..."
         )
-        bench = MaxBandwithBenchmarkProgram(size, backend)
+        bench = MaxBandwidthBenchmarkProgram(size, backend)
         if backend == "dace:gpu":
             A = cp.ones(size, dtype=Float)
             B = cp.ones(size, dtype=Float)
@@ -246,11 +248,11 @@ def kernel_theoretical_timing(
         memory_size_in_b = np.prod(size) * np.dtype(Float).itemsize * 8
         bandwidth_in_bytes_s = memory_size_in_b / np.median(dt)
         print(
-            f"Hardware bandwith computed: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s"
+            f"Hardware bandwidth computed: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s"
         )
     else:
         bandwidth_in_bytes_s = hardware_bw_in_GB_s * 1024 * 1024 * 1024
-        print(f"Given hardware bandwith: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s")
+        print(f"Given hardware bandwidth: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s")
 
     allmaps = [
         (me, state)

From d4e065cc11184e365f466b90b88c1e00299b69e3 Mon Sep 17 00:00:00 2001
From: Roman Cattaneo <>
Date: Mon, 6 Jan 2025 12:29:45 +0100
Subject: [PATCH 3/3] Add type hints/simplify kernel_theoretical_timing

Adding type hints made it possible to simplify `kernel_theoretical_timing`.
---
 ndsl/dsl/dace/utils.py | 48 ++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/ndsl/dsl/dace/utils.py b/ndsl/dsl/dace/utils.py
index 7efd8f24..05fa5754 100644
--- a/ndsl/dsl/dace/utils.py
+++ b/ndsl/dsl/dace/utils.py
@@ -18,7 +18,7 @@
 class DaCeProgress:
     """Rough timer & log for major operations of DaCe build stack."""
 
-    def __init__(self, config: DaceConfig, label: str):
+    def __init__(self, config: DaceConfig, label: str) -> None:
         self.prefix = DaCeProgress.default_prefix(config)
         self.label = label
 
@@ -26,11 +26,11 @@ def __init__(self, config: DaceConfig, label: str):
     def default_prefix(cls, config: DaceConfig) -> str:
         return f"[{config.get_orchestrate()}]"
 
-    def __enter__(self):
+    def __enter__(self) -> None:
         ndsl_log.debug(f"{self.prefix} {self.label}...")
         self.start = time.time()
 
-    def __exit__(self, _type, _val, _traceback):
+    def __exit__(self, _type, _val, _traceback) -> None:
         elapsed = time.time() - self.start
         ndsl_log.debug(f"{self.prefix} {self.label}...{elapsed}s.")
 
@@ -133,7 +133,7 @@ def memory_static_analysis(
 def report_memory_static_analysis(
     sdfg: dace.sdfg.SDFG,
     allocations: Dict[dace.StorageType, StorageReport],
-    detail_report=False,
+    detail_report: bool = False,
 ) -> str:
     """Create a human readable report form the memory analysis results"""
     report = f"{sdfg.name}:\n"
@@ -168,7 +168,9 @@ def report_memory_static_analysis(
     return report
 
 
-def memory_static_analysis_from_path(sdfg_path: str, detail_report=False) -> str:
+def memory_static_analysis_from_path(
+    sdfg_path: str, detail_report: bool = False
+) -> str:
     """Open a SDFG and report the memory analysis"""
     sdfg = dace.SDFG.from_file(sdfg_path)
     return report_memory_static_analysis(
@@ -181,7 +183,7 @@ def memory_static_analysis_from_path(sdfg_path: str, detail_report=False) -> str
 # ----------------------------------------------------------
 # Theoretical bandwidth from SDFG
 # ----------------------------------------------------------
-def copy_kernel(q_in: FloatField, q_out: FloatField):
+def copy_kernel(q_in: FloatField, q_out: FloatField) -> None:
     with computation(PARALLEL), interval(...):
         q_in = q_out
 
@@ -203,15 +205,15 @@ def __init__(self, size, backend) -> None:
         )
         orchestrate(obj=self, config=dace_config)
 
-    def __call__(self, A, B, n: int):
+    def __call__(self, A, B, n: int) -> None:
         for i in dace.nounroll(range(n)):
             self.copy_stencil(A, B)
 
 
 def kernel_theoretical_timing(
     sdfg: dace.sdfg.SDFG,
-    hardware_bw_in_GB_s=None,
-    backend=None,
+    hardware_bw_in_GB_s: Optional[float] = None,
+    backend: Optional[str] = None,
 ) -> Dict[str, float]:
     """Compute a lower timing bound for kernels with the following hypothesis:
 
@@ -221,7 +223,7 @@ def kernel_theoretical_timing(
     - Memory pressure is mostly in read/write from global memory, inner scalar & shared
       memory is not counted towards memory movement.
     """
-    if not hardware_bw_in_GB_s:
+    if hardware_bw_in_GB_s is None:
         size = np.array(sdfg.arrays["__g_self__w"].shape)
         print(
             f"Calculating experimental hardware bandwidth on {size}"
@@ -246,13 +248,19 @@ def kernel_theoretical_timing(
             bench(A, B, n)
             dt.append((time.time() - s) / n)
         memory_size_in_b = np.prod(size) * np.dtype(Float).itemsize * 8
-        bandwidth_in_bytes_s = memory_size_in_b / np.median(dt)
-        print(
-            f"Hardware bandwidth computed: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s"
-        )
-    else:
-        bandwidth_in_bytes_s = hardware_bw_in_GB_s * 1024 * 1024 * 1024
-        print(f"Given hardware bandwidth: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s")
+        measured_bandwidth_in_bytes_s = memory_size_in_b / np.median(dt)
+
+    bandwidth_in_bytes_s = (
+        measured_bandwidth_in_bytes_s
+        if hardware_bw_in_GB_s is None
+        else hardware_bw_in_GB_s * 1024 * 1024 * 1024
+    )
+    label = (
+        "Hardware bandwidth computed"
+        if hardware_bw_in_GB_s is None  # measured above when not supplied
+        else "Given hardware bandwidth"
+    )
+    print(f"{label}: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s")
 
     allmaps = [
         (me, state)
@@ -305,12 +313,6 @@ def kernel_theoretical_timing(
             except TypeError:
                 pass
 
-        # Bad expansion
-        if not isinstance(newresult_in_us, sympy.core.numbers.Float) and not isinstance(
-            newresult_in_us, float
-        ):
-            continue
-
         result[node.label] = float(newresult_in_us)
 
     return result