From d0e2a62e7068ce39dfc6e707f28289fa2940e223 Mon Sep 17 00:00:00 2001 From: Roman Cattaneo <> Date: Mon, 6 Jan 2025 11:46:08 +0100 Subject: [PATCH 1/3] DaCeProgress: avoid double assignment of prefix --- ndsl/dsl/dace/utils.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ndsl/dsl/dace/utils.py b/ndsl/dsl/dace/utils.py index 29e3211d..fa39372a 100644 --- a/ndsl/dsl/dace/utils.py +++ b/ndsl/dsl/dace/utils.py @@ -15,15 +15,11 @@ from ndsl.optional_imports import cupy as cp -# ---------------------------------------------------------- -# Rough timer & log for major operations of DaCe build stack -# ---------------------------------------------------------- class DaCeProgress: - """Timer and log to track build progress""" + """Rough timer & log for major operations of DaCe build stack.""" def __init__(self, config: DaceConfig, label: str): self.prefix = DaCeProgress.default_prefix(config) - self.prefix = f"[{config.get_orchestrate()}]" self.label = label @classmethod From 63aca7686dc84c96453485e8cdf0991b594a2999 Mon Sep 17 00:00:00 2001 From: Roman Cattaneo <> Date: Mon, 6 Jan 2025 12:02:12 +0100 Subject: [PATCH 2/3] Fix typos --- ndsl/__init__.py | 2 +- ndsl/dsl/dace/utils.py | 38 ++++++++++++++++++++------------------ 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/ndsl/__init__.py b/ndsl/__init__.py index a2f771cd..5ec303de 100644 --- a/ndsl/__init__.py +++ b/ndsl/__init__.py @@ -10,7 +10,7 @@ from .dsl.dace.utils import ( ArrayReport, DaCeProgress, - MaxBandwithBenchmarkProgram, + MaxBandwidthBenchmarkProgram, StorageReport, ) from .dsl.dace.wrapped_halo_exchange import WrappedHaloUpdater diff --git a/ndsl/dsl/dace/utils.py b/ndsl/dsl/dace/utils.py index fa39372a..7efd8f24 100644 --- a/ndsl/dsl/dace/utils.py +++ b/ndsl/dsl/dace/utils.py @@ -77,7 +77,7 @@ def memory_static_analysis( """Analysis an SDFG for memory pressure. 
The results split memory by type (dace.StorageType) and account for - allocated, unreferenced and top lovel (e.g. top-most SDFG) memory + allocated, unreferenced and top level (e.g. top-most SDFG) memory """ # We report all allocation type allocations: Dict[dace.StorageType, StorageReport] = {} @@ -88,7 +88,7 @@ def memory_static_analysis( array_size_in_bytes = arr.total_size * arr.dtype.bytes ref = _is_ref(sd, aname) - # Transient in maps (refrence and not referenced) + # Transient in maps (reference and not referenced) if sd is not sdfg and arr.transient: if arr.pool: allocations[arr.storage].in_pooled_in_bytes += array_size_in_bytes @@ -107,7 +107,7 @@ def memory_static_analysis( else: allocations[arr.storage].unreferenced_in_bytes += array_size_in_bytes - # SDFG-level memory (refrence, not referenced and pooled) + # SDFG-level memory (reference, not referenced and pooled) elif sd is sdfg: if arr.pool: allocations[arr.storage].in_pooled_in_bytes += array_size_in_bytes @@ -141,14 +141,14 @@ def report_memory_static_analysis( alloc_in_mb = float(allocs.referenced_in_bytes / (1024 * 1024)) unref_alloc_in_mb = float(allocs.unreferenced_in_bytes / (1024 * 1024)) in_pooled_in_mb = float(allocs.in_pooled_in_bytes / (1024 * 1024)) - toplvlalloc_in_mb = float(allocs.top_level_in_bytes / (1024 * 1024)) - if alloc_in_mb or toplvlalloc_in_mb > 0: + top_level_alloc_in_mb = float(allocs.top_level_in_bytes / (1024 * 1024)) + if alloc_in_mb or top_level_alloc_in_mb > 0: report += ( f"{storage}:\n" f" Alloc ref {alloc_in_mb:.2f} mb\n" f" Alloc unref {unref_alloc_in_mb:.2f} mb\n" f" Pooled {in_pooled_in_mb:.2f} mb\n" - f" Top lvl alloc: {toplvlalloc_in_mb:.2f}mb\n" + f" Top lvl alloc: {top_level_alloc_in_mb:.2f}mb\n" ) if detail_report: report += "\n" @@ -179,27 +179,29 @@ def memory_static_analysis_from_path(sdfg_path: str, detail_report=False) -> str # ---------------------------------------------------------- -# Theoritical bandwith from SDFG +# Theoretical bandwidth from SDFG 
# ---------------------------------------------------------- -def copy_defn(q_in: FloatField, q_out: FloatField): +def copy_kernel(q_in: FloatField, q_out: FloatField): with computation(PARALLEL), interval(...): q_in = q_out -class MaxBandwithBenchmarkProgram: +class MaxBandwidthBenchmarkProgram: def __init__(self, size, backend) -> None: from ndsl.dsl.dace.orchestration import DaCeOrchestration, orchestrate - dconfig = DaceConfig(None, backend, orchestration=DaCeOrchestration.BuildAndRun) + dace_config = DaceConfig( + None, backend, orchestration=DaCeOrchestration.BuildAndRun + ) c = CompilationConfig(backend=backend) - s = StencilConfig(dace_config=dconfig, compilation_config=c) + s = StencilConfig(dace_config=dace_config, compilation_config=c) self.copy_stencil = FrozenStencil( - func=copy_defn, + func=copy_kernel, origin=(0, 0, 0), domain=size, stencil_config=s, ) - orchestrate(obj=self, config=dconfig) + orchestrate(obj=self, config=dace_config) def __call__(self, A, B, n: int): for i in dace.nounroll(range(n)): @@ -215,17 +217,17 @@ def kernel_theoretical_timing( - Performance is memory bound, e.g. arithmetic intensity isn't counted - Hardware bandwidth comes from a GT4Py/DaCe test rather than a spec sheet for - for higher accuracy. Best is to run a copy_stencils on a full domain + for higher accuracy. Best is to run a copy_stencil on a full domain - Memory pressure is mostly in read/write from global memory, inner scalar & shared memory is not counted towards memory movement. """ if not hardware_bw_in_GB_s: size = np.array(sdfg.arrays["__g_self__w"].shape) print( - f"Calculating experimental hardware bandwith on {size}" + f"Calculating experimental hardware bandwidth on {size}" f" arrays at {Float} precision..." 
) bench = MaxBandwidthBenchmarkProgram(size, backend) if backend == "dace:gpu": A = cp.ones(size, dtype=Float) B = cp.ones(size, dtype=Float) @@ -246,11 +248,11 @@ def kernel_theoretical_timing( memory_size_in_b = np.prod(size) * np.dtype(Float).itemsize * 8 bandwidth_in_bytes_s = memory_size_in_b / np.median(dt) print( - f"Hardware bandwith computed: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s" + f"Hardware bandwidth computed: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s" ) else: bandwidth_in_bytes_s = hardware_bw_in_GB_s * 1024 * 1024 * 1024 - print(f"Given hardware bandwith: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s") + print(f"Given hardware bandwidth: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s") allmaps = [ (me, state) From d4e065cc11184e365f466b90b88c1e00299b69e3 Mon Sep 17 00:00:00 2001 From: Roman Cattaneo <> Date: Mon, 6 Jan 2025 12:29:45 +0100 Subject: [PATCH 3/3] Add type hints/simplify kernel_theoretical_timing Adding type hints made it possible to simplify `kernel_theoretical_timing`. 
--- ndsl/dsl/dace/utils.py | 48 ++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/ndsl/dsl/dace/utils.py b/ndsl/dsl/dace/utils.py index 7efd8f24..05fa5754 100644 --- a/ndsl/dsl/dace/utils.py +++ b/ndsl/dsl/dace/utils.py @@ -18,7 +18,7 @@ class DaCeProgress: """Rough timer & log for major operations of DaCe build stack.""" - def __init__(self, config: DaceConfig, label: str): + def __init__(self, config: DaceConfig, label: str) -> None: self.prefix = DaCeProgress.default_prefix(config) self.label = label @@ -26,11 +26,11 @@ def __init__(self, config: DaceConfig, label: str): def default_prefix(cls, config: DaceConfig) -> str: return f"[{config.get_orchestrate()}]" - def __enter__(self): + def __enter__(self) -> None: ndsl_log.debug(f"{self.prefix} {self.label}...") self.start = time.time() - def __exit__(self, _type, _val, _traceback): + def __exit__(self, _type, _val, _traceback) -> None: elapsed = time.time() - self.start ndsl_log.debug(f"{self.prefix} {self.label}...{elapsed}s.") @@ -133,7 +133,7 @@ def memory_static_analysis( def report_memory_static_analysis( sdfg: dace.sdfg.SDFG, allocations: Dict[dace.StorageType, StorageReport], - detail_report=False, + detail_report: bool = False, ) -> str: """Create a human readable report form the memory analysis results""" report = f"{sdfg.name}:\n" @@ -168,7 +168,9 @@ def report_memory_static_analysis( return report -def memory_static_analysis_from_path(sdfg_path: str, detail_report=False) -> str: +def memory_static_analysis_from_path( + sdfg_path: str, detail_report: bool = False +) -> str: """Open a SDFG and report the memory analysis""" sdfg = dace.SDFG.from_file(sdfg_path) return report_memory_static_analysis( @@ -181,7 +183,7 @@ def memory_static_analysis_from_path(sdfg_path: str, detail_report=False) -> str # ---------------------------------------------------------- # Theoretical bandwidth from SDFG # 
---------------------------------------------------------- -def copy_kernel(q_in: FloatField, q_out: FloatField): +def copy_kernel(q_in: FloatField, q_out: FloatField) -> None: with computation(PARALLEL), interval(...): q_in = q_out @@ -203,15 +205,15 @@ def __init__(self, size, backend) -> None: ) orchestrate(obj=self, config=dace_config) - def __call__(self, A, B, n: int): + def __call__(self, A, B, n: int) -> None: for i in dace.nounroll(range(n)): self.copy_stencil(A, B) def kernel_theoretical_timing( sdfg: dace.sdfg.SDFG, - hardware_bw_in_GB_s=None, - backend=None, + hardware_bw_in_GB_s: Optional[float] = None, + backend: Optional[str] = None, ) -> Dict[str, float]: """Compute a lower timing bound for kernels with the following hypothesis: @@ -221,7 +223,7 @@ def kernel_theoretical_timing( - Memory pressure is mostly in read/write from global memory, inner scalar & shared memory is not counted towards memory movement. """ - if not hardware_bw_in_GB_s: + if hardware_bw_in_GB_s is None: size = np.array(sdfg.arrays["__g_self__w"].shape) print( f"Calculating experimental hardware bandwidth on {size}" @@ -246,13 +248,19 @@ def kernel_theoretical_timing( bench(A, B, n) dt.append((time.time() - s) / n) memory_size_in_b = np.prod(size) * np.dtype(Float).itemsize * 8 - bandwidth_in_bytes_s = memory_size_in_b / np.median(dt) - print( - f"Hardware bandwidth computed: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s" - ) - else: - bandwidth_in_bytes_s = hardware_bw_in_GB_s * 1024 * 1024 * 1024 - print(f"Given hardware bandwidth: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s") + measured_bandwidth_in_bytes_s = memory_size_in_b / np.median(dt) + + bandwidth_in_bytes_s = ( + measured_bandwidth_in_bytes_s + if hardware_bw_in_GB_s is None + else hardware_bw_in_GB_s * 1024 * 1024 * 1024 + ) + label = ( + "Hardware bandwidth computed" + if hardware_bw_in_GB_s is None + else "Given hardware bandwidth" + ) + print(f"{label}: {bandwidth_in_bytes_s/(1024*1024*1024)} GB/s") allmaps = [ (me, 
state) @@ -305,12 +313,6 @@ def kernel_theoretical_timing( except TypeError: pass - # Bad expansion - if not isinstance(newresult_in_us, sympy.core.numbers.Float) and not isinstance( - newresult_in_us, float - ): - continue - result[node.label] = float(newresult_in_us) return result