NOAA-GFDL · FlorianDeconinck · Oct 24, 2025 · Aug 4, 2025 · Aug 5, 2025 · Aug 7, 2025
diff --git a/README.md b/README.md
@@ -48,7 +48,7 @@ To run the GPU backends, you'll need:
 - Libraries: MPI compiled with CUDA support
 - CUDA 11.2+
 - Python package:
-  - `cupy` (latest with proper driver support [see install notes](https://docs.cupy.dev/en/stable/install.html))
+    - `cupy` (latest with proper driver support [see install notes](https://docs.cupy.dev/en/stable/install.html))
 
 A simple way to install MPI is using pre-built wheels, e.g.
 

diff --git a/ndsl/__init__.py b/ndsl/__init__.py
@@ -1,4 +1,5 @@
 from . import dsl  # isort:skip
+from .logging import ndsl_log  # isort:skip
 from .comm.communicator import CubedSphereCommunicator, TileCommunicator
 from .comm.local_comm import LocalComm
 from .comm.mpi import MPIComm
@@ -22,7 +23,6 @@
 from .halo.data_transformer import HaloExchangeSpec
 from .halo.updater import HaloUpdater, HaloUpdateRequest, VectorInterfaceHaloUpdater
 from .initialization import GridSizer, QuantityFactory, SubtileGridSizer
-from .logging import ndsl_log
 from .monitor.netcdf_monitor import NetCDFMonitor
 from .namelist import Namelist
 from .performance.collector import NullPerformanceCollector, PerformanceCollector

diff --git a/ndsl/dsl/dace/orchestration.py b/ndsl/dsl/dace/orchestration.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import numbers
 import os
 from collections.abc import Callable, Sequence
 from typing import Any
@@ -32,6 +33,8 @@
     negative_qtracers_checker,
     sdfg_nan_checker,
 )
+from ndsl.dsl.dace.stree import CPUPipeline, GPUPipeline
+from ndsl.dsl.dace.stree.optimizations import AxisIterator, CartesianAxisMerge
 from ndsl.dsl.dace.utils import (
     DaCeProgress,
     memory_static_analysis,
@@ -41,6 +44,13 @@
 from ndsl.optional_imports import cupy as cp
 
 
+_INTERNAL__SCHEDULE_TREE_OPTIMIZATION: bool = False
+"""INTERNAL: Developer flag to enable the untested schedule tree roundtrip optimizer."""
+
+_INTERNAL__SCHEDULE_TREE_PASSES = [CartesianAxisMerge(AxisIterator._K)]
+"""INTERNAL: Default schedule passes for CPU. To be replaced with proper configuration."""
+
+
 def dace_inhibitor(func: Callable) -> Callable:
     """Triggers callback generation wrapping `func` while doing DaCe parsing."""
     return func
@@ -124,18 +134,47 @@ def _build_sdfg(
 ) -> None:
     """Build the .so out of the SDFG on the top tile ranks only."""
     is_compiling = True if DEACTIVATE_DISTRIBUTED_DACE_COMPILE else config.do_compile
+    device_type = DaceDeviceType.GPU if config.is_gpu_backend() else DaceDeviceType.CPU
 
     if is_compiling:
         with DaCeProgress(config, "Validate original SDFG"):
             sdfg.validate()
 
+        # Fully specialize all known symbols and then propagate these changes in the simplify
+        # pass that follows. This is not only a smart idea in general, but also simplifies (haha)
+        # the schedule tree (optimization) roundtrip.
+        with DaCeProgress(config, "Fully specialize symbols"):
+            for my_sdfg in sdfg.all_sdfgs_recursive():
+                if my_sdfg.parent_nsdfg_node is not None:
+                    repl_dict = {}
+                    for sym, val in my_sdfg.parent_nsdfg_node.symbol_mapping.items():
+                        if isinstance(val, numbers.Number):
+                            repl_dict[sym] = val
+                    my_sdfg.replace_dict(repl_dict)
+
+        with DaCeProgress(config, "Simplify (1)"):
+            _simplify(sdfg)
+
+        if _INTERNAL__SCHEDULE_TREE_OPTIMIZATION:
+            with DaCeProgress(config, "Schedule Tree: generate from SDFG"):
+                stree = sdfg.as_schedule_tree()
+
+            with DaCeProgress(config, "Schedule Tree: optimization"):
+                if config.is_gpu_backend():
+                    GPUPipeline().run(stree)
+                else:
+                    CPUPipeline(passes=_INTERNAL__SCHEDULE_TREE_PASSES).run(stree)
+
+            with DaCeProgress(config, "Schedule Tree: go back to SDFG"):
+                sdfg = stree.as_sdfg(skip={"ScalarToSymbolPromotion"})
+
         # Make the transients array persistents
         if config.is_gpu_backend():
             # TODO
             # The following should happen on the stree level
             _to_gpu(sdfg)
 
-            make_transients_persistent(sdfg=sdfg, device=DaceDeviceType.GPU)
+            make_transients_persistent(sdfg=sdfg, device=device_type)
 
             # Upload args to device
             _upload_to_device(list(args) + list(kwargs.values()))
@@ -145,7 +184,7 @@ def _build_sdfg(
             for _sd, _aname, arr in sdfg.arrays_recursive():
                 if arr.shape == (1,):
                     arr.storage = DaceStorageType.Register
-            make_transients_persistent(sdfg=sdfg, device=DaceDeviceType.CPU)
+            make_transients_persistent(sdfg=sdfg, device=device_type)
 
         # Build non-constants & non-transients from the sdfg_kwargs
         sdfg_kwargs = dace_program._create_sdfg_args(sdfg, args, kwargs)
@@ -157,8 +196,8 @@ def _build_sdfg(
             if k in sdfg_kwargs and tup[1].transient:
                 del sdfg_kwargs[k]
 
-        with DaCeProgress(config, "Simplify"):
-            _simplify(sdfg, validate=False, verbose=True)
+        with DaCeProgress(config, "Simplify (2)"):
+            _simplify(sdfg)
 
         # Move all memory that can be into a pool to lower memory pressure.
         # Change Persistent memory (sub-SDFG) into Scope and flag it.
@@ -182,6 +221,9 @@ def _build_sdfg(
                 negative_delp_checker(sdfg)
                 negative_qtracers_checker(sdfg)
 
+        with DaCeProgress(config, "Validate before compile"):
+            sdfg.validate()
+
         # Compile
         with DaCeProgress(config, "Codegen & compile"):
             sdfg.compile()
@@ -495,7 +537,7 @@ def orchestrate(
         raise RuntimeError(
             f"Could not orchestrate, "
             f"{type(obj).__name__}.{method_to_orchestrate} "
-            "does not exists"
+            "does not exist."
         )
 
     if dace_compiletime_args is None:
@@ -535,7 +577,9 @@ def __call__(self, *arg, **kwarg):  # type: ignore[no-untyped-def]
                 return wrapped(*arg, **kwarg)
 
             def __sdfg__(self, *args, **kwargs):  # type: ignore[no-untyped-def]
-                return wrapped.__sdfg__(*args, **kwargs)
+                sdfg = wrapped.__sdfg__(*args, **kwargs)
+                sdfg.validate()
+                return sdfg
 
             def __sdfg_closure__(self, reevaluate=None):  # type: ignore[no-untyped-def]
                 return wrapped.__sdfg_closure__(reevaluate)

diff --git a/ndsl/dsl/dace/sdfg/loop_transform.py b/ndsl/dsl/dace/sdfg/loop_transform.py
@@ -0,0 +1,19 @@
+from dace import SDFG, ScheduleType, nodes
+
+
+def make_SDFG_CPU_sequential(sdfg: SDFG) -> None:
+    """Rewrite a CPU-based SDFG in place so it runs serially, without OpenMP."""
+    demoted_schedules = (
+        ScheduleType.CPU_Multicore,
+        ScheduleType.CPU_Persistent,
+        ScheduleType.Default,
+    )
+    # OpenMP sections: switched off on every SDFG the recursive walk yields.
+    for current_sdfg in sdfg.all_sdfgs_recursive():
+        current_sdfg.openmp_sections = False
+    # OpenMP maps: entry nodes with one of the schedules above become Sequential.
+    for candidate, _ in sdfg.all_nodes_recursive():
+        if not isinstance(candidate, nodes.EntryNode):
+            continue
+        if getattr(candidate, "schedule", False) in demoted_schedules:
+            candidate.schedule = ScheduleType.Sequential
diff --git a/ndsl/dsl/dace/stree/__init__.py b/ndsl/dsl/dace/stree/__init__.py
@@ -0,0 +1,4 @@
+from .pipeline import CPUPipeline, GPUPipeline
+
+
+__all__ = ["CPUPipeline", "GPUPipeline"]
diff --git a/ndsl/dsl/dace/stree/optimizations/__init__.py b/ndsl/dsl/dace/stree/optimizations/__init__.py
@@ -0,0 +1,4 @@
+from .axis_merge import AxisIterator, CartesianAxisMerge
+
+
+__all__ = ["AxisIterator", "CartesianAxisMerge"]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,4 @@
		from .pipeline import CPUPipeline, GPUPipeline


		__all__ = ["CPUPipeline", "GPUPipeline"]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,4 @@
		from .axis_merge import AxisIterator, CartesianAxisMerge


		__all__ = ["AxisIterator", "CartesianAxisMerge"]