Skip to content
Merged
Show file tree
Hide file tree
Changes from 49 commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
53c8e68
NASA Team: Milestone 2 "release" branch
romanc Aug 4, 2025
c72c774
Expose erf, erfc, round, and new typecasts from ndsl.dsl.gt4py
Aug 5, 2025
1e8ecfb
gt4py update: abs k and current k in debug backend
romanc Aug 7, 2025
258836d
gt4py update: fix literal precision
romanc Aug 7, 2025
cfb268a
Merge branch 'develop' into nasa/milestone2
romanc Aug 14, 2025
ab20e9b
Merge branch 'develop' into nasa/milestone2
romanc Aug 14, 2025
b1a1abd
dace|orchestration: Schedule tree roundtrip work (#206)
romanc Aug 21, 2025
aac6360
update gt4py to milestone2
romanc Aug 22, 2025
7521c0e
Added device_synchronize call to fix GPU/MPI synchronization issue on…
Aug 22, 2025
9cb865b
Merge branch 'nasa/milestone2' of https://github.com/NOAA-GFDL/NDSL i…
Aug 22, 2025
7555ec9
Linting
Aug 22, 2025
f7d9628
Linting again
Aug 22, 2025
ac5af0b
perf: set build type to release in dace config
romanc Aug 25, 2025
573d44c
perf: set -march=native flag for cpu
romanc Aug 26, 2025
caad0b0
fix: stencil wrapper field origins with data_dims
romanc Sep 2, 2025
0a66099
Unrelated: no unused arguments in stencil definition
romanc Sep 2, 2025
ef3c3de
Update gt4py to latest romanc/milestone2
romanc Sep 2, 2025
6257eb6
tests: Add test case for orchestrated tables
romanc Sep 3, 2025
85011ef
[orchestration] common cast operation replacements
FlorianDeconinck Jun 25, 2025
13c912d
FieldBundle memoization fix
romanc Sep 4, 2025
8859680
Merge branch 'develop' into nasa/milestone2
romanc Sep 9, 2025
4745174
Update gt4py: fix memlets into FrozenSDFG
romanc Sep 11, 2025
83517d4
update gt4py: tests memlet dimension / fix domain symbols
romanc Sep 12, 2025
00d9b8b
Merge remote-tracking branch 'origin/develop' into nasa/milestone2
romanc Sep 16, 2025
c897975
cleanup: backends raise if not defined (#234)
romanc Sep 19, 2025
3fe17f1
Merge remote-tracking branch 'origin/develop' into nasa/milestone2
romanc Sep 23, 2025
43030fa
GT4Py update
romanc Sep 23, 2025
e728e78
Merge remote-tracking branch 'origin/develop' into nasa/milestone2
romanc Sep 24, 2025
723b96e
Merge remote-tracking branch 'NOAA/develop' into nasa/milestone2
FlorianDeconinck Sep 25, 2025
cd599ff
gt4py update: no major changes in cartesian
romanc Sep 26, 2025
bebac64
Merge remote-tracking branch 'NOAA/develop' into nasa/milestone2
FlorianDeconinck Oct 2, 2025
7c62ae2
Merge remote-tracking branch 'NOAA/develop' into nasa/milestone2
FlorianDeconinck Oct 2, 2025
4206b3b
gt4py update (abs K index fix in debug & dace)
romanc Oct 6, 2025
5e563ea
gt4py update: absolute K indexing in mainline
romanc Oct 7, 2025
b9c14e5
Merge remote-tracking branch 'origin/develop' into nasa/milestone2
romanc Oct 7, 2025
fc8b92f
absolute k indexing is now part of mainline gt4py (experimental)
romanc Oct 7, 2025
47bc3e3
Schedule Tree Pipeline + Untested Axis Merge (#251)
FlorianDeconinck Oct 7, 2025
00f8326
[Clean up] Schedule Tree optimizer (WIP) (#255)
FlorianDeconinck Oct 10, 2025
eb199a9
Merge remote-tracking branch 'origin/develop' into nasa/milestone2
romanc Oct 13, 2025
10f14dc
gt4py update: push forscope down, shiny error messages
romanc Oct 13, 2025
b3136d5
update dace (& gt4py): fixes from v1/maintenance
romanc Oct 14, 2025
f863765
Merge remote-tracking branch 'origin/develop' into nasa/milestone2
romanc Oct 14, 2025
b1d6222
Merge remote-tracking branch 'origin/develop' into nasa/milestone2
romanc Oct 15, 2025
b405fd8
Merge remote-tracking branch 'origin/develop' into nasa/milestone2
Oct 20, 2025
379ef1b
fixup: add missing type after merge
Oct 20, 2025
c1bf223
Merge remote-tracking branch 'origin/develop' into nasa/milestone2
romanc Oct 22, 2025
d5b0e6a
update gt4py: K iteration index
romanc Oct 22, 2025
540d6ce
Merge remote-tracking branch 'NOAA/develop' into nasa/milestone2
FlorianDeconinck Oct 23, 2025
a19c94d
De-dragon the README
FlorianDeconinck Oct 23, 2025
bd6ddc3
Rename `dst` to `stree` for moniker of `dace.sdfg.analysis.schedule_t…
FlorianDeconinck Oct 23, 2025
2d58b2b
Flip `Protocol` base class to the broader and cleaner ABC
FlorianDeconinck Oct 24, 2025
0502357
Merge branch 'develop' into nasa/milestone2
FlorianDeconinck Oct 24, 2025
9483da7
Lint
FlorianDeconinck Oct 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ To run the GPU backends, you'll need:
- Libraries: MPI compiled with CUDA support
- CUDA 11.2+
- Python package:
- `cupy` (latest with proper driver support [see install notes](https://docs.cupy.dev/en/stable/install.html))
- `cupy` (latest with proper driver support [see install notes](https://docs.cupy.dev/en/stable/install.html))

A simple way to install MPI is using pre-built wheels, e.g.

Expand Down
2 changes: 1 addition & 1 deletion ndsl/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from . import dsl # isort:skip
from .logging import ndsl_log # isort:skip
from .comm.communicator import CubedSphereCommunicator, TileCommunicator
from .comm.local_comm import LocalComm
from .comm.mpi import MPIComm
Expand All @@ -22,7 +23,6 @@
from .halo.data_transformer import HaloExchangeSpec
from .halo.updater import HaloUpdater, HaloUpdateRequest, VectorInterfaceHaloUpdater
from .initialization import GridSizer, QuantityFactory, SubtileGridSizer
from .logging import ndsl_log
from .monitor.netcdf_monitor import NetCDFMonitor
from .namelist import Namelist
from .performance.collector import NullPerformanceCollector, PerformanceCollector
Expand Down
56 changes: 50 additions & 6 deletions ndsl/dsl/dace/orchestration.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import numbers
import os
from collections.abc import Callable, Sequence
from typing import Any
Expand Down Expand Up @@ -32,6 +33,8 @@
negative_qtracers_checker,
sdfg_nan_checker,
)
from ndsl.dsl.dace.stree import CPUPipeline, GPUPipeline
from ndsl.dsl.dace.stree.optimizations import AxisIterator, CartesianAxisMerge
from ndsl.dsl.dace.utils import (
DaCeProgress,
memory_static_analysis,
Expand All @@ -41,6 +44,13 @@
from ndsl.optional_imports import cupy as cp


# Off by default: the schedule tree roundtrip in `_build_sdfg` only runs when
# this flag is flipped to True by a developer.
_INTERNAL__SCHEDULE_TREE_OPTIMIZATION: bool = False
"""INTERNAL: Developer flag enabling the untested schedule tree roundtrip optimizer."""

# Handed to `CPUPipeline(passes=...)` on the non-GPU path of the roundtrip.
_INTERNAL__SCHEDULE_TREE_PASSES = [CartesianAxisMerge(AxisIterator._K)]
"""INTERNAL: Default schedule passes for CPU. To be replaced with proper configuration."""


def dace_inhibitor(func: Callable) -> Callable:
"""Triggers callback generation wrapping `func` while doing DaCe parsing."""
return func
Expand Down Expand Up @@ -124,18 +134,47 @@ def _build_sdfg(
) -> None:
"""Build the .so out of the SDFG on the top tile ranks only."""
is_compiling = True if DEACTIVATE_DISTRIBUTED_DACE_COMPILE else config.do_compile
device_type = DaceDeviceType.GPU if config.is_gpu_backend() else DaceDeviceType.CPU

if is_compiling:
with DaCeProgress(config, "Validate original SDFG"):
sdfg.validate()

# Fully specialize all known symbols and then propagate these changes in the simplify
# pass that follows. This is not only a smart idea in general, but also simplifies (haha)
# the schedule tree (optimization) roundtrip.
with DaCeProgress(config, "Fully specialize symbols"):
for my_sdfg in sdfg.all_sdfgs_recursive():
if my_sdfg.parent_nsdfg_node is not None:
repl_dict = {}
for sym, val in my_sdfg.parent_nsdfg_node.symbol_mapping.items():
if isinstance(val, numbers.Number):
repl_dict[sym] = val
my_sdfg.replace_dict(repl_dict)

with DaCeProgress(config, "Simplify (1)"):
_simplify(sdfg)

if _INTERNAL__SCHEDULE_TREE_OPTIMIZATION:
with DaCeProgress(config, "Schedule Tree: generate from SDFG"):
stree = sdfg.as_schedule_tree()

with DaCeProgress(config, "Schedule Tree: optimization"):
if config.is_gpu_backend():
GPUPipeline().run(stree)
else:
CPUPipeline(passes=_INTERNAL__SCHEDULE_TREE_PASSES).run(stree)

with DaCeProgress(config, "Schedule Tree: go back to SDFG"):
sdfg = stree.as_sdfg(skip={"ScalarToSymbolPromotion"})

# Make the transient arrays persistent
if config.is_gpu_backend():
# TODO
# The following should happen on the stree level
_to_gpu(sdfg)

make_transients_persistent(sdfg=sdfg, device=DaceDeviceType.GPU)
make_transients_persistent(sdfg=sdfg, device=device_type)

# Upload args to device
_upload_to_device(list(args) + list(kwargs.values()))
Expand All @@ -145,7 +184,7 @@ def _build_sdfg(
for _sd, _aname, arr in sdfg.arrays_recursive():
if arr.shape == (1,):
arr.storage = DaceStorageType.Register
make_transients_persistent(sdfg=sdfg, device=DaceDeviceType.CPU)
make_transients_persistent(sdfg=sdfg, device=device_type)

# Build non-constants & non-transients from the sdfg_kwargs
sdfg_kwargs = dace_program._create_sdfg_args(sdfg, args, kwargs)
Expand All @@ -157,8 +196,8 @@ def _build_sdfg(
if k in sdfg_kwargs and tup[1].transient:
del sdfg_kwargs[k]

with DaCeProgress(config, "Simplify"):
_simplify(sdfg, validate=False, verbose=True)
with DaCeProgress(config, "Simplify (2)"):
_simplify(sdfg)

# Move all memory that can be into a pool to lower memory pressure.
# Change Persistent memory (sub-SDFG) into Scope and flag it.
Expand All @@ -182,6 +221,9 @@ def _build_sdfg(
negative_delp_checker(sdfg)
negative_qtracers_checker(sdfg)

with DaCeProgress(config, "Validate before compile"):
sdfg.validate()

# Compile
with DaCeProgress(config, "Codegen & compile"):
sdfg.compile()
Expand Down Expand Up @@ -495,7 +537,7 @@ def orchestrate(
raise RuntimeError(
f"Could not orchestrate, "
f"{type(obj).__name__}.{method_to_orchestrate} "
"does not exists"
"does not exist."
)

if dace_compiletime_args is None:
Expand Down Expand Up @@ -535,7 +577,9 @@ def __call__(self, *arg, **kwarg): # type: ignore[no-untyped-def]
return wrapped(*arg, **kwarg)

def __sdfg__(self, *args, **kwargs): # type: ignore[no-untyped-def]
return wrapped.__sdfg__(*args, **kwargs)
sdfg = wrapped.__sdfg__(*args, **kwargs)
sdfg.validate()
return sdfg

def __sdfg_closure__(self, reevaluate=None): # type: ignore[no-untyped-def]
return wrapped.__sdfg_closure__(reevaluate)
Expand Down
19 changes: 19 additions & 0 deletions ndsl/dsl/dace/sdfg/loop_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from dace import SDFG, ScheduleType, nodes


def make_SDFG_CPU_sequential(sdfg: SDFG) -> None:
    """Make a CPU SDFG run serially by switching off its OpenMP usage.

    The SDFG is modified in place; nothing is returned.

    Two things are changed:

    - `openmp_sections` is set to `False` on every SDFG yielded by
      `sdfg.all_sdfgs_recursive()`.
    - every `nodes.EntryNode` yielded by `sdfg.all_nodes_recursive()` whose
      schedule is `CPU_Multicore`, `CPU_Persistent` or `Default` is
      re-scheduled as `Sequential`.

    Args:
        sdfg: The SDFG to serialize.
    """
    # Schedules that get rewritten to `Sequential`
    schedules_to_serialize = (
        ScheduleType.CPU_Multicore,
        ScheduleType.CPU_Persistent,
        ScheduleType.Default,
    )

    # OpenMP sections: off for each SDFG reached by the recursive walk
    for current_sdfg in sdfg.all_sdfgs_recursive():
        current_sdfg.openmp_sections = False

    # OpenMP maps: re-schedule the matching scope entry nodes
    for candidate, _parent in sdfg.all_nodes_recursive():
        if not isinstance(candidate, nodes.EntryNode):
            continue
        # Entry nodes without a `schedule` attribute fall back to `False`,
        # which matches none of the schedules above and is left untouched.
        if getattr(candidate, "schedule", False) in schedules_to_serialize:
            candidate.schedule = ScheduleType.Sequential
4 changes: 4 additions & 0 deletions ndsl/dsl/dace/stree/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .pipeline import CPUPipeline, GPUPipeline


__all__ = ["CPUPipeline", "GPUPipeline"]
4 changes: 4 additions & 0 deletions ndsl/dsl/dace/stree/optimizations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .axis_merge import AxisIterator, CartesianAxisMerge


__all__ = ["AxisIterator", "CartesianAxisMerge"]
Loading