From 53932b8112039d27278c8be48a35ae68218b2026 Mon Sep 17 00:00:00 2001 From: Roman Cattaneo Date: Thu, 12 Mar 2026 12:00:01 +0100 Subject: [PATCH 1/2] build: update gt4py to get compiler support The PR adds extended compiler support by updating GT4Py, which now auto-detects compilers (gnu, intel, clang, and apple-clang) and sets defaults for the compiler flags accordingly. For example, not all compilers have the same OpenMP flags and `apple-clang` doesn't support it out of the box anyway. All of this is now captured at the GT4Py level. In addition, GT4Py now automatically disables FMA operations in case of `-O0` (optimization level 0) to help with stability in porting when comparing to Fortran-generated reference data. --- external/gt4py | 2 +- ndsl/dsl/__init__.py | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/external/gt4py b/external/gt4py index 24b2dab3..2d2511ad 160000 --- a/external/gt4py +++ b/external/gt4py @@ -1 +1 @@ -Subproject commit 24b2dab321a25a87d3d4c36ed20e0c3fc6c525d5 +Subproject commit 2d2511ad0652ca92f6f36ce48f484d61d8939c50 diff --git a/ndsl/dsl/__init__.py b/ndsl/dsl/__init__.py index 202b1569..f562b982 100644 --- a/ndsl/dsl/__init__.py +++ b/ndsl/dsl/__init__.py @@ -1,6 +1,5 @@ # Literal precision for both GT4Py & NDSL import os -import platform import sys from typing import Literal @@ -36,15 +35,6 @@ def _get_literal_precision(default: Literal["32", "64"] = "64") -> Literal["32", os.environ["GT4PY_LITERAL_INT_PRECISION"] = str(NDSL_GLOBAL_PRECISION) os.environ["GT4PY_LITERAL_FLOAT_PRECISION"] = str(NDSL_GLOBAL_PRECISION) -# OpenMP handling - -detected_macos = platform.system() == "Darwin" -if detected_macos: - ndsl_log.warning( - "Multithreading is deactivated under MacOS due to apple-clang not handling OpenMP by default." 
- ) -os.environ["GT4PY_CARTESIAN_ENABLE_OPENMP"] = "False" if detected_macos else "True" - # Set cache names for default gt backends workflow import gt4py.cartesian.config # noqa: E402 From d0b1e12453f63eaeee2d1657abe7e12b1693b1d4 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Thu, 12 Mar 2026 08:16:48 -0400 Subject: [PATCH 2/2] Apply gt4py.cartesian compiler default to orchestration pipeline via DaceConfig Fix optimization level (potentially) ignored on GPU --- ndsl/dsl/dace/dace_config.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/ndsl/dsl/dace/dace_config.py b/ndsl/dsl/dace/dace_config.py index 32dfdb9f..7f1a4004 100644 --- a/ndsl/dsl/dace/dace_config.py +++ b/ndsl/dsl/dace/dace_config.py @@ -7,6 +7,7 @@ import dace.config from gt4py.cartesian.config import GT4PY_COMPILE_OPT_LEVEL +from gt4py.cartesian.utils.compiler import cxx_compiler_defaults, gpu_configuration from ndsl import LocalComm from ndsl.comm.communicator import Communicator @@ -226,23 +227,18 @@ def __init__( else: dace.config.Config.set("compiler", "build_type", value="Release") - # Required to True for gt4py storage/memory - dace.config.Config.set( - "compiler", - "allow_view_arguments", - value=True, - ) # Resolve "march/mtune" option for GPU # - turn on numeric-centric SSE by default # - Neoverse-V2 Grace CPU is too new for GCC 14 and -march=native will fail # - use alternative march=armv8-a instead march_cpu = "armv8-a" if is_arm_neoverse else "native" # Removed --fmath + cxx_defaults = cxx_compiler_defaults(GT4PY_COMPILE_OPT_LEVEL) dace.config.Config.set( "compiler", "cpu", "args", - value=f"-march={march_cpu} -std=c++17 -fPIC -Wall -Wextra -O{optimization_level}", + value=f"-march={march_cpu} -std=c++17 -fPIC -Wall -Wextra -O{optimization_level} {cxx_defaults.cxx_compile_flags}", ) # Potentially buggy - deactivate dace.config.Config.set( @@ -257,11 +253,12 @@ def __init__( # - use alternative mcpu=native instead march_option = 
"-mcpu=native" if is_arm_neoverse else "-march=native" # Removed --fast-math + gpu_config = gpu_configuration(GT4PY_COMPILE_OPT_LEVEL) dace.config.Config.set( "compiler", "cuda", "args", - value=f"-std=c++14 -Xcompiler -fPIC -O3 -Xcompiler {march_option}", + value=f"-std=c++14 -Xcompiler -fPIC -O{optimization_level} -Xcompiler {march_option} {gpu_config.gpu_compile_flags}", ) cuda_sm = cp.cuda.Device(0).compute_capability if cp else 60 @@ -280,6 +277,14 @@ def __init__( "max_concurrent_streams", value=-1, # no concurrent streams, every kernel on defaultStream ) + + # Required to True for gt4py storage/memory + dace.config.Config.set( + "compiler", + "allow_view_arguments", + value=True, + ) + # Speed up built time dace.config.Config.set( "compiler",