From 53932b8112039d27278c8be48a35ae68218b2026 Mon Sep 17 00:00:00 2001
From: Roman Cattaneo <romanc@users.noreply.github.com>
Date: Thu, 12 Mar 2026 12:00:01 +0100
Subject: [PATCH 1/2] build: update gt4py to get compiler support

The PR adds extended compiler support by updating GT4Py, which now
auto-detects compilers (gnu, intel, clang, and apple-clang) and sets
defaults for the compiler flags accordingly. For example, not all
compilers have the same OpenMP flags and `apple-clang` doesn't support
it out of the box anyway. All of this is now captured at the GT4Py
level.

In addition, GT4Py now automatically disables FMA operations in case
of `-O0` (optimization level 0) to help with stability in porting when
comparing to Fortran generated reference data.
---
 external/gt4py       |  2 +-
 ndsl/dsl/__init__.py | 10 ----------
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/external/gt4py b/external/gt4py
index 24b2dab3..2d2511ad 160000
--- a/external/gt4py
+++ b/external/gt4py
@@ -1 +1 @@
-Subproject commit 24b2dab321a25a87d3d4c36ed20e0c3fc6c525d5
+Subproject commit 2d2511ad0652ca92f6f36ce48f484d61d8939c50
diff --git a/ndsl/dsl/__init__.py b/ndsl/dsl/__init__.py
index 202b1569..f562b982 100644
--- a/ndsl/dsl/__init__.py
+++ b/ndsl/dsl/__init__.py
@@ -1,6 +1,5 @@
 # Literal precision for both GT4Py & NDSL
 import os
-import platform
 import sys
 from typing import Literal
 
@@ -36,15 +35,6 @@ def _get_literal_precision(default: Literal["32", "64"] = "64") -> Literal["32",
 os.environ["GT4PY_LITERAL_INT_PRECISION"] = str(NDSL_GLOBAL_PRECISION)
 os.environ["GT4PY_LITERAL_FLOAT_PRECISION"] = str(NDSL_GLOBAL_PRECISION)
 
-# OpenMP handling
-
-detected_macos = platform.system() == "Darwin"
-if detected_macos:
-    ndsl_log.warning(
-        "Multithreading is deactivated under MacOS due to apple-clang not handling OpenMP by default."
-    )
-os.environ["GT4PY_CARTESIAN_ENABLE_OPENMP"] = "False" if detected_macos else "True"
-
 
 # Set cache names for default gt backends workflow
 import gt4py.cartesian.config  # noqa: E402

From d0b1e12453f63eaeee2d1657abe7e12b1693b1d4 Mon Sep 17 00:00:00 2001
From: Florian Deconinck <deconinck.florian@gmail.com>
Date: Thu, 12 Mar 2026 08:16:48 -0400
Subject: [PATCH 2/2] Apply gt4py.cartesian compiler default to orchestration
 pipeline via DaceConfig; fix optimization level (potentially) ignored on GPU

---
 ndsl/dsl/dace/dace_config.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/ndsl/dsl/dace/dace_config.py b/ndsl/dsl/dace/dace_config.py
index 32dfdb9f..7f1a4004 100644
--- a/ndsl/dsl/dace/dace_config.py
+++ b/ndsl/dsl/dace/dace_config.py
@@ -7,6 +7,7 @@
 
 import dace.config
 from gt4py.cartesian.config import GT4PY_COMPILE_OPT_LEVEL
+from gt4py.cartesian.utils.compiler import cxx_compiler_defaults, gpu_configuration
 
 from ndsl import LocalComm
 from ndsl.comm.communicator import Communicator
@@ -226,23 +227,18 @@ def __init__(
             else:
                 dace.config.Config.set("compiler", "build_type", value="Release")
 
-            # Required to True for gt4py storage/memory
-            dace.config.Config.set(
-                "compiler",
-                "allow_view_arguments",
-                value=True,
-            )
             # Resolve "march/mtune" option for GPU
             # - turn on numeric-centric SSE by default
             # - Neoverse-V2 Grace CPU is too new for GCC 14 and -march=native will fail
             # - use alternative march=armv8-a instead
             march_cpu = "armv8-a" if is_arm_neoverse else "native"
             # Removed --fmath
+            cxx_defaults = cxx_compiler_defaults(GT4PY_COMPILE_OPT_LEVEL)
             dace.config.Config.set(
                 "compiler",
                 "cpu",
                 "args",
-                value=f"-march={march_cpu} -std=c++17 -fPIC -Wall -Wextra -O{optimization_level}",
+                value=f"-march={march_cpu} -std=c++17 -fPIC -Wall -Wextra -O{optimization_level} {cxx_defaults.cxx_compile_flags}",
             )
             # Potentially buggy - deactivate
             dace.config.Config.set(
@@ -257,11 +253,12 @@ def __init__(
             # - use alternative mcpu=native instead
             march_option = "-mcpu=native" if is_arm_neoverse else "-march=native"
             # Removed --fast-math
+            gpu_config = gpu_configuration(GT4PY_COMPILE_OPT_LEVEL)
             dace.config.Config.set(
                 "compiler",
                 "cuda",
                 "args",
-                value=f"-std=c++14 -Xcompiler -fPIC -O3 -Xcompiler {march_option}",
+                value=f"-std=c++14 -Xcompiler -fPIC -O{optimization_level} -Xcompiler {march_option} {gpu_config.gpu_compile_flags}",
             )
 
             cuda_sm = cp.cuda.Device(0).compute_capability if cp else 60
@@ -280,6 +277,14 @@ def __init__(
                 "max_concurrent_streams",
                 value=-1,  # no concurrent streams, every kernel on defaultStream
             )
+
+            # Required to True for gt4py storage/memory
+            dace.config.Config.set(
+                "compiler",
+                "allow_view_arguments",
+                value=True,
+            )
+
             # Speed up built time
             dace.config.Config.set(
                 "compiler",