NOAA-GFDL · FlorianDeconinck · Mar 12, 2026 · Mar 12, 2026 · Mar 12, 2026
diff --git a/external/gt4py b/external/gt4py
diff --git a/ndsl/dsl/__init__.py b/ndsl/dsl/__init__.py
@@ -1,6 +1,5 @@
 # Literal precision for both GT4Py & NDSL
 import os
-import platform
 import sys
 from typing import Literal
 
@@ -36,15 +35,6 @@ def _get_literal_precision(default: Literal["32", "64"] = "64") -> Literal["32",
 os.environ["GT4PY_LITERAL_INT_PRECISION"] = str(NDSL_GLOBAL_PRECISION)
 os.environ["GT4PY_LITERAL_FLOAT_PRECISION"] = str(NDSL_GLOBAL_PRECISION)
 
-# OpenMP handling
-
-detected_macos = platform.system() == "Darwin"
-if detected_macos:
-    ndsl_log.warning(
-        "Multithreading is deactivated under MacOS due to apple-clang not handling OpenMP by default."
-    )
-os.environ["GT4PY_CARTESIAN_ENABLE_OPENMP"] = "False" if detected_macos else "True"
-
 
 # Set cache names for default gt backends workflow
 import gt4py.cartesian.config  # noqa: E402

diff --git a/ndsl/dsl/dace/dace_config.py b/ndsl/dsl/dace/dace_config.py
@@ -7,6 +7,7 @@
 
 import dace.config
 from gt4py.cartesian.config import GT4PY_COMPILE_OPT_LEVEL
+from gt4py.cartesian.utils.compiler import cxx_compiler_defaults, gpu_configuration
 
 from ndsl import LocalComm
 from ndsl.comm.communicator import Communicator
@@ -226,23 +227,18 @@ def __init__(
             else:
                 dace.config.Config.set("compiler", "build_type", value="Release")
 
-            # Required to True for gt4py storage/memory
-            dace.config.Config.set(
-                "compiler",
-                "allow_view_arguments",
-                value=True,
-            )
             # Resolve "march/mtune" option for GPU
             # - turn on numeric-centric SSE by default
             # - Neoverse-V2 Grace CPU is too new for GCC 14 and -march=native will fail
             # - use alternative march=armv8-a instead
             march_cpu = "armv8-a" if is_arm_neoverse else "native"
             # Removed --fmath
+            cxx_defaults = cxx_compiler_defaults(GT4PY_COMPILE_OPT_LEVEL)
             dace.config.Config.set(
                 "compiler",
                 "cpu",
                 "args",
-                value=f"-march={march_cpu} -std=c++17 -fPIC -Wall -Wextra -O{optimization_level}",
+                value=f"-march={march_cpu} -std=c++17 -fPIC -Wall -Wextra -O{optimization_level} {cxx_defaults.cxx_compile_flags}",
             )
             # Potentially buggy - deactivate
             dace.config.Config.set(
@@ -257,11 +253,12 @@ def __init__(
             # - use alternative mcpu=native instead
             march_option = "-mcpu=native" if is_arm_neoverse else "-march=native"
             # Removed --fast-math
+            gpu_config = gpu_configuration(GT4PY_COMPILE_OPT_LEVEL)
             dace.config.Config.set(
                 "compiler",
                 "cuda",
                 "args",
-                value=f"-std=c++14 -Xcompiler -fPIC -O3 -Xcompiler {march_option}",
+                value=f"-std=c++14 -Xcompiler -fPIC -O{optimization_level} -Xcompiler {march_option} {gpu_config.gpu_compile_flags}",
             )
 
             cuda_sm = cp.cuda.Device(0).compute_capability if cp else 60
@@ -280,6 +277,14 @@ def __init__(
                 "max_concurrent_streams",
                 value=-1,  # no concurrent streams, every kernel on defaultStream
             )
+
+            # Must be set to True for gt4py storage/memory
+            dace.config.Config.set(
+                "compiler",
+                "allow_view_arguments",
+                value=True,
+            )
+
             # Speed up built time
             dace.config.Config.set(
                 "compiler",
+12 −0		CHANGELOG.md
+1 −1		pyproject.toml
+1 −1		src/gt4py/__about__.py
+36 −33		src/gt4py/cartesian/config.py
+140 −0		src/gt4py/cartesian/utils/compiler.py
+120 −61		src/gt4py/next/program_processors/runners/dace/transformations/fuse_horizontal_conditionblocks.py
+0 −4		src/gt4py/next/program_processors/runners/dace/transformations/move_dataflow_into_if_body.py
+157 −17		...ogram_processor_tests/runners_tests/dace_tests/transformation_tests/test_fuse_horizontal_conditionblocks.py
+227 −0		...ts/program_processor_tests/runners_tests/dace_tests/transformation_tests/test_move_dataflow_into_if_body.py