From 1638e9c6d1c11564def7b37f279c0135e437f195 Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Tue, 27 Jan 2026 10:26:14 +0100
Subject: [PATCH 01/24] Add local limiting for conservative variables (3d)

---
 NEWS.md                                       |  15 +-
 .../elixir_euler_sedov_sc_subcell.jl          |   1 +
 .../subcell_bounds_check_3d.jl                |  30 +++
 .../dgsem_p4est/subcell_limiters_3d.jl        | 237 ++++++++++++++++++
 test/test_p4est_3d.jl                         |  25 +-
 5 files changed, 288 insertions(+), 20 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 948cc53531c..6c87870a869 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -7,15 +7,14 @@ for human readability.
 
 ## Changes in the v0.14 lifecycle
 
-#### Added 
-- Added `NonIdealCompressibleEulerEquations1D`, which allows users to specify a non-ideal equation of state. Currently `IdealGas` and `VanDerWaals` are supported ([#2739]). 
-- Added the APEC (approximate pressure equilibrium preserving with conservation) fluxes of `flux_terashima_etal` and `flux_central_terashima_etal` from [Terashima, Ly, Ihme (2025)](https://doi.org/10.1016/j.jcp.2024.113701) ([#2756]). 
+#### Added
+- Added `NonIdealCompressibleEulerEquations1D`, which allows users to specify a non-ideal equation of state. Currently `IdealGas` and `VanDerWaals` are supported ([#2739]).
+- Added the APEC (approximate pressure equilibrium preserving with conservation) fluxes of `flux_terashima_etal` and `flux_central_terashima_etal` from [Terashima, Ly, Ihme (2025)](https://doi.org/10.1016/j.jcp.2024.113701) ([#2756]).
 - Support for second-order finite volume subcell volume integral (`VolumeIntegralPureLGLFiniteVolumeO2`) and
-  stabilized DG-FV blending volume integral (`VolumeIntegralShockCapturingRRG`) on 
+  stabilized DG-FV blending volume integral (`VolumeIntegralShockCapturingRRG`) on
   3D meshes for conservative systems ([#2734], [#2755]).
-- Extended 3D support for subcell limiting with `P4estMesh` was added ([#2733]).
-  In the new version, local (minimum or maximum) limiting for nonlinear variables (using
-  the keyword `local_onesided_variables_nonlinear` in `SubcellLimiterIDP()`) is supported.
+- Extended 3D support for subcell limiting with `P4estMesh` was added ([#2733], TODO).
+  In the new version, local (minimum and/or maximum) limiting for conservative variables (using the keyword `local_twosided_variables_cons` in `SubcellLimiterIDP()`) and nonlinear variables (using `local_onesided_variables_nonlinear`) is supported.
 
 #### Changed
 
@@ -40,7 +39,7 @@ for human readability.
   `flux_cons(u_ll, u_rr) + 0.5f0 * flux_noncons(u_rr, u_ll)`.
 - Added support for `source_terms_parabolic`, which allows users to specify gradient-dependent source terms when solving parabolic equations ([#2721]).
 - Support for second-order finite volume subcell volume integral (`VolumeIntegralPureLGLFiniteVolumeO2`) and
-  stabilized DG-FV blending volume integral (`VolumeIntegralShockCapturingRRG`) on 
+  stabilized DG-FV blending volume integral (`VolumeIntegralShockCapturingRRG`) on
   1D & 2D meshes for conservative systems ([#2022], [#2695], [#2718]).
 
 ## Changes when updating to v0.13 from v0.12.x
diff --git a/examples/p4est_3d_dgsem/elixir_euler_sedov_sc_subcell.jl b/examples/p4est_3d_dgsem/elixir_euler_sedov_sc_subcell.jl
index 721688fc6d2..1925b7ff0c4 100644
--- a/examples/p4est_3d_dgsem/elixir_euler_sedov_sc_subcell.jl
+++ b/examples/p4est_3d_dgsem/elixir_euler_sedov_sc_subcell.jl
@@ -45,6 +45,7 @@ basis = LobattoLegendreBasis(polydeg)
 limiter_idp = SubcellLimiterIDP(equations, basis;
                                 positivity_variables_cons = ["rho"],
                                 positivity_variables_nonlinear = [pressure],
+                                local_twosided_variables_cons = [],
                                 local_onesided_variables_nonlinear = [],
                                 max_iterations_newton = 25)
 volume_integral = VolumeIntegralSubcellLimiting(limiter_idp;
diff --git a/src/callbacks_stage/subcell_bounds_check_3d.jl b/src/callbacks_stage/subcell_bounds_check_3d.jl
index 5267f061d85..b842aef2df9 100644
--- a/src/callbacks_stage/subcell_bounds_check_3d.jl
+++ b/src/callbacks_stage/subcell_bounds_check_3d.jl
@@ -19,6 +19,33 @@
     # `@batch` here to allow a possible redefinition of `@threaded` without creating errors here.
     # See also https://github.com/trixi-framework/Trixi.jl/pull/1888#discussion_r1537785293.
 
+    if local_twosided
+        for v in limiter.local_twosided_variables_cons
+            v_string = string(v)
+            key_min = Symbol(v_string, "_min")
+            key_max = Symbol(v_string, "_max")
+            deviation_min = idp_bounds_delta_local[key_min]
+            deviation_max = idp_bounds_delta_local[key_max]
+            @batch reduction=((max, deviation_min), (max, deviation_max)) for element in eachelement(solver,
+                                                                                                     cache)
+                for k in eachnode(solver), j in eachnode(solver), i in eachnode(solver)
+                    var = u[v, i, j, k, element]
+                    # Note: We always save the absolute deviations >= 0 and therefore use the
+                    # `max` operator for the lower and upper bound. The different directions of
+                    # upper and lower bound are considered in their calculations with a
+                    # different sign.
+                    deviation_min = max(deviation_min,
+                                        variable_bounds[key_min][i, j, k, element] -
+                                        var)
+                    deviation_max = max(deviation_max,
+                                        var -
+                                        variable_bounds[key_max][i, j, k, element])
+                end
+            end
+            idp_bounds_delta_local[key_min] = deviation_min
+            idp_bounds_delta_local[key_max] = deviation_max
+        end
+    end
     if local_onesided
         for (variable, min_or_max) in limiter.local_onesided_variables_nonlinear
             key = Symbol(string(variable), "_", string(min_or_max))
@@ -41,6 +68,9 @@
     end
     if positivity
         for v in limiter.positivity_variables_cons
+            if v in limiter.local_twosided_variables_cons
+                continue
+            end
             key = Symbol(string(v), "_min")
             deviation = idp_bounds_delta_local[key]
             @batch reduction=(max, deviation) for element in eachelement(solver, cache)
diff --git a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
index a15a0173366..4b1e10da09a 100644
--- a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
+++ b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
@@ -12,10 +12,171 @@
 ###############################################################################
 # Calculation of local bounds using low-order FV solution
 
+@inline function calc_bounds_twosided!(var_min, var_max, variable,
+                                       u::AbstractArray{<:Any, 5}, t, semi, equations)
+    mesh, _, dg, cache = mesh_equations_solver_cache(semi)
+    # Calc bounds inside elements
+    @threaded for element in eachelement(dg, cache)
+        # Calculate bounds at Gauss-Lobatto nodes
+        for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            var = u[variable, i, j, k, element]
+            var_min[i, j, k, element] = var
+            var_max[i, j, k, element] = var
+        end
+
+        # Apply values in x direction
+        for k in eachnode(dg), j in eachnode(dg), i in 2:nnodes(dg)
+            var = u[variable, i - 1, j, k, element]
+            var_min[i, j, k, element] = min(var_min[i, j, k, element], var)
+            var_max[i, j, k, element] = max(var_max[i, j, k, element], var)
+
+            var = u[variable, i, j, k, element]
+            var_min[i - 1, j, k, element] = min(var_min[i - 1, j, k, element], var)
+            var_max[i - 1, j, k, element] = max(var_max[i - 1, j, k, element], var)
+        end
+
+        # Apply values in y direction
+        for k in eachnode(dg), j in 2:nnodes(dg), i in eachnode(dg)
+            var = u[variable, i, j - 1, k, element]
+            var_min[i, j, k, element] = min(var_min[i, j, k, element], var)
+            var_max[i, j, k, element] = max(var_max[i, j, k, element], var)
+
+            var = u[variable, i, j, k, element]
+            var_min[i, j - 1, k, element] = min(var_min[i, j - 1, k, element], var)
+            var_max[i, j - 1, k, element] = max(var_max[i, j - 1, k, element], var)
+        end
+
+        # Apply values in z direction
+        for k in 2:nnodes(dg), j in eachnode(dg), i in eachnode(dg)
+            var = u[variable, i, j, k - 1, element]
+            var_min[i, j, k, element] = min(var_min[i, j, k, element], var)
+            var_max[i, j, k, element] = max(var_max[i, j, k, element], var)
+
+            var = u[variable, i, j, k, element]
+            var_min[i, j, k - 1, element] = min(var_min[i, j, k - 1, element], var)
+            var_max[i, j, k - 1, element] = max(var_max[i, j, k - 1, element], var)
+        end
+    end
+
+    # Values at element boundary
+    calc_bounds_twosided_interface!(var_min, var_max, variable,
+                                    u, t, semi, mesh, equations)
+    return nothing
+end
+
+function calc_bounds_twosided_interface!(var_min, var_max, variable,
+                                         u, t, semi, mesh::P4estMesh{3}, equations)
+    _, _, dg, cache = mesh_equations_solver_cache(semi)
+    (; boundary_conditions) = semi
+
+    (; neighbor_ids, node_indices) = cache.interfaces
+    index_range = eachnode(dg)
+
+    # Calc bounds at interfaces and periodic boundaries
+    for interface in eachinterface(dg, cache)
+        # Get element and side index information on the primary element
+        primary_element = neighbor_ids[1, interface]
+        primary_indices = node_indices[1, interface]
+
+        # Get element and side index information on the secondary element
+        secondary_element = neighbor_ids[2, interface]
+        secondary_indices = node_indices[2, interface]
+
+        # Create the local i,j,k indexing
+        i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1],
+                                                                                     index_range)
+        j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2],
+                                                                                     index_range)
+        k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3],
+                                                                                     index_range)
+
+        i_primary = i_primary_start
+        j_primary = j_primary_start
+        k_primary = k_primary_start
+
+        i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_indices[1],
+                                                                                           index_range)
+        j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_indices[2],
+                                                                                           index_range)
+        k_secondary_start, k_secondary_step_i, k_secondary_step_j = index_to_start_step_3d(secondary_indices[3],
+                                                                                           index_range)
+
+        i_secondary = i_secondary_start
+        j_secondary = j_secondary_start
+        k_secondary = k_secondary_start
+
+        for j in eachnode(dg)
+            for i in eachnode(dg)
+                var_primary = u[variable, i_primary, j_primary, k_primary,
+                                primary_element]
+                var_secondary = u[variable, i_secondary, j_secondary, k_secondary,
+                                  secondary_element]
+
+                var_min[i_primary, j_primary, k_primary, primary_element] = min(var_min[i_primary,
+                                                                                        j_primary,
+                                                                                        k_primary,
+                                                                                        primary_element],
+                                                                                var_secondary)
+                var_max[i_primary, j_primary, k_primary, primary_element] = max(var_max[i_primary,
+                                                                                        j_primary,
+                                                                                        k_primary,
+                                                                                        primary_element],
+                                                                                var_secondary)
+
+                var_min[i_secondary, j_secondary, k_secondary, secondary_element] = min(var_min[i_secondary,
+                                                                                                j_secondary,
+                                                                                                k_secondary,
+                                                                                                secondary_element],
+                                                                                        var_primary)
+                var_max[i_secondary, j_secondary, k_secondary, secondary_element] = max(var_max[i_secondary,
+                                                                                                j_secondary,
+                                                                                                k_secondary,
+                                                                                                secondary_element],
+                                                                                        var_primary)
+
+                # Increment the primary element indices
+                i_primary += i_primary_step_i
+                j_primary += j_primary_step_i
+                k_primary += k_primary_step_i
+                # Increment the secondary element surface indices
+                i_secondary += i_secondary_step_i
+                j_secondary += j_secondary_step_i
+                k_secondary += k_secondary_step_i
+            end
+            # Increment the primary element indices
+            i_primary += i_primary_step_j
+            j_primary += j_primary_step_j
+            k_primary += k_primary_step_j
+            # Increment the secondary element surface indices
+            i_secondary += i_secondary_step_j
+            j_secondary += j_secondary_step_j
+            k_secondary += k_secondary_step_j
+        end
+    end
+
+    # Calc bounds at physical boundaries
+    calc_bounds_twosided_boundary!(var_min, var_max, variable, u, t,
+                                   boundary_conditions,
+                                   mesh, equations, dg, cache)
+
+    return nothing
+end
+
+@inline function calc_bounds_twosided_boundary!(var_min, var_max, variable, u, t,
+                                                boundary_conditions::BoundaryConditionPeriodic,
+                                                mesh::P4estMesh{3},
+                                                equations, dg, cache)
+    return nothing
+end
+
 @inline function calc_bounds_onesided!(var_minmax, min_or_max, variable,
                                        u::AbstractArray{<:Any, 5}, t, semi)
     mesh, equations, dg, cache = mesh_equations_solver_cache(semi)
     # Calc bounds inside elements
+
+    # The approach used in `calc_bounds_twosided!` is not used here because it requires more
+    # evaluations of the variable and is therefore slower.
+
     @threaded for element in eachelement(dg, cache)
         # Reset bounds
         for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
@@ -161,6 +322,75 @@ end
     return nothing
 end
 
+###############################################################################
+# Local two-sided limiting of conservative variables
+
+@inline function idp_local_twosided!(alpha, limiter, u::AbstractArray{<:Any, 5},
+                                     t, dt, semi, variable)
+    mesh, equations, dg, cache = mesh_equations_solver_cache(semi)
+    (; antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R) = cache.antidiffusive_fluxes
+    (; inverse_weights) = dg.basis
+
+    (; variable_bounds) = limiter.cache.subcell_limiter_coefficients
+    variable_string = string(variable)
+    var_min = variable_bounds[Symbol(variable_string, "_min")]
+    var_max = variable_bounds[Symbol(variable_string, "_max")]
+    calc_bounds_twosided!(var_min, var_max, variable, u, t, semi, equations)
+
+    @threaded for element in eachelement(dg, semi.cache)
+        for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            inverse_jacobian = get_inverse_jacobian(cache.elements.inverse_jacobian,
+                                                    mesh, i, j, k, element)
+            var = u[variable, i, j, k, element]
+            # Real Zalesak type limiter
+            #   * Zalesak (1979). "Fully multidimensional flux-corrected transport algorithms for fluids"
+            #   * Kuzmin et al. (2010). "Failsafe flux limiting and constrained data projections for equations of gas dynamics"
+            #   Note: The Zalesak limiter has to be computed, even if the state is valid, because the correction is
+            #         for each interface, not each node
+
+            Qp = max(0, (var_max[i, j, k, element] - var) / dt)
+            Qm = min(0, (var_min[i, j, k, element] - var) / dt)
+
+            # Calculate Pp and Pm
+            # Note: Boundaries of antidiffusive_flux1/2 are constant 0, so they make no difference here.
+            val_flux1_local = inverse_weights[i] *
+                              antidiffusive_flux1_R[variable, i, j, k, element]
+            val_flux1_local_ip1 = -inverse_weights[i] *
+                                  antidiffusive_flux1_L[variable, i + 1, j, k, element]
+            val_flux2_local = inverse_weights[j] *
+                              antidiffusive_flux2_R[variable, i, j, k, element]
+            val_flux2_local_jp1 = -inverse_weights[j] *
+                                  antidiffusive_flux2_L[variable, i, j + 1, k, element]
+            val_flux3_local = inverse_weights[k] *
+                              antidiffusive_flux3_R[variable, i, j, k, element]
+            val_flux3_local_jp1 = -inverse_weights[k] *
+                                  antidiffusive_flux3_L[variable, i, j, k + 1, element]
+
+            Pp = max(0, val_flux1_local) + max(0, val_flux1_local_ip1) +
+                 max(0, val_flux2_local) + max(0, val_flux2_local_jp1) +
+                 max(0, val_flux3_local) + max(0, val_flux3_local_jp1)
+            Pm = min(0, val_flux1_local) + min(0, val_flux1_local_ip1) +
+                 min(0, val_flux2_local) + min(0, val_flux2_local_jp1) +
+                 min(0, val_flux3_local) + min(0, val_flux3_local_jp1)
+
+            Pp = inverse_jacobian * Pp
+            Pm = inverse_jacobian * Pm
+
+            # Compute blending coefficient avoiding division by zero
+            # (as in paper of [Guermond, Nazarov, Popov, Thomas] (4.8))
+            Qp = abs(Qp) /
+                 (abs(Pp) + eps(typeof(Qp)) * 100 * abs(var_max[i, j, k, element]))
+            Qm = abs(Qm) /
+                 (abs(Pm) + eps(typeof(Qm)) * 100 * abs(var_max[i, j, k, element]))
+
+            # Calculate alpha at nodes
+            alpha[i, j, k, element] = max(alpha[i, j, k, element], 1 - min(1, Qp, Qm))
+        end
+    end
+
+    return nothing
+end
+
 ##############################################################################
 # Local one-sided limiting of nonlinear variables
 
@@ -214,6 +444,13 @@ end
             end
 
             # Compute bound
+            if limiter.local_twosided &&
+               (variable in limiter.local_twosided_variables_cons) &&
+               (var_min[i, j, k, element] >= positivity_correction_factor * var)
+                # Local limiting is more restrictive that positivity limiting
+                # => Skip positivity limiting for this node
+                continue
+            end
             var_min[i, j, k, element] = positivity_correction_factor * var
 
             # Real one-sided Zalesak-type limiter
diff --git a/test/test_p4est_3d.jl b/test/test_p4est_3d.jl
index f29256f056b..b5908bc8b03 100644
--- a/test/test_p4est_3d.jl
+++ b/test/test_p4est_3d.jl
@@ -387,7 +387,7 @@ end
                             1.888627709320322,
                             4.971280431903264
                         ],
-                        tspan=(0.0, 0.3),)
+                        tspan=(0.0, 0.3))
     # Ensure that we do not have excessive memory allocations
     # (e.g., from type instabilities)
     # Larger values for allowed allocations due to usage of custom
@@ -399,22 +399,23 @@ end
 
 @trixi_testset "elixir_euler_sedov_sc_subcell.jl (local bounds)" begin
     @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_sedov_sc_subcell.jl"),
+                        local_twosided_variables_cons=["rho"],
                         local_onesided_variables_nonlinear=[(entropy_guermond_etal,
                                                              min)],
-                        max_iterations_newton=40,
+                        max_iterations_newton=30,
                         l2=[
-                            0.19153085678321066,
-                            0.07411109384422779,
-                            0.07411109384410808,
-                            0.07411109384406232,
-                            0.36714268468314665
+                            0.16504564013491585,
+                            0.06461384162458203,
+                            0.06461384162461223,
+                            0.06461384162461678,
+                            0.36193245790622036
                         ],
                         linf=[
-                            1.4037775549639524,
-                            1.339590863739464,
-                            1.3395908637591605,
-                            1.3395908637371077,
-                            4.824252924073932
+                            0.9138327077620716,
+                            0.5707102472596818,
+                            0.5707102472739252,
+                            0.5707102472781822,
+                            4.777595503303726
                         ],
                         tspan=(0.0, 0.3))
     # Ensure that we do not have excessive memory allocations

From fb6c67f2e31f3172e6dff0ab9545cb30e6a596c8 Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Tue, 27 Jan 2026 10:28:17 +0100
Subject: [PATCH 02/24] Add PR number

---
 NEWS.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 6c87870a869..e7df24bf7fe 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -13,8 +13,10 @@ for human readability.
 - Support for second-order finite volume subcell volume integral (`VolumeIntegralPureLGLFiniteVolumeO2`) and
   stabilized DG-FV blending volume integral (`VolumeIntegralShockCapturingRRG`) on
   3D meshes for conservative systems ([#2734], [#2755]).
-- Extended 3D support for subcell limiting with `P4estMesh` was added ([#2733], TODO).
-  In the new version, local (minimum and/or maximum) limiting for conservative variables (using the keyword `local_twosided_variables_cons` in `SubcellLimiterIDP()`) and nonlinear variables (using `local_onesided_variables_nonlinear`) is supported.
+- Extended 3D support for subcell limiting with `P4estMesh` was added ([#2733], [#2763]).
+  In the new version, local (minimum and/or maximum) limiting for conservative variables (using the
+  keyword `local_twosided_variables_cons` in `SubcellLimiterIDP()`) and nonlinear variables
+  (using `local_onesided_variables_nonlinear`) is supported.
 
 #### Changed
 

From af49ff56590259e75e8b02f6550eb93de103e067 Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Wed, 4 Feb 2026 08:35:42 +0100
Subject: [PATCH 03/24] Add comment

---
 src/callbacks_stage/subcell_bounds_check_2d.jl | 2 ++
 src/callbacks_stage/subcell_bounds_check_3d.jl | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/callbacks_stage/subcell_bounds_check_2d.jl b/src/callbacks_stage/subcell_bounds_check_2d.jl
index 6085673959d..90a379f2aa2 100644
--- a/src/callbacks_stage/subcell_bounds_check_2d.jl
+++ b/src/callbacks_stage/subcell_bounds_check_2d.jl
@@ -65,6 +65,8 @@
     end
     if positivity
         for v in limiter.positivity_variables_cons
+            # Note: If a variable appears here and in the local min/max limiting, the positivity
+            # lower bound is taken into account there. Skip these variables here.
             if v in limiter.local_twosided_variables_cons
                 continue
             end
diff --git a/src/callbacks_stage/subcell_bounds_check_3d.jl b/src/callbacks_stage/subcell_bounds_check_3d.jl
index b842aef2df9..19afe5d3479 100644
--- a/src/callbacks_stage/subcell_bounds_check_3d.jl
+++ b/src/callbacks_stage/subcell_bounds_check_3d.jl
@@ -68,6 +68,8 @@
     end
     if positivity
         for v in limiter.positivity_variables_cons
+            # Note: If a variable appears here and in the local min/max limiting, the positivity
+            # lower bound is taken into account there. Skip these variables here.
             if v in limiter.local_twosided_variables_cons
                 continue
             end

From 1174dca5ef1e82becb525e0af349f042023f296f Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Mon, 9 Feb 2026 12:37:08 +0100
Subject: [PATCH 04/24] Adapt news.md

---
 NEWS.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 049f40692aa..80efd7c42a8 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -43,8 +43,8 @@ for human readability.
   stabilized DG-FV blending volume integral (`VolumeIntegralShockCapturingRRG`) on
   3D meshes for conservative systems ([#2734], [#2755]).
 - Extended 3D support for subcell limiting with `P4estMesh` was added ([#2733]).
-  In the new version, local (minimum and/or maximum) limiting for nonlinear variables
-  (using the keyword `local_onesided_variables_nonlinear` in `SubcellLimiterIDP()`) is supported.
+  In the new version, local (minimum or maximum) limiting for nonlinear variables (using
+  the keyword `local_onesided_variables_nonlinear` in `SubcellLimiterIDP()`) is supported.
 
 #### Changed
 

From c1218e0a49bee7c5c1e6e8bbf4d2d46dc06ca505 Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Wed, 24 Sep 2025 11:40:38 +0200
Subject: [PATCH 05/24] Initial commit for 3D subcell limiting support

---
 src/solvers/dgsem_structured/dg.jl            |   1 +
 .../dg_3d_subcell_limiters.jl                 | 156 +++++++++++++++
 src/solvers/dgsem_tree/dg.jl                  |   2 +
 .../dgsem_tree/dg_3d_subcell_limiters.jl      | 185 ++++++++++++++++++
 src/solvers/dgsem_tree/subcell_limiters_3d.jl | 150 ++++++++++++++
 5 files changed, 494 insertions(+)
 create mode 100644 src/solvers/dgsem_structured/dg_3d_subcell_limiters.jl
 create mode 100644 src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
 create mode 100644 src/solvers/dgsem_tree/subcell_limiters_3d.jl

diff --git a/src/solvers/dgsem_structured/dg.jl b/src/solvers/dgsem_structured/dg.jl
index cc0243a9989..39f4a504373 100644
--- a/src/solvers/dgsem_structured/dg.jl
+++ b/src/solvers/dgsem_structured/dg.jl
@@ -210,6 +210,7 @@ include("indicators_3d.jl")
 
 include("subcell_limiters_2d.jl")
 include("dg_2d_subcell_limiters.jl")
+include("dg_3d_subcell_limiters.jl")
 
 # Specialized implementations used to improve performance
 include("dg_2d_compressible_euler.jl")
diff --git a/src/solvers/dgsem_structured/dg_3d_subcell_limiters.jl b/src/solvers/dgsem_structured/dg_3d_subcell_limiters.jl
new file mode 100644
index 00000000000..769b1a73921
--- /dev/null
+++ b/src/solvers/dgsem_structured/dg_3d_subcell_limiters.jl
@@ -0,0 +1,156 @@
+# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
+# Since these FMAs can increase the performance of many numerical algorithms,
+# we need to opt-in explicitly.
+# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
+@muladd begin
+#! format: noindent
+
+# Calculate the DG staggered volume fluxes `fhat` in subcell FV-form inside the element
+# (**without non-conservative terms**).
+#
+# See also `flux_differencing_kernel!`.
+@inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R, u,
+                                mesh::P4estMesh{3},
+                                nonconservative_terms::False, equations,
+                                volume_flux, dg::DGSEM, element, cache)
+    (; contravariant_vectors) = cache.elements
+    (; weights, derivative_split) = dg.basis
+    (; flux_temp_threaded) = cache
+
+    flux_temp = flux_temp_threaded[Threads.threadid()]
+
+    # The FV-form fluxes are calculated in a recursive manner, i.e.:
+    # fhat_(0,1)   = w_0 * FVol_0,
+    # fhat_(j,j+1) = fhat_(j-1,j) + w_j * FVol_j,   for j=1,...,N-1,
+    # with the split form volume fluxes FVol_j = -2 * sum_i=0^N D_ji f*_(j,i).
+
+    # To use the symmetry of the `volume_flux`, the split form volume flux is precalculated
+    # like in `calc_volume_integral!` for the `VolumeIntegralFluxDifferencing`
+    # and saved in in `flux_temp`.
+
+    # Split form volume flux in orientation 1: x direction
+    flux_temp .= zero(eltype(flux_temp))
+
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, k, element)
+
+        # pull the contravariant vectors in each coordinate direction
+        Ja1_node = get_contravariant_vector(1, contravariant_vectors, i, j, k, element) # x direction
+
+        # All diagonal entries of `derivative_split` are zero. Thus, we can skip
+        # the computation of the diagonal terms. In addition, we use the symmetry
+        # of the `volume_flux` to save half of the possible two-point flux
+        # computations.
+
+        # x direction
+        for ii in (i + 1):nnodes(dg)
+            u_node_ii = get_node_vars(u, equations, dg, ii, j, k, element)
+            # pull the contravariant vectors and compute the average
+            Ja1_node_ii = get_contravariant_vector(1, contravariant_vectors, ii, j, k,
+                                                   element)
+            Ja1_avg = 0.5f0 * (Ja1_node + Ja1_node_ii)
+
+            # compute the contravariant sharp flux in the direction of the averaged contravariant vector
+            fluxtilde1 = volume_flux(u_node, u_node_ii, Ja1_avg, equations)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[i, ii], fluxtilde1,
+                                       equations, dg, i, j, k)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[ii, i], fluxtilde1,
+                                       equations, dg, ii, j, k)
+        end
+    end
+
+    # FV-form flux `fhat` in x direction
+    fhat1_L[:, 1, :, :] .= zero(eltype(fhat1_L))
+    fhat1_L[:, nnodes(dg) + 1, :, :] .= zero(eltype(fhat1_L))
+    fhat1_R[:, 1, :, :] .= zero(eltype(fhat1_R))
+    fhat1_R[:, nnodes(dg) + 1, :, :] .= zero(eltype(fhat1_R))
+
+    for k in eachnode(dg), j in eachnode(dg), i in 1:(nnodes(dg) - 1),
+        v in eachvariable(equations)
+
+        fhat1_L[v, i + 1, j, k] = fhat1_L[v, i, j, k] +
+                                  weights[i] * flux_temp[v, i, j, k]
+        fhat1_R[v, i + 1, j, k] = fhat1_L[v, i + 1, j, k]
+    end
+
+    # Split form volume flux in orientation 2: y direction
+    flux_temp .= zero(eltype(flux_temp))
+
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, k, element)
+
+        # pull the contravariant vectors in each coordinate direction
+        Ja2_node = get_contravariant_vector(2, contravariant_vectors, i, j, k, element)
+
+        # y direction
+        for jj in (j + 1):nnodes(dg)
+            u_node_jj = get_node_vars(u, equations, dg, i, jj, k, element)
+            # pull the contravariant vectors and compute the average
+            Ja2_node_jj = get_contravariant_vector(2, contravariant_vectors, i, jj, k,
+                                                   element)
+            Ja2_avg = 0.5f0 * (Ja2_node + Ja2_node_jj)
+            # compute the contravariant sharp flux in the direction of the averaged contravariant vector
+            fluxtilde2 = volume_flux(u_node, u_node_jj, Ja2_avg, equations)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[j, jj], fluxtilde2,
+                                       equations, dg, i, j, k)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[jj, j], fluxtilde2,
+                                       equations, dg, i, jj, k)
+        end
+    end
+
+    # FV-form flux `fhat` in y direction
+    fhat2_L[:, :, 1, :] .= zero(eltype(fhat2_L))
+    fhat2_L[:, :, nnodes(dg) + 1, :] .= zero(eltype(fhat2_L))
+    fhat2_R[:, :, 1, :] .= zero(eltype(fhat2_R))
+    fhat2_R[:, :, nnodes(dg) + 1, :] .= zero(eltype(fhat2_R))
+
+    for k in eachnode(dg), j in 1:(nnodes(dg) - 1), i in eachnode(dg),
+        v in eachvariable(equations)
+
+        fhat2_L[v, i, j + 1, k] = fhat2_L[v, i, j, k] +
+                                  weights[j] * flux_temp[v, i, j, k]
+        fhat2_R[v, i, j + 1, k] = fhat2_L[v, i, j + 1, k]
+    end
+
+    # Split form volume flux in orientation 3: z direction
+    flux_temp .= zero(eltype(flux_temp))
+
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, k, element)
+
+        # pull the contravariant vectors in each coordinate direction
+        Ja3_node = get_contravariant_vector(3, contravariant_vectors, i, j, k, element)
+
+        # y direction
+        for kk in (k + 1):nnodes(dg)
+            u_node_kk = get_node_vars(u, equations, dg, i, j, kk, element)
+            # pull the contravariant vectors and compute the average
+            Ja3_node_kk = get_contravariant_vector(3, contravariant_vectors, i, j, kk,
+                                                   element)
+            Ja3_avg = 0.5f0 * (Ja3_node + Ja3_node_kk)
+            # compute the contravariant sharp flux in the direction of the averaged contravariant vector
+            fluxtilde3 = volume_flux(u_node, u_node_kk, Ja3_avg, equations)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[k, kk], fluxtilde3,
+                                       equations, dg, i, j, k)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[kk, k], fluxtilde3,
+                                       equations, dg, i, j, kk)
+        end
+    end
+
+    # FV-form flux `fhat` in y direction
+    fhat3_L[:, :, :, 1] .= zero(eltype(fhat3_L))
+    fhat3_L[:, :, :, nnodes(dg) + 1] .= zero(eltype(fhat3_L))
+    fhat3_R[:, :, :, 1] .= zero(eltype(fhat3_R))
+    fhat3_R[:, :, :, nnodes(dg) + 1] .= zero(eltype(fhat3_R))
+
+    for k in 1:(nnodes(dg) - 1), j in eachnode(dg), i in eachnode(dg),
+        v in eachvariable(equations)
+
+        fhat3_L[v, i, j, k + 1] = fhat3_L[v, i, j, k] +
+                                  weights[k] * flux_temp[v, i, j, k]
+        fhat3_R[v, i, j, k + 1] = fhat3_L[v, i, j, k + 1]
+    end
+
+    return nothing
+end
+end # @muladd
diff --git a/src/solvers/dgsem_tree/dg.jl b/src/solvers/dgsem_tree/dg.jl
index 54d71af2b66..c389af89880 100644
--- a/src/solvers/dgsem_tree/dg.jl
+++ b/src/solvers/dgsem_tree/dg.jl
@@ -62,5 +62,7 @@ include("dg_3d_compressible_euler.jl")
 # Subcell limiters
 include("subcell_limiters.jl")
 include("subcell_limiters_2d.jl")
+include("subcell_limiters_3d.jl")
 include("dg_2d_subcell_limiters.jl")
+include("dg_3d_subcell_limiters.jl")
 end # @muladd
diff --git a/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl b/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
new file mode 100644
index 00000000000..dc8f3f3f267
--- /dev/null
+++ b/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
@@ -0,0 +1,185 @@
+# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
+# Since these FMAs can increase the performance of many numerical algorithms,
+# we need to opt-in explicitly.
+# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
+@muladd begin
+#! format: noindent
+
+function create_cache(mesh::P4estMesh{3},
+                      equations, volume_integral::VolumeIntegralSubcellLimiting,
+                      dg::DG, uEltype)
+    cache = create_cache(mesh, equations,
+                         VolumeIntegralPureLGLFiniteVolume(volume_integral.volume_flux_fv),
+                         dg, uEltype)
+
+    A4dp1_x = Array{uEltype, 4}
+    A4dp1_y = Array{uEltype, 4}
+    A4dp1_z = Array{uEltype, 4}
+    A4d = Array{uEltype, 4}
+    A5d = Array{uEltype, 5}
+
+    fhat1_L_threaded = A4dp1_x[A4dp1_x(undef, nvariables(equations), nnodes(dg) + 1,
+                                       nnodes(dg), nnodes(dg))
+                               for _ in 1:Threads.nthreads()]
+    fhat1_R_threaded = A4dp1_x[A4dp1_x(undef, nvariables(equations), nnodes(dg) + 1,
+                                       nnodes(dg), nnodes(dg))
+                               for _ in 1:Threads.nthreads()]
+    fhat2_L_threaded = A4dp1_y[A4dp1_y(undef, nvariables(equations), nnodes(dg),
+                                       nnodes(dg) + 1, nnodes(dg))
+                               for _ in 1:Threads.nthreads()]
+    fhat2_R_threaded = A4dp1_y[A4dp1_y(undef, nvariables(equations), nnodes(dg),
+                                       nnodes(dg) + 1, nnodes(dg))
+                               for _ in 1:Threads.nthreads()]
+    fhat3_L_threaded = A4dp1_z[A4dp1_z(undef, nvariables(equations), nnodes(dg),
+                                       nnodes(dg), nnodes(dg) + 1)
+                               for _ in 1:Threads.nthreads()]
+    fhat3_R_threaded = A4dp1_z[A4dp1_z(undef, nvariables(equations), nnodes(dg),
+                                       nnodes(dg), nnodes(dg) + 1)
+                               for _ in 1:Threads.nthreads()]
+    flux_temp_threaded = A4d[A4d(undef, nvariables(equations), nnodes(dg), nnodes(dg),
+                                 nnodes(dg))
+                             for _ in 1:Threads.nthreads()]
+    fhat_temp_threaded = A4d[A4d(undef, nvariables(equations), nnodes(dg),
+                                 nnodes(dg), nnodes(dg)) for _ in 1:Threads.nthreads()]
+    antidiffusive_fluxes = ContainerAntidiffusiveFlux3D{uEltype}(0,
+                                                                 nvariables(equations),
+                                                                 nnodes(dg))
+
+    if have_nonconservative_terms(equations) == true
+        # Extract the nonconservative flux as a dispatch argument for `n_nonconservative_terms`
+        _, volume_flux_noncons = volume_integral.volume_flux_dg
+
+        flux_nonconservative_temp_threaded = A5d[A5d(undef, nvariables(equations),
+                                                     n_nonconservative_terms(volume_flux_noncons),
+                                                     nnodes(dg), nnodes(dg),
+                                                     nnodes(dg))
+                                                 for _ in 1:Threads.nthreads()]
+        fhat_nonconservative_temp_threaded = A5d[A5d(undef, nvariables(equations),
+                                                     n_nonconservative_terms(volume_flux_noncons),
+                                                     nnodes(dg), nnodes(dg),
+                                                     nnodes(dg))
+                                                 for _ in 1:Threads.nthreads()]
+        phi_threaded = A5d[A5d(undef, nvariables(equations),
+                               n_nonconservative_terms(volume_flux_noncons),
+                               nnodes(dg), nnodes(dg), nnodes(dg))
+                           for _ in 1:Threads.nthreads()]
+        cache = (; cache..., flux_nonconservative_temp_threaded,
+                 fhat_nonconservative_temp_threaded, phi_threaded)
+    end
+
+    return (; cache..., antidiffusive_fluxes,
+            fhat1_L_threaded, fhat1_R_threaded, fhat2_L_threaded, fhat2_R_threaded,
+            fhat3_L_threaded, fhat3_R_threaded, flux_temp_threaded, fhat_temp_threaded)
+end
+
+@inline function subcell_limiting_kernel!(du, u, element,
+                                          mesh::P4estMesh{3},
+                                          nonconservative_terms, equations,
+                                          volume_integral, limiter::SubcellLimiterIDP,
+                                          dg::DGSEM, cache)
+    @unpack inverse_weights = dg.basis
+    @unpack volume_flux_dg, volume_flux_fv = volume_integral
+
+    # high-order DG fluxes
+    @unpack fhat1_L_threaded, fhat1_R_threaded, fhat2_L_threaded, fhat2_R_threaded, fhat3_L_threaded, fhat3_R_threaded = cache
+
+    fhat1_L = fhat1_L_threaded[Threads.threadid()]
+    fhat1_R = fhat1_R_threaded[Threads.threadid()]
+    fhat2_L = fhat2_L_threaded[Threads.threadid()]
+    fhat2_R = fhat2_R_threaded[Threads.threadid()]
+    fhat3_L = fhat3_L_threaded[Threads.threadid()]
+    fhat3_R = fhat3_R_threaded[Threads.threadid()]
+    calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R, u, mesh,
+                   nonconservative_terms, equations, volume_flux_dg, dg, element,
+                   cache)
+
+    # low-order FV fluxes
+    @unpack fstar1_L_threaded, fstar1_R_threaded, fstar2_L_threaded, fstar2_R_threaded, fstar3_L_threaded, fstar3_R_threaded = cache
+
+    fstar1_L = fstar1_L_threaded[Threads.threadid()]
+    fstar1_R = fstar1_R_threaded[Threads.threadid()]
+    fstar2_L = fstar2_L_threaded[Threads.threadid()]
+    fstar2_R = fstar2_R_threaded[Threads.threadid()]
+    fstar3_L = fstar3_L_threaded[Threads.threadid()]
+    fstar3_R = fstar3_R_threaded[Threads.threadid()]
+    calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, mesh,
+                 nonconservative_terms, equations, volume_flux_fv, dg, element,
+                 cache)
+
+    # antidiffusive flux
+    calcflux_antidiffusive!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R,
+                            fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
+                            u, mesh, nonconservative_terms, equations, limiter, dg,
+                            element, cache)
+
+    # Calculate volume integral contribution of low-order FV flux
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        for v in eachvariable(equations)
+            du[v, i, j, k, element] += inverse_weights[i] *
+                                       (fstar1_L[v, i + 1, j, k] - fstar1_R[v, i, j, k]) +
+                                       inverse_weights[j] *
+                                       (fstar2_L[v, i, j + 1, k] - fstar2_R[v, i, j, k]) +
+                                       inverse_weights[k] *
+                                       (fstar3_L[v, i, j, k + 1] - fstar3_R[v, i, j, k])
+        end
+    end
+
+    return nothing
+end
+
+# Calculate the antidiffusive flux `antidiffusive_flux` as the subtraction between `fhat` and `fstar` for conservative systems.
+@inline function calcflux_antidiffusive!(fhat1_L, fhat1_R, fhat2_L, fhat2_R,
+                                         fhat3_L, fhat3_R,
+                                         fstar1_L, fstar1_R, fstar2_L, fstar2_R,
+                                         fstar3_L, fstar3_R,
+                                         u, mesh::P4estMesh{3},
+                                         nonconservative_terms::False, equations,
+                                         limiter::SubcellLimiterIDP, dg, element, cache)
+    @unpack antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R = cache.antidiffusive_fluxes
+
+    for k in eachnode(dg), j in eachnode(dg), i in 2:nnodes(dg)
+        for v in eachvariable(equations)
+            antidiffusive_flux1_L[v, i, j, k, element] = fhat1_L[v, i, j, k] -
+                                                         fstar1_L[v, i, j, k]
+            antidiffusive_flux1_R[v, i, j, k, element] = antidiffusive_flux1_L[v, i, j,
+                                                                               k,
+                                                                               element]
+        end
+    end
+    for k in eachnode(dg), j in 2:nnodes(dg), i in eachnode(dg)
+        for v in eachvariable(equations)
+            antidiffusive_flux2_L[v, i, j, k, element] = fhat2_L[v, i, j, k] -
+                                                         fstar2_L[v, i, j, k]
+            antidiffusive_flux2_R[v, i, j, k, element] = antidiffusive_flux2_L[v, i, j,
+                                                                               k,
+                                                                               element]
+        end
+    end
+    for k in 2:nnodes(dg), j in eachnode(dg), i in eachnode(dg)
+        for v in eachvariable(equations)
+            antidiffusive_flux3_L[v, i, j, k, element] = fhat3_L[v, i, j, k] -
+                                                         fstar3_L[v, i, j, k]
+            antidiffusive_flux3_R[v, i, j, k, element] = antidiffusive_flux3_L[v, i, j,
+                                                                               k,
+                                                                               element]
+        end
+    end
+
+    antidiffusive_flux1_L[:, 1, :, :, element] .= zero(eltype(antidiffusive_flux1_L))
+    antidiffusive_flux1_L[:, nnodes(dg) + 1, :, :, element] .= zero(eltype(antidiffusive_flux1_L))
+    antidiffusive_flux1_R[:, 1, :, :, element] .= zero(eltype(antidiffusive_flux1_R))
+    antidiffusive_flux1_R[:, nnodes(dg) + 1, :, :, element] .= zero(eltype(antidiffusive_flux1_R))
+
+    antidiffusive_flux2_L[:, :, 1, :, element] .= zero(eltype(antidiffusive_flux2_L))
+    antidiffusive_flux2_L[:, :, nnodes(dg) + 1, :, element] .= zero(eltype(antidiffusive_flux2_L))
+    antidiffusive_flux2_R[:, :, 1, :, element] .= zero(eltype(antidiffusive_flux2_R))
+    antidiffusive_flux2_R[:, :, nnodes(dg) + 1, :, element] .= zero(eltype(antidiffusive_flux2_R))
+
+    antidiffusive_flux3_L[:, :, :, 1, element] .= zero(eltype(antidiffusive_flux3_L))
+    antidiffusive_flux3_L[:, :, :, nnodes(dg) + 1, element] .= zero(eltype(antidiffusive_flux3_L))
+    antidiffusive_flux3_R[:, :, :, 1, element] .= zero(eltype(antidiffusive_flux3_R))
+    antidiffusive_flux3_R[:, :, :, nnodes(dg) + 1, element] .= zero(eltype(antidiffusive_flux3_R))
+
+    return nothing
+end
+end # @muladd
diff --git a/src/solvers/dgsem_tree/subcell_limiters_3d.jl b/src/solvers/dgsem_tree/subcell_limiters_3d.jl
new file mode 100644
index 00000000000..8280eac03c1
--- /dev/null
+++ b/src/solvers/dgsem_tree/subcell_limiters_3d.jl
@@ -0,0 +1,150 @@
+# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
+# Since these FMAs can increase the performance of many numerical algorithms,
+# we need to opt-in explicitly.
+# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
+@muladd begin
+#! format: noindent
+
+###############################################################################
+# IDP Limiting
+###############################################################################
+
+# this method is used when the limiter is constructed as for shock-capturing volume integrals
+function create_cache(limiter::Type{SubcellLimiterIDP}, equations::AbstractEquations{3},
+                      basis::LobattoLegendreBasis, bound_keys)
+    subcell_limiter_coefficients = Trixi.ContainerSubcellLimiterIDP3D{real(basis)}(0,
+                                                                                   nnodes(basis),
+                                                                                   bound_keys)
+
+    # Memory for bounds checking routine with `BoundsCheckCallback`.
+    # Local variable contains the maximum deviation since the last export.
+    idp_bounds_delta_local = Dict{Symbol, real(basis)}()
+    # Global variable contains the total maximum deviation.
+    idp_bounds_delta_global = Dict{Symbol, real(basis)}()
+    for key in bound_keys
+        idp_bounds_delta_local[key] = zero(real(basis))
+        idp_bounds_delta_global[key] = zero(real(basis))
+    end
+
+    return (; subcell_limiter_coefficients, idp_bounds_delta_local,
+            idp_bounds_delta_global)
+end
+
+function (limiter::SubcellLimiterIDP)(u::AbstractArray{<:Any, 5},
+                                      semi, equations, dg::DGSEM,
+                                      t, dt;
+                                      kwargs...)
+    @unpack alpha = limiter.cache.subcell_limiter_coefficients
+    # TODO: Do not abuse `reset_du!` but maybe implement a generic `set_zero!`
+    @trixi_timeit timer() "reset alpha" reset_du!(alpha, dg, semi.cache)
+
+    if limiter.local_twosided
+        @trixi_timeit timer() "local twosided" idp_local_twosided!(alpha, limiter,
+                                                                   u, t, dt, semi)
+    end
+    if limiter.positivity
+        @trixi_timeit timer() "positivity" idp_positivity!(alpha, limiter, u, dt, semi)
+    end
+    if limiter.local_onesided
+        @trixi_timeit timer() "local onesided" idp_local_onesided!(alpha, limiter,
+                                                                   u, t, dt, semi)
+    end
+
+    # Calculate alpha1, alpha2 and alpha3
+    @unpack alpha1, alpha2, alpha3 = limiter.cache.subcell_limiter_coefficients
+    @threaded for element in eachelement(dg, semi.cache)
+        for k in eachnode(dg), j in eachnode(dg), i in 2:nnodes(dg)
+            alpha1[i, j, k, element] = max(alpha[i - 1, j, k, element],
+                                           alpha[i, j, k, element])
+        end
+        for k in eachnode(dg), j in 2:nnodes(dg), i in eachnode(dg)
+            alpha2[i, j, k, element] = max(alpha[i, j - 1, k, element],
+                                           alpha[i, j, k, element])
+        end
+        for k in 2:nnodes(dg), j in eachnode(dg), i in eachnode(dg)
+            alpha3[i, j, k, element] = max(alpha[i, j, k - 1, element],
+                                           alpha[i, j, k, element])
+        end
+        alpha1[1, :, :, element] .= zero(eltype(alpha1))
+        alpha1[nnodes(dg) + 1, :, :, element] .= zero(eltype(alpha1))
+        alpha2[:, 1, :, element] .= zero(eltype(alpha2))
+        alpha2[:, nnodes(dg) + 1, :, element] .= zero(eltype(alpha2))
+        alpha3[:, :, 1, element] .= zero(eltype(alpha3))
+        alpha3[:, :, nnodes(dg) + 1, element] .= zero(eltype(alpha3))
+    end
+
+    return nothing
+end
+
+###############################################################################
+# Global positivity limiting of conservative variables
+
+@inline function idp_positivity_conservative!(alpha, limiter,
+                                              u::AbstractArray{<:Real, 5}, dt, semi,
+                                              variable)
+    mesh, _, dg, cache = mesh_equations_solver_cache(semi)
+    (; antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R) = cache.antidiffusive_fluxes
+    (; inverse_weights) = dg.basis
+    (; positivity_correction_factor) = limiter
+
+    (; variable_bounds) = limiter.cache.subcell_limiter_coefficients
+    var_min = variable_bounds[Symbol(string(variable), "_min")]
+
+    @threaded for element in eachelement(dg, semi.cache)
+        for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            inverse_jacobian = get_inverse_jacobian(cache.elements.inverse_jacobian,
+                                                    mesh, i, j, k, element)
+            var = u[variable, i, j, k, element]
+            if var < 0
+                error("Safe low-order method produces negative value for conservative variable $variable. Try a smaller time step.")
+            end
+
+            # Compute bound
+            if limiter.local_twosided &&
+               variable in limiter.local_twosided_variables_cons &&
+               var_min[i, j, k, element] >= positivity_correction_factor * var
+                # Local limiting is more restrictive that positivity limiting
+                # => Skip positivity limiting for this node
+                continue
+            end
+            var_min[i, j, k, element] = positivity_correction_factor * var
+
+            # Real one-sided Zalesak-type limiter
+            # * Zalesak (1979). "Fully multidimensional flux-corrected transport algorithms for fluids"
+            # * Kuzmin et al. (2010). "Failsafe flux limiting and constrained data projections for equations of gas dynamics"
+            # Note: The Zalesak limiter has to be computed, even if the state is valid, because the correction is
+            #       for each interface, not each node
+            Qm = min(0, (var_min[i, j, k, element] - var) / dt)
+
+            # Calculate Pm
+            # Note: Boundaries of antidiffusive_flux1/2 are constant 0, so they make no difference here.
+            val_flux1_local = inverse_weights[i] *
+                              antidiffusive_flux1_R[variable, i, j, k, element]
+            val_flux1_local_ip1 = -inverse_weights[i] *
+                                  antidiffusive_flux1_L[variable, i + 1, j, k, element]
+            val_flux2_local = inverse_weights[j] *
+                              antidiffusive_flux2_R[variable, i, j, k, element]
+            val_flux2_local_jp1 = -inverse_weights[j] *
+                                  antidiffusive_flux2_L[variable, i, j + 1, k, element]
+            val_flux3_local = inverse_weights[k] *
+                              antidiffusive_flux3_R[variable, i, j, k, element]
+            val_flux3_local_jp1 = -inverse_weights[k] *
+                                  antidiffusive_flux3_L[variable, i, j, k + 1, element]
+
+            Pm = min(0, val_flux1_local) + min(0, val_flux1_local_ip1) +
+                 min(0, val_flux2_local) + min(0, val_flux2_local_jp1) +
+                 min(0, val_flux3_local) + min(0, val_flux3_local_jp1)
+            Pm = inverse_jacobian * Pm
+
+            # Compute blending coefficient avoiding division by zero
+            # (as in paper of [Guermond, Nazarov, Popov, Thomas] (4.8))
+            Qm = abs(Qm) / (abs(Pm) + eps(typeof(Qm)) * 100)
+
+            # Calculate alpha
+            alpha[i, j, k, element] = max(alpha[i, j, k, element], 1 - Qm)
+        end
+    end
+
+    return nothing
+end
+end # @muladd

From 970458495e225c414e4bbd285a6bc965ee3e81f3 Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Wed, 24 Sep 2025 14:46:41 +0200
Subject: [PATCH 06/24] Implement suggestions

---
 .../dgsem_tree/dg_3d_subcell_limiters.jl      | 20 +------------------
 src/solvers/dgsem_tree/subcell_limiters_3d.jl |  6 ++----
 2 files changed, 3 insertions(+), 23 deletions(-)

diff --git a/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl b/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
index dc8f3f3f267..1148b2ca68f 100644
--- a/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
+++ b/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
@@ -46,25 +46,7 @@ function create_cache(mesh::P4estMesh{3},
                                                                  nnodes(dg))
 
     if have_nonconservative_terms(equations) == true
-        # Extract the nonconservative flux as a dispatch argument for `n_nonconservative_terms`
-        _, volume_flux_noncons = volume_integral.volume_flux_dg
-
-        flux_nonconservative_temp_threaded = A5d[A5d(undef, nvariables(equations),
-                                                     n_nonconservative_terms(volume_flux_noncons),
-                                                     nnodes(dg), nnodes(dg),
-                                                     nnodes(dg))
-                                                 for _ in 1:Threads.nthreads()]
-        fhat_nonconservative_temp_threaded = A5d[A5d(undef, nvariables(equations),
-                                                     n_nonconservative_terms(volume_flux_noncons),
-                                                     nnodes(dg), nnodes(dg),
-                                                     nnodes(dg))
-                                                 for _ in 1:Threads.nthreads()]
-        phi_threaded = A5d[A5d(undef, nvariables(equations),
-                               n_nonconservative_terms(volume_flux_noncons),
-                               nnodes(dg), nnodes(dg), nnodes(dg))
-                           for _ in 1:Threads.nthreads()]
-        cache = (; cache..., flux_nonconservative_temp_threaded,
-                 fhat_nonconservative_temp_threaded, phi_threaded)
+        error("Unsupported system of equations with nonconservative terms")
     end
 
     return (; cache..., antidiffusive_fluxes,
diff --git a/src/solvers/dgsem_tree/subcell_limiters_3d.jl b/src/solvers/dgsem_tree/subcell_limiters_3d.jl
index 8280eac03c1..f186364f3c2 100644
--- a/src/solvers/dgsem_tree/subcell_limiters_3d.jl
+++ b/src/solvers/dgsem_tree/subcell_limiters_3d.jl
@@ -39,15 +39,13 @@ function (limiter::SubcellLimiterIDP)(u::AbstractArray{<:Any, 5},
     @trixi_timeit timer() "reset alpha" reset_du!(alpha, dg, semi.cache)
 
     if limiter.local_twosided
-        @trixi_timeit timer() "local twosided" idp_local_twosided!(alpha, limiter,
-                                                                   u, t, dt, semi)
+        error("Unsupported type of limiter: local_twosided_variables_cons")
     end
     if limiter.positivity
         @trixi_timeit timer() "positivity" idp_positivity!(alpha, limiter, u, dt, semi)
     end
     if limiter.local_onesided
-        @trixi_timeit timer() "local onesided" idp_local_onesided!(alpha, limiter,
-                                                                   u, t, dt, semi)
+        error("Unsupported type of limiter: local_onesided_variables_nonlinear")
     end
 
     # Calculate alpha1, alpha2 and alpha3

From 29a255b3524a0528287095a8526735c1ea992e62 Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Wed, 1 Oct 2025 11:56:06 +0200
Subject: [PATCH 07/24] Apply changes to 3d code (remove `alpha1/2/3`)

---
 src/solvers/dgsem_tree/subcell_limiters_3d.jl | 44 -------------------
 1 file changed, 44 deletions(-)

diff --git a/src/solvers/dgsem_tree/subcell_limiters_3d.jl b/src/solvers/dgsem_tree/subcell_limiters_3d.jl
index f186364f3c2..73ba3190404 100644
--- a/src/solvers/dgsem_tree/subcell_limiters_3d.jl
+++ b/src/solvers/dgsem_tree/subcell_limiters_3d.jl
@@ -30,50 +30,6 @@ function create_cache(limiter::Type{SubcellLimiterIDP}, equations::AbstractEquat
             idp_bounds_delta_global)
 end
 
-function (limiter::SubcellLimiterIDP)(u::AbstractArray{<:Any, 5},
-                                      semi, equations, dg::DGSEM,
-                                      t, dt;
-                                      kwargs...)
-    @unpack alpha = limiter.cache.subcell_limiter_coefficients
-    # TODO: Do not abuse `reset_du!` but maybe implement a generic `set_zero!`
-    @trixi_timeit timer() "reset alpha" reset_du!(alpha, dg, semi.cache)
-
-    if limiter.local_twosided
-        error("Unsupported type of limiter: local_twosided_variables_cons")
-    end
-    if limiter.positivity
-        @trixi_timeit timer() "positivity" idp_positivity!(alpha, limiter, u, dt, semi)
-    end
-    if limiter.local_onesided
-        error("Unsupported type of limiter: local_onesided_variables_nonlinear")
-    end
-
-    # Calculate alpha1, alpha2 and alpha3
-    @unpack alpha1, alpha2, alpha3 = limiter.cache.subcell_limiter_coefficients
-    @threaded for element in eachelement(dg, semi.cache)
-        for k in eachnode(dg), j in eachnode(dg), i in 2:nnodes(dg)
-            alpha1[i, j, k, element] = max(alpha[i - 1, j, k, element],
-                                           alpha[i, j, k, element])
-        end
-        for k in eachnode(dg), j in 2:nnodes(dg), i in eachnode(dg)
-            alpha2[i, j, k, element] = max(alpha[i, j - 1, k, element],
-                                           alpha[i, j, k, element])
-        end
-        for k in 2:nnodes(dg), j in eachnode(dg), i in eachnode(dg)
-            alpha3[i, j, k, element] = max(alpha[i, j, k - 1, element],
-                                           alpha[i, j, k, element])
-        end
-        alpha1[1, :, :, element] .= zero(eltype(alpha1))
-        alpha1[nnodes(dg) + 1, :, :, element] .= zero(eltype(alpha1))
-        alpha2[:, 1, :, element] .= zero(eltype(alpha2))
-        alpha2[:, nnodes(dg) + 1, :, element] .= zero(eltype(alpha2))
-        alpha3[:, :, 1, element] .= zero(eltype(alpha3))
-        alpha3[:, :, nnodes(dg) + 1, element] .= zero(eltype(alpha3))
-    end
-
-    return nothing
-end
-
 ###############################################################################
 # Global positivity limiting of conservative variables
 

From 3e829329b083a0afdfbdb00bd7c07c02cc59c984 Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Wed, 1 Oct 2025 16:24:34 +0200
Subject: [PATCH 08/24] Reset antidiffusive fluxes after resizing in 3d

---
 src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl b/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
index 1148b2ca68f..876431234f0 100644
--- a/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
+++ b/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
@@ -147,21 +147,6 @@ end
         end
     end
 
-    antidiffusive_flux1_L[:, 1, :, :, element] .= zero(eltype(antidiffusive_flux1_L))
-    antidiffusive_flux1_L[:, nnodes(dg) + 1, :, :, element] .= zero(eltype(antidiffusive_flux1_L))
-    antidiffusive_flux1_R[:, 1, :, :, element] .= zero(eltype(antidiffusive_flux1_R))
-    antidiffusive_flux1_R[:, nnodes(dg) + 1, :, :, element] .= zero(eltype(antidiffusive_flux1_R))
-
-    antidiffusive_flux2_L[:, :, 1, :, element] .= zero(eltype(antidiffusive_flux2_L))
-    antidiffusive_flux2_L[:, :, nnodes(dg) + 1, :, element] .= zero(eltype(antidiffusive_flux2_L))
-    antidiffusive_flux2_R[:, :, 1, :, element] .= zero(eltype(antidiffusive_flux2_R))
-    antidiffusive_flux2_R[:, :, nnodes(dg) + 1, :, element] .= zero(eltype(antidiffusive_flux2_R))
-
-    antidiffusive_flux3_L[:, :, :, 1, element] .= zero(eltype(antidiffusive_flux3_L))
-    antidiffusive_flux3_L[:, :, :, nnodes(dg) + 1, element] .= zero(eltype(antidiffusive_flux3_L))
-    antidiffusive_flux3_R[:, :, :, 1, element] .= zero(eltype(antidiffusive_flux3_R))
-    antidiffusive_flux3_R[:, :, :, nnodes(dg) + 1, element] .= zero(eltype(antidiffusive_flux3_R))
-
     return nothing
 end
 end # @muladd

From df429c8146a867b07b1ce58ad5ee0341ca3bfdee Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Wed, 1 Oct 2025 17:25:43 +0200
Subject: [PATCH 09/24] Nicer line breaks

---
 src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl b/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
index 876431234f0..ee76e01c0a5 100644
--- a/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
+++ b/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
@@ -123,8 +123,8 @@ end
         for v in eachvariable(equations)
             antidiffusive_flux1_L[v, i, j, k, element] = fhat1_L[v, i, j, k] -
                                                          fstar1_L[v, i, j, k]
-            antidiffusive_flux1_R[v, i, j, k, element] = antidiffusive_flux1_L[v, i, j,
-                                                                               k,
+            antidiffusive_flux1_R[v, i, j, k, element] = antidiffusive_flux1_L[v,
+                                                                               i, j, k,
                                                                                element]
         end
     end
@@ -132,8 +132,8 @@ end
         for v in eachvariable(equations)
             antidiffusive_flux2_L[v, i, j, k, element] = fhat2_L[v, i, j, k] -
                                                          fstar2_L[v, i, j, k]
-            antidiffusive_flux2_R[v, i, j, k, element] = antidiffusive_flux2_L[v, i, j,
-                                                                               k,
+            antidiffusive_flux2_R[v, i, j, k, element] = antidiffusive_flux2_L[v,
+                                                                               i, j, k,
                                                                                element]
         end
     end
@@ -141,8 +141,8 @@ end
         for v in eachvariable(equations)
             antidiffusive_flux3_L[v, i, j, k, element] = fhat3_L[v, i, j, k] -
                                                          fstar3_L[v, i, j, k]
-            antidiffusive_flux3_R[v, i, j, k, element] = antidiffusive_flux3_L[v, i, j,
-                                                                               k,
+            antidiffusive_flux3_R[v, i, j, k, element] = antidiffusive_flux3_L[v,
+                                                                               i, j, k,
                                                                                element]
         end
     end

From f1e4bece629b598c8e48cca36679be86b2d8ce3a Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Mon, 6 Oct 2025 10:58:25 +0200
Subject: [PATCH 10/24] Move funcs to p4est files

---
 src/solvers/dgsem_structured/dg.jl            |   1 -
 .../dg_3d_subcell_limiters.jl                 | 156 ------------------
 src/solvers/dgsem_tree/dg.jl                  |   1 -
 .../dgsem_tree/dg_3d_subcell_limiters.jl      | 152 -----------------
 4 files changed, 310 deletions(-)
 delete mode 100644 src/solvers/dgsem_structured/dg_3d_subcell_limiters.jl
 delete mode 100644 src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl

diff --git a/src/solvers/dgsem_structured/dg.jl b/src/solvers/dgsem_structured/dg.jl
index 39f4a504373..cc0243a9989 100644
--- a/src/solvers/dgsem_structured/dg.jl
+++ b/src/solvers/dgsem_structured/dg.jl
@@ -210,7 +210,6 @@ include("indicators_3d.jl")
 
 include("subcell_limiters_2d.jl")
 include("dg_2d_subcell_limiters.jl")
-include("dg_3d_subcell_limiters.jl")
 
 # Specialized implementations used to improve performance
 include("dg_2d_compressible_euler.jl")
diff --git a/src/solvers/dgsem_structured/dg_3d_subcell_limiters.jl b/src/solvers/dgsem_structured/dg_3d_subcell_limiters.jl
deleted file mode 100644
index 769b1a73921..00000000000
--- a/src/solvers/dgsem_structured/dg_3d_subcell_limiters.jl
+++ /dev/null
@@ -1,156 +0,0 @@
-# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
-# Since these FMAs can increase the performance of many numerical algorithms,
-# we need to opt-in explicitly.
-# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
-@muladd begin
-#! format: noindent
-
-# Calculate the DG staggered volume fluxes `fhat` in subcell FV-form inside the element
-# (**without non-conservative terms**).
-#
-# See also `flux_differencing_kernel!`.
-@inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R, u,
-                                mesh::P4estMesh{3},
-                                nonconservative_terms::False, equations,
-                                volume_flux, dg::DGSEM, element, cache)
-    (; contravariant_vectors) = cache.elements
-    (; weights, derivative_split) = dg.basis
-    (; flux_temp_threaded) = cache
-
-    flux_temp = flux_temp_threaded[Threads.threadid()]
-
-    # The FV-form fluxes are calculated in a recursive manner, i.e.:
-    # fhat_(0,1)   = w_0 * FVol_0,
-    # fhat_(j,j+1) = fhat_(j-1,j) + w_j * FVol_j,   for j=1,...,N-1,
-    # with the split form volume fluxes FVol_j = -2 * sum_i=0^N D_ji f*_(j,i).
-
-    # To use the symmetry of the `volume_flux`, the split form volume flux is precalculated
-    # like in `calc_volume_integral!` for the `VolumeIntegralFluxDifferencing`
-    # and saved in in `flux_temp`.
-
-    # Split form volume flux in orientation 1: x direction
-    flux_temp .= zero(eltype(flux_temp))
-
-    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
-        u_node = get_node_vars(u, equations, dg, i, j, k, element)
-
-        # pull the contravariant vectors in each coordinate direction
-        Ja1_node = get_contravariant_vector(1, contravariant_vectors, i, j, k, element) # x direction
-
-        # All diagonal entries of `derivative_split` are zero. Thus, we can skip
-        # the computation of the diagonal terms. In addition, we use the symmetry
-        # of the `volume_flux` to save half of the possible two-point flux
-        # computations.
-
-        # x direction
-        for ii in (i + 1):nnodes(dg)
-            u_node_ii = get_node_vars(u, equations, dg, ii, j, k, element)
-            # pull the contravariant vectors and compute the average
-            Ja1_node_ii = get_contravariant_vector(1, contravariant_vectors, ii, j, k,
-                                                   element)
-            Ja1_avg = 0.5f0 * (Ja1_node + Ja1_node_ii)
-
-            # compute the contravariant sharp flux in the direction of the averaged contravariant vector
-            fluxtilde1 = volume_flux(u_node, u_node_ii, Ja1_avg, equations)
-            multiply_add_to_node_vars!(flux_temp, derivative_split[i, ii], fluxtilde1,
-                                       equations, dg, i, j, k)
-            multiply_add_to_node_vars!(flux_temp, derivative_split[ii, i], fluxtilde1,
-                                       equations, dg, ii, j, k)
-        end
-    end
-
-    # FV-form flux `fhat` in x direction
-    fhat1_L[:, 1, :, :] .= zero(eltype(fhat1_L))
-    fhat1_L[:, nnodes(dg) + 1, :, :] .= zero(eltype(fhat1_L))
-    fhat1_R[:, 1, :, :] .= zero(eltype(fhat1_R))
-    fhat1_R[:, nnodes(dg) + 1, :, :] .= zero(eltype(fhat1_R))
-
-    for k in eachnode(dg), j in eachnode(dg), i in 1:(nnodes(dg) - 1),
-        v in eachvariable(equations)
-
-        fhat1_L[v, i + 1, j, k] = fhat1_L[v, i, j, k] +
-                                  weights[i] * flux_temp[v, i, j, k]
-        fhat1_R[v, i + 1, j, k] = fhat1_L[v, i + 1, j, k]
-    end
-
-    # Split form volume flux in orientation 2: y direction
-    flux_temp .= zero(eltype(flux_temp))
-
-    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
-        u_node = get_node_vars(u, equations, dg, i, j, k, element)
-
-        # pull the contravariant vectors in each coordinate direction
-        Ja2_node = get_contravariant_vector(2, contravariant_vectors, i, j, k, element)
-
-        # y direction
-        for jj in (j + 1):nnodes(dg)
-            u_node_jj = get_node_vars(u, equations, dg, i, jj, k, element)
-            # pull the contravariant vectors and compute the average
-            Ja2_node_jj = get_contravariant_vector(2, contravariant_vectors, i, jj, k,
-                                                   element)
-            Ja2_avg = 0.5f0 * (Ja2_node + Ja2_node_jj)
-            # compute the contravariant sharp flux in the direction of the averaged contravariant vector
-            fluxtilde2 = volume_flux(u_node, u_node_jj, Ja2_avg, equations)
-            multiply_add_to_node_vars!(flux_temp, derivative_split[j, jj], fluxtilde2,
-                                       equations, dg, i, j, k)
-            multiply_add_to_node_vars!(flux_temp, derivative_split[jj, j], fluxtilde2,
-                                       equations, dg, i, jj, k)
-        end
-    end
-
-    # FV-form flux `fhat` in y direction
-    fhat2_L[:, :, 1, :] .= zero(eltype(fhat2_L))
-    fhat2_L[:, :, nnodes(dg) + 1, :] .= zero(eltype(fhat2_L))
-    fhat2_R[:, :, 1, :] .= zero(eltype(fhat2_R))
-    fhat2_R[:, :, nnodes(dg) + 1, :] .= zero(eltype(fhat2_R))
-
-    for k in eachnode(dg), j in 1:(nnodes(dg) - 1), i in eachnode(dg),
-        v in eachvariable(equations)
-
-        fhat2_L[v, i, j + 1, k] = fhat2_L[v, i, j, k] +
-                                  weights[j] * flux_temp[v, i, j, k]
-        fhat2_R[v, i, j + 1, k] = fhat2_L[v, i, j + 1, k]
-    end
-
-    # Split form volume flux in orientation 3: z direction
-    flux_temp .= zero(eltype(flux_temp))
-
-    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
-        u_node = get_node_vars(u, equations, dg, i, j, k, element)
-
-        # pull the contravariant vectors in each coordinate direction
-        Ja3_node = get_contravariant_vector(3, contravariant_vectors, i, j, k, element)
-
-        # y direction
-        for kk in (k + 1):nnodes(dg)
-            u_node_kk = get_node_vars(u, equations, dg, i, j, kk, element)
-            # pull the contravariant vectors and compute the average
-            Ja3_node_kk = get_contravariant_vector(3, contravariant_vectors, i, j, kk,
-                                                   element)
-            Ja3_avg = 0.5f0 * (Ja3_node + Ja3_node_kk)
-            # compute the contravariant sharp flux in the direction of the averaged contravariant vector
-            fluxtilde3 = volume_flux(u_node, u_node_kk, Ja3_avg, equations)
-            multiply_add_to_node_vars!(flux_temp, derivative_split[k, kk], fluxtilde3,
-                                       equations, dg, i, j, k)
-            multiply_add_to_node_vars!(flux_temp, derivative_split[kk, k], fluxtilde3,
-                                       equations, dg, i, j, kk)
-        end
-    end
-
-    # FV-form flux `fhat` in y direction
-    fhat3_L[:, :, :, 1] .= zero(eltype(fhat3_L))
-    fhat3_L[:, :, :, nnodes(dg) + 1] .= zero(eltype(fhat3_L))
-    fhat3_R[:, :, :, 1] .= zero(eltype(fhat3_R))
-    fhat3_R[:, :, :, nnodes(dg) + 1] .= zero(eltype(fhat3_R))
-
-    for k in 1:(nnodes(dg) - 1), j in eachnode(dg), i in eachnode(dg),
-        v in eachvariable(equations)
-
-        fhat3_L[v, i, j, k + 1] = fhat3_L[v, i, j, k] +
-                                  weights[k] * flux_temp[v, i, j, k]
-        fhat3_R[v, i, j, k + 1] = fhat3_L[v, i, j, k + 1]
-    end
-
-    return nothing
-end
-end # @muladd
diff --git a/src/solvers/dgsem_tree/dg.jl b/src/solvers/dgsem_tree/dg.jl
index c389af89880..561576c3df6 100644
--- a/src/solvers/dgsem_tree/dg.jl
+++ b/src/solvers/dgsem_tree/dg.jl
@@ -64,5 +64,4 @@ include("subcell_limiters.jl")
 include("subcell_limiters_2d.jl")
 include("subcell_limiters_3d.jl")
 include("dg_2d_subcell_limiters.jl")
-include("dg_3d_subcell_limiters.jl")
 end # @muladd
diff --git a/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl b/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
deleted file mode 100644
index ee76e01c0a5..00000000000
--- a/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
+++ /dev/null
@@ -1,152 +0,0 @@
-# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
-# Since these FMAs can increase the performance of many numerical algorithms,
-# we need to opt-in explicitly.
-# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
-@muladd begin
-#! format: noindent
-
-function create_cache(mesh::P4estMesh{3},
-                      equations, volume_integral::VolumeIntegralSubcellLimiting,
-                      dg::DG, uEltype)
-    cache = create_cache(mesh, equations,
-                         VolumeIntegralPureLGLFiniteVolume(volume_integral.volume_flux_fv),
-                         dg, uEltype)
-
-    A4dp1_x = Array{uEltype, 4}
-    A4dp1_y = Array{uEltype, 4}
-    A4dp1_z = Array{uEltype, 4}
-    A4d = Array{uEltype, 4}
-    A5d = Array{uEltype, 5}
-
-    fhat1_L_threaded = A4dp1_x[A4dp1_x(undef, nvariables(equations), nnodes(dg) + 1,
-                                       nnodes(dg), nnodes(dg))
-                               for _ in 1:Threads.nthreads()]
-    fhat1_R_threaded = A4dp1_x[A4dp1_x(undef, nvariables(equations), nnodes(dg) + 1,
-                                       nnodes(dg), nnodes(dg))
-                               for _ in 1:Threads.nthreads()]
-    fhat2_L_threaded = A4dp1_y[A4dp1_y(undef, nvariables(equations), nnodes(dg),
-                                       nnodes(dg) + 1, nnodes(dg))
-                               for _ in 1:Threads.nthreads()]
-    fhat2_R_threaded = A4dp1_y[A4dp1_y(undef, nvariables(equations), nnodes(dg),
-                                       nnodes(dg) + 1, nnodes(dg))
-                               for _ in 1:Threads.nthreads()]
-    fhat3_L_threaded = A4dp1_z[A4dp1_z(undef, nvariables(equations), nnodes(dg),
-                                       nnodes(dg), nnodes(dg) + 1)
-                               for _ in 1:Threads.nthreads()]
-    fhat3_R_threaded = A4dp1_z[A4dp1_z(undef, nvariables(equations), nnodes(dg),
-                                       nnodes(dg), nnodes(dg) + 1)
-                               for _ in 1:Threads.nthreads()]
-    flux_temp_threaded = A4d[A4d(undef, nvariables(equations), nnodes(dg), nnodes(dg),
-                                 nnodes(dg))
-                             for _ in 1:Threads.nthreads()]
-    fhat_temp_threaded = A4d[A4d(undef, nvariables(equations), nnodes(dg),
-                                 nnodes(dg), nnodes(dg)) for _ in 1:Threads.nthreads()]
-    antidiffusive_fluxes = ContainerAntidiffusiveFlux3D{uEltype}(0,
-                                                                 nvariables(equations),
-                                                                 nnodes(dg))
-
-    if have_nonconservative_terms(equations) == true
-        error("Unsupported system of equations with nonconservative terms")
-    end
-
-    return (; cache..., antidiffusive_fluxes,
-            fhat1_L_threaded, fhat1_R_threaded, fhat2_L_threaded, fhat2_R_threaded,
-            fhat3_L_threaded, fhat3_R_threaded, flux_temp_threaded, fhat_temp_threaded)
-end
-
-@inline function subcell_limiting_kernel!(du, u, element,
-                                          mesh::P4estMesh{3},
-                                          nonconservative_terms, equations,
-                                          volume_integral, limiter::SubcellLimiterIDP,
-                                          dg::DGSEM, cache)
-    @unpack inverse_weights = dg.basis
-    @unpack volume_flux_dg, volume_flux_fv = volume_integral
-
-    # high-order DG fluxes
-    @unpack fhat1_L_threaded, fhat1_R_threaded, fhat2_L_threaded, fhat2_R_threaded, fhat3_L_threaded, fhat3_R_threaded = cache
-
-    fhat1_L = fhat1_L_threaded[Threads.threadid()]
-    fhat1_R = fhat1_R_threaded[Threads.threadid()]
-    fhat2_L = fhat2_L_threaded[Threads.threadid()]
-    fhat2_R = fhat2_R_threaded[Threads.threadid()]
-    fhat3_L = fhat3_L_threaded[Threads.threadid()]
-    fhat3_R = fhat3_R_threaded[Threads.threadid()]
-    calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R, u, mesh,
-                   nonconservative_terms, equations, volume_flux_dg, dg, element,
-                   cache)
-
-    # low-order FV fluxes
-    @unpack fstar1_L_threaded, fstar1_R_threaded, fstar2_L_threaded, fstar2_R_threaded, fstar3_L_threaded, fstar3_R_threaded = cache
-
-    fstar1_L = fstar1_L_threaded[Threads.threadid()]
-    fstar1_R = fstar1_R_threaded[Threads.threadid()]
-    fstar2_L = fstar2_L_threaded[Threads.threadid()]
-    fstar2_R = fstar2_R_threaded[Threads.threadid()]
-    fstar3_L = fstar3_L_threaded[Threads.threadid()]
-    fstar3_R = fstar3_R_threaded[Threads.threadid()]
-    calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, mesh,
-                 nonconservative_terms, equations, volume_flux_fv, dg, element,
-                 cache)
-
-    # antidiffusive flux
-    calcflux_antidiffusive!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R,
-                            fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R,
-                            u, mesh, nonconservative_terms, equations, limiter, dg,
-                            element, cache)
-
-    # Calculate volume integral contribution of low-order FV flux
-    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
-        for v in eachvariable(equations)
-            du[v, i, j, k, element] += inverse_weights[i] *
-                                       (fstar1_L[v, i + 1, j, k] - fstar1_R[v, i, j, k]) +
-                                       inverse_weights[j] *
-                                       (fstar2_L[v, i, j + 1, k] - fstar2_R[v, i, j, k]) +
-                                       inverse_weights[k] *
-                                       (fstar3_L[v, i, j, k + 1] - fstar3_R[v, i, j, k])
-        end
-    end
-
-    return nothing
-end
-
-# Calculate the antidiffusive flux `antidiffusive_flux` as the subtraction between `fhat` and `fstar` for conservative systems.
-@inline function calcflux_antidiffusive!(fhat1_L, fhat1_R, fhat2_L, fhat2_R,
-                                         fhat3_L, fhat3_R,
-                                         fstar1_L, fstar1_R, fstar2_L, fstar2_R,
-                                         fstar3_L, fstar3_R,
-                                         u, mesh::P4estMesh{3},
-                                         nonconservative_terms::False, equations,
-                                         limiter::SubcellLimiterIDP, dg, element, cache)
-    @unpack antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R = cache.antidiffusive_fluxes
-
-    for k in eachnode(dg), j in eachnode(dg), i in 2:nnodes(dg)
-        for v in eachvariable(equations)
-            antidiffusive_flux1_L[v, i, j, k, element] = fhat1_L[v, i, j, k] -
-                                                         fstar1_L[v, i, j, k]
-            antidiffusive_flux1_R[v, i, j, k, element] = antidiffusive_flux1_L[v,
-                                                                               i, j, k,
-                                                                               element]
-        end
-    end
-    for k in eachnode(dg), j in 2:nnodes(dg), i in eachnode(dg)
-        for v in eachvariable(equations)
-            antidiffusive_flux2_L[v, i, j, k, element] = fhat2_L[v, i, j, k] -
-                                                         fstar2_L[v, i, j, k]
-            antidiffusive_flux2_R[v, i, j, k, element] = antidiffusive_flux2_L[v,
-                                                                               i, j, k,
-                                                                               element]
-        end
-    end
-    for k in 2:nnodes(dg), j in eachnode(dg), i in eachnode(dg)
-        for v in eachvariable(equations)
-            antidiffusive_flux3_L[v, i, j, k, element] = fhat3_L[v, i, j, k] -
-                                                         fstar3_L[v, i, j, k]
-            antidiffusive_flux3_R[v, i, j, k, element] = antidiffusive_flux3_L[v,
-                                                                               i, j, k,
-                                                                               element]
-        end
-    end
-
-    return nothing
-end
-end # @muladd

From 8d1b7428ca8880a3d752bd8e96af424ed44c7f05 Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Mon, 6 Oct 2025 11:07:10 +0200
Subject: [PATCH 11/24] Move even more

---
 src/solvers/dgsem_tree/dg.jl                  |   1 -
 src/solvers/dgsem_tree/subcell_limiters_3d.jl | 104 ------------------
 2 files changed, 105 deletions(-)
 delete mode 100644 src/solvers/dgsem_tree/subcell_limiters_3d.jl

diff --git a/src/solvers/dgsem_tree/dg.jl b/src/solvers/dgsem_tree/dg.jl
index 561576c3df6..54d71af2b66 100644
--- a/src/solvers/dgsem_tree/dg.jl
+++ b/src/solvers/dgsem_tree/dg.jl
@@ -62,6 +62,5 @@ include("dg_3d_compressible_euler.jl")
 # Subcell limiters
 include("subcell_limiters.jl")
 include("subcell_limiters_2d.jl")
-include("subcell_limiters_3d.jl")
 include("dg_2d_subcell_limiters.jl")
 end # @muladd
diff --git a/src/solvers/dgsem_tree/subcell_limiters_3d.jl b/src/solvers/dgsem_tree/subcell_limiters_3d.jl
deleted file mode 100644
index 73ba3190404..00000000000
--- a/src/solvers/dgsem_tree/subcell_limiters_3d.jl
+++ /dev/null
@@ -1,104 +0,0 @@
-# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
-# Since these FMAs can increase the performance of many numerical algorithms,
-# we need to opt-in explicitly.
-# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
-@muladd begin
-#! format: noindent
-
-###############################################################################
-# IDP Limiting
-###############################################################################
-
-# this method is used when the limiter is constructed as for shock-capturing volume integrals
-function create_cache(limiter::Type{SubcellLimiterIDP}, equations::AbstractEquations{3},
-                      basis::LobattoLegendreBasis, bound_keys)
-    subcell_limiter_coefficients = Trixi.ContainerSubcellLimiterIDP3D{real(basis)}(0,
-                                                                                   nnodes(basis),
-                                                                                   bound_keys)
-
-    # Memory for bounds checking routine with `BoundsCheckCallback`.
-    # Local variable contains the maximum deviation since the last export.
-    idp_bounds_delta_local = Dict{Symbol, real(basis)}()
-    # Global variable contains the total maximum deviation.
-    idp_bounds_delta_global = Dict{Symbol, real(basis)}()
-    for key in bound_keys
-        idp_bounds_delta_local[key] = zero(real(basis))
-        idp_bounds_delta_global[key] = zero(real(basis))
-    end
-
-    return (; subcell_limiter_coefficients, idp_bounds_delta_local,
-            idp_bounds_delta_global)
-end
-
-###############################################################################
-# Global positivity limiting of conservative variables
-
-@inline function idp_positivity_conservative!(alpha, limiter,
-                                              u::AbstractArray{<:Real, 5}, dt, semi,
-                                              variable)
-    mesh, _, dg, cache = mesh_equations_solver_cache(semi)
-    (; antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R) = cache.antidiffusive_fluxes
-    (; inverse_weights) = dg.basis
-    (; positivity_correction_factor) = limiter
-
-    (; variable_bounds) = limiter.cache.subcell_limiter_coefficients
-    var_min = variable_bounds[Symbol(string(variable), "_min")]
-
-    @threaded for element in eachelement(dg, semi.cache)
-        for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
-            inverse_jacobian = get_inverse_jacobian(cache.elements.inverse_jacobian,
-                                                    mesh, i, j, k, element)
-            var = u[variable, i, j, k, element]
-            if var < 0
-                error("Safe low-order method produces negative value for conservative variable $variable. Try a smaller time step.")
-            end
-
-            # Compute bound
-            if limiter.local_twosided &&
-               variable in limiter.local_twosided_variables_cons &&
-               var_min[i, j, k, element] >= positivity_correction_factor * var
-                # Local limiting is more restrictive that positivity limiting
-                # => Skip positivity limiting for this node
-                continue
-            end
-            var_min[i, j, k, element] = positivity_correction_factor * var
-
-            # Real one-sided Zalesak-type limiter
-            # * Zalesak (1979). "Fully multidimensional flux-corrected transport algorithms for fluids"
-            # * Kuzmin et al. (2010). "Failsafe flux limiting and constrained data projections for equations of gas dynamics"
-            # Note: The Zalesak limiter has to be computed, even if the state is valid, because the correction is
-            #       for each interface, not each node
-            Qm = min(0, (var_min[i, j, k, element] - var) / dt)
-
-            # Calculate Pm
-            # Note: Boundaries of antidiffusive_flux1/2 are constant 0, so they make no difference here.
-            val_flux1_local = inverse_weights[i] *
-                              antidiffusive_flux1_R[variable, i, j, k, element]
-            val_flux1_local_ip1 = -inverse_weights[i] *
-                                  antidiffusive_flux1_L[variable, i + 1, j, k, element]
-            val_flux2_local = inverse_weights[j] *
-                              antidiffusive_flux2_R[variable, i, j, k, element]
-            val_flux2_local_jp1 = -inverse_weights[j] *
-                                  antidiffusive_flux2_L[variable, i, j + 1, k, element]
-            val_flux3_local = inverse_weights[k] *
-                              antidiffusive_flux3_R[variable, i, j, k, element]
-            val_flux3_local_jp1 = -inverse_weights[k] *
-                                  antidiffusive_flux3_L[variable, i, j, k + 1, element]
-
-            Pm = min(0, val_flux1_local) + min(0, val_flux1_local_ip1) +
-                 min(0, val_flux2_local) + min(0, val_flux2_local_jp1) +
-                 min(0, val_flux3_local) + min(0, val_flux3_local_jp1)
-            Pm = inverse_jacobian * Pm
-
-            # Compute blending coefficient avoiding division by zero
-            # (as in paper of [Guermond, Nazarov, Popov, Thomas] (4.8))
-            Qm = abs(Qm) / (abs(Pm) + eps(typeof(Qm)) * 100)
-
-            # Calculate alpha
-            alpha[i, j, k, element] = max(alpha[i, j, k, element], 1 - Qm)
-        end
-    end
-
-    return nothing
-end
-end # @muladd

From 50c681f80817059ad3a6fbfcd1f62b1dab12f6d9 Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Mon, 6 Oct 2025 11:12:23 +0200
Subject: [PATCH 12/24] Reduce allowed allocs in test

---
 test/test_p4est_3d.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_p4est_3d.jl b/test/test_p4est_3d.jl
index b5908bc8b03..54d28bb7c80 100644
--- a/test/test_p4est_3d.jl
+++ b/test/test_p4est_3d.jl
@@ -394,7 +394,7 @@ end
     # integrator which are not *recorded* for the methods from
     # OrdinaryDiffEq.jl
     # Corresponding issue: https://github.com/trixi-framework/Trixi.jl/issues/1877
-    @test_allocations(Trixi.rhs!, semi, sol, 15_000)
+    @test_allocations(Trixi.rhs!, semi, sol, 10_000)
 end
 
 @trixi_testset "elixir_euler_sedov_sc_subcell.jl (local bounds)" begin

From 7dbc8cdb2e047dbe2141a9e9df039180cefd95dd Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Mon, 6 Oct 2025 11:55:39 +0200
Subject: [PATCH 13/24] Increase allocs number again

---
 test/test_p4est_3d.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_p4est_3d.jl b/test/test_p4est_3d.jl
index 54d28bb7c80..b5908bc8b03 100644
--- a/test/test_p4est_3d.jl
+++ b/test/test_p4est_3d.jl
@@ -394,7 +394,7 @@ end
     # integrator which are not *recorded* for the methods from
     # OrdinaryDiffEq.jl
     # Corresponding issue: https://github.com/trixi-framework/Trixi.jl/issues/1877
-    @test_allocations(Trixi.rhs!, semi, sol, 10_000)
+    @test_allocations(Trixi.rhs!, semi, sol, 15_000)
 end
 
 @trixi_testset "elixir_euler_sedov_sc_subcell.jl (local bounds)" begin

From e8ab82af61807f3100945183ca95b4bfca4f78b3 Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Thu, 23 Oct 2025 13:47:59 +0200
Subject: [PATCH 14/24] Add full 3d implementation on P4estMesh

---
 ...e_terms_nonperiodic_hohqmesh_sc_subcell.jl |  75 ++++
 src/equations/ideal_glm_mhd_3d.jl             | 186 ++++++++
 .../dgsem_p4est/dg_3d_subcell_limiters.jl     | 408 ++++++++++++++++++
 .../dgsem_p4est/subcell_limiters_3d.jl        | 129 ++++++
 .../dgsem_tree/dg_2d_subcell_limiters.jl      |  15 +
 test/test_p4est_3d.jl                         |  55 +++
 6 files changed, 868 insertions(+)
 create mode 100644 examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl

diff --git a/examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl b/examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl
new file mode 100644
index 00000000000..b37c533840b
--- /dev/null
+++ b/examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl
@@ -0,0 +1,75 @@
+using OrdinaryDiffEqLowStorageRK
+using Trixi
+
+###############################################################################
+# semidiscretization of the compressible Euler equations
+
+equations = CompressibleEulerEquations3D(1.4)
+
+initial_condition = initial_condition_convergence_test
+
+boundary_condition = BoundaryConditionDirichlet(initial_condition)
+boundary_conditions = Dict(:Bottom => boundary_condition,
+                           :Top => boundary_condition,
+                           :Circle => boundary_condition,
+                           :Cut => boundary_condition)
+
+surface_flux = flux_lax_friedrichs
+volume_flux = flux_ranocha
+polydeg = 4
+basis = LobattoLegendreBasis(polydeg)
+limiter_idp = SubcellLimiterIDP(equations, basis;
+                                positivity_variables_cons = ["rho"],
+                                positivity_variables_nonlinear = [pressure],
+                                local_twosided_variables_cons = [],
+                                local_onesided_variables_nonlinear = [])
+volume_integral = VolumeIntegralSubcellLimiting(limiter_idp;
+                                                volume_flux_dg = volume_flux,
+                                                volume_flux_fv = surface_flux)
+solver = DGSEM(basis, surface_flux, volume_integral)
+
+# Unstructured 3D half circle mesh from HOHQMesh
+mesh_file = Trixi.download("https://gist.githubusercontent.com/andrewwinters5000/11461efbfb02c42e06aca338b3d0b645/raw/81deeb1ebc4945952c30af5bb75fe222a18d975c/abaqus_half_circle_3d.inp",
+                           joinpath(@__DIR__, "abaqus_half_circle_3d.inp"))
+
+mesh = P4estMesh{3}(mesh_file, initial_refinement_level = 0)
+
+semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
+                                    source_terms = source_terms_convergence_test,
+                                    boundary_conditions = boundary_conditions)
+
+###############################################################################
+# ODE solvers, callbacks etc.
+
+tspan = (0.0, 1.0)
+ode = semidiscretize(semi, tspan)
+
+summary_callback = SummaryCallback()
+
+analysis_interval = 100
+analysis_callback = AnalysisCallback(semi, interval = analysis_interval,
+                                     extra_analysis_integrals = (entropy,))
+
+alive_callback = AliveCallback(analysis_interval = analysis_interval)
+
+save_solution = SaveSolutionCallback(interval = 1,
+                                     save_initial_solution = true,
+                                     save_final_solution = true,
+                                     solution_variables = cons2prim,
+                                     extra_node_variables = (:limiting_coefficient,))
+
+stepsize_callback = StepsizeCallback(cfl = 0.5)
+
+callbacks = CallbackSet(summary_callback,
+                        analysis_callback, alive_callback,
+                        save_solution,
+                        stepsize_callback)
+
+###############################################################################
+# run the simulation
+
+stage_callbacks = (SubcellLimiterIDPCorrection(), BoundsCheckCallback())
+
+sol = Trixi.solve(ode, Trixi.SimpleSSPRK33(stage_callbacks = stage_callbacks);
+                  dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback
+                  ode_default_options()..., callback = callbacks);
diff --git a/src/equations/ideal_glm_mhd_3d.jl b/src/equations/ideal_glm_mhd_3d.jl
index f6cff2c9851..2f31dbad9ea 100644
--- a/src/equations/ideal_glm_mhd_3d.jl
+++ b/src/equations/ideal_glm_mhd_3d.jl
@@ -364,6 +364,9 @@ end
 # For `VolumeIntegralSubcellLimiting` the nonconservative flux is created as a callable struct to
 # enable dispatch on the type of the nonconservative term (symmetric / jump).
 """
+    flux_nonconservative_powell_local_symmetric(u_ll, u_rr,
+                                                orientation::Integer,
+                                                equations::IdealGlmMhdEquations3D)
     flux_nonconservative_powell_local_symmetric(u_ll, u_rr,
                                                 normal_direction::AbstractVector,
                                                 equations::IdealGlmMhdEquations3D)
@@ -389,6 +392,58 @@ compute the subcell fluxes in dg_3d_subcell_limiters.jl.
   [DOI: 10.1016/j.jcp.2023.112607](https://doi.org/10.1016/j.jcp.2023.112607).
   [arXiv: 2211.14009](https://doi.org/10.48550/arXiv.2211.14009).
 """
+@inline function (noncons_flux::FluxNonConservativePowellLocalSymmetric)(u_ll, u_rr,
+                                                                         orientation::Integer,
+                                                                         equations::IdealGlmMhdEquations3D)
+    rho_ll, rho_v1_ll, rho_v2_ll, rho_v3_ll, rho_e_ll, B1_ll, B2_ll, B3_ll, psi_ll = u_ll
+    rho_rr, rho_v1_rr, rho_v2_rr, rho_v3_rr, rho_e_rr, B1_rr, B2_rr, B3_rr, psi_rr = u_rr
+
+    v1_ll = rho_v1_ll / rho_ll
+    v2_ll = rho_v2_ll / rho_ll
+    v3_ll = rho_v3_ll / rho_ll
+    v_dot_B_ll = v1_ll * B1_ll + v2_ll * B2_ll + v3_ll * B3_ll
+
+    # Powell nonconservative term:   (0, B_1, B_2, B_3, v⋅B, v_1, v_2, v_3, 0)
+    # Galilean nonconservative term: (0, 0, 0, 0, ψ v_{1,2}, 0, 0, 0, v_{1,2})
+    psi_avg = (psi_ll + psi_rr) #* 0.5 # The flux is already multiplied by 0.5 wherever it is used in the code
+    if orientation == 1
+        B1_avg = (B1_ll + B1_rr) #* 0.5 # The flux is already multiplied by 0.5 wherever it is used in the code
+        f = SVector(0,
+                    B1_ll * B1_avg,
+                    B2_ll * B1_avg,
+                    B3_ll * B1_avg,
+                    v_dot_B_ll * B1_avg + v1_ll * psi_ll * psi_avg,
+                    v1_ll * B1_avg,
+                    v2_ll * B1_avg,
+                    v3_ll * B1_avg,
+                    v1_ll * psi_avg)
+    elseif orientation == 2
+        B2_avg = (B2_ll + B2_rr) #* 0.5 # The flux is already multiplied by 0.5 wherever it is used in the code
+        f = SVector(0,
+                    B1_ll * B2_avg,
+                    B2_ll * B2_avg,
+                    B3_ll * B2_avg,
+                    v_dot_B_ll * B2_avg + v2_ll * psi_ll * psi_avg,
+                    v1_ll * B2_avg,
+                    v2_ll * B2_avg,
+                    v3_ll * B2_avg,
+                    v2_ll * psi_avg)
+    else # orientation == 3
+        B3_avg = (B3_ll + B3_rr) #* 0.5 # The flux is already multiplied by 0.5 wherever it is used in the code
+        f = SVector(0,
+                    B1_ll * B3_avg,
+                    B2_ll * B3_avg,
+                    B3_ll * B3_avg,
+                    v_dot_B_ll * B3_avg + v3_ll * psi_ll * psi_avg,
+                    v1_ll * B3_avg,
+                    v2_ll * B3_avg,
+                    v3_ll * B3_avg,
+                    v3_ll * psi_avg)
+    end
+
+    return f
+end
+
 @inline function (noncons_flux::FluxNonConservativePowellLocalSymmetric)(u_ll, u_rr,
                                                                          normal_direction::AbstractVector,
                                                                          equations::IdealGlmMhdEquations3D)
@@ -428,6 +483,10 @@ compute the subcell fluxes in dg_3d_subcell_limiters.jl.
 end
 
 """
+    flux_nonconservative_powell_local_symmetric(u_ll, orientation::Integer,
+                                                equations::IdealGlmMhdEquations3D,
+                                                nonconservative_type::NonConservativeLocal,
+                                                nonconservative_term::Integer)
     flux_nonconservative_powell_local_symmetric(u_ll, normal_direction_ll::AbstractVector,
                                                 equations::IdealGlmMhdEquations3D,
                                                 nonconservative_type::NonConservativeLocal,
@@ -442,6 +501,68 @@ the non-conservative staggered "fluxes" for subcell limiting. See, e.g.,
 
 This function is used to compute the subcell fluxes in dg_3d_subcell_limiters.jl.
 """
+@inline function (noncons_flux::FluxNonConservativePowellLocalSymmetric)(u_ll,
+                                                                         orientation::Integer,
+                                                                         equations::IdealGlmMhdEquations3D,
+                                                                         nonconservative_type::NonConservativeLocal,
+                                                                         nonconservative_term::Integer)
+    rho_ll, rho_v1_ll, rho_v2_ll, rho_v3_ll, rho_e_ll, B1_ll, B2_ll, B3_ll, psi_ll = u_ll
+
+    if nonconservative_term == 1
+        # Powell nonconservative term:   (0, B_1, B_2, B_3, v⋅B, v_1, v_2, v_3, 0)
+        v1_ll = rho_v1_ll / rho_ll
+        v2_ll = rho_v2_ll / rho_ll
+        v3_ll = rho_v3_ll / rho_ll
+        v_dot_B_ll = v1_ll * B1_ll + v2_ll * B2_ll + v3_ll * B3_ll
+        f = SVector(0,
+                    B1_ll,
+                    B2_ll,
+                    B3_ll,
+                    v_dot_B_ll,
+                    v1_ll,
+                    v2_ll,
+                    v3_ll,
+                    0)
+    else #nonconservative_term ==2
+        # Galilean nonconservative term: (0, 0, 0, 0, ψ v_{1,2}, 0, 0, 0, v_{1,2})
+        if orientation == 1
+            v1_ll = rho_v1_ll / rho_ll
+            f = SVector(0,
+                        0,
+                        0,
+                        0,
+                        v1_ll * psi_ll,
+                        0,
+                        0,
+                        0,
+                        v1_ll)
+        elseif orientation == 2
+            v2_ll = rho_v2_ll / rho_ll
+            f = SVector(0,
+                        0,
+                        0,
+                        0,
+                        v2_ll * psi_ll,
+                        0,
+                        0,
+                        0,
+                        v2_ll)
+        else #orientation == 3
+            v3_ll = rho_v3_ll / rho_ll
+            f = SVector(0,
+                        0,
+                        0,
+                        0,
+                        v3_ll * psi_ll,
+                        0,
+                        0,
+                        0,
+                        v3_ll)
+        end
+    end
+    return f
+end
+
 @inline function (noncons_flux::FluxNonConservativePowellLocalSymmetric)(u_ll,
                                                                          normal_direction_ll::AbstractVector,
                                                                          equations::IdealGlmMhdEquations3D,
@@ -487,6 +608,10 @@ This function is used to compute the subcell fluxes in dg_3d_subcell_limiters.jl
 end
 
 """
+    flux_nonconservative_powell_local_symmetric(u_ll, orientation::Integer,
+                                                equations::IdealGlmMhdEquations3D,
+                                                nonconservative_type::NonConservativeSymmetric,
+                                                nonconservative_term::Integer)
     flux_nonconservative_powell_local_symmetric(u_ll, normal_direction_avg::AbstractVector,
                                                 equations::IdealGlmMhdEquations3D,
                                                 nonconservative_type::NonConservativeSymmetric,
@@ -501,6 +626,67 @@ the non-conservative staggered "fluxes" for subcell limiting. See, e.g.,
 
 This function is used to compute the subcell fluxes in dg_3d_subcell_limiters.jl.
 """
+@inline function (noncons_flux::FluxNonConservativePowellLocalSymmetric)(u_ll, u_rr,
+                                                                         orientation::Integer,
+                                                                         equations::IdealGlmMhdEquations3D,
+                                                                         nonconservative_type::NonConservativeSymmetric,
+                                                                         nonconservative_term::Integer)
+    rho_ll, rho_v1_ll, rho_v2_ll, rho_v3_ll, rho_e_ll, B1_ll, B2_ll, B3_ll, psi_ll = u_ll
+    rho_rr, rho_v1_rr, rho_v2_rr, rho_v3_rr, rho_e_rr, B1_rr, B2_rr, B3_rr, psi_rr = u_rr
+
+    if nonconservative_term == 1
+        # Powell nonconservative term:   (0, B_1, B_2, B_3, v⋅B, v_1, v_2, v_3, 0)
+        if orientation == 1
+            B1_avg = (B1_ll + B1_rr)#* 0.5 # The flux is already multiplied by 0.5 wherever it is used in the code
+            f = SVector(0,
+                        B1_avg,
+                        B1_avg,
+                        B1_avg,
+                        B1_avg,
+                        B1_avg,
+                        B1_avg,
+                        B1_avg,
+                        0)
+        elseif orientation == 2
+            B2_avg = (B2_ll + B2_rr)#* 0.5 # The flux is already multiplied by 0.5 wherever it is used in the code
+            f = SVector(0,
+                        B2_avg,
+                        B2_avg,
+                        B2_avg,
+                        B2_avg,
+                        B2_avg,
+                        B2_avg,
+                        B2_avg,
+                        0)
+        else # orientation == 3
+            B3_avg = (B3_ll + B3_rr)#* 0.5 # The flux is already multiplied by 0.5 wherever it is used in the code
+            f = SVector(0,
+                        B3_avg,
+                        B3_avg,
+                        B3_avg,
+                        B3_avg,
+                        B3_avg,
+                        B3_avg,
+                        B3_avg,
+                        0)
+        end
+    else #nonconservative_term == 2
+        # Galilean nonconservative term: (0, 0, 0, 0, ψ v_{1,2}, 0, 0, 0, v_{1,2})
+        psi_avg = (psi_ll + psi_rr)#* 0.5 # The flux is already multiplied by 0.5 wherever it is used in the code
+        f = SVector(0,
+                    0,
+                    0,
+                    0,
+                    psi_avg,
+                    0,
+                    0,
+                    0,
+                    psi_avg)
+    end
+
+    return f
+end
+
 @inline function (noncons_flux::FluxNonConservativePowellLocalSymmetric)(u_ll, u_rr,
                                                                          normal_direction_avg::AbstractVector,
                                                                          equations::IdealGlmMhdEquations3D,
diff --git a/src/solvers/dgsem_p4est/dg_3d_subcell_limiters.jl b/src/solvers/dgsem_p4est/dg_3d_subcell_limiters.jl
index ed57b91ce1c..cbf953fafa7 100644
--- a/src/solvers/dgsem_p4est/dg_3d_subcell_limiters.jl
+++ b/src/solvers/dgsem_p4est/dg_3d_subcell_limiters.jl
@@ -543,6 +543,414 @@ end
     return nothing
 end
 
+# Calculate the DG staggered volume fluxes `fhat` in subcell FV-form inside the element
+# (**with non-conservative terms in "local * jump" form**).
+#
+# See also `flux_differencing_kernel!`.
+#
+# The calculation of the non-conservative staggered "fluxes" requires non-conservative
+# terms that can be written as a product of local and jump contributions.
+@inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R,
+                                u, mesh::P4estMesh{3},
+                                nonconservative_terms::True, equations,
+                                volume_flux::Tuple{F_CONS, F_NONCONS}, dg::DGSEM,
+                                element,
+                                cache) where {
+                                              F_CONS <: Function,
+                                              F_NONCONS <:
+                                              FluxNonConservative{NonConservativeJump()}
+                                              }
+    (; contravariant_vectors) = cache.elements
+    (; weights, derivative_split) = dg.basis
+    (; flux_temp_threaded, flux_nonconservative_temp_threaded) = cache
+    (; fhat_temp_threaded, fhat_nonconservative_temp_threaded, phi_threaded) = cache
+
+    volume_flux_cons, volume_flux_noncons = volume_flux
+
+    flux_temp = flux_temp_threaded[Threads.threadid()]
+    flux_noncons_temp = flux_nonconservative_temp_threaded[Threads.threadid()]
+
+    fhat_temp = fhat_temp_threaded[Threads.threadid()]
+    fhat_noncons_temp = fhat_nonconservative_temp_threaded[Threads.threadid()]
+    phi = phi_threaded[Threads.threadid()]
+
+    # The FV-form fluxes are calculated in a recursive manner, i.e.:
+    # fhat_(0,1)   = w_0 * FVol_0,
+    # fhat_(j,j+1) = fhat_(j-1,j) + w_j * FVol_j,   for j=1,...,N-1,
+    # with the split form volume fluxes FVol_j = -2 * sum_i=0^N D_ji f*_(j,i).
+
+    # To use the symmetry of the `volume_flux`, the split form volume flux is precalculated
+    # like in `calc_volume_integral!` for the `VolumeIntegralFluxDifferencing`
+    # and saved in in `flux_temp`.
+
+    # Split form volume flux in orientation 1: x direction
+    flux_temp .= zero(eltype(flux_temp))
+    flux_noncons_temp .= zero(eltype(flux_noncons_temp))
+
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, k, element)
+
+        # pull the contravariant vectors in each coordinate direction
+        Ja1_node = get_contravariant_vector(1, contravariant_vectors,
+                                            i, j, k, element) # x direction
+
+        # All diagonal entries of `derivative_split` are zero. Thus, we can skip
+        # the computation of the diagonal terms. In addition, we use the symmetry
+        # of `volume_flux_cons` and `volume_flux_noncons` to save half of the possible two-point flux
+        # computations.
+        for ii in (i + 1):nnodes(dg)
+            u_node_ii = get_node_vars(u, equations, dg, ii, j, k, element)
+            # pull the contravariant vectors and compute the average
+            Ja1_node_ii = get_contravariant_vector(1, contravariant_vectors,
+                                                   ii, j, k, element)
+            Ja1_avg = 0.5f0 * (Ja1_node + Ja1_node_ii)
+
+            # compute the contravariant sharp flux in the direction of the averaged contravariant vector
+            fluxtilde1 = volume_flux_cons(u_node, u_node_ii, Ja1_avg, equations)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[i, ii], fluxtilde1,
+                                       equations, dg, i, j, k)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[ii, i], fluxtilde1,
+                                       equations, dg, ii, j, k)
+            for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+                # We multiply by 0.5 because that is done in other parts of Trixi
+                flux1_noncons = volume_flux_noncons(u_node, u_node_ii, Ja1_avg,
+                                                    equations,
+                                                    NonConservativeJump(), noncons)
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           0.5f0 * derivative_split[i, ii],
+                                           flux1_noncons,
+                                           equations, dg, noncons, i, j, k)
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           -0.5f0 * derivative_split[ii, i],
+                                           flux1_noncons,
+                                           equations, dg, noncons, ii, j, k)
+            end
+        end
+    end
+
+    # FV-form flux `fhat` in x direction
+    fhat_temp[:, 1, :, :] .= zero(eltype(fhat1_L))
+    fhat_noncons_temp[:, :, 1, :, :] .= zero(eltype(fhat1_L))
+
+    # Compute local contribution to non-conservative flux
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_local = get_node_vars(u, equations, dg, i, j, k, element)
+        # pull the local contravariant vector
+        Ja1_node = get_contravariant_vector(1, contravariant_vectors, i, j, k, element)
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+            set_node_vars!(phi,
+                           volume_flux_noncons(u_local, Ja1_node, equations,
+                                               NonConservativeLocal(), noncons),
+                           equations, dg, noncons, i, j, k)
+        end
+    end
+
+    for k in eachnode(dg), j in eachnode(dg), i in 1:(nnodes(dg) - 1)
+        # Conservative part
+        for v in eachvariable(equations)
+            value = fhat_temp[v, i, j, k] + weights[i] * flux_temp[v, i, j, k]
+            fhat_temp[v, i + 1, j, k] = value
+            fhat1_L[v, i + 1, j, k] = value
+            fhat1_R[v, i + 1, j, k] = value
+        end
+        # Nonconservative part
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons),
+            v in eachvariable(equations)
+
+            value = fhat_noncons_temp[v, noncons, i, j, k] +
+                    weights[i] * flux_noncons_temp[v, noncons, i, j, k]
+            fhat_noncons_temp[v, noncons, i + 1, j, k] = value
+
+            fhat1_L[v, i + 1, j, k] = fhat1_L[v, i + 1, j, k] +
+                                      phi[v, noncons, i, j, k] * value
+            fhat1_R[v, i + 1, j, k] = fhat1_R[v, i + 1, j, k] +
+                                      phi[v, noncons, i + 1, j, k] * value
+        end
+    end
+
+    # Apply correction term to the flux-differencing formula for nonconservative local * jump fluxes.
+    for k in eachnode(dg), j in eachnode(dg)
+        u_0 = get_node_vars(u, equations, dg, 1, j, k, element)
+        Ja1_node_0 = get_contravariant_vector(1, contravariant_vectors,
+                                              1, j, k, element)
+
+        for i in 2:(nnodes(dg) - 1)
+            u_i = get_node_vars(u, equations, dg, i, j, k, element)
+            Ja1_node_i = get_contravariant_vector(1, contravariant_vectors,
+                                                  i, j, k, element)
+            Ja1_avg = 0.5f0 * (Ja1_node_0 + Ja1_node_i)
+
+            for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+                phi_jump = volume_flux_noncons(u_0, u_i, Ja1_avg, equations,
+                                               NonConservativeJump(), noncons)
+
+                for v in eachvariable(equations)
+                    # The factor of 2 is missing on each term because Trixi multiplies all the non-cons terms with 0.5
+                    fhat1_R[v, i, j, k] -= phi[v, noncons, i, j, k] * phi_jump[v]
+                    fhat1_L[v, i + 1, j, k] -= phi[v, noncons, i, j, k] * phi_jump[v]
+                end
+            end
+        end
+        u_N = get_node_vars(u, equations, dg, nnodes(dg), j, k, element)
+        Ja1_node_N = get_contravariant_vector(1, contravariant_vectors,
+                                              nnodes(dg), j, k, element)
+        Ja1_avg = 0.5f0 * (Ja1_node_0 + Ja1_node_N)
+
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+            phi_jump = volume_flux_noncons(u_0, u_N, Ja1_avg, equations,
+                                           NonConservativeJump(), noncons)
+
+            for v in eachvariable(equations)
+                # The factor of 2 is missing because Trixi multiplies all the non-cons terms with 0.5
+                fhat1_R[v, nnodes(dg), j, k] -= phi[v, noncons, nnodes(dg), j, k] *
+                                                phi_jump[v]
+            end
+        end
+    end
+
+    # Split form volume flux in orientation 2: y direction
+    flux_temp .= zero(eltype(flux_temp))
+    flux_noncons_temp .= zero(eltype(flux_noncons_temp))
+
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, k, element)
+
+        # pull the contravariant vectors in each coordinate direction
+        Ja2_node = get_contravariant_vector(2, contravariant_vectors,
+                                            i, j, k, element)
+
+        for jj in (j + 1):nnodes(dg)
+            u_node_jj = get_node_vars(u, equations, dg, i, jj, k, element)
+            # pull the contravariant vectors and compute the average
+            Ja2_node_jj = get_contravariant_vector(2, contravariant_vectors,
+                                                   i, jj, k, element)
+            Ja2_avg = 0.5f0 * (Ja2_node + Ja2_node_jj)
+            # compute the contravariant sharp flux in the direction of the averaged contravariant vector
+            fluxtilde2 = volume_flux_cons(u_node, u_node_jj, Ja2_avg, equations)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[j, jj], fluxtilde2,
+                                       equations, dg, i, j, k)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[jj, j], fluxtilde2,
+                                       equations, dg, i, jj, k)
+            for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+                # We multiply by 0.5 because that is done in other parts of Trixi
+                flux2_noncons = volume_flux_noncons(u_node, u_node_jj, Ja2_avg,
+                                                    equations,
+                                                    NonConservativeJump(), noncons)
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           0.5f0 * derivative_split[j, jj],
+                                           flux2_noncons,
+                                           equations, dg, noncons, i, j, k)
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           -0.5f0 * derivative_split[jj, j],
+                                           flux2_noncons,
+                                           equations, dg, noncons, i, jj, k)
+            end
+        end
+    end
+
+    # FV-form flux `fhat` in y direction
+    fhat_temp[:, :, 1, :] .= zero(eltype(fhat1_L))
+    fhat_noncons_temp[:, :, :, 1, :] .= zero(eltype(fhat1_L))
+
+    # Compute local contribution to non-conservative flux
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_local = get_node_vars(u, equations, dg, i, j, k, element)
+        # pull the local contravariant vector
+        Ja2_node = get_contravariant_vector(2, contravariant_vectors, i, j, k, element)
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+            set_node_vars!(phi,
+                           volume_flux_noncons(u_local, Ja2_node, equations,
+                                               NonConservativeLocal(), noncons),
+                           equations, dg, noncons, i, j, k)
+        end
+    end
+
+    for k in eachnode(dg), j in 1:(nnodes(dg) - 1), i in eachnode(dg)
+        # Conservative part
+        for v in eachvariable(equations)
+            value = fhat_temp[v, i, j, k] + weights[j] * flux_temp[v, i, j, k]
+            fhat_temp[v, i, j + 1, k] = value
+            fhat2_L[v, i, j + 1, k] = value
+            fhat2_R[v, i, j + 1, k] = value
+        end
+        # Nonconservative part
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons),
+            v in eachvariable(equations)
+
+            value = fhat_noncons_temp[v, noncons, i, j, k] +
+                    weights[j] * flux_noncons_temp[v, noncons, i, j, k]
+            fhat_noncons_temp[v, noncons, i, j + 1, k] = value
+
+            fhat2_L[v, i, j + 1, k] = fhat2_L[v, i, j + 1, k] +
+                                      phi[v, noncons, i, j, k] * value
+            fhat2_R[v, i, j + 1, k] = fhat2_R[v, i, j + 1, k] +
+                                      phi[v, noncons, i, j + 1, k] * value
+        end
+    end
+
+    # Apply correction term to the flux-differencing formula for nonconservative local * jump fluxes.
+    for k in eachnode(dg), i in eachnode(dg)
+        u_0 = get_node_vars(u, equations, dg, i, 1, k, element)
+        Ja2_node_0 = get_contravariant_vector(2, contravariant_vectors,
+                                              i, 1, k, element)
+
+        for j in 2:(nnodes(dg) - 1)
+            u_j = get_node_vars(u, equations, dg, i, j, k, element)
+            Ja2_node_j = get_contravariant_vector(2, contravariant_vectors,
+                                                  i, j, k, element)
+            Ja2_avg = 0.5f0 * (Ja2_node_0 + Ja2_node_j)
+
+            for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+                phi_jump = volume_flux_noncons(u_0, u_j, Ja2_avg, equations,
+                                               NonConservativeJump(), noncons)
+
+                for v in eachvariable(equations)
+                    # The factor of 2 is missing on each term because Trixi multiplies all the non-cons terms with 0.5
+                    fhat2_R[v, i, j, k] -= phi[v, noncons, i, j, k] * phi_jump[v]
+                    fhat2_L[v, i, j + 1, k] -= phi[v, noncons, i, j, k] * phi_jump[v]
+                end
+            end
+        end
+        u_N = get_node_vars(u, equations, dg, i, nnodes(dg), k, element)
+        Ja2_node_N = get_contravariant_vector(2, contravariant_vectors,
+                                              i, nnodes(dg), k, element)
+        Ja2_avg = 0.5f0 * (Ja2_node_0 + Ja2_node_N)
+
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+            phi_jump = volume_flux_noncons(u_0, u_N, Ja2_avg, equations,
+                                           NonConservativeJump(), noncons)
+
+            for v in eachvariable(equations)
+                # The factor of 2 is missing cause Trixi multiplies all the non-cons terms with 0.5
+                fhat2_R[v, i, nnodes(dg), k] -= phi[v, noncons, i, nnodes(dg), k] *
+                                                phi_jump[v]
+            end
+        end
+    end
+
+    # Split form volume flux in orientation 3: z direction
+    flux_temp .= zero(eltype(flux_temp))
+    flux_noncons_temp .= zero(eltype(flux_noncons_temp))
+
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, k, element)
+
+        # pull the contravariant vectors in each coordinate direction
+        Ja3_node = get_contravariant_vector(3, contravariant_vectors,
+                                            i, j, k, element)
+
+        for kk in (k + 1):nnodes(dg)
+            u_node_kk = get_node_vars(u, equations, dg, i, j, kk, element)
+            # pull the contravariant vectors and compute the average
+            Ja3_node_kk = get_contravariant_vector(3, contravariant_vectors,
+                                                   i, j, kk, element)
+            Ja3_avg = 0.5f0 * (Ja3_node + Ja3_node_kk)
+            # compute the contravariant sharp flux in the direction of the averaged contravariant vector
+            fluxtilde3 = volume_flux_cons(u_node, u_node_kk, Ja3_avg, equations)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[k, kk], fluxtilde3,
+                                       equations, dg, i, j, k)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[kk, k], fluxtilde3,
+                                       equations, dg, i, j, kk)
+            for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+                # We multiply by 0.5 because that is done in other parts of Trixi
+                flux3_noncons = volume_flux_noncons(u_node, u_node_kk, Ja3_avg,
+                                                    equations,
+                                                    NonConservativeJump(), noncons)
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           0.5f0 * derivative_split[k, kk],
+                                           flux3_noncons,
+                                           equations, dg, noncons, i, j, k)
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           -0.5f0 * derivative_split[kk, k],
+                                           flux3_noncons,
+                                           equations, dg, noncons, i, j, kk)
+            end
+        end
+    end
+
+    # FV-form flux `fhat` in z direction
+    fhat_temp[:, :, :, 1] .= zero(eltype(fhat1_L))
+    fhat_noncons_temp[:, :, :, :, 1] .= zero(eltype(fhat1_L))
+
+    # Compute local contribution to non-conservative flux
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_local = get_node_vars(u, equations, dg, i, j, k, element)
+        # pull the local contravariant vector
+        Ja3_node = get_contravariant_vector(3, contravariant_vectors, i, j, k, element)
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+            set_node_vars!(phi,
+                           volume_flux_noncons(u_local, Ja3_node, equations,
+                                               NonConservativeLocal(), noncons),
+                           equations, dg, noncons, i, j, k)
+        end
+    end
+
+    for k in 1:(nnodes(dg) - 1), j in eachnode(dg), i in eachnode(dg)
+        # Conservative part
+        for v in eachvariable(equations)
+            value = fhat_temp[v, i, j, k] + weights[k] * flux_temp[v, i, j, k]
+            fhat_temp[v, i, j, k + 1] = value
+            fhat3_L[v, i, j, k + 1] = value
+            fhat3_R[v, i, j, k + 1] = value
+        end
+        # Nonconservative part
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons),
+            v in eachvariable(equations)
+
+            value = fhat_noncons_temp[v, noncons, i, j, k] +
+                    weights[k] * flux_noncons_temp[v, noncons, i, j, k]
+            fhat_noncons_temp[v, noncons, i, j, k + 1] = value
+
+            fhat3_L[v, i, j, k + 1] = fhat3_L[v, i, j, k + 1] +
+                                      phi[v, noncons, i, j, k] * value
+            fhat3_R[v, i, j, k + 1] = fhat3_R[v, i, j, k + 1] +
+                                      phi[v, noncons, i, j, k + 1] * value
+        end
+    end
+
+    # Apply correction term to the flux-differencing formula for nonconservative local * jump fluxes.
+    for j in eachnode(dg), i in eachnode(dg)
+        u_0 = get_node_vars(u, equations, dg, i, j, 1, element)
+        Ja3_node_0 = get_contravariant_vector(3, contravariant_vectors,
+                                              i, j, 1, element)
+
+        for k in 2:(nnodes(dg) - 1)
+            u_k = get_node_vars(u, equations, dg, i, j, k, element)
+            Ja3_node_k = get_contravariant_vector(3, contravariant_vectors,
+                                                  i, j, k, element)
+            Ja3_avg = 0.5f0 * (Ja3_node_0 + Ja3_node_k)
+
+            for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+                phi_jump = volume_flux_noncons(u_0, u_k, Ja3_avg, equations,
+                                               NonConservativeJump(), noncons)
+
+                for v in eachvariable(equations)
+                    # The factor of 2 is missing on each term because Trixi multiplies all the non-cons terms with 0.5
+                    fhat3_R[v, i, j, k] -= phi[v, noncons, i, j, k] * phi_jump[v]
+                    fhat3_L[v, i, j, k + 1] -= phi[v, noncons, i, j, k] * phi_jump[v]
+                end
+            end
+        end
+        u_N = get_node_vars(u, equations, dg, i, j, nnodes(dg), element)
+        Ja3_node_N = get_contravariant_vector(3, contravariant_vectors,
+                                              i, j, nnodes(dg), element)
+        Ja3_avg = 0.5f0 * (Ja3_node_0 + Ja3_node_N)
+
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+            phi_jump = volume_flux_noncons(u_0, u_N, Ja3_avg, equations,
+                                           NonConservativeJump(), noncons)
+
+            for v in eachvariable(equations)
+                # The factor of 2 is missing cause Trixi multiplies all the non-cons terms with 0.5
+                fhat3_R[v, i, j, nnodes(dg)] -= phi[v, noncons, i, j, nnodes(dg)] *
+                                                phi_jump[v]
+            end
+        end
+    end
+
+    return nothing
+end
+
 # Calculate the antidiffusive flux `antidiffusive_flux` as the subtraction between `fhat` and `fstar` for conservative systems.
 @inline function calcflux_antidiffusive!(fhat1_L, fhat1_R,
                                          fhat2_L, fhat2_R,
diff --git a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
index 4b1e10da09a..265d76e7da3 100644
--- a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
+++ b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
@@ -169,6 +169,73 @@ end
     return nothing
 end
 
+@inline function calc_bounds_twosided_boundary!(var_min, var_max, variable, u, t,
+                                                boundary_conditions,
+                                                mesh::P4estMesh{3},
+                                                equations, dg, cache)
+    (; boundary_condition_types, boundary_indices) = boundary_conditions
+    (; contravariant_vectors) = cache.elements
+
+    (; boundaries) = cache
+    index_range = eachnode(dg)
+
+    foreach_enumerate(boundary_condition_types) do (i, boundary_condition)
+        for boundary in boundary_indices[i]
+            element = boundaries.neighbor_ids[boundary]
+            node_indices = boundaries.node_indices[boundary]
+            direction = indices2direction(node_indices)
+
+            i_node_start, i_node_step_i, i_node_step_j = index_to_start_step_3d(node_indices[1],
+                                                                                index_range)
+            j_node_start, j_node_step_i, j_node_step_j = index_to_start_step_3d(node_indices[2],
+                                                                                index_range)
+            k_node_start, k_node_step_i, k_node_step_j = index_to_start_step_3d(node_indices[3],
+                                                                                index_range)
+
+            i_node = i_node_start
+            j_node = j_node_start
+            k_node = k_node_start
+            for j in eachnode(dg)
+                for i in eachnode(dg)
+                    normal_direction = get_normal_direction(direction,
+                                                            contravariant_vectors,
+                                                            i_node, j_node, k_node,
+                                                            element)
+
+                    u_inner = get_node_vars(u, equations, dg, i_node, j_node, k_node,
+                                            element)
+
+                    u_outer = get_boundary_outer_state(u_inner, t, boundary_condition,
+                                                       normal_direction,
+                                                       mesh, equations, dg, cache,
+                                                       i_node, j_node, k_node, element)
+                    var_outer = u_outer[variable]
+
+                    var_min[i_node, j_node, k_node, element] = min(var_min[i_node,
+                                                                           j_node,
+                                                                           k_node,
+                                                                           element],
+                                                                   var_outer)
+                    var_max[i_node, j_node, k_node, element] = max(var_max[i_node,
+                                                                           j_node,
+                                                                           k_node,
+                                                                           element],
+                                                                   var_outer)
+
+                    i_node += i_node_step_i
+                    j_node += j_node_step_i
+                    k_node += k_node_step_i
+                end
+                i_node += i_node_step_j
+                j_node += j_node_step_j
+                k_node += k_node_step_j
+            end
+        end
+    end
+
+    return nothing
+end
+
 @inline function calc_bounds_onesided!(var_minmax, min_or_max, variable,
                                        u::AbstractArray{<:Any, 5}, t, semi)
     mesh, equations, dg, cache = mesh_equations_solver_cache(semi)
@@ -322,6 +389,68 @@ end
     return nothing
 end
 
+@inline function calc_bounds_onesided_boundary!(var_minmax, minmax, variable, u, t,
+                                                boundary_conditions,
+                                                mesh::P4estMesh{3},
+                                                equations, dg, cache)
+    (; boundary_condition_types, boundary_indices) = boundary_conditions
+    (; contravariant_vectors) = cache.elements
+
+    (; boundaries) = cache
+    index_range = eachnode(dg)
+
+    foreach_enumerate(boundary_condition_types) do (i, boundary_condition)
+        for boundary in boundary_indices[i]
+            element = boundaries.neighbor_ids[boundary]
+            node_indices = boundaries.node_indices[boundary]
+            direction = indices2direction(node_indices)
+
+            i_node_start, i_node_step_i, i_node_step_j = index_to_start_step_3d(node_indices[1],
+                                                                                index_range)
+            j_node_start, j_node_step_i, j_node_step_j = index_to_start_step_3d(node_indices[2],
+                                                                                index_range)
+            k_node_start, k_node_step_i, k_node_step_j = index_to_start_step_3d(node_indices[3],
+                                                                                index_range)
+
+            i_node = i_node_start
+            j_node = j_node_start
+            k_node = k_node_start
+            for j in eachnode(dg)
+                for i in eachnode(dg)
+                    normal_direction = get_normal_direction(direction,
+                                                            contravariant_vectors,
+                                                            i_node, j_node, k_node,
+                                                            element)
+
+                    u_inner = get_node_vars(u, equations, dg, i_node, j_node, k_node,
+                                            element)
+
+                    u_outer = get_boundary_outer_state(u_inner, t, boundary_condition,
+                                                       normal_direction,
+                                                       mesh, equations, dg, cache,
+                                                       i_node, j_node, k_node, element)
+                    var_outer = variable(u_outer, equations)
+
+                    var_minmax[i_node, j_node, k_node, element] = minmax(var_minmax[i_node,
+                                                                                    j_node,
+                                                                                    k_node,
+                                                                                    element],
+                                                                         var_outer)
+
+                    i_node += i_node_step_i
+                    j_node += j_node_step_i
+                    k_node += k_node_step_i
+                end
+                i_node += i_node_step_j
+                j_node += j_node_step_j
+                k_node += k_node_step_j
+            end
+        end
+    end
+
+    return nothing
+end
+
 ###############################################################################
 # Local two-sided limiting of conservative variables
 
diff --git a/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl b/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl
index bccb96a1404..6f2c82f5949 100644
--- a/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl
+++ b/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl
@@ -671,6 +671,7 @@ end
     return nothing
 end
 
+# TODO: dimension independent implementation
 """
     get_boundary_outer_state(u_inner, t,
                              boundary_condition::BoundaryConditionDirichlet,
@@ -698,4 +699,18 @@ Should be used together with [`TreeMesh`](@ref) or [`StructuredMesh`](@ref).
 
     return u_outer
 end
+
+# TODO: dimension independent implementation
+@inline function get_boundary_outer_state(u_inner, t,
+                                          boundary_condition::BoundaryConditionDirichlet,
+                                          normal_direction,
+                                          mesh::P4estMesh,
+                                          equations, dg, cache, indices...)
+    (; node_coordinates) = cache.elements
+
+    x = get_node_coords(node_coordinates, equations, dg, indices...)
+    u_outer = boundary_condition.boundary_value_function(x, t, equations)
+
+    return u_outer
+end
 end # @muladd
diff --git a/test/test_p4est_3d.jl b/test/test_p4est_3d.jl
index b5908bc8b03..0c58c8261d2 100644
--- a/test/test_p4est_3d.jl
+++ b/test/test_p4est_3d.jl
@@ -590,6 +590,61 @@ end
     @test_allocations(Trixi.rhs!, semi, sol, 1000)
 end
 
+@trixi_testset "elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl (positivity bounds)" begin
+    @test_trixi_include(joinpath(EXAMPLES_DIR,
+                                 "elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl"),
+                        l2=[
+                            0.003968297633693987,
+                            0.004219722654211142,
+                            0.004313961192612337,
+                            0.003994315173438687,
+                            0.008093257684168107
+                        ],
+                        linf=[
+                            0.03906896684353356,
+                            0.032089927158354126,
+                            0.04744970237203505,
+                            0.047720935760972694,
+                            0.10020886734372869
+                        ])
+    # Ensure that we do not have excessive memory allocations
+    # (e.g., from type instabilities)
+    # Larger values for allowed allocations due to usage of custom
+    # integrator which are not *recorded* for the methods from
+    # OrdinaryDiffEq.jl
+    # Corresponding issue: https://github.com/trixi-framework/Trixi.jl/issues/1877
+    @test_allocations(Trixi.rhs!, semi, sol, 15000)
+end
+
+@trixi_testset "elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl (local bounds)" begin
+    @test_trixi_include(joinpath(EXAMPLES_DIR,
+                                 "elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl"),
+                        local_twosided_variables_cons=["rho"],
+                        local_onesided_variables_nonlinear=[(Trixi.entropy_guermond_etal,
+                                                             min)],
+                        l2=[
+                            0.03408584424980049,
+                            0.02779678380021509,
+                            0.027798412637516465,
+                            0.028828887822172678,
+                            0.08583486245614604
+                        ],
+                        linf=[
+                            0.14985741416461962,
+                            0.14670921773754952,
+                            0.1682308073619827,
+                            0.15212001588109558,
+                            0.33596974695378323
+                        ])
+    # Ensure that we do not have excessive memory allocations
+    # (e.g., from type instabilities)
+    # Larger values for allowed allocations due to usage of custom
+    # integrator which are not *recorded* for the methods from
+    # OrdinaryDiffEq.jl
+    # Corresponding issue: https://github.com/trixi-framework/Trixi.jl/issues/1877
+    @test_allocations(Trixi.rhs!, semi, sol, 15_000)
+end
+
 @trixi_testset "elixir_mhd_alfven_wave_er.jl" begin
     @test_trixi_include(joinpath(EXAMPLES_DIR,
                                  "elixir_mhd_alfven_wave_er.jl"),

From fb1d93d3e568ede2695cc444e7076f8974a820ea Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Thu, 20 Nov 2025 09:58:08 +0100
Subject: [PATCH 15/24] Fix tests after fixing bug in last commit

---
 test/test_p4est_3d.jl | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/test/test_p4est_3d.jl b/test/test_p4est_3d.jl
index 0c58c8261d2..0ff4ee08329 100644
--- a/test/test_p4est_3d.jl
+++ b/test/test_p4est_3d.jl
@@ -623,18 +623,18 @@ end
                         local_onesided_variables_nonlinear=[(Trixi.entropy_guermond_etal,
                                                              min)],
                         l2=[
-                            0.03408584424980049,
-                            0.02779678380021509,
-                            0.027798412637516465,
-                            0.028828887822172678,
-                            0.08583486245614604
-                        ],
-                        linf=[
-                            0.14985741416461962,
-                            0.14670921773754952,
-                            0.1682308073619827,
-                            0.15212001588109558,
-                            0.33596974695378323
+                            0.03390196416615077,
+                            0.027635852530635028,
+                            0.02765156139098808,
+                            0.028877165738753797,
+                            0.08398670497514621
+                        ],
+                        linf=[
+                            0.16264620832647103,
+                            0.17331631520536006,
+                            0.18420209957841727,
+                            0.1578396641384181,
+                            0.34195502332901295
                         ])
     # Ensure that we do not have excessive memory allocations
     # (e.g., from type instabilities)

From fdfee600f0cfcecc5c1a0d87c1474bff083199e0 Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Fri, 9 Jan 2026 14:26:04 +0100
Subject: [PATCH 16/24] Add reset of bounds

---
 src/solvers/dgsem_p4est/subcell_limiters_3d.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
index 265d76e7da3..aa29d74b455 100644
--- a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
+++ b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
@@ -17,6 +17,9 @@
     mesh, _, dg, cache = mesh_equations_solver_cache(semi)
     # Calc bounds inside elements
     @threaded for element in eachelement(dg, cache)
+        var_min[:, :, :, element] .= typemax(eltype(var_min))
+        var_max[:, :, :, element] .= typemin(eltype(var_max))
+
         # Calculate bounds at Gauss-Lobatto nodes
         for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
             var = u[variable, i, j, k, element]

From 6a146cfb4ede61eb039d1679375bb4d21dc47844 Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Tue, 13 Jan 2026 13:29:12 +0100
Subject: [PATCH 17/24] Remove reset again

---
 src/solvers/dgsem_p4est/subcell_limiters_3d.jl | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
index aa29d74b455..265d76e7da3 100644
--- a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
+++ b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
@@ -17,9 +17,6 @@
     mesh, _, dg, cache = mesh_equations_solver_cache(semi)
     # Calc bounds inside elements
     @threaded for element in eachelement(dg, cache)
-        var_min[:, :, :, element] .= typemax(eltype(var_min))
-        var_max[:, :, :, element] .= typemin(eltype(var_max))
-
         # Calculate bounds at Gauss-Lobatto nodes
         for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
             var = u[variable, i, j, k, element]

From 081a2b7d3179a33d70c7162c93582212a35a72eb Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Tue, 13 Jan 2026 15:41:54 +0100
Subject: [PATCH 18/24] Remove `@threaded`

---
 src/solvers/dgsem_p4est/subcell_limiters_3d.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
index 265d76e7da3..6e338500699 100644
--- a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
+++ b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
@@ -16,7 +16,7 @@
                                        u::AbstractArray{<:Any, 5}, t, semi, equations)
     mesh, _, dg, cache = mesh_equations_solver_cache(semi)
     # Calc bounds inside elements
-    @threaded for element in eachelement(dg, cache)
+    for element in eachelement(dg, cache)
         # Calculate bounds at Gauss-Lobatto nodes
         for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
             var = u[variable, i, j, k, element]

From eff3a340c750ee6e67a50d8984310d87f48d3c9a Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Wed, 14 Jan 2026 10:15:30 +0100
Subject: [PATCH 19/24] Add reset and `@threaded`

---
 src/solvers/dgsem_p4est/subcell_limiters_3d.jl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
index 6e338500699..aa29d74b455 100644
--- a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
+++ b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
@@ -16,7 +16,10 @@
                                        u::AbstractArray{<:Any, 5}, t, semi, equations)
     mesh, _, dg, cache = mesh_equations_solver_cache(semi)
     # Calc bounds inside elements
-    for element in eachelement(dg, cache)
+    @threaded for element in eachelement(dg, cache)
+        var_min[:, :, :, element] .= typemax(eltype(var_min))
+        var_max[:, :, :, element] .= typemin(eltype(var_max))
+
         # Calculate bounds at Gauss-Lobatto nodes
         for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
             var = u[variable, i, j, k, element]

From fdef177193b23c13e5fd5c7b9892d72ee74c46a2 Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Wed, 14 Jan 2026 11:55:54 +0100
Subject: [PATCH 20/24] Stabilize simulation

---
 ...r_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl | 3 ++-
 src/solvers/dgsem_p4est/subcell_limiters_3d.jl             | 3 ---
 test/test_p4est_3d.jl                                      | 7 ++++---
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl b/examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl
index b37c533840b..ef91971f4d4 100644
--- a/examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl
+++ b/examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl
@@ -22,7 +22,8 @@ limiter_idp = SubcellLimiterIDP(equations, basis;
                                 positivity_variables_cons = ["rho"],
                                 positivity_variables_nonlinear = [pressure],
                                 local_twosided_variables_cons = [],
-                                local_onesided_variables_nonlinear = [])
+                                local_onesided_variables_nonlinear = [],
+                                max_iterations_newton = 10)
 volume_integral = VolumeIntegralSubcellLimiting(limiter_idp;
                                                 volume_flux_dg = volume_flux,
                                                 volume_flux_fv = surface_flux)
diff --git a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
index aa29d74b455..265d76e7da3 100644
--- a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
+++ b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl
@@ -17,9 +17,6 @@
     mesh, _, dg, cache = mesh_equations_solver_cache(semi)
     # Calc bounds inside elements
     @threaded for element in eachelement(dg, cache)
-        var_min[:, :, :, element] .= typemax(eltype(var_min))
-        var_max[:, :, :, element] .= typemin(eltype(var_max))
-
         # Calculate bounds at Gauss-Lobatto nodes
         for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
             var = u[variable, i, j, k, element]
diff --git a/test/test_p4est_3d.jl b/test/test_p4est_3d.jl
index 0ff4ee08329..177e8526f7e 100644
--- a/test/test_p4est_3d.jl
+++ b/test/test_p4est_3d.jl
@@ -613,7 +613,7 @@ end
     # integrator which are not *recorded* for the methods from
     # OrdinaryDiffEq.jl
     # Corresponding issue: https://github.com/trixi-framework/Trixi.jl/issues/1877
-    @test_allocations(Trixi.rhs!, semi, sol, 15000)
+    @test_allocations(Trixi.rhs!, semi, sol, 15_000)
 end
 
 @trixi_testset "elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl (local bounds)" begin
@@ -622,6 +622,7 @@ end
                         local_twosided_variables_cons=["rho"],
                         local_onesided_variables_nonlinear=[(Trixi.entropy_guermond_etal,
                                                              min)],
+                        max_iterations_newton=30,
                         l2=[
                             0.03390196416615077,
                             0.027635852530635028,
@@ -630,10 +631,10 @@ end
                             0.08398670497514621
                         ],
                         linf=[
-                            0.16264620832647103,
+                            0.1626462053447677,
                             0.17331631520536006,
                             0.18420209957841727,
-                            0.1578396641384181,
+                            0.15783966108549974,
                             0.34195502332901295
                         ])
     # Ensure that we do not have excessive memory allocations

From 128ce52733d1ac0d1ad58e2910f0f057680b5b7f Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Wed, 14 Jan 2026 12:47:10 +0100
Subject: [PATCH 21/24] Reduce run time in test

---
 ...e_terms_nonperiodic_hohqmesh_sc_subcell.jl |  2 +-
 test/test_p4est_3d.jl                         | 50 ++++++++++---------
 2 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl b/examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl
index ef91971f4d4..92f93f983ae 100644
--- a/examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl
+++ b/examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl
@@ -73,4 +73,4 @@ stage_callbacks = (SubcellLimiterIDPCorrection(), BoundsCheckCallback())
 
 sol = Trixi.solve(ode, Trixi.SimpleSSPRK33(stage_callbacks = stage_callbacks);
                   dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback
-                  ode_default_options()..., callback = callbacks);
+                  callback = callbacks);
diff --git a/test/test_p4est_3d.jl b/test/test_p4est_3d.jl
index 177e8526f7e..c13671f34d7 100644
--- a/test/test_p4est_3d.jl
+++ b/test/test_p4est_3d.jl
@@ -594,19 +594,20 @@ end
     @test_trixi_include(joinpath(EXAMPLES_DIR,
                                  "elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl"),
                         l2=[
-                            0.003968297633693987,
-                            0.004219722654211142,
-                            0.004313961192612337,
-                            0.003994315173438687,
-                            0.008093257684168107
-                        ],
-                        linf=[
-                            0.03906896684353356,
-                            0.032089927158354126,
-                            0.04744970237203505,
-                            0.047720935760972694,
-                            0.10020886734372869
-                        ])
+                            0.0010625880666563514,
+                            0.0011220178360278588,
+                            0.001175570902488149,
+                            0.0013866847769005606,
+                            0.002942603300074813
+                        ],
+                        linf=[
+                            0.019666290518305374,
+                            0.019355047676718584,
+                            0.02588920954597862,
+                            0.022936612741258244,
+                            0.04983574326863449
+                        ],
+                        tspan=(0.0, 0.3))
     # Ensure that we do not have excessive memory allocations
     # (e.g., from type instabilities)
     # Larger values for allowed allocations due to usage of custom
@@ -624,19 +625,20 @@ end
                                                              min)],
                         max_iterations_newton=30,
                         l2=[
-                            0.03390196416615077,
-                            0.027635852530635028,
-                            0.02765156139098808,
-                            0.028877165738753797,
-                            0.08398670497514621
+                            0.01573068932681815,
+                            0.01591679096422271,
+                            0.015795998889146637,
+                            0.01746834678809081,
+                            0.06766354772185222
                         ],
                         linf=[
-                            0.1626462053447677,
-                            0.17331631520536006,
-                            0.18420209957841727,
-                            0.15783966108549974,
-                            0.34195502332901295
-                        ])
+                            0.06511438601036024,
+                            0.09961666854491202,
+                            0.10474759248351417,
+                            0.07545726460168245,
+                            0.26949807234431944
+                        ],
+                        tspan=(0.0, 0.3))
     # Ensure that we do not have excessive memory allocations
     # (e.g., from type instabilities)
     # Larger values for allowed allocations due to usage of custom

From 1b11be6c67a9102073ad823e24edf1ed8bdf8906 Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Wed, 21 Jan 2026 10:01:35 +0100
Subject: [PATCH 22/24] cl

---
 test/test_p4est_3d.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_p4est_3d.jl b/test/test_p4est_3d.jl
index c13671f34d7..2ffe35a1471 100644
--- a/test/test_p4est_3d.jl
+++ b/test/test_p4est_3d.jl
@@ -621,7 +621,7 @@ end
     @test_trixi_include(joinpath(EXAMPLES_DIR,
                                  "elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl"),
                         local_twosided_variables_cons=["rho"],
-                        local_onesided_variables_nonlinear=[(Trixi.entropy_guermond_etal,
+                        local_onesided_variables_nonlinear=[(entropy_guermond_etal,
                                                              min)],
                         max_iterations_newton=30,
                         l2=[

From 9a150d2f4e9fc0305b32d1f85625fa6ca8af087c Mon Sep 17 00:00:00 2001
From: bennibolm <benjamin.bolm@gmx.de>
Date: Thu, 26 Feb 2026 11:44:03 +0100
Subject: [PATCH 23/24] Fix boundary conditions in elixir

---
 ..._euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl b/examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl
index 92f93f983ae..269f40924c3 100644
--- a/examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl
+++ b/examples/p4est_3d_dgsem/elixir_euler_source_terms_nonperiodic_hohqmesh_sc_subcell.jl
@@ -9,10 +9,10 @@ equations = CompressibleEulerEquations3D(1.4)
 initial_condition = initial_condition_convergence_test
 
 boundary_condition = BoundaryConditionDirichlet(initial_condition)
-boundary_conditions = Dict(:Bottom => boundary_condition,
-                           :Top => boundary_condition,
-                           :Circle => boundary_condition,
-                           :Cut => boundary_condition)
+boundary_conditions = (; Bottom = boundary_condition,
+                       Top = boundary_condition,
+                       Circle = boundary_condition,
+                       Cut = boundary_condition)
 
 surface_flux = flux_lax_friedrichs
 volume_flux = flux_ranocha

From a491cd095a233a189bf56799e2b1a73f3a608246 Mon Sep 17 00:00:00 2001
From: Benjamin Bolm <74359358+bennibolm@users.noreply.github.com>
Date: Thu, 12 Mar 2026 14:58:52 +0100
Subject: [PATCH 24/24] Add 3d TreeMesh implementation (#143)

* Add full 3d implementation on P4estMesh

* Add 3d positivity limiting for nonlinear variables

* Implement suggestions

* Reduce allowed allocs in test

* Increase allocs number again

* Add 3d TreeMesh implementation

* Adapt test

* Remove `TreeMesh`from `calc_bounds_two_sided_boundary!`

* fmt

* Fix test after fixing bug in last commit

* Update examples/tree_3d_dgsem/elixir_euler_sedov_blast_wave_sc_subcell.jl

* Adapt test numbers

* Remove doubled functions

* Fix elixir and add TreeMesh to `volume_integral_kernel!`

* Rename variable e to e_total
---
 .../elixir_euler_convergence_sc_subcell.jl    |  70 ++
 ...lixir_euler_sedov_blast_wave_sc_subcell.jl |   4 +-
 src/equations/ideal_glm_mhd_3d.jl             |   8 +-
 .../dgsem_tree/dg_3d_subcell_limiters.jl      | 596 ++++++++++++++++++
 src/solvers/dgsem_tree/subcell_limiters_3d.jl |  91 ++-
 test/test_tree_3d_euler.jl                    |   2 +-
 6 files changed, 763 insertions(+), 8 deletions(-)
 create mode 100644 examples/tree_3d_dgsem/elixir_euler_convergence_sc_subcell.jl

diff --git a/examples/tree_3d_dgsem/elixir_euler_convergence_sc_subcell.jl b/examples/tree_3d_dgsem/elixir_euler_convergence_sc_subcell.jl
new file mode 100644
index 00000000000..fea1afea654
--- /dev/null
+++ b/examples/tree_3d_dgsem/elixir_euler_convergence_sc_subcell.jl
@@ -0,0 +1,70 @@
+using OrdinaryDiffEqLowStorageRK
+using Trixi
+
+###############################################################################
+# semidiscretization of the compressible Euler equations
+
+equations = CompressibleEulerEquations3D(1.4)
+
+initial_condition = initial_condition_convergence_test
+
+surface_flux = flux_lax_friedrichs
+volume_flux = flux_ranocha
+polydeg = 3
+basis = LobattoLegendreBasis(polydeg)
+limiter_idp = SubcellLimiterIDP(equations, basis;
+                                positivity_variables_cons = ["rho"],
+                                positivity_variables_nonlinear = [pressure],
+                                local_twosided_variables_cons = [],
+                                local_onesided_variables_nonlinear = [],
+                                max_iterations_newton = 40, # Default parameters are not sufficient to fulfill bounds properly.
+                                newton_tolerances = (1.0e-14, 1.0e-15))
+volume_integral = VolumeIntegralSubcellLimiting(limiter_idp;
+                                                volume_flux_dg = volume_flux,
+                                                volume_flux_fv = surface_flux)
+solver = DGSEM(basis, surface_flux, volume_integral)
+
+coordinates_min = (0.0, 0.0, 0.0)
+coordinates_max = (2.0, 2.0, 2.0)
+mesh = TreeMesh(coordinates_min, coordinates_max,
+                initial_refinement_level = 2,
+                n_cells_max = 10_000,
+                periodicity = true)
+
+semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
+                                    source_terms = source_terms_convergence_test,
+                                    boundary_conditions = boundary_condition_periodic)
+
+###############################################################################
+# ODE solvers, callbacks etc.
+
+tspan = (0.0, 1.0)
+ode = semidiscretize(semi, tspan)
+
+summary_callback = SummaryCallback()
+
+analysis_interval = 100
+analysis_callback = AnalysisCallback(semi, interval = analysis_interval)
+
+alive_callback = AliveCallback(analysis_interval = analysis_interval)
+
+save_solution = SaveSolutionCallback(interval = 100,
+                                     save_initial_solution = true,
+                                     save_final_solution = true,
+                                     solution_variables = cons2prim)
+
+stepsize_callback = StepsizeCallback(cfl = 0.8)
+
+callbacks = CallbackSet(summary_callback,
+                        analysis_callback, alive_callback,
+                        save_solution,
+                        stepsize_callback)
+
+###############################################################################
+# run the simulation
+
+stage_callbacks = (SubcellLimiterIDPCorrection(), BoundsCheckCallback())
+
+sol = Trixi.solve(ode, Trixi.SimpleSSPRK33(stage_callbacks = stage_callbacks);
+                  dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback
+                  callback = callbacks);
diff --git a/examples/tree_3d_dgsem/elixir_euler_sedov_blast_wave_sc_subcell.jl b/examples/tree_3d_dgsem/elixir_euler_sedov_blast_wave_sc_subcell.jl
index ead0b003e81..61452ce3174 100644
--- a/examples/tree_3d_dgsem/elixir_euler_sedov_blast_wave_sc_subcell.jl
+++ b/examples/tree_3d_dgsem/elixir_euler_sedov_blast_wave_sc_subcell.jl
@@ -59,11 +59,11 @@ coordinates_max = (1.0, 1.0, 1.0)
 mesh = TreeMesh(coordinates_min, coordinates_max,
                 initial_refinement_level = 3,
                 n_cells_max = 100_000,
-                periodicity = true)
+                periodicity = false)
 
 # create the semi discretization object
 semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
-                                    boundary_conditions = boundary_condition_periodic)
+                                    boundary_conditions = BoundaryConditionDirichlet(initial_condition))
 
 ###############################################################################
 # ODE solvers, callbacks etc.
diff --git a/src/equations/ideal_glm_mhd_3d.jl b/src/equations/ideal_glm_mhd_3d.jl
index 2f31dbad9ea..4b8348f087e 100644
--- a/src/equations/ideal_glm_mhd_3d.jl
+++ b/src/equations/ideal_glm_mhd_3d.jl
@@ -395,8 +395,8 @@ compute the subcell fluxes in dg_3d_subcell_limiters.jl.
 @inline function (noncons_flux::FluxNonConservativePowellLocalSymmetric)(u_ll, u_rr,
                                                                          orientation::Integer,
                                                                          equations::IdealGlmMhdEquations3D)
-    rho_ll, rho_v1_ll, rho_v2_ll, rho_v3_ll, rho_e_ll, B1_ll, B2_ll, B3_ll, psi_ll = u_ll
-    rho_rr, rho_v1_rr, rho_v2_rr, rho_v3_rr, rho_e_rr, B1_rr, B2_rr, B3_rr, psi_rr = u_rr
+    rho_ll, rho_v1_ll, rho_v2_ll, rho_v3_ll, rho_e_total_ll, B1_ll, B2_ll, B3_ll, psi_ll = u_ll
+    rho_rr, rho_v1_rr, rho_v2_rr, rho_v3_rr, rho_e_total_rr, B1_rr, B2_rr, B3_rr, psi_rr = u_rr
 
     v1_ll = rho_v1_ll / rho_ll
     v2_ll = rho_v2_ll / rho_ll
@@ -631,8 +631,8 @@ This function is used to compute the subcell fluxes in dg_3d_subcell_limiters.jl
                                                                          equations::IdealGlmMhdEquations3D,
                                                                          nonconservative_type::NonConservativeSymmetric,
                                                                          nonconservative_term::Integer)
-    rho_ll, rho_v1_ll, rho_v2_ll, rho_v3_ll, rho_e_ll, B1_ll, B2_ll, B3_ll, psi_ll = u_ll
-    rho_rr, rho_v1_rr, rho_v2_rr, rho_v3_rr, rho_e_rr, B1_rr, B2_rr, B3_rr, psi_rr = u_rr
+    rho_ll, rho_v1_ll, rho_v2_ll, rho_v3_ll, rho_e_total_ll, B1_ll, B2_ll, B3_ll, psi_ll = u_ll
+    rho_rr, rho_v1_rr, rho_v2_rr, rho_v3_rr, rho_e_total_rr, B1_rr, B2_rr, B3_rr, psi_rr = u_rr
 
     if nonconservative_term == 1
         # Powell nonconservative term:   (0, B_1, B_2, B_3, v⋅B, v_1, v_2, v_3, 0)
diff --git a/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl b/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
index 9154bed5c58..d7863d75c22 100644
--- a/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
+++ b/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl
@@ -106,4 +106,600 @@
 
     return nothing
 end
+
+# Calculate the DG staggered volume fluxes `fhat` in subcell FV-form inside the element
+# (**with non-conservative terms in "local * symmetric" form**).
+#
+# See also `flux_differencing_kernel!`.
+#
+# The calculation of the non-conservative staggered "fluxes" requires non-conservative
+# terms that can be written as a product of local and a symmetric contributions. See, e.g.,
+#
+# - Rueda-Ramírez, Gassner (2023). A Flux-Differencing Formula for Split-Form Summation By Parts
+#   Discretizations of Non-Conservative Systems. https://arxiv.org/pdf/2211.14009.pdf.
+#
+@inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R,
+                                u, mesh::TreeMesh{2},
+                                have_nonconservative_terms::True, equations,
+                                volume_flux::Tuple{F_CONS, F_NONCONS}, dg::DGSEM,
+                                element,
+                                cache) where {
+                                              F_CONS <: Function,
+                                              F_NONCONS <:
+                                              FluxNonConservative{NonConservativeSymmetric()}
+                                              }
+    @unpack weights, derivative_split = dg.basis
+    @unpack flux_temp_threaded, flux_nonconservative_temp_threaded = cache
+    @unpack fhat_temp_threaded, fhat_nonconservative_temp_threaded, phi_threaded = cache
+
+    volume_flux_cons, volume_flux_noncons = volume_flux
+
+    flux_temp = flux_temp_threaded[Threads.threadid()]
+    flux_noncons_temp = flux_nonconservative_temp_threaded[Threads.threadid()]
+
+    fhat_temp = fhat_temp_threaded[Threads.threadid()]
+    fhat_noncons_temp = fhat_nonconservative_temp_threaded[Threads.threadid()]
+    phi = phi_threaded[Threads.threadid()]
+
+    # The FV-form fluxes are calculated in a recursive manner, i.e.:
+    # fhat_(0,1)   = w_0 * FVol_0,
+    # fhat_(j,j+1) = fhat_(j-1,j) + w_j * FVol_j,   for j=1,...,N-1,
+    # with the split form volume fluxes FVol_j = -2 * sum_i=0^N D_ji f*_(j,i).
+
+    # To use the symmetry of the `volume_flux`, the split form volume flux is precalculated
+    # like in `calc_volume_integral!` for the `VolumeIntegralFluxDifferencing`
+    # and saved in in `flux_temp`.
+
+    # Split form volume flux in orientation 1: x direction
+    flux_temp .= zero(eltype(flux_temp))
+    flux_noncons_temp .= zero(eltype(flux_noncons_temp))
+
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, k, element)
+
+        # All diagonal entries of `derivative_split` are zero. Thus, we can skip
+        # the computation of the diagonal terms. In addition, we use the symmetry
+        # of `volume_flux_cons` and `volume_flux_noncons` to save half of the possible two-point flux
+        # computations.
+        for ii in (i + 1):nnodes(dg)
+            u_node_ii = get_node_vars(u, equations, dg, ii, j, k, element)
+            flux1 = volume_flux_cons(u_node, u_node_ii, 1, equations)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[i, ii], flux1,
+                                       equations, dg, i, j, k)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[ii, i], flux1,
+                                       equations, dg, ii, j, k)
+            for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+                # We multiply by 0.5 because that is done in other parts of Trixi
+                flux1_noncons = volume_flux_noncons(u_node, u_node_ii, 1, equations,
+                                                    NonConservativeSymmetric(), noncons)
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           0.5f0 * derivative_split[i, ii],
+                                           flux1_noncons,
+                                           equations, dg, noncons, i, j, k)
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           0.5f0 * derivative_split[ii, i],
+                                           flux1_noncons,
+                                           equations, dg, noncons, ii, j, k)
+            end
+        end
+    end
+
+    # FV-form flux `fhat` in x direction
+    fhat_temp[:, 1, :, :] .= zero(eltype(fhat1_L))
+    fhat_noncons_temp[:, :, 1, :, :] .= zero(eltype(fhat1_L))
+
+    # Compute local contribution to non-conservative flux
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_local = get_node_vars(u, equations, dg, i, j, k, element)
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+            set_node_vars!(phi,
+                           volume_flux_noncons(u_local, 1, equations,
+                                               NonConservativeLocal(), noncons),
+                           equations, dg, noncons, i, j, k)
+        end
+    end
+
+    for k in eachnode(dg), j in eachnode(dg), i in 1:(nnodes(dg) - 1)
+        # Conservative part
+        for v in eachvariable(equations)
+            value = fhat_temp[v, i, j, k] + weights[i] * flux_temp[v, i, j, k]
+            fhat_temp[v, i + 1, j, k] = value
+            fhat1_L[v, i + 1, j, k] = value
+            fhat1_R[v, i + 1, j, k] = value
+        end
+        # Nonconservative part
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons),
+            v in eachvariable(equations)
+
+            value = fhat_noncons_temp[v, noncons, i, j, k] +
+                    weights[i] * flux_noncons_temp[v, noncons, i, j, k]
+            fhat_noncons_temp[v, noncons, i + 1, j, k] = value
+
+            fhat1_L[v, i + 1, j, k] = fhat1_L[v, i + 1, j, k] +
+                                      phi[v, noncons, i, j, k] * value
+            fhat1_R[v, i + 1, j, k] = fhat1_R[v, i + 1, j, k] +
+                                      phi[v, noncons, i + 1, j, k] * value
+        end
+    end
+
+    # Split form volume flux in orientation 2: y direction
+    flux_temp .= zero(eltype(flux_temp))
+    flux_noncons_temp .= zero(eltype(flux_noncons_temp))
+
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, k, element)
+        for jj in (j + 1):nnodes(dg)
+            u_node_jj = get_node_vars(u, equations, dg, i, jj, k, element)
+            flux2 = volume_flux_cons(u_node, u_node_jj, 2, equations)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[j, jj], flux2,
+                                       equations, dg, i, j, k)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[jj, j], flux2,
+                                       equations, dg, i, jj, k)
+            for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+                # We multiply by 0.5 because that is done in other parts of Trixi
+                flux2_noncons = volume_flux_noncons(u_node, u_node_jj, 2, equations,
+                                                    NonConservativeSymmetric(), noncons)
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           0.5 * derivative_split[j, jj],
+                                           flux2_noncons,
+                                           equations, dg, noncons, i, j, k)
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           0.5 * derivative_split[jj, j],
+                                           flux2_noncons,
+                                           equations, dg, noncons, i, jj, k)
+            end
+        end
+    end
+
+    # FV-form flux `fhat` in y direction
+    fhat_temp[:, :, 1, :] .= zero(eltype(fhat1_L))
+    fhat_noncons_temp[:, :, :, 1, :] .= zero(eltype(fhat1_L))
+
+    # Compute local contribution to non-conservative flux
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_local = get_node_vars(u, equations, dg, i, j, k, element)
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+            set_node_vars!(phi,
+                           volume_flux_noncons(u_local, 2, equations,
+                                               NonConservativeLocal(), noncons),
+                           equations, dg, noncons, i, j, k)
+        end
+    end
+
+    for k in eachnode(dg), j in 1:(nnodes(dg) - 1), i in eachnode(dg)
+        # Conservative part
+        for v in eachvariable(equations)
+            value = fhat_temp[v, i, j, k] + weights[j] * flux_temp[v, i, j, k]
+            fhat_temp[v, i, j + 1, k] = value
+            fhat2_L[v, i, j + 1, k] = value
+            fhat2_R[v, i, j + 1, k] = value
+        end
+        # Nonconservative part
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons),
+            v in eachvariable(equations)
+
+            value = fhat_noncons_temp[v, noncons, i, j, k] +
+                    weights[j] * flux_noncons_temp[v, noncons, i, j, k]
+            fhat_noncons_temp[v, noncons, i, j + 1, k] = value
+
+            fhat2_L[v, i, j + 1, k] = fhat2_L[v, i, j + 1, k] +
+                                      phi[v, noncons, i, j, k] * value
+            fhat2_R[v, i, j + 1, k] = fhat2_R[v, i, j + 1, k] +
+                                      phi[v, noncons, i, j + 1, k] * value
+        end
+    end
+
+    # Split form volume flux in orientation 3: z direction
+    flux_temp .= zero(eltype(flux_temp))
+    flux_noncons_temp .= zero(eltype(flux_noncons_temp))
+
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, k, element)
+        for kk in (k + 1):nnodes(dg)
+            u_node_kk = get_node_vars(u, equations, dg, i, j, kk, element)
+            flux3 = volume_flux_cons(u_node, u_node_kk, 3, equations)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[k, kk], flux3,
+                                       equations, dg, i, j, k)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[kk, k], flux3,
+                                       equations, dg, i, j, kk)
+            for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+                # We multiply by 0.5 because that is done in other parts of Trixi
+                flux3_noncons = volume_flux_noncons(u_node, u_node_kk, 3, equations,
+                                                    NonConservativeSymmetric(), noncons)
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           0.5 * derivative_split[k, kk],
+                                           flux3_noncons,
+                                           equations, dg, noncons, i, j, k)
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           0.5 * derivative_split[kk, k],
+                                           flux3_noncons,
+                                           equations, dg, noncons, i, j, kk)
+            end
+        end
+    end
+
+    # FV-form flux `fhat` in z direction
+    fhat_temp[:, :, :, 1] .= zero(eltype(fhat1_L))
+    fhat_noncons_temp[:, :, :, :, 1] .= zero(eltype(fhat1_L))
+
+    # Compute local contribution to non-conservative flux
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_local = get_node_vars(u, equations, dg, i, j, k, element)
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+            set_node_vars!(phi,
+                           volume_flux_noncons(u_local, 3, equations,
+                                               NonConservativeLocal(), noncons),
+                           equations, dg, noncons, i, j, k)
+        end
+    end
+
+    for k in 1:(nnodes(dg) - 1), j in eachnode(dg), i in eachnode(dg)
+        # Conservative part
+        for v in eachvariable(equations)
+            value = fhat_temp[v, i, j, k] + weights[k] * flux_temp[v, i, j, k]
+            fhat_temp[v, i, j, k + 1] = value
+            fhat3_L[v, i, j, k + 1] = value
+            fhat3_R[v, i, j, k + 1] = value
+        end
+        # Nonconservative part
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons),
+            v in eachvariable(equations)
+
+            value = fhat_noncons_temp[v, noncons, i, j, k] +
+                    weights[k] * flux_noncons_temp[v, noncons, i, j, k]
+            fhat_noncons_temp[v, noncons, i, j, k + 1] = value
+
+            fhat3_L[v, i, j, k + 1] = fhat3_L[v, i, j, k + 1] +
+                                      phi[v, noncons, i, j, k] * value
+            fhat3_R[v, i, j, k + 1] = fhat3_R[v, i, j, k + 1] +
+                                      phi[v, noncons, i, j, k + 1] * value
+        end
+    end
+
+    return nothing
+end
+
+# Calculate the DG staggered volume fluxes `fhat` in subcell FV-form inside the element
+# (**with non-conservative terms in "local * jump" form**).
+#
+# See also `flux_differencing_kernel!`.
+#
+# The calculation of the non-conservative staggered "fluxes" requires non-conservative
+# terms that can be written as a product of local and jump contributions.
+@inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R,
+                                u, mesh::TreeMesh{2},
+                                nonconservative_terms::True, equations,
+                                volume_flux::Tuple{F_CONS, F_NONCONS}, dg::DGSEM,
+                                element,
+                                cache) where {
+                                              F_CONS <: Function,
+                                              F_NONCONS <:
+                                              FluxNonConservative{NonConservativeJump()}
+                                              }
+    @unpack weights, derivative_split = dg.basis
+    @unpack flux_temp_threaded, flux_nonconservative_temp_threaded = cache
+    @unpack fhat_temp_threaded, fhat_nonconservative_temp_threaded, phi_threaded = cache
+
+    volume_flux_cons, volume_flux_noncons = volume_flux
+
+    flux_temp = flux_temp_threaded[Threads.threadid()]
+    flux_noncons_temp = flux_nonconservative_temp_threaded[Threads.threadid()]
+
+    fhat_temp = fhat_temp_threaded[Threads.threadid()]
+    fhat_noncons_temp = fhat_nonconservative_temp_threaded[Threads.threadid()]
+    phi = phi_threaded[Threads.threadid()]
+
+    # The FV-form fluxes are calculated in a recursive manner, i.e.:
+    # fhat_(0,1)   = w_0 * FVol_0,
+    # fhat_(j,j+1) = fhat_(j-1,j) + w_j * FVol_j,   for j=1,...,N-1,
+    # with the split form volume fluxes FVol_j = -2 * sum_i=0^N D_ji f*_(j,i).
+
+    # To use the symmetry of the `volume_flux`, the split form volume flux is precalculated
+    # like in `calc_volume_integral!` for the `VolumeIntegralFluxDifferencing`
+    # and saved in in `flux_temp`.
+
+    # Split form volume flux in orientation 1: x direction
+    flux_temp .= zero(eltype(flux_temp))
+    flux_noncons_temp .= zero(eltype(flux_noncons_temp))
+
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, k, element)
+
+        # All diagonal entries of `derivative_split` are zero. Thus, we can skip
+        # the computation of the diagonal terms. In addition, we use the symmetry
+        # of `volume_flux_cons` and skew-symmetry of `volume_flux_noncons` to save half of the possible two-point flux
+        # computations.
+        for ii in (i + 1):nnodes(dg)
+            u_node_ii = get_node_vars(u, equations, dg, ii, j, k, element)
+            flux1 = volume_flux_cons(u_node, u_node_ii, 1, equations)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[i, ii], flux1,
+                                       equations, dg, i, j, k)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[ii, i], flux1,
+                                       equations, dg, ii, j, k)
+            for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+                # We multiply by 0.5 because that is done in other parts of Trixi
+                flux1_noncons = volume_flux_noncons(u_node, u_node_ii, 1, equations,
+                                                    NonConservativeJump(),
+                                                    noncons)
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           0.5f0 * derivative_split[i, ii],
+                                           flux1_noncons,
+                                           equations, dg, noncons, i, j, k)
+                # Note the sign flip due to skew-symmetry when argument order is swapped
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           -0.5f0 * derivative_split[ii, i],
+                                           flux1_noncons,
+                                           equations, dg, noncons, ii, j, k)
+            end
+        end
+    end
+
+    # FV-form flux `fhat` in x direction
+    fhat_temp[:, 1, :, :] .= zero(eltype(fhat1_L))
+    fhat_noncons_temp[:, :, 1, :, :] .= zero(eltype(fhat1_L))
+
+    # Compute local contribution to non-conservative flux
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_local = get_node_vars(u, equations, dg, i, j, k, element)
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+            set_node_vars!(phi,
+                           volume_flux_noncons(u_local, 1, equations,
+                                               NonConservativeLocal(), noncons),
+                           equations, dg, noncons, i, j, k)
+        end
+    end
+
+    for k in eachnode(dg), j in eachnode(dg), i in 1:(nnodes(dg) - 1)
+        # Conservative part
+        for v in eachvariable(equations)
+            value = fhat_temp[v, i, j, k] + weights[i] * flux_temp[v, i, j, k]
+            fhat_temp[v, i + 1, j, k] = value
+            fhat1_L[v, i + 1, j, k] = value
+            fhat1_R[v, i + 1, j, k] = value
+        end
+        # Nonconservative part
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons),
+            v in eachvariable(equations)
+
+            value = fhat_noncons_temp[v, noncons, i, j, k] +
+                    weights[i] * flux_noncons_temp[v, noncons, i, j, k]
+            fhat_noncons_temp[v, noncons, i + 1, j, k] = value
+
+            fhat1_L[v, i + 1, j, k] = fhat1_L[v, i + 1, j, k] +
+                                      phi[v, noncons, i, j, k] * value
+            fhat1_R[v, i + 1, j, k] = fhat1_R[v, i + 1, j, k] +
+                                      phi[v, noncons, i + 1, j, k] * value
+        end
+    end
+
+    # Apply correction term to the flux-differencing formula for nonconservative local * jump fluxes.
+    for k in eachnode(dg), j in eachnode(dg)
+        u_0 = get_node_vars(u, equations, dg, 1, j, k, element)
+        for i in 2:(nnodes(dg) - 1)
+            u_i = get_node_vars(u, equations, dg, i, j, k, element)
+            for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+                phi_jump = volume_flux_noncons(u_0, u_i, 1, equations,
+                                               NonConservativeJump(), noncons)
+
+                for v in eachvariable(equations)
+                    # The factor of 2 is missing on each term because Trixi multiplies all the non-cons terms with 0.5
+                    fhat1_R[v, i, j, k] -= phi[v, noncons, i, j, k] * phi_jump[v]
+                    fhat1_L[v, i + 1, j, k] -= phi[v, noncons, i, j, k] * phi_jump[v]
+                end
+            end
+        end
+        u_N = get_node_vars(u, equations, dg, nnodes(dg), j, k, element)
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+            phi_jump = volume_flux_noncons(u_0, u_N, 1, equations,
+                                           NonConservativeJump(), noncons)
+
+            for v in eachvariable(equations)
+                # The factor of 2 is missing because Trixi multiplies all the non-cons terms with 0.5
+                fhat1_R[v, nnodes(dg), j, k] -= phi[v, noncons, nnodes(dg), j, k] *
+                                                phi_jump[v]
+            end
+        end
+    end
+
+    ########
+
+    # Split form volume flux in orientation 2: y direction
+    flux_temp .= zero(eltype(flux_temp))
+    flux_noncons_temp .= zero(eltype(flux_noncons_temp))
+
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, k, element)
+        for jj in (j + 1):nnodes(dg)
+            u_node_jj = get_node_vars(u, equations, dg, i, jj, k, element)
+            flux2 = volume_flux_cons(u_node, u_node_jj, 2, equations)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[j, jj], flux2,
+                                       equations, dg, i, j, k)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[jj, j], flux2,
+                                       equations, dg, i, jj, k)
+            for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+                # We multiply by 0.5 because that is done in other parts of Trixi
+                flux2_noncons = volume_flux_noncons(u_node, u_node_jj, 2, equations,
+                                                    NonConservativeJump(),
+                                                    noncons)
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           0.5 * derivative_split[j, jj],
+                                           flux2_noncons,
+                                           equations, dg, noncons, i, j, k)
+                # Note the sign flip due to skew-symmetry when argument order is swapped
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           -0.5 * derivative_split[jj, j],
+                                           flux2_noncons,
+                                           equations, dg, noncons, i, jj, k)
+            end
+        end
+    end
+
+    # FV-form flux `fhat` in y direction
+    fhat_temp[:, :, 1, :] .= zero(eltype(fhat1_L))
+    fhat_noncons_temp[:, :, :, 1, :] .= zero(eltype(fhat1_L))
+
+    # Compute local contribution to non-conservative flux
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_local = get_node_vars(u, equations, dg, i, j, k, element)
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+            set_node_vars!(phi,
+                           volume_flux_noncons(u_local, 2, equations,
+                                               NonConservativeLocal(), noncons),
+                           equations, dg, noncons, i, j, k)
+        end
+    end
+
+    for k in eachnode(dg), j in 1:(nnodes(dg) - 1), i in eachnode(dg)
+        # Conservative part
+        for v in eachvariable(equations)
+            value = fhat_temp[v, i, j, k] + weights[j] * flux_temp[v, i, j, k]
+            fhat_temp[v, i, j + 1, k] = value
+            fhat2_L[v, i, j + 1, k] = value
+            fhat2_R[v, i, j + 1, k] = value
+        end
+        # Nonconservative part
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons),
+            v in eachvariable(equations)
+
+            value = fhat_noncons_temp[v, noncons, i, j, k] +
+                    weights[j] * flux_noncons_temp[v, noncons, i, j, k]
+            fhat_noncons_temp[v, noncons, i, j + 1, k] = value
+
+            fhat2_L[v, i, j + 1, k] = fhat2_L[v, i, j + 1, k] +
+                                      phi[v, noncons, i, j, k] * value
+            fhat2_R[v, i, j + 1, k] = fhat2_R[v, i, j + 1, k] +
+                                      phi[v, noncons, i, j + 1, k] * value
+        end
+    end
+
+    # Apply correction term to the flux-differencing formula for nonconservative local * jump fluxes.
+    for k in eachnode(dg), i in eachnode(dg)
+        u_0 = get_node_vars(u, equations, dg, i, 1, k, element)
+        for j in 2:(nnodes(dg) - 1)
+            u_j = get_node_vars(u, equations, dg, i, j, k, element)
+            for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+                phi_jump = volume_flux_noncons(u_0, u_j, 2, equations,
+                                               NonConservativeJump(), noncons)
+
+                for v in eachvariable(equations)
+                    # The factor of 2 is missing on each term because Trixi multiplies all the non-cons terms with 0.5
+                    fhat2_R[v, i, j, k] -= phi[v, noncons, i, j, k] * phi_jump[v]
+                    fhat2_L[v, i, j + 1, k] -= phi[v, noncons, i, j, k] * phi_jump[v]
+                end
+            end
+        end
+        u_N = get_node_vars(u, equations, dg, i, nnodes(dg), k, element)
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+            phi_jump = volume_flux_noncons(u_0, u_N, 2, equations,
+                                           NonConservativeJump(), noncons)
+
+            for v in eachvariable(equations)
+                # The factor of 2 is missing cause Trixi multiplies all the non-cons terms with 0.5
+                fhat2_R[v, i, nnodes(dg), k] -= phi[v, noncons, i, nnodes(dg), k] *
+                                                phi_jump[v]
+            end
+        end
+    end
+
+    ########
+
+    # Split form volume flux in orientation 3: z direction
+    flux_temp .= zero(eltype(flux_temp))
+    flux_noncons_temp .= zero(eltype(flux_noncons_temp))
+
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_node = get_node_vars(u, equations, dg, i, j, k, element)
+        for kk in (k + 1):nnodes(dg)
+            u_node_kk = get_node_vars(u, equations, dg, i, j, kk, element)
+            flux3 = volume_flux_cons(u_node, u_node_kk, 3, equations)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[k, kk], flux3,
+                                       equations, dg, i, j, k)
+            multiply_add_to_node_vars!(flux_temp, derivative_split[kk, k], flux3,
+                                       equations, dg, i, j, kk)
+            for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+                # We multiply by 0.5 because that is done in other parts of Trixi
+                flux3_noncons = volume_flux_noncons(u_node, u_node_kk, 3, equations,
+                                                    NonConservativeJump(),
+                                                    noncons)
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           0.5 * derivative_split[k, kk],
+                                           flux3_noncons,
+                                           equations, dg, noncons, i, j, k)
+                # Note the sign flip due to skew-symmetry when argument order is swapped
+                multiply_add_to_node_vars!(flux_noncons_temp,
+                                           -0.5 * derivative_split[kk, k],
+                                           flux3_noncons,
+                                           equations, dg, noncons, i, j, kk)
+            end
+        end
+    end
+
+    # FV-form flux `fhat` in y direction
+    fhat_temp[:, :, :, 1] .= zero(eltype(fhat1_L))
+    fhat_noncons_temp[:, :, :, :, 1] .= zero(eltype(fhat1_L))
+
+    # Compute local contribution to non-conservative flux
+    for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+        u_local = get_node_vars(u, equations, dg, i, j, k, element)
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+            set_node_vars!(phi,
+                           volume_flux_noncons(u_local, 3, equations,
+                                               NonConservativeLocal(), noncons),
+                           equations, dg, noncons, i, j, k)
+        end
+    end
+
+    for k in 1:(nnodes(dg) - 1), j in eachnode(dg), i in eachnode(dg)
+        # Conservative part
+        for v in eachvariable(equations)
+            value = fhat_temp[v, i, j, k] + weights[k] * flux_temp[v, i, j, k]
+            fhat_temp[v, i, j, k + 1] = value
+            fhat3_L[v, i, j, k + 1] = value
+            fhat3_R[v, i, j, k + 1] = value
+        end
+        # Nonconservative part
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons),
+            v in eachvariable(equations)
+
+            value = fhat_noncons_temp[v, noncons, i, j, k] +
+                    weights[k] * flux_noncons_temp[v, noncons, i, j, k]
+            fhat_noncons_temp[v, noncons, i, j, k + 1] = value
+
+            fhat3_L[v, i, j, k + 1] = fhat3_L[v, i, j, k + 1] +
+                                      phi[v, noncons, i, j, k] * value
+            fhat3_R[v, i, j, k + 1] = fhat3_R[v, i, j, k + 1] +
+                                      phi[v, noncons, i, j, k + 1] * value
+        end
+    end
+
+    # Apply correction term to the flux-differencing formula for nonconservative local * jump fluxes.
+    for j in eachnode(dg), i in eachnode(dg)
+        u_0 = get_node_vars(u, equations, dg, i, j, 1, element)
+        for k in 2:(nnodes(dg) - 1)
+            u_k = get_node_vars(u, equations, dg, i, j, k, element)
+            for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+                phi_jump = volume_flux_noncons(u_0, u_k, 2, equations,
+                                               NonConservativeJump(), noncons)
+
+                for v in eachvariable(equations)
+                    # The factor of 2 is missing on each term because Trixi multiplies all the non-cons terms with 0.5
+                    fhat3_R[v, i, j, k] -= phi[v, noncons, i, j, k] * phi_jump[v]
+                    fhat3_L[v, i, j, k + 1] -= phi[v, noncons, i, j, k] * phi_jump[v]
+                end
+            end
+        end
+        u_N = get_node_vars(u, equations, dg, i, j, nnodes(dg), element)
+        for noncons in 1:n_nonconservative_terms(volume_flux_noncons)
+            phi_jump = volume_flux_noncons(u_0, u_N, 3, equations,
+                                           NonConservativeJump(), noncons)
+
+            for v in eachvariable(equations)
+                # The factor of 2 is missing cause Trixi multiplies all the non-cons terms with 0.5
+                fhat3_R[v, i, j, nnodes(dg)] -= phi[v, noncons, i, j, nnodes(dg)] *
+                                                phi_jump[v]
+            end
+        end
+    end
+    return nothing
+end
 end # @muladd
diff --git a/src/solvers/dgsem_tree/subcell_limiters_3d.jl b/src/solvers/dgsem_tree/subcell_limiters_3d.jl
index 4f8328af493..d9a80609ba2 100644
--- a/src/solvers/dgsem_tree/subcell_limiters_3d.jl
+++ b/src/solvers/dgsem_tree/subcell_limiters_3d.jl
@@ -13,7 +13,7 @@
                                                  u, t, semi, mesh::TreeMesh3D,
                                                  equations)
     _, _, dg, cache = mesh_equations_solver_cache(semi)
-
+    (; boundary_conditions) = semi
     # Calc bounds at interfaces and periodic boundaries
     for interface in eachinterface(dg, cache)
         # Get neighboring element ids
@@ -54,12 +54,57 @@
         end
     end
 
+    # Calc bounds at physical boundaries
+    for boundary in eachboundary(dg, cache)
+        element = cache.boundaries.neighbor_ids[boundary]
+        orientation = cache.boundaries.orientations[boundary]
+        neighbor_side = cache.boundaries.neighbor_sides[boundary]
+
+        for j in eachnode(dg), i in eachnode(dg)
+            # Define node indices and boundary index based on the orientation and neighbor_side
+            if neighbor_side == 2 # Element is on the right, boundary on the left
+                if orientation == 1 # boundary in x-direction
+                    index = (1, i, j)
+                    boundary_index = 1
+                elseif orientation == 2 # boundary in y-direction
+                    index = (i, 1, j)
+                    boundary_index = 3
+                else # orientation == 3 # boundary in z-direction
+                    index = (i, j, 1)
+                    boundary_index = 5
+                end
+            else # Element is on the left, boundary on the right
+                if orientation == 1 # boundary in x-direction
+                    index = (nnodes(dg), i, j)
+                    boundary_index = 2
+                elseif orientation == 2 # boundary in y-direction
+                    index = (i, nnodes(dg), j)
+                    boundary_index = 4
+                else # orientation == 3 # boundary in z-direction
+                    index = (i, j, nnodes(dg))
+                    boundary_index = 6
+                end
+            end
+            u_inner = get_node_vars(u, equations, dg, index..., element)
+            u_outer = get_boundary_outer_state(u_inner, t,
+                                               boundary_conditions[boundary_index],
+                                               orientation, boundary_index,
+                                               mesh, equations, dg, cache,
+                                               index..., element)
+            var_outer = u_outer[variable]
+
+            var_min[index..., element] = min(var_min[index..., element], var_outer)
+            var_max[index..., element] = max(var_max[index..., element], var_outer)
+        end
+    end
+
     return nothing
 end
 
 @inline function calc_bounds_onesided_interface!(var_minmax, min_or_max, variable, u, t,
                                                  semi, mesh::TreeMesh{3})
     _, equations, dg, cache = mesh_equations_solver_cache(semi)
+    (; boundary_conditions) = semi
 
     # Calc bounds at interfaces and periodic boundaries
     for interface in eachinterface(dg, cache)
@@ -100,6 +145,50 @@ end
         end
     end
 
+    # Calc bounds at physical boundaries
+    for boundary in eachboundary(dg, cache)
+        element = cache.boundaries.neighbor_ids[boundary]
+        orientation = cache.boundaries.orientations[boundary]
+        neighbor_side = cache.boundaries.neighbor_sides[boundary]
+
+        for j in eachnode(dg), i in eachnode(dg)
+            # Define node indices and boundary index based on the orientation and neighbor_side
+            if neighbor_side == 2 # Element is on the right, boundary on the left
+                if orientation == 1 # boundary in x-direction
+                    index = (1, i, j)
+                    boundary_index = 1
+                elseif orientation == 2 # boundary in y-direction
+                    index = (i, 1, j)
+                    boundary_index = 3
+                else # orientation == 3 # boundary in z-direction
+                    index = (i, j, 1)
+                    boundary_index = 5
+                end
+            else # Element is on the left, boundary on the right
+                if orientation == 1 # boundary in x-direction
+                    index = (nnodes(dg), i, j)
+                    boundary_index = 2
+                elseif orientation == 2 # boundary in y-direction
+                    index = (i, nnodes(dg), j)
+                    boundary_index = 4
+                else # orientation == 3 # boundary in z-direction
+                    index = (i, j, nnodes(dg))
+                    boundary_index = 6
+                end
+            end
+            u_inner = get_node_vars(u, equations, dg, index..., element)
+            u_outer = get_boundary_outer_state(u_inner, t,
+                                               boundary_conditions[boundary_index],
+                                               orientation, boundary_index,
+                                               mesh, equations, dg, cache,
+                                               index..., element)
+            var_outer = variable(u_outer, equations)
+
+            var_minmax[index..., element] = min_or_max(var_minmax[index..., element],
+                                                       var_outer)
+        end
+    end
+
     return nothing
 end
 end # @muladd
diff --git a/test/test_tree_3d_euler.jl b/test/test_tree_3d_euler.jl
index 17866e74f47..ab0d0925dcd 100644
--- a/test/test_tree_3d_euler.jl
+++ b/test/test_tree_3d_euler.jl
@@ -30,7 +30,7 @@ EXAMPLES_DIR = joinpath(examples_dir(), "tree_3d_dgsem")
     # (e.g., from type instabilities)
     @test_allocations(Trixi.rhs!, semi, sol, 1000)
     # Extra test to make sure the "TimeSeriesCallback" made correct data.
-    # Extracts data at all points from the first step of the time series and compares it to the 
+    # Extracts data at all points from the first step of the time series and compares it to the
     # exact solution and an interpolated reference solution
     point_data = [getindex(time_series.affect!.point_data[i], 1:5) for i in 1:3]
     exact_data = [initial_condition_convergence_test(time_series.affect!.point_coordinates[:,