diff --git a/.github/dependabot.yml b/.github/dependabot.yml index f1c07345901..d5c4abf01e7 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -10,6 +10,3 @@ updates: all-github-actions: patterns: - "*" - ignore: - - dependency-name: "julia-actions/julia-downgrade-compat" - versions: [ ">=2.0.0" ] diff --git a/.github/workflows/Downgrade.yml b/.github/workflows/Downgrade.yml index fc13bc61d05..f3e50aa5d36 100644 --- a/.github/workflows/Downgrade.yml +++ b/.github/workflows/Downgrade.yml @@ -66,7 +66,8 @@ jobs: # - performance_specializations_part1 # - performance_specializations_part2 # - mpi - - threaded + # - threaded + - downgrade steps: - uses: actions/checkout@v6 - uses: julia-actions/setup-julia@v2 @@ -75,17 +76,22 @@ jobs: arch: ${{ matrix.arch }} - run: julia -e 'using InteractiveUtils; versioninfo(verbose=true)' - uses: julia-actions/cache@v2 - - uses: julia-actions/julia-downgrade-compat@v1 + - uses: julia-actions/julia-downgrade-compat@v2 with: skip: LinearAlgebra,Printf,SparseArrays,UUIDs,DelimitedFiles,Test,Downloads,Random projects: ., test - - uses: julia-actions/julia-buildpkg@v1 + mode: forcedeps + # We run the tests manually instead of using julia-actions/julia-buildpkg and julia-actions/julia-runtest or `Pkg.test` + # because otherwise the downgraded Manifest.toml is not used in the tests under julia ", "Gregor Gassner ", "Hendrik Ranocha ", "Andrew R. 
Winters ", "Jesse Chan ", "Andrés Rueda-Ramírez "] [deps] @@ -33,7 +33,6 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" -Requires = "ae029012-a4dd-5104-9daa-d747884805df" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" SimpleUnPack = "ce78b400-467f-4804-87d8-8f486da07d0a" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" @@ -59,6 +58,7 @@ Convex = "f65535da-76fb-5f13-bab9-19810c17039a" ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" [extensions] @@ -66,56 +66,57 @@ TrixiCUDAExt = "CUDA" TrixiConvexECOSExt = ["Convex", "ECOS"] TrixiMakieExt = "Makie" TrixiNLsolveExt = "NLsolve" +TrixiPlotsExt = "Plots" TrixiSparseConnectivityTracerExt = "SparseConnectivityTracer" [compat] -Accessors = "0.1.36" -Adapt = "4.1" +Accessors = "0.1.42" +Adapt = "4.3" CUDA = "5.8.2" CodeTracking = "1.0.5, 2, 3" -ConstructionBase = "1.5" +ConstructionBase = "1.5.8" Convex = "0.16" -DataStructures = "0.18.15, 0.19" +DataStructures = "0.19" DelimitedFiles = "1" -DiffEqBase = "6.174" -DiffEqCallbacks = "2.35, 3, 4" +DiffEqBase = "6.194" +DiffEqCallbacks = "4.9" Downloads = "1.6" ECOS = "1.1.2" EllipsisNotation = "1.0" -FillArrays = "1.9" -ForwardDiff = "0.10.36, 1" -HDF5 = "0.16.10, 0.17" -KernelAbstractions = "0.9.36" +FillArrays = "1.13" +ForwardDiff = "0.10.38, 1" +HDF5 = "0.17" +KernelAbstractions = "0.9.38" LinearAlgebra = "1" LinearMaps = "2.7, 3.0" LoopVectorization = "0.12.171" MPI = "0.20.22" -Makie = "0.21, 0.22, 0.23, 0.24" +Makie = "0.22, 0.23, 0.24" MuladdMacro = "0.2.4" NLsolve = "4.5.1" Octavian = "0.3.28" OffsetArrays = "1.13" P4est = "0.4.12" +Plots = "1.38.13" Polyester = "=0.7.16, 0.7.18" 
-PrecompileTools = "1.2" -Preferences = "1.4" +PrecompileTools = "1.2.1" +Preferences = "1.5" Printf = "1" RecipesBase = "1.3.4" -RecursiveArrayTools = "3.31.1" -Reexport = "1.2" -Requires = "1.3" -SciMLBase = "2.92.0" +RecursiveArrayTools = "3.37" +Reexport = "1.2.2" +SciMLBase = "2.141.0" SimpleUnPack = "1.1" SparseArrays = "1" SparseConnectivityTracer = "1.0.1" StableRNGs = "1.0.2" StartUpDG = "1.1.5" -Static = "1.1.1" -StaticArrayInterface = "1.5.1" -StaticArrays = "1.9" +Static = "1.3" +StaticArrayInterface = "1.8" +StaticArrays = "1.9.8" StrideArrays = "0.1.29" -StructArrays = "0.6.20, 0.7" -SummationByPartsOperators = "0.5.52" +StructArrays = "0.7" +SummationByPartsOperators = "0.5.72" T8code = "0.7.4" TimerOutputs = "0.5.25" Triangulate = "2.2, 3" diff --git a/benchmark/CUDA/Project.toml b/benchmark/CUDA/Project.toml new file mode 100644 index 00000000000..2e9f130fe6c --- /dev/null +++ b/benchmark/CUDA/Project.toml @@ -0,0 +1,16 @@ +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +OrdinaryDiffEqLowStorageRK = "b0944070-b475-4768-8dec-fb6eb410534d" +TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb" + +[sources] +Trixi = {path = "../.."} + +[compat] +CUDA = "5.8.2" +JSON = "1.4.0" +OrdinaryDiffEqLowStorageRK = "1.12.0" +TimerOutputs = "0.5.25" +Trixi = "0.16" diff --git a/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl b/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl new file mode 100644 index 00000000000..b8b1084932b --- /dev/null +++ b/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl @@ -0,0 +1,76 @@ +using OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# semidiscretization of the compressible Euler equations + +equations = CompressibleEulerEquations3D(1.4) + +function initial_condition_taylor_green_vortex(x, t, + equations::CompressibleEulerEquations3D) + A = 1.0 # 
magnitude of speed + Ms = 0.1 # maximum Mach number + + rho = 1.0 + v1 = A * sin(x[1]) * cos(x[2]) * cos(x[3]) + v2 = -A * cos(x[1]) * sin(x[2]) * cos(x[3]) + v3 = 0.0 + p = (A / Ms)^2 * rho / equations.gamma # scaling to get Ms + p = p + + 1.0 / 16.0 * A^2 * rho * + (cos(2 * x[1]) * cos(2 * x[3]) + + 2 * cos(2 * x[2]) + 2 * cos(2 * x[1]) + cos(2 * x[2]) * cos(2 * x[3])) + + return prim2cons(SVector(rho, v1, v2, v3, p), equations) +end + +initial_condition = initial_condition_taylor_green_vortex + +volume_flux = flux_ranocha +surface_flux = flux_lax_friedrichs +volume_integral = VolumeIntegralFluxDifferencing(volume_flux) +solver = DGSEM(polydeg = 5, surface_flux = surface_flux, volume_integral = volume_integral) + +coordinates_min = (-1.0, -1.0, -1.0) .* pi +coordinates_max = (1.0, 1.0, 1.0) .* pi + +initial_refinement_level = 1 +trees_per_dimension = (4, 4, 4) + +mesh = P4estMesh(trees_per_dimension, polydeg = 1, + coordinates_min = coordinates_min, coordinates_max = coordinates_max, + periodicity = true, initial_refinement_level = initial_refinement_level) + +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver; + boundary_conditions = boundary_condition_periodic) + +############################################################################### +# ODE solvers, callbacks etc. 
+ +tspan = (0.0, 100.0) +ode = semidiscretize(semi, tspan; storage_type = nothing, real_type = nothing) + +summary_callback = SummaryCallback() + +stepsize_callback = StepsizeCallback(cfl = 0.1) + +callbacks = CallbackSet(summary_callback, + stepsize_callback) + +############################################################################### +# run the simulation + +maxiters = 200 +run_profiler = false + +# disable warnings when maxiters is reached +integrator = init(ode, CarpenterKennedy2N54(williamson_condition = false), + dt = 1.0, + save_everystep = false, callback = callbacks, + maxiters = maxiters, verbose = false) +if run_profiler + prof_result = CUDA.@profile solve!(integrator) +else + solve!(integrator) + prof_result = nothing +end diff --git a/benchmark/CUDA/run.jl b/benchmark/CUDA/run.jl new file mode 100644 index 00000000000..b9d02246c9b --- /dev/null +++ b/benchmark/CUDA/run.jl @@ -0,0 +1,91 @@ +using Trixi +using CUDA +using TimerOutputs +using JSON + +function main(elixir_path) + + # setup + maxiters = 50 + initial_refinement_level = 3 + storage_type = CuArray + real_type = Float64 + + println("Warming up...") + + # start simulation with tiny final time to trigger compilation + duration_compile = @elapsed begin + trixi_include(elixir_path, + tspan = (0.0, 1e-14), + storage_type = storage_type, + real_type = real_type) + trixi_include(elixir_path, + tspan = (0.0, 1e-14), + storage_type = storage_type, + real_type = Float32) + end + + println("Finished warm-up in $duration_compile seconds\n") + println("Starting simulation...") + + # start the real simulation + duration_elixir = @elapsed trixi_include(elixir_path, + maxiters = maxiters, + initial_refinement_level = initial_refinement_level, + storage_type = storage_type, + real_type = real_type) + + # store metrics (on every rank!) 
+ metrics = Dict{String, Float64}("elapsed time" => duration_elixir) + + # read TimerOutputs timings + timer = Trixi.timer() + metrics["total time"] = 1.0e-9 * TimerOutputs.tottime(timer) + metrics["rhs! time"] = 1.0e-9 * TimerOutputs.time(timer["rhs!"]) + + # compute performance index + latest_semi = @invokelatest (@__MODULE__).semi + nrhscalls = Trixi.ncalls(latest_semi.performance_counter) + walltime = 1.0e-9 * take!(latest_semi.performance_counter) + metrics["PID"] = walltime * Trixi.mpi_nranks() / + (Trixi.ndofsglobal(latest_semi) * nrhscalls) + + # write json file + open("metrics.out", "w") do f + indent = 2 + JSON.print(f, metrics, indent) + end + + # run profiler + maxiters = 5 + initial_refinement_level = 1 + + println("Running profiler (Float64)...") + trixi_include(elixir_path, + maxiters = maxiters, + initial_refinement_level = initial_refinement_level, + storage_type = storage_type, + real_type = Float64, + run_profiler = true) + + open("profile_float64.txt", "w") do io + show(io, @invokelatest (@__MODULE__).prof_result) + end + + println("Running profiler (Float32)...") + trixi_include(elixir_path, + maxiters = maxiters, + initial_refinement_level = initial_refinement_level, + storage_type = storage_type, + real_type = Float32, + run_profiler = true) + + open("profile_float32.txt", "w") do io + show(io, @invokelatest (@__MODULE__).prof_result) + end +end + +# hardcoded elixir +elixir_path = joinpath(@__DIR__(), "elixir_euler_taylor_green_vortex.jl") + +main(elixir_path) diff --git a/benchmark/Project.toml b/benchmark/Project.toml index e2b13aabaaa..4cf957da32d 100644 --- a/benchmark/Project.toml +++ b/benchmark/Project.toml @@ -8,4 +8,4 @@ Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb" BenchmarkTools = "0.5, 0.7, 1.0" OrdinaryDiffEq = "5.65, 6" PkgBenchmark = "0.2.10" -Trixi = "0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 0.14, 0.15" +Trixi = "0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16" diff --git 
a/docs/literate/src/files/adding_new_parabolic_terms.jl b/docs/literate/src/files/adding_new_parabolic_terms.jl index 9ec30998eb8..1b252c89785 100644 --- a/docs/literate/src/files/adding_new_parabolic_terms.jl +++ b/docs/literate/src/files/adding_new_parabolic_terms.jl @@ -35,14 +35,14 @@ function varnames(variable_mapping, equations_parabolic::ConstantAnisotropicDiff return varnames(variable_mapping, equations_parabolic.equations_hyperbolic) end -# Next, we define the viscous flux function. We assume that the mixed hyperbolic-parabolic system +# Next, we define the parabolic flux function. We assume that the mixed hyperbolic-parabolic system # is of the form # ```math # \partial_t u(t,x) + \partial_x (f_1(u) - g_1(u, \nabla u)) # + \partial_y (f_2(u) - g_2(u, \nabla u)) = 0 # ``` # where ``f_1(u)``, ``f_2(u)`` are the hyperbolic fluxes and ``g_1(u, \nabla u)``, ``g_2(u, \nabla u)`` denote -# the viscous fluxes. For anisotropic diffusion, the viscous fluxes are the first and second components +# the parabolic fluxes. For anisotropic diffusion, the parabolic fluxes are the first and second components # of the matrix-vector product involving `diffusivity` and the gradient vector. # # Here, we specialize the flux to our new parabolic equation type `ConstantAnisotropicDiffusion2D`. @@ -66,12 +66,12 @@ end # \begin{aligned} # \bm{q} &= \nabla u \\ # \bm{\sigma} &= \begin{pmatrix} g_1(u, \bm{q}) \\ g_2(u, \bm{q}) \end{pmatrix} \\ -# \text{viscous contribution } &= \nabla \cdot \bm{\sigma} +# \text{parabolic contribution} &= \nabla \cdot \bm{\sigma} # \end{aligned} # ``` # # Boundary data must be specified for all spatial derivatives, e.g., for both the gradient -# equation ``\bm{q} = \nabla u`` and the divergence of the viscous flux +# equation ``\bm{q} = \nabla u`` and the divergence of the parabolic flux # ``\nabla \cdot \bm{\sigma}``. 
We account for this by introducing internal `Gradient` # and `Divergence` types which are used to dispatch on each type of boundary condition. # @@ -98,7 +98,7 @@ end return boundary_condition.boundary_value end -# While the gradient acts on the solution `u`, the divergence acts on the viscous flux ``\bm{\sigma}``. +# While the gradient acts on the solution `u`, the divergence acts on the parabolic flux ``\bm{\sigma}``. # Thus, we have to supply boundary data for the `Divergence` operator that corresponds to ``\bm{\sigma}``. # However, we've already imposed boundary data on `u` for a Dirichlet boundary condition, and imposing # boundary data for ``\bm{\sigma}`` might overconstrain our problem. @@ -119,7 +119,7 @@ end # ### A note on the choice of gradient variables # # It is often simpler to transform the solution variables (and solution gradients) to another set of -# variables prior to computing the viscous fluxes (see [`CompressibleNavierStokesDiffusion2D`](@ref) +# variables prior to computing the parabolic fluxes (see [`CompressibleNavierStokesDiffusion2D`](@ref) # for an example of this). If this is done, then the boundary condition for the `Gradient` operator # should be modified accordingly as well. # @@ -182,7 +182,7 @@ plot(sol) # To be able to do so, we need to define [`max_diffusivity`](@ref) and # [`have_constant_diffusivity`](@ref) for the new parabolic terms. # In Trixi.jl, currently only the standard Laplace Diffusion and Compressible Navier-Stokes-Fourier -# viscous terms are implemented. +# parabolic terms are implemented. # Since these equations have **isotropic** diffusivity, i.e., direction-independent coefficients, # [`max_diffusivity`](@ref) is expected to return a scalar value. 
# @@ -204,11 +204,11 @@ end return lambda_max() end -# We supply now the advective(hyperbolic) and diffusive(parabolic) CFL numbers -cfl_advective = 2.0 # Not restrictive for this example -cfl_diffusive = 0.21 # Restricts the timestep -stepsize_callback = StepsizeCallback(cfl = cfl_advective, - cfl_diffusive = cfl_diffusive) +# We now supply the hyperbolic and parabolic CFL numbers +cfl_hyperbolic = 2.0 # Not restrictive for this example +cfl_parabolic = 0.21 # Restricts the timestep +stepsize_callback = StepsizeCallback(cfl = cfl_hyperbolic, + cfl_parabolic = cfl_parabolic) # Add the stepsize callback to the existing callbacks callbacks = CallbackSet(SummaryCallback(), stepsize_callback); diff --git a/docs/literate/src/files/parabolic_source_terms.jl b/docs/literate/src/files/parabolic_source_terms.jl index 458510ce6b0..f98f340da7f 100644 --- a/docs/literate/src/files/parabolic_source_terms.jl +++ b/docs/literate/src/files/parabolic_source_terms.jl @@ -62,7 +62,7 @@ end # to OrdinaryDiffEq.jl. # # Note that for this problem, since viscosity `nu` is relatively large, we utilize -# `ViscousFormulationLocalDG` instead of the default `ViscousFormulationBassiRebay1` +# `ParabolicFormulationLocalDG` instead of the default `ParabolicFormulationBassiRebay1` # parabolic solver, since the Bassi-Rebay 1 formulation is not accurate when the # diffusivity is large relative to the mesh size. @@ -76,7 +76,7 @@ boundary_conditions_parabolic = boundary_condition_periodic semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationLocalDG(), + solver_parabolic = ParabolicFormulationLocalDG(), source_terms = source_terms, source_terms_parabolic = source_terms_parabolic, boundary_conditions = (boundary_conditions, @@ -89,10 +89,10 @@ ode = semidiscretize(semi, tspan) # stable time-step is $O(h^2)$ due to the dominant parabolic term. 
We enforce this more stringent # parabolic CFL condition using a diffusion-aware `StepsizeCallback`. -cfl_advective = 0.5 -cfl_diffusive = 0.05 -stepsize_callback = StepsizeCallback(cfl = cfl_advective, - cfl_diffusive = cfl_diffusive) +cfl_hyperbolic = 0.5 +cfl_parabolic = 0.05 +stepsize_callback = StepsizeCallback(cfl = cfl_hyperbolic, + cfl_parabolic = cfl_parabolic) callbacks = CallbackSet(SummaryCallback(), stepsize_callback) sol = solve(ode, RDPK3SpFSAL35(); adaptive = false, dt = stepsize_callback(ode), ode_default_options()..., callback = callbacks) diff --git a/docs/src/heterogeneous.md b/docs/src/heterogeneous.md index 9d4dc50c181..70d40dd2f6d 100644 --- a/docs/src/heterogeneous.md +++ b/docs/src/heterogeneous.md @@ -120,9 +120,14 @@ function trixi_rhs_fct(mesh, equations, solver, cache, args) end ``` -1. Put the inner code in a new function `rhs_fct_per_element`. Besides the index - `element`, pass all required fields as arguments, but make sure to `@unpack` them from - their structs in advance. +1. Move the inner code into a new inlined function `rhs_fct_per_element`. + ```julia + @inline function rhs_fct_per_element(..., element, ...) + ... + end + ``` + Besides the index `element`, pass all required fields as arguments, but make sure to + `@unpack` them from their structs in advance. 2. Where `trixi_rhs_fct` is called, get the backend, i.e., the hardware we are currently running on via `trixi_backend(x)`. This will, e.g., work with `u_ode`. Internally, KernelAbstractions.jl's `get_backend` diff --git a/docs/src/styleguide.md b/docs/src/styleguide.md index 07f2d90cddc..5fa838e83dc 100644 --- a/docs/src/styleguide.md +++ b/docs/src/styleguide.md @@ -22,6 +22,10 @@ conventions, we apply and enforce automated source code formatting and its siblings, put the `cache` first. * Some internal functions take a "computational backend" argument, this should always be passed as the first argument. * Otherwise, use the order `mesh, equations, solver, cache`. 
+ * In course of GPU offloading we sometimes pass `MeshT = typeof(mesh)` instead of + `mesh` when the called method needs the type of the mesh for dispatch only. This part + of the code is in active development and not considered to be stable API at the + moment. * If something needs to be specified in more detail for dispatch, put the additional argument before the general one that is specified in more detail. For example, we use `have_nonconservative_terms(equations), equations` and `dg.mortar, dg`. diff --git a/docs/src/visualization.md b/docs/src/visualization.md index 92e2f91492d..1ffc147ae13 100644 --- a/docs/src/visualization.md +++ b/docs/src/visualization.md @@ -386,7 +386,7 @@ purposes. An example for how to create a `VisualizationCallback` can be found in # Enable in-situ visualization with a new plot generated every 20 time steps # and additional plotting options passed as keyword arguments -visualization = VisualizationCallback(interval=20; clims=(0,1)) +visualization = VisualizationCallback(semi; interval = 20, clims = (0, 1)) [...] 
``` diff --git a/examples/dgmulti_1d/elixir_advection_diffusion_gradient_source_terms.jl b/examples/dgmulti_1d/elixir_advection_diffusion_gradient_source_terms.jl index 29ee42de3f3..f2e7b8a542e 100644 --- a/examples/dgmulti_1d/elixir_advection_diffusion_gradient_source_terms.jl +++ b/examples/dgmulti_1d/elixir_advection_diffusion_gradient_source_terms.jl @@ -58,10 +58,10 @@ analysis_callback = AnalysisCallback(semi, interval = analysis_interval, uEltype alive_callback = AliveCallback(analysis_interval = 100) -cfl_advective = 0.5 # Not restrictive for this example -cfl_diffusive = 0.025 # Restricts the timestep -stepsize_callback = StepsizeCallback(cfl = cfl_advective, - cfl_diffusive = cfl_diffusive) +cfl_hyperbolic = 0.5 # Not restrictive for this example +cfl_parabolic = 0.025 # Restricts the timestep +stepsize_callback = StepsizeCallback(cfl = cfl_hyperbolic, + cfl_parabolic = cfl_parabolic) callbacks = CallbackSet(summary_callback, analysis_callback, alive_callback, stepsize_callback) diff --git a/examples/dgmulti_2d/elixir_advection_diffusion.jl b/examples/dgmulti_2d/elixir_advection_diffusion.jl index 8eed206f315..df9935f1d54 100644 --- a/examples/dgmulti_2d/elixir_advection_diffusion.jl +++ b/examples/dgmulti_2d/elixir_advection_diffusion.jl @@ -34,7 +34,7 @@ boundary_conditions = (; left = boundary_condition_left, top = boundary_condition_do_nothing, right = boundary_condition_do_nothing) -# define viscous boundary conditions +# define parabolic boundary conditions boundary_conditions_parabolic = (; left = boundary_condition_left, bottom = boundary_condition_zero, top = boundary_condition_zero, diff --git a/examples/dgmulti_2d/elixir_advection_diffusion_nonperiodic.jl b/examples/dgmulti_2d/elixir_advection_diffusion_nonperiodic.jl index 59ffbf181ea..4f339968256 100644 --- a/examples/dgmulti_2d/elixir_advection_diffusion_nonperiodic.jl +++ b/examples/dgmulti_2d/elixir_advection_diffusion_nonperiodic.jl @@ -53,7 +53,7 @@ boundary_conditions = (; left = 
boundary_condition, bottom = boundary_condition, right = boundary_condition_do_nothing) -# define viscous boundary conditions +# define parabolic boundary conditions boundary_conditions_parabolic = (; entire_boundary = boundary_condition) semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), diff --git a/examples/dgmulti_2d/elixir_euler_kelvin_helmholtz_instability_adaptive_vol_int.jl b/examples/dgmulti_2d/elixir_euler_kelvin_helmholtz_instability_adaptive_vol_int.jl new file mode 100644 index 00000000000..e3845ae78d5 --- /dev/null +++ b/examples/dgmulti_2d/elixir_euler_kelvin_helmholtz_instability_adaptive_vol_int.jl @@ -0,0 +1,86 @@ +using OrdinaryDiffEqLowStorageRK +using Trixi + +volume_integral_weakform = VolumeIntegralWeakForm() +volume_integral_fluxdiff = VolumeIntegralFluxDifferencing(flux_ranocha) + +# This indicator compares the entropy production of the weak form to the +# true entropy evolution in that cell. +# If the weak form does not increase entropy beyond `maximum_entropy_increase`, +# we keep the weak form result. Otherwise, we switch to the stabilized/EC volume integral. +indicator = IndicatorEntropyChange(maximum_entropy_increase = 5e-3) + +# Adaptive volume integral using the entropy production comparison indicator to perform the +# stabilized/EC volume integral when needed. +volume_integral = VolumeIntegralAdaptive(volume_integral_default = volume_integral_weakform, + volume_integral_stabilized = volume_integral_fluxdiff, + indicator = indicator) + +dg = DGMulti(polydeg = 3, + # `Tri()` and `Polynomial()` make flux differencing really(!) 
expensive + element_type = Tri(), approximation_type = Polynomial(), + surface_integral = SurfaceIntegralWeakForm(flux_hllc), + volume_integral = volume_integral) + +equations = CompressibleEulerEquations2D(1.4) + +""" + initial_condition_kelvin_helmholtz_instability(x, t, equations::CompressibleEulerEquations2D) + +A version of the classical Kelvin-Helmholtz instability based on +- Andrés M. Rueda-Ramírez, Gregor J. Gassner (2021) + A Subcell Finite Volume Positivity-Preserving Limiter for DGSEM Discretizations + of the Euler Equations + [arXiv: 2102.06017](https://arxiv.org/abs/2102.06017) +""" +function initial_condition_kelvin_helmholtz_instability(x, t, + equations::CompressibleEulerEquations2D) + # change discontinuity to tanh + # typical resolution 128^2, 256^2 + # domain size is [-1,+1]^2 + slope = 15 + amplitude = 0.02 + B = tanh(slope * x[2] + 7.5) - tanh(slope * x[2] - 7.5) + rho = 0.5 + 0.75 * B + v1 = 0.5 * (B - 1) + v2 = 0.1 * sin(2 * pi * x[1]) + p = 1.0 + return prim2cons(SVector(rho, v1, v2, p), equations) +end +initial_condition = initial_condition_kelvin_helmholtz_instability + +cells_per_dimension = (32, 32) +mesh = DGMultiMesh(dg, cells_per_dimension; periodicity = true) + +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, dg; + boundary_conditions = boundary_condition_periodic) + +tspan = (0.0, 4.6) # stable time for limited entropy-increase adaptive volume integral + +ode = semidiscretize(semi, tspan) + +summary_callback = SummaryCallback() +alive_callback = AliveCallback(alive_interval = 50) + +stepsize_callback = StepsizeCallback(cfl = 1.0) + +analysis_interval = 10 +analysis_callback = AnalysisCallback(semi, interval = analysis_interval, uEltype = real(dg), + save_analysis = true, + analysis_errors = Symbol[], + extra_analysis_integrals = (entropy,)) + +save_solution = SaveSolutionCallback(interval = 1000, + solution_variables = cons2prim) + +callbacks = CallbackSet(summary_callback, + alive_callback, + 
stepsize_callback, + analysis_callback, + save_solution) + +############################################################################### +# run the simulation + +sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); + dt = 1.0, ode_default_options()..., callback = callbacks); diff --git a/examples/dgmulti_2d/elixir_navierstokes_convergence.jl b/examples/dgmulti_2d/elixir_navierstokes_convergence.jl index 63e4285d6b9..c7f0c5ca498 100644 --- a/examples/dgmulti_2d/elixir_navierstokes_convergence.jl +++ b/examples/dgmulti_2d/elixir_navierstokes_convergence.jl @@ -198,7 +198,7 @@ boundary_condition_top_bottom = BoundaryConditionNavierStokesWall(velocity_bc_to # define inviscid boundary conditions boundary_conditions = (; top_bottom = boundary_condition_slip_wall) -# define viscous boundary conditions +# define parabolic boundary conditions boundary_conditions_parabolic = (; top_bottom = boundary_condition_top_bottom) semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), diff --git a/examples/dgmulti_2d/elixir_navierstokes_convergence_curved.jl b/examples/dgmulti_2d/elixir_navierstokes_convergence_curved.jl index a4ca0c60791..b0b7601a9a9 100644 --- a/examples/dgmulti_2d/elixir_navierstokes_convergence_curved.jl +++ b/examples/dgmulti_2d/elixir_navierstokes_convergence_curved.jl @@ -206,7 +206,7 @@ boundary_condition_top_bottom = BoundaryConditionNavierStokesWall(velocity_bc_to # define inviscid boundary conditions boundary_conditions = (; top_bottom = boundary_condition_slip_wall) -# define viscous boundary conditions +# define parabolic boundary conditions boundary_conditions_parabolic = (; top_bottom = boundary_condition_top_bottom) semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), diff --git a/examples/dgmulti_2d/elixir_navierstokes_lid_driven_cavity.jl b/examples/dgmulti_2d/elixir_navierstokes_lid_driven_cavity.jl index 38c8234b839..167bc54e57c 100644 --- 
a/examples/dgmulti_2d/elixir_navierstokes_lid_driven_cavity.jl +++ b/examples/dgmulti_2d/elixir_navierstokes_lid_driven_cavity.jl @@ -51,7 +51,7 @@ boundary_condition_cavity = BoundaryConditionNavierStokesWall(velocity_bc_cavity boundary_conditions = (; top = boundary_condition_slip_wall, rest_of_boundary = boundary_condition_slip_wall) -# define viscous boundary conditions +# define parabolic boundary conditions boundary_conditions_parabolic = (; top = boundary_condition_lid, rest_of_boundary = boundary_condition_cavity) diff --git a/examples/dgmulti_3d/elixir_navierstokes_convergence.jl b/examples/dgmulti_3d/elixir_navierstokes_convergence.jl index 9adb48efa0a..3c174196457 100644 --- a/examples/dgmulti_3d/elixir_navierstokes_convergence.jl +++ b/examples/dgmulti_3d/elixir_navierstokes_convergence.jl @@ -241,7 +241,7 @@ boundary_condition_top_bottom = BoundaryConditionNavierStokesWall(velocity_bc_to # define inviscid boundary conditions boundary_conditions = (; top_bottom = boundary_condition_slip_wall) -# define viscous boundary conditions +# define parabolic boundary conditions boundary_conditions_parabolic = (; top_bottom = boundary_condition_top_bottom) semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), diff --git a/examples/dgmulti_3d/elixir_navierstokes_convergence_curved.jl b/examples/dgmulti_3d/elixir_navierstokes_convergence_curved.jl index 96469770bd1..bdc997f8953 100644 --- a/examples/dgmulti_3d/elixir_navierstokes_convergence_curved.jl +++ b/examples/dgmulti_3d/elixir_navierstokes_convergence_curved.jl @@ -249,7 +249,7 @@ boundary_condition_top_bottom = BoundaryConditionNavierStokesWall(velocity_bc_to # define inviscid boundary conditions boundary_conditions = (; top_bottom = boundary_condition_slip_wall) -# define viscous boundary conditions +# define parabolic boundary conditions boundary_conditions_parabolic = (; top_bottom = boundary_condition_top_bottom) semi = SemidiscretizationHyperbolicParabolic(mesh, 
(equations, equations_parabolic), diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl index 4553d4823ed..db474e2b624 100644 --- a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl +++ b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl @@ -33,6 +33,7 @@ semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergen # ODE solvers, callbacks etc. # Create ODE problem with time span from 0.0 to 1.0 +# Change `storage_type` to, e.g., `CuArray` to actually run on GPU ode = semidiscretize(semi, (0.0, 1.0); real_type = nothing, storage_type = nothing) # At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup @@ -50,9 +51,8 @@ save_solution = SaveSolutionCallback(interval = 100, stepsize_callback = StepsizeCallback(cfl = 1.6) # Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver -callbacks = CallbackSet(summary_callback, stepsize_callback) -# TODO: GPU. The `analysis_callback` needs to be updated for GPU support -# analysis_callback, save_solution, stepsize_callback) +callbacks = CallbackSet(summary_callback, analysis_callback, + save_solution, stepsize_callback) ############################################################################### # run the simulation diff --git a/examples/p4est_2d_dgsem/elixir_advection_coupled.jl b/examples/p4est_2d_dgsem/elixir_advection_coupled.jl new file mode 100644 index 00000000000..c7ad29b6294 --- /dev/null +++ b/examples/p4est_2d_dgsem/elixir_advection_coupled.jl @@ -0,0 +1,88 @@ +using OrdinaryDiffEqSSPRK, OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# Simplest coupled setup consisting of two non-trivial mesh views. 
+ +advection_velocity = (0.2, -0.7) +equations = LinearScalarAdvectionEquation2D(advection_velocity) + +# Create DG solver with polynomial degree = 3 and (local) Lax-Friedrichs/Rusanov flux as surface flux +solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) + +# Define the physical domain for the parent mesh. +coordinates_min = (-1.0, -1.0) # minimum coordinates (min(x), min(y)) +coordinates_max = (1.0, 1.0) # maximum coordinates (max(x), max(y)) + +trees_per_dimension = (8, 8) + +# Create parent P4estMesh with 8 x 8 trees and 8 x 8 elements +# Since we couple through the boundaries, the periodicity does not matter here, +# but it is to trigger parts of the code for the test. +parent_mesh = P4estMesh(trees_per_dimension, polydeg = 3, + coordinates_min = coordinates_min, + coordinates_max = coordinates_max, + initial_refinement_level = 0, + periodicity = false) + +# Define the mesh views consisting of a small square in the center +# and a square ring around it. +cell_ids1 = vcat((1:18), (23:26), (31:34), (39:42), (47:64)) +mesh1 = P4estMeshView(parent_mesh, cell_ids1) +cell_ids2 = vcat((19:22), (27:30), (35:38), (43:46)) +mesh2 = P4estMeshView(parent_mesh, cell_ids2) + +# Define a trivial coupling function. +coupling_function = (x, u, equations_other, equations_own) -> u + +# The mesh is coupled across the physical boundaries, which makes this setup +# effectively double periodic. 
+boundary_conditions = (; x_neg = BoundaryConditionCoupledP4est(coupling_function), + y_neg = BoundaryConditionCoupledP4est(coupling_function), + y_pos = BoundaryConditionCoupledP4est(coupling_function), + x_pos = BoundaryConditionCoupledP4est(coupling_function)) + +semi1 = SemidiscretizationHyperbolic(mesh1, equations, initial_condition_convergence_test, + solver, + boundary_conditions = boundary_conditions) +semi2 = SemidiscretizationHyperbolic(mesh2, equations, initial_condition_convergence_test, + solver, + boundary_conditions = boundary_conditions) + +# Create a semidiscretization that bundles semi1 and semi2 +semi = SemidiscretizationCoupledP4est(semi1, semi2) + +############################################################################### +# ODE solvers, callbacks etc. + +# Create ODE problem with time span from 0.0 to 2.0 +ode = semidiscretize(semi, (0.0, 2.0)) + +# At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup +# and resets the timers +summary_callback = SummaryCallback() + +# The AnalysisCallback allows to analyse the solution in regular intervals and prints the results +# We require this definition for the test, even though we don't use it in the CallbackSet. 
+analysis_callback1 = AnalysisCallback(semi1, interval = 100) +analysis_callback2 = AnalysisCallback(semi2, interval = 100) +analysis_callback = AnalysisCallbackCoupledP4est(semi, analysis_callback1, + analysis_callback2) + +# The SaveSolutionCallback allows to save the solution to a file in regular intervals +save_solution = SaveSolutionCallback(interval = 100, + solution_variables = cons2prim) + +# The StepsizeCallback handles the re-calculation of the maximum Δt after each time step +stepsize_callback = StepsizeCallback(cfl = 1.6) + +# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver +callbacks = CallbackSet(summary_callback, save_solution, stepsize_callback) + +############################################################################### +# run the simulation + +# OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks +sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); + dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback + ode_default_options()..., callback = callbacks); diff --git a/examples/p4est_2d_dgsem/elixir_advection_diffusion_nonperiodic_amr.jl b/examples/p4est_2d_dgsem/elixir_advection_diffusion_nonperiodic_amr.jl index 5490425f558..eec4ea0d79b 100644 --- a/examples/p4est_2d_dgsem/elixir_advection_diffusion_nonperiodic_amr.jl +++ b/examples/p4est_2d_dgsem/elixir_advection_diffusion_nonperiodic_amr.jl @@ -50,7 +50,7 @@ boundary_conditions_parabolic = BoundaryConditionDirichlet(initial_condition) semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationBassiRebay1(), + solver_parabolic = ParabolicFormulationBassiRebay1(), boundary_conditions = (boundary_conditions, boundary_conditions_parabolic)) diff --git a/examples/p4est_2d_dgsem/elixir_advection_diffusion_rotated.jl 
b/examples/p4est_2d_dgsem/elixir_advection_diffusion_rotated.jl new file mode 100644 index 00000000000..110e87937e6 --- /dev/null +++ b/examples/p4est_2d_dgsem/elixir_advection_diffusion_rotated.jl @@ -0,0 +1,60 @@ +using OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# semidiscretization of the linear advection-diffusion equation + +diffusivity() = 1.0e-2 +advection_velocity = (-1.0, 1.0) +equations = LinearScalarAdvectionEquation2D(advection_velocity) +equations_parabolic = LaplaceDiffusion2D(diffusivity(), equations) + +function initial_condition_gauss_damped(x, t, equations) + damping_factor = 1 + 4 * diffusivity() * t + return SVector(exp(-(x[1]^2 + x[2]^2) / damping_factor) / damping_factor) +end +initial_condition = initial_condition_gauss_damped + +solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) + +# This maps the domain [-1, 1]^2 to a 45-degree rotated increased square +square_size() = 5.0 +function mapping(xi, eta) + x = square_size() * xi + y = square_size() * eta + return SVector((x - y) / sqrt(2), (x + y) / sqrt(2)) +end + +trees_per_dimension = (23, 23) +mesh = P4estMesh(trees_per_dimension, + polydeg = 3, initial_refinement_level = 0, + mapping = mapping, periodicity = true) + +semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), + initial_condition, solver; + solver_parabolic = ParabolicFormulationLocalDG(), + boundary_conditions = (boundary_condition_periodic, + boundary_condition_periodic)) + +############################################################################### +# ODE solvers, callbacks etc. 
+ +n_passes = 2 +tspan = (0.0, n_passes * square_size() * sqrt(2)) +ode = semidiscretize(semi, tspan) + +summary_callback = SummaryCallback() + +analysis_interval = 100 +analysis_callback = AnalysisCallback(semi, interval = analysis_interval) + +alive_callback = AliveCallback(analysis_interval = analysis_interval) + +callbacks = CallbackSet(summary_callback, analysis_callback, alive_callback) + +############################################################################### +# run the simulation + +time_int_tol = 1.0e-6 +sol = solve(ode, RDPK3SpFSAL49(); abstol = time_int_tol, reltol = time_int_tol, + ode_default_options()..., callback = callbacks) diff --git a/examples/p4est_2d_dgsem/elixir_euler_supersonic_cylinder_scO2.jl b/examples/p4est_2d_dgsem/elixir_euler_supersonic_cylinder_scO2.jl index 21a90529a0d..0e9b58e150d 100644 --- a/examples/p4est_2d_dgsem/elixir_euler_supersonic_cylinder_scO2.jl +++ b/examples/p4est_2d_dgsem/elixir_euler_supersonic_cylinder_scO2.jl @@ -113,7 +113,7 @@ amr_controller = ControllerThreeLevel(semi, amr_indicator, max_level = 5, max_threshold = 0.1) amr_callback = AMRCallback(semi, amr_controller, - interval = 2, + interval = 3, adapt_initial_condition = true, adapt_initial_condition_only_refine = true) @@ -130,8 +130,9 @@ stage_limiter! = PositivityPreservingLimiterZhangShu(thresholds = (5.0e-7, 1.0e- ############################################################################### # run the simulation -# We supply a small initial timestep to be able to use a larger AMR interval (2 instead of 1) throughout the simulation. +# We supply a small initial timestep to be able to use a larger AMR interval (3 instead of 1) throughout the simulation. # This pays off almost immediately as only the first couple timesteps use this timestep before it is ramped up. dt0 = 1e-8 sol = solve(ode, SSPRK43(stage_limiter! 
= stage_limiter!, thread = Trixi.True()); - dt = dt0, ode_default_options()..., callback = callbacks); + adaptive = true, dt = dt0, + ode_default_options()..., callback = callbacks); diff --git a/examples/p4est_2d_dgsem/elixir_navierstokes_convergence.jl b/examples/p4est_2d_dgsem/elixir_navierstokes_convergence.jl index 11ae50eec90..5341299e572 100644 --- a/examples/p4est_2d_dgsem/elixir_navierstokes_convergence.jl +++ b/examples/p4est_2d_dgsem/elixir_navierstokes_convergence.jl @@ -199,7 +199,7 @@ boundary_condition_top_bottom = BoundaryConditionNavierStokesWall(velocity_bc_to boundary_conditions = (; y_neg = boundary_condition_slip_wall, y_pos = boundary_condition_slip_wall) -# define viscous boundary conditions +# define parabolic boundary conditions boundary_conditions_parabolic = (; y_neg = boundary_condition_top_bottom, y_pos = boundary_condition_top_bottom) diff --git a/examples/p4est_2d_dgsem/elixir_navierstokes_convergence_nonperiodic.jl b/examples/p4est_2d_dgsem/elixir_navierstokes_convergence_nonperiodic.jl index a5025116087..4e11d052d96 100644 --- a/examples/p4est_2d_dgsem/elixir_navierstokes_convergence_nonperiodic.jl +++ b/examples/p4est_2d_dgsem/elixir_navierstokes_convergence_nonperiodic.jl @@ -203,7 +203,7 @@ boundary_conditions = (; x_neg = boundary_condition_left_right, y_neg = boundary_condition_slip_wall, y_pos = boundary_condition_slip_wall) -# define viscous boundary conditions +# define parabolic boundary conditions boundary_conditions_parabolic = (; x_neg = boundary_condition_left_right, x_pos = boundary_condition_left_right, y_neg = boundary_condition_top_bottom, diff --git a/examples/p4est_2d_dgsem/elixir_navierstokes_freestream_ldg.jl b/examples/p4est_2d_dgsem/elixir_navierstokes_freestream_ldg.jl index a5280d1a3e8..3efd44f1afb 100644 --- a/examples/p4est_2d_dgsem/elixir_navierstokes_freestream_ldg.jl +++ b/examples/p4est_2d_dgsem/elixir_navierstokes_freestream_ldg.jl @@ -19,7 +19,7 @@ initial_condition = initial_condition_const polydeg 
= 3 solver = DGSEM(polydeg = polydeg, surface_flux = flux_lax_friedrichs, volume_integral = VolumeIntegralFluxDifferencing(flux_ranocha)) -solver_parabolic = ViscousFormulationLocalDG() +solver_parabolic = ParabolicFormulationLocalDG() mu() = 0.5 prandtl_number() = 0.72 diff --git a/examples/p4est_2d_dgsem/elixir_navierstokes_shearlayer_nonconforming.jl b/examples/p4est_2d_dgsem/elixir_navierstokes_shearlayer_nonconforming.jl index c4595b85180..70fe918a476 100644 --- a/examples/p4est_2d_dgsem/elixir_navierstokes_shearlayer_nonconforming.jl +++ b/examples/p4est_2d_dgsem/elixir_navierstokes_shearlayer_nonconforming.jl @@ -68,7 +68,7 @@ Trixi.refine_p4est!(mesh.p4est, true, refine_fn_c, C_NULL) semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationBassiRebay1(), + solver_parabolic = ParabolicFormulationBassiRebay1(), boundary_conditions = (boundary_condition_periodic, boundary_condition_periodic)) diff --git a/examples/p4est_2d_dgsem/elixir_navierstokes_vortex_street.jl b/examples/p4est_2d_dgsem/elixir_navierstokes_vortex_street.jl index c206202af03..932926770e9 100644 --- a/examples/p4est_2d_dgsem/elixir_navierstokes_vortex_street.jl +++ b/examples/p4est_2d_dgsem/elixir_navierstokes_vortex_street.jl @@ -119,8 +119,8 @@ function Trixi.get_node_variable(::Val{:vorticity}, u, mesh, equations, dg, cach n_nodes, n_nodes, # equivalent: `ntuple(_ -> n_nodes, ndims(mesh))...,` n_elements) - @unpack viscous_container = cache_parabolic - @unpack gradients = viscous_container + @unpack parabolic_container = cache_parabolic + @unpack gradients = parabolic_container gradients_x, gradients_y = gradients # We can accelerate the computation by thread-parallelizing the loop over elements diff --git a/examples/p4est_2d_dgsem/elixir_advection_meshview.jl b/examples/p4est_3d_dgsem/elixir_advection_basic_gpu.jl similarity index 58% rename from 
examples/p4est_2d_dgsem/elixir_advection_meshview.jl rename to examples/p4est_3d_dgsem/elixir_advection_basic_gpu.jl index 0ad6e6e18e8..e0f8d735e21 100644 --- a/examples/p4est_2d_dgsem/elixir_advection_meshview.jl +++ b/examples/p4est_3d_dgsem/elixir_advection_basic_gpu.jl @@ -1,30 +1,27 @@ +# The same setup as tree_3d_dgsem/elixir_advection_basic.jl +# to verify GPU support and Adapt.jl support. + using OrdinaryDiffEqLowStorageRK using Trixi ############################################################################### -# Most basic p4est mesh view setup where the entire domain -# is part of the single mesh view. +# semidiscretization of the linear advection equation -advection_velocity = (0.2, -0.7) -equations = LinearScalarAdvectionEquation2D(advection_velocity) +advection_velocity = (0.2, -0.7, 0.5) +equations = LinearScalarAdvectionEquation3D(advection_velocity) # Create DG solver with polynomial degree = 3 and (local) Lax-Friedrichs/Rusanov flux as surface flux solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) -coordinates_min = (-1.0, -1.0) # minimum coordinates (min(x), min(y)) -coordinates_max = (1.0, 1.0) # maximum coordinates (max(x), max(y)) - -trees_per_dimension = (8, 8) - -# Create parent P4estMesh with 8 x 8 trees and 8 x 8 elements -parent_mesh = P4estMesh(trees_per_dimension, polydeg = 3, - coordinates_min = coordinates_min, - coordinates_max = coordinates_max, - periodicity = true) +coordinates_min = (-1.0, -1.0, -1.0) # minimum coordinates (min(x), min(y), min(z)) +coordinates_max = (1.0, 1.0, 1.0) # maximum coordinates (max(x), max(y), max(z)) -# Define the mesh view covering the whole parent mesh. 
-cell_ids = collect(1:Trixi.ncells(parent_mesh)) +mesh = P4estMeshView(parent_mesh, cell_ids) +# Create P4estMesh with 8 x 8 x 8 elements (note `initial_refinement_level = 1`) +trees_per_dimension = (4, 4, 4) +mesh = P4estMesh(trees_per_dimension, polydeg = 3, + coordinates_min = coordinates_min, coordinates_max = coordinates_max, + initial_refinement_level = 1, + periodicity = true) # A semidiscretization collects data structures and functions for the spatial discretization semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergence_test, @@ -35,31 +32,32 @@ semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergen # ODE solvers, callbacks etc. # Create ODE problem with time span from 0.0 to 1.0 -ode = semidiscretize(semi, (0.0, 1.0)) +# Change `storage_type` to, e.g., `CuArray` to actually run on GPU +tspan = (0.0, 1.0) +ode = semidiscretize(semi, tspan; real_type = nothing, storage_type = nothing) # At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup # and resets the timers summary_callback = SummaryCallback() # The AnalysisCallback allows to analyse the solution in regular intervals and prints the results -# We require this definition for the test, even though we don't use it in the CallbackSet. 
-analysis_callback = AnalysisCallback(semi) +analysis_callback = AnalysisCallback(semi, interval = 100) # The SaveSolutionCallback allows to save the solution to a file in regular intervals save_solution = SaveSolutionCallback(interval = 100, solution_variables = cons2prim) # The StepsizeCallback handles the re-calculation of the maximum Δt after each time step -stepsize_callback = StepsizeCallback(cfl = 1.6) +stepsize_callback = StepsizeCallback(cfl = 1.2) # Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver -callbacks = CallbackSet(summary_callback, save_solution, - stepsize_callback) +callbacks = CallbackSet(summary_callback, analysis_callback, + save_solution, stepsize_callback) ############################################################################### # run the simulation # OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); - dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback + dt = 0.05, # solve needs some value here but it will be overwritten by the stepsize_callback ode_default_options()..., callback = callbacks); diff --git a/examples/p4est_3d_dgsem/elixir_advection_diffusion_amr_curved.jl b/examples/p4est_3d_dgsem/elixir_advection_diffusion_amr_curved.jl index 4e39dac1e69..aacca586c23 100644 --- a/examples/p4est_3d_dgsem/elixir_advection_diffusion_amr_curved.jl +++ b/examples/p4est_3d_dgsem/elixir_advection_diffusion_amr_curved.jl @@ -64,7 +64,7 @@ mesh = P4estMesh{3}(mesh_file, polydeg = 2, semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationBassiRebay1(), + solver_parabolic = ParabolicFormulationBassiRebay1(), boundary_conditions = (boundary_conditions, boundary_conditions)) diff --git a/examples/p4est_3d_dgsem/elixir_advection_diffusion_nonconforming.jl 
b/examples/p4est_3d_dgsem/elixir_advection_diffusion_nonconforming.jl index 4498efe0936..68e6c6ffb6e 100644 --- a/examples/p4est_3d_dgsem/elixir_advection_diffusion_nonconforming.jl +++ b/examples/p4est_3d_dgsem/elixir_advection_diffusion_nonconforming.jl @@ -70,7 +70,7 @@ boundary_conditions_parabolic = BoundaryConditionDirichlet(initial_condition) semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationBassiRebay1(), + solver_parabolic = ParabolicFormulationBassiRebay1(), boundary_conditions = (boundary_conditions, boundary_conditions_parabolic)) diff --git a/examples/p4est_3d_dgsem/elixir_advection_diffusion_nonperiodic.jl b/examples/p4est_3d_dgsem/elixir_advection_diffusion_nonperiodic.jl index d098dc0579a..a440f570767 100644 --- a/examples/p4est_3d_dgsem/elixir_advection_diffusion_nonperiodic.jl +++ b/examples/p4est_3d_dgsem/elixir_advection_diffusion_nonperiodic.jl @@ -58,7 +58,7 @@ boundary_conditions = BoundaryConditionDirichlet(initial_condition) semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationBassiRebay1(), + solver_parabolic = ParabolicFormulationBassiRebay1(), boundary_conditions = (boundary_conditions, boundary_conditions)) @@ -76,7 +76,7 @@ analysis_callback = AnalysisCallback(semi, interval = analysis_interval) alive_callback = AliveCallback(analysis_interval = analysis_interval) stepsize_callback = StepsizeCallback(cfl = 1.6, - cfl_diffusive = 0.25) + cfl_parabolic = 0.25) callbacks = CallbackSet(summary_callback, analysis_callback, alive_callback, stepsize_callback) diff --git a/examples/p4est_3d_dgsem/elixir_euler_sedov_sc_subcell.jl b/examples/p4est_3d_dgsem/elixir_euler_sedov_sc_subcell.jl index 395e9a0e711..9efd49d3ad8 100644 --- a/examples/p4est_3d_dgsem/elixir_euler_sedov_sc_subcell.jl +++ b/examples/p4est_3d_dgsem/elixir_euler_sedov_sc_subcell.jl 
@@ -45,6 +45,7 @@ basis = LobattoLegendreBasis(polydeg) limiter_idp = SubcellLimiterIDP(equations, basis; positivity_variables_cons = ["rho"], positivity_variables_nonlinear = [pressure], + local_twosided_variables_cons = [], local_onesided_variables_nonlinear = [], max_iterations_newton = 25, bar_states = false) diff --git a/examples/p4est_3d_dgsem/elixir_euler_tandem_spheres.jl b/examples/p4est_3d_dgsem/elixir_euler_tandem_spheres.jl index 73c11c410a9..60f800e7f62 100644 --- a/examples/p4est_3d_dgsem/elixir_euler_tandem_spheres.jl +++ b/examples/p4est_3d_dgsem/elixir_euler_tandem_spheres.jl @@ -58,8 +58,10 @@ solver = DGSEM(polydeg = polydeg, surface_flux = surface_flux, # # in the .msh file. -mesh_file = Trixi.download("https://rwth-aachen.sciebo.de/s/pioS9PmdSWnLc8D/download/TandemSpheresHexMesh1P2_fixed.inp", - joinpath(@__DIR__, "TandemSpheresHexMesh1P2_fixed.inp")) +mesh_file = joinpath(@__DIR__, "TandemSpheresHexMesh1P2_fixed.inp") +using Downloads +Downloads.download("https://zenodo.org/records/18921889/files/TandemSpheresHexMesh1P2_fixed.inp?download=1", + mesh_file) # Boundary symbols follow from nodesets in the mesh file boundary_symbols = [:FrontSphere, :BackSphere, :FarField] diff --git a/examples/p4est_3d_dgsem/elixir_navierstokes_blast_wave_amr.jl b/examples/p4est_3d_dgsem/elixir_navierstokes_blast_wave_amr.jl index 497c0a741e5..2c3b563df00 100644 --- a/examples/p4est_3d_dgsem/elixir_navierstokes_blast_wave_amr.jl +++ b/examples/p4est_3d_dgsem/elixir_navierstokes_blast_wave_amr.jl @@ -67,7 +67,7 @@ mesh = P4estMesh(trees_per_dimension, polydeg = 3, semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationBassiRebay1(), + solver_parabolic = ParabolicFormulationBassiRebay1(), boundary_conditions = (boundary_condition_periodic, boundary_condition_periodic)) diff --git a/examples/p4est_3d_dgsem/elixir_navierstokes_convergence.jl 
b/examples/p4est_3d_dgsem/elixir_navierstokes_convergence.jl index 27ccde81b66..f2e7eb08028 100644 --- a/examples/p4est_3d_dgsem/elixir_navierstokes_convergence.jl +++ b/examples/p4est_3d_dgsem/elixir_navierstokes_convergence.jl @@ -243,7 +243,7 @@ boundary_condition_top_bottom = BoundaryConditionNavierStokesWall(velocity_bc_to boundary_conditions = (; y_neg = boundary_condition_slip_wall, y_pos = boundary_condition_slip_wall) -# define viscous boundary conditions +# define parabolic boundary conditions boundary_conditions_parabolic = (; y_neg = boundary_condition_top_bottom, y_pos = boundary_condition_top_bottom) diff --git a/examples/p4est_3d_dgsem/elixir_navierstokes_freestream_boundaries.jl b/examples/p4est_3d_dgsem/elixir_navierstokes_freestream_boundaries.jl index 51d19f5302e..4ca0cb88c41 100644 --- a/examples/p4est_3d_dgsem/elixir_navierstokes_freestream_boundaries.jl +++ b/examples/p4est_3d_dgsem/elixir_navierstokes_freestream_boundaries.jl @@ -19,7 +19,7 @@ initial_condition = initial_condition_const polydeg = 3 solver = DGSEM(polydeg = polydeg, surface_flux = flux_lax_friedrichs) -solver_parabolic = ViscousFormulationBassiRebay1() +solver_parabolic = ParabolicFormulationBassiRebay1() mu() = 0.5 prandtl_number() = 0.72 diff --git a/examples/structured_2d_dgsem/elixir_mhdmultiion_convergence_twospecies.jl b/examples/structured_2d_dgsem/elixir_mhdmultiion_convergence_twospecies.jl new file mode 100644 index 00000000000..e9afdff30c6 --- /dev/null +++ b/examples/structured_2d_dgsem/elixir_mhdmultiion_convergence_twospecies.jl @@ -0,0 +1,179 @@ +using OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +""" + electron_pressure_alpha(u, equations::IdealGlmMhdMultiIonEquations2D) +Returns a fraction (alpha) of the total ion pressure for the electron pressure. 
+""" +function electron_pressure_alpha(u, equations::IdealGlmMhdMultiIonEquations2D) + alpha = 0.2 + prim = cons2prim(u, equations) + p_e = zero(u[1]) + for k in eachcomponent(equations) + _, _, _, _, p_k = Trixi.get_component(k, prim, equations) + p_e += p_k + end + return alpha * p_e +end +# semidiscretization of the ideal multi-ion MHD equations +equations = IdealGlmMhdMultiIonEquations2D(gammas = (2.0, 4.0), + charge_to_mass = (2.0, 1.0), + electron_pressure = electron_pressure_alpha) + +""" +Initial (and exact) solution for the manufactured solution test. Runs with +* gammas = (2.0, 4.0), +* charge_to_mass = (2.0, 1.0) +* Domain size: [-1,1]² +""" +function initial_condition_manufactured_solution(x, t, + equations::IdealGlmMhdMultiIonEquations2D) + am = 0.1 + om = π + h = am * sin(om * (x[1] + x[2] - t)) + 2 + hh1 = am * 0.4 * sin(om * (x[1] + x[2] - t)) + 1 + hh2 = h - hh1 + + u1 = hh1 + u2 = hh1 + u3 = hh1 + u4 = 0.1 * hh1 + u5 = 2 * hh1^2 + hh1 + u6 = hh2 + u7 = hh2 + u8 = hh2 + u9 = 0.1 * hh2 + u10 = 2 * hh2^2 + hh2 + u11 = 0.25 * h + u12 = -0.25 * h + u13 = 0.1 * h + + return SVector{nvariables(equations), real(equations)}(u11, u12, u13, + u1, u2, u3, u4, u5, + u6, u7, u8, u9, u10, + 0) +end + +""" +Source term that corresponds to the manufactured solution test. 
Runs with +* gammas = (2.0, 4.0), +* charge_to_mass = (2.0, 1.0) +* Domain size: [-1,1]² +""" +function source_terms_manufactured_solution_pe(u, x, t, + equations::IdealGlmMhdMultiIonEquations2D) + am = 0.1 + om = pi + h1 = am * sin(om * (x[1] + x[2] - t)) + hx = am * om * cos(om * (x[1] + x[2] - t)) + + s1 = (2 * hx) / 5 + s2 = (38055 * hx * h1^2 + 185541 * hx * h1 + 220190 * hx) / (35000 * h1 + 75000) + s3 = (38055 * hx * h1^2 + 185541 * hx * h1 + 220190 * hx) / (35000 * h1 + 75000) + s4 = hx / 25 + s5 = (1835811702576186755 * hx * h1^2 + 8592627463681183181 * hx * h1 + + 9884050459977240490 * hx) / (652252660543767500 * h1 + 1397684272593787500) + s6 = (3 * hx) / 5 + s7 = (76155 * hx * h1^2 + 295306 * hx * h1 + 284435 * hx) / (17500 * h1 + 37500) + s8 = (76155 * hx * h1^2 + 295306 * hx * h1 + 284435 * hx) / (17500 * h1 + 37500) + s9 = (3 * hx) / 50 + s10 = (88755 * hx * h1^2 + 338056 * hx * h1 + 318185 * hx) / (8750 * h1 + 18750) + s11 = hx / 4 + s12 = -hx / 4 + s13 = hx / 10 + + s = SVector{nvariables(equations), real(equations)}(s11, s12, s13, + s1, s2, s3, s4, s5, + s6, s7, s8, s9, s10, + 0) + S_std = source_terms_lorentz(u, x, t, equations::IdealGlmMhdMultiIonEquations2D) + + return SVector{nvariables(equations), real(equations)}(S_std .+ s) +end + +initial_condition = initial_condition_manufactured_solution +source_terms = source_terms_manufactured_solution_pe + +volume_flux = (flux_ruedaramirez_etal, flux_nonconservative_ruedaramirez_etal) +surface_flux = (FluxLaxFriedrichs(max_abs_speed_naive), + flux_nonconservative_central) # Also works with flux_nonconservative_ruedaramirez_etal + +solver = DGSEM(polydeg = 3, surface_flux = surface_flux, + volume_integral = VolumeIntegralFluxDifferencing(volume_flux)) + +coordinates_min = (-1.0, -1.0) +coordinates_max = (1.0, 1.0) + +# To test convergence use: +# convergence_test("../examples/structured_2d_dgsem/elixir_mhdmultiion_convergence_twospecies.jl", 3, cells_per_dimension = (2, 2), polydeg = 3) +# Mapping as 
described in https://arxiv.org/abs/2012.12040 +function mapping(xi_, eta_) + # Transform input variables between -1 and 1 onto [0,3] + xi = 1.5 * xi_ + 1.5 + eta = 1.5 * eta_ + 1.5 + + y = eta + + 0.05 * (cospi(1.5 * (2 * xi - 3) / 3) * + cospi(0.5 * (2 * eta - 3) / 3)) + + x = xi + + 0.05 * (cospi(0.5 * (2 * xi - 3) / 3) * + cospi(2 * (2 * y - 3) / 3)) + + # Go back to [-1,1]^3 + x = x * 2 / 3 - 1 + y = y * 2 / 3 - 1 + + return SVector(x, y) +end +cells_per_dimension = (2, 2) +mesh = StructuredMesh(cells_per_dimension, mapping; + periodicity = true) + +# # Alternatively, you can test with a TreeMesh +# # convergence_test("../examples/structured_2d_dgsem/elixir_mhdmultiion_convergence_twospecies.jl", 3, initial_refinement_level = 1, polydeg = 3) +# initial_refinement_level = 1 +# mesh = TreeMesh(coordinates_min, coordinates_max, +# initial_refinement_level = initial_refinement_level, +# n_cells_max = 1_000_000, +# periodicity = true) + +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, + source_terms = source_terms, + boundary_conditions = boundary_condition_periodic) + +############################################################################### +# ODE solvers, callbacks etc. 
+ +tspan = (0.0, 1.0) +ode = semidiscretize(semi, tspan) + +summary_callback = SummaryCallback() + +analysis_interval = 100 +analysis_callback = AnalysisCallback(semi, interval = analysis_interval) +alive_callback = AliveCallback(analysis_interval = analysis_interval) + +save_solution = SaveSolutionCallback(interval = 1000, + save_initial_solution = true, + save_final_solution = true, + solution_variables = cons2prim) + +cfl = 0.5 +stepsize_callback = StepsizeCallback(cfl = cfl) + +glm_speed_callback = GlmSpeedCallback(glm_scale = 0.5, cfl = cfl) + +callbacks = CallbackSet(summary_callback, + analysis_callback, alive_callback, + save_solution, + stepsize_callback, + glm_speed_callback) + +############################################################################### +# run the simulation + +sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false), + dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback + save_everystep = false, callback = callbacks); diff --git a/examples/structured_2d_dgsem/elixir_mhdmultiion_ec.jl b/examples/structured_2d_dgsem/elixir_mhdmultiion_ec.jl new file mode 100644 index 00000000000..071985d669d --- /dev/null +++ b/examples/structured_2d_dgsem/elixir_mhdmultiion_ec.jl @@ -0,0 +1,64 @@ +using OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# semidiscretization of the ideal multi-ion MHD equations +equations = IdealGlmMhdMultiIonEquations2D(gammas = (1.4, 1.667), + charge_to_mass = (1.0, 2.0)) + +initial_condition = initial_condition_weak_blast_wave + +# Entropy conservative numerical fluxes +volume_flux = (flux_ruedaramirez_etal, flux_nonconservative_ruedaramirez_etal) +surface_flux = (flux_ruedaramirez_etal, flux_nonconservative_ruedaramirez_etal) + +solver = DGSEM(polydeg = 3, surface_flux = surface_flux, + volume_integral = VolumeIntegralFluxDifferencing(volume_flux)) + +coordinates_min = (-2.0, -2.0) 
+coordinates_max = (2.0, 2.0) +cells_per_dimension = (100, 100) +mesh = StructuredMesh(cells_per_dimension, coordinates_min, + coordinates_max, periodicity = true) + +# The multi-ion GLM-MHD equations require the inclusion of source_terms_lorentz +# whenever multiple ion species are present +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, + source_terms = source_terms_lorentz, + boundary_conditions = boundary_condition_periodic) + +############################################################################### +# ODE solvers, callbacks etc. + +tspan = (0.0, 0.4) +ode = semidiscretize(semi, tspan) + +summary_callback = SummaryCallback() + +analysis_interval = 10 +analysis_callback = AnalysisCallback(semi, interval = analysis_interval) +alive_callback = AliveCallback(analysis_interval = analysis_interval) + +save_solution = SaveSolutionCallback(dt = 0.1, # interval=100, + save_initial_solution = true, + save_final_solution = true, + solution_variables = cons2prim) + +cfl = 0.5 + +stepsize_callback = StepsizeCallback(cfl = cfl) + +glm_speed_callback = GlmSpeedCallback(glm_scale = 0.5, cfl = cfl) + +callbacks = CallbackSet(summary_callback, + analysis_callback, alive_callback, + save_solution, + stepsize_callback, + glm_speed_callback) + +############################################################################### +# run the simulation + +sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); + dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback + ode_default_options()..., callback = callbacks); diff --git a/examples/tree_1d_dgsem/elixir_advection_diffusion_cfl.jl b/examples/tree_1d_dgsem/elixir_advection_diffusion_cfl.jl index d06efc92334..d9969ccade6 100644 --- a/examples/tree_1d_dgsem/elixir_advection_diffusion_cfl.jl +++ b/examples/tree_1d_dgsem/elixir_advection_diffusion_cfl.jl @@ -62,10 +62,10 @@ analysis_callback = AnalysisCallback(semi, interval = 100) alive_callback = 
AliveCallback(analysis_interval = 100) # Stepsize callback which selects the timestep according to the most restrictive CFL condition. -# For coarser grids, linear stability is governed by the advective CFL condition, +# For coarser grids, linear stability is governed by the hyperbolic CFL condition, # while for high refinements the flow becomes diffusion-dominated. stepsize_callback = StepsizeCallback(cfl = 1.6, - cfl_diffusive = 0.3) + cfl_parabolic = 0.3) callbacks = CallbackSet(summary_callback, analysis_callback, alive_callback, stepsize_callback) diff --git a/examples/tree_1d_dgsem/elixir_advection_diffusion_dirichlet_amr.jl b/examples/tree_1d_dgsem/elixir_advection_diffusion_dirichlet_amr.jl index a2caf612b24..13d8d03f784 100644 --- a/examples/tree_1d_dgsem/elixir_advection_diffusion_dirichlet_amr.jl +++ b/examples/tree_1d_dgsem/elixir_advection_diffusion_dirichlet_amr.jl @@ -42,7 +42,7 @@ boundary_conditions_parabolic = BoundaryConditionDirichlet(initial_condition) semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationBassiRebay1(), + solver_parabolic = ParabolicFormulationBassiRebay1(), boundary_conditions = (boundary_conditions, boundary_conditions_parabolic)) diff --git a/examples/tree_1d_dgsem/elixir_advection_diffusion_gradient_source_terms.jl b/examples/tree_1d_dgsem/elixir_advection_diffusion_gradient_source_terms.jl index 7908945fdae..83459b535bc 100644 --- a/examples/tree_1d_dgsem/elixir_advection_diffusion_gradient_source_terms.jl +++ b/examples/tree_1d_dgsem/elixir_advection_diffusion_gradient_source_terms.jl @@ -40,7 +40,7 @@ boundary_conditions_parabolic = boundary_condition_periodic semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationLocalDG(), + solver_parabolic = ParabolicFormulationLocalDG(), source_terms = source_terms, source_terms_parabolic = 
source_terms_parabolic, boundary_conditions = (boundary_conditions, @@ -58,10 +58,10 @@ analysis_callback = AnalysisCallback(semi, interval = 100) alive_callback = AliveCallback(analysis_interval = 100) -cfl_advective = 0.5 -cfl_diffusive = 0.05 -stepsize_callback = StepsizeCallback(cfl = cfl_advective, - cfl_diffusive = cfl_diffusive) +cfl_hyperbolic = 0.5 +cfl_parabolic = 0.05 +stepsize_callback = StepsizeCallback(cfl = cfl_hyperbolic, + cfl_parabolic = cfl_parabolic) callbacks = CallbackSet(summary_callback, analysis_callback, alive_callback, stepsize_callback) diff --git a/examples/tree_1d_dgsem/elixir_advection_diffusion_implicit_sparse_jacobian.jl b/examples/tree_1d_dgsem/elixir_advection_diffusion_implicit_sparse_jacobian.jl index dfa4963d17c..fb170734e5a 100644 --- a/examples/tree_1d_dgsem/elixir_advection_diffusion_implicit_sparse_jacobian.jl +++ b/examples/tree_1d_dgsem/elixir_advection_diffusion_implicit_sparse_jacobian.jl @@ -1,7 +1,8 @@ using Trixi +using OrdinaryDiffEqBDF using SparseConnectivityTracer # For obtaining the Jacobian sparsity pattern using SparseMatrixColorings # For obtaining the coloring vector -using OrdinaryDiffEqBDF, ADTypes +using ADTypes # To access the types choosing how to evaluate Jacobian-vector products ############################################################################### # semidiscretization of the linear advection-diffusion equation @@ -126,5 +127,4 @@ callbacks = CallbackSet(summary_callback, analysis_callback, alive_callback, sav sol = solve(ode, SBDF2(; autodiff = AutoFiniteDiff()); ode_default_options()..., dt = 0.01, - abstol = 1e-9, reltol = 1e-9, callback = callbacks) diff --git a/examples/tree_1d_dgsem/elixir_advection_diffusion_implicit_sparse_jacobian_restart.jl b/examples/tree_1d_dgsem/elixir_advection_diffusion_implicit_sparse_jacobian_restart.jl index 4e76349f437..1bbb69f64e4 100644 --- a/examples/tree_1d_dgsem/elixir_advection_diffusion_implicit_sparse_jacobian_restart.jl +++ 
b/examples/tree_1d_dgsem/elixir_advection_diffusion_implicit_sparse_jacobian_restart.jl @@ -25,5 +25,4 @@ ode = semidiscretize(semi, tspan, sol = solve(ode, SBDF2(; autodiff = AutoFiniteDiff()); ode_default_options()..., dt = dt_restart, - abstol = 1e-9, reltol = 1e-9, callback = callbacks); diff --git a/examples/tree_1d_dgsem/elixir_advection_diffusion_ldg.jl b/examples/tree_1d_dgsem/elixir_advection_diffusion_ldg.jl index 4a5c9776e62..8cc4fa8aad9 100644 --- a/examples/tree_1d_dgsem/elixir_advection_diffusion_ldg.jl +++ b/examples/tree_1d_dgsem/elixir_advection_diffusion_ldg.jl @@ -54,7 +54,7 @@ boundary_conditions_parabolic = boundary_condition_periodic semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationLocalDG(), + solver_parabolic = ParabolicFormulationLocalDG(), boundary_conditions = (boundary_conditions, boundary_conditions_parabolic)) diff --git a/examples/tree_1d_dgsem/elixir_advection_diffusion_neumann_amr.jl b/examples/tree_1d_dgsem/elixir_advection_diffusion_neumann_amr.jl index 0a8531679b2..e34a594c4fd 100644 --- a/examples/tree_1d_dgsem/elixir_advection_diffusion_neumann_amr.jl +++ b/examples/tree_1d_dgsem/elixir_advection_diffusion_neumann_amr.jl @@ -26,7 +26,7 @@ boundary_condition_neumann_zero = BoundaryConditionNeumann((x, t, equations_para boundary_conditions = (; x_neg = boundary_condition_left, x_pos = boundary_condition_do_nothing) -# define viscous boundary conditions +# define parabolic boundary conditions boundary_conditions_parabolic = (; x_neg = boundary_condition_left, x_pos = boundary_condition_neumann_zero) diff --git a/examples/tree_1d_dgsem/elixir_diffusion_ldg.jl b/examples/tree_1d_dgsem/elixir_diffusion_ldg.jl index 143a9cc9840..8f79de2d3c4 100644 --- a/examples/tree_1d_dgsem/elixir_diffusion_ldg.jl +++ b/examples/tree_1d_dgsem/elixir_diffusion_ldg.jl @@ -41,7 +41,7 @@ boundary_conditions = boundary_condition_periodic 
boundary_conditions_parabolic = boundary_condition_periodic # A semidiscretization collects data structures and functions for the spatial discretization -solver_parabolic = ViscousFormulationLocalDG() +solver_parabolic = ParabolicFormulationLocalDG() semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; solver_parabolic, diff --git a/examples/tree_1d_dgsem/elixir_diffusion_ldg_newton_krylov.jl b/examples/tree_1d_dgsem/elixir_diffusion_ldg_newton_krylov.jl index 8fc08add88a..f52b62738cd 100644 --- a/examples/tree_1d_dgsem/elixir_diffusion_ldg_newton_krylov.jl +++ b/examples/tree_1d_dgsem/elixir_diffusion_ldg_newton_krylov.jl @@ -33,7 +33,7 @@ function initial_condition_pure_diffusion_1d_convergence_test(x, t, end initial_condition = initial_condition_pure_diffusion_1d_convergence_test -solver_parabolic = ViscousFormulationLocalDG() +solver_parabolic = ParabolicFormulationLocalDG() semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; solver_parabolic, diff --git a/examples/tree_1d_dgsem/elixir_navierstokes_convergence_periodic_cfl.jl b/examples/tree_1d_dgsem/elixir_navierstokes_convergence_periodic_cfl.jl index ea61ce8c547..7f0006b9c1b 100644 --- a/examples/tree_1d_dgsem/elixir_navierstokes_convergence_periodic_cfl.jl +++ b/examples/tree_1d_dgsem/elixir_navierstokes_convergence_periodic_cfl.jl @@ -125,10 +125,10 @@ analysis_callback = AnalysisCallback(semi, interval = analysis_interval) alive_callback = AliveCallback(analysis_interval = analysis_interval) # Stepsize callback which selects the timestep according to the most restrictive CFL condition. -# For coarser grids, linear stability is governed by the advective/convective CFL condition, +# For coarser grids, linear stability is governed by the hyperbolic CFL condition, # while for high refinements (e.g. initial_refinement_level = 8) the flow becomes diffusion-dominated. 
stepsize_callback = StepsizeCallback(cfl = 2.7, - cfl_diffusive = 0.2) + cfl_parabolic = 0.2) callbacks = CallbackSet(summary_callback, analysis_callback, diff --git a/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls.jl b/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls.jl index c0239f5c08d..f31dd16be61 100644 --- a/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls.jl +++ b/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls.jl @@ -152,7 +152,7 @@ boundary_condition_right = BoundaryConditionNavierStokesWall(velocity_bc_left_ri boundary_conditions = (; x_neg = boundary_condition_slip_wall, x_pos = boundary_condition_slip_wall) -# define viscous boundary conditions +# define parabolic boundary conditions boundary_conditions_parabolic = (; x_neg = boundary_condition_left, x_pos = boundary_condition_right) diff --git a/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls_amr.jl b/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls_amr.jl index 998f5bf107d..0a387965160 100644 --- a/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls_amr.jl +++ b/examples/tree_1d_dgsem/elixir_navierstokes_convergence_walls_amr.jl @@ -152,7 +152,7 @@ boundary_condition_right = BoundaryConditionNavierStokesWall(velocity_bc_left_ri boundary_conditions = (; x_neg = boundary_condition_slip_wall, x_pos = boundary_condition_slip_wall) -# define viscous boundary conditions +# define parabolic boundary conditions boundary_conditions_parabolic = (; x_neg = boundary_condition_left, x_pos = boundary_condition_right) diff --git a/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock.jl b/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock.jl index a7a91921604..5707c6b3efd 100644 --- a/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock.jl +++ b/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock.jl @@ -141,7 +141,7 @@ boundary_conditions_parabolic = (; x_neg = boundary_condition_parabolic, # Since this is a 
diffusion-dominated problem, using the LDG scheme should achieve optimal rates of convergence. # In contrast, BR-1 may achieve suboptimal rates of convergence in diffusion-dominated regimes. # The LDG scheme can be used by specifying the keyword -# solver_parabolic = ViscousFormulationLocalDG() +# solver_parabolic = ParabolicFormulationLocalDG() # in the semidiscretization call below. semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; diff --git a/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock_imex.jl b/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock_imex.jl index c3166cab961..428ba79535d 100644 --- a/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock_imex.jl +++ b/examples/tree_1d_dgsem/elixir_navierstokes_viscous_shock_imex.jl @@ -133,7 +133,7 @@ boundary_conditions_parabolic = (; x_neg = boundary_condition_parabolic, semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationLocalDG(), + solver_parabolic = ParabolicFormulationLocalDG(), boundary_conditions = (boundary_conditions, boundary_conditions_parabolic)) diff --git a/examples/tree_1d_dgsem/elixir_viscous_burgers_n_wave.jl b/examples/tree_1d_dgsem/elixir_viscous_burgers_n_wave.jl index 091ab163eae..6555c3fcba2 100644 --- a/examples/tree_1d_dgsem/elixir_viscous_burgers_n_wave.jl +++ b/examples/tree_1d_dgsem/elixir_viscous_burgers_n_wave.jl @@ -53,8 +53,8 @@ analysis_callback = AnalysisCallback(semi, interval = 100) alive_callback = AliveCallback(analysis_interval = 100) -# Timestep is limited by standard/advective/convective CFL -stepsize_callback = StepsizeCallback(cfl = 0.6, cfl_diffusive = 0.1) +# Timestep is limited by the hyperbolic CFL +stepsize_callback = StepsizeCallback(cfl = 0.6, cfl_parabolic = 0.1) callbacks = CallbackSet(summary_callback, analysis_callback, alive_callback, diff --git 
a/examples/tree_1d_dgsem/elixir_viscous_burgers_shock.jl b/examples/tree_1d_dgsem/elixir_viscous_burgers_shock.jl index 83bc8498b76..e04f14d640d 100644 --- a/examples/tree_1d_dgsem/elixir_viscous_burgers_shock.jl +++ b/examples/tree_1d_dgsem/elixir_viscous_burgers_shock.jl @@ -51,8 +51,8 @@ analysis_callback = AnalysisCallback(semi, interval = 100) alive_callback = AliveCallback(analysis_interval = 100) -# Timestep is limited by diffusive CFL -stepsize_callback = StepsizeCallback(cfl = 0.8, cfl_diffusive = 0.15) +# Timestep is limited by parabolic CFL +stepsize_callback = StepsizeCallback(cfl = 0.8, cfl_parabolic = 0.15) callbacks = CallbackSet(summary_callback, analysis_callback, alive_callback, diff --git a/examples/tree_2d_dgsem/elixir_advection_diffusion.jl b/examples/tree_2d_dgsem/elixir_advection_diffusion.jl index ed4ed2432e5..f1d209c7ea6 100644 --- a/examples/tree_2d_dgsem/elixir_advection_diffusion.jl +++ b/examples/tree_2d_dgsem/elixir_advection_diffusion.jl @@ -47,7 +47,7 @@ boundary_conditions_parabolic = boundary_condition_periodic semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationBassiRebay1(), + solver_parabolic = ParabolicFormulationBassiRebay1(), boundary_conditions = (boundary_conditions, boundary_conditions_parabolic)) diff --git a/examples/tree_2d_dgsem/elixir_advection_diffusion_amr.jl b/examples/tree_2d_dgsem/elixir_advection_diffusion_amr.jl index b4277434129..d91fd2226bf 100644 --- a/examples/tree_2d_dgsem/elixir_advection_diffusion_amr.jl +++ b/examples/tree_2d_dgsem/elixir_advection_diffusion_amr.jl @@ -43,7 +43,7 @@ boundary_conditions_parabolic = boundary_condition_periodic semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationLocalDG(), + solver_parabolic = ParabolicFormulationLocalDG(), boundary_conditions = (boundary_conditions, 
boundary_conditions_parabolic)) diff --git a/examples/tree_2d_dgsem/elixir_advection_diffusion_gradient_source_terms.jl b/examples/tree_2d_dgsem/elixir_advection_diffusion_gradient_source_terms.jl index 740594ee954..a68f0c1eeda 100644 --- a/examples/tree_2d_dgsem/elixir_advection_diffusion_gradient_source_terms.jl +++ b/examples/tree_2d_dgsem/elixir_advection_diffusion_gradient_source_terms.jl @@ -49,7 +49,7 @@ boundary_conditions_parabolic = boundary_condition_periodic semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationLocalDG(), + solver_parabolic = ParabolicFormulationLocalDG(), source_terms = source_terms, source_terms_parabolic = source_terms_parabolic, boundary_conditions = (boundary_conditions, @@ -67,10 +67,10 @@ analysis_callback = AnalysisCallback(semi, interval = 100) alive_callback = AliveCallback(analysis_interval = 100) -cfl_advective = 0.5 -cfl_diffusive = 0.01 -stepsize_callback = StepsizeCallback(cfl = cfl_advective, - cfl_diffusive = cfl_diffusive) +cfl_hyperbolic = 0.5 +cfl_parabolic = 0.01 +stepsize_callback = StepsizeCallback(cfl = cfl_hyperbolic, + cfl_parabolic = cfl_parabolic) callbacks = CallbackSet(summary_callback, analysis_callback, alive_callback, stepsize_callback) diff --git a/examples/tree_2d_dgsem/elixir_advection_diffusion_nonperiodic.jl b/examples/tree_2d_dgsem/elixir_advection_diffusion_nonperiodic.jl index 93abfd49a3a..2234d8fdf76 100644 --- a/examples/tree_2d_dgsem/elixir_advection_diffusion_nonperiodic.jl +++ b/examples/tree_2d_dgsem/elixir_advection_diffusion_nonperiodic.jl @@ -51,7 +51,7 @@ boundary_conditions_parabolic = BoundaryConditionDirichlet(initial_condition) semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationBassiRebay1(), + solver_parabolic = ParabolicFormulationBassiRebay1(), boundary_conditions = 
(boundary_conditions, boundary_conditions_parabolic)) diff --git a/examples/tree_2d_dgsem/elixir_advection_diffusion_nonperiodic_amr.jl b/examples/tree_2d_dgsem/elixir_advection_diffusion_nonperiodic_amr.jl index 95f9c46f2b8..e7667f9808d 100644 --- a/examples/tree_2d_dgsem/elixir_advection_diffusion_nonperiodic_amr.jl +++ b/examples/tree_2d_dgsem/elixir_advection_diffusion_nonperiodic_amr.jl @@ -49,7 +49,7 @@ boundary_conditions_parabolic = BoundaryConditionDirichlet(initial_condition) semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationBassiRebay1(), + solver_parabolic = ParabolicFormulationBassiRebay1(), boundary_conditions = (boundary_conditions, boundary_conditions_parabolic)) diff --git a/examples/tree_2d_dgsem/elixir_advection_implicit_sparse_jacobian.jl b/examples/tree_2d_dgsem/elixir_advection_implicit_sparse_jacobian.jl index abf68d33908..710bcfc66df 100644 --- a/examples/tree_2d_dgsem/elixir_advection_implicit_sparse_jacobian.jl +++ b/examples/tree_2d_dgsem/elixir_advection_implicit_sparse_jacobian.jl @@ -1,7 +1,8 @@ using Trixi using SparseConnectivityTracer # For obtaining the Jacobian sparsity pattern using SparseMatrixColorings # For obtaining the coloring vector -using OrdinaryDiffEqSDIRK, ADTypes +using OrdinaryDiffEqSDIRK, OrdinaryDiffEqDifferentiation +using ADTypes ############################################################################### ### equation, solver, mesh ### diff --git a/examples/tree_2d_dgsem/elixir_diffusion_steady_state_linear_map.jl b/examples/tree_2d_dgsem/elixir_diffusion_steady_state_linear_map.jl index c92140a6613..51530f3286f 100644 --- a/examples/tree_2d_dgsem/elixir_diffusion_steady_state_linear_map.jl +++ b/examples/tree_2d_dgsem/elixir_diffusion_steady_state_linear_map.jl @@ -44,11 +44,11 @@ boundary_conditions = (; x_neg = bc_homogeneous_dirichlet, y_pos = bc_sin_dirichlet, x_pos = bc_homogeneous_dirichlet) -# 
`solver_parabolic = ViscousFormulationLocalDG()` strictly required for elliptic/diffusion-dominated problem +# `solver_parabolic = ParabolicFormulationLocalDG()` strictly required for elliptic/diffusion-dominated problem semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationLocalDG(), + solver_parabolic = ParabolicFormulationLocalDG(), boundary_conditions = (boundary_conditions, boundary_conditions)) diff --git a/examples/tree_2d_dgsem/elixir_navierstokes_convergence.jl b/examples/tree_2d_dgsem/elixir_navierstokes_convergence.jl index 16ea4742315..38aed3028c6 100644 --- a/examples/tree_2d_dgsem/elixir_navierstokes_convergence.jl +++ b/examples/tree_2d_dgsem/elixir_navierstokes_convergence.jl @@ -205,7 +205,7 @@ boundary_conditions = (; x_neg = boundary_condition_periodic, y_neg = boundary_condition_slip_wall, y_pos = boundary_condition_slip_wall) -# define viscous boundary conditions +# define parabolic boundary conditions boundary_conditions_parabolic = (; x_neg = boundary_condition_periodic, x_pos = boundary_condition_periodic, y_neg = boundary_condition_top_bottom, diff --git a/examples/tree_2d_dgsem/elixir_navierstokes_shearlayer_nonconforming.jl b/examples/tree_2d_dgsem/elixir_navierstokes_shearlayer_nonconforming.jl index 257d3293448..ef623715a20 100644 --- a/examples/tree_2d_dgsem/elixir_navierstokes_shearlayer_nonconforming.jl +++ b/examples/tree_2d_dgsem/elixir_navierstokes_shearlayer_nonconforming.jl @@ -55,7 +55,7 @@ mesh = TreeMesh(coordinates_min, coordinates_max, semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationBassiRebay1(), + solver_parabolic = ParabolicFormulationBassiRebay1(), boundary_conditions = (boundary_condition_periodic, boundary_condition_periodic)) diff --git a/examples/tree_2d_dgsem/elixir_navierstokes_viscous_shock.jl 
b/examples/tree_2d_dgsem/elixir_navierstokes_viscous_shock.jl index 6a06e3cb4f6..4f45baa4a88 100644 --- a/examples/tree_2d_dgsem/elixir_navierstokes_viscous_shock.jl +++ b/examples/tree_2d_dgsem/elixir_navierstokes_viscous_shock.jl @@ -153,6 +153,7 @@ boundary_conditions_parabolic = (x_neg = boundary_condition_parabolic, semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; + solver_parabolic = ParabolicFormulationBassiRebay1(), boundary_conditions = (boundary_conditions, boundary_conditions_parabolic)) @@ -170,12 +171,12 @@ alive_callback = AliveCallback(alive_interval = 10) analysis_interval = 100 analysis_callback = AnalysisCallback(semi, interval = analysis_interval) -# Admissible stepsize is governed by the diffusive CFL condition. -# Unless the advective cfl number `cfl` is not reduced to e.g. `0.1` +# Admissible stepsize is governed by the parabolic CFL condition. +# Unless the hyperbolic CFL number `cfl` is reduced to e.g. `0.1` # (which is overly restrictive for this problem), -# the diffusive CFL restricts the timestep for this problem. +# the parabolic CFL restricts the timestep for this problem. 
stepsize_callback = StepsizeCallback(cfl = 0.2, - cfl_diffusive = 0.2) + cfl_parabolic = 0.2) callbacks = CallbackSet(summary_callback, alive_callback, analysis_callback, stepsize_callback) diff --git a/examples/tree_3d_dgsem/elixir_advection_diffusion_amr.jl b/examples/tree_3d_dgsem/elixir_advection_diffusion_amr.jl index 5d886b01cf4..e79f495bef5 100644 --- a/examples/tree_3d_dgsem/elixir_advection_diffusion_amr.jl +++ b/examples/tree_3d_dgsem/elixir_advection_diffusion_amr.jl @@ -43,7 +43,7 @@ boundary_conditions_parabolic = boundary_condition_periodic semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationBassiRebay1(), + solver_parabolic = ParabolicFormulationBassiRebay1(), boundary_conditions = (boundary_conditions, boundary_conditions_parabolic)) diff --git a/examples/tree_3d_dgsem/elixir_advection_diffusion_gradient_source_terms.jl b/examples/tree_3d_dgsem/elixir_advection_diffusion_gradient_source_terms.jl index 1a107511344..d8379ba0972 100644 --- a/examples/tree_3d_dgsem/elixir_advection_diffusion_gradient_source_terms.jl +++ b/examples/tree_3d_dgsem/elixir_advection_diffusion_gradient_source_terms.jl @@ -49,7 +49,7 @@ boundary_conditions_parabolic = boundary_condition_periodic semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationLocalDG(), + solver_parabolic = ParabolicFormulationLocalDG(), source_terms = source_terms, source_terms_parabolic = source_terms_parabolic, boundary_conditions = (boundary_conditions, @@ -67,10 +67,10 @@ analysis_callback = AnalysisCallback(semi, interval = 100) alive_callback = AliveCallback(analysis_interval = 100) -cfl_advective = 0.5 # Not restrictive for this example -cfl_diffusive = 0.01 # Restricts the timestep -stepsize_callback = StepsizeCallback(cfl = cfl_advective, - cfl_diffusive = cfl_diffusive) +cfl_hyperbolic = 0.5 # Not 
restrictive for this example +cfl_parabolic = 0.01 # Restricts the timestep +stepsize_callback = StepsizeCallback(cfl = cfl_hyperbolic, + cfl_parabolic = cfl_parabolic) callbacks = CallbackSet(summary_callback, analysis_callback, alive_callback, stepsize_callback) diff --git a/examples/tree_3d_dgsem/elixir_advection_diffusion_nonconforming.jl b/examples/tree_3d_dgsem/elixir_advection_diffusion_nonconforming.jl index 9ddd6f875e3..226d583b105 100644 --- a/examples/tree_3d_dgsem/elixir_advection_diffusion_nonconforming.jl +++ b/examples/tree_3d_dgsem/elixir_advection_diffusion_nonconforming.jl @@ -55,7 +55,7 @@ boundary_conditions_parabolic = BoundaryConditionDirichlet(initial_condition) semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationBassiRebay1(), + solver_parabolic = ParabolicFormulationBassiRebay1(), boundary_conditions = (boundary_conditions, boundary_conditions_parabolic)) diff --git a/examples/tree_3d_dgsem/elixir_advection_diffusion_nonperiodic.jl b/examples/tree_3d_dgsem/elixir_advection_diffusion_nonperiodic.jl index a8a6d27edb0..03d2ffff6c7 100644 --- a/examples/tree_3d_dgsem/elixir_advection_diffusion_nonperiodic.jl +++ b/examples/tree_3d_dgsem/elixir_advection_diffusion_nonperiodic.jl @@ -52,7 +52,7 @@ boundary_conditions_parabolic = BoundaryConditionDirichlet(initial_condition) semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, solver; - solver_parabolic = ViscousFormulationBassiRebay1(), + solver_parabolic = ParabolicFormulationBassiRebay1(), boundary_conditions = (boundary_conditions, boundary_conditions_parabolic)) diff --git a/examples/tree_3d_dgsem/elixir_euler_sedov_blast_wave_sc_subcell.jl b/examples/tree_3d_dgsem/elixir_euler_sedov_blast_wave_sc_subcell.jl new file mode 100644 index 00000000000..19b3b6d04f2 --- /dev/null +++ b/examples/tree_3d_dgsem/elixir_euler_sedov_blast_wave_sc_subcell.jl 
@@ -0,0 +1,102 @@ +using Trixi + +############################################################################### +# semidiscretization of the compressible Euler equations + +equations = CompressibleEulerEquations3D(1.4) + +""" + initial_condition_sedov_blast_wave(x, t, equations::CompressibleEulerEquations3D) + +The Sedov blast wave setup based on example 35.1.4 from Flash +- https://flash.rochester.edu/site/flashcode/user_support/flash4_ug_4p8.pdf +with smaller strength of the initial discontinuity. +""" +function initial_condition_sedov_blast_wave(x, t, + equations::CompressibleEulerEquations3D) + # Set up polar coordinates + inicenter = SVector(0.0, 0.0, 0.0) + x_norm = x[1] - inicenter[1] + y_norm = x[2] - inicenter[2] + z_norm = x[3] - inicenter[3] + r = sqrt(x_norm^2 + y_norm^2 + z_norm^2) + + # Setup based on https://flash.rochester.edu/site/flashcode/user_support/flash_ug_devel/node187.html#SECTION010114000000000000000 + r0 = 0.21875 # = 3.5 * smallest dx (for domain length=4 and max-ref=6) + E = 1.0 + p0_inner = 3 * (equations.gamma - 1) * E / (4 * pi * r0^2) + p0_outer = 1.0e-5 + + # Calculate primitive variables + rho = 1.0 + v1 = 0.0 + v2 = 0.0 + v3 = 0.0 + p = r > r0 ? 
p0_outer : p0_inner + + return prim2cons(SVector(rho, v1, v2, v3, p), equations) +end +initial_condition = initial_condition_sedov_blast_wave + +surface_flux = flux_lax_friedrichs +volume_flux = flux_ranocha +polydeg = 3 +basis = LobattoLegendreBasis(polydeg) +limiter_idp = SubcellLimiterIDP(equations, basis; + positivity_variables_cons = ["rho"], + positivity_variables_nonlinear = [pressure], + local_twosided_variables_cons = ["rho"], + local_onesided_variables_nonlinear = [(entropy, + max)], + max_iterations_newton = 70, + bar_states = false) +volume_integral = VolumeIntegralSubcellLimiting(limiter_idp; + volume_flux_dg = volume_flux, + volume_flux_fv = surface_flux) +solver = DGSEM(basis, surface_flux, volume_integral) + +coordinates_min = (-1.0, -1.0, -1.0) +coordinates_max = (1.0, 1.0, 1.0) +mesh = TreeMesh(coordinates_min, coordinates_max, + initial_refinement_level = 3, + n_cells_max = 100_000, + periodicity = true) + +# create the semi discretization object +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, + boundary_conditions = boundary_condition_periodic) + +############################################################################### +# ODE solvers, callbacks etc. 
+ +tspan = (0.0, 3.0) +ode = semidiscretize(semi, tspan) + +summary_callback = SummaryCallback() + +analysis_interval = 100 +analysis_callback = AnalysisCallback(semi, interval = analysis_interval) + +alive_callback = AliveCallback(analysis_interval = analysis_interval) + +save_solution = SaveSolutionCallback(interval = 10, + save_initial_solution = true, + save_final_solution = true, + extra_node_variables = (:limiting_coefficient,)) + +stepsize_callback = StepsizeCallback(cfl = 0.5) + +callbacks = CallbackSet(summary_callback, + analysis_callback, + alive_callback, + save_solution, + stepsize_callback) + +############################################################################### +# run the simulation + +stage_callbacks = (SubcellLimiterIDPCorrection(), BoundsCheckCallback()) + +sol = Trixi.solve(ode, Trixi.SimpleSSPRK33(stage_callbacks = stage_callbacks); + dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback + callback = callbacks); diff --git a/examples/tree_3d_dgsem/elixir_mhd_convergence.jl b/examples/tree_3d_dgsem/elixir_mhd_convergence.jl new file mode 100644 index 00000000000..c92141fd2f1 --- /dev/null +++ b/examples/tree_3d_dgsem/elixir_mhd_convergence.jl @@ -0,0 +1,118 @@ +using Trixi +using OrdinaryDiffEqLowStorageRK + +############################################################################### +# semidiscretization of the compressible ideal GLM-MHD equations + +gamma() = 2.0 # required to make solution below working! +equations = IdealGlmMhdEquations3D(gamma()) + +""" + initial_condition_convergence(x, t, equations::IdealGlmMhdEquations3D) + +Manufactured solution for the 3D(only!) compressible ideal GLM-MHD equations. +Proposed in +- Marvin Bohm, Andrew R Winters, Gregor J Gassner, Dominik Derigs, Florian Hindenlang, Joachim Saur (2020): + An entropy stable nodal discontinuous Galerkin method for the resistive MHD equations. 
+ Part I: Theory and numerical verification + [10.1016/j.jcp.2018.06.027](https://doi.org/10.1016/j.jcp.2018.06.027) +""" +@inline function initial_condition_convergence(x, t, equations::IdealGlmMhdEquations3D) + h = 0.5f0 * sinpi(2 * (x[1] + x[2] + x[3] - t)) + 2 + + u_1 = h + u_2 = h + u_3 = h + u_4 = 0 + u_5 = 2 * h^2 + h + + u_6 = h + u_7 = -h + u_8 = 0 + u_9 = 0 + + return SVector(u_1, u_2, u_3, u_4, u_5, u_6, u_7, u_8, u_9) +end + +""" + source_terms_convergence(x, t, equations::IdealGlmMhdEquations3D) + +Manufactured solution for the 3D(only!) compressible ideal GLM-MHD equations. +Proposed in +- Marvin Bohm, Andrew R Winters, Gregor J Gassner, Dominik Derigs, Florian Hindenlang, Joachim Saur (2020): + An entropy stable nodal discontinuous Galerkin method for the resistive MHD equations. + Part I: Theory and numerical verification + [10.1016/j.jcp.2018.06.027](https://doi.org/10.1016/j.jcp.2018.06.027) + +For the version without parabolic terms see the implementation in FLUXO: +https://github.com/project-fluxo/fluxo/blob/c7e0cc9b7fd4569dcab67bbb6e5a25c0a84859f1/src/equation/mhd/equation.f90#L1539-L1554 +""" +function source_terms_convergence(u, x, t, equations::IdealGlmMhdEquations3D) + h = 0.5f0 * sinpi(2 * (x[1] + x[2] + x[3] - t)) + 2 + h_x = pi * cospi(2 * (x[1] + x[2] + x[3] - t)) + + s_1 = h_x + s_2 = h_x + 4 * h * h_x + s_3 = h_x + 4 * h * h_x + s_4 = 4 * h * h_x + s_5 = h_x + 12 * h * h_x + s_6 = h_x + s_7 = -h_x + s_8 = 0 + s_9 = 0 + + return SVector(s_1, s_2, s_3, s_4, s_5, s_6, s_7, s_8, s_9) +end + +surface_flux = (flux_hll, flux_nonconservative_powell) +volume_flux = (flux_hindenlang_gassner, flux_nonconservative_powell) + +polydeg = 3 +basis = LobattoLegendreBasis(polydeg) + +volume_integral = VolumeIntegralFluxDifferencing(volume_flux) +solver = DGSEM(basis, surface_flux, volume_integral) + +coordinates_min = (0.0, 0.0, 0.0) +coordinates_max = (1.0, 1.0, 1.0) + +mesh = TreeMesh(coordinates_min, coordinates_max, + initial_refinement_level = 2, 
+ periodicity = true, + n_cells_max = 100_000) + +semi = SemidiscretizationHyperbolic(mesh, equations, + initial_condition_convergence, solver; + source_terms = source_terms_convergence, + boundary_conditions = boundary_condition_periodic) + +############################################################################### +# ODE solvers, callbacks etc. + +tspan = (0.0, 1.0) +ode = semidiscretize(semi, tspan) + +summary_callback = SummaryCallback() + +analysis_interval = 50 +analysis_callback = AnalysisCallback(semi, interval = analysis_interval) + +alive_callback = AliveCallback(analysis_interval = analysis_interval) + +cfl = 1.8 +stepsize_callback = StepsizeCallback(cfl = cfl) + +glm_speed_callback = GlmSpeedCallback(glm_scale = 0.5, cfl = cfl) + +callbacks = CallbackSet(summary_callback, + analysis_callback, + alive_callback, + stepsize_callback, + glm_speed_callback) + +############################################################################### +# run the simulation + +sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); + dt = 1.0, # solve needs some value here but it will be overwritten by the stepsize_callback + ode_default_options()..., callback = callbacks); diff --git a/examples/tree_3d_dgsem/elixir_navierstokes_convergence.jl b/examples/tree_3d_dgsem/elixir_navierstokes_convergence.jl index 1f1c734ba00..83f60647e61 100644 --- a/examples/tree_3d_dgsem/elixir_navierstokes_convergence.jl +++ b/examples/tree_3d_dgsem/elixir_navierstokes_convergence.jl @@ -246,7 +246,7 @@ boundary_conditions = (; x_neg = boundary_condition_periodic, z_neg = boundary_condition_periodic, z_pos = boundary_condition_periodic) -# define viscous boundary conditions +# define parabolic boundary conditions boundary_conditions_parabolic = (; x_neg = boundary_condition_periodic, x_pos = boundary_condition_periodic, y_neg = boundary_condition_top_bottom, diff --git a/examples/tree_3d_dgsem/elixir_navierstokes_viscous_shock.jl 
b/examples/tree_3d_dgsem/elixir_navierstokes_viscous_shock.jl index 8df31a0561a..858029fdd96 100644 --- a/examples/tree_3d_dgsem/elixir_navierstokes_viscous_shock.jl +++ b/examples/tree_3d_dgsem/elixir_navierstokes_viscous_shock.jl @@ -174,10 +174,10 @@ alive_callback = AliveCallback(alive_interval = 10) analysis_interval = 100 analysis_callback = AnalysisCallback(semi, interval = analysis_interval) -# For this setup, both advective and diffusive time step restrictions are relevant, i.e., +# For this setup, both hyperbolic and parabolic timestep restrictions are relevant, i.e., # may not be increased beyond the given values. stepsize_callback = StepsizeCallback(cfl = 0.4, - cfl_diffusive = 0.2) + cfl_parabolic = 0.2) callbacks = CallbackSet(summary_callback, alive_callback, analysis_callback, stepsize_callback) diff --git a/ext/TrixiCUDAExt.jl b/ext/TrixiCUDAExt.jl index 681d2f53a1e..7f4f31c3f0f 100644 --- a/ext/TrixiCUDAExt.jl +++ b/ext/TrixiCUDAExt.jl @@ -1,11 +1,27 @@ # Package extension for adding CUDA-based features to Trixi.jl module TrixiCUDAExt -import CUDA: CuArray +using CUDA: CUDA, CuArray, CuDeviceArray, KernelAdaptor, @device_override import Trixi function Trixi.storage_type(::Type{<:CuArray}) return CuArray end +function Trixi.unsafe_wrap_or_alloc(::KernelAdaptor, vec, size) + return Trixi.unsafe_wrap_or_alloc(CuDeviceArray, vec, size) +end + +function Trixi.unsafe_wrap_or_alloc(::Type{<:CuDeviceArray}, vec::CuDeviceArray, size) + return reshape(vec, size) +end + +@static if Trixi._PREFERENCE_LOG == "log_Trixi_NaN" + @device_override Trixi.log(x::Float64) = ccall("extern __nv_log", llvmcall, Cdouble, + (Cdouble,), x) + @device_override Trixi.log(x::Float32) = ccall("extern __nv_logf", llvmcall, Cfloat, + (Cfloat,), x) + # TODO: Trixi.log(x::Float16) +end + end diff --git a/ext/TrixiPlotsExt.jl b/ext/TrixiPlotsExt.jl new file mode 100644 index 00000000000..18cfc87d3c1 --- /dev/null +++ b/ext/TrixiPlotsExt.jl @@ -0,0 +1,72 @@ +module TrixiPlotsExt + +# 
Load the required packages +using Plots: Plots +using Trixi: Trixi, getmesh +using MuladdMacro: @muladd +using Printf: @sprintf + +@muladd begin +#! format: noindent + +function Trixi.show_plot(plot_data, variable_names; + show_mesh = true, plot_arguments = Dict{Symbol, Any}(), + time = nothing, timestep = nothing) + # Gather subplots + plots = [] + for v in variable_names + push!(plots, Plots.plot(plot_data[v]; plot_arguments...)) + end + if show_mesh + push!(plots, Plots.plot(getmesh(plot_data); plot_arguments...)) + end + + # Note, for the visualization callback to work for general equation systems + # this layout construction would need to use the if-logic below. + # Currently, there is no use case for this so it is left here as a note. + # + # Determine layout + # if length(plots) <= 3 + # cols = length(plots) + # rows = 1 + # else + # cols = ceil(Int, sqrt(length(plots))) + # rows = div(length(plots), cols, RoundUp) + # end + # layout = (rows, cols) + + # Determine layout + cols = ceil(Int, sqrt(length(plots))) + rows = div(length(plots), cols, RoundUp) + layout = (rows, cols) + + # Show plot + return display(Plots.plot(plots..., layout = layout)) +end + +function Trixi.save_plot(plot_data, variable_names; + show_mesh = true, plot_arguments = Dict{Symbol, Any}(), + time = nothing, timestep = nothing) + # Gather subplots + plots = [] + for v in variable_names + push!(plots, Plots.plot(plot_data[v]; plot_arguments...)) + end + if show_mesh + push!(plots, Plots.plot(getmesh(plot_data); plot_arguments...)) + end + + # Determine layout + cols = ceil(Int, sqrt(length(plots))) + rows = div(length(plots), cols, RoundUp) + layout = (rows, cols) + + # Create plot + Plots.plot(plots..., layout = layout) + + # Determine filename and save plot + filename = joinpath("out", @sprintf("solution_%09d.png", timestep)) + return Plots.savefig(filename) +end +end # @muladd +end # module TrixiPlotsExt diff --git a/src/Trixi.jl b/src/Trixi.jl index 36df191b314..56e94deb148 100644 
--- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -25,11 +25,12 @@ const _PREFERENCE_LOOPVECTORIZATION = @load_preference("loop_vectorization", tru # (standard library packages first, other packages next, all of them sorted alphabetically) using Accessors: @reset -using LinearAlgebra: LinearAlgebra, Diagonal, diag, dot, eigvals, mul!, norm, cross, +using LinearAlgebra: LinearAlgebra, Adjoint, Diagonal, diag, dot, eigvals, mul!, norm, + cross, normalize, I, UniformScaling, det using Printf: @printf, @sprintf, println -using SparseArrays: AbstractSparseMatrix, AbstractSparseMatrixCSC, sparse, droptol!, +using SparseArrays: SparseMatrixCSC, AbstractSparseMatrix, sparse, droptol!, rowvals, nzrange, nonzeros # import @reexport now to make it available for further imports/exports @@ -59,7 +60,8 @@ using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect using FillArrays: Ones, Zeros using ForwardDiff: ForwardDiff using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace -using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend +using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend, + allocate using LinearMaps: LinearMap if _PREFERENCE_LOOPVECTORIZATION using LoopVectorization: LoopVectorization, @turbo, indices @@ -77,7 +79,6 @@ using P4est using T8code using RecipesBase: RecipesBase using RecursiveArrayTools: VectorOfArray -using Requires: @require using Static: Static, One, True, False @reexport using StaticArrays: SVector using StaticArrays: StaticArrays, MVector, MArray, SMatrix, @SMatrix @@ -147,6 +148,7 @@ include("semidiscretization/semidiscretization_hyperbolic.jl") include("semidiscretization/semidiscretization_hyperbolic_parabolic.jl") include("semidiscretization/semidiscretization_euler_acoustics.jl") include("semidiscretization/semidiscretization_coupled.jl") +include("semidiscretization/semidiscretization_coupled_p4est.jl") include("time_integration/time_integration.jl") 
include("callbacks_step/callbacks_step.jl") include("callbacks_stage/callbacks_stage.jl") @@ -235,7 +237,7 @@ export boundary_condition_do_nothing, BoundaryConditionNavierStokesWall, NoSlip, Slip, Adiabatic, Isothermal, - BoundaryConditionCoupled + BoundaryConditionCoupled, BoundaryConditionCoupledP4est export initial_condition_convergence_test, source_terms_convergence_test, source_terms_lorentz, source_terms_collision_ion_electron, @@ -307,14 +309,14 @@ export SemidiscretizationEulerGravity, ParametersEulerGravity, timestep_gravity_erk53_3Sstar!, timestep_gravity_carpenter_kennedy_erk54_2N! -export SemidiscretizationCoupled +export SemidiscretizationCoupled, SemidiscretizationCoupledP4est export SummaryCallback, SteadyStateCallback, AnalysisCallback, AliveCallback, SaveRestartCallback, SaveSolutionCallback, TimeSeriesCallback, VisualizationCallback, AveragingCallback, AMRCallback, StepsizeCallback, LimitingAnalysisCallback, GlmSpeedCallback, LBMCollisionCallback, EulerAcousticsCouplingCallback, - TrivialCallback, AnalysisCallbackCoupled, + TrivialCallback, AnalysisCallbackCoupled, AnalysisCallbackCoupledP4est, AnalysisSurfaceIntegral, DragCoefficientPressure2D, LiftCoefficientPressure2D, DragCoefficientShearStress2D, LiftCoefficientShearStress2D, DragCoefficientPressure3D, LiftCoefficientPressure3D @@ -338,7 +340,7 @@ export convergence_test, export DGMulti, DGMultiBasis, estimate_dt, DGMultiMesh, GaussSBP -export ViscousFormulationBassiRebay1, ViscousFormulationLocalDG +export ParabolicFormulationBassiRebay1, ParabolicFormulationLocalDG # Visualization-related exports export PlotData1D, PlotData2D, ScalarPlotData2D, getmesh, adapt_to_mesh_level!, @@ -352,11 +354,6 @@ function __init__() init_t8code() register_error_hints() - - # Enable features that depend on the availability of the Plots package - @require Plots="91a5bcdd-55d7-5caf-9e0b-520d859cae80" begin - using .Plots: Plots - end end include("auxiliary/precompile.jl") diff --git a/src/auxiliary/auxiliary.jl 
b/src/auxiliary/auxiliary.jl index e56cd399f7a..28e4c47d339 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -82,6 +82,26 @@ end return ncalls_first end +""" + @trixi_timeit_ext backend timer() "some label" expression + +This macro is an extension of [`@trixi_timeit`](@ref) that also synchronizes the given `backend` after executing the given `expression`. +This is useful to get accurate timing measurements for GPU backends, where the execution of kernels is asynchronous. +The synchronization ensures that all GPU operations are completed before the timer is stopped. + +See also [`@trixi_timeit`](@ref). +""" +macro trixi_timeit_ext(backend, timer_output, label, expr) + expr = quote + local val = $(esc(expr)) + if $(esc(backend)) !== nothing && $(TrixiBase).timeit_debug_enabled() + $(KernelAbstractions.synchronize)($(esc(backend))) + end + val + end + return :(@trixi_timeit($(esc(timer_output)), $(esc(label)), $(expr))) +end + """ examples_dir() diff --git a/src/auxiliary/precompile.jl b/src/auxiliary/precompile.jl index 7f5cec6e731..335e1fa158a 100644 --- a/src/auxiliary/precompile.jl +++ b/src/auxiliary/precompile.jl @@ -396,7 +396,8 @@ function _precompile_manual_() # 1D, serial @assert Base.precompile(Tuple{typeof(Trixi.init_boundaries), Array{Int, 1}, TreeMesh{1, Trixi.SerialTree{1}, RealT}, - Trixi.TreeElementContainer1D{RealT, uEltype}}) + Trixi.TreeElementContainer1D{RealT, uEltype}, + basis_type_dgsem(RealT, nnodes_)}) @assert Base.precompile(Tuple{typeof(Trixi.init_interfaces), Array{Int, 1}, TreeMesh{1, Trixi.SerialTree{1}, RealT}, Trixi.TreeElementContainer1D{RealT, uEltype}}) @@ -406,7 +407,8 @@ function _precompile_manual_() # 2D, serial @assert Base.precompile(Tuple{typeof(Trixi.init_boundaries), Array{Int, 1}, TreeMesh{2, Trixi.SerialTree{2}, RealT}, - Trixi.TreeElementContainer2D{RealT, uEltype}}) + Trixi.TreeElementContainer2D{RealT, uEltype}, + basis_type_dgsem(RealT, nnodes_)}) @assert 
Base.precompile(Tuple{typeof(Trixi.init_interfaces), Array{Int, 1}, TreeMesh{2, Trixi.SerialTree{2}, RealT}, Trixi.TreeElementContainer2D{RealT, uEltype}}) @@ -420,7 +422,8 @@ function _precompile_manual_() # 2D, parallel @assert Base.precompile(Tuple{typeof(Trixi.init_boundaries), Array{Int, 1}, TreeMesh{2, Trixi.ParallelTree{2}, RealT}, - Trixi.TreeElementContainer2D{RealT, uEltype}}) + Trixi.TreeElementContainer2D{RealT, uEltype}, + basis_type_dgsem(RealT, nnodes_)}) @assert Base.precompile(Tuple{typeof(Trixi.init_interfaces), Array{Int, 1}, TreeMesh{2, Trixi.ParallelTree{2}, RealT}, Trixi.TreeElementContainer2D{RealT, uEltype}}) @@ -437,7 +440,8 @@ function _precompile_manual_() # 3D, serial @assert Base.precompile(Tuple{typeof(Trixi.init_boundaries), Array{Int, 1}, TreeMesh{3, Trixi.SerialTree{3}, RealT}, - Trixi.TreeElementContainer3D{RealT, uEltype}}) + Trixi.TreeElementContainer3D{RealT, uEltype}, + basis_type_dgsem(RealT, nnodes_)}) @assert Base.precompile(Tuple{typeof(Trixi.init_interfaces), Array{Int, 1}, TreeMesh{3, Trixi.SerialTree{3}, RealT}, Trixi.TreeElementContainer3D{RealT, uEltype}}) diff --git a/src/auxiliary/special_elixirs.jl b/src/auxiliary/special_elixirs.jl index d656d5bd79c..cfbd023f4fe 100644 --- a/src/auxiliary/special_elixirs.jl +++ b/src/auxiliary/special_elixirs.jl @@ -55,7 +55,7 @@ function convergence_test(mod::Module, elixir::AbstractString, iterations, end """ - calc_mean_convergence(eocs) + Trixi.calc_mean_convergence(eocs) Calculate the mean convergence rates from the given experimental orders of convergence `eocs`. 
The `eocs` are expected to be in the format returned by [`convergence_test`](@ref), i.e., a `Dict` where diff --git a/src/callbacks_stage/subcell_bounds_check_2d.jl b/src/callbacks_stage/subcell_bounds_check_2d.jl index b90518b5631..6647aed8fdc 100644 --- a/src/callbacks_stage/subcell_bounds_check_2d.jl +++ b/src/callbacks_stage/subcell_bounds_check_2d.jl @@ -65,6 +65,8 @@ end if positivity for v in limiter.positivity_variables_cons + # Note: If a variable appears here and in the local min/max limiting, the positivity + # lower bound is taken into account there. Skip these variables here. if v in limiter.local_twosided_variables_cons continue end diff --git a/src/callbacks_stage/subcell_bounds_check_3d.jl b/src/callbacks_stage/subcell_bounds_check_3d.jl index d2abb99cee8..52693102cde 100644 --- a/src/callbacks_stage/subcell_bounds_check_3d.jl +++ b/src/callbacks_stage/subcell_bounds_check_3d.jl @@ -19,6 +19,33 @@ # `@batch` here to allow a possible redefinition of `@threaded` without creating errors here. # See also https://github.com/trixi-framework/Trixi.jl/pull/1888#discussion_r1537785293. + if local_twosided + for v in limiter.local_twosided_variables_cons + v_string = string(v) + key_min = Symbol(v_string, "_min") + key_max = Symbol(v_string, "_max") + deviation_min = idp_bounds_delta_local[key_min] + deviation_max = idp_bounds_delta_local[key_max] + @batch reduction=((max, deviation_min), (max, deviation_max)) for element in eachelement(solver, + cache) + for k in eachnode(solver), j in eachnode(solver), i in eachnode(solver) + var = u[v, i, j, k, element] + # Note: We always save the absolute deviations >= 0 and therefore use the + # `max` operator for the lower and upper bound. The different directions of + # upper and lower bound are considered in their calculations with a + # different sign. 
+ deviation_min = max(deviation_min, + variable_bounds[key_min][i, j, k, element] - + var) + deviation_max = max(deviation_max, + var - + variable_bounds[key_max][i, j, k, element]) + end + end + idp_bounds_delta_local[key_min] = deviation_min + idp_bounds_delta_local[key_max] = deviation_max + end + end if local_onesided for (variable, min_or_max) in limiter.local_onesided_variables_nonlinear key = Symbol(string(variable), "_", string(min_or_max)) @@ -41,6 +68,11 @@ end if positivity for v in limiter.positivity_variables_cons + # Note: If a variable appears here and in the local min/max limiting, the positivity + # lower bound is taken into account there. Skip these variables here. + if v in limiter.local_twosided_variables_cons + continue + end key = Symbol(string(v), "_min") deviation = idp_bounds_delta_local[key] @batch reduction=(max, deviation) for element in eachelement(solver, cache) diff --git a/src/callbacks_stage/subcell_limiter_idp_correction_3d.jl b/src/callbacks_stage/subcell_limiter_idp_correction_3d.jl index f96a491487c..8390c47821a 100644 --- a/src/callbacks_stage/subcell_limiter_idp_correction_3d.jl +++ b/src/callbacks_stage/subcell_limiter_idp_correction_3d.jl @@ -6,7 +6,7 @@ #! 
format: noindent function perform_idp_correction!(u, dt, - mesh::P4estMesh{3}, + mesh::Union{TreeMesh{3}, P4estMesh{3}}, equations, dg, cache) @unpack inverse_weights = dg.basis # Plays role of inverse DG-subcell sizes @unpack antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R = cache.antidiffusive_fluxes diff --git a/src/callbacks_step/amr_dg1d.jl b/src/callbacks_step/amr_dg1d.jl index 8b36edfee91..346b8d6da69 100644 --- a/src/callbacks_step/amr_dg1d.jl +++ b/src/callbacks_step/amr_dg1d.jl @@ -70,8 +70,8 @@ function refine!(u_ode::AbstractVector, adaptor, mesh::TreeMesh{1}, refine!(u_ode, adaptor, mesh, equations, dg, cache, elements_to_refine) # Resize parabolic helper variables - @unpack viscous_container = cache_parabolic - resize!(viscous_container, equations, dg, cache) + @unpack parabolic_container = cache_parabolic + resize!(parabolic_container, equations, dg, cache) return nothing end @@ -195,8 +195,8 @@ function coarsen!(u_ode::AbstractVector, adaptor, mesh::TreeMesh{1}, coarsen!(u_ode, adaptor, mesh, equations, dg, cache, elements_to_remove) # Resize parabolic helper variables - @unpack viscous_container = cache_parabolic - resize!(viscous_container, equations, dg, cache) + @unpack parabolic_container = cache_parabolic + resize!(parabolic_container, equations, dg, cache) return nothing end diff --git a/src/callbacks_step/amr_dg2d.jl b/src/callbacks_step/amr_dg2d.jl index 2141c2713bc..fd4cceb9efc 100644 --- a/src/callbacks_step/amr_dg2d.jl +++ b/src/callbacks_step/amr_dg2d.jl @@ -190,8 +190,8 @@ function refine!(u_ode::AbstractVector, adaptor, refine!(u_ode, adaptor, mesh, equations, dg, cache, elements_to_refine) # Resize parabolic helper variables - @unpack viscous_container = cache_parabolic - resize!(viscous_container, equations, dg, cache) + @unpack parabolic_container = cache_parabolic + resize!(parabolic_container, equations, dg, cache) return nothing end @@ 
-385,8 +385,8 @@ function coarsen!(u_ode::AbstractVector, adaptor, coarsen!(u_ode, adaptor, mesh, equations, dg, cache, elements_to_remove) # Resize parabolic helper variables - @unpack viscous_container = cache_parabolic - resize!(viscous_container, equations, dg, cache) + @unpack parabolic_container = cache_parabolic + resize!(parabolic_container, equations, dg, cache) return nothing end diff --git a/src/callbacks_step/analysis.jl b/src/callbacks_step/analysis.jl index f405508f75c..ab4736582ec 100644 --- a/src/callbacks_step/analysis.jl +++ b/src/callbacks_step/analysis.jl @@ -702,12 +702,11 @@ end # Special analyze for `SemidiscretizationHyperbolicParabolic` such that # precomputed gradients are available. Required for `enstrophy` (see above) and viscous forces. # Note that this needs to be included after `analysis_surface_integral_2d.jl` to -# have `VariableViscous` available. +# have `VariableParabolic` available. function analyze(quantity::AnalysisSurfaceIntegral{Variable}, du, u, t, - semi::SemidiscretizationHyperbolicParabolic) where { - Variable <: - VariableViscous} + semi::SemidiscretizationHyperbolicParabolic) where {Variable <: + VariableParabolic} mesh, equations, solver, cache = mesh_equations_solver_cache(semi) equations_parabolic = semi.equations_parabolic cache_parabolic = semi.cache_parabolic diff --git a/src/callbacks_step/analysis_dg1d.jl b/src/callbacks_step/analysis_dg1d.jl index e53df1dd4c3..b098b89d7e5 100644 --- a/src/callbacks_step/analysis_dg1d.jl +++ b/src/callbacks_step/analysis_dg1d.jl @@ -124,7 +124,8 @@ end # This avoids the need to divide the RHS of the DG scheme by the Jacobian when computing # the time derivative of entropy, see `entropy_change_reference_element`. function integrate_reference_element(func::Func, u, element, - mesh::AbstractMesh{1}, equations, dg::DGSEM, cache, + ::Type{<:AbstractMesh{1}}, equations, dg::DGSEM, + cache, args...) 
where {Func} @unpack weights = dg.basis @@ -142,9 +143,9 @@ end # Calculate ∫_e (∂S/∂u ⋅ ∂u/∂t) dΩ_e where the result on element 'e' is kept in reference space # Note that ∂S/∂u = w(u) with entropy variables w function entropy_change_reference_element(du, u, element, - mesh::AbstractMesh{1}, + MeshT::Type{<:AbstractMesh{1}}, equations, dg::DGSEM, cache, args...) - return integrate_reference_element(u, element, mesh, equations, dg, cache, + return integrate_reference_element(u, element, MeshT, equations, dg, cache, du) do u, i, element, equations, dg, du u_node = get_node_vars(u, equations, dg, i, element) du_node = get_node_vars(du, equations, dg, i, element) @@ -155,7 +156,8 @@ end # calculate surface integral of func(u, equations) * normal on the reference element. function surface_integral_reference_element(func::Func, u, element, - mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + ::Type{<:Union{TreeMesh{1}, + StructuredMesh{1}}}, equations, dg::DGSEM, cache, args...) where {Func} u_left = get_node_vars(u, equations, dg, 1, element) diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl index 5035e831eed..368b7f5e4cc 100644 --- a/src/callbacks_step/analysis_dg2d.jl +++ b/src/callbacks_step/analysis_dg2d.jl @@ -138,7 +138,7 @@ function calc_error_norms(func, u, t, analyzer, return l2_error, linf_error end -function calc_error_norms(func, u, t, analyzer, +function calc_error_norms(func, _u, t, analyzer, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, @@ -146,9 +146,19 @@ function calc_error_norms(func, u, t, analyzer, equations, initial_condition, dg::DGSEM, cache, cache_analysis) @unpack vandermonde, weights = analyzer - @unpack node_coordinates, inverse_jacobian = cache.elements @unpack u_local, u_tmp1, x_local, x_tmp1, jacobian_local, jacobian_tmp1 = cache_analysis + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(_u) + if backend isa Nothing # 
TODO GPU KA CPU backend + @unpack node_coordinates, inverse_jacobian = cache.elements + u = _u + else + node_coordinates = Array(cache.elements.node_coordinates) + inverse_jacobian = Array(cache.elements.inverse_jacobian) + u = Array(_u) + end + # Set up data structures l2_error = zero(func(get_node_vars(u, equations, dg, 1, 1, 1), equations)) linf_error = copy(l2_error) @@ -190,7 +200,8 @@ end # This avoids the need to divide the RHS of the DG scheme by the Jacobian when computing # the time derivative of entropy, see `entropy_change_reference_element`. function integrate_reference_element(func::Func, u, element, - mesh::AbstractMesh{2}, equations, dg::DGSEM, cache, + ::Type{<:AbstractMesh{2}}, equations, dg::DGSEM, + cache, args...) where {Func} @unpack weights = dg.basis @@ -208,9 +219,9 @@ end # Calculate ∫_e (∂S/∂u ⋅ ∂u/∂t) dΩ_e where the result on element 'e' is kept in reference space # Note that ∂S/∂u = w(u) with entropy variables w function entropy_change_reference_element(du, u, element, - mesh::AbstractMesh{2}, + MeshT::Type{<:AbstractMesh{2}}, equations, dg::DGSEM, cache, args...) - return integrate_reference_element(u, element, mesh, equations, dg, cache, + return integrate_reference_element(u, element, MeshT, equations, dg, cache, du) do u, i, j, element, equations, dg, du u_node = get_node_vars(u, equations, dg, i, j, element) du_node = get_node_vars(du, equations, dg, i, j, element) @@ -221,7 +232,7 @@ end # calculate surface integral of func(u, equations) * normal on the reference element. function surface_integral_reference_element(func::Func, u, element, - mesh::TreeMesh{2}, equations, dg::DGSEM, + ::Type{<:TreeMesh{2}}, equations, dg::DGSEM, cache, args...) where {Func} @unpack weights = dg.basis @@ -250,11 +261,11 @@ end # Note: `get_normal_direction` already returns an outward-pointing normal for all directions, # thus no +- flips are needed here. 
function surface_integral_reference_element(func::Func, u, element, - mesh::Union{StructuredMesh{2}, - StructuredMeshView{2}, - UnstructuredMesh2D, - P4estMesh{2}, - T8codeMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, + T8codeMesh{2}}}, equations, dg::DGSEM, cache, args...) where {Func} @unpack contravariant_vectors = cache.elements @@ -327,13 +338,24 @@ function integrate_via_indices(func::Func, u, return integral end -function integrate_via_indices(func::Func, u, +function integrate_via_indices(func::Func, _u, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, - UnstructuredMesh2D, P4estMesh{2}, + UnstructuredMesh2D, + P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, equations, dg::DGSEM, cache, args...; normalize = true) where {Func} - @unpack weights = dg.basis + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(_u) + if backend isa Nothing # TODO GPU KA CPU backend + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements + u = _u + else + weights = Array(dg.basis.weights) + inverse_jacobian = Array(cache.elements.inverse_jacobian) + u = Array(_u) + end # Initialize integral with zeros of the right shape integral = zero(func(u, 1, 1, 1, equations, dg, args...)) @@ -343,7 +365,7 @@ function integrate_via_indices(func::Func, u, @batch reduction=((+, integral), (+, total_volume)) for element in eachelement(dg, cache) for j in eachnode(dg), i in eachnode(dg) - volume_jacobian = abs(inv(cache.elements.inverse_jacobian[i, j, element])) + volume_jacobian = abs(inv(inverse_jacobian[i, j, element])) integral += volume_jacobian * weights[i] * weights[j] * func(u, i, j, element, equations, dg, args...)
total_volume += volume_jacobian * weights[i] * weights[j] @@ -375,7 +397,7 @@ function integrate(func::Func, u, mesh::Union{TreeMesh{2}, P4estMesh{2}}, equations, equations_parabolic, dg::DGSEM, cache, cache_parabolic; normalize = true) where {Func} - gradients_x, gradients_y = cache_parabolic.viscous_container.gradients + gradients_x, gradients_y = cache_parabolic.parabolic_container.gradients integrate_via_indices(u, mesh, equations, dg, cache; normalize = normalize) do u, i, j, element, equations, dg u_local = get_node_vars(u, equations, dg, i, j, element) @@ -388,10 +410,18 @@ function integrate(func::Func, u, end end -function analyze(::typeof(entropy_timederivative), du, u, t, +function analyze(::typeof(entropy_timederivative), _du, u, t, mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, T8codeMesh{2}}, equations, dg::Union{DGSEM, FDSBP}, cache) + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(u) + if backend isa Nothing # TODO GPU KA CPU backend + du = _du + else + du = Array(_du) + end + # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ integrate_via_indices(u, mesh, equations, dg, cache, du) do u, i, j, element, equations, dg, du diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl index 634db48de29..7708a32e6ba 100644
GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(_u) + if backend isa Nothing # TODO GPU KA CPU backend + @unpack node_coordinates, inverse_jacobian = cache.elements + u = _u + else + node_coordinates = Array(cache.elements.node_coordinates) + inverse_jacobian = Array(cache.elements.inverse_jacobian) + u = Array(_u) + end + # Set up data structures l2_error = zero(func(get_node_vars(u, equations, dg, 1, 1, 1, 1), equations)) linf_error = copy(l2_error) @@ -214,7 +224,8 @@ end # This avoids the need to divide the RHS of the DG scheme by the Jacobian when computing # the time derivative of entropy, see `entropy_change_reference_element`. function integrate_reference_element(func::Func, u, element, - mesh::AbstractMesh{3}, equations, dg::DGSEM, cache, + ::Type{<:AbstractMesh{3}}, equations, dg::DGSEM, + cache, args...; normalize = true) where {Func} @unpack weights = dg.basis @@ -232,9 +243,9 @@ end # Calculate ∫_e (∂S/∂u ⋅ ∂u/∂t) dΩ_e where the result on element 'e' is kept in reference space # Note that ∂S/∂u = w(u) with entropy variables w function entropy_change_reference_element(du, u, element, - mesh::AbstractMesh{3}, + MeshT::Type{<:AbstractMesh{3}}, equations, dg::DGSEM, cache, args...) - return integrate_reference_element(u, element, mesh, equations, dg, cache, + return integrate_reference_element(u, element, MeshT, equations, dg, cache, du) do u, i, j, k, element, equations, dg, du u_node = get_node_vars(u, equations, dg, i, j, k, element) du_node = get_node_vars(du, equations, dg, i, j, k, element) @@ -245,7 +256,7 @@ end # calculate surface integral of func(u, equations) * normal on the reference element. function surface_integral_reference_element(func::Func, u, element, - mesh::TreeMesh{3}, equations, dg::DGSEM, + ::Type{<:TreeMesh{3}}, equations, dg::DGSEM, cache, args...) 
where {Func} @unpack weights = dg.basis @@ -281,8 +292,9 @@ end # Note: `get_normal_direction` already returns an outward-pointing normal for all directions, # thus no +- flips are needed here. function surface_integral_reference_element(func::Func, u, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, equations, dg::DGSEM, cache, args...) where {Func} @unpack contravariant_vectors = cache.elements @@ -377,12 +389,22 @@ function integrate_via_indices(func::Func, u, return integral end -function integrate_via_indices(func::Func, u, +function integrate_via_indices(func::Func, _u, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, dg::DGSEM, cache, args...; normalize = true) where {Func} - @unpack weights = dg.basis + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(_u) + if backend isa Nothing # TODO GPU KA CPU backend + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements + u = _u + else + weights = Array(dg.basis.weights) + inverse_jacobian = Array(cache.elements.inverse_jacobian) + u = Array(_u) + end # Initialize integral with zeros of the right shape integral = zero(func(u, 1, 1, 1, 1, equations, dg, args...)) @@ -392,7 +414,7 @@ function integrate_via_indices(func::Func, u, @batch reduction=((+, integral), (+, total_volume)) for element in eachelement(dg, cache) for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - volume_jacobian = abs(inv(cache.elements.inverse_jacobian[i, j, k, element])) + volume_jacobian = abs(inv(inverse_jacobian[i, j, k, element])) integral += volume_jacobian * weights[i] * weights[j] * weights[k] * func(u, i, j, k, element, equations, dg, args...)
total_volume += volume_jacobian * weights[i] * weights[j] * weights[k] @@ -423,7 +445,7 @@ function integrate(func::Func, u, mesh::Union{TreeMesh{3}, P4estMesh{3}}, equations, equations_parabolic, dg::DGSEM, cache, cache_parabolic; normalize = true) where {Func} - gradients_x, gradients_y, gradients_z = cache_parabolic.viscous_container.gradients + gradients_x, gradients_y, gradients_z = cache_parabolic.parabolic_container.gradients integrate_via_indices(u, mesh, equations, dg, cache; normalize = normalize) do u, i, j, k, element, equations, dg u_local = get_node_vars(u, equations, dg, i, j, k, element) @@ -438,10 +460,18 @@ function integrate(func::Func, u, end end -function analyze(::typeof(entropy_timederivative), du, u, t, +function analyze(::typeof(entropy_timederivative), _du, u, t, mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, dg::Union{DGSEM, FDSBP}, cache) + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(u) + if backend isa Nothing # TODO GPU KA CPU backend + du = _du + else + du = Array(_du) + end + # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ integrate_via_indices(u, mesh, equations, dg, cache, du) do u, i, j, k, element, equations, dg, du diff --git a/src/callbacks_step/analysis_dgmulti.jl b/src/callbacks_step/analysis_dgmulti.jl index e720d283ffa..805c3c5d5bc 100644 --- a/src/callbacks_step/analysis_dgmulti.jl +++ b/src/callbacks_step/analysis_dgmulti.jl @@ -10,7 +10,7 @@ function calc_error_norms(func, u, t, analyzer, dg::DGMulti{NDIMS}, cache, cache_analysis) where {NDIMS} rd = dg.basis md = mesh.md - @unpack u_values = cache + (; u_values) = cache.solution_container # interpolate u to quadrature points apply_to_each_field(mul_by!(rd.Vq), u_values, u) @@ -31,7 +31,7 @@ function integrate(func::Func, u, mesh::DGMultiMesh, equations, dg::DGMulti, cache; normalize = true) where {Func} rd = dg.basis md = mesh.md - @unpack u_values = cache + (; u_values) = cache.solution_container # interpolate u to
quadrature points apply_to_each_field(mul_by!(rd.Vq), u_values, u) @@ -47,7 +47,7 @@ function analyze(::typeof(entropy_timederivative), du, u, t, mesh::DGMultiMesh, equations, dg::DGMulti, cache) rd = dg.basis md = mesh.md - @unpack u_values = cache + (; u_values) = cache.solution_container # interpolate u, du to quadrature points du_values = similar(u_values) # Todo: DGMulti. Can we move this to the analysis cache somehow? @@ -134,10 +134,62 @@ function analyze(::Val{:linf_divb}, du, u, t, return linf_divB end +# Calculate ∫_e (∂S/∂u ⋅ ∂u/∂t) dΩ_e where the result on element 'e' is kept in reference space +# Note that ∂S/∂u = w(u) with entropy variables w. +# This assumes that both du and u are already interpolated to the quadrature points +function entropy_change_reference_element(du_values_local, u_values_local, + mesh::DGMultiMesh, equations, + dg::DGMulti, cache) + rd = dg.basis + @unpack Nq, wq = rd + + # Compute entropy change for this element + dS_dt_elem = zero(eltype(first(du_values_local))) + for i in Base.OneTo(Nq) # Loop over quadrature points in the element + dS_dt_elem += dot(cons2entropy(u_values_local[i], equations), + du_values_local[i]) * wq[i] + end + + return dS_dt_elem +end + +# calculate surface integral of func(u, normal_direction, equations) on the reference element. +# For DGMulti, we loop over all faces of the element and integrate using face quadrature weights. +# Restricted to `Polynomial` approximation type which requires interpolation to face quadrature nodes +function surface_integral_reference_element(func::Func, u, element, + mesh::DGMultiMesh, equations, + dg::DGMulti, + cache, args...) 
where {Func} + rd = dg.basis + @unpack Nfq, wf, Vf = rd + md = mesh.md + @unpack nxyzJ = md + + # Interpolate volume solution to face quadrature nodes for this element + @unpack u_face_local_threaded = cache + u_face_local = u_face_local_threaded[Threads.threadid()] + u_elem = view(u, :, element) + apply_to_each_field(mul_by!(Vf), u_face_local, u_elem) + + surface_integral = zero(eltype(first(u))) + # Loop over all face nodes for this element + for i in 1:Nfq + # Get solution at this face node + u_node = u_face_local[i] + + # Get face normal; nxyzJ stores components as (nxJ, nyJ, nzJ) + normal_direction = SVector(getindex.(nxyzJ, i, element)) + + # Multiply with face quadrature weight and accumulate + surface_integral += wf[i] * func(u_node, normal_direction, equations) + end + + return surface_integral +end + function create_cache_analysis(analyzer, mesh::DGMultiMesh, equations, dg::DGMulti, cache, RealT, uEltype) - md = mesh.md return (;) end diff --git a/src/callbacks_step/analysis_surface_integral.jl b/src/callbacks_step/analysis_surface_integral.jl index 1366c8c73a9..33d4262429a 100644 --- a/src/callbacks_step/analysis_surface_integral.jl +++ b/src/callbacks_step/analysis_surface_integral.jl @@ -54,7 +54,7 @@ end # Abstract base type used for dispatch of `analyze` for quantities # requiring gradients of the velocity field.
-abstract type VariableViscous end +abstract type VariableParabolic end struct LiftCoefficientPressure{RealT <: Real, NDIMS} force_state::ForceState{RealT, NDIMS} @@ -64,11 +64,11 @@ struct DragCoefficientPressure{RealT <: Real, NDIMS} force_state::ForceState{RealT, NDIMS} end -struct LiftCoefficientShearStress{RealT <: Real, NDIMS} <: VariableViscous +struct LiftCoefficientShearStress{RealT <: Real, NDIMS} <: VariableParabolic force_state::ForceState{RealT, NDIMS} end -struct DragCoefficientShearStress{RealT <: Real, NDIMS} <: VariableViscous +struct DragCoefficientShearStress{RealT <: Real, NDIMS} <: VariableParabolic force_state::ForceState{RealT, NDIMS} end diff --git a/src/callbacks_step/analysis_surface_integral_2d.jl b/src/callbacks_step/analysis_surface_integral_2d.jl index fef4b9872d1..9c1899ef750 100644 --- a/src/callbacks_step/analysis_surface_integral_2d.jl +++ b/src/callbacks_step/analysis_surface_integral_2d.jl @@ -261,7 +261,7 @@ function analyze(surface_variable::AnalysisSurfaceIntegral{Variable}, du, u, t, mesh::P4estMesh{2}, equations, equations_parabolic, dg::DGSEM, cache, semi, - cache_parabolic) where {Variable <: VariableViscous} + cache_parabolic) where {Variable <: VariableParabolic} @unpack boundaries = cache @unpack node_coordinates, contravariant_vectors = cache.elements @unpack weights = dg.basis @@ -271,8 +271,8 @@ function analyze(surface_variable::AnalysisSurfaceIntegral{Variable}, du, u, t, boundary_indices = get_boundary_indices(boundary_symbols, boundary_symbol_indices) # Additions for parabolic - @unpack viscous_container = cache_parabolic - @unpack gradients = viscous_container + @unpack parabolic_container = cache_parabolic + @unpack gradients = parabolic_container gradients_x, gradients_y = gradients diff --git a/src/callbacks_step/save_solution.jl b/src/callbacks_step/save_solution.jl index 6e002d2eb23..68110a3b257 100644 --- a/src/callbacks_step/save_solution.jl +++ b/src/callbacks_step/save_solution.jl @@ -287,6 +287,11 @@ 
end element_variables = Dict{Symbol, Any}(), node_variables = Dict{Symbol, Any}(); system = "") + # TODO GPU currently on CPU + backend = trixi_backend(u_ode) + if backend !== nothing + u_ode = Array(u_ode) + end mesh, equations, solver, cache = mesh_equations_solver_cache(semi) u = wrap_array_native(u_ode, mesh, equations, solver, cache) save_solution_file(u, t, dt, iter, mesh, equations, solver, cache, diff --git a/src/callbacks_step/stepsize.jl b/src/callbacks_step/stepsize.jl index d22599c9f09..7dde973652f 100644 --- a/src/callbacks_step/stepsize.jl +++ b/src/callbacks_step/stepsize.jl @@ -6,30 +6,30 @@ #! format: noindent """ - StepsizeCallback(; cfl=1.0, cfl_diffusive = 0.0, + StepsizeCallback(; cfl=1.0, cfl_parabolic = 0.0, interval = 1) -Set the time step size according to a CFL condition with CFL number `cfl` +Set the time step size according to a CFL condition with hyperbolic CFL number `cfl` if the time integration method isn't adaptive itself. -The keyword argument `cfl` must be either a `Real` number, corresponding to a constant +The hyperbolic CFL number `cfl` must be either a `Real` number, corresponding to a constant CFL number, or a function of time `t` returning a `Real` number. The latter approach allows for variable CFL numbers that can be used to realize, e.g., a ramp-up of the time step. -One can additionally supply a diffusive CFL number `cfl_diffusive` to -limit the admissible timestep also respecting diffusive restrictions. +One can additionally supply a parabolic CFL number `cfl_parabolic` to +limit the admissible timestep also respecting parabolic restrictions. This is only applicable for semidiscretizations of type [`SemidiscretizationHyperbolicParabolic`](@ref). -To enable checking for diffusive timestep restrictions, provide a value greater than zero for `cfl_diffusive`. -By default, `cfl_diffusive` is set to zero which means that only the advective/convective CFL number is considered. 
-The keyword argument `cfl_diffusive` must be either a `Real` number, corresponding to a constant -diffusive CFL number, or a function of time `t` returning a `Real` number. +To enable checking for parabolic timestep restrictions, provide a value greater than zero for `cfl_parabolic`. +By default, `cfl_parabolic` is set to zero which means that only the hyperbolic CFL number `cfl` is considered. +The keyword argument `cfl_parabolic` must be either a `Real` number, corresponding to a constant +parabolic CFL number, or a function of time `t` returning a `Real` number. By default, the timestep will be adjusted at every step. For different values of `interval`, the timestep will be adjusted every `interval` steps. """ -struct StepsizeCallback{CflAdvectiveType, CflDiffusiveType} - cfl_advective::CflAdvectiveType - cfl_diffusive::CflDiffusiveType +struct StepsizeCallback{CflHyperbolicType, CflParabolicType} + cfl_hyperbolic::CflHyperbolicType + cfl_parabolic::CflParabolicType interval::Int end @@ -37,10 +37,10 @@ function Base.show(io::IO, cb::DiscreteCallback{<:Any, <:StepsizeCallback}) @nospecialize cb # reduce precompilation time stepsize_callback = cb.affect! - @unpack cfl_advective, cfl_diffusive, interval = stepsize_callback + @unpack cfl_hyperbolic, cfl_parabolic, interval = stepsize_callback print(io, "StepsizeCallback(", - "cfl_advective=", cfl_advective, ", ", - "cfl_diffusive=", cfl_diffusive, ", ", + "cfl_hyperbolic=", cfl_hyperbolic, ", ", + "cfl_parabolic=", cfl_parabolic, ", ", "interval=", interval, ")") return nothing end @@ -55,22 +55,22 @@ function Base.show(io::IO, ::MIME"text/plain", stepsize_callback = cb.affect! 
setup = [ - "CFL Advective" => stepsize_callback.cfl_advective, - "CFL Diffusive" => stepsize_callback.cfl_diffusive, + "CFL Hyperbolic" => stepsize_callback.cfl_hyperbolic, + "CFL Parabolic" => stepsize_callback.cfl_parabolic, "Interval" => stepsize_callback.interval ] summary_box(io, "StepsizeCallback", setup) end end -function StepsizeCallback(; cfl = 1.0, cfl_diffusive = 0.0, +function StepsizeCallback(; cfl = 1.0, cfl_parabolic = 0.0, interval = 1) # Convert plain real numbers to functions for unified treatment - cfl_conv = isa(cfl, Real) ? Returns(cfl) : cfl - cfl_diff = isa(cfl_diffusive, Real) ? Returns(cfl_diffusive) : cfl_diffusive - stepsize_callback = StepsizeCallback{typeof(cfl_conv), typeof(cfl_diff)}(cfl_conv, - cfl_diff, - interval) + cfl_hyp = isa(cfl, Real) ? Returns(cfl) : cfl + cfl_para = isa(cfl_parabolic, Real) ? Returns(cfl_parabolic) : cfl_parabolic + stepsize_callback = StepsizeCallback{typeof(cfl_hyp), typeof(cfl_para)}(cfl_hyp, + cfl_para, + interval) return DiscreteCallback(stepsize_callback, stepsize_callback, # the first one is the condition, the second the affect! 
save_positions = (false, false), @@ -78,9 +78,9 @@ function StepsizeCallback(; cfl = 1.0, cfl_diffusive = 0.0, end # Compatibility constructor used in `EulerAcousticsCouplingCallback` -function StepsizeCallback(cfl_advective) - RealT = typeof(cfl_advective) - return StepsizeCallback{RealT, RealT}(cfl_advective, zero(RealT), 1) +function StepsizeCallback(cfl_hyperbolic) + RealT = typeof(cfl_hyperbolic) + return StepsizeCallback{RealT, RealT}(cfl_hyperbolic, zero(RealT), 1) end function initialize!(cb::DiscreteCallback{Condition, Affect!}, u, t, @@ -106,11 +106,11 @@ end t = integrator.t u_ode = integrator.u semi = integrator.p - @unpack cfl_advective, cfl_diffusive = stepsize_callback + @unpack cfl_hyperbolic, cfl_parabolic = stepsize_callback # Dispatch based on semidiscretization - dt = @trixi_timeit timer() "calculate dt" calculate_dt(u_ode, t, cfl_advective, - cfl_diffusive, semi) + dt = @trixi_timeit timer() "calculate dt" calculate_dt(u_ode, t, cfl_hyperbolic, + cfl_parabolic, semi) set_proposed_dt!(integrator, dt) integrator.opts.dtmax = dt @@ -130,32 +130,32 @@ function (cb::DiscreteCallback{Condition, Affect!})(ode::ODEProblem) where {Cond StepsizeCallback } stepsize_callback = cb.affect! 
- @unpack cfl_advective, cfl_diffusive = stepsize_callback + @unpack cfl_hyperbolic, cfl_parabolic = stepsize_callback u_ode = ode.u0 t = first(ode.tspan) semi = ode.p - return calculate_dt(u_ode, t, cfl_advective, cfl_diffusive, semi) + return calculate_dt(u_ode, t, cfl_hyperbolic, cfl_parabolic, semi) end # General case for an abstract single (i.e., non-coupled) semidiscretization -function calculate_dt(u_ode, t, cfl_advective, cfl_diffusive, +function calculate_dt(u_ode, t, cfl_hyperbolic, cfl_parabolic, semi::AbstractSemidiscretization) mesh, equations, solver, cache = mesh_equations_solver_cache(semi) u = wrap_array(u_ode, mesh, equations, solver, cache) - return cfl_advective(t) * max_dt(u, t, mesh, + return cfl_hyperbolic(t) * max_dt(u, t, mesh, have_constant_speed(equations), semi, equations, solver, cache, solver.volume_integral) end # For Euler-Acoustic simulations with `EulerAcousticsCouplingCallback` -function calculate_dt(u_ode, t, cfl_advective::Real, cfl_diffusive::Real, +function calculate_dt(u_ode, t, cfl_hyperbolic::Real, cfl_parabolic::Real, semi::AbstractSemidiscretization) mesh, equations, solver, cache = mesh_equations_solver_cache(semi) u = wrap_array(u_ode, mesh, equations, solver, cache) - return cfl_advective * max_dt(u, t, mesh, + return cfl_hyperbolic * max_dt(u, t, mesh, have_constant_speed(equations), semi, equations, solver, cache, solver.volume_integral) end @@ -178,29 +178,74 @@ end end # Case for a hyperbolic-parabolic semidiscretization -function calculate_dt(u_ode, t, cfl_advective, cfl_diffusive, +function calculate_dt(u_ode, t, cfl_hyperbolic, cfl_parabolic, semi::SemidiscretizationHyperbolicParabolic) mesh, equations, solver, cache = mesh_equations_solver_cache(semi) equations_parabolic = semi.equations_parabolic u = wrap_array(u_ode, mesh, equations, solver, cache) - dt_advective = cfl_advective(t) * max_dt(u, t, mesh, - have_constant_speed(equations), equations, - solver, cache) + dt_hyperbolic = cfl_hyperbolic(t) * 
max_dt(u, t, mesh, + have_constant_speed(equations), equations, + solver, cache) - cfl_diff = cfl_diffusive(t) - if cfl_diff > 0 # Check if diffusive CFL should be considered - dt_diffusive = cfl_diff * max_dt(u, t, mesh, + cfl_para = cfl_parabolic(t) + if cfl_para > 0 # Check if parabolic CFL should be considered + dt_parabolic = cfl_para * max_dt(u, t, mesh, have_constant_diffusivity(equations_parabolic), equations, equations_parabolic, solver, cache) - return min(dt_advective, dt_diffusive) + return min(dt_hyperbolic, dt_parabolic) else - return dt_advective + return dt_hyperbolic end end +function calc_max_scaled_speed(backend::Nothing, u, mesh, constant_speed, equations, dg, + cache) + @unpack contravariant_vectors, inverse_jacobian = cache.elements + + max_scaled_speed = zero(eltype(u)) + @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache) + max_lambda = max_scaled_speed_per_element(u, typeof(mesh), constant_speed, + equations, dg, + contravariant_vectors, + inverse_jacobian, + element) + # Use `Base.max` to prevent silent failures, as `max` from `@fastmath` doesn't propagate + # `NaN`s properly. See https://github.com/trixi-framework/Trixi.jl/pull/2445#discussion_r2336812323 + max_scaled_speed = Base.max(max_scaled_speed, max_lambda) + end + return max_scaled_speed +end + +function calc_max_scaled_speed(backend::Backend, u, mesh, constant_speed, equations, dg, + cache) + @unpack contravariant_vectors, inverse_jacobian = cache.elements + + num_elements = nelements(dg, cache) + max_scaled_speeds = allocate(backend, eltype(u), num_elements) + + kernel! 
= max_scaled_speed_KAkernel!(backend) + kernel!(max_scaled_speeds, u, typeof(mesh), constant_speed, equations, dg, + contravariant_vectors, + inverse_jacobian; + ndrange = num_elements) + + return maximum(max_scaled_speeds) +end + +@kernel function max_scaled_speed_KAkernel!(max_scaled_speeds, u, MeshT, constant_speed, + equations, + dg, contravariant_vectors, inverse_jacobian) + element = @index(Global) + max_scaled_speeds[element] = max_scaled_speed_per_element(u, MeshT, constant_speed, + equations, dg, + contravariant_vectors, + inverse_jacobian, + element) +end + include("stepsize_dg1d.jl") include("stepsize_dg2d.jl") include("stepsize_dg3d.jl") diff --git a/src/callbacks_step/stepsize_dg2d.jl b/src/callbacks_step/stepsize_dg2d.jl index aa17bd4527c..7c8980d3b97 100644 --- a/src/callbacks_step/stepsize_dg2d.jl +++ b/src/callbacks_step/stepsize_dg2d.jl @@ -214,40 +214,48 @@ end function max_dt(u, t, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}, - T8codeMesh{2}, StructuredMeshView{2}}, - constant_speed::False, equations, dg::DG, cache) - # Avoid division by zero if the speed vanishes everywhere, - # e.g. 
for steady-state linear advection - max_scaled_speed = nextfloat(zero(t)) + P4estMeshView{2}, T8codeMesh{2}, StructuredMeshView{2}}, + constant_speed, equations, dg::DG, cache) + backend = trixi_backend(u) - @unpack contravariant_vectors, inverse_jacobian = cache.elements + max_lambda = calc_max_scaled_speed(backend, u, mesh, constant_speed, equations, dg, + cache) - @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache) - max_lambda1 = max_lambda2 = zero(max_scaled_speed) - for j in eachnode(dg), i in eachnode(dg) - u_node = get_node_vars(u, equations, dg, i, j, element) - lambda1, lambda2 = max_abs_speeds(u_node, equations) - - # Local speeds transformed to the reference element - Ja11, Ja12 = get_contravariant_vector(1, contravariant_vectors, - i, j, element) - lambda1_transformed = abs(Ja11 * lambda1 + Ja12 * lambda2) - Ja21, Ja22 = get_contravariant_vector(2, contravariant_vectors, - i, j, element) - lambda2_transformed = abs(Ja21 * lambda1 + Ja22 * lambda2) - - inv_jacobian = abs(inverse_jacobian[i, j, element]) + # Avoid division by zero if the speed vanishes everywhere, + # e.g. for steady-state linear advection + max_scaled_speed = Base.max(nextfloat(zero(t)), max_lambda) - max_lambda1 = Base.max(max_lambda1, lambda1_transformed * inv_jacobian) - max_lambda2 = Base.max(max_lambda2, lambda2_transformed * inv_jacobian) - end + return 2 / (nnodes(dg) * max_scaled_speed) +end - # Use `Base.max` to prevent silent failures, as `max` from `@fastmath` doesn't propagate - # `NaN`s properly. 
See https://github.com/trixi-framework/Trixi.jl/pull/2445#discussion_r2336812323 - max_scaled_speed = Base.max(max_scaled_speed, max_lambda1 + max_lambda2) +@inline function max_scaled_speed_per_element(u, + ::Type{<:Union{StructuredMesh{2}, + UnstructuredMesh2D, + P4estMesh{2}, + T8codeMesh{2}, + StructuredMeshView{2}}}, + constant_speed::False, equations, dg::DG, + contravariant_vectors, inverse_jacobian, + element) + max_lambda1 = max_lambda2 = zero(eltype(u)) + for j in eachnode(dg), i in eachnode(dg) + u_node = get_node_vars(u, equations, dg, i, j, element) + lambda1, lambda2 = max_abs_speeds(u_node, equations) + + # Local speeds transformed to the reference element + Ja11, Ja12 = get_contravariant_vector(1, contravariant_vectors, + i, j, element) + lambda1_transformed = abs(Ja11 * lambda1 + Ja12 * lambda2) + Ja21, Ja22 = get_contravariant_vector(2, contravariant_vectors, + i, j, element) + lambda2_transformed = abs(Ja21 * lambda1 + Ja22 * lambda2) + + inv_jacobian = abs(inverse_jacobian[i, j, element]) + + max_lambda1 = Base.max(max_lambda1, lambda1_transformed * inv_jacobian) + max_lambda2 = Base.max(max_lambda2, lambda2_transformed * inv_jacobian) end - - return 2 / (nnodes(dg) * max_scaled_speed) + return max_lambda1 + max_lambda2 end function max_dt(u, t, @@ -296,38 +304,35 @@ function max_dt(u, t, return 4 / (nnodes(dg) * max_scaled_diffusivity) end -function max_dt(u, t, - mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}, - P4estMeshView{2}, T8codeMesh{2}, StructuredMeshView{2}}, - constant_speed::True, equations, dg::DG, cache) - @unpack contravariant_vectors, inverse_jacobian = cache.elements - - # Avoid division by zero if the speed vanishes everywhere, - # e.g. 
for steady-state linear advection - max_scaled_speed = nextfloat(zero(t)) - +@inline function max_scaled_speed_per_element(u, + ::Type{<:Union{StructuredMesh{2}, + UnstructuredMesh2D, + P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}, + StructuredMeshView{2}}}, + constant_speed::True, equations, dg::DG, + contravariant_vectors, inverse_jacobian, + element) + max_scaled_speed = zero(eltype(u)) max_lambda1, max_lambda2 = max_abs_speeds(equations) + for j in eachnode(dg), i in eachnode(dg) + # Local speeds transformed to the reference element + Ja11, Ja12 = get_contravariant_vector(1, contravariant_vectors, + i, j, element) + lambda1_transformed = abs(Ja11 * max_lambda1 + Ja12 * max_lambda2) + Ja21, Ja22 = get_contravariant_vector(2, contravariant_vectors, + i, j, element) + lambda2_transformed = abs(Ja21 * max_lambda1 + Ja22 * max_lambda2) - @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache) - for j in eachnode(dg), i in eachnode(dg) - # Local speeds transformed to the reference element - Ja11, Ja12 = get_contravariant_vector(1, contravariant_vectors, - i, j, element) - lambda1_transformed = abs(Ja11 * max_lambda1 + Ja12 * max_lambda2) - Ja21, Ja22 = get_contravariant_vector(2, contravariant_vectors, - i, j, element) - lambda2_transformed = abs(Ja21 * max_lambda1 + Ja22 * max_lambda2) + inv_jacobian = abs(inverse_jacobian[i, j, element]) - inv_jacobian = abs(inverse_jacobian[i, j, element]) - # Use `Base.max` to prevent silent failures, as `max` from `@fastmath` doesn't propagate - # `NaN`s properly. 
See https://github.com/trixi-framework/Trixi.jl/pull/2445#discussion_r2336812323 - max_scaled_speed = Base.max(max_scaled_speed, - inv_jacobian * - (lambda1_transformed + lambda2_transformed)) - end + max_scaled_speed = Base.max(max_scaled_speed, + inv_jacobian * + (lambda1_transformed + lambda2_transformed)) end - return 2 / (nnodes(dg) * max_scaled_speed) + return max_scaled_speed end function max_dt(u, t, diff --git a/src/callbacks_step/stepsize_dg3d.jl b/src/callbacks_step/stepsize_dg3d.jl index d310be1675e..ca434918f53 100644 --- a/src/callbacks_step/stepsize_dg3d.jl +++ b/src/callbacks_step/stepsize_dg3d.jl @@ -76,44 +76,50 @@ function max_dt(u, t, mesh::TreeMesh{3}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, - constant_speed::False, equations, dg::DG, cache) - # Avoid division by zero if the speed vanishes everywhere, - # e.g. for steady-state linear advection - max_scaled_speed = nextfloat(zero(t)) - - @unpack contravariant_vectors, inverse_jacobian = cache.elements +function max_dt(u, t, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, + constant_speed, equations, dg::DG, cache) + backend = trixi_backend(u) - @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache) - max_lambda1 = max_lambda2 = max_lambda3 = zero(max_scaled_speed) - for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - u_node = get_node_vars(u, equations, dg, i, j, k, element) - lambda1, lambda2, lambda3 = max_abs_speeds(u_node, equations) + max_lambda = calc_max_scaled_speed(backend, u, mesh, constant_speed, equations, dg, + cache) - Ja11, Ja12, Ja13 = get_contravariant_vector(1, contravariant_vectors, - i, j, k, element) - lambda1_transformed = abs(Ja11 * lambda1 + Ja12 * lambda2 + Ja13 * lambda3) - Ja21, Ja22, Ja23 = get_contravariant_vector(2, contravariant_vectors, - i, j, k, element) - lambda2_transformed = abs(Ja21 * lambda1 + Ja22 * lambda2 + Ja23 * 
lambda3) - Ja31, Ja32, Ja33 = get_contravariant_vector(3, contravariant_vectors, - i, j, k, element) - lambda3_transformed = abs(Ja31 * lambda1 + Ja32 * lambda2 + Ja33 * lambda3) - - inv_jacobian = abs(inverse_jacobian[i, j, k, element]) + # Avoid division by zero if the speed vanishes everywhere, + # e.g. for steady-state linear advection + max_scaled_speed = Base.max(nextfloat(zero(t)), max_lambda) - max_lambda1 = Base.max(max_lambda1, inv_jacobian * lambda1_transformed) - max_lambda2 = Base.max(max_lambda2, inv_jacobian * lambda2_transformed) - max_lambda3 = Base.max(max_lambda3, inv_jacobian * lambda3_transformed) - end + return 2 / (nnodes(dg) * max_scaled_speed) +end - # Use `Base.max` to prevent silent failures, as `max` from `@fastmath` doesn't propagate - # `NaN`s properly. See https://github.com/trixi-framework/Trixi.jl/pull/2445#discussion_r2336812323 - max_scaled_speed = Base.max(max_scaled_speed, - max_lambda1 + max_lambda2 + max_lambda3) +@inline function max_scaled_speed_per_element(u, + ::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, + constant_speed::False, equations, dg, + contravariant_vectors, inverse_jacobian, + element) + max_lambda1 = max_lambda2 = max_lambda3 = zero(eltype(u)) + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + u_node = get_node_vars(u, equations, dg, i, j, k, element) + lambda1, lambda2, lambda3 = max_abs_speeds(u_node, equations) + + Ja11, Ja12, Ja13 = get_contravariant_vector(1, contravariant_vectors, + i, j, k, element) + lambda1_transformed = abs(Ja11 * lambda1 + Ja12 * lambda2 + Ja13 * lambda3) + Ja21, Ja22, Ja23 = get_contravariant_vector(2, contravariant_vectors, + i, j, k, element) + lambda2_transformed = abs(Ja21 * lambda1 + Ja22 * lambda2 + Ja23 * lambda3) + Ja31, Ja32, Ja33 = get_contravariant_vector(3, contravariant_vectors, + i, j, k, element) + lambda3_transformed = abs(Ja31 * lambda1 + Ja32 * lambda2 + Ja33 * lambda3) + + inv_jacobian = abs(inverse_jacobian[i, j, k, element]) 
+ + max_lambda1 = max(max_lambda1, inv_jacobian * lambda1_transformed) + max_lambda2 = max(max_lambda2, inv_jacobian * lambda2_transformed) + max_lambda3 = max(max_lambda3, inv_jacobian * lambda3_transformed) end - - return 2 / (nnodes(dg) * max_scaled_speed) + return max_lambda1 + max_lambda2 + max_lambda3 end function max_dt(u, t, @@ -168,43 +174,40 @@ function max_dt(u, t, return 4 / (nnodes(dg) * max_scaled_diffusivity) end -function max_dt(u, t, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, - constant_speed::True, equations, dg::DG, cache) - # Avoid division by zero if the speed vanishes everywhere, - # e.g. for steady-state linear advection - max_scaled_speed = nextfloat(zero(t)) - - @unpack contravariant_vectors, inverse_jacobian = cache.elements - +@inline function max_scaled_speed_per_element(u, + ::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, + constant_speed::True, equations, dg::DG, + contravariant_vectors, inverse_jacobian, + element) + max_scaled_speed = zero(eltype(u)) max_lambda1, max_lambda2, max_lambda3 = max_abs_speeds(equations) + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + Ja11, Ja12, Ja13 = get_contravariant_vector(1, contravariant_vectors, + i, j, k, element) + lambda1_transformed = abs(Ja11 * max_lambda1 + Ja12 * max_lambda2 + + Ja13 * max_lambda3) + Ja21, Ja22, Ja23 = get_contravariant_vector(2, contravariant_vectors, + i, j, k, element) + lambda2_transformed = abs(Ja21 * max_lambda1 + Ja22 * max_lambda2 + + Ja23 * max_lambda3) + Ja31, Ja32, Ja33 = get_contravariant_vector(3, contravariant_vectors, + i, j, k, element) + lambda3_transformed = abs(Ja31 * max_lambda1 + Ja32 * max_lambda2 + + Ja33 * max_lambda3) + + inv_jacobian = abs(inverse_jacobian[i, j, k, element]) - @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache) - for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - Ja11, Ja12, Ja13 = get_contravariant_vector(1, contravariant_vectors, - i, j, 
k, element) - lambda1_transformed = abs(Ja11 * max_lambda1 + Ja12 * max_lambda2 + - Ja13 * max_lambda3) - Ja21, Ja22, Ja23 = get_contravariant_vector(2, contravariant_vectors, - i, j, k, element) - lambda2_transformed = abs(Ja21 * max_lambda1 + Ja22 * max_lambda2 + - Ja23 * max_lambda3) - Ja31, Ja32, Ja33 = get_contravariant_vector(3, contravariant_vectors, - i, j, k, element) - lambda3_transformed = abs(Ja31 * max_lambda1 + Ja32 * max_lambda2 + - Ja33 * max_lambda3) - - inv_jacobian = abs(inverse_jacobian[i, j, k, element]) - - # Use `Base.max` to prevent silent failures, as `max` from `@fastmath` doesn't propagate - # `NaN`s properly. See https://github.com/trixi-framework/Trixi.jl/pull/2445#discussion_r2336812323 - max_scaled_speed = Base.max(max_scaled_speed, - inv_jacobian * - (lambda1_transformed + lambda2_transformed + - lambda3_transformed)) - end + # Use `Base.max` to prevent silent failures, as `max` from `@fastmath` doesn't propagate + # `NaN`s properly. See https://github.com/trixi-framework/Trixi.jl/pull/2445#discussion_r2336812323 + max_scaled_speed = Base.max(max_scaled_speed, + inv_jacobian * + (lambda1_transformed + lambda2_transformed + + lambda3_transformed)) end - return 2 / (nnodes(dg) * max_scaled_speed) + return max_scaled_speed end function max_dt(u, t, diff --git a/src/callbacks_step/visualization.jl b/src/callbacks_step/visualization.jl index dc7dca055ce..a7d6a31e4e6 100644 --- a/src/callbacks_step/visualization.jl +++ b/src/callbacks_step/visualization.jl @@ -110,19 +110,6 @@ function VisualizationCallback(semi, plot_data_creator = nothing; plot_creator, Dict{Symbol, Any}(plot_arguments)) - # Warn users if they create a visualization callback without having loaded the Plots package - # - # Note: This warning is added for convenience, as Plots is the only "officially" supported - # visualization package right now. 
However, in general nothing prevents anyone from using - # other packages such as Makie, Gadfly etc., given that appropriate `plot_creator`s are - # passed. This is also the reason why the visualization callback is not included via - # Requires.jl only when Plots is present. - # In the future, we should update/remove this warning if other plotting packages are - # starting to be used. - if !(:Plots in names(@__MODULE__, all = true)) - @warn "Package `Plots` not loaded but required by `VisualizationCallback` to visualize results" - end - return DiscreteCallback(visualization_callback, visualization_callback, # the first one is the condition, the second the affect! save_positions = (false, false), initialize = initialize!) @@ -186,42 +173,12 @@ variables in `variable_names` and, optionally, the mesh (if `show_mesh` is `true This function is the default `plot_creator` argument for the [`VisualizationCallback`](@ref). `time` and `timestep` are currently unused by this function. +!!! note + This requires loading [Plots.jl](https://github.com/JuliaPlots/Plots.jl), e.g., via `using Plots`. + See also: [`VisualizationCallback`](@ref), [`save_plot`](@ref) """ -function show_plot(plot_data, variable_names; - show_mesh = true, plot_arguments = Dict{Symbol, Any}(), - time = nothing, timestep = nothing) - # Gather subplots - plots = [] - for v in variable_names - push!(plots, Plots.plot(plot_data[v]; plot_arguments...)) - end - if show_mesh - push!(plots, Plots.plot(getmesh(plot_data); plot_arguments...)) - end - - # Note, for the visualization callback to work for general equation systems - # this layout construction would need to use the if-logic below. - # Currently, there is no use case for this so it is left here as a note. 
- # - # Determine layout - # if length(plots) <= 3 - # cols = length(plots) - # rows = 1 - # else - # cols = ceil(Int, sqrt(length(plots))) - # rows = div(length(plots), cols, RoundUp) - # end - # layout = (rows, cols) - - # Determine layout - cols = ceil(Int, sqrt(length(plots))) - rows = div(length(plots), cols, RoundUp) - layout = (rows, cols) - - # Show plot - return display(Plots.plot(plots..., layout = layout)) -end +function show_plot end """ save_plot(plot_data, variable_names; @@ -235,30 +192,10 @@ is `true`). Additionally, `plot_arguments` will be unpacked and passed as keywo The `timestep` is used in the filename. `time` is currently unused by this function. +!!! note + This requires loading [Plots.jl](https://github.com/JuliaPlots/Plots.jl), e.g., via `using Plots`. + See also: [`VisualizationCallback`](@ref), [`show_plot`](@ref) """ -function save_plot(plot_data, variable_names; - show_mesh = true, plot_arguments = Dict{Symbol, Any}(), - time = nothing, timestep = nothing) - # Gather subplots - plots = [] - for v in variable_names - push!(plots, Plots.plot(plot_data[v]; plot_arguments...)) - end - if show_mesh - push!(plots, Plots.plot(getmesh(plot_data); plot_arguments...)) - end - - # Determine layout - cols = ceil(Int, sqrt(length(plots))) - rows = div(length(plots), cols, RoundUp) - layout = (rows, cols) - - # Create plot - Plots.plot(plots..., layout = layout) - - # Determine filename and save plot - filename = joinpath("out", @sprintf("solution_%09d.png", timestep)) - return Plots.savefig(filename) -end +function save_plot end end # @muladd diff --git a/src/equations/compressible_navier_stokes.jl b/src/equations/compressible_navier_stokes.jl index 3cfeaaf5018..e15c11f3a02 100644 --- a/src/equations/compressible_navier_stokes.jl +++ b/src/equations/compressible_navier_stokes.jl @@ -115,7 +115,7 @@ dynamic_viscosity(u, mu::T, equations) where {T} = mu(u, equations) # Returns - `False()` -Used in diffusive CFL condition computation (see 
[`StepsizeCallback`](@ref)) to indicate that the +Used in parabolic CFL condition computation (see [`StepsizeCallback`](@ref)) to indicate that the diffusivity is not constant in space and that [`max_diffusivity`](@ref) needs to be computed at every node in every element. diff --git a/src/equations/compressible_navier_stokes_1d.jl b/src/equations/compressible_navier_stokes_1d.jl index f37a751d3a5..3802c23afd8 100644 --- a/src/equations/compressible_navier_stokes_1d.jl +++ b/src/equations/compressible_navier_stokes_1d.jl @@ -96,7 +96,7 @@ struct CompressibleNavierStokesDiffusion1D{GradientVariables, RealT <: Real, Mu, mu::Mu # viscosity Pr::RealT # Prandtl number kappa::RealT # thermal diffusivity for Fick's law - max_1_kappa::RealT # max(1, kappa) used for diffusive CFL => `max_diffusivity` + max_1_kappa::RealT # max(1, kappa) used for parabolic CFL => `max_diffusivity` equations_hyperbolic::E # CompressibleEulerEquations1D gradient_variables::GradientVariables # GradientVariablesPrimitive or GradientVariablesEntropy @@ -176,7 +176,7 @@ function flux(u, gradients, orientation::Integer, # by dispatching on the type of `equations.mu`. mu = dynamic_viscosity(u, equations) - # viscous flux components in the x-direction + # parabolic flux components in the x-direction f1 = 0 f2 = tau_11 * mu f3 = (v1 * tau_11 + q1) * mu @@ -242,7 +242,7 @@ function entropy2cons(w, equations::CompressibleNavierStokesDiffusion1D) end # the `flux` function takes in transformed variables `u` which depend on the type of the gradient variables. -# For CNS, it is simplest to formulate the viscous terms in primitive variables, so we transform the transformed +# For CNS, it is simplest to formulate the parabolic terms in primitive variables, so we transform the transformed # variables into primitive variables. 
@inline function convert_transformed_to_primitive(u_transformed, equations::CompressibleNavierStokesDiffusion1D{GradientVariablesPrimitive}) @@ -260,7 +260,7 @@ end # reverse engineers the gradients to be terms of the primitive variables (v1, T). # Helpful because then the diffusive fluxes have the same form as on paper. # Note, the first component of `gradient_entropy_vars` contains gradient(rho) which is unused. -# TODO: parabolic; entropy stable viscous terms +# TODO: parabolic; entropy stable parabolic terms @inline function convert_derivative_to_primitive(u, gradient, ::CompressibleNavierStokesDiffusion1D{GradientVariablesPrimitive}) return gradient diff --git a/src/equations/compressible_navier_stokes_2d.jl b/src/equations/compressible_navier_stokes_2d.jl index 0deb1599742..2e7625838da 100644 --- a/src/equations/compressible_navier_stokes_2d.jl +++ b/src/equations/compressible_navier_stokes_2d.jl @@ -96,7 +96,7 @@ struct CompressibleNavierStokesDiffusion2D{GradientVariables, RealT <: Real, Mu, mu::Mu # viscosity Pr::RealT # Prandtl number kappa::RealT # thermal diffusivity for Fick's law - max_4over3_kappa::RealT # max(4/3, kappa) used for diffusive CFL => `max_diffusivity` + max_4over3_kappa::RealT # max(4/3, kappa) used for parabolic CFL => `max_diffusivity` equations_hyperbolic::E # CompressibleEulerEquations2D gradient_variables::GradientVariables # GradientVariablesPrimitive or GradientVariablesEntropy @@ -185,7 +185,7 @@ function flux(u, gradients, orientation::Integer, mu = dynamic_viscosity(u, equations) if orientation == 1 - # viscous flux components in the x-direction + # parabolic flux components in the x-direction f1 = 0 f2 = tau_11 * mu f3 = tau_12 * mu @@ -193,7 +193,7 @@ function flux(u, gradients, orientation::Integer, return SVector(f1, f2, f3, f4) else # if orientation == 2 - # viscous flux components in the y-direction + # parabolic flux components in the y-direction # Note, symmetry is exploited for tau_12 = tau_21 g1 = 0 g2 = tau_12 * mu 
# tau_21 * mu @@ -263,7 +263,7 @@ function entropy2cons(w, equations::CompressibleNavierStokesDiffusion2D) end # the `flux` function takes in transformed variables `u` which depend on the type of the gradient variables. -# For CNS, it is simplest to formulate the viscous terms in primitive variables, so we transform the transformed +# For CNS, it is simplest to formulate the parabolic terms in primitive variables, so we transform the transformed # variables into primitive variables. @inline function convert_transformed_to_primitive(u_transformed, equations::CompressibleNavierStokesDiffusion2D{GradientVariablesPrimitive}) @@ -281,7 +281,7 @@ end # reverse engineers the gradients to be terms of the primitive variables (v1, v2, T). # Helpful because then the diffusive fluxes have the same form as on paper. # Note, the first component of `gradient_entropy_vars` contains gradient(rho) which is unused. -# TODO: parabolic; entropy stable viscous terms +# TODO: parabolic; entropy stable parabolic terms @inline function convert_derivative_to_primitive(u, gradient, ::CompressibleNavierStokesDiffusion2D{GradientVariablesPrimitive}) return gradient @@ -579,7 +579,7 @@ end x, t, operator_type::Divergence, equations::CompressibleNavierStokesDiffusion2D{GradientVariablesPrimitive}) - # for Dirichlet boundary conditions, we do not impose any conditions on the viscous fluxes + # for Dirichlet boundary conditions, we do not impose any conditions on the parabolic fluxes return flux_inner end end # @muladd diff --git a/src/equations/compressible_navier_stokes_3d.jl b/src/equations/compressible_navier_stokes_3d.jl index b24f8467033..874408e1dc8 100644 --- a/src/equations/compressible_navier_stokes_3d.jl +++ b/src/equations/compressible_navier_stokes_3d.jl @@ -96,7 +96,7 @@ struct CompressibleNavierStokesDiffusion3D{GradientVariables, RealT <: Real, Mu, mu::Mu # viscosity Pr::RealT # Prandtl number kappa::RealT # thermal diffusivity for Fick's law - max_4over3_kappa::RealT # max(4/3, 
kappa) used for diffusive CFL => `max_diffusivity` + max_4over3_kappa::RealT # max(4/3, kappa) used for parabolic CFL => `max_diffusivity` equations_hyperbolic::E # CompressibleEulerEquations3D gradient_variables::GradientVariables # GradientVariablesPrimitive or GradientVariablesEntropy @@ -198,7 +198,7 @@ function flux(u, gradients, orientation::Integer, mu = dynamic_viscosity(u, equations) if orientation == 1 - # viscous flux components in the x-direction + # parabolic flux components in the x-direction f1 = 0 f2 = tau_11 * mu f3 = tau_12 * mu @@ -207,7 +207,7 @@ function flux(u, gradients, orientation::Integer, return SVector(f1, f2, f3, f4, f5) elseif orientation == 2 - # viscous flux components in the y-direction + # parabolic flux components in the y-direction # Note, symmetry is exploited for tau_12 = tau_21 g1 = 0 g2 = tau_12 * mu # tau_21 * mu @@ -217,7 +217,7 @@ function flux(u, gradients, orientation::Integer, return SVector(g1, g2, g3, g4, g5) else # if orientation == 3 - # viscous flux components in the z-direction + # parabolic flux components in the z-direction # Note, symmetry is exploited for tau_13 = tau_31, tau_23 = tau_32 h1 = 0 h2 = tau_13 * mu # tau_31 * mu @@ -289,7 +289,7 @@ function entropy2cons(w, equations::CompressibleNavierStokesDiffusion3D) end # the `flux` function takes in transformed variables `u` which depend on the type of the gradient variables. -# For CNS, it is simplest to formulate the viscous terms in primitive variables, so we transform the transformed +# For CNS, it is simplest to formulate the parabolic terms in primitive variables, so we transform the transformed # variables into primitive variables. @inline function convert_transformed_to_primitive(u_transformed, equations::CompressibleNavierStokesDiffusion3D{GradientVariablesPrimitive}) @@ -307,7 +307,7 @@ end # reverse engineers the gradients to be terms of the primitive variables (v1, v2, v3, T). 
# Helpful because then the diffusive fluxes have the same form as on paper. # Note, the first component of `gradient_entropy_vars` contains gradient(rho) which is unused. -# TODO: parabolic; entropy stable viscous terms +# TODO: parabolic; entropy stable parabolic terms @inline function convert_derivative_to_primitive(u, gradient, ::CompressibleNavierStokesDiffusion3D{GradientVariablesPrimitive}) return gradient @@ -619,7 +619,7 @@ end x, t, operator_type::Divergence, equations::CompressibleNavierStokesDiffusion3D{GradientVariablesPrimitive}) - # for Dirichlet boundary conditions, we do not impose any conditions on the viscous fluxes + # for Dirichlet boundary conditions, we do not impose any conditions on the parabolic fluxes return flux_inner end end # @muladd diff --git a/src/equations/equations_parabolic.jl b/src/equations/equations_parabolic.jl index bfd33cfae78..716aba0ef2a 100644 --- a/src/equations/equations_parabolic.jl +++ b/src/equations/equations_parabolic.jl @@ -17,7 +17,7 @@ abstract type AbstractLaplaceDiffusion{NDIMS, NVARS} <: # Returns - `True()` -Used in diffusive CFL condition computation (see [`StepsizeCallback`](@ref)) to indicate that the +Used in parabolic CFL condition computation (see [`StepsizeCallback`](@ref)) to indicate that the diffusivity is constant in space and that [`max_diffusivity`](@ref) needs **not** to be re-computed at every node in every element. @@ -32,7 +32,7 @@ if the diffusion term is linear in the variables/constant. # Returns - `equations_parabolic.diffusivity` -Returns isotropic diffusion coefficient for use in diffusive CFL condition computation, +Returns isotropic diffusion coefficient for use in parabolic CFL condition computation, see [`StepsizeCallback`](@ref). 
""" @inline function max_diffusivity(equations_parabolic::AbstractLaplaceDiffusion) diff --git a/src/equations/ideal_glm_mhd_multiion_2d.jl b/src/equations/ideal_glm_mhd_multiion_2d.jl index 929f6809e96..fa7b7e6b48f 100644 --- a/src/equations/ideal_glm_mhd_multiion_2d.jl +++ b/src/equations/ideal_glm_mhd_multiion_2d.jl @@ -319,9 +319,62 @@ end return SVector(f) end +@inline function flux(u, normal_direction::AbstractVector, + equations::IdealGlmMhdMultiIonEquations2D) + B1, B2, B3 = magnetic_field(u, equations) + psi = divergence_cleaning_field(u, equations) + + v1_plus, v2_plus, v3_plus, vk1_plus, vk2_plus, + vk3_plus = charge_averaged_velocities(u, + equations) + + mag_en = 0.5f0 * (B1^2 + B2^2 + B3^2) + div_clean_energy = 0.5f0 * psi^2 + + f = zero(MVector{nvariables(equations), eltype(u)}) + + f[1] = equations.c_h * psi * normal_direction[1] + + (v2_plus * B1 - v1_plus * B2) * normal_direction[2] + f[2] = (v1_plus * B2 - v2_plus * B1) * normal_direction[1] + + equations.c_h * psi * normal_direction[2] + f[3] = (v1_plus * B3 - v3_plus * B1) * normal_direction[1] + + (v2_plus * B3 - v3_plus * B2) * normal_direction[2] + + for k in eachcomponent(equations) + rho, rho_v1, rho_v2, rho_v3, rho_e_total = get_component(k, u, equations) + rho_inv = 1 / rho + v1 = rho_v1 * rho_inv + v2 = rho_v2 * rho_inv + v3 = rho_v3 * rho_inv + kin_en = 0.5f0 * rho * (v1^2 + v2^2 + v3^2) + + gamma = equations.gammas[k] + p = (gamma - 1) * (rho_e_total - kin_en - mag_en - div_clean_energy) + + v_normal = v1 * normal_direction[1] + v2 * normal_direction[2] + rho_v_normal = rho * v_normal + + f1 = rho_v_normal + f2 = rho_v_normal * v1 + p * normal_direction[1] + f3 = rho_v_normal * v2 + p * normal_direction[2] + f4 = rho_v_normal * v3 + f5 = ((kin_en + gamma * p / (gamma - 1)) * v1 + 2 * mag_en * vk1_plus[k] - + B1 * (vk1_plus[k] * B1 + vk2_plus[k] * B2 + vk3_plus[k] * B3) + + equations.c_h * psi * B1) * normal_direction[1] + + ((kin_en + gamma * p / (gamma - 1)) * v2 + 2 * mag_en * 
vk2_plus[k] - + B2 * (vk1_plus[k] * B1 + vk2_plus[k] * B2 + vk3_plus[k] * B3) + + equations.c_h * psi * B2) * normal_direction[2] + + set_component!(f, k, f1, f2, f3, f4, f5, equations) + end + f[end] = equations.c_h * (B1 * normal_direction[1] + B2 * normal_direction[2]) + + return SVector(f) +end + """ flux_nonconservative_ruedaramirez_etal(u_ll, u_rr, - orientation::Integer, + orientation_or_normal_direction, equations::IdealGlmMhdMultiIonEquations2D) Entropy-conserving non-conservative two-point "flux" as described in @@ -379,10 +432,12 @@ The term is composed of four individual non-conservative terms: charge_ratio_ll ./= total_electron_charge # Compute auxiliary variables - v1_plus_ll, v2_plus_ll, v3_plus_ll, vk1_plus_ll, vk2_plus_ll, vk3_plus_ll = charge_averaged_velocities(u_ll, - equations) - v1_plus_rr, v2_plus_rr, v3_plus_rr, vk1_plus_rr, vk2_plus_rr, vk3_plus_rr = charge_averaged_velocities(u_rr, - equations) + v1_plus_ll, v2_plus_ll, v3_plus_ll, vk1_plus_ll, vk2_plus_ll, + vk3_plus_ll = charge_averaged_velocities(u_ll, + equations) + v1_plus_rr, v2_plus_rr, v3_plus_rr, vk1_plus_rr, vk2_plus_rr, + vk3_plus_rr = charge_averaged_velocities(u_rr, + equations) f = zero(MVector{nvariables(equations), eltype(u_ll)}) @@ -483,6 +538,114 @@ The term is composed of four individual non-conservative terms: return SVector(f) end +@inline function flux_nonconservative_ruedaramirez_etal(u_ll, u_rr, + normal_direction::AbstractVector, + equations::IdealGlmMhdMultiIonEquations2D) + @unpack charge_to_mass = equations + # Unpack left and right states to get the magnetic field + B1_ll, B2_ll, B3_ll = magnetic_field(u_ll, equations) + B1_rr, B2_rr, B3_rr = magnetic_field(u_rr, equations) + psi_ll = divergence_cleaning_field(u_ll, equations) + psi_rr = divergence_cleaning_field(u_rr, equations) + B_dot_n_ll = B1_ll * normal_direction[1] + + B2_ll * normal_direction[2] + B_dot_n_rr = B1_rr * normal_direction[1] + + B2_rr * normal_direction[2] + B_dot_n_avg = 0.5f0 * 
(B_dot_n_ll + B_dot_n_rr) + + # Compute important averages + B1_avg = 0.5f0 * (B1_ll + B1_rr) + B2_avg = 0.5f0 * (B2_ll + B2_rr) + B3_avg = 0.5f0 * (B3_ll + B3_rr) + mag_norm_ll = B1_ll^2 + B2_ll^2 + B3_ll^2 + mag_norm_rr = B1_rr^2 + B2_rr^2 + B3_rr^2 + mag_norm_avg = 0.5f0 * (mag_norm_ll + mag_norm_rr) + psi_avg = 0.5f0 * (psi_ll + psi_rr) + + # Mean electron pressure + pe_ll = equations.electron_pressure(u_ll, equations) + pe_rr = equations.electron_pressure(u_rr, equations) + pe_mean = 0.5f0 * (pe_ll + pe_rr) + + # Compute charge ratio of u_ll + charge_ratio_ll = zero(MVector{ncomponents(equations), eltype(u_ll)}) + total_electron_charge = zero(eltype(u_ll)) + for k in eachcomponent(equations) + rho_k = u_ll[3 + (k - 1) * 5 + 1] # Extract densities from conserved variable vector + charge_ratio_ll[k] = rho_k * charge_to_mass[k] + total_electron_charge += charge_ratio_ll[k] + end + charge_ratio_ll ./= total_electron_charge + + # Compute auxiliary variables + v1_plus_ll, v2_plus_ll, v3_plus_ll, vk1_plus_ll, vk2_plus_ll, + vk3_plus_ll = charge_averaged_velocities(u_ll, + equations) + v1_plus_rr, v2_plus_rr, v3_plus_rr, vk1_plus_rr, vk2_plus_rr, + vk3_plus_rr = charge_averaged_velocities(u_rr, + equations) + v_plus_dot_n_ll = (v1_plus_ll * normal_direction[1] + + v2_plus_ll * normal_direction[2]) + f = zero(MVector{nvariables(equations), eltype(u_ll)}) + + # Entries of Godunov-Powell term for induction equation (multiply by 2 because the non-conservative flux is + # multiplied by 0.5 whenever it's used in the Trixi code) + f[1] = 2 * v1_plus_ll * B_dot_n_avg + f[2] = 2 * v2_plus_ll * B_dot_n_avg + f[3] = 2 * v3_plus_ll * B_dot_n_avg + + for k in eachcomponent(equations) + # Compute term Lorentz term + f2 = charge_ratio_ll[k] * + ((0.5f0 * mag_norm_avg + pe_mean) * normal_direction[1] - + B_dot_n_avg * B1_avg) + f3 = charge_ratio_ll[k] * + ((0.5f0 * mag_norm_avg + pe_mean) * normal_direction[2] - + B_dot_n_avg * B2_avg) + f4 = charge_ratio_ll[k] * + (-B_dot_n_avg * 
B3_avg) + f5 = (vk1_plus_ll[k] * normal_direction[1] + + vk2_plus_ll[k] * normal_direction[2]) * pe_mean + + # Compute multi-ion term (vanishes for NCOMP==1) + vk1_minus_ll = v1_plus_ll - vk1_plus_ll[k] + vk2_minus_ll = v2_plus_ll - vk2_plus_ll[k] + vk3_minus_ll = v3_plus_ll - vk3_plus_ll[k] + vk1_minus_rr = v1_plus_rr - vk1_plus_rr[k] + vk2_minus_rr = v2_plus_rr - vk2_plus_rr[k] + vk3_minus_rr = v3_plus_rr - vk3_plus_rr[k] + vk1_minus_avg = 0.5f0 * (vk1_minus_ll + vk1_minus_rr) + vk2_minus_avg = 0.5f0 * (vk2_minus_ll + vk2_minus_rr) + vk3_minus_avg = 0.5f0 * (vk3_minus_ll + vk3_minus_rr) + f5 += ((B2_ll * (vk1_minus_avg * B2_avg - vk2_minus_avg * B1_avg) + + B3_ll * (vk1_minus_avg * B3_avg - vk3_minus_avg * B1_avg)) * + normal_direction[1] + + (B1_ll * (vk2_minus_avg * B1_avg - vk1_minus_avg * B2_avg) + + B3_ll * (vk2_minus_avg * B3_avg - vk3_minus_avg * B2_avg)) * + normal_direction[2]) + + # Compute Godunov-Powell term + f2 += charge_ratio_ll[k] * B1_ll * B_dot_n_avg + f3 += charge_ratio_ll[k] * B2_ll * B_dot_n_avg + f4 += charge_ratio_ll[k] * B3_ll * B_dot_n_avg + f5 += (v1_plus_ll * B1_ll + v2_plus_ll * B2_ll + v3_plus_ll * B3_ll) * + B_dot_n_avg + + # Compute GLM term for the energy + f5 += v_plus_dot_n_ll * psi_ll * psi_avg + + # Add to the flux vector (multiply by 2 because the non-conservative flux is + # multiplied by 0.5 whenever it's used in the Trixi code) + set_component!(f, k, 0, 2 * f2, 2 * f3, 2 * f4, 2 * f5, + equations) + end + # Compute GLM term for psi (multiply by 2 because the non-conservative flux is + # multiplied by 0.5 whenever it's used in the Trixi code) + f[end] = 2 * v_plus_dot_n_ll * psi_avg + + return SVector(f) +end + """ flux_nonconservative_central(u_ll, u_rr, orientation::Integer, equations::IdealGlmMhdMultiIonEquations2D) @@ -536,10 +699,12 @@ The term is composed of four individual non-conservative terms: charge_ratio_ll ./= total_electron_charge # Compute auxiliary variables - v1_plus_ll, v2_plus_ll, v3_plus_ll, vk1_plus_ll, 
vk2_plus_ll, vk3_plus_ll = charge_averaged_velocities(u_ll, - equations) - v1_plus_rr, v2_plus_rr, v3_plus_rr, vk1_plus_rr, vk2_plus_rr, vk3_plus_rr = charge_averaged_velocities(u_rr, - equations) + v1_plus_ll, v2_plus_ll, v3_plus_ll, vk1_plus_ll, vk2_plus_ll, + vk3_plus_ll = charge_averaged_velocities(u_ll, + equations) + v1_plus_rr, v2_plus_rr, v3_plus_rr, vk1_plus_rr, vk2_plus_rr, + vk3_plus_rr = charge_averaged_velocities(u_rr, + equations) f = zero(MVector{nvariables(equations), eltype(u_ll)}) @@ -630,6 +795,111 @@ The term is composed of four individual non-conservative terms: return SVector(f) end +@inline function flux_nonconservative_central(u_ll, u_rr, + normal_direction::AbstractVector, + equations::IdealGlmMhdMultiIonEquations2D) + @unpack charge_to_mass = equations + # Unpack left and right states to get the magnetic field + B1_ll, B2_ll, B3_ll = magnetic_field(u_ll, equations) + B1_rr, B2_rr, B3_rr = magnetic_field(u_rr, equations) + psi_ll = divergence_cleaning_field(u_ll, equations) + psi_rr = divergence_cleaning_field(u_rr, equations) + + # Compute important averages + mag_norm_ll = B1_ll^2 + B2_ll^2 + B3_ll^2 + mag_norm_rr = B1_rr^2 + B2_rr^2 + B3_rr^2 + + # Electron pressure + pe_ll = equations.electron_pressure(u_ll, equations) + pe_rr = equations.electron_pressure(u_rr, equations) + + # Compute charge ratio of u_ll + charge_ratio_ll = zero(MVector{ncomponents(equations), eltype(u_ll)}) + total_electron_charge = zero(real(equations)) + for k in eachcomponent(equations) + rho_k = u_ll[3 + (k - 1) * 5 + 1] + charge_ratio_ll[k] = rho_k * charge_to_mass[k] + total_electron_charge += charge_ratio_ll[k] + end + charge_ratio_ll ./= total_electron_charge + + # Compute auxiliary variables + v1_plus_ll, v2_plus_ll, v3_plus_ll, vk1_plus_ll, vk2_plus_ll, + vk3_plus_ll = charge_averaged_velocities(u_ll, + equations) + v1_plus_rr, v2_plus_rr, v3_plus_rr, vk1_plus_rr, vk2_plus_rr, + vk3_plus_rr = charge_averaged_velocities(u_rr, + equations) + + f = 
zero(MVector{nvariables(equations), eltype(u_ll)}) + + # Compute B_dot_n, v_dot_B_ll, and psi terms once (they are constant for all species) + B1_sum = B1_ll + B1_rr + B2_sum = B2_ll + B2_rr + B_dot_n = B1_sum * normal_direction[1] + B2_sum * normal_direction[2] + v_dot_B_ll = v1_plus_ll * B1_ll + v2_plus_ll * B2_ll + v3_plus_ll * B3_ll + psi_sum = psi_ll + psi_rr + + # Entries of Godunov-Powell term for induction equation + f[1] = v1_plus_ll * B_dot_n + f[2] = v2_plus_ll * B_dot_n + f[3] = v3_plus_ll * B_dot_n + + for k in eachcomponent(equations) + # Compute Lorentz term + f2 = charge_ratio_ll[k] * + ((0.5f0 * mag_norm_ll - B1_ll * B1_ll + pe_ll) + + (0.5f0 * mag_norm_rr - B1_rr * B1_rr + pe_rr)) * normal_direction[1] + + charge_ratio_ll[k] * ((-B2_ll * B1_ll) + (-B2_rr * B1_rr)) * + normal_direction[2] + f3 = charge_ratio_ll[k] * ((-B1_ll * B2_ll) + (-B1_rr * B2_rr)) * + normal_direction[1] + + charge_ratio_ll[k] * + ((-B2_ll * B2_ll + 0.5f0 * mag_norm_ll + pe_ll) + + (-B2_rr * B2_rr + 0.5f0 * mag_norm_rr + pe_rr)) * normal_direction[2] + f4 = charge_ratio_ll[k] * ((-B1_ll * B3_ll) + (-B1_rr * B3_rr)) * + normal_direction[1] + + charge_ratio_ll[k] * ((-B2_ll * B3_ll) + (-B2_rr * B3_rr)) * + normal_direction[2] + f5 = vk1_plus_ll[k] * (pe_ll + pe_rr) * normal_direction[1] + + vk2_plus_ll[k] * (pe_ll + pe_rr) * normal_direction[2] + + # Compute multi-ion term, which vanishes for NCOMP==1 + vk1_minus_ll = v1_plus_ll - vk1_plus_ll[k] + vk2_minus_ll = v2_plus_ll - vk2_plus_ll[k] + vk3_minus_ll = v3_plus_ll - vk3_plus_ll[k] + vk1_minus_rr = v1_plus_rr - vk1_plus_rr[k] + vk2_minus_rr = v2_plus_rr - vk2_plus_rr[k] + vk3_minus_rr = v3_plus_rr - vk3_plus_rr[k] + f5 += (B2_ll * ((vk1_minus_ll * B2_ll - vk2_minus_ll * B1_ll) + + (vk1_minus_rr * B2_rr - vk2_minus_rr * B1_rr)) + + B3_ll * ((vk1_minus_ll * B3_ll - vk3_minus_ll * B1_ll) + + (vk1_minus_rr * B3_rr - vk3_minus_rr * B1_rr))) * normal_direction[1] + f5 += (B1_ll * ((vk2_minus_ll * B1_ll - vk1_minus_ll * B2_ll) + + 
(vk2_minus_rr * B1_rr - vk1_minus_rr * B2_rr)) + + B3_ll * ((vk2_minus_ll * B3_ll - vk3_minus_ll * B2_ll) + + (vk2_minus_rr * B3_rr - vk3_minus_rr * B2_rr))) * normal_direction[2] + + # Compute Godunov-Powell term + f2 += charge_ratio_ll[k] * B1_ll * B_dot_n + f3 += charge_ratio_ll[k] * B2_ll * B_dot_n + f4 += charge_ratio_ll[k] * B3_ll * B_dot_n + f5 += v_dot_B_ll * B_dot_n + + # Compute GLM term for the energy + f5 += (v1_plus_ll * normal_direction[1] + v2_plus_ll * normal_direction[2]) * + psi_ll * psi_sum + + # Append to the flux vector + set_component!(f, k, 0, f2, f3, f4, f5, equations) + end + # Compute GLM term for psi + f[end] = (v1_plus_ll * normal_direction[1] + v2_plus_ll * normal_direction[2]) * + psi_sum + + return SVector(f) +end + """ flux_ruedaramirez_etal(u_ll, u_rr, orientation, equations::IdealGlmMhdMultiIonEquations2D) @@ -652,10 +922,12 @@ function flux_ruedaramirez_etal(u_ll, u_rr, orientation::Integer, psi_ll = divergence_cleaning_field(u_ll, equations) psi_rr = divergence_cleaning_field(u_rr, equations) - v1_plus_ll, v2_plus_ll, v3_plus_ll, vk1_plus_ll, vk2_plus_ll, vk3_plus_ll = charge_averaged_velocities(u_ll, - equations) - v1_plus_rr, v2_plus_rr, v3_plus_rr, vk1_plus_rr, vk2_plus_rr, vk3_plus_rr = charge_averaged_velocities(u_rr, - equations) + v1_plus_ll, v2_plus_ll, v3_plus_ll, vk1_plus_ll, vk2_plus_ll, + vk3_plus_ll = charge_averaged_velocities(u_ll, + equations) + v1_plus_rr, v2_plus_rr, v3_plus_rr, vk1_plus_rr, vk2_plus_rr, + vk3_plus_rr = charge_averaged_velocities(u_rr, + equations) f = zero(MVector{nvariables(equations), eltype(u_ll)}) @@ -881,6 +1153,161 @@ function flux_ruedaramirez_etal(u_ll, u_rr, orientation::Integer, return SVector(f) end +function flux_ruedaramirez_etal(u_ll, u_rr, normal_direction::AbstractVector, + equations::IdealGlmMhdMultiIonEquations2D) + @unpack gammas = equations + # Unpack left and right states to get the magnetic field + B1_ll, B2_ll, B3_ll = magnetic_field(u_ll, equations) + B1_rr, B2_rr, B3_rr 
= magnetic_field(u_rr, equations) + psi_ll = divergence_cleaning_field(u_ll, equations) + psi_rr = divergence_cleaning_field(u_rr, equations) + + v1_plus_ll, v2_plus_ll, v3_plus_ll, vk1_plus_ll, vk2_plus_ll, + vk3_plus_ll = charge_averaged_velocities(u_ll, + equations) + v1_plus_rr, v2_plus_rr, v3_plus_rr, vk1_plus_rr, vk2_plus_rr, + vk3_plus_rr = charge_averaged_velocities(u_rr, + equations) + + f = zero(MVector{nvariables(equations), eltype(u_ll)}) + + # Compute averages for global variables + v1_plus_avg = 0.5f0 * (v1_plus_ll + v1_plus_rr) + v2_plus_avg = 0.5f0 * (v2_plus_ll + v2_plus_rr) + v3_plus_avg = 0.5f0 * (v3_plus_ll + v3_plus_rr) + B1_avg = 0.5f0 * (B1_ll + B1_rr) + B2_avg = 0.5f0 * (B2_ll + B2_rr) + B3_avg = 0.5f0 * (B3_ll + B3_rr) + mag_norm_ll = B1_ll^2 + B2_ll^2 + B3_ll^2 + mag_norm_rr = B1_rr^2 + B2_rr^2 + B3_rr^2 + mag_norm_avg = 0.5f0 * (mag_norm_ll + mag_norm_rr) + psi_avg = 0.5f0 * (psi_ll + psi_rr) + + psi_B1_avg = 0.5f0 * (B1_ll * psi_ll + B1_rr * psi_rr) + psi_B2_avg = 0.5f0 * (B2_ll * psi_ll + B2_rr * psi_rr) + + # Compute B_dot_n_avg and psi_B_dot_n_avg once (they are constant for all species) + B_dot_n_avg = B1_avg * normal_direction[1] + B2_avg * normal_direction[2] + psi_B_dot_n_avg = psi_B1_avg * normal_direction[1] + + psi_B2_avg * normal_direction[2] + + # Magnetic field components from f^MHD + f6 = (equations.c_h * psi_avg * normal_direction[1] + + (v2_plus_avg * B1_avg - v1_plus_avg * B2_avg) * normal_direction[2]) + f7 = ((v1_plus_avg * B2_avg - v2_plus_avg * B1_avg) * normal_direction[1] + + equations.c_h * psi_avg * normal_direction[2]) + f8 = ((v1_plus_avg * B3_avg - v3_plus_avg * B1_avg) * normal_direction[1] + + (v2_plus_avg * B3_avg - v3_plus_avg * B2_avg) * normal_direction[2]) + f9 = (equations.c_h * B1_avg * normal_direction[1] + + equations.c_h * B2_avg * normal_direction[2]) + + # Start building the flux + f[1] = f6 + f[2] = f7 + f[3] = f8 + f[end] = f9 + + # Iterate over all components + for k in 
eachcomponent(equations) + # Unpack left and right states + rho_ll, rho_v1_ll, rho_v2_ll, rho_v3_ll, rho_e_total_ll = get_component(k, u_ll, + equations) + rho_rr, rho_v1_rr, rho_v2_rr, rho_v3_rr, rho_e_total_rr = get_component(k, u_rr, + equations) + rho_inv_ll = 1 / rho_ll + v1_ll = rho_v1_ll * rho_inv_ll + v2_ll = rho_v2_ll * rho_inv_ll + v3_ll = rho_v3_ll * rho_inv_ll + rho_inv_rr = 1 / rho_rr + v1_rr = rho_v1_rr * rho_inv_rr + v2_rr = rho_v2_rr * rho_inv_rr + v3_rr = rho_v3_rr * rho_inv_rr + vel_norm_ll = v1_ll^2 + v2_ll^2 + v3_ll^2 + vel_norm_rr = v1_rr^2 + v2_rr^2 + v3_rr^2 + + p_ll = (gammas[k] - 1) * + (rho_e_total_ll - 0.5f0 * rho_ll * vel_norm_ll - 0.5f0 * mag_norm_ll - + 0.5f0 * psi_ll^2) + p_rr = (gammas[k] - 1) * + (rho_e_total_rr - 0.5f0 * rho_rr * vel_norm_rr - 0.5f0 * mag_norm_rr - + 0.5f0 * psi_rr^2) + beta_ll = 0.5f0 * rho_ll / p_ll + beta_rr = 0.5f0 * rho_rr / p_rr + # for convenience store vk_plus⋅B + vel_dot_mag_ll = vk1_plus_ll[k] * B1_ll + vk2_plus_ll[k] * B2_ll + + vk3_plus_ll[k] * B3_ll + vel_dot_mag_rr = vk1_plus_rr[k] * B1_rr + vk2_plus_rr[k] * B2_rr + + vk3_plus_rr[k] * B3_rr + + # Compute the necessary mean values needed for either direction + rho_avg = 0.5f0 * (rho_ll + rho_rr) + rho_mean = ln_mean(rho_ll, rho_rr) + beta_mean = ln_mean(beta_ll, beta_rr) + beta_avg = 0.5f0 * (beta_ll + beta_rr) + p_mean = 0.5f0 * rho_avg / beta_avg + v1_avg = 0.5f0 * (v1_ll + v1_rr) + v2_avg = 0.5f0 * (v2_ll + v2_rr) + v3_avg = 0.5f0 * (v3_ll + v3_rr) + vel_norm_avg = 0.5f0 * (vel_norm_ll + vel_norm_rr) + vel_dot_mag_avg = 0.5f0 * (vel_dot_mag_ll + vel_dot_mag_rr) + vk1_plus_avg = 0.5f0 * (vk1_plus_ll[k] + vk1_plus_rr[k]) + vk2_plus_avg = 0.5f0 * (vk2_plus_ll[k] + vk2_plus_rr[k]) + vk3_plus_avg = 0.5f0 * (vk3_plus_ll[k] + vk3_plus_rr[k]) + # v_minus + vk1_minus_ll = v1_plus_ll - vk1_plus_ll[k] + vk2_minus_ll = v2_plus_ll - vk2_plus_ll[k] + vk3_minus_ll = v3_plus_ll - vk3_plus_ll[k] + vk1_minus_rr = v1_plus_rr - vk1_plus_rr[k] + vk2_minus_rr = 
v2_plus_rr - vk2_plus_rr[k] + vk3_minus_rr = v3_plus_rr - vk3_plus_rr[k] + vk1_minus_avg = 0.5f0 * (vk1_minus_ll + vk1_minus_rr) + vk2_minus_avg = 0.5f0 * (vk2_minus_ll + vk2_minus_rr) + vk3_minus_avg = 0.5f0 * (vk3_minus_ll + vk3_minus_rr) + + # Fill the fluxes for the mass and momentum equations + f1 = rho_mean * (v1_avg * normal_direction[1] + v2_avg * normal_direction[2]) + f2 = f1 * v1_avg + p_mean * normal_direction[1] + f3 = f1 * v2_avg + p_mean * normal_direction[2] + f4 = f1 * v3_avg + + # total energy flux is complicated and involves the previous eight components + vk1_plus_mag_avg = 0.5f0 * (vk1_plus_ll[k] * mag_norm_ll + + vk1_plus_rr[k] * mag_norm_rr) + vk2_plus_mag_avg = 0.5f0 * (vk2_plus_ll[k] * mag_norm_ll + + vk2_plus_rr[k] * mag_norm_rr) + # Euler part + f5 = f1 * 0.5f0 * (1 / (gammas[k] - 1) / beta_mean - vel_norm_avg) + + f2 * v1_avg + f3 * v2_avg + f4 * v3_avg + # MHD part + f5 += (f6 * B1_avg + f7 * B2_avg + f8 * B3_avg - + 0.5f0 * vk1_plus_mag_avg * normal_direction[1] - + 0.5f0 * vk2_plus_mag_avg * normal_direction[2] + + B_dot_n_avg * vel_dot_mag_avg # Same terms as in Derigs (but with v_plus) + + f9 * psi_avg - + equations.c_h * psi_B_dot_n_avg # GLM term + + + 0.5f0 * + (vk1_plus_avg * normal_direction[1] + vk2_plus_avg * normal_direction[2]) * + mag_norm_avg - + vk1_plus_avg * B_dot_n_avg * B1_avg - + vk2_plus_avg * B_dot_n_avg * B2_avg - + vk3_plus_avg * B_dot_n_avg * B3_avg # Additional terms related to the Lorentz non-conservative term (momentum eqs) + - + B2_avg * (vk1_minus_avg * B2_avg - vk2_minus_avg * B1_avg) * + normal_direction[1] - + B3_avg * (vk1_minus_avg * B3_avg - vk3_minus_avg * B1_avg) * + normal_direction[1] - + B1_avg * (vk2_minus_avg * B1_avg - vk1_minus_avg * B2_avg) * + normal_direction[2] - + B3_avg * (vk2_minus_avg * B3_avg - vk3_minus_avg * B2_avg) * + normal_direction[2]) # Terms related to the multi-ion non-conservative term (induction equation!) 
+ + set_component!(f, k, f1, f2, f3, f4, f5, equations) + end + + return SVector(f) +end + # Calculate maximum wave speed for local Lax-Friedrichs-type dissipation # This routine approximates the maximum wave speed as sum of the maximum ion velocity # for all species and the maximum magnetosonic speed. @@ -914,6 +1341,31 @@ end return max(abs(v_ll), abs(v_rr)) + max(cf_ll, cf_rr) end +@inline function max_abs_speed_naive(u_ll, u_rr, normal_direction::AbstractVector, + equations::IdealGlmMhdMultiIonEquations2D) + # Calculate fast magnetoacoustic wave speeds + # left + cf_ll = calc_fast_wavespeed(u_ll, normal_direction, equations) + # right + cf_rr = calc_fast_wavespeed(u_rr, normal_direction, equations) + + # Calculate velocities + v_ll = zero(eltype(u_ll)) + v_rr = zero(eltype(u_rr)) + for k in eachcomponent(equations) + rho, rho_v1, rho_v2, _ = get_component(k, u_ll, equations) + v_ll = max(v_ll, + abs((rho_v1 * normal_direction[1] + rho_v2 * normal_direction[2]) / + rho)) + rho, rho_v1, rho_v2, _ = get_component(k, u_rr, equations) + v_rr = max(v_rr, + abs((rho_v1 * normal_direction[1] + rho_v2 * normal_direction[2]) / + rho)) + end + + return max(abs(v_ll), abs(v_rr)) + max(cf_ll, cf_rr) +end + # Less "cautious", i.e., less overestimating `λ_max` compared to `max_abs_speed_naive` @inline function max_abs_speed(u_ll, u_rr, orientation::Integer, equations::IdealGlmMhdMultiIonEquations2D) @@ -1005,4 +1457,45 @@ end return c_f end + +@inline function calc_fast_wavespeed(cons, normal_direction::AbstractVector, + equations::IdealGlmMhdMultiIonEquations2D) + B1, B2, B3 = magnetic_field(cons, equations) + psi = divergence_cleaning_field(cons, equations) + + norm_squared = (normal_direction[1]^2 + normal_direction[2]^2) + + c_f = zero(real(equations)) + for k in eachcomponent(equations) + rho, rho_v1, rho_v2, rho_v3, rho_e_total = get_component(k, cons, equations) + + rho_inv = 1 / rho + v1 = rho_v1 * rho_inv + v2 = rho_v2 * rho_inv + v3 = rho_v3 * rho_inv + gamma = 
equations.gammas[k] + p = (gamma - 1) * + (rho_e_total - 0.5f0 * rho * (v1^2 + v2^2 + v3^2) - + 0.5f0 * (B1^2 + B2^2 + B3^2) - + 0.5f0 * psi^2) + a_square = gamma * p * rho_inv + inv_sqrt_rho = 1 / sqrt(rho) + + b1 = B1 * inv_sqrt_rho + b2 = B2 * inv_sqrt_rho + b3 = B3 * inv_sqrt_rho + b_square = b1^2 + b2^2 + b3^2 + # Properly normalize the magnetic field projection onto the unit normal + b_dot_n_squared = (b1 * normal_direction[1] + b2 * normal_direction[2])^2 / + norm_squared + + c_f = max(c_f, + sqrt((0.5f0 * (a_square + b_square) + + 0.5f0 * + sqrt((a_square + b_square)^2 - 4 * a_square * b_dot_n_squared)) * + norm_squared)) + end + + return c_f +end end # @muladd diff --git a/src/equations/laplace_diffusion_2d.jl b/src/equations/laplace_diffusion_2d.jl index 3741116f1bb..c243f3de364 100644 --- a/src/equations/laplace_diffusion_2d.jl +++ b/src/equations/laplace_diffusion_2d.jl @@ -32,7 +32,7 @@ end # The penalization depends on the solver, but also depends explicitly on physical parameters, # and would probably need to be specialized for every different equation. function penalty(u_outer, u_inner, inv_h, equations_parabolic::LaplaceDiffusion2D, - dg::ViscousFormulationLocalDG) + dg::ParabolicFormulationLocalDG) return dg.penalty_parameter * (u_outer - u_inner) * equations_parabolic.diffusivity end diff --git a/src/equations/laplace_diffusion_3d.jl b/src/equations/laplace_diffusion_3d.jl index c2e3c49afee..ec3957676a2 100644 --- a/src/equations/laplace_diffusion_3d.jl +++ b/src/equations/laplace_diffusion_3d.jl @@ -35,7 +35,7 @@ end # The penalization depends on the solver, but also depends explicitly on physical parameters, # and would probably need to be specialized for every different equation. 
function penalty(u_outer, u_inner, inv_h, equations_parabolic::LaplaceDiffusion3D, - dg::ViscousFormulationLocalDG) + dg::ParabolicFormulationLocalDG) return dg.penalty_parameter * (u_outer - u_inner) * equations_parabolic.diffusivity end diff --git a/src/meshes/meshes.jl b/src/meshes/meshes.jl index 69a8ea79ffa..b1c1b6c918e 100644 --- a/src/meshes/meshes.jl +++ b/src/meshes/meshes.jl @@ -5,6 +5,8 @@ @muladd begin #! format: noindent +@inline Base.ndims(::Type{<:AbstractMesh{NDIMS}}) where {NDIMS} = NDIMS + include("tree_mesh.jl") include("structured_mesh.jl") include("structured_mesh_view.jl") diff --git a/src/meshes/p4est_mesh_view.jl b/src/meshes/p4est_mesh_view.jl index 208ce7250f9..4f0bce4c69b 100644 --- a/src/meshes/p4est_mesh_view.jl +++ b/src/meshes/p4est_mesh_view.jl @@ -35,8 +35,11 @@ function P4estMeshView(parent::P4estMesh{NDIMS, NDIMS_AMBIENT, RealT}, end @inline Base.ndims(::P4estMeshView{NDIMS}) where {NDIMS} = NDIMS -@inline Base.real(::P4estMeshView{NDIMS, NDIMS_AMBIENT, RealT}) where {NDIMS, NDIMS_AMBIENT, RealT} = RealT +@inline Base.real(::P4estMeshView{NDIMS, NDIMS_AMBIENT, RealT}) where {NDIMS, +NDIMS_AMBIENT, +RealT} = RealT +# Extract interfaces, boundaries and parent element ids from the neighbors. function extract_p4est_mesh_view(elements_parent, interfaces_parent, boundaries_parent, @@ -60,20 +63,32 @@ function extract_p4est_mesh_view(elements_parent, mesh.cell_ids] @views elements.surface_flux_values .= elements_parent.surface_flux_values[.., mesh.cell_ids] - # Extract interfaces that belong to mesh view + # Extract interfaces that belong to mesh view. interfaces = extract_interfaces(mesh, interfaces_parent) - return elements, interfaces, boundaries_parent, mortars_parent + # Extract boundaries of this mesh view. + boundaries = extract_boundaries(mesh, boundaries_parent, interfaces_parent, + interfaces) + + # Get the parent element ids of the neighbors. 
+ neighbor_ids_parent = extract_neighbor_ids_parent(mesh, boundaries_parent, + interfaces_parent, + boundaries) + + return elements, interfaces, boundaries, mortars_parent, neighbor_ids_parent end # Remove all interfaces that have a tuple of neighbor_ids where at least one is -# not part of this meshview, i.e. mesh.cell_ids, and return the new interface container +# not part of this mesh view, i.e. mesh.cell_ids, and return the new interface container. function extract_interfaces(mesh::P4estMeshView, interfaces_parent) # Identify interfaces that need to be retained mask = BitArray(undef, ninterfaces(interfaces_parent)) + # Loop over all interfaces (index 2). for interface in 1:size(interfaces_parent.neighbor_ids)[2] - mask[interface] = (interfaces_parent.neighbor_ids[1, interface] in mesh.cell_ids) && - (interfaces_parent.neighbor_ids[2, interface] in mesh.cell_ids) + mask[interface] = (interfaces_parent.neighbor_ids[1, + interface] in mesh.cell_ids) && + (interfaces_parent.neighbor_ids[2, + interface] in mesh.cell_ids) end # Create deepcopy to get completely independent interfaces container @@ -85,39 +100,208 @@ function extract_interfaces(mesh::P4estMeshView, interfaces_parent) @views interfaces.node_indices .= interfaces_parent.node_indices[.., mask] @views neighbor_ids = interfaces_parent.neighbor_ids[.., mask] - # Transform the global (parent) indices into local (view) indices. + # Transform the parent indices into view indices. interfaces.neighbor_ids = zeros(Int, size(neighbor_ids)) for interface in 1:size(neighbor_ids)[2] interfaces.neighbor_ids[1, interface] = findall(id -> id == - neighbor_ids[1, interface], + neighbor_ids[1, + interface], mesh.cell_ids)[1] interfaces.neighbor_ids[2, interface] = findall(id -> id == - neighbor_ids[2, interface], + neighbor_ids[2, + interface], mesh.cell_ids)[1] end return interfaces end +# Remove all boundaries that are not part of this p4est mesh view and add new boundaries +# that were interfaces of the parent mesh. 
+function extract_boundaries(mesh::P4estMeshView{2}, + boundaries_parent, interfaces_parent, + interfaces) + # Remove all boundaries that are not part of this p4est mesh view. + boundaries = deepcopy(boundaries_parent) + mask = BitArray(undef, nboundaries(boundaries_parent)) + for boundary in 1:nboundaries(boundaries_parent) + mask[boundary] = boundaries_parent.neighbor_ids[boundary] in mesh.cell_ids + end + boundaries.neighbor_ids = parent_cell_id_to_view(boundaries_parent.neighbor_ids[mask], + mesh) + boundaries.name = boundaries_parent.name[mask] + boundaries.node_indices = boundaries_parent.node_indices[mask] + + # Add new boundaries that were interfaces of the parent mesh. + # Loop over all interfaces (index 2). + for interface in 1:ninterfaces(interfaces_parent) + # Create new boundary if exactly one of the neighbor cells is in the mesh view ("exclusive or" with ⊻) + if ((interfaces_parent.neighbor_ids[1, interface] in mesh.cell_ids) ⊻ + (interfaces_parent.neighbor_ids[2, interface] in mesh.cell_ids)) + # Determine which of the ids is part of the mesh view. + if interfaces_parent.neighbor_ids[1, interface] in mesh.cell_ids + neighbor_id = interfaces_parent.neighbor_ids[1, interface] + view_idx = 1 + else + neighbor_id = interfaces_parent.neighbor_ids[2, interface] + view_idx = 2 + end + + # Update the neighbor ids. + push!(boundaries.neighbor_ids, + parent_cell_id_to_view(neighbor_id, mesh)) + # Update the boundary names to reflect where the neighboring cell is + # relative to this one, i.e. left, right, up, down. + # In 3d one would need to add the third dimension. 
+ if (interfaces_parent.node_indices[view_idx, interface] == + (:end, :i_forward)) + push!(boundaries.name, :x_pos) + elseif (interfaces_parent.node_indices[view_idx, interface] == + (:begin, :i_forward)) + push!(boundaries.name, :x_neg) + elseif (interfaces_parent.node_indices[view_idx, interface] == + (:i_forward, :end)) + push!(boundaries.name, :y_pos) + else + push!(boundaries.name, :y_neg) + end + + # Update the node indices. + push!(boundaries.node_indices, + interfaces_parent.node_indices[view_idx, interface]) + end + end + + # Create the boundary vector for u, which will be populated later. + n_dims = ndims(boundaries) + n_nodes = size(boundaries.u, 2) + n_variables = size(boundaries.u, 1) + capacity = length(boundaries.neighbor_ids) + + resize!(boundaries._u, n_variables * n_nodes^(n_dims - 1) * capacity) + boundaries.u = unsafe_wrap(Array, pointer(boundaries._u), + (n_variables, ntuple(_ -> n_nodes, n_dims - 1)..., + capacity)) + + return boundaries +end + +# Extract the ids of the neighboring elements using the parent mesh indexing. +# For every boundary of the mesh view find the neighboring cell id in global (parent) indexing. +# Such neighboring cells are either inside the domain and have an interface +# in the parent mesh, or they are physical boundaries for which we then +# construct a periodic coupling by assigning as neighbor id the cell id +# on the other end of the domain. +function extract_neighbor_ids_parent(mesh::P4estMeshView, + boundaries_parent, interfaces_parent, + boundaries) + # Determine the parent indices of the neighboring elements. + neighbor_ids_parent = similar(boundaries.neighbor_ids) + for (idx, id) in enumerate(boundaries.neighbor_ids) + parent_id = mesh.cell_ids[id] + # Find this id in the parent's interfaces. 
+ for interface in eachindex(interfaces_parent.neighbor_ids[1, :]) + if (parent_id == interfaces_parent.neighbor_ids[1, interface] || + parent_id == interfaces_parent.neighbor_ids[2, interface]) + if parent_id == interfaces_parent.neighbor_ids[1, interface] + matching_boundary = 1 + else + matching_boundary = 2 + end + # Check if interfaces with this id have the right name/node_indices. + if (boundaries.name[idx] == + node_indices_to_name(interfaces_parent.node_indices[matching_boundary, + interface])) + if parent_id == interfaces_parent.neighbor_ids[1, interface] + neighbor_ids_parent[idx] = interfaces_parent.neighbor_ids[2, + interface] + else + neighbor_ids_parent[idx] = interfaces_parent.neighbor_ids[1, + interface] + end + end + end + end + + # Find this id in the parent's boundaries. + parent_xneg_cell_ids = boundaries_parent.neighbor_ids[boundaries_parent.name .== :x_neg] + parent_xpos_cell_ids = boundaries_parent.neighbor_ids[boundaries_parent.name .== :x_pos] + parent_yneg_cell_ids = boundaries_parent.neighbor_ids[boundaries_parent.name .== :y_neg] + parent_ypos_cell_ids = boundaries_parent.neighbor_ids[boundaries_parent.name .== :y_pos] + for (parent_idx, boundary) in enumerate(boundaries_parent.neighbor_ids) + if parent_id == boundary + # Check if boundaries with this id have the right name/node_indices. + if boundaries.name[idx] == boundaries_parent.name[parent_idx] + # Make the coupling periodic. 
+ if boundaries_parent.name[parent_idx] == :x_neg + neighbor_ids_parent[idx] = parent_xpos_cell_ids[findfirst(parent_xneg_cell_ids .== + boundary)] + elseif boundaries_parent.name[parent_idx] == :x_pos + neighbor_ids_parent[idx] = parent_xneg_cell_ids[findfirst(parent_xpos_cell_ids .== + boundary)] + elseif boundaries_parent.name[parent_idx] == :y_neg + neighbor_ids_parent[idx] = parent_ypos_cell_ids[findfirst(parent_yneg_cell_ids .== + boundary)] + elseif boundaries_parent.name[parent_idx] == :y_pos + neighbor_ids_parent[idx] = parent_yneg_cell_ids[findfirst(parent_ypos_cell_ids .== + boundary)] + else + error("Unknown boundary name: $(boundaries_parent.name[parent_idx])") + end + end + end + end + end + + return neighbor_ids_parent +end + +# Translate the interface indices into boundary names. +# This works only in 2d currently. +function node_indices_to_name(node_index) + if node_index == (:end, :i_forward) + return :x_pos + elseif node_index == (:begin, :i_forward) + return :x_neg + elseif node_index == (:i_forward, :end) + return :y_pos + elseif node_index == (:i_forward, :begin) + return :y_neg + else + error("Unknown node index: $node_index") + end +end + +# Convert a parent cell id to a view cell id in the mesh view. +function parent_cell_id_to_view(id::Integer, mesh::P4estMeshView) + # Find the index of the cell id in the mesh view + view_id = searchsortedfirst(mesh.cell_ids, id) + + return view_id +end + +# Convert an array of parent cell ids to view cell ids in the mesh view. +function parent_cell_id_to_view(ids::AbstractArray, mesh::P4estMeshView) + # Find the index of the cell id in the mesh view + view_id = zeros(Int, length(ids)) + for i in eachindex(ids) + view_id[i] = parent_cell_id_to_view(ids[i], mesh) + end + return view_id +end + # Does not save the mesh itself to an HDF5 file. Instead saves important attributes # of the mesh, like its size and the type of boundary mapping function. 
# Then, within Trixi2Vtk, the P4estMeshView and its node coordinates are reconstructured from # these attributes for plotting purposes # | Warning: This overwrites any existing mesh file, either for a mesh view or parent mesh. -function save_mesh_file(mesh::P4estMeshView, output_directory, timestep, - mpi_parallel::False) +function save_mesh_file(mesh::P4estMeshView, output_directory; system = "", + timestep = 0) # Create output directory (if it does not exist) mkpath(output_directory) - # Determine file name based on existence of meaningful time step - if timestep > 0 - filename = joinpath(output_directory, @sprintf("mesh_%09d.h5", timestep)) - p4est_filename = @sprintf("p4est_data_%09d", timestep) - else - filename = joinpath(output_directory, "mesh.h5") - p4est_filename = "p4est_data" - end - + filename = joinpath(output_directory, "mesh.h5") + p4est_filename = "p4est_data" p4est_file = joinpath(output_directory, p4est_filename) # Save the complete connectivity and `p4est` data to disk. 
diff --git a/src/semidiscretization/semidiscretization_coupled.jl b/src/semidiscretization/semidiscretization_coupled.jl index aad8422afc4..dc86a8bac84 100644 --- a/src/semidiscretization/semidiscretization_coupled.jl +++ b/src/semidiscretization/semidiscretization_coupled.jl @@ -357,11 +357,12 @@ end ### StepsizeCallback ################################################################################ # In case of coupled system, use minimum timestep over all systems -function calculate_dt(u_ode, t, cfl_advective, cfl_diffusive, +function calculate_dt(u_ode, t, cfl_hyperbolic, cfl_parabolic, semi::SemidiscretizationCoupled) dt = minimum(eachsystem(semi)) do i u_ode_slice = get_system_u_ode(u_ode, i, semi) - return calculate_dt(u_ode_slice, t, cfl_advective, cfl_diffusive, semi.semis[i]) + return calculate_dt(u_ode_slice, t, cfl_hyperbolic, cfl_parabolic, + semi.semis[i]) end return dt @@ -646,7 +647,7 @@ end ### DGSEM/structured ################################################################################ -@inline function calc_boundary_flux_by_direction!(surface_flux_values, u, t, +@inline function calc_boundary_flux_by_direction!(surface_flux_values, t, orientation, boundary_condition::BoundaryConditionCoupled, mesh::Union{StructuredMesh, @@ -656,12 +657,15 @@ end surface_integral, dg::DG, cache, direction, node_indices, surface_node_indices, element) - @unpack node_coordinates, contravariant_vectors, inverse_jacobian = cache.elements + @unpack node_coordinates, contravariant_vectors, inverse_jacobian, interfaces_u = cache.elements + # Boundary values are for `StructuredMesh` stored in the interface datastructure + boundaries_u = interfaces_u @unpack surface_flux = surface_integral cell_indices = get_boundary_indices(element, orientation, mesh) - u_inner = get_node_vars(u, equations, dg, node_indices..., element) + u_inner = get_node_vars(boundaries_u, equations, dg, surface_node_indices..., + direction, element) # If the mapping is orientation-reversing, the 
contravariant vectors' orientation # is reversed as well. The normal vector must be oriented in the direction @@ -686,7 +690,7 @@ end return nothing end -@inline function calc_boundary_flux_by_direction!(surface_flux_values, u, t, +@inline function calc_boundary_flux_by_direction!(surface_flux_values, t, orientation, boundary_condition::BoundaryConditionCoupled, mesh::Union{StructuredMesh, @@ -696,12 +700,15 @@ end surface_integral, dg::DG, cache, direction, node_indices, surface_node_indices, element) - @unpack node_coordinates, contravariant_vectors, inverse_jacobian = cache.elements + @unpack node_coordinates, contravariant_vectors, inverse_jacobian, interfaces_u = cache.elements + # Boundary values are for `StructuredMesh` stored in the interface datastructure + boundaries_u = interfaces_u @unpack surface_flux = surface_integral cell_indices = get_boundary_indices(element, orientation, mesh) - u_inner = get_node_vars(u, equations, dg, node_indices..., element) + u_inner = get_node_vars(boundaries_u, equations, dg, surface_node_indices..., + direction, element) # If the mapping is orientation-reversing, the contravariant vectors' orientation # is reversed as well. The normal vector must be oriented in the direction diff --git a/src/semidiscretization/semidiscretization_coupled_p4est.jl b/src/semidiscretization/semidiscretization_coupled_p4est.jl new file mode 100644 index 00000000000..9de383d5b1c --- /dev/null +++ b/src/semidiscretization/semidiscretization_coupled_p4est.jl @@ -0,0 +1,500 @@ +# By default, Julia/LLVM does not use fused multiply-add operations (FMAs). +# Since these FMAs can increase the performance of many numerical algorithms, +# we need to opt-in explicitly. +# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. +@muladd begin +#! format: noindent + +""" + SemidiscretizationCoupledP4est + +Specialized semidiscretization routines for coupled problems using P4est mesh views. 
+This is analogous to the implementation for structured meshes.
+[`semidiscretize`](@ref) will return an `ODEProblem` that synchronizes time steps between the semidiscretizations.
+Each call of `rhs!` will call `rhs!` for each semidiscretization individually.
+The semidiscretizations can be coupled by gluing meshes together using [`BoundaryConditionCoupled`](@ref).
+
+See also: [`SemidiscretizationCoupled`](@ref)
+
+!!! warning "Experimental code"
+    This is an experimental feature and can change any time.
+"""
+mutable struct SemidiscretizationCoupledP4est{Semis, Indices, EquationList} <:
+               AbstractSemidiscretization
+    semis::Semis
+    u_indices::Indices # u_ode[u_indices[i]] is the part of u_ode corresponding to semis[i]
+    performance_counter::PerformanceCounter
+    parent_cell_ids::Vector{Int}
+    view_cell_ids::Vector{Int}
+    mesh_ids::Vector{Int}
+end
+
+"""
+    SemidiscretizationCoupledP4est(semis...)
+
+Create a coupled semidiscretization that consists of the semidiscretizations passed as arguments.
+"""
+function SemidiscretizationCoupledP4est(semis...)
+    @assert all(semi -> ndims(semi) == ndims(semis[1]), semis) "All semidiscretizations must have the same dimension!"
+
+    # Number of coefficients for each semidiscretization
+    n_coefficients = zeros(Int, length(semis))
+    for i in 1:length(semis)
+        _, equations, _, _ = mesh_equations_solver_cache(semis[i])
+        n_coefficients[i] = ndofs(semis[i]) * nvariables(equations)
+    end
+
+    # Compute range of coefficients associated with each semidiscretization
+    u_indices = Vector{UnitRange{Int}}(undef, length(semis))
+    for i in 1:length(semis)
+        offset = sum(n_coefficients[1:(i - 1)]) + 1
+        u_indices[i] = range(offset, length = n_coefficients[i])
+    end
+
+    # Create correspondence between parent mesh cell IDs and view cell IDs.
+ parent_cell_ids = 1:size(semis[1].mesh.parent.tree_node_coordinates)[end] + view_cell_ids = zeros(Int, length(parent_cell_ids)) + mesh_ids = zeros(Int, length(parent_cell_ids)) + for i in eachindex(semis) + view_cell_ids[semis[i].mesh.cell_ids] = parent_cell_id_to_view(parent_cell_ids[semis[i].mesh.cell_ids], + semis[i].mesh) + mesh_ids[semis[i].mesh.cell_ids] .= i + end + + performance_counter = PerformanceCounter() + + SemidiscretizationCoupledP4est{typeof(semis), typeof(u_indices), + typeof(performance_counter)}(semis, u_indices, + performance_counter, + parent_cell_ids, + view_cell_ids, + mesh_ids) +end + +function Base.show(io::IO, ::MIME"text/plain", semi::SemidiscretizationCoupledP4est) + @nospecialize semi # reduce precompilation time + + if get(io, :compact, false) + show(io, semi) + else + summary_header(io, "SemidiscretizationCoupledP4est") + summary_line(io, "#spatial dimensions", ndims(semi.semis[1])) + summary_line(io, "#systems", nsystems(semi)) + for i in eachsystem(semi) + summary_line(io, "system", i) + mesh, equations, solver, _ = mesh_equations_solver_cache(semi.semis[i]) + summary_line(increment_indent(io), "mesh", mesh |> typeof |> nameof) + summary_line(increment_indent(io), "equations", + equations |> typeof |> nameof) + summary_line(increment_indent(io), "initial condition", + semi.semis[i].initial_condition) + # no boundary conditions since that could be too much + summary_line(increment_indent(io), "source terms", + semi.semis[i].source_terms) + summary_line(increment_indent(io), "solver", solver |> typeof |> nameof) + end + summary_line(io, "total #DOFs per field", ndofsglobal(semi)) + summary_footer(io) + end +end + +function print_summary_semidiscretization(io::IO, semi::SemidiscretizationCoupledP4est) + show(io, MIME"text/plain"(), semi) + println(io, "\n") + for i in eachsystem(semi) + mesh, equations, solver, _ = mesh_equations_solver_cache(semi.semis[i]) + summary_header(io, "System #$i") + + summary_line(io, "mesh", mesh |> 
typeof |> nameof) + show(increment_indent(io), MIME"text/plain"(), mesh) + + summary_line(io, "equations", equations |> typeof |> nameof) + show(increment_indent(io), MIME"text/plain"(), equations) + + summary_line(io, "solver", solver |> typeof |> nameof) + show(increment_indent(io), MIME"text/plain"(), solver) + + summary_footer(io) + println(io, "\n") + end +end + +@inline nsystems(semi::SemidiscretizationCoupledP4est) = length(semi.semis) + +@inline eachsystem(semi::SemidiscretizationCoupledP4est) = Base.OneTo(nsystems(semi)) + +@inline Base.real(semi::SemidiscretizationCoupledP4est) = promote_type(real.(semi.semis)...) + +@inline function ndofs(semi::SemidiscretizationCoupledP4est) + return sum(ndofs, semi.semis) +end + +""" + ndofsglobal(semi::SemidiscretizationCoupledP4est) + +Return the global number of degrees of freedom associated with each scalar variable across all MPI ranks, and summed up over all coupled systems. +This is the same as [`ndofs`](@ref) for simulations running in serial or +parallelized via threads. It will in general be different for simulations +running in parallel with MPI. +""" +@inline function ndofsglobal(semi::SemidiscretizationCoupledP4est) + return sum(ndofsglobal, semi.semis) +end + +function compute_coefficients(t, semi::SemidiscretizationCoupledP4est) + @unpack u_indices = semi + + u_ode = Vector{real(semi)}(undef, u_indices[end][end]) + + # Distribute the partial solution vectors onto the global one. + @threaded for i in eachsystem(semi) + # Call `compute_coefficients` in `src/semidiscretization/semidiscretization.jl` + u_ode[u_indices[i]] .= compute_coefficients(t, semi.semis[i]) + end + + return u_ode +end + +@inline function get_system_u_ode(u_ode, index, semi::SemidiscretizationCoupledP4est) + return @view u_ode[semi.u_indices[index]] +end + +# RHS call for the coupled system. 
+function rhs!(du_ode, u_ode, semi::SemidiscretizationCoupledP4est, t) + time_start = time_ns() + + n_nodes = length(semi.semis[1].mesh.parent.nodes) + # Reformat the parent solutions vector. + u_ode_reformatted = Vector{real(semi)}(undef, ndofs(semi)) + u_ode_reformatted_reshape = reshape(u_ode_reformatted, + (n_nodes, + n_nodes, + length(semi.mesh_ids))) + # Extract the parent solution vector from the local solutions. + foreach_enumerate(semi.semis) do (i, semi_) + system_ode = get_system_u_ode(u_ode, i, semi) + system_ode_reshape = reshape(system_ode, + (n_nodes, n_nodes, + Int(length(system_ode) / + n_nodes^ndims(semi_.mesh)))) + u_ode_reformatted_reshape[:, :, semi.mesh_ids .== i] .= system_ode_reshape + end + + # Call rhs! for each semidiscretization + foreach_enumerate(semi.semis) do (i, semi_) + u_loc = get_system_u_ode(u_ode, i, semi) + du_loc = get_system_u_ode(du_ode, i, semi) + rhs!(du_loc, u_loc, u_ode_reformatted, semi, semi_, t) + end + + runtime = time_ns() - time_start + put!(semi.performance_counter, runtime) + + return nothing +end + +# RHS call for the local system. +# Here we require the data from u_parent for each semidiscretization in order +# to exchange the correct boundary values. +function rhs!(du_ode, u_ode, u_parent, semis, + semi::SemidiscretizationHyperbolic, t) + @unpack mesh, equations, boundary_conditions, source_terms, solver, cache = semi + + u = wrap_array(u_ode, mesh, equations, solver, cache) + du = wrap_array(du_ode, mesh, equations, solver, cache) + + time_start = time_ns() + @trixi_timeit timer() "rhs!" 
rhs!(du, u, t, u_parent, semis, mesh, equations,
+                                    boundary_conditions, source_terms, solver, cache)
+    runtime = time_ns() - time_start
+    put!(semi.performance_counter, runtime)
+
+    return nothing
+end
+
+################################################################################
+### AnalysisCallback
+################################################################################
+
+"""
+    AnalysisCallbackCoupledP4est(semi, callbacks...)
+
+Combine multiple analysis callbacks for coupled simulations with a
+[`SemidiscretizationCoupled`](@ref). For each coupled system, an individual
+[`AnalysisCallback`](@ref) **must** be created and passed to the `AnalysisCallbackCoupledP4est` **in
+order**, i.e., in the same sequence as the individual semidiscretizations are stored in the
+`SemidiscretizationCoupled`.
+
+!!! warning "Experimental code"
+    This is an experimental feature and can change any time.
+"""
+struct AnalysisCallbackCoupledP4est{CB}
+    callbacks::CB
+end
+
+# Convenience constructor for the coupled callback that gets called directly from the elixirs
+function AnalysisCallbackCoupledP4est(semi_coupled, callbacks...)
+    if length(callbacks) != nsystems(semi_coupled)
+        error("an AnalysisCallbackCoupledP4est requires one AnalysisCallback for each semidiscretization")
+    end
+
+    analysis_callback_coupled = AnalysisCallbackCoupledP4est{typeof(callbacks)}(callbacks)
+
+    # This callback is triggered if any of its subsidiary callbacks' condition is triggered
+    condition = (u, t, integrator) -> any(callbacks) do callback
+        callback.condition(u, t, integrator)
+    end
+
+    DiscreteCallback(condition, analysis_callback_coupled,
+                     save_positions = (false, false),
+                     initialize = initialize!)
+end
+
+# used for error checks and EOC analysis
+function (cb::DiscreteCallback{Condition, Affect!})(sol) where {Condition,
+                                                                Affect! <:
+                                                                AnalysisCallbackCoupledP4est
+                                                                }
+    semi_coupled = sol.prob.p
+    u_ode_coupled = sol.u[end]
+    @unpack callbacks = cb.affect!
+ + uEltype = real(semi_coupled) + n_vars_upto_semi = cumsum(nvariables(semi_coupled.semis[i].equations) + for i in eachindex(semi_coupled.semis))[begin:end] + error_indices = Array([1, 1 .+ n_vars_upto_semi...]) + length_error_array = sum(nvariables(semi_coupled.semis[i].equations) + for i in eachindex(semi_coupled.semis)) + l2_error_collection = uEltype[] + linf_error_collection = uEltype[] + for i in eachsystem(semi_coupled) + analysis_callback = callbacks[i].affect! + @unpack analyzer = analysis_callback + cache_analysis = analysis_callback.cache + + semi = semi_coupled.semis[i] + u_ode = get_system_u_ode(u_ode_coupled, i, semi_coupled) + + l2_error, + linf_error = calc_error_norms(u_ode, sol.t[end], analyzer, semi, + cache_analysis) + append!(l2_error_collection, l2_error) + append!(linf_error_collection, linf_error) + end + + return (; l2 = l2_error_collection, linf = linf_error_collection) +end + +################################################################################ +### SaveSolutionCallback +################################################################################ + +# Save mesh for a coupled semidiscretization, which contains multiple meshes internally +function save_mesh(semi::SemidiscretizationCoupledP4est, output_directory, timestep = 0) + for i in eachsystem(semi) + mesh, _, _, _ = mesh_equations_solver_cache(semi.semis[i]) + + if mesh.unsaved_changes + mesh.current_filename = save_mesh_file(mesh, output_directory; + system = string(i), + timestep = timestep) + mesh.unsaved_changes = false + end + end + return nothing +end + +@inline function save_solution_file(semi::SemidiscretizationCoupledP4est, u_ode, + solution_callback, + integrator) + @unpack semis = semi + + for i in eachsystem(semi) + u_ode_slice = get_system_u_ode(u_ode, i, semi) + save_solution_file(semis[i], u_ode_slice, solution_callback, integrator, + system = i) + end + return nothing +end + 
+################################################################################ +### StepsizeCallback +################################################################################ + +# In case of coupled system, use minimum timestep over all systems +# Case for constant `cfl_number`. +function calculate_dt(u_ode, t, cfl_hyperbolic, cfl_parabolic, + semi::SemidiscretizationCoupledP4est) + dt = minimum(eachsystem(semi)) do i + u_ode_slice = get_system_u_ode(u_ode, i, semi) + calculate_dt(u_ode_slice, t, cfl_hyperbolic, cfl_parabolic, semi.semis[i]) + end + + return dt +end + +################################################################################ +### Boundary conditions +################################################################################ + +""" + BoundaryConditionCoupledP4est(coupling_converter) + +Boundary condition struct where the user can specify the coupling converter function. + +# Arguments +- `coupling_converter::CouplingConverter`: function to call for converting the solution + state of one system to the other system +""" +mutable struct BoundaryConditionCoupledP4est{CouplingConverter} + coupling_converter::CouplingConverter + + function BoundaryConditionCoupledP4est(coupling_converter) + new{typeof(coupling_converter)}(coupling_converter) + end +end + +""" +Extract the boundary values from the neighboring element. +This requires values from other mesh views. +This currently only works for Cartesian meshes. +""" +function (boundary_condition::BoundaryConditionCoupledP4est)(u_inner, mesh, equations, + cache, + i_index, j_index, + element_index, + normal_direction, + surface_flux_function, + direction, + u_ode_coupled) + n_nodes = length(mesh.parent.nodes) + # Using a projection onto e_x, -e_x, e_y, -e_y to determine which way our boundary interfaces points to. + # Knowing this, we then find the cell index in the global (parent) space of the neighboring cell. 
+ if abs(sum(normal_direction .* (1.0, 0.0))) > + abs(sum(normal_direction .* (0.0, 1.0))) + if sum(normal_direction .* (1.0, 0.0)) > + sum(normal_direction .* (-1.0, 0.0)) + cell_index_parent = cache.neighbor_ids_parent[findfirst((cache.boundaries.name .== + :x_pos) .* + (cache.boundaries.neighbor_ids .== + element_index))] + else + cell_index_parent = cache.neighbor_ids_parent[findfirst((cache.boundaries.name .== + :x_neg) .* + (cache.boundaries.neighbor_ids .== + element_index))] + end + i_index_g = i_index + # Make sure we do not leave the domain. + if i_index == n_nodes + i_index_g = 1 + elseif i_index == 1 + i_index_g = n_nodes + end + j_index_g = j_index + else + if sum(normal_direction .* (0.0, 1.0)) > sum(normal_direction .* (0.0, -1.0)) + cell_index_parent = cache.neighbor_ids_parent[findfirst((cache.boundaries.name .== + :y_pos) .* + (cache.boundaries.neighbor_ids .== + element_index))] + else + cell_index_parent = cache.neighbor_ids_parent[findfirst((cache.boundaries.name .== + :y_neg) .* + (cache.boundaries.neighbor_ids .== + element_index))] + end + j_index_g = j_index + # Make sure we do not leave the domain. + if j_index == n_nodes + j_index_g = 1 + elseif j_index == 1 + j_index_g = n_nodes + end + i_index_g = i_index + end + # Perform integer division to get the right shape of the array. 
+ u_parent_reshape = reshape(u_ode_coupled, + (n_nodes, n_nodes, + length(u_ode_coupled) ÷ n_nodes^ndims(mesh.parent))) + u_boundary = SVector(u_parent_reshape[i_index_g, j_index_g, cell_index_parent]) + + # u_boundary = u_inner + orientation = normal_direction + + # Calculate boundary flux + flux = surface_flux_function(u_inner, u_boundary, orientation, equations) + + return flux +end + +function calc_boundary_flux!(cache, t, boundary_condition::BC, boundary_indexing, + mesh::P4estMeshView{2}, + equations, surface_integral, dg::DG, u_parent) where {BC} + @unpack boundaries = cache + @unpack surface_flux_values = cache.elements + index_range = eachnode(dg) + + @threaded for local_index in eachindex(boundary_indexing) + # Use the local index to get the global boundary index from the pre-sorted list + boundary = boundary_indexing[local_index] + + # Get information on the adjacent element, compute the surface fluxes, + # and store them + element = boundaries.neighbor_ids[boundary] + node_indices = boundaries.node_indices[boundary] + direction = indices2direction(node_indices) + + i_node_start, i_node_step = index_to_start_step_2d(node_indices[1], index_range) + j_node_start, j_node_step = index_to_start_step_2d(node_indices[2], index_range) + + i_node = i_node_start + j_node = j_node_start + for node in eachnode(dg) + calc_boundary_flux!(surface_flux_values, t, boundary_condition, + mesh, have_nonconservative_terms(equations), + equations, surface_integral, dg, cache, + i_node, j_node, + node, direction, element, boundary, + u_parent) + + i_node += i_node_step + j_node += j_node_step + end + end + return nothing +end + +# Iterate over tuples of boundary condition types and associated indices +# in a type-stable way using "lispy tuple programming". 
+function calc_boundary_flux_by_type!(cache, t, BCs::NTuple{N, Any}, + BC_indices::NTuple{N, Vector{Int}}, + mesh::P4estMeshView, + equations, surface_integral, dg::DG, + u_parent) where {N} + # Extract the boundary condition type and index vector + boundary_condition = first(BCs) + boundary_condition_indices = first(BC_indices) + # Extract the remaining types and indices to be processed later + remaining_boundary_conditions = Base.tail(BCs) + remaining_boundary_condition_indices = Base.tail(BC_indices) + + # process the first boundary condition type + calc_boundary_flux!(cache, t, boundary_condition, boundary_condition_indices, + mesh, equations, surface_integral, dg, u_parent) + + # recursively call this method with the unprocessed boundary types + calc_boundary_flux_by_type!(cache, t, remaining_boundary_conditions, + remaining_boundary_condition_indices, + mesh, equations, surface_integral, dg, u_parent) + + return nothing +end + +# terminate the type-stable iteration over tuples +function calc_boundary_flux_by_type!(cache, t, BCs::Tuple{}, BC_indices::Tuple{}, + mesh::P4estMeshView, + equations, surface_integral, dg::DG, u_parent) + return nothing +end +end # @muladd diff --git a/src/semidiscretization/semidiscretization_hyperbolic.jl b/src/semidiscretization/semidiscretization_hyperbolic.jl index cd20fbf8e3f..afdaf93d520 100644 --- a/src/semidiscretization/semidiscretization_hyperbolic.jl +++ b/src/semidiscretization/semidiscretization_hyperbolic.jl @@ -215,7 +215,8 @@ end # Allow NamedTuple for P4estMesh, UnstructuredMesh2D, and T8codeMesh # define in two functions to resolve ambiguities function digest_boundary_conditions(boundary_conditions::NamedTuple, - mesh::Union{P4estMesh{2}, UnstructuredMesh2D, + mesh::Union{P4estMesh{2}, P4estMeshView{2}, + UnstructuredMesh2D, T8codeMesh{2}}, solver, cache) return UnstructuredSortedBoundaryTypes(boundary_conditions, cache) @@ -228,7 +229,8 @@ function digest_boundary_conditions(boundary_conditions::NamedTuple, end 
function digest_boundary_conditions(boundary_conditions::UnstructuredSortedBoundaryTypes, - mesh::Union{P4estMesh{2}, UnstructuredMesh2D, + mesh::Union{P4estMesh{2}, P4estMeshView{2}, + UnstructuredMesh2D, T8codeMesh{2}}, solver, cache) return boundary_conditions @@ -243,7 +245,8 @@ end # allow passing a single BC that get converted into a named tuple of BCs # on (mapped) hypercube domains function digest_boundary_conditions(boundary_conditions, - mesh::Union{P4estMesh{2}, UnstructuredMesh2D, + mesh::Union{P4estMesh{2}, P4estMeshView{2}, + UnstructuredMesh2D, T8codeMesh{2}}, solver, cache) bcs = (; x_neg = boundary_conditions, x_pos = boundary_conditions, diff --git a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl index c730439017c..bb6487d46f4 100644 --- a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl +++ b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl @@ -79,7 +79,7 @@ function SemidiscretizationHyperbolicParabolic(mesh, equations::Tuple, @assert ndims(mesh) == ndims(equations_parabolic) if !(nvariables(equations) == nvariables(equations_parabolic)) - throw(ArgumentError("Current implementation of viscous terms requires the same number of conservative and gradient variables.")) + throw(ArgumentError("Current implementation of parabolic terms requires the same number of conservative and gradient variables.")) end boundary_conditions, boundary_conditions_parabolic = boundary_conditions diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index 56835fa1f96..5e52a463daa 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -711,26 +711,6 @@ function Base.show(io::IO, mime::MIME"text/plain", end end -# Required to be able to run `SimpleSSPRK33` without `VolumeIntegralSubcellLimiting` -Base.resize!(semi, volume_integral::AbstractVolumeIntegral, new_size) = nothing - -function Base.resize!(semi, volume_integral::VolumeIntegralSubcellLimiting, 
new_size) - # Resize container antidiffusive_fluxes - resize!(semi.cache.antidiffusive_fluxes, new_size) - - # Resize container subcell_limiter_coefficients - @unpack limiter = volume_integral - resize!(limiter.cache.subcell_limiter_coefficients, new_size) - - # Calc subcell normal directions before StepsizeCallback - if limiter isa SubcellLimiterMCL || - (limiter isa SubcellLimiterIDP && limiter.bar_states) - resize!(limiter.cache.container_bar_states, new_size) - end - - return nothing -end - function get_element_variables!(element_variables, u, mesh, equations, volume_integral::VolumeIntegralSubcellLimiting, dg, cache) @@ -1078,6 +1058,13 @@ end return u_ll, u_rr end +# As above but dispatches on a type argument +@inline function get_surface_node_vars(u, equations, ::Type{<:DG}, indices...) + u_ll = SVector(ntuple(@inline(v->u[1, v, indices...]), Val(nvariables(equations)))) + u_rr = SVector(ntuple(@inline(v->u[2, v, indices...]), Val(nvariables(equations)))) + return u_ll, u_rr +end + @inline function set_node_vars!(u, u_node, equations, solver::DG, indices...) for v in eachvariable(equations) u[v, indices...] 
= u_node[v] @@ -1240,55 +1227,48 @@ function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{ return nothing end -function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{2}, +function compute_coefficients!(backend::Nothing, u, func, t, + mesh::Union{AbstractMesh{2}, AbstractMesh{3}}, equations, dg::DG, cache) @unpack node_coordinates = cache.elements + node_indices = CartesianIndices(ntuple(_ -> nnodes(dg), ndims(mesh))) @threaded for element in eachelement(dg, cache) - compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, - element) + compute_coefficients_per_element!(u, func, t, equations, dg, node_coordinates, + element, node_indices) end return nothing end -function compute_coefficients!(backend::Backend, u, func, t, mesh::AbstractMesh{2}, +function compute_coefficients!(backend::Backend, u, func, t, + mesh::Union{AbstractMesh{2}, AbstractMesh{3}}, equations, dg::DG, cache) nelements(dg, cache) == 0 && return nothing + @unpack node_coordinates = cache.elements - kernel! = compute_coefficients_kernel!(backend) - kernel!(u, func, t, equations, dg, node_coordinates, + node_indices = CartesianIndices(ntuple(_ -> nnodes(dg), ndims(mesh))) + + kernel! 
= compute_coefficients_KAkernel!(backend) + kernel!(u, func, t, equations, dg, node_coordinates, node_indices, ndrange = nelements(dg, cache)) return nothing end -@kernel function compute_coefficients_kernel!(u, func, t, equations, - dg::DG, node_coordinates) +@kernel function compute_coefficients_KAkernel!(u, func, t, equations, + dg::DG, node_coordinates, node_indices) element = @index(Global) - compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, - element) + compute_coefficients_per_element!(u, func, t, equations, dg, node_coordinates, + element, + node_indices) end -function compute_coefficients_element!(u, func, t, equations, dg::DG, - node_coordinates, element) - for j in eachnode(dg), i in eachnode(dg) - x_node = get_node_coords(node_coordinates, equations, dg, i, - j, element) +@inline function compute_coefficients_per_element!(u, func, t, equations, dg::DG, + node_coordinates, element, + node_indices) + for indices in node_indices + x_node = get_node_coords(node_coordinates, equations, dg, indices, element) u_node = func(x_node, t, equations) - set_node_vars!(u, u_node, equations, dg, i, j, element) - end - - return nothing -end - -function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{3}, - equations, dg::DG, cache) - @threaded for element in eachelement(dg, cache) - for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - x_node = get_node_coords(cache.elements.node_coordinates, equations, dg, i, - j, k, element) - u_node = func(x_node, t, equations) - set_node_vars!(u, u_node, equations, dg, i, j, k, element) - end + set_node_vars!(u, u_node, equations, dg, indices, element) end return nothing diff --git a/src/solvers/dgmulti.jl b/src/solvers/dgmulti.jl deleted file mode 100644 index 363d91b5a4c..00000000000 --- a/src/solvers/dgmulti.jl +++ /dev/null @@ -1,17 +0,0 @@ -# includes solver files for DGMulti solvers -include("dgmulti/types.jl") -include("dgmulti/dg.jl") 
-include("dgmulti/flux_differencing_gauss_sbp.jl") -include("dgmulti/flux_differencing.jl") - -# integration of SummationByPartsOperators.jl -include("dgmulti/sbp.jl") - -# specialization of DGMulti to specific equations -include("dgmulti/flux_differencing_compressible_euler.jl") - -# shock capturing -include("dgmulti/shock_capturing.jl") - -# parabolic terms for DGMulti solvers -include("dgmulti/dg_parabolic.jl") diff --git a/src/solvers/dgmulti/dg.jl b/src/solvers/dgmulti/dg.jl index 4f302a196a9..d4779911cee 100644 --- a/src/solvers/dgmulti/dg.jl +++ b/src/solvers/dgmulti/dg.jl @@ -42,7 +42,23 @@ end end end -@inline nelements(dg::DGMulti, cache) = size(cache.u_values)[end] +@inline nelements(dg::DGMulti, cache) = size(cache.solution_container.u_values)[end] + +# Returns the components needed to iterate efficiently over the entries of either a +# `SparseMatrixCSC` or `Adjoint{SparseMatrixCSC}`, for example when performing flux +# differencing calculations. +# +# For `Adjoint{SparseMatrixCSC}` (used by `DGMultiFluxDiff`), since `parent(A)` is a +# `SparseMatrixCSC` stored in column-major order, iterating over its columns gives +# row-major access to `A`. +# +# For `SparseMatrixCSC` (used by `DGMultiPeriodicFDSBP`, for example), `parent(A)` +# simply returns `A`. +@inline function sparse_operator_data(A::Union{<:SparseMatrixCSC, + <:Adjoint{<:Any, <:SparseMatrixCSC}}) + A_base = parent(A) + return A_base, axes(A, 2), rowvals(A_base), nonzeros(A_base) +end """ eachdim(mesh) @@ -161,6 +177,31 @@ function set_zero!(du, dg::DGMulti, other_args...) return nothing end +# Holds arrays shared across most DGMulti cache types: +# solution values at volume/face quadrature points and thread-local scratch storage. +struct DGMultiSolutionContainer{uType, ufType, ffType, lType} + u_values::uType + u_face_values::ufType + flux_face_values::ffType + local_values_threaded::lType +end + +# Allocates arrays shared across most DGMulti cache types. 
+function initialize_dgmulti_solution_container(mesh::DGMultiMesh, equations, + dg::DGMulti, + uEltype) + rd = dg.basis + md = mesh.md + nvars = nvariables(equations) + u_values = allocate_nested_array(uEltype, nvars, size(md.xq), dg) + u_face_values = allocate_nested_array(uEltype, nvars, size(md.xf), dg) + flux_face_values = allocate_nested_array(uEltype, nvars, size(md.xf), dg) + local_values_threaded = [allocate_nested_array(uEltype, nvars, (rd.Nq,), dg) + for _ in 1:Threads.maxthreadid()] + return DGMultiSolutionContainer(u_values, u_face_values, flux_face_values, + local_values_threaded) +end + # Constructs cache variables for both affine and non-affine (curved) DGMultiMeshes function create_cache(mesh::DGMultiMesh{NDIMS}, equations, dg::DGMultiWeakForm, RealT, uEltype) where {NDIMS} @@ -173,12 +214,6 @@ function create_cache(mesh::DGMultiMesh{NDIMS}, equations, dg::DGMultiWeakForm, # ∫f(u) * dv/dx_i = ∑_j (Vq*Drst[i])'*diagm(wq)*(rstxyzJ[i,j].*f(Vq*u)) weak_differentiation_matrices = map(D -> -M \ ((Vq * D)' * Diagonal(wq)), Drst) - nvars = nvariables(equations) - - # storage for volume quadrature values, face quadrature values, flux values - u_values = allocate_nested_array(uEltype, nvars, size(md.xq), dg) - u_face_values = allocate_nested_array(uEltype, nvars, size(md.xf), dg) - flux_face_values = allocate_nested_array(uEltype, nvars, size(md.xf), dg) if typeof(rd.approximation_type) <: Union{SBP, AbstractNonperiodicDerivativeOperator} lift_scalings = rd.wf ./ rd.wq[rd.Fmask] # lift scalings for diag-norm SBP operators @@ -186,10 +221,6 @@ function create_cache(mesh::DGMultiMesh{NDIMS}, equations, dg::DGMultiWeakForm, lift_scalings = nothing end - # local storage for volume integral and source computations - local_values_threaded = [allocate_nested_array(uEltype, nvars, (rd.Nq,), dg) - for _ in 1:Threads.maxthreadid()] - # For curved meshes, we interpolate geometric terms from nodal points to quadrature points. 
# For affine meshes, we just access one element of this interpolated data. dxidxhatj = map(x -> rd.Vq * x, md.rstxyzJ) @@ -198,21 +229,24 @@ function create_cache(mesh::DGMultiMesh{NDIMS}, equations, dg::DGMultiWeakForm, invJ = inv.(rd.Vq * md.J) # for scaling by curved geometric terms (not used by affine DGMultiMesh) + nvars = nvariables(equations) flux_threaded = [[allocate_nested_array(uEltype, nvars, (rd.Nq,), dg) for _ in 1:NDIMS] for _ in 1:Threads.maxthreadid()] rotated_flux_threaded = [allocate_nested_array(uEltype, nvars, (rd.Nq,), dg) for _ in 1:Threads.maxthreadid()] + solution_container = initialize_dgmulti_solution_container(mesh, equations, dg, + uEltype) + return (; md, weak_differentiation_matrices, lift_scalings, invJ, dxidxhatj, - u_values, u_face_values, flux_face_values, - local_values_threaded, flux_threaded, rotated_flux_threaded) + solution_container, flux_threaded, rotated_flux_threaded) end function compute_coefficients!(::Nothing, u, initial_condition, t, mesh::DGMultiMesh, equations, dg::DGMulti, cache) md = mesh.md rd = dg.basis - @unpack u_values = cache + (; u_values) = cache.solution_container # evaluate the initial condition at quadrature points @threaded for i in each_quad_node_global(mesh, dg, cache) @@ -345,20 +379,21 @@ end function prolong2interfaces!(cache, u, mesh::DGMultiMesh, equations, dg::DGMulti) rd = dg.basis - @unpack u_face_values = cache + (; u_face_values) = cache.solution_container apply_to_each_field(mul_by!(rd.Vf), u_face_values, u) return nothing end # CARE: This function requires that interpolation to quadrature points is performed before -# to populate cache.u_values, see `calc_volume_integral!` for `VolumeIntegralWeakForm`. +# to populate cache.solution_container.u_values, see `calc_volume_integral!` for `VolumeIntegralWeakForm`. 
# version for affine meshes @inline function volume_integral_kernel!(du, u, element, mesh::DGMultiMesh, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralWeakForm, dg::DGMulti, cache) - @unpack weak_differentiation_matrices, dxidxhatj, u_values, local_values_threaded = cache + @unpack weak_differentiation_matrices, dxidxhatj = cache + (; u_values, local_values_threaded) = cache.solution_container flux_values = local_values_threaded[Threads.threadid()] for i in eachdim(mesh) @@ -379,14 +414,15 @@ end end # CARE: This function requires that interpolation to quadrature points is performed before -# to populate cache.u_values, see `calc_volume_integral!` for `VolumeIntegralWeakForm`. +# to populate cache.solution_container.u_values, see `calc_volume_integral!` for `VolumeIntegralWeakForm`. # version for curved meshes @inline function volume_integral_kernel!(du, u, element, mesh::DGMultiMesh{NDIMS, <:NonAffine}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralWeakForm, dg::DGMulti, cache) where {NDIMS} - (; weak_differentiation_matrices, dxidxhatj, u_values) = cache + (; weak_differentiation_matrices, dxidxhatj) = cache + (; u_values) = cache.solution_container flux_values = cache.flux_threaded[Threads.threadid()] for i in eachdim(mesh) @@ -427,7 +463,7 @@ function calc_volume_integral!(du, u, mesh::DGMultiMesh, volume_integral::VolumeIntegralWeakForm, dg::DGMulti, cache) rd = dg.basis - (; u_values) = cache + (; u_values) = cache.solution_container # interpolate to quadrature points apply_to_each_field(mul_by!(rd.Vq), u_values, u) @@ -447,7 +483,7 @@ function calc_interface_flux!(cache, surface_integral::SurfaceIntegralWeakForm, @unpack surface_flux = surface_integral md = mesh.md @unpack mapM, mapP, nxyzJ, Jf = md - @unpack u_face_values, flux_face_values = cache + (; u_face_values, flux_face_values) = cache.solution_container @threaded for face_node_index in each_face_node_global(mesh, dg, cache) @@ -469,7 
+505,7 @@ function calc_interface_flux!(cache, surface_integral::SurfaceIntegralWeakForm, flux_conservative, flux_nonconservative = surface_integral.surface_flux md = mesh.md @unpack mapM, mapP, nxyzJ, Jf = md - @unpack u_face_values, flux_face_values = cache + (; u_face_values, flux_face_values) = cache.solution_container @threaded for face_node_index in each_face_node_global(mesh, dg, cache) @@ -497,13 +533,14 @@ function calc_interface_flux!(cache, surface_integral::SurfaceIntegralWeakForm, return nothing end -# assumes cache.flux_face_values is computed and filled with +# assumes cache.solution_container.flux_face_values is computed and filled with # for polynomial discretizations, use dense LIFT matrix for surface contributions. function calc_surface_integral!(du, u, mesh::DGMultiMesh, equations, surface_integral::SurfaceIntegralWeakForm, dg::DGMulti, cache) rd = dg.basis - apply_to_each_field(mul_by_accum!(rd.LIFT), du, cache.flux_face_values) + apply_to_each_field(mul_by_accum!(rd.LIFT), du, + cache.solution_container.flux_face_values) return nothing end @@ -513,7 +550,7 @@ function prolong2interfaces!(cache, u, mesh::DGMultiMesh, equations, dg::DGMultiSBP) rd = dg.basis @unpack Fmask = rd - @unpack u_face_values = cache + (; u_face_values) = cache.solution_container @threaded for e in eachelement(mesh, dg, cache) for (i, fid) in enumerate(Fmask) u_face_values[i, e] = u[fid, e] @@ -529,7 +566,8 @@ function calc_surface_integral!(du, u, mesh::DGMultiMesh, equations, surface_integral::SurfaceIntegralWeakForm, dg::DGMultiSBP, cache) rd = dg.basis - @unpack flux_face_values, lift_scalings = cache + (; flux_face_values) = cache.solution_container + @unpack lift_scalings = cache @threaded for e in eachelement(mesh, dg, cache) for i in each_face_node(mesh, dg, cache) @@ -563,7 +601,7 @@ function calc_single_boundary_flux!(cache, t, boundary_condition, boundary_key, dg::DGMulti{NDIMS}) where {NDIMS} rd = dg.basis md = mesh.md - @unpack u_face_values, 
flux_face_values = cache + (; u_face_values, flux_face_values) = cache.solution_container @unpack xyzf, nxyzJ, Jf = md @unpack surface_flux = dg.surface_integral @@ -598,8 +636,8 @@ function calc_single_boundary_flux!(cache, t, boundary_condition, boundary_key, end end - # Note: modifying the values of the reshaped array modifies the values of cache.flux_face_values. - # However, we don't have to re-reshape, since cache.flux_face_values still retains its original shape. + # Note: modifying the values of the reshaped array modifies the values of cache.solution_container.flux_face_values. + # However, we don't have to re-reshape, since cache.solution_container.flux_face_values still retains its original shape. return nothing end @@ -622,8 +660,8 @@ function calc_single_boundary_flux!(cache, t, boundary_condition, boundary_key, # https://github.com/JuliaLang/julia/issues/36313#issuecomment-782336300. reshape_by_face(u) = Base.ReshapedArray(u, (num_pts_per_face, num_faces_total), ()) - u_face_values = reshape_by_face(cache.u_face_values) - flux_face_values = reshape_by_face(cache.flux_face_values) + u_face_values = reshape_by_face(cache.solution_container.u_face_values) + flux_face_values = reshape_by_face(cache.solution_container.flux_face_values) Jf = reshape_by_face(md.Jf) nxyzJ, xyzf = reshape_by_face.(md.nxyzJ), reshape_by_face.(md.xyzf) # broadcast over nxyzJ::NTuple{NDIMS,Matrix} @@ -648,8 +686,8 @@ function calc_single_boundary_flux!(cache, t, boundary_condition, boundary_key, end end - # Note: modifying the values of the reshaped array modifies the values of cache.flux_face_values. - # However, we don't have to re-reshape, since cache.flux_face_values still retains its original shape. + # Note: modifying the values of the reshaped array modifies the values of cache.solution_container.flux_face_values. + # However, we don't have to re-reshape, since cache.solution_container.flux_face_values still retains its original shape. 
return nothing end @@ -675,7 +713,8 @@ function invert_jacobian!(du, mesh::DGMultiMesh{NDIMS, <:NonAffine}, equations, dg::DGMulti, cache; scaling = -1) where {NDIMS} # Vq = interpolation matrix to quadrature points, Pq = quadrature-based L2 projection matrix (; Pq, Vq) = dg.basis - (; local_values_threaded, invJ) = cache + (; invJ) = cache + (; local_values_threaded) = cache.solution_container @threaded for e in eachelement(mesh, dg, cache) du_at_quad_points = local_values_threaded[Threads.threadid()] @@ -711,7 +750,7 @@ function calc_sources!(du, u, t, source_terms, rd = dg.basis md = mesh.md @unpack Pq = rd - @unpack u_values, local_values_threaded = cache + (; u_values, local_values_threaded) = cache.solution_container @threaded for e in eachelement(mesh, dg, cache) source_values = local_values_threaded[Threads.threadid()] diff --git a/src/solvers/dgmulti/dg_parabolic.jl b/src/solvers/dgmulti/dg_parabolic.jl index cb500094ca4..507f91c7890 100644 --- a/src/solvers/dgmulti/dg_parabolic.jl +++ b/src/solvers/dgmulti/dg_parabolic.jl @@ -30,7 +30,7 @@ function create_cache_parabolic(mesh::DGMultiMesh, (dg.basis.Nq, mesh.md.num_elements)), ndims(mesh))) - flux_viscous = similar.(gradients) + flux_parabolic = similar.(gradients) u_face_values = allocate_nested_array(uEltype, nvars, size(md.xf), dg) scalar_flux_face_values = similar(u_face_values) @@ -38,20 +38,20 @@ function create_cache_parabolic(mesh::DGMultiMesh, local_u_values_threaded = [similar(u_transformed, dg.basis.Nq) for _ in 1:Threads.maxthreadid()] - local_flux_viscous_threaded = [SVector{ndims(mesh)}(ntuple(_ -> similar(u_transformed, - dg.basis.Nq), - ndims(mesh))) - for _ in 1:Threads.maxthreadid()] + local_flux_parabolic_threaded = [SVector{ndims(mesh)}(ntuple(_ -> similar(u_transformed, + dg.basis.Nq), + ndims(mesh))) + for _ in 1:Threads.maxthreadid()] local_flux_face_values_threaded = [similar(scalar_flux_face_values[:, 1]) for _ in 1:Threads.maxthreadid()] - return (; u_transformed, gradients, 
flux_viscous, + return (; u_transformed, gradients, flux_parabolic, weak_differentiation_matrices, strong_differentiation_matrices, gradient_lift_matrix, projection_face_interpolation_matrix, divergence_lift_matrix, dxidxhatj, J, invJ, # geometric terms u_face_values, gradients_face_values, scalar_flux_face_values, - local_u_values_threaded, local_flux_viscous_threaded, + local_u_values_threaded, local_flux_parabolic_threaded, local_flux_face_values_threaded) end @@ -115,13 +115,13 @@ end function calc_volume_integral_gradient!(gradients, u, mesh::DGMultiMesh{NDIMS, <:NonAffine}, equations::AbstractEquationsParabolic, dg::DGMulti, cache, cache_parabolic) where {NDIMS} - (; strong_differentiation_matrices, dxidxhatj, local_flux_viscous_threaded) = cache_parabolic + (; strong_differentiation_matrices, dxidxhatj, local_flux_parabolic_threaded) = cache_parabolic # compute volume contributions to gradients @threaded for e in eachelement(mesh, dg) # compute gradients with respect to reference coordinates - local_reference_gradients = local_flux_viscous_threaded[Threads.threadid()] + local_reference_gradients = local_flux_parabolic_threaded[Threads.threadid()] for i in eachdim(mesh) apply_to_each_field(mul_by!(strong_differentiation_matrices[i]), local_reference_gradients[i], view(u, :, e)) @@ -143,7 +143,7 @@ end function calc_interface_flux_gradient!(scalar_flux_face_values, mesh::DGMultiMesh, equations, dg::DGMulti, - parabolic_scheme::ViscousFormulationBassiRebay1, + parabolic_scheme::ParabolicFormulationBassiRebay1, cache, cache_parabolic) (; u_face_values) = cache_parabolic (; mapM, mapP) = mesh.md @@ -298,11 +298,11 @@ function calc_single_boundary_flux!(flux_face_values, u_face_values, t, return nothing end -function calc_viscous_fluxes!(flux_viscous, u, gradients, mesh::DGMultiMesh, - equations::AbstractEquationsParabolic, - dg::DGMulti, cache, cache_parabolic) +function calc_parabolic_fluxes!(flux_parabolic, u, gradients, mesh::DGMultiMesh, + 
equations::AbstractEquationsParabolic, + dg::DGMulti, cache, cache_parabolic) for dim in eachdim(mesh) - set_zero!(flux_viscous[dim], dg) + set_zero!(flux_parabolic[dim], dg) end (; local_u_values_threaded) = cache_parabolic @@ -315,13 +315,13 @@ function calc_viscous_fluxes!(flux_viscous, u, gradients, mesh::DGMultiMesh, fill!(local_u_values, zero(eltype(local_u_values))) apply_to_each_field(mul_by!(dg.basis.Vq), local_u_values, view(u, :, e)) - # compute viscous flux at quad points + # compute parabolic flux at quad points for i in eachindex(local_u_values) u_i = local_u_values[i] gradients_i = getindex.(gradients, i, e) for dim in eachdim(mesh) - flux_viscous_i = flux(u_i, gradients_i, dim, equations) - setindex!(flux_viscous[dim], flux_viscous_i, i, e) + flux_parabolic_i = flux(u_i, gradients_i, dim, equations) + setindex!(flux_parabolic[dim], flux_parabolic_i, i, e) end end end @@ -330,18 +330,19 @@ function calc_viscous_fluxes!(flux_viscous, u, gradients, mesh::DGMultiMesh, end # no penalization for a BR1 parabolic solver -function calc_viscous_penalty!(scalar_flux_face_values, u_face_values, t, - boundary_conditions, - mesh, equations::AbstractEquationsParabolic, - dg::DGMulti, parabolic_scheme::ViscousFormulationBassiRebay1, - cache, cache_parabolic) +function calc_parabolic_penalty!(scalar_flux_face_values, u_face_values, t, + boundary_conditions, + mesh, equations::AbstractEquationsParabolic, + dg::DGMulti, + parabolic_scheme::ParabolicFormulationBassiRebay1, + cache, cache_parabolic) return nothing end -function calc_viscous_penalty!(scalar_flux_face_values, u_face_values, t, - boundary_conditions, mesh, - equations::AbstractEquationsParabolic, - dg::DGMulti, parabolic_scheme, cache, cache_parabolic) +function calc_parabolic_penalty!(scalar_flux_face_values, u_face_values, t, + boundary_conditions, mesh, + equations::AbstractEquationsParabolic, + dg::DGMulti, parabolic_scheme, cache, cache_parabolic) # compute fluxes at interfaces (; 
scalar_flux_face_values) = cache_parabolic (; mapM, mapP) = mesh.md @@ -354,7 +355,7 @@ function calc_viscous_penalty!(scalar_flux_face_values, u_face_values, t, return nothing end -function calc_volume_integral_divergence!(du, u, flux_viscous, mesh::DGMultiMesh, +function calc_volume_integral_divergence!(du, u, flux_parabolic, mesh::DGMultiMesh, equations::AbstractEquationsParabolic, dg::DGMulti, cache, cache_parabolic) (; weak_differentiation_matrices) = cache_parabolic @@ -364,36 +365,36 @@ function calc_volume_integral_divergence!(du, u, flux_viscous, mesh::DGMultiMesh for i in eachdim(mesh), j in eachdim(mesh) dxidxhatj = mesh.md.rstxyzJ[i, j][1, e] # assumes mesh is affine apply_to_each_field(mul_by_accum!(weak_differentiation_matrices[j], dxidxhatj), - view(du, :, e), view(flux_viscous[i], :, e)) + view(du, :, e), view(flux_parabolic[i], :, e)) end end return nothing end -function calc_volume_integral_divergence!(du, u, flux_viscous, +function calc_volume_integral_divergence!(du, u, flux_parabolic, mesh::DGMultiMesh{NDIMS, <:NonAffine}, equations::AbstractEquationsParabolic, dg::DGMulti, cache, cache_parabolic) where {NDIMS} - (; weak_differentiation_matrices, dxidxhatj, local_flux_viscous_threaded) = cache_parabolic + (; weak_differentiation_matrices, dxidxhatj, local_flux_parabolic_threaded) = cache_parabolic # compute volume contributions to divergence @threaded for e in eachelement(mesh, dg) - local_viscous_flux = local_flux_viscous_threaded[Threads.threadid()][1] + local_parabolic_flux = local_flux_parabolic_threaded[Threads.threadid()][1] for i in eachdim(mesh) # rotate flux to reference coordinates - fill!(local_viscous_flux, zero(eltype(local_viscous_flux))) + fill!(local_parabolic_flux, zero(eltype(local_parabolic_flux))) for j in eachdim(mesh) - for node in eachindex(local_viscous_flux) - local_viscous_flux[node] = local_viscous_flux[node] + - dxidxhatj[j, i][node, e] * - flux_viscous[j][node, e] + for node in eachindex(local_parabolic_flux) + 
local_parabolic_flux[node] = local_parabolic_flux[node] + + dxidxhatj[j, i][node, e] * + flux_parabolic[j][node, e] end end # differentiate with respect to reference coordinates apply_to_each_field(mul_by_accum!(weak_differentiation_matrices[i]), - view(du, :, e), local_viscous_flux) + view(du, :, e), local_parabolic_flux) end end @@ -402,9 +403,9 @@ end function calc_interface_flux_divergence!(scalar_flux_face_values, mesh, equations, dg, - parabolic_scheme::ViscousFormulationBassiRebay1, + parabolic_scheme::ParabolicFormulationBassiRebay1, cache, cache_parabolic) - flux_viscous_face_values = cache_parabolic.gradients_face_values # reuse storage + flux_parabolic_face_values = cache_parabolic.gradients_face_values # reuse storage (; mapM, mapP, nxyzJ) = mesh.md @threaded for face_node_index in each_face_node_global(mesh, dg, cache, cache_parabolic) @@ -413,8 +414,8 @@ function calc_interface_flux_divergence!(scalar_flux_face_values, # compute f(u, ∇u) ⋅ n flux_face_value = zero(eltype(scalar_flux_face_values)) for dim in eachdim(mesh) - fM = flux_viscous_face_values[dim][idM] - fP = flux_viscous_face_values[dim][idP] + fM = flux_parabolic_face_values[dim][idM] + fP = flux_parabolic_face_values[dim][idP] # Here, we use the "weak" formulation to compute the divergence (to ensure stability on curved meshes). 
flux_face_value = flux_face_value + 0.5f0 * (fP + fM) * nxyzJ[dim][face_node_index] @@ -425,21 +426,21 @@ function calc_interface_flux_divergence!(scalar_flux_face_values, return nothing end -function calc_divergence!(du, u::StructArray, t, flux_viscous, mesh::DGMultiMesh, +function calc_divergence!(du, u::StructArray, t, flux_parabolic, mesh::DGMultiMesh, equations::AbstractEquationsParabolic, boundary_conditions, dg::DGMulti, parabolic_scheme, cache, cache_parabolic) set_zero!(du, dg) - calc_volume_integral_divergence!(du, u, flux_viscous, mesh, equations, dg, cache, + calc_volume_integral_divergence!(du, u, flux_parabolic, mesh, equations, dg, cache, cache_parabolic) # interpolates from solution coefficients to face quadrature points (; projection_face_interpolation_matrix) = cache_parabolic - flux_viscous_face_values = cache_parabolic.gradients_face_values # reuse storage + flux_parabolic_face_values = cache_parabolic.gradients_face_values # reuse storage for dim in eachdim(mesh) apply_to_each_field(mul_by!(projection_face_interpolation_matrix), - flux_viscous_face_values[dim], flux_viscous[dim]) + flux_parabolic_face_values[dim], flux_parabolic[dim]) end # compute fluxes at interfaces @@ -452,9 +453,9 @@ function calc_divergence!(du, u::StructArray, t, flux_viscous, mesh::DGMultiMesh Divergence(), boundary_conditions, mesh, equations, dg, cache, cache_parabolic) - calc_viscous_penalty!(scalar_flux_face_values, cache_parabolic.u_face_values, t, - boundary_conditions, mesh, equations, dg, parabolic_scheme, - cache, cache_parabolic) + calc_parabolic_penalty!(scalar_flux_face_values, cache_parabolic.u_face_values, t, + boundary_conditions, mesh, equations, dg, parabolic_scheme, + cache, cache_parabolic) # surface contributions apply_to_each_field(mul_by_accum!(cache_parabolic.divergence_lift_matrix), du, @@ -476,7 +477,7 @@ function rhs_parabolic!(du, u, t, mesh::DGMultiMesh, set_zero!(du, dg) @trixi_timeit timer() "transform variables" begin - (; u_transformed, 
gradients, flux_viscous) = cache_parabolic + (; u_transformed, gradients, flux_parabolic) = cache_parabolic transform_variables!(u_transformed, u, mesh, equations_parabolic, dg, cache) end @@ -486,13 +487,13 @@ function rhs_parabolic!(du, u, t, mesh::DGMultiMesh, boundary_conditions, dg, parabolic_scheme, cache, cache_parabolic) end - @trixi_timeit timer() "calc viscous fluxes" begin - calc_viscous_fluxes!(flux_viscous, u_transformed, gradients, - mesh, equations_parabolic, dg, cache, cache_parabolic) + @trixi_timeit timer() "calc parabolic fluxes" begin + calc_parabolic_fluxes!(flux_parabolic, u_transformed, gradients, + mesh, equations_parabolic, dg, cache, cache_parabolic) end @trixi_timeit timer() "calc divergence" begin - calc_divergence!(du, u_transformed, t, flux_viscous, mesh, equations_parabolic, + calc_divergence!(du, u_transformed, t, flux_parabolic, mesh, equations_parabolic, boundary_conditions, dg, parabolic_scheme, cache, cache_parabolic) end @@ -500,7 +501,7 @@ function rhs_parabolic!(du, u, t, mesh::DGMultiMesh, # Note: we do not flip the sign of the geometric Jacobian here. # This is because the parabolic fluxes are assumed to be of the form # `du/dt + df/dx = dg/dx + source(x,t)`, - # where f(u) is the inviscid flux and g(u) is the viscous flux. + # where f(u) is the inviscid flux and g(u) is the parabolic flux. 
invert_jacobian!(du, mesh, equations_parabolic, dg, cache; scaling = 1) end diff --git a/src/solvers/dgmulti/dgmulti.jl b/src/solvers/dgmulti/dgmulti.jl new file mode 100644 index 00000000000..fabdf01e8ae --- /dev/null +++ b/src/solvers/dgmulti/dgmulti.jl @@ -0,0 +1,22 @@ +# basic types and functions for DGMulti solvers +include("types.jl") +include("dg.jl") + +# flux differencing solver routines for DGMulti solvers +include("flux_differencing_gauss_sbp.jl") +include("flux_differencing.jl") + +# adaptive volume integral solver +include("volume_integral_adaptive.jl") + +# integration of SummationByPartsOperators.jl +include("sbp.jl") + +# specialization of DGMulti to specific equations +include("flux_differencing_compressible_euler.jl") + +# shock capturing +include("shock_capturing.jl") + +# parabolic terms for DGMulti solvers +include("dg_parabolic.jl") diff --git a/src/solvers/dgmulti/flux_differencing.jl b/src/solvers/dgmulti/flux_differencing.jl index 56d4e1f3752..3ba81d36ae7 100644 --- a/src/solvers/dgmulti/flux_differencing.jl +++ b/src/solvers/dgmulti/flux_differencing.jl @@ -5,234 +5,6 @@ @muladd begin #! format: noindent -# hadamard_sum!(du, A, -# flux_is_symmetric, volume_flux, -# orientation_or_normal_direction, u, equations) -# -# Computes the flux difference ∑_j A[i, j] * f(u_i, u_j) and accumulates the result into `du`. -# Called by `local_flux_differencing` to compute local contributions to flux differencing -# volume integrals. -# -# - `du`, `u` are vectors -# - `A` is the skew-symmetric flux differencing matrix -# - `flux_is_symmetric` is a `Val{<:Bool}` indicating if f(u_i, u_j) = f(u_j, u_i) -# -# The matrix `A` can be either dense or sparse. In the latter case, you should -# use the `adjoint` of a `SparseMatrixCSC` to mimic a `SparseMatrixCSR`, which -# is more efficient for matrix vector products. 
- -# Version for dense operators and symmetric fluxes -@inline function hadamard_sum!(du, A, - flux_is_symmetric::True, volume_flux, - orientation_or_normal_direction, u, equations) - row_ids, col_ids = axes(A) - - for i in row_ids - u_i = u[i] - du_i = du[i] - for j in col_ids - # This routine computes only the upper-triangular part of the hadamard sum (A .* F). - # We avoid computing the lower-triangular part, and instead accumulate those contributions - # while computing the upper-triangular part (using the fact that A is skew-symmetric and F - # is symmetric). - if j > i - u_j = u[j] - AF_ij = 2 * A[i, j] * - volume_flux(u_i, u_j, orientation_or_normal_direction, - equations) - du_i = du_i + AF_ij - du[j] = du[j] - AF_ij - end - end - du[i] = du_i - end -end - -# Version for dense operators and non-symmetric fluxes -@inline function hadamard_sum!(du, A, - flux_is_symmetric::False, volume_flux, - orientation::Integer, u, equations) - row_ids, col_ids = axes(A) - - for i in row_ids - u_i = u[i] - du_i = du[i] - for j in col_ids - u_j = u[j] - f_ij = volume_flux(u_i, u_j, orientation, equations) - du_i = du_i + 2 * A[i, j] * f_ij - end - du[i] = du_i - end -end - -@inline function hadamard_sum!(du, A, - flux_is_symmetric::False, volume_flux, - normal_direction::AbstractVector, u, equations) - row_ids, col_ids = axes(A) - - for i in row_ids - u_i = u[i] - du_i = du[i] - for j in col_ids - u_j = u[j] - f_ij = volume_flux(u_i, u_j, normal_direction, equations) - du_i = du_i + 2 * A[i, j] * f_ij - end - du[i] = du_i - end -end - -# Version for sparse operators and symmetric fluxes -@inline function hadamard_sum!(du, - A::LinearAlgebra.Adjoint{<:Any, - <:AbstractSparseMatrixCSC}, - flux_is_symmetric::True, volume_flux, - orientation_or_normal_direction, u, equations) - A_base = parent(A) # the adjoint of a SparseMatrixCSC is basically a SparseMatrixCSR - row_ids = axes(A, 2) - rows = rowvals(A_base) - vals = nonzeros(A_base) - - for i in row_ids - u_i = u[i] - du_i = 
du[i] - for id in nzrange(A_base, i) - j = rows[id] - # This routine computes only the upper-triangular part of the hadamard sum (A .* F). - # We avoid computing the lower-triangular part, and instead accumulate those contributions - # while computing the upper-triangular part (using the fact that A is skew-symmetric and F - # is symmetric). - if j > i - u_j = u[j] - A_ij = vals[id] - AF_ij = 2 * A_ij * - volume_flux(u_i, u_j, orientation_or_normal_direction, - equations) - du_i = du_i + AF_ij - du[j] = du[j] - AF_ij - end - end - du[i] = du_i - end -end - -# Version for sparse operators and symmetric fluxes with curved meshes -@inline function hadamard_sum!(du, - A::LinearAlgebra.Adjoint{<:Any, - <:AbstractSparseMatrixCSC}, - flux_is_symmetric::True, volume_flux, - normal_directions::AbstractVector{<:AbstractVector}, - u, equations) - A_base = parent(A) # the adjoint of a SparseMatrixCSC is basically a SparseMatrixCSR - row_ids = axes(A, 2) - rows = rowvals(A_base) - vals = nonzeros(A_base) - - for i in row_ids - u_i = u[i] - du_i = du[i] - for id in nzrange(A_base, i) - j = rows[id] - # This routine computes only the upper-triangular part of the hadamard sum (A .* F). - # We avoid computing the lower-triangular part, and instead accumulate those contributions - # while computing the upper-triangular part (using the fact that A is skew-symmetric and F - # is symmetric). - if j > i - u_j = u[j] - A_ij = vals[id] - - # provably entropy stable de-aliasing of geometric terms - normal_direction = 0.5 * (getindex.(normal_directions, i) + - getindex.(normal_directions, j)) - - AF_ij = 2 * A_ij * volume_flux(u_i, u_j, normal_direction, equations) - du_i = du_i + AF_ij - du[j] = du[j] - AF_ij - end - end - du[i] = du_i - end -end - -# TODO: DGMulti. Fix for curved meshes. 
-# Version for sparse operators and non-symmetric fluxes -@inline function hadamard_sum!(du, - A::LinearAlgebra.Adjoint{<:Any, - <:AbstractSparseMatrixCSC}, - flux_is_symmetric::False, volume_flux, - normal_direction::AbstractVector, u, equations) - A_base = parent(A) # the adjoint of a SparseMatrixCSC is basically a SparseMatrixCSR - row_ids = axes(A, 2) - rows = rowvals(A_base) - vals = nonzeros(A_base) - - for i in row_ids - u_i = u[i] - du_i = du[i] - for id in nzrange(A_base, i) - A_ij = vals[id] - j = rows[id] - u_j = u[j] - f_ij = volume_flux(u_i, u_j, normal_direction, equations) - du_i = du_i + 2 * A_ij * f_ij - end - du[i] = du_i - end -end - -# For DGMulti implementations, we construct "physical" differentiation operators by taking linear -# combinations of reference differentiation operators scaled by geometric change of variables terms. -# We use a lazy evaluation of physical differentiation operators, so that we can compute linear -# combinations of differentiation operators on-the-fly in an allocation-free manner. 
-@inline function build_lazy_physical_derivative(element, orientation, - mesh::DGMultiMesh{1}, dg, cache, - operator_scaling = 1.0) - @unpack Qrst_skew = cache - @unpack rxJ = mesh.md - # ignore orientation - return LazyMatrixLinearCombo(Qrst_skew, operator_scaling .* (rxJ[1, element],)) -end - -@inline function build_lazy_physical_derivative(element, orientation, - mesh::DGMultiMesh{2}, dg, cache, - operator_scaling = 1.0) - @unpack Qrst_skew = cache - @unpack rxJ, sxJ, ryJ, syJ = mesh.md - if orientation == 1 - return LazyMatrixLinearCombo(Qrst_skew, - operator_scaling .* - (rxJ[1, element], sxJ[1, element])) - else # if orientation == 2 - return LazyMatrixLinearCombo(Qrst_skew, - operator_scaling .* - (ryJ[1, element], syJ[1, element])) - end -end - -@inline function build_lazy_physical_derivative(element, orientation, - mesh::DGMultiMesh{3}, dg, cache, - operator_scaling = 1.0) - @unpack Qrst_skew = cache - @unpack rxJ, sxJ, txJ, ryJ, syJ, tyJ, rzJ, szJ, tzJ = mesh.md - if orientation == 1 - return LazyMatrixLinearCombo(Qrst_skew, - operator_scaling .* - (rxJ[1, element], sxJ[1, element], - txJ[1, element])) - elseif orientation == 2 - return LazyMatrixLinearCombo(Qrst_skew, - operator_scaling .* - (ryJ[1, element], syJ[1, element], - tyJ[1, element])) - else # if orientation == 3 - return LazyMatrixLinearCombo(Qrst_skew, - operator_scaling .* - (rzJ[1, element], szJ[1, element], - tzJ[1, element])) - end -end - # Return the contravariant basis vector corresponding to the Cartesian # coordinate direction `orientation` in a given `element` of the `mesh`. # The contravariant basis vectors have entries `dx_i / dxhat_j` where @@ -260,6 +32,17 @@ end return SVector{NDIMS}(view.(dxidxhatj[:, orientation], :, element)) end +# For Affine meshes, `get_contravariant_vector` returns an SVector of scalars (constant over the +# element). The normal direction is the same for all node pairs. 
+@inline get_normal_direction(normal_directions::AbstractVector, i, j) = normal_directions + +# For NonAffine meshes, `get_contravariant_vector` returns an SVector of per-node arrays. +# We average the normals at nodes i and j for provably entropy-stable de-aliasing of geometric terms. +@inline function get_normal_direction(normal_directions::AbstractVector{<:AbstractVector}, + i, j) + return 0.5f0 * (getindex.(normal_directions, i) + getindex.(normal_directions, j)) +end + # use hybridized SBP operators for general flux differencing schemes. function compute_flux_differencing_SBP_matrices(dg::DGMulti) return compute_flux_differencing_SBP_matrices(dg, has_sparse_operators(dg)) @@ -288,6 +71,24 @@ function compute_flux_differencing_SBP_matrices(dg::DGMultiFluxDiffSBP, return Qrst_skew end +# Build element-to-element connectivity from face-to-face connectivity. +# Used for smoothing of shock capturing blending parameters (see `apply_smoothing!`). +function build_element_to_element_connectivity(mesh::DGMultiMesh, dg::DGMulti) + face_to_face_connectivity = mesh.md.FToF + element_to_element_connectivity = similar(face_to_face_connectivity) + for e in axes(face_to_face_connectivity, 2) + for f in axes(face_to_face_connectivity, 1) + neighbor_face_index = face_to_face_connectivity[f, e] + # Reverse-engineer element index from face index. Assumes all elements + # have the same number of faces. + neighbor_element_index = ((neighbor_face_index - 1) ÷ dg.basis.num_faces) + + 1 + element_to_element_connectivity[f, e] = neighbor_element_index + end + end + return element_to_element_connectivity +end + # For flux differencing SBP-type approximations, store solutions in Matrix{SVector{nvars}}. # This results in a slight speedup for `calc_volume_integral!`. 
function allocate_nested_array(uEltype, nvars, array_dimensions, dg::DGMultiFluxDiffSBP) @@ -302,25 +103,19 @@ function create_cache(mesh::DGMultiMesh, equations, dg::DGMultiFluxDiffSBP, # for use with flux differencing schemes Qrst_skew = compute_flux_differencing_SBP_matrices(dg) - # Todo: DGMulti. Factor common storage into a struct (MeshDataCache?) for reuse across solvers? - # storage for volume quadrature values, face quadrature values, flux values - nvars = nvariables(equations) - u_values = allocate_nested_array(uEltype, nvars, size(md.xq), dg) - u_face_values = allocate_nested_array(uEltype, nvars, size(md.xf), dg) - flux_face_values = allocate_nested_array(uEltype, nvars, size(md.xf), dg) lift_scalings = rd.wf ./ rd.wq[rd.Fmask] # lift scalings for diag-norm SBP operators - local_values_threaded = [allocate_nested_array(uEltype, nvars, (rd.Nq,), dg) - for _ in 1:Threads.maxthreadid()] - + nvars = nvariables(equations) # Use an array of SVectors (chunks of `nvars` are contiguous in memory) to speed up flux differencing - fluxdiff_local_threaded = [zeros(SVector{nvars, uEltype}, rd.Nq) - for _ in 1:Threads.maxthreadid()] + du_local_threaded = [zeros(SVector{nvars, uEltype}, rd.Nq) + for _ in 1:Threads.maxthreadid()] + + solution_container = initialize_dgmulti_solution_container(mesh, equations, dg, + uEltype) return (; md, Qrst_skew, dxidxhatj = md.rstxyzJ, invJ = inv.(md.J), lift_scalings, inv_wq = inv.(rd.wq), - u_values, u_face_values, flux_face_values, - local_values_threaded, fluxdiff_local_threaded) + solution_container, du_local_threaded) end # most general create_cache: works for `DGMultiFluxDiff{<:Polynomial}` @@ -359,10 +154,10 @@ function create_cache(mesh::DGMultiMesh, equations, dg::DGMultiFluxDiff, RealT, for _ in 1:Threads.maxthreadid()] # Use an array of SVectors (chunks of `nvars` are contiguous in memory) to speed up flux differencing - # The result is then transferred to rhs_local_threaded::StructArray{<:SVector} before - # projecting it 
and storing it into `du`. - fluxdiff_local_threaded = [zeros(SVector{nvars, uEltype}, num_quad_points_total) - for _ in 1:Threads.maxthreadid()] + # The result is then transferred to `rhs_local`, a thread-local element of + # `rhs_local_threaded::StructArray{<:SVector}` before projecting it and storing it into `du`. + du_local_threaded = [zeros(SVector{nvars, uEltype}, num_quad_points_total) + for _ in 1:Threads.maxthreadid()] rhs_local_threaded = [allocate_nested_array(uEltype, nvars, (num_quad_points_total,), dg) for _ in 1:Threads.maxthreadid()] @@ -370,22 +165,26 @@ function create_cache(mesh::DGMultiMesh, equations, dg::DGMultiFluxDiff, RealT, # interpolate geometric terms to both quadrature and face values for curved meshes (; Vq, Vf) = dg.basis interpolated_geometric_terms = map(x -> [Vq; Vf] * x, mesh.md.rstxyzJ) - J = rd.Vq * md.J + J = Vq * md.J + + solution_container = DGMultiSolutionContainer(u_values, u_face_values, + flux_face_values, + local_values_threaded) return (; md, Qrst_skew, VhP, Ph, invJ = inv.(J), dxidxhatj = interpolated_geometric_terms, entropy_var_values, projected_entropy_var_values, entropy_projected_u_values, - u_values, u_face_values, flux_face_values, - local_values_threaded, fluxdiff_local_threaded, rhs_local_threaded) + solution_container, du_local_threaded, rhs_local_threaded) end # TODO: DGMulti. Address hard-coding of `entropy2cons!` and `cons2entropy!` for this function. 
function entropy_projection!(cache, u, mesh::DGMultiMesh, equations, dg::DGMulti) rd = dg.basis @unpack Vq = rd - @unpack VhP, entropy_var_values, u_values = cache + @unpack VhP, entropy_var_values = cache @unpack projected_entropy_var_values, entropy_projected_u_values = cache + (; u_values) = cache.solution_container apply_to_each_field(mul_by!(Vq), u_values, u) @@ -452,111 +251,170 @@ end function calc_volume_integral!(du, u, mesh::DGMultiMesh, have_nonconservative_terms, equations, - volume_integral, dg::DGMultiFluxDiff, cache) + volume_integral, dg::DGMultiFluxDiff, cache, + alpha = true) # No interpolation performed for general volume integral. # Instead, an element-wise entropy projection (`entropy_projection!`) is performed before, see # `rhs!` for `DGMultiFluxDiff`, which populates `entropy_projected_u_values` @threaded for element in eachelement(mesh, dg, cache) volume_integral_kernel!(du, u, element, mesh, have_nonconservative_terms, equations, - volume_integral, dg, cache) + volume_integral, dg, cache, alpha) end return nothing end -# Computes flux differencing contribution from each Cartesian direction over a single element. -# For dense operators, we do not use sum factorization. -@inline function local_flux_differencing!(fluxdiff_local, u_local, element_index, +# Computes flux differencing contribution over a single element by looping over node pairs (i, j). +# The physical normal direction for each pair is n_ij = geometric_matrix * ref_entries, +# where ref_entries[d] = Qrst_skew[d][i,j]. +# This fuses the NDIMS per-dimension flux +# evaluations of the old dimension-by-dimension loop into a single evaluation per pair. +# Essentially, instead of calculating +# volume_flux(u_i, u_j, 1, equations) * Qx[i, j] + volume_flux(u_i, u_j, 2, equations) * Qy[i, j] + ... 
+# where Qx[i, j] = dr/dx * Qr[i, j] + ds/dx * Qs[i, j], we can expand out and evaluate +# volume_flux(u_i, u_j, [dr/dx, dr/dy] * Qr[i, j], equations) + +# volume_flux(u_i, u_j, [ds/dx, ds/dy] * Qs[i, j], equations) +# which is slightly faster. +# +# For dense operators (SBP on Line/Tri/Tet), we do not use this sum factorization trick. +@inline function local_flux_differencing!(du_local, u_local, element_index, have_nonconservative_terms::False, volume_flux, has_sparse_operators::False, mesh, equations, dg, cache) - for dim in eachdim(mesh) - Qi_skew = build_lazy_physical_derivative(element_index, dim, mesh, dg, cache) - # True() indicates the volume flux is symmetric - hadamard_sum!(fluxdiff_local, Qi_skew, - True(), volume_flux, - dim, u_local, equations) + @unpack Qrst_skew = cache + NDIMS = ndims(mesh) + row_ids = axes(first(Qrst_skew), 1) + geometric_matrix = get_contravariant_matrix(element_index, mesh, cache) + for i in row_ids + u_i = u_local[i] + for j in row_ids + # We use the symmetry of the volume flux and the anti-symmetry + # of the derivative operator to save half of the volume flux + # computations. + if j > i + u_j = u_local[j] + ref_entries = SVector(ntuple(d -> Qrst_skew[d][i, j], Val(NDIMS))) + normal_direction = geometric_matrix * ref_entries + AF_ij = 2 * volume_flux(u_i, u_j, normal_direction, equations) + du_local[i] = du_local[i] + AF_ij + du_local[j] = du_local[j] - AF_ij # Due to skew-symmetry + end + end end end -@inline function local_flux_differencing!(fluxdiff_local, u_local, element_index, +@inline function local_flux_differencing!(du_local, u_local, element_index, have_nonconservative_terms::True, volume_flux, has_sparse_operators::False, mesh, equations, dg, cache) + @unpack Qrst_skew = cache + NDIMS = ndims(mesh) flux_conservative, flux_nonconservative = volume_flux - for dim in eachdim(mesh) - Qi_skew = build_lazy_physical_derivative(element_index, dim, mesh, dg, cache) - # True() indicates the flux is symmetric. 
- hadamard_sum!(fluxdiff_local, Qi_skew, - True(), flux_conservative, - dim, u_local, equations) - - # The final argument .5 scales the operator by 1/2 for the nonconservative terms. - half_Qi_skew = build_lazy_physical_derivative(element_index, dim, mesh, dg, - cache, 0.5) - # False() indicates the flux is non-symmetric. - hadamard_sum!(fluxdiff_local, half_Qi_skew, - False(), flux_nonconservative, - dim, u_local, equations) + row_ids = axes(first(Qrst_skew), 1) + geometric_matrix = get_contravariant_matrix(element_index, mesh, cache) + for i in row_ids + u_i = u_local[i] + for j in row_ids + ref_entries = SVector(ntuple(d -> Qrst_skew[d][i, j], Val(NDIMS))) + normal_direction = geometric_matrix * ref_entries + # We use the symmetry of the volume flux and the anti-symmetry + # of the derivative operator to save half of the volume flux + # computations. + if j > i + u_j = u_local[j] + AF_ij = 2 * flux_conservative(u_i, u_j, normal_direction, equations) + du_local[i] = du_local[i] + AF_ij + du_local[j] = du_local[j] - AF_ij # Due to skew-symmetry + end + # Non-conservative terms use the full (non-symmetric) loop. + # The 0.5f0 factor on the normal direction is necessary for the nonconservative + # fluxes based on the interpretation of global SBP operators. + # See also `calc_interface_flux!` with `have_nonconservative_terms::True` + # in src/solvers/dgsem_tree/dg_1d.jl + f_nc = flux_nonconservative(u_i, u_local[j], 0.5f0 * normal_direction, + equations) + du_local[i] = du_local[i] + 2 * f_nc + end end end # When the operators are sparse, we use the sum-factorization approach to -# computing flux differencing. -@inline function local_flux_differencing!(fluxdiff_local, u_local, element_index, +# computing flux differencing. Each dimension has its own sparse operator with +# its own sparsity pattern (e.g., tensor-product structure on Quad/Hex elements), +# so we loop per-dimension. 
For each nonzero entry A[i,j] we evaluate the flux once +# and exploit skew-symmetry to accumulate both the (i,j) and (j,i) contributions. +@inline function local_flux_differencing!(du_local, u_local, element_index, have_nonconservative_terms::False, volume_flux, has_sparse_operators::True, mesh, equations, dg, cache) @unpack Qrst_skew = cache for dim in eachdim(mesh) - # There are two ways to write this flux differencing discretization on affine meshes. - # - # 1. Use numerical fluxes in Cartesian directions and sum up the discrete derivative - # operators per coordinate direction accordingly. - # 2. Use discrete derivative operators per coordinate direction and corresponding - # numerical fluxes in arbitrary (non-Cartesian) space directions. - # - # The first option makes it necessary to sum up the individual sparsity - # patterns of each reference coordinate direction. On tensor-product - # elements such as `Quad()` or `Hex()` elements, this increases the number of - # potentially expensive numerical flux evaluations by a factor of `ndims(mesh)`. - # Thus, we use the second option below (which basically corresponds to the - # well-known sum factorization on tensor product elements). - # Note that there is basically no difference for dense derivative operators. - normal_direction = get_contravariant_vector(element_index, dim, mesh, cache) + normal_directions = get_contravariant_vector(element_index, dim, mesh, cache) Q_skew = Qrst_skew[dim] - - # True() indicates the flux is symmetric - hadamard_sum!(fluxdiff_local, Q_skew, - True(), volume_flux, - normal_direction, u_local, equations) + A_base, row_ids, rows, vals = sparse_operator_data(Q_skew) + for i in row_ids + u_i = u_local[i] + du_i = du_local[i] + for id in nzrange(A_base, i) + j = rows[id] + # This routine computes only the upper-triangular part of the hadamard sum (A .* F). 
+ # We avoid computing the lower-triangular part, and instead accumulate those contributions + # while computing the upper-triangular part (using the fact that A is skew-symmetric and F + # is symmetric). + if j > i + u_j = u_local[j] + A_ij = vals[id] + normal_direction_ij = get_normal_direction(normal_directions, i, j) + AF_ij = 2 * A_ij * + volume_flux(u_i, u_j, normal_direction_ij, equations) + du_i = du_i + AF_ij + du_local[j] = du_local[j] - AF_ij # Due to skew-symmetry + end + end + du_local[i] = du_i + end end end -@inline function local_flux_differencing!(fluxdiff_local, u_local, element_index, +@inline function local_flux_differencing!(du_local, u_local, element_index, have_nonconservative_terms::True, volume_flux, has_sparse_operators::True, mesh, equations, dg, cache) @unpack Qrst_skew = cache flux_conservative, flux_nonconservative = volume_flux for dim in eachdim(mesh) - normal_direction = get_contravariant_vector(element_index, dim, mesh, cache) + normal_directions = get_contravariant_vector(element_index, dim, mesh, cache) Q_skew = Qrst_skew[dim] - - # True() indicates the flux is symmetric - hadamard_sum!(fluxdiff_local, Q_skew, - True(), flux_conservative, - normal_direction, u_local, equations) - - # We scale the operator by 1/2 for the nonconservative terms. - half_Q_skew = LazyMatrixLinearCombo((Q_skew,), (0.5,)) - # False() indicates the flux is non-symmetric - hadamard_sum!(fluxdiff_local, half_Q_skew, - False(), flux_nonconservative, - normal_direction, u_local, equations) + A_base, row_ids, rows, vals = sparse_operator_data(Q_skew) + for i in row_ids + u_i = u_local[i] + du_i = du_local[i] + for id in nzrange(A_base, i) + j = rows[id] + A_ij = vals[id] + u_j = u_local[j] + normal_direction_ij = get_normal_direction(normal_directions, i, j) + # Conservative part: exploit skew-symmetry (calculate upper triangular part only). 
+ if j > i + AF_ij = 2 * A_ij * + flux_conservative(u_i, u_j, normal_direction_ij, equations) + du_i = du_i + AF_ij + du_local[j] = du_local[j] - AF_ij # Due to skew-symmetry + end + # Non-conservative terms use the full (non-symmetric) loop. + # The 0.5f0 factor on the normal direction is necessary for the nonconservative + # fluxes based on the interpretation of global SBP operators. + # See also `calc_interface_flux!` with `have_nonconservative_terms::True` + # in src/solvers/dgsem_tree/dg_1d.jl + f_nc = flux_nonconservative(u_i, u_j, 0.5f0 * normal_direction_ij, + equations) + du_i = du_i + 2 * A_ij * f_nc + end + du_local[i] = du_i + end end end @@ -566,27 +424,27 @@ end @inline function volume_integral_kernel!(du, u, element, mesh::DGMultiMesh, have_nonconservative_terms, equations, volume_integral::VolumeIntegralFluxDifferencing, - dg::DGMultiFluxDiff, cache) + dg::DGMultiFluxDiff, cache, alpha = true) @unpack entropy_projected_u_values, Ph = cache - @unpack fluxdiff_local_threaded, rhs_local_threaded = cache + @unpack du_local_threaded, rhs_local_threaded = cache - fluxdiff_local = fluxdiff_local_threaded[Threads.threadid()] - fill!(fluxdiff_local, zero(eltype(fluxdiff_local))) + du_local = du_local_threaded[Threads.threadid()] + fill!(du_local, zero(eltype(du_local))) u_local = view(entropy_projected_u_values, :, element) - local_flux_differencing!(fluxdiff_local, u_local, element, + local_flux_differencing!(du_local, u_local, element, have_nonconservative_terms, volume_integral.volume_flux, has_sparse_operators(dg), mesh, equations, dg, cache) - # convert fluxdiff_local::Vector{<:SVector} to StructArray{<:SVector} for faster + # convert du_local::Vector{<:SVector} to StructArray{<:SVector} for faster # apply_to_each_field performance. 
rhs_local = rhs_local_threaded[Threads.threadid()] - for i in Base.OneTo(length(fluxdiff_local)) - rhs_local[i] = fluxdiff_local[i] + for i in Base.OneTo(length(du_local)) + rhs_local[i] = du_local[i] end - apply_to_each_field(mul_by_accum!(Ph), view(du, :, element), rhs_local) + apply_to_each_field(mul_by_accum!(Ph, alpha), view(du, :, element), rhs_local) return nothing end @@ -594,21 +452,22 @@ end @inline function volume_integral_kernel!(du, u, element, mesh::DGMultiMesh, have_nonconservative_terms, equations, volume_integral::VolumeIntegralFluxDifferencing, - dg::DGMultiFluxDiffSBP, cache) - @unpack fluxdiff_local_threaded, inv_wq = cache + dg::DGMultiFluxDiffSBP, cache, + alpha = true) + @unpack du_local_threaded, inv_wq = cache - fluxdiff_local = fluxdiff_local_threaded[Threads.threadid()] - fill!(fluxdiff_local, zero(eltype(fluxdiff_local))) + du_local = du_local_threaded[Threads.threadid()] + fill!(du_local, zero(eltype(du_local))) u_local = view(u, :, element) - local_flux_differencing!(fluxdiff_local, u_local, element, + local_flux_differencing!(du_local, u_local, element, have_nonconservative_terms, volume_integral.volume_flux, has_sparse_operators(dg), mesh, equations, dg, cache) for i in each_quad_node(mesh, dg, cache) - du[i, element] = du[i, element] + fluxdiff_local[i] * inv_wq[i] + du[i, element] = du[i, element] + alpha * du_local[i] * inv_wq[i] end return nothing diff --git a/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl b/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl index 5185a7dbec5..1b8a257d0a2 100644 --- a/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl +++ b/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl @@ -454,9 +454,10 @@ function entropy_projection!(cache, u, mesh::DGMultiMesh, equations, dg::DGMultiFluxDiff{<:GaussSBP}) rd = dg.basis @unpack Vq = rd - @unpack VhP, entropy_var_values, u_values = cache + @unpack VhP, entropy_var_values = cache @unpack projected_entropy_var_values, entropy_projected_u_values = cache 
@unpack interp_matrix_lobatto_to_gauss, interp_matrix_gauss_to_face = cache + (; u_values) = cache.solution_container @threaded for e in eachelement(mesh, dg, cache) apply_to_each_field(mul_by!(interp_matrix_lobatto_to_gauss), @@ -489,7 +490,7 @@ function entropy_projection!(cache, u, mesh::DGMultiMesh, equations, return nothing end -# Assumes cache.flux_face_values is already computed. +# Assumes cache.solution_container.flux_face_values is already computed. # Enables tensor product evaluation of `LIFT isa TensorProductGaussFaceOperator`. function calc_surface_integral!(du, u, mesh::DGMultiMesh, equations, surface_integral::SurfaceIntegralWeakForm, @@ -501,7 +502,7 @@ function calc_surface_integral!(du, u, mesh::DGMultiMesh, equations, # applies LIFT matrix, output is stored at Gauss nodes gauss_volume_local = gauss_volume_local_threaded[Threads.threadid()] apply_to_each_field(mul_by!(gauss_LIFT), gauss_volume_local, - view(cache.flux_face_values, :, e)) + view(cache.solution_container.flux_face_values, :, e)) for i in eachindex(gauss_volume_local) du[i, e] = du[i, e] + gauss_volume_local[i] @@ -511,29 +512,6 @@ function calc_surface_integral!(du, u, mesh::DGMultiMesh, equations, return nothing end -@inline function flux_differencing_kernel!(du, u, element, mesh::DGMultiMesh, - have_nonconservative_terms, equations, - volume_flux, dg::DGMultiFluxDiff{<:GaussSBP}, - cache, alpha = true) - fluxdiff_local = cache.fluxdiff_local_threaded[Threads.threadid()] - fill!(fluxdiff_local, zero(eltype(fluxdiff_local))) - u_local = view(cache.entropy_projected_u_values, :, element) - - local_flux_differencing!(fluxdiff_local, u_local, element, - have_nonconservative_terms, - volume_flux, has_sparse_operators(dg), - mesh, equations, dg, cache) - - # convert `fluxdiff_local::Vector{<:SVector}` to `rhs_local::StructArray{<:SVector}` - # for faster performance when using `apply_to_each_field`. 
- rhs_local = cache.rhs_local_threaded[Threads.threadid()] - for i in Base.OneTo(length(fluxdiff_local)) - rhs_local[i] = fluxdiff_local[i] - end - - return project_rhs_to_gauss_nodes!(du, rhs_local, element, mesh, dg, cache, alpha) -end - function project_rhs_to_gauss_nodes!(du, rhs_local, element, mesh::DGMultiMesh, dg::DGMulti, cache, alpha = true) @@ -563,14 +541,26 @@ end function volume_integral_kernel!(du, u, element, mesh::DGMultiMesh, have_nonconservative_terms, equations, volume_integral::VolumeIntegralFluxDifferencing, - dg::DGMultiFluxDiff{<:GaussSBP}, cache) + dg::DGMultiFluxDiff{<:GaussSBP}, cache, alpha = true) (; volume_flux) = volume_integral - flux_differencing_kernel!(du, u, element, mesh, - have_nonconservative_terms, equations, - volume_flux, dg, cache) + du_local = cache.du_local_threaded[Threads.threadid()] + fill!(du_local, zero(eltype(du_local))) + u_local = view(cache.entropy_projected_u_values, :, element) - return nothing + local_flux_differencing!(du_local, u_local, element, + have_nonconservative_terms, + volume_flux, has_sparse_operators(dg), + mesh, equations, dg, cache) + + # convert `du_local::Vector{<:SVector}` to `rhs_local::StructArray{<:SVector}` + # for faster performance when using `apply_to_each_field`. 
+ rhs_local = cache.rhs_local_threaded[Threads.threadid()] + for i in Base.OneTo(length(du_local)) + rhs_local[i] = du_local[i] + end + + return project_rhs_to_gauss_nodes!(du, rhs_local, element, mesh, dg, cache, alpha) end # interpolate back to Lobatto nodes after applying the inverse Jacobian at Gauss points diff --git a/src/solvers/dgmulti/sbp.jl b/src/solvers/dgmulti/sbp.jl index a6f7fbf844a..2d75b18049f 100644 --- a/src/solvers/dgmulti/sbp.jl +++ b/src/solvers/dgmulti/sbp.jl @@ -179,10 +179,9 @@ function create_cache(mesh::DGMultiMesh, equations, dg::DGMultiFluxDiffPeriodicFDSBP, RealT, uEltype) md = mesh.md - # storage for volume quadrature values, face quadrature values, flux values - nvars = nvariables(equations) - u_values = allocate_nested_array(uEltype, nvars, size(md.xq), dg) - return (; u_values, invJ = inv.(md.J)) + solution_container = initialize_dgmulti_solution_container(mesh, equations, dg, + uEltype) + return (; solution_container, invJ = inv.(md.J)) end # Specialize calc_volume_integral for periodic SBP operators (assumes the operator is sparse). @@ -208,11 +207,7 @@ function calc_volume_integral!(du, u, mesh::DGMultiMesh, # This would have to be changed if `have_nonconservative_terms = False()` # because then `volume_flux` is non-symmetric. A = dg.basis.Drst[dim] - - A_base = parent(A) # the adjoint of a SparseMatrixCSC is basically a SparseMatrixCSR - row_ids = axes(A, 2) - rows = rowvals(A_base) - vals = nonzeros(A_base) + A_base, row_ids, rows, vals = sparse_operator_data(A) @threaded for i in row_ids u_i = u[i] @@ -234,19 +229,35 @@ function calc_volume_integral!(du, u, mesh::DGMultiMesh, else # if using two threads or fewer - # Calls `hadamard_sum!``, which uses symmetry to reduce flux evaluations. Symmetry - # is expected to yield about a 2x speedup, so we default to the symmetry-exploiting - # volume integral unless we have >2 threads (which should yield >2 speedup). 
+ # Exploit skew-symmetry to halve the number of flux evaluations (≈2x speedup). + # A = Drst[dim] is skew-symmetric for periodic FD-SBP on uniform grids, so + # A[i,j] = -A[j,i]. The stored CSC value vals[id] = A[j,i] = -A[i,j], hence + # we use -vals[id] to recover A[i,j], matching the multithreaded branch above. for dim in eachdim(mesh) normal_direction = get_contravariant_vector(1, dim, mesh, cache) A = dg.basis.Drst[dim] + A_base, row_ids, rows, vals = sparse_operator_data(A) - # since have_nonconservative_terms::False, - # the volume flux is symmetric. - flux_is_symmetric = True() - hadamard_sum!(du, A, flux_is_symmetric, volume_flux, - normal_direction, u, equations) + for i in row_ids + u_i = u[i] + du_i = du[i] + for id in nzrange(A_base, i) + j = rows[id] + # We use the symmetry of the volume flux and the anti-symmetry + # of the derivative operator to save half of the volume flux + # computations. + if j > i + A_ij = -vals[id] # A[j,i] stored; skew-symmetry: -A[j,i] = A[i,j] + u_j = u[j] + AF_ij = 2 * A_ij * + volume_flux(u_i, u_j, normal_direction, equations) + du_i = du_i + AF_ij + du[j] = du[j] - AF_ij # Due to skew-symmetry + end + end + du[i] = du_i + end end end diff --git a/src/solvers/dgmulti/shock_capturing.jl b/src/solvers/dgmulti/shock_capturing.jl index d1c9cc3e9d2..b018c7f5894 100644 --- a/src/solvers/dgmulti/shock_capturing.jl +++ b/src/solvers/dgmulti/shock_capturing.jl @@ -7,20 +7,7 @@ function create_cache(mesh::DGMultiMesh{NDIMS}, equations, @assert volume_integral_blend_high_order isa VolumeIntegralFluxDifferencing "DGMulti is currently only compatible with `VolumeIntegralFluxDifferencing` as `volume_integral_blend_high_order`" # `volume_integral_blend_low_order` limited to finite-volume on Gauss-node subcells - # build element to element (element_to_element_connectivity) connectivity for smoothing of - # shock capturing parameters. 
- face_to_face_connectivity = mesh.md.FToF # num_faces x num_elements matrix - element_to_element_connectivity = similar(face_to_face_connectivity) - for e in axes(face_to_face_connectivity, 2) - for f in axes(face_to_face_connectivity, 1) - neighbor_face_index = face_to_face_connectivity[f, e] - - # reverse-engineer element index from face. Assumes all elements - # have the same number of faces. - neighbor_element_index = ((neighbor_face_index - 1) ÷ dg.basis.num_faces) + 1 - element_to_element_connectivity[f, e] = neighbor_element_index - end - end + element_to_element_connectivity = build_element_to_element_connectivity(mesh, dg) # create sparse hybridized operators for low order scheme Qrst, E = StartUpDG.sparse_low_order_SBP_operators(dg.basis) @@ -196,10 +183,10 @@ function calc_volume_integral!(du, u, mesh::DGMultiMesh, dg, cache, 1 - alpha_element) # Calculate "FV" low order volume integral contribution - low_order_flux_differencing_kernel(du, u, element, mesh, - have_nonconservative_terms, equations, - volume_integral_blend_low_order, - dg, cache, alpha_element) + volume_integral_kernel!(du, u, element, mesh, + have_nonconservative_terms, equations, + volume_integral_blend_low_order, + dg, cache, alpha_element) end end @@ -261,57 +248,30 @@ function get_avg_contravariant_matrix(i, j, element, mesh::DGMultiMesh, cache) get_contravariant_matrix(j, element, mesh, cache)) end -# computes an algebraic low order method with internal dissipation. 
-# This method is for affine/Cartesian meshes -function low_order_flux_differencing_kernel(du, u, element, - mesh::DGMultiMesh, - have_nonconservative_terms::False, equations, - volume_integral, - dg::DGMultiFluxDiff{<:GaussSBP}, - cache, alpha = true) - (; volume_flux_fv) = volume_integral - - # accumulates output from flux differencing - rhs_local = cache.rhs_local_threaded[Threads.threadid()] - fill!(rhs_local, zero(eltype(rhs_local))) - - u_local = view(cache.entropy_projected_u_values, :, element) - - # constant over each element - geometric_matrix = get_contravariant_matrix(element, mesh, cache) - - (; sparsity_pattern) = cache - A_base = parent(sparsity_pattern) # the adjoint of a SparseMatrixCSC is basically a SparseMatrixCSR - row_ids, rows = axes(sparsity_pattern, 2), rowvals(A_base) - for i in row_ids - u_i = u_local[i] - du_i = zero(u_i) - for id in nzrange(A_base, i) - j = rows[id] - u_j = u_local[j] - - # compute (Q_1[i,j], Q_2[i,j], ...) where Q_i = ∑_j dxidxhatj * Q̂_j - reference_operator_entries = get_sparse_operator_entries(i, j, mesh, cache) - normal_direction_ij = geometric_matrix * reference_operator_entries - - # note that we do not need to normalize `normal_direction_ij` since - # it is typically normalized within the flux computation. - f_ij = volume_flux_fv(u_i, u_j, normal_direction_ij, equations) - du_i = du_i + 2 * f_ij - end - rhs_local[i] = du_i - end +# On affine meshes, the geometric matrix is constant over the element, so we compute it +# once and reuse it for all node pairs (i, j). The compiler is expected to hoist this +# out of the inner loop after inlining. +@inline function get_low_order_geometric_matrix(i, j, element, + mesh::DGMultiMesh{NDIMS, <:Affine}, + cache) where {NDIMS} + return get_contravariant_matrix(element, mesh, cache) +end - # TODO: factor this out to avoid calling it twice during calc_volume_integral! 
- return project_rhs_to_gauss_nodes!(du, rhs_local, element, mesh, dg, cache, alpha) +# On non-affine meshes, we use the average of the geometric matrices at nodes i and j +# for provably entropy-stable de-aliasing of the geometric terms. +@inline function get_low_order_geometric_matrix(i, j, element, + mesh::DGMultiMesh, + cache) + return get_avg_contravariant_matrix(i, j, element, mesh, cache) end -function low_order_flux_differencing_kernel(du, u, element, - mesh::DGMultiMesh{NDIMS, <:NonAffine}, - have_nonconservative_terms::False, equations, - volume_integral, - dg::DGMultiFluxDiff{<:GaussSBP}, - cache, alpha = true) where {NDIMS} +# Calculates the volume integral corresponding to an algebraic low order method. +# This is used, for example, in shock capturing. +function volume_integral_kernel!(du, u, element, mesh::DGMultiMesh, + have_nonconservative_terms::False, equations, + volume_integral::VolumeIntegralPureLGLFiniteVolume, + dg::DGMultiFluxDiff{<:GaussSBP}, cache, + alpha = true) (; volume_flux_fv) = volume_integral # accumulates output from flux differencing @@ -321,8 +281,7 @@ function low_order_flux_differencing_kernel(du, u, element, u_local = view(cache.entropy_projected_u_values, :, element) (; sparsity_pattern) = cache - A_base = parent(sparsity_pattern) # the adjoint of a SparseMatrixCSC is basically a SparseMatrixCSR - row_ids, rows = axes(sparsity_pattern, 2), rowvals(A_base) + A_base, row_ids, rows, _ = sparse_operator_data(sparsity_pattern) for i in row_ids u_i = u_local[i] du_i = zero(u_i) @@ -331,7 +290,7 @@ function low_order_flux_differencing_kernel(du, u, element, u_j = u_local[j] # compute (Q_1[i,j], Q_2[i,j], ...) 
where Q_i = ∑_j dxidxhatj * Q̂_j - geometric_matrix = get_avg_contravariant_matrix(i, j, element, mesh, cache) + geometric_matrix = get_low_order_geometric_matrix(i, j, element, mesh, cache) reference_operator_entries = get_sparse_operator_entries(i, j, mesh, cache) normal_direction_ij = geometric_matrix * reference_operator_entries diff --git a/src/solvers/dgmulti/types.jl b/src/solvers/dgmulti/types.jl index 110432d254c..1b408ad57f6 100644 --- a/src/solvers/dgmulti/types.jl +++ b/src/solvers/dgmulti/types.jl @@ -22,9 +22,10 @@ const DGMultiWeakForm{ApproxType, ElemType} = DGMulti{NDIMS, ElemType, ApproxTyp const DGMultiFluxDiff{ApproxType, ElemType} = DGMulti{NDIMS, ElemType, ApproxType, <:SurfaceIntegralWeakForm, <:Union{VolumeIntegralFluxDifferencing, - VolumeIntegralShockCapturingHGType}} where { - NDIMS - } + VolumeIntegralShockCapturingHGType, + VolumeIntegralAdaptiveEC_WF_DG}} where { + NDIMS + } const DGMultiFluxDiffSBP{ApproxType, ElemType} = DGMulti{NDIMS, ElemType, ApproxType, <:SurfaceIntegralWeakForm, @@ -340,30 +341,6 @@ function DGMultiMesh(dg::DGMulti{NDIMS}, filename::String; return DGMultiMesh(dg, GeometricTermsType(Curved(), dg), md, boundary_faces) end -# Matrix type for lazy construction of physical differentiation matrices -# Constructs a lazy linear combination of B = ∑_i coeffs[i] * A[i] -struct LazyMatrixLinearCombo{Tcoeffs, N, Tv, TA <: AbstractMatrix{Tv}} <: - AbstractMatrix{Tv} - matrices::NTuple{N, TA} - coeffs::NTuple{N, Tcoeffs} - function LazyMatrixLinearCombo(matrices, coeffs) - @assert all(matrix -> size(matrix) == size(first(matrices)), matrices) - return new{typeof(first(coeffs)), length(matrices), eltype(first(matrices)), - typeof(first(matrices))}(matrices, coeffs) - end -end -Base.eltype(A::LazyMatrixLinearCombo) = eltype(first(A.matrices)) -Base.IndexStyle(A::LazyMatrixLinearCombo) = IndexCartesian() -Base.size(A::LazyMatrixLinearCombo) = size(first(A.matrices)) - -@inline function 
Base.getindex(A::LazyMatrixLinearCombo{<:Real, N}, i, j) where {N} - val = zero(eltype(A)) - for k in Base.OneTo(N) - val = val + A.coeffs[k] * getindex(A.matrices[k], i, j) - end - return val -end - # `SimpleKronecker` lazily stores a Kronecker product `kron(ntuple(A, NDIMS)...)`. # This object also allocates some temporary storage to enable the fast computation # of matrix-vector products. diff --git a/src/solvers/dgmulti/volume_integral_adaptive.jl b/src/solvers/dgmulti/volume_integral_adaptive.jl new file mode 100644 index 00000000000..2fc2eaa079f --- /dev/null +++ b/src/solvers/dgmulti/volume_integral_adaptive.jl @@ -0,0 +1,102 @@ +# By default, Julia/LLVM does not use fused multiply-add operations (FMAs). +# Since these FMAs can increase the performance of many numerical algorithms, +# we need to opt-in explicitly. +# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. +@muladd begin +#! format: noindent + +function create_cache(mesh::DGMultiMesh, equations, + dg::DGMulti{NDIMS, ElemType, <:Polynomial, + <:SurfaceIntegralWeakForm, + <:VolumeIntegralAdaptive{<:IndicatorEntropyChange, + <:VolumeIntegralWeakForm, + <:VolumeIntegralFluxDifferencing}}, + RealT, uEltype) where {NDIMS, ElemType} + # Construct temporary solvers for each sub-integral to reuse the `create_cache` functions + + # `VolumeIntegralAdaptive` for `DGMulti` currently limited to Weak Form & Flux Differencing combi + dg_WF = DG(dg.basis, dg.mortar, dg.surface_integral, + dg.volume_integral.volume_integral_default) + dg_FD = DG(dg.basis, dg.mortar, dg.surface_integral, + dg.volume_integral.volume_integral_stabilized) + + cache_WF = create_cache(mesh, equations, dg_WF, RealT, uEltype) + cache_FD = create_cache(mesh, equations, dg_FD, RealT, uEltype) + + # Set up structures required for `IndicatorEntropyChange` + rd = dg.basis + nvars = nvariables(equations) + + # Required for entropy change computation (`entropy_change_reference_element`) + du_values = 
similar(cache_FD.solution_container.u_values) + + # Thread-local buffer for face interpolation, which is required + # for computation of entropy potential at interpolated face nodes + # (`surface_integral_reference_element`) + u_face_local_threaded = [allocate_nested_array(uEltype, nvars, (rd.Nfq,), dg) + for _ in 1:Threads.maxthreadid()] + + return (; cache_FD..., + # Weak-form-specific fields for the default volume integral + weak_differentiation_matrices = cache_WF.weak_differentiation_matrices, + flux_threaded = cache_WF.flux_threaded, + rotated_flux_threaded = cache_WF.rotated_flux_threaded, # For non-affine meshes + # Required for `IndicatorEntropyChange` + du_values, u_face_local_threaded) +end + +# version for affine meshes (currently only supported one for `VolumeIntegralAdaptive`) +function calc_volume_integral!(du, u, mesh::DGMultiMesh, + have_nonconservative_terms::False, equations, + volume_integral::VolumeIntegralAdaptive{<:IndicatorEntropyChange}, + dg::DGMultiFluxDiff, cache) + @unpack volume_integral_default, volume_integral_stabilized = volume_integral + @unpack maximum_entropy_increase = volume_integral.indicator + + # For weak form integral + @unpack u_values = cache.solution_container + + # For entropy production computation + rd = dg.basis + @unpack du_values = cache + + # interpolate to quadrature points + apply_to_each_field(mul_by!(rd.Vq), u_values, u) # required for weak form trial + + @threaded for e in eachelement(dg, cache) + # Try default volume integral first + volume_integral_kernel!(du, u, e, mesh, + have_nonconservative_terms, equations, + volume_integral_default, dg, cache) + + # Interpolate `du` to quadrature points after WF integral for entropy production calculation + du_local = view(du, :, e) + du_values_local = view(du_values, :, e) + apply_to_each_field(mul_by!(rd.Vq), du_values_local, du_local) # required for entropy production calculation + + # Compute entropy production of this volume integral + u_values_local = 
view(u_values, :, e) + dS_WF = -entropy_change_reference_element(du_values_local, u_values_local, + mesh, equations, + dg, cache) + + dS_true = surface_integral_reference_element(entropy_potential, u, e, + mesh, equations, dg, cache) + + entropy_change = dS_WF - dS_true + if entropy_change > maximum_entropy_increase # Recompute using EC FD volume integral + # Reset default volume integral contribution. + # Note that this assumes that the volume terms are computed first, + # before any surface terms are added. + fill!(du_local, zero(eltype(du_local))) + + # Recompute using stabilized volume integral. Note that the calculation of this volume integral requires the calculation of the entropy projection, which is done in `rhs!` specialized on the `DGMultiFluxDiff` solver type. + volume_integral_kernel!(du, u, e, mesh, + have_nonconservative_terms, equations, + volume_integral_stabilized, dg, cache) + end + end + + return nothing +end +end # @muladd diff --git a/src/solvers/dgsem/calc_volume_integral.jl b/src/solvers/dgsem/calc_volume_integral.jl index 502e31c9f36..d7017a3ceb4 100644 --- a/src/solvers/dgsem/calc_volume_integral.jl +++ b/src/solvers/dgsem/calc_volume_integral.jl @@ -8,44 +8,44 @@ # The following `volume_integral_kernel!` and `calc_volume_integral!` functions are # dimension and meshtype agnostic, i.e., valid for all 1D, 2D, and 3D meshes. 
-@inline function volume_integral_kernel!(du, u, element, mesh, +@inline function volume_integral_kernel!(du, u, element, MeshT, have_nonconservative_terms, equations, volume_integral::VolumeIntegralWeakForm, dg, cache, alpha = true) - weak_form_kernel!(du, u, element, mesh, + weak_form_kernel!(du, u, element, MeshT, have_nonconservative_terms, equations, dg, cache, alpha) return nothing end -@inline function volume_integral_kernel!(du, u, element, mesh, +@inline function volume_integral_kernel!(du, u, element, MeshT, have_nonconservative_terms, equations, volume_integral::VolumeIntegralFluxDifferencing, dg, cache, alpha = true) @unpack volume_flux = volume_integral # Volume integral specific data - flux_differencing_kernel!(du, u, element, mesh, + flux_differencing_kernel!(du, u, element, MeshT, have_nonconservative_terms, equations, volume_flux, dg, cache, alpha) return nothing end -@inline function volume_integral_kernel!(du, u, element, mesh, +@inline function volume_integral_kernel!(du, u, element, MeshT, have_nonconservative_terms, equations, volume_integral::VolumeIntegralPureLGLFiniteVolume, dg::DGSEM, cache, alpha = true) @unpack volume_flux_fv = volume_integral # Volume integral specific data - fv_kernel!(du, u, mesh, + fv_kernel!(du, u, MeshT, have_nonconservative_terms, equations, volume_flux_fv, dg, cache, element, alpha) return nothing end -@inline function volume_integral_kernel!(du, u, element, mesh, +@inline function volume_integral_kernel!(du, u, element, MeshT, have_nonconservative_terms, equations, volume_integral::VolumeIntegralPureLGLFiniteVolumeO2, dg::DGSEM, cache, alpha = true) @@ -53,7 +53,7 @@ end @unpack (sc_interface_coords, volume_flux_fv, reconstruction_mode, slope_limiter, cons2recon, recon2cons) = volume_integral - fvO2_kernel!(du, u, mesh, + fvO2_kernel!(du, u, MeshT, have_nonconservative_terms, equations, volume_flux_fv, dg, cache, element, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -63,14 +63,14 @@ end return 
nothing end -@inline function volume_integral_kernel!(du, u, element, mesh, +@inline function volume_integral_kernel!(du, u, element, MeshT, have_nonconservative_terms, equations, volume_integral::VolumeIntegralAdaptive{<:IndicatorEntropyChange}, dg::DGSEM, cache) @unpack volume_integral_default, volume_integral_stabilized, indicator = volume_integral @unpack maximum_entropy_increase = indicator - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, MeshT, have_nonconservative_terms, equations, volume_integral_default, dg, cache) @@ -79,11 +79,11 @@ end # No scaling by inverse Jacobian here, as there is no Jacobian multiplication # in `integrate_reference_element`. dS_default = -entropy_change_reference_element(du, u, element, - mesh, equations, dg, cache) + MeshT, equations, dg, cache) # Compute true entropy change given by surface integral of the entropy potential dS_true = surface_integral_reference_element(entropy_potential, u, element, - mesh, equations, dg, cache) + MeshT, equations, dg, cache) entropy_change = dS_default - dS_true if entropy_change > maximum_entropy_increase # Recompute using EC FD volume integral @@ -92,7 +92,7 @@ end # before any surface terms are added. 
du[.., element] .= zero(eltype(du)) - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, MeshT, have_nonconservative_terms, equations, volume_integral_stabilized, dg, cache) end @@ -100,7 +100,7 @@ end return nothing end -@inline function volume_integral_kernel!(du, u, element, mesh, +@inline function volume_integral_kernel!(du, u, element, MeshT, have_nonconservative_terms, equations, volume_integral::VolumeIntegralEntropyCorrection, dg::DGSEM, cache) @@ -110,7 +110,7 @@ end du_element_threaded = indicator.cache.volume_integral_values_threaded # run default volume integral - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, MeshT, have_nonconservative_terms, equations, volume_integral_default, dg, cache) @@ -125,12 +125,12 @@ end # No scaling by inverse Jacobian here, as there is no Jacobian multiplication # in `integrate_reference_element`. dS_volume_integral = -entropy_change_reference_element(du, u, element, - mesh, equations, + MeshT, equations, dg, cache) # Compute true entropy change given by surface integral of the entropy potential dS_true = surface_integral_reference_element(entropy_potential, u, element, - mesh, equations, dg, cache) + MeshT, equations, dg, cache) # This quantity should be ≤ 0 for an entropy stable volume integral, and # exactly zero for an entropy conservative volume integral. 
@@ -147,13 +147,13 @@ end du[.., element] .= zero(eltype(du)) # Calculate entropy stable volume integral contribution - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, MeshT, have_nonconservative_terms, equations, volume_integral_stabilized, dg, cache) dS_volume_integral_stabilized = -entropy_change_reference_element(du, u, element, - mesh, + MeshT, equations, dg, cache) @@ -177,11 +177,11 @@ end return nothing end -function calc_volume_integral!(du, u, mesh, +function calc_volume_integral!(backend::Nothing, du, u, mesh, have_nonconservative_terms, equations, volume_integral, dg::DGSEM, cache) @threaded for element in eachelement(dg, cache) - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, typeof(mesh), have_nonconservative_terms, equations, volume_integral, dg, cache) end @@ -189,7 +189,27 @@ function calc_volume_integral!(du, u, mesh, return nothing end -function calc_volume_integral!(du, u, mesh, +function calc_volume_integral!(backend::Backend, du, u, mesh, + have_nonconservative_terms, equations, + volume_integral, dg::DGSEM, cache) + nelements(dg, cache) == 0 && return nothing + kernel! 
= volume_integral_KAkernel!(backend) + kernel_cache = kernel_filter_cache(cache) + kernel!(du, u, typeof(mesh), have_nonconservative_terms, equations, + volume_integral, dg, kernel_cache, + ndrange = nelements(dg, cache)) + return nothing +end + +@kernel function volume_integral_KAkernel!(du, u, MeshT, + have_nonconservative_terms, equations, + volume_integral, dg::DGSEM, cache) + element = @index(Global) + volume_integral_kernel!(du, u, element, MeshT, have_nonconservative_terms, + equations, volume_integral, dg, cache) +end + +function calc_volume_integral!(backend::Nothing, du, u, mesh, have_nonconservative_terms, equations, volume_integral::VolumeIntegralShockCapturingHGType, dg::DGSEM, cache) @@ -210,18 +230,18 @@ function calc_volume_integral!(du, u, mesh, dg_only = isapprox(alpha_element, 0, atol = atol) if dg_only - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, typeof(mesh), have_nonconservative_terms, equations, volume_integral_default, dg, cache) else # Calculate DG volume integral contribution - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, typeof(mesh), have_nonconservative_terms, equations, volume_integral_blend_high_order, dg, cache, 1 - alpha_element) # Calculate FV volume integral contribution - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, typeof(mesh), have_nonconservative_terms, equations, volume_integral_blend_low_order, dg, cache, alpha_element) @@ -231,7 +251,7 @@ function calc_volume_integral!(du, u, mesh, return nothing end -function calc_volume_integral!(du, u, mesh, +function calc_volume_integral!(backend::Nothing, du, u, mesh, have_nonconservative_terms, equations, volume_integral::VolumeIntegralEntropyCorrectionShockCapturingCombined, dg::DGSEM, cache) @@ -250,7 +270,7 @@ function calc_volume_integral!(du, u, mesh, @threaded for element in eachelement(dg, cache) # run default volume integral - 
volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, typeof(mesh), have_nonconservative_terms, equations, volume_integral_default, dg, cache) @@ -265,12 +285,12 @@ function calc_volume_integral!(du, u, mesh, # No scaling by inverse Jacobian here, as there is no Jacobian multiplication # in `integrate_reference_element`. dS_volume_integral = -entropy_change_reference_element(du, u, element, - mesh, equations, + typeof(mesh), equations, dg, cache) # Compute true entropy change given by surface integral of the entropy potential dS_true = surface_integral_reference_element(entropy_potential, u, element, - mesh, equations, dg, cache) + typeof(mesh), equations, dg, cache) # This quantity should be ≤ 0 for an entropy stable volume integral, and # exactly zero for an entropy conservative volume integral. @@ -287,13 +307,13 @@ function calc_volume_integral!(du, u, mesh, du[.., element] .= zero(eltype(du)) # Calculate entropy stable volume integral contribution - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, typeof(mesh), have_nonconservative_terms, equations, volume_integral_stabilized, dg, cache) dS_volume_integral_stabilized = -entropy_change_reference_element(du, u, element, - mesh, + typeof(mesh), equations, dg, cache) diff --git a/src/solvers/dgsem/special_volume_integrals.jl b/src/solvers/dgsem/special_volume_integrals.jl index 19ab8be7952..16cb0250599 100644 --- a/src/solvers/dgsem/special_volume_integrals.jl +++ b/src/solvers/dgsem/special_volume_integrals.jl @@ -7,6 +7,10 @@ # This file contains some specialized volume integrals that require some indicators already to be defined. 
+const VolumeIntegralAdaptiveEC_WF_DG = VolumeIntegralAdaptive{<:IndicatorEntropyChange, + <:VolumeIntegralWeakForm, + <:VolumeIntegralFluxDifferencing} + """ VolumeIntegralEntropyCorrection(indicator, volume_integral_default, diff --git a/src/solvers/dgsem_p4est/containers.jl b/src/solvers/dgsem_p4est/containers.jl index 7ac6febf470..2dd360b50cc 100644 --- a/src/solvers/dgsem_p4est/containers.jl +++ b/src/solvers/dgsem_p4est/containers.jl @@ -920,7 +920,7 @@ function count_required_surfaces(mesh::P4estMesh) end # Return direction of the face, which is indexed by node_indices -@inline function indices2direction(indices) +@inline function indices2direction(indices::NTuple{3, Symbol}) if indices[1] === :begin return 1 elseif indices[1] === :end @@ -936,6 +936,24 @@ end end end +@inline function indices2direction(indices::NTuple{2, Symbol}) + if indices[1] === :begin + return 1 + elseif indices[1] === :end + return 2 + elseif indices[2] === :begin + return 3 + else # if indices[2] === :end + return 4 + end +end + +# Build a reduced cache which can be passed to GPU kernels +@inline function kernel_filter_cache(cache) + return (; + elements = (; contravariant_vectors = cache.elements.contravariant_vectors)) +end + include("containers_2d.jl") include("containers_3d.jl") include("containers_parallel.jl") diff --git a/src/solvers/dgsem_p4est/dg.jl b/src/solvers/dgsem_p4est/dg.jl index df31a948213..59d840e4dfb 100644 --- a/src/solvers/dgsem_p4est/dg.jl +++ b/src/solvers/dgsem_p4est/dg.jl @@ -48,17 +48,16 @@ function create_cache(mesh::P4estMeshView, equations::AbstractEquations, dg::DG, mortars_parent = init_mortars(mesh.parent, equations, dg.basis, elements_parent) # Extract data for views. 
- elements, interfaces, boundaries, mortars = extract_p4est_mesh_view(elements_parent, - interfaces_parent, - boundaries_parent, - mortars_parent, - mesh, - equations, - dg, - uEltype) + elements, interfaces, boundaries, mortars, neighbor_ids_parent = extract_p4est_mesh_view(elements_parent, + interfaces_parent, + boundaries_parent, + mortars_parent, + mesh, + equations, + dg, + uEltype) - # Container cache - cache = (; elements, interfaces, boundaries, mortars) + cache = (; elements, interfaces, boundaries, mortars, neighbor_ids_parent) # Add Volume-Integral cache cache = (; cache..., @@ -94,6 +93,8 @@ include("dg_3d.jl") include("dg_3d_parabolic.jl") include("dg_parallel.jl") +# Subcell limiters +include("subcell_limiters.jl") include("subcell_limiters_2d.jl") include("dg_2d_subcell_limiters.jl") include("subcell_limiters_3d.jl") diff --git a/src/solvers/dgsem_p4est/dg_2d.jl b/src/solvers/dgsem_p4est/dg_2d.jl index d4cd609c6bd..94c477ca8e6 100644 --- a/src/solvers/dgsem_p4est/dg_2d.jl +++ b/src/solvers/dgsem_p4est/dg_2d.jl @@ -62,125 +62,218 @@ end end end -function prolong2interfaces!(cache, u, - mesh::Union{P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, +function prolong2interfaces!(backend::Nothing, cache, u, + mesh::Union{P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, equations, dg::DG) @unpack interfaces = cache + @unpack neighbor_ids, node_indices = cache.interfaces index_range = eachnode(dg) @threaded for interface in eachinterface(dg, cache) - # Copy solution data from the primary element using "delayed indexing" with - # a start value and a step size to get the correct face and orientation. - # Note that in the current implementation, the interface will be - # "aligned at the primary element", i.e., the index of the primary side - # will always run forwards. 
- primary_element = interfaces.neighbor_ids[1, interface] - primary_indices = interfaces.node_indices[1, interface] + prolong2interfaces_per_interface!(interfaces.u, u, interface, typeof(mesh), + equations, neighbor_ids, node_indices, + index_range) + end + return nothing +end + +function prolong2interfaces!(backend::Backend, cache, u, + mesh::Union{P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + equations, dg::DG) + @unpack interfaces = cache + ninterfaces(interfaces) == 0 && return nothing + @unpack neighbor_ids, node_indices = cache.interfaces + index_range = eachnode(dg) + + kernel! = prolong2interfaces_KAkernel!(backend) + kernel!(interfaces.u, u, typeof(mesh), equations, neighbor_ids, node_indices, + index_range, ndrange = ninterfaces(interfaces)) + return nothing +end - i_primary_start, i_primary_step = index_to_start_step_2d(primary_indices[1], +@kernel function prolong2interfaces_KAkernel!(interfaces_u, u, + mT::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + equations, neighbor_ids, + node_indices, index_range) + interface = @index(Global) + prolong2interfaces_per_interface!(interfaces_u, u, interface, mT, equations, + neighbor_ids, node_indices, index_range) +end + +@inline function prolong2interfaces_per_interface!(interfaces_u, u, interface, + ::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + equations, neighbor_ids, + node_indices, + index_range) + primary_element = neighbor_ids[1, interface] + primary_indices = node_indices[1, interface] + + i_primary_start, i_primary_step = index_to_start_step_2d(primary_indices[1], + index_range) + j_primary_start, j_primary_step = index_to_start_step_2d(primary_indices[2], + index_range) + + i_primary = i_primary_start + j_primary = j_primary_start + for i in index_range + for v in eachvariable(equations) + interfaces_u[1, v, i, interface] = u[v, i_primary, j_primary, + primary_element] + end + i_primary += i_primary_step + j_primary += j_primary_step + end + + # Copy 
solution data from the secondary element using "delayed indexing" with + # a start value and a step size to get the correct face and orientation. + secondary_element = neighbor_ids[2, interface] + secondary_indices = node_indices[2, interface] + + i_secondary_start, i_secondary_step = index_to_start_step_2d(secondary_indices[1], index_range) - j_primary_start, j_primary_step = index_to_start_step_2d(primary_indices[2], + j_secondary_start, j_secondary_step = index_to_start_step_2d(secondary_indices[2], index_range) - i_primary = i_primary_start - j_primary = j_primary_start - for i in eachnode(dg) - for v in eachvariable(equations) - interfaces.u[1, v, i, interface] = u[v, i_primary, j_primary, - primary_element] - end - i_primary += i_primary_step - j_primary += j_primary_step + i_secondary = i_secondary_start + j_secondary = j_secondary_start + for i in index_range + for v in eachvariable(equations) + interfaces_u[2, v, i, interface] = u[v, i_secondary, j_secondary, + secondary_element] end + i_secondary += i_secondary_step + j_secondary += j_secondary_step + end - # Copy solution data from the secondary element using "delayed indexing" with - # a start value and a step size to get the correct face and orientation. 
- secondary_element = interfaces.neighbor_ids[2, interface] - secondary_indices = interfaces.node_indices[2, interface] + return nothing +end - i_secondary_start, i_secondary_step = index_to_start_step_2d(secondary_indices[1], - index_range) - j_secondary_start, j_secondary_step = index_to_start_step_2d(secondary_indices[2], - index_range) +function calc_interface_flux!(backend::Nothing, surface_flux_values, + mesh::Union{P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + have_nonconservative_terms, + equations, surface_integral, dg::DG, cache) + @unpack neighbor_ids, node_indices = cache.interfaces + @unpack contravariant_vectors = cache.elements + index_range = eachnode(dg) - i_secondary = i_secondary_start - j_secondary = j_secondary_start - for i in eachnode(dg) - for v in eachvariable(equations) - interfaces.u[2, v, i, interface] = u[v, i_secondary, j_secondary, - secondary_element] - end - i_secondary += i_secondary_step - j_secondary += j_secondary_step - end + @threaded for interface in eachinterface(dg, cache) + calc_interface_flux_per_interface!(surface_flux_values, typeof(mesh), + have_nonconservative_terms, + equations, surface_integral, typeof(dg), + cache.interfaces.u, interface, + neighbor_ids, node_indices, + contravariant_vectors, index_range) end return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Backend, surface_flux_values, mesh::Union{P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, have_nonconservative_terms, equations, surface_integral, dg::DG, cache) + ninterfaces(cache.interfaces) == 0 && return nothing @unpack neighbor_ids, node_indices = cache.interfaces @unpack contravariant_vectors = cache.elements index_range = eachnode(dg) + + kernel! 
= calc_interface_flux_KAkernel!(backend) + kernel!(surface_flux_values, typeof(mesh), have_nonconservative_terms, + equations, surface_integral, typeof(dg), cache.interfaces.u, + neighbor_ids, node_indices, contravariant_vectors, index_range, + ndrange = ninterfaces(cache.interfaces)) + + return nothing +end + +@kernel function calc_interface_flux_KAkernel!(surface_flux_values, + mt::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + have_nonconservative_terms, + equations, surface_integral, + st::Type{<:DG}, u_interface, + neighbor_ids, node_indices, + contravariant_vectors, index_range) + interface = @index(Global) + calc_interface_flux_per_interface!(surface_flux_values, mt, + have_nonconservative_terms, equations, + surface_integral, st, u_interface, + interface, neighbor_ids, node_indices, + contravariant_vectors, index_range) +end + +@inline function calc_interface_flux_per_interface!(surface_flux_values, + mt::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + have_nonconservative_terms, + equations, surface_integral, + st::Type{<:DG}, + u_interface, interface, + neighbor_ids, + node_indices, contravariant_vectors, + index_range) index_end = last(index_range) - @threaded for interface in eachinterface(dg, cache) - # Get element and side index information on the primary element - primary_element = neighbor_ids[1, interface] - primary_indices = node_indices[1, interface] - primary_direction = indices2direction(primary_indices) + # Get element and side index information on the primary element + primary_element = neighbor_ids[1, interface] + primary_indices = node_indices[1, interface] + primary_direction = indices2direction(primary_indices) - # Create the local i,j indexing on the primary element used to pull normal direction information - i_primary_start, i_primary_step = index_to_start_step_2d(primary_indices[1], - index_range) - j_primary_start, j_primary_step = index_to_start_step_2d(primary_indices[2], - index_range) + # 
Create the local i,j indexing on the primary element used to pull normal direction information + i_primary_start, i_primary_step = index_to_start_step_2d(primary_indices[1], + index_range) + j_primary_start, j_primary_step = index_to_start_step_2d(primary_indices[2], + index_range) - i_primary = i_primary_start - j_primary = j_primary_start - - # Get element and side index information on the secondary element - secondary_element = neighbor_ids[2, interface] - secondary_indices = node_indices[2, interface] - secondary_direction = indices2direction(secondary_indices) - - # Initiate the secondary index to be used in the surface for loop. - # This index on the primary side will always run forward but - # the secondary index might need to run backwards for flipped sides. - if :i_backward in secondary_indices - node_secondary = index_end - node_secondary_step = -1 - else - node_secondary = 1 - node_secondary_step = 1 - end + i_primary = i_primary_start + j_primary = j_primary_start - for node in eachnode(dg) - # Get the normal direction on the primary element. - # Contravariant vectors at interfaces in negative coordinate direction - # are pointing inwards. This is handled by `get_normal_direction`. 
- normal_direction = get_normal_direction(primary_direction, - contravariant_vectors, - i_primary, j_primary, - primary_element) - - calc_interface_flux!(surface_flux_values, mesh, have_nonconservative_terms, - equations, - surface_integral, dg, cache, - interface, normal_direction, - node, primary_direction, primary_element, - node_secondary, secondary_direction, secondary_element) - - # Increment primary element indices to pull the normal direction - i_primary += i_primary_step - j_primary += j_primary_step - # Increment the surface node index along the secondary element - node_secondary += node_secondary_step - end + # Get element and side index information on the secondary element + secondary_element = neighbor_ids[2, interface] + secondary_indices = node_indices[2, interface] + secondary_direction = indices2direction(secondary_indices) + + # Initiate the secondary index to be used in the surface for loop. + # This index on the primary side will always run forward but + # the secondary index might need to run backwards for flipped sides. + if :i_backward in secondary_indices + node_secondary = index_end + node_secondary_step = -1 + else + node_secondary = 1 + node_secondary_step = 1 + end + + for node in index_range + # Get the normal direction on the primary element. + # Contravariant vectors at interfaces in negative coordinate direction + # are pointing inwards. This is handled by `get_normal_direction`. 
+ normal_direction = get_normal_direction(primary_direction, + contravariant_vectors, + i_primary, j_primary, + primary_element) + + calc_interface_flux!(surface_flux_values, mt, have_nonconservative_terms, + equations, surface_integral, st, u_interface, interface, + normal_direction, node, primary_direction, + primary_element, node_secondary, + secondary_direction, secondary_element) + + # Increment primary element indices to pull the normal direction + i_primary += i_primary_step + j_primary += j_primary_step + # Increment the surface node index along the secondary element + node_secondary += node_secondary_step end return nothing @@ -188,19 +281,22 @@ end # Inlined version of the interface flux computation for conservation laws @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{2}, P4estMeshView{2}, - T8codeMesh{2}}, + ::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, have_nonconservative_terms::False, equations, - surface_integral, dg::DG, cache, - interface_index, normal_direction, - primary_node_index, primary_direction_index, + surface_integral, st::Type{<:DG}, + u_interface, interface_index, + normal_direction, primary_node_index, + primary_direction_index, primary_element_index, - secondary_node_index, secondary_direction_index, + secondary_node_index, + secondary_direction_index, secondary_element_index) - @unpack u = cache.interfaces @unpack surface_flux = surface_integral - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_node_index, + u_ll, u_rr = get_surface_node_vars(u_interface, equations, st, + primary_node_index, interface_index) flux_ = surface_flux(u_ll, u_rr, normal_direction, equations) @@ -215,20 +311,22 @@ end # Inlined version of the interface flux computation for equations with conservative and nonconservative terms @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + MeshT::Type{<:Union{P4estMesh{2}, T8codeMesh{2}}}, 
have_nonconservative_terms::True, equations, - surface_integral, dg::DG, cache, - interface_index, normal_direction, - primary_node_index, primary_direction_index, + surface_integral, st::Type{<:DG}, + u_interface, interface_index, + normal_direction, primary_node_index, + primary_direction_index, primary_element_index, - secondary_node_index, secondary_direction_index, + secondary_node_index, + secondary_direction_index, secondary_element_index) @unpack surface_flux = surface_integral - calc_interface_flux!(surface_flux_values, mesh, have_nonconservative_terms, + calc_interface_flux!(surface_flux_values, MeshT, have_nonconservative_terms, combine_conservative_and_nonconservative_fluxes(surface_flux, equations), equations, - surface_integral, dg, cache, + surface_integral, st, u_interface, interface_index, normal_direction, primary_node_index, primary_direction_index, primary_element_index, @@ -238,20 +336,19 @@ end end @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + ::Type{<:Union{P4estMesh{2}, T8codeMesh{2}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::False, equations, - surface_integral, dg::DG, cache, + surface_integral, st::Type{<:DG}, u_interface, interface_index, normal_direction, primary_node_index, primary_direction_index, primary_element_index, secondary_node_index, secondary_direction_index, secondary_element_index) - @unpack u = cache.interfaces surface_flux, nonconservative_flux = surface_integral.surface_flux - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_node_index, + u_ll, u_rr = get_surface_node_vars(u_interface, equations, st, primary_node_index, interface_index) flux_ = surface_flux(u_ll, u_rr, normal_direction, equations) @@ -265,9 +362,9 @@ end # Note the factor 0.5 necessary for the nonconservative fluxes based on # the interpretation of global SBP operators coupled discontinuously via # central fluxes/SATs - surface_flux_values[v, 
primary_node_index, primary_direction_index, primary_element_index] = (flux_[v] + - 0.5f0 * - noncons_primary[v]) + surface_flux_values[v, primary_node_index, primary_direction_index, primary_element_index] = flux_[v] + + 0.5f0 * + noncons_primary[v] surface_flux_values[v, secondary_node_index, secondary_direction_index, secondary_element_index] = -(flux_[v] + 0.5f0 * noncons_secondary[v]) @@ -277,20 +374,19 @@ end end @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + ::Type{<:Union{P4estMesh{2}, T8codeMesh{2}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::True, equations, - surface_integral, dg::DG, cache, + surface_integral, st::Type{<:DG}, u_interface, interface_index, normal_direction, primary_node_index, primary_direction_index, primary_element_index, secondary_node_index, secondary_direction_index, secondary_element_index) - @unpack u = cache.interfaces @unpack surface_flux = surface_integral - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_node_index, + u_ll, u_rr = get_surface_node_vars(u_interface, equations, st, primary_node_index, interface_index) flux_left, flux_right = surface_flux(u_ll, u_rr, normal_direction, equations) @@ -332,6 +428,15 @@ function prolong2boundaries!(cache, u, return nothing end +# We require this function definition, as the function calls for the +# coupled simulations pass the u_parent variable +# Note: Since the implementation is identical, we forward to the original function +function prolong2boundaries!(cache, u, u_parent, semis, + mesh::P4estMeshView{2}, + equations, surface_integral, dg::DG) + return prolong2boundaries!(cache, u, mesh, equations, dg) +end + function calc_boundary_flux!(cache, t, boundary_condition::BC, boundary_indexing, mesh::Union{P4estMesh{2}, T8codeMesh{2}}, equations, surface_integral, dg::DG) where {BC} @@ -402,6 +507,36 @@ end return nothing end +# inlined version of the boundary flux 
calculation along a physical interface +@inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, + mesh::P4estMeshView{2}, + nonconservative_terms::False, equations, + surface_integral, dg::DG, cache, + i_index, j_index, + node_index, direction_index, element_index, + boundary_index, u_parent) + @unpack boundaries = cache + @unpack contravariant_vectors = cache.elements + @unpack surface_flux = surface_integral + + # Extract solution data from boundary container + u_inner = get_node_vars(boundaries.u, equations, dg, node_index, boundary_index) + + # Outward-pointing normal direction (not normalized) + normal_direction = get_normal_direction(direction_index, contravariant_vectors, + i_index, j_index, element_index) + + flux_ = boundary_condition(u_inner, mesh, equations, cache, i_index, j_index, + element_index, normal_direction, surface_flux, + normal_direction, u_parent) + + # Copy flux to element storage in the correct orientation + for v in eachvariable(equations) + surface_flux_values[v, node_index, direction_index, element_index] = flux_[v] + end + return nothing +end + # inlined version of the boundary flux with nonconservative terms calculation along a physical interface @inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, mesh::Union{P4estMesh{2}, T8codeMesh{2}}, @@ -498,6 +633,17 @@ end return nothing end +# Function barrier for type stability +function calc_boundary_flux!(cache, t, boundary_conditions, + mesh::P4estMeshView, + equations, surface_integral, dg::DG, u_parent) + @unpack boundary_condition_types, boundary_indices = boundary_conditions + + calc_boundary_flux_by_type!(cache, t, boundary_condition_types, boundary_indices, + mesh, equations, surface_integral, dg, u_parent) + return nothing +end + function prolong2mortars!(cache, u, mesh::Union{P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, equations, @@ -756,7 +902,7 @@ end return nothing end -function calc_surface_integral!(du, u, +function 
calc_surface_integral!(backend::Nothing, du, u, mesh::Union{P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, equations, surface_integral::SurfaceIntegralWeakForm, @@ -764,6 +910,51 @@ function calc_surface_integral!(du, u, @unpack inverse_weights = dg.basis @unpack surface_flux_values = cache.elements + @threaded for element in eachelement(dg, cache) + calc_surface_integral_per_element!(du, typeof(mesh), equations, + surface_integral, dg, inverse_weights[1], + surface_flux_values, element) + end +end + +function calc_surface_integral!(backend::Backend, du, u, + mesh::Union{P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, cache) + nelements(dg, cache) == 0 && return nothing + @unpack inverse_weights = dg.basis + @unpack surface_flux_values = cache.elements + + kernel! = calc_surface_integral_KAkernel!(backend) + kernel!(du, typeof(mesh), equations, surface_integral, dg, inverse_weights[1], + surface_flux_values, ndrange = nelements(dg, cache)) + return nothing +end + +@kernel function calc_surface_integral_KAkernel!(du, + mT::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, factor, + surface_flux_values) + element = @index(Global) + calc_surface_integral_per_element!(du, mT, equations, surface_integral, + dg, factor, surface_flux_values, element) +end + +@inline function calc_surface_integral_per_element!(du, + ::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, factor, + surface_flux_values, + element) # Note that all fluxes have been computed with outward-pointing normal vectors. 
# This computes the **negative** surface integral contribution, # i.e., M^{-1} * boundary_interpolation^T (which is for Gauss-Lobatto DGSEM just M^{-1} * B) @@ -771,32 +962,105 @@ function calc_surface_integral!(du, u, # # We also use explicit assignments instead of `+=` to let `@muladd` turn these # into FMAs (see comment at the top of the file). - factor = inverse_weights[1] # For LGL basis: Identical to weighted boundary interpolation at x = ±1 - @threaded for element in eachelement(dg, cache) - for l in eachnode(dg) - for v in eachvariable(equations) - # surface at -x - du[v, 1, l, element] = (du[v, 1, l, element] + - surface_flux_values[v, l, 1, element] * - factor) - - # surface at +x - du[v, nnodes(dg), l, element] = (du[v, nnodes(dg), l, element] + - surface_flux_values[v, l, 2, element] * - factor) - - # surface at -y - du[v, l, 1, element] = (du[v, l, 1, element] + - surface_flux_values[v, l, 3, element] * - factor) - - # surface at +y - du[v, l, nnodes(dg), element] = (du[v, l, nnodes(dg), element] + - surface_flux_values[v, l, 4, element] * - factor) - end + # + # factor = inverse_weights[1] + # For LGL basis: Identical to weighted boundary interpolation at x = ±1 + for l in eachnode(dg) + for v in eachvariable(equations) + # surface at -x + du[v, 1, l, element] = (du[v, 1, l, element] + + surface_flux_values[v, l, 1, element] * + factor) + + # surface at +x + du[v, nnodes(dg), l, element] = (du[v, nnodes(dg), l, element] + + surface_flux_values[v, l, 2, element] * + factor) + + # surface at -y + du[v, l, 1, element] = (du[v, l, 1, element] + + surface_flux_values[v, l, 3, element] * + factor) + + # surface at +y + du[v, l, nnodes(dg), element] = (du[v, l, nnodes(dg), element] + + surface_flux_values[v, l, 4, element] * + factor) end end + return nothing +end + +# Call this for coupled P4estMeshView simulations. 
+# The coupling calculations (especially boundary conditions) require data from the parent mesh, which is why +# the additional variable u_parent is needed, compared to non-coupled systems. +function rhs!(du, u, t, u_parent, semis, + mesh::P4estMeshView{2}, + equations, + boundary_conditions, source_terms::Source, + dg::DG, cache) where {Source} + backend = nothing + # Reset du + @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache) + + # Calculate volume integral + @trixi_timeit timer() "volume integral" begin + calc_volume_integral!(backend, du, u, mesh, + have_nonconservative_terms(equations), equations, + dg.volume_integral, dg, cache) + end + + # Prolong solution to interfaces + @trixi_timeit timer() "prolong2interfaces" begin + prolong2interfaces!(backend, cache, u, mesh, equations, dg) + end + + # Calculate interface fluxes + @trixi_timeit timer() "interface flux" begin + calc_interface_flux!(backend, cache.elements.surface_flux_values, mesh, + have_nonconservative_terms(equations), equations, + dg.surface_integral, dg, cache) + end + + # Prolong solution to boundaries + @trixi_timeit timer() "prolong2boundaries" begin + prolong2boundaries!(cache, u, u_parent, semis, mesh, equations, + dg.surface_integral, dg) + end + + # Calculate boundary fluxes + @trixi_timeit timer() "boundary flux" begin + calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, + dg.surface_integral, dg, u_parent) + end + + # Prolong solution to mortars + @trixi_timeit timer() "prolong2mortars" begin + prolong2mortars!(cache, u, mesh, equations, + dg.mortar, dg) + end + + # Calculate mortar fluxes + @trixi_timeit timer() "mortar flux" begin + calc_mortar_flux!(cache.elements.surface_flux_values, mesh, + have_nonconservative_terms(equations), equations, + dg.mortar, dg.surface_integral, dg, cache) + end + + # Calculate surface integrals + @trixi_timeit timer() "surface integral" begin + calc_surface_integral!(backend, du, u, mesh, equations, + dg.surface_integral, dg, 
cache) + end + + # Apply Jacobian from mapping to reference element + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) + + # Calculate source terms + @trixi_timeit timer() "source terms" begin + calc_sources!(du, u, t, source_terms, equations, dg, cache) + end return nothing end diff --git a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl index 35372eecea7..21153d2deca 100644 --- a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl +++ b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl @@ -10,7 +10,7 @@ Reusing `rhs_parabolic!` for `P4estMesh`es is not easily possible as for `P4estMesh`es we call ``` - prolong2mortars_divergence!(cache, flux_viscous, mesh, equations_parabolic, + prolong2mortars_divergence!(cache, flux_parabolic, mesh, equations_parabolic, dg.mortar, dg) calc_mortar_flux_divergence!(cache_parabolic.elements.surface_flux_values, @@ -18,7 +18,7 @@ for `P4estMesh`es we call ``` instead of ``` - prolong2mortars!(cache, flux_viscous, mesh, equations_parabolic, + prolong2mortars!(cache, flux_parabolic, mesh, equations_parabolic, dg.mortar, dg) calc_mortar_flux!(cache_parabolic.elements.surface_flux_values, @@ -30,10 +30,10 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, equations_parabolic::AbstractEquationsParabolic, boundary_conditions_parabolic, source_terms_parabolic, dg::DG, parabolic_scheme, cache, cache_parabolic) - @unpack viscous_container = cache_parabolic - @unpack u_transformed, gradients, flux_viscous = viscous_container + @unpack parabolic_container = cache_parabolic + @unpack u_transformed, gradients, flux_parabolic = parabolic_container - # Convert conservative variables to a form more suitable for viscous flux calculations + # Convert conservative variables to a form more suitable for parabolic flux calculations @trixi_timeit timer() "transform variables" begin transform_variables!(u_transformed, u, mesh, equations_parabolic, dg, cache) @@ 
-46,20 +46,20 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, dg, parabolic_scheme, cache) end - # Compute and store the viscous fluxes - @trixi_timeit timer() "calculate viscous fluxes" begin - calc_viscous_fluxes!(flux_viscous, gradients, u_transformed, mesh, - equations_parabolic, dg, cache) + # Compute and store the parabolic fluxes + @trixi_timeit timer() "calculate parabolic fluxes" begin + calc_parabolic_fluxes!(flux_parabolic, gradients, u_transformed, mesh, + equations_parabolic, dg, cache) end # The remainder of this function is essentially a regular rhs! for parabolic - # equations (i.e., it computes the divergence of the viscous fluxes) + # equations (i.e., it computes the divergence of the parabolic fluxes) # - # OBS! In `calc_viscous_fluxes!`, the viscous flux values at the volume nodes of each element have - # been computed and stored in `fluxes_viscous`. In the following, we *reuse* (abuse) the + # OBS! In `calc_parabolic_fluxes!`, the parabolic flux values at the volume nodes of each element have + # been computed and stored in `flux_parabolic`. In the following, we *reuse* (abuse) the # `interfaces` and `boundaries` containers in `cache` to interpolate and store the # *fluxes* at the element surfaces, as opposed to interpolating and storing the *solution* (as it - # is done in the hyperbolic operator). That is, `interfaces.u`/`boundaries.u` store *viscous flux values* + # is done in the hyperbolic operator). That is, `interfaces.u`/`boundaries.u` store *parabolic flux values* # and *not the solution*. The advantage is that a) we do not need to allocate more storage, b) we # do not need to recreate the existing data structure only with a different name, and c) we do not # need to interpolate solutions *and* gradients to the surfaces. 
@@ -68,32 +68,32 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache) # Calculate volume integral - # This calls the specialized version for the viscous fluxes from + # This calls the specialized version for the parabolic fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, flux_viscous, mesh, equations_parabolic, dg, cache) + calc_volume_integral!(du, flux_parabolic, mesh, equations_parabolic, dg, cache) end # Prolong solution to interfaces. - # This calls the specialized version for the viscous fluxes from + # This calls the specialized version for the parabolic fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, flux_viscous, mesh, equations_parabolic, dg) + prolong2interfaces!(cache, flux_parabolic, mesh, equations_parabolic, dg) end # Calculate interface fluxes - # This calls the specialized version for the viscous fluxes from + # This calls the specialized version for the parabolic fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "interface flux" begin calc_interface_flux!(cache.elements.surface_flux_values, mesh, equations_parabolic, dg, parabolic_scheme, cache) end - # Prolong viscous fluxes to boundaries. - # This calls the specialized version for the viscous fluxes from + # Prolong parabolic fluxes to boundaries. + # This calls the specialized version for the parabolic fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "prolong2boundaries" begin - prolong2boundaries!(cache, flux_viscous, mesh, equations_parabolic, dg) + prolong2boundaries!(cache, flux_parabolic, mesh, equations_parabolic, dg) end # Calculate boundary fluxes. 
@@ -108,7 +108,7 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, # Prolong solution to mortars. # This calls the specialized version for parabolic equations. @trixi_timeit timer() "prolong2mortars" begin - prolong2mortars_divergence!(cache, flux_viscous, mesh, equations_parabolic, + prolong2mortars_divergence!(cache, flux_parabolic, mesh, equations_parabolic, dg.mortar, dg) end @@ -123,7 +123,7 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, # Calculate surface integrals. # This reuses `calc_surface_integral!` for the purely hyperbolic case. @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations_parabolic, + calc_surface_integral!(nothing, du, u, mesh, equations_parabolic, dg.surface_integral, dg, cache) end @@ -158,7 +158,7 @@ function calc_gradient!(gradients, u_transformed, t, # Prolong solution to interfaces. # This reuses `prolong2interfaces` for the purely hyperbolic case. @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, u_transformed, mesh, + prolong2interfaces!(nothing, cache, u_transformed, mesh, equations_parabolic, dg) end @@ -337,7 +337,7 @@ function calc_interface_flux_gradient!(surface_flux_values, return nothing end -# This is the version used when calculating the gradient of the viscous fluxes (called from above) +# This is the version used when calculating the gradient of the parabolic fluxes (called from above) @inline function calc_interface_flux_gradient!(surface_flux_values, mesh::P4estMesh{2}, equations_parabolic, dg::DG, parabolic_scheme, cache, @@ -366,22 +366,22 @@ end return nothing end -# This is the version used when calculating the divergence of the viscous fluxes. +# This is the version used when calculating the divergence of the parabolic fluxes. 
# Identical to weak-form volume integral/kernel for the purely hyperbolic case, -# except that the fluxes are here already precomputed in `calc_viscous_fluxes!` -function calc_volume_integral!(du, flux_viscous, mesh::P4estMesh{2}, +# except that the fluxes are here already precomputed in `calc_parabolic_fluxes!` +function calc_volume_integral!(du, flux_parabolic, mesh::P4estMesh{2}, equations_parabolic::AbstractEquationsParabolic, dg::DGSEM, cache) (; derivative_hat) = dg.basis (; contravariant_vectors) = cache.elements - flux_viscous_x, flux_viscous_y = flux_viscous + flux_parabolic_x, flux_parabolic_y = flux_parabolic @threaded for element in eachelement(dg, cache) # Calculate volume terms in one element for j in eachnode(dg), i in eachnode(dg) - flux1 = get_node_vars(flux_viscous_x, equations_parabolic, dg, + flux1 = get_node_vars(flux_parabolic_x, equations_parabolic, dg, i, j, element) - flux2 = get_node_vars(flux_viscous_y, equations_parabolic, dg, + flux2 = get_node_vars(flux_parabolic_y, equations_parabolic, dg, i, j, element) # Compute the contravariant flux by taking the scalar product of the @@ -411,18 +411,18 @@ function calc_volume_integral!(du, flux_viscous, mesh::P4estMesh{2}, return nothing end -# This is the version used when calculating the divergence of the viscous fluxes. -# Specialization `flux_viscous::Tuple` needed to +# This is the version used when calculating the divergence of the parabolic fluxes. +# Specialization `flux_parabolic::Tuple` needed to # avoid amibiguity with the hyperbolic version of `prolong2interfaces!` in dg_2d.jl # which is for the variables itself, i.e., `u::Array{uEltype, 4}`. 
-function prolong2interfaces!(cache, flux_viscous::Tuple, +function prolong2interfaces!(cache, flux_parabolic::Tuple, mesh::Union{P4estMesh{2}, P4estMeshView{2}}, equations_parabolic::AbstractEquationsParabolic, dg::DG) (; interfaces) = cache (; contravariant_vectors) = cache.elements index_range = eachnode(dg) - flux_viscous_x, flux_viscous_y = flux_viscous + flux_parabolic_x, flux_parabolic_y = flux_parabolic @threaded for interface in eachinterface(dg, cache) # Copy solution data from the primary element using "delayed indexing" with @@ -450,12 +450,12 @@ function prolong2interfaces!(cache, flux_viscous::Tuple, for v in eachvariable(equations_parabolic) # OBS! `interfaces.u` stores the interpolated *fluxes* and *not the solution*! - flux_viscous = SVector(flux_viscous_x[v, i_primary, j_primary, - primary_element], - flux_viscous_y[v, i_primary, j_primary, - primary_element]) + flux_parabolic = SVector(flux_parabolic_x[v, i_primary, j_primary, + primary_element], + flux_parabolic_y[v, i_primary, j_primary, + primary_element]) - interfaces.u[1, v, i, interface] = dot(flux_viscous, normal_direction) + interfaces.u[1, v, i, interface] = dot(flux_parabolic, normal_direction) end i_primary += i_primary_step j_primary += j_primary_step @@ -485,13 +485,14 @@ function prolong2interfaces!(cache, flux_viscous::Tuple, for v in eachvariable(equations_parabolic) # OBS! `interfaces.u` stores the interpolated *fluxes* and *not the solution*! 
- flux_viscous = SVector(flux_viscous_x[v, i_secondary, j_secondary, - secondary_element], - flux_viscous_y[v, i_secondary, j_secondary, - secondary_element]) + flux_parabolic = SVector(flux_parabolic_x[v, i_secondary, j_secondary, + secondary_element], + flux_parabolic_y[v, i_secondary, j_secondary, + secondary_element]) # store the normal flux with respect to the primary normal direction, # which is the negative of the secondary normal direction - interfaces.u[2, v, i, interface] = -dot(flux_viscous, normal_direction) + interfaces.u[2, v, i, interface] = -dot(flux_parabolic, + normal_direction) end i_secondary += i_secondary_step j_secondary += j_secondary_step @@ -547,15 +548,15 @@ function calc_interface_flux!(surface_flux_values, mesh::P4estMesh{2}, i_primary, j_primary, primary_element) - # We prolong the viscous flux dotted with respect the outward normal on the + # We prolong the parabolic flux dotted with respect the outward normal on the # primary element. - viscous_flux_normal_ll, viscous_flux_normal_rr = get_surface_node_vars(cache.interfaces.u, - equations_parabolic, - dg, - i, - interface) + parabolic_flux_normal_ll, parabolic_flux_normal_rr = get_surface_node_vars(cache.interfaces.u, + equations_parabolic, + dg, + i, + interface) - flux_ = flux_parabolic(viscous_flux_normal_ll, viscous_flux_normal_rr, + flux_ = flux_parabolic(parabolic_flux_normal_ll, parabolic_flux_normal_rr, normal_direction, Divergence(), equations_parabolic, parabolic_scheme) @@ -577,7 +578,7 @@ function calc_interface_flux!(surface_flux_values, mesh::P4estMesh{2}, return nothing end -function prolong2mortars_divergence!(cache, flux_viscous, +function prolong2mortars_divergence!(cache, flux_parabolic, mesh::P4estMesh{2}, equations_parabolic, mortar_l2::LobattoLegendreMortarL2, dg::DGSEM) @@ -585,7 +586,7 @@ function prolong2mortars_divergence!(cache, flux_viscous, @unpack contravariant_vectors = cache.elements index_range = eachnode(dg) - flux_viscous_x, flux_viscous_y = 
flux_viscous + flux_parabolic_x, flux_parabolic_y = flux_parabolic @threaded for mortar in eachmortar(dg, cache) # Copy solution data from the small elements using "delayed indexing" with @@ -608,10 +609,12 @@ function prolong2mortars_divergence!(cache, flux_viscous, i_small, j_small, element) for v in eachvariable(equations_parabolic) - flux_viscous = SVector(flux_viscous_x[v, i_small, j_small, element], - flux_viscous_y[v, i_small, j_small, element]) + flux_parabolic = SVector(flux_parabolic_x[v, i_small, j_small, + element], + flux_parabolic_y[v, i_small, j_small, + element]) - cache.mortars.u[1, v, position, i, mortar] = dot(flux_viscous, + cache.mortars.u[1, v, position, i, mortar] = dot(flux_parabolic, normal_direction) end i_small += i_small_step @@ -642,14 +645,14 @@ function prolong2mortars_divergence!(cache, flux_viscous, i_large, j_large, element) for v in eachvariable(equations_parabolic) - flux_viscous = SVector(flux_viscous_x[v, i_large, j_large, element], - flux_viscous_y[v, i_large, j_large, element]) + flux_parabolic = SVector(flux_parabolic_x[v, i_large, j_large, element], + flux_parabolic_y[v, i_large, j_large, element]) - # We prolong the viscous flux dotted with respect the outward normal + # We prolong the parabolic flux dotted with respect the outward normal # on the small element. 
We scale by -1/2 here because the normal # direction on the large element is negative 2x that of the small # element (these normal directions are "scaled" by the surface Jacobian) - u_buffer[v, i] = -0.5f0 * dot(flux_viscous, normal_direction) + u_buffer[v, i] = -0.5f0 * dot(flux_parabolic, normal_direction) end i_large += i_large_step j_large += j_large_step @@ -699,11 +702,13 @@ function calc_mortar_flux_divergence!(surface_flux_values, mesh::P4estMesh{2}, i_small, j_small, element) for v in eachvariable(equations_parabolic) - viscous_flux_normal_ll = cache.mortars.u[1, v, position, i, mortar] - viscous_flux_normal_rr = cache.mortars.u[2, v, position, i, mortar] + parabolic_flux_normal_ll = cache.mortars.u[1, v, position, i, + mortar] + parabolic_flux_normal_rr = cache.mortars.u[2, v, position, i, + mortar] - flux_ = flux_parabolic(viscous_flux_normal_ll, - viscous_flux_normal_rr, + flux_ = flux_parabolic(parabolic_flux_normal_ll, + parabolic_flux_normal_rr, normal_direction, Divergence(), equations_parabolic, parabolic_scheme) @@ -792,7 +797,7 @@ end # We structure `calc_mortar_flux_gradient!` similarly to "calc_mortar_flux!" for # hyperbolic equations with no nonconservative terms. # The reasoning is that parabolic fluxes are treated like conservative -# terms (e.g., we compute a viscous conservative "flux") and thus no +# terms (e.g., we compute a parabolic conservative "flux") and thus no # non-conservative terms are present. @inline function calc_mortar_flux_gradient!(fstar_primary, fstar_secondary, mesh::P4estMesh{2}, equations_parabolic, @@ -818,10 +823,10 @@ end return nothing end -# Specialization `flux_viscous::Tuple` needed to +# Specialization `flux_parabolic::Tuple` needed to # avoid amibiguity with the hyperbolic version of `prolong2boundaries!` in dg_2d.jl # which is for the variables itself, i.e., `u::Array{uEltype, 4}`. 
-function prolong2boundaries!(cache, flux_viscous::Tuple, +function prolong2boundaries!(cache, flux_parabolic::Tuple, mesh::P4estMesh{2}, equations_parabolic::AbstractEquationsParabolic, dg::DG) @@ -829,7 +834,7 @@ function prolong2boundaries!(cache, flux_viscous::Tuple, (; contravariant_vectors) = cache.elements index_range = eachnode(dg) - flux_viscous_x, flux_viscous_y = flux_viscous + flux_parabolic_x, flux_parabolic_y = flux_parabolic @threaded for boundary in eachboundary(dg, cache) # Copy solution data from the element using "delayed indexing" with @@ -849,10 +854,10 @@ function prolong2boundaries!(cache, flux_viscous::Tuple, i_node, j_node, element) for v in eachvariable(equations_parabolic) - flux_viscous = SVector(flux_viscous_x[v, i_node, j_node, element], - flux_viscous_y[v, i_node, j_node, element]) + flux_parabolic = SVector(flux_parabolic_x[v, i_node, j_node, element], + flux_parabolic_y[v, i_node, j_node, element]) - boundaries.u[v, i, boundary] = dot(flux_viscous, normal_direction) + boundaries.u[v, i, boundary] = dot(flux_parabolic, normal_direction) end i_node += i_node_step j_node += j_node_step @@ -1190,7 +1195,7 @@ end # Needed to *not* flip the sign of the inverse Jacobian. # This is because the parabolic fluxes are assumed to be of the form # `du/dt + df/dx = dg/dx + source(x,t)`, -# where f(u) is the inviscid flux and g(u) is the viscous flux. +# where f(u) is the inviscid flux and g(u) is the parabolic flux. 
function apply_jacobian_parabolic!(du::AbstractArray, mesh::P4estMesh{2}, equations_parabolic::AbstractEquationsParabolic, dg::DG, cache) diff --git a/src/solvers/dgsem_p4est/dg_2d_parallel.jl b/src/solvers/dgsem_p4est/dg_2d_parallel.jl index f77d625f57e..2cefabd9539 100644 --- a/src/solvers/dgsem_p4est/dg_2d_parallel.jl +++ b/src/solvers/dgsem_p4est/dg_2d_parallel.jl @@ -140,7 +140,7 @@ end @inline function calc_mpi_interface_flux!(surface_flux_values, mesh::Union{P4estMeshParallel{2}, T8codeMeshParallel{2}}, - nonconservative_terms::True, equations, + have_nonconservative_terms::True, equations, surface_integral, dg::DG, cache, interface_index, normal_direction, interface_node_index, local_side, @@ -321,7 +321,7 @@ end @inline function calc_mpi_mortar_flux!(fstar_primary, fstar_secondary, mesh::Union{P4estMeshParallel{2}, T8codeMeshParallel{2}}, - nonconservative_terms::True, equations, + have_nonconservative_terms::True, equations, surface_integral, dg::DG, cache, mortar_index, position_index, normal_direction, node_index) diff --git a/src/solvers/dgsem_p4est/dg_3d.jl b/src/solvers/dgsem_p4est/dg_3d.jl index b70661c56ba..4713ded188f 100644 --- a/src/solvers/dgsem_p4est/dg_3d.jl +++ b/src/solvers/dgsem_p4est/dg_3d.jl @@ -91,85 +91,118 @@ end return (i1, i2) end -function prolong2interfaces!(cache, u, +function prolong2interfaces!(backend::Nothing, cache, u, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, equations, dg::DG) @unpack interfaces = cache + @unpack neighbor_ids, node_indices = cache.interfaces index_range = eachnode(dg) @threaded for interface in eachinterface(dg, cache) - # Copy solution data from the primary element using "delayed indexing" with - # a start value and two step sizes to get the correct face and orientation. - # Note that in the current implementation, the interface will be - # "aligned at the primary element", i.e., the indices of the primary side - # will always run forwards. 
- primary_element = interfaces.neighbor_ids[1, interface] - primary_indices = interfaces.node_indices[1, interface] - - i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], - index_range) - j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], - index_range) - k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3], - index_range) - - i_primary = i_primary_start - j_primary = j_primary_start - k_primary = k_primary_start - for j in eachnode(dg) - for i in eachnode(dg) - for v in eachvariable(equations) - interfaces.u[1, v, i, j, interface] = u[v, - i_primary, j_primary, - k_primary, - primary_element] - end - i_primary += i_primary_step_i - j_primary += j_primary_step_i - k_primary += k_primary_step_i + prolong2interfaces_per_interface!(interfaces.u, u, typeof(mesh), equations, + neighbor_ids, node_indices, index_range, + interface) + end + return nothing +end + +function prolong2interfaces!(backend::Backend, cache, u, + mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + equations, dg::DG) + @unpack interfaces = cache + @unpack neighbor_ids, node_indices = cache.interfaces + index_range = eachnode(dg) + + kernel! 
= prolong2interfaces_KAkernel!(backend) + kernel!(interfaces.u, u, typeof(mesh), equations, neighbor_ids, node_indices, + index_range, + ndrange = ninterfaces(interfaces)) + return nothing +end + +@kernel function prolong2interfaces_KAkernel!(interface_u, u, MeshT, equations, + neighbor_ids, node_indices, index_range) + interface = @index(Global) + prolong2interfaces_per_interface!(interface_u, u, MeshT, equations, neighbor_ids, + node_indices, index_range, interface) +end + +@inline function prolong2interfaces_per_interface!(u_interface, u, + ::Type{<:Union{P4estMesh{3}, + T8codeMesh{3}}}, + equations, neighbor_ids, + node_indices, + index_range, interface) + # Copy solution data from the primary element using "delayed indexing" with + # a start value and two step sizes to get the correct face and orientation. + # Note that in the current implementation, the interface will be + # "aligned at the primary element", i.e., the indices of the primary side + # will always run forwards. + primary_element = neighbor_ids[1, interface] + primary_indices = node_indices[1, interface] + + i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], + index_range) + j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], + index_range) + k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3], + index_range) + + i_primary = i_primary_start + j_primary = j_primary_start + k_primary = k_primary_start + for j in index_range + for i in index_range + for v in eachvariable(equations) + u_interface[1, v, i, j, interface] = u[v, + i_primary, j_primary, + k_primary, + primary_element] end - i_primary += i_primary_step_j - j_primary += j_primary_step_j - k_primary += k_primary_step_j + i_primary += i_primary_step_i + j_primary += j_primary_step_i + k_primary += k_primary_step_i end + i_primary += i_primary_step_j + j_primary += j_primary_step_j + k_primary += 
k_primary_step_j + end - # Copy solution data from the secondary element using "delayed indexing" with - # a start value and two step sizes to get the correct face and orientation. - secondary_element = interfaces.neighbor_ids[2, interface] - secondary_indices = interfaces.node_indices[2, interface] - - i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_indices[1], - index_range) - j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_indices[2], - index_range) - k_secondary_start, k_secondary_step_i, k_secondary_step_j = index_to_start_step_3d(secondary_indices[3], - index_range) - - i_secondary = i_secondary_start - j_secondary = j_secondary_start - k_secondary = k_secondary_start - for j in eachnode(dg) - for i in eachnode(dg) - for v in eachvariable(equations) - interfaces.u[2, v, i, j, interface] = u[v, - i_secondary, j_secondary, - k_secondary, - secondary_element] - end - i_secondary += i_secondary_step_i - j_secondary += j_secondary_step_i - k_secondary += k_secondary_step_i + # Copy solution data from the secondary element using "delayed indexing" with + # a start value and two step sizes to get the correct face and orientation. 
+ secondary_element = neighbor_ids[2, interface] + secondary_indices = node_indices[2, interface] + + i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_indices[1], + index_range) + j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_indices[2], + index_range) + k_secondary_start, k_secondary_step_i, k_secondary_step_j = index_to_start_step_3d(secondary_indices[3], + index_range) + + i_secondary = i_secondary_start + j_secondary = j_secondary_start + k_secondary = k_secondary_start + for j in index_range + for i in index_range + for v in eachvariable(equations) + u_interface[2, v, i, j, interface] = u[v, + i_secondary, j_secondary, + k_secondary, + secondary_element] end - i_secondary += i_secondary_step_j - j_secondary += j_secondary_step_j - k_secondary += k_secondary_step_j + i_secondary += i_secondary_step_i + j_secondary += j_secondary_step_i + k_secondary += k_secondary_step_i end + i_secondary += i_secondary_step_j + j_secondary += j_secondary_step_j + k_secondary += k_secondary_step_j end - return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, have_nonconservative_terms, equations, surface_integral, dg::DG, cache) @@ -178,93 +211,144 @@ function calc_interface_flux!(surface_flux_values, index_range = eachnode(dg) @threaded for interface in eachinterface(dg, cache) - # Get element and side information on the primary element - primary_element = neighbor_ids[1, interface] - primary_indices = node_indices[1, interface] - primary_direction = indices2direction(primary_indices) - - i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], - index_range) - j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], - index_range) - k_primary_start, k_primary_step_i, k_primary_step_j 
= index_to_start_step_3d(primary_indices[3], - index_range) - - i_primary = i_primary_start - j_primary = j_primary_start - k_primary = k_primary_start - - # Get element and side information on the secondary element - secondary_element = neighbor_ids[2, interface] - secondary_indices = node_indices[2, interface] - secondary_direction = indices2direction(secondary_indices) - secondary_surface_indices = surface_indices(secondary_indices) - - # Get the surface indexing on the secondary element. - # Note that the indices of the primary side will always run forward but - # the secondary indices might need to run backwards for flipped sides. - i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[1], - index_range) - j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[2], - index_range) - i_secondary = i_secondary_start - j_secondary = j_secondary_start + calc_interface_flux_per_interface!(surface_flux_values, + typeof(mesh), + have_nonconservative_terms, + equations, surface_integral, typeof(dg), + cache.interfaces.u, neighbor_ids, + node_indices, + contravariant_vectors, index_range, + interface) + end + return nothing +end + +function calc_interface_flux!(backend::Backend, surface_flux_values, + mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + have_nonconservative_terms, + equations, surface_integral, dg::DG, cache) + @unpack neighbor_ids, node_indices = cache.interfaces + @unpack contravariant_vectors = cache.elements + index_range = eachnode(dg) + + kernel! 
= calc_interface_flux_KAkernel!(backend) + kernel!(surface_flux_values, typeof(mesh), have_nonconservative_terms, equations, + surface_integral, typeof(dg), cache.interfaces.u, + neighbor_ids, node_indices, contravariant_vectors, index_range, + ndrange = ninterfaces(cache.interfaces)) + return nothing +end + +@kernel function calc_interface_flux_KAkernel!(surface_flux_values, MeshT, + have_nonconservative_terms, equations, + surface_integral, solverT, u_interface, + neighbor_ids, node_indices, + contravariant_vectors, index_range) + interface = @index(Global) + calc_interface_flux_per_interface!(surface_flux_values, + MeshT, + have_nonconservative_terms, + equations, surface_integral, solverT, + u_interface, + neighbor_ids, node_indices, + contravariant_vectors, + index_range, interface) +end + +@inline function calc_interface_flux_per_interface!(surface_flux_values, + MeshT::Type{<:Union{P4estMesh{3}, + T8codeMesh{3}}}, + have_nonconservative_terms, + equations, surface_integral, + solverT::Type{<:DG}, u_interface, + neighbor_ids, + node_indices, contravariant_vectors, + index_range, interface) + # Get element and side information on the primary element + primary_element = neighbor_ids[1, interface] + primary_indices = node_indices[1, interface] + primary_direction = indices2direction(primary_indices) + + i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], + index_range) + j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], + index_range) + k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3], + index_range) + + i_primary = i_primary_start + j_primary = j_primary_start + k_primary = k_primary_start + + # Get element and side information on the secondary element + secondary_element = neighbor_ids[2, interface] + secondary_indices = node_indices[2, interface] + secondary_direction = indices2direction(secondary_indices) + 
secondary_surface_indices = surface_indices(secondary_indices) + + # Get the surface indexing on the secondary element. + # Note that the indices of the primary side will always run forward but + # the secondary indices might need to run backwards for flipped sides. + i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[1], + index_range) + j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[2], + index_range) + i_secondary = i_secondary_start + j_secondary = j_secondary_start + + for j in index_range + for i in index_range + # Get the normal direction from the primary element. + # Note, contravariant vectors at interfaces in negative coordinate direction + # are pointing inwards. This is handled by `get_normal_direction`. + normal_direction = get_normal_direction(primary_direction, + contravariant_vectors, + i_primary, j_primary, k_primary, + primary_element) + + calc_interface_flux!(surface_flux_values, MeshT, have_nonconservative_terms, + equations, + surface_integral, solverT, u_interface, + interface, normal_direction, + i, j, primary_direction, primary_element, + i_secondary, j_secondary, secondary_direction, + secondary_element) - for j in eachnode(dg) - for i in eachnode(dg) - # Get the normal direction from the primary element. - # Note, contravariant vectors at interfaces in negative coordinate direction - # are pointing inwards. This is handled by `get_normal_direction`. 
- normal_direction = get_normal_direction(primary_direction, - contravariant_vectors, - i_primary, j_primary, k_primary, - primary_element) - - calc_interface_flux!(surface_flux_values, mesh, - have_nonconservative_terms, - equations, - surface_integral, dg, cache, - interface, normal_direction, - i, j, primary_direction, primary_element, - i_secondary, j_secondary, secondary_direction, - secondary_element) - - # Increment the primary element indices - i_primary += i_primary_step_i - j_primary += j_primary_step_i - k_primary += k_primary_step_i - # Increment the secondary element surface indices - i_secondary += i_secondary_step_i - j_secondary += j_secondary_step_i - end # Increment the primary element indices - i_primary += i_primary_step_j - j_primary += j_primary_step_j - k_primary += k_primary_step_j + i_primary += i_primary_step_i + j_primary += j_primary_step_i + k_primary += k_primary_step_i # Increment the secondary element surface indices - i_secondary += i_secondary_step_j - j_secondary += j_secondary_step_j + i_secondary += i_secondary_step_i + j_secondary += j_secondary_step_i end + # Increment the primary element indices + i_primary += i_primary_step_j + j_primary += j_primary_step_j + k_primary += k_primary_step_j + # Increment the secondary element surface indices + i_secondary += i_secondary_step_j + j_secondary += j_secondary_step_j end - return nothing end # Inlined function for interface flux computation for conservative flux terms @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + ::Type{<:Union{P4estMesh{3}, T8codeMesh{3}}}, have_nonconservative_terms::False, equations, - surface_integral, dg::DG, cache, + surface_integral, solverT::Type{<:DG}, + u_interface, interface_index, normal_direction, primary_i_node_index, primary_j_node_index, primary_direction_index, primary_element_index, secondary_i_node_index, secondary_j_node_index, secondary_direction_index, secondary_element_index) - 
@unpack u = cache.interfaces @unpack surface_flux = surface_integral - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_i_node_index, + u_ll, u_rr = get_surface_node_vars(u_interface, equations, solverT, + primary_i_node_index, primary_j_node_index, interface_index) flux_ = surface_flux(u_ll, u_rr, normal_direction, equations) @@ -281,19 +365,21 @@ end # Inlined function for interface flux computation for flux + nonconservative terms @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + MeshT::Type{<:Union{P4estMesh{3}, T8codeMesh{3}}}, have_nonconservative_terms::True, equations, - surface_integral, dg::DG, cache, + surface_integral, solverT::Type{<:DG}, + u_interface, interface_index, normal_direction, primary_i_node_index, primary_j_node_index, primary_direction_index, primary_element_index, secondary_i_node_index, secondary_j_node_index, secondary_direction_index, secondary_element_index) - calc_interface_flux!(surface_flux_values, mesh, have_nonconservative_terms, + calc_interface_flux!(surface_flux_values, MeshT, have_nonconservative_terms, combine_conservative_and_nonconservative_fluxes(surface_integral.surface_flux, equations), - equations, surface_integral, dg, cache, interface_index, + equations, surface_integral, solverT, u_interface, + interface_index, normal_direction, primary_i_node_index, primary_j_node_index, primary_direction_index, primary_element_index, secondary_i_node_index, secondary_j_node_index, @@ -302,21 +388,21 @@ end end @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + ::Type{<:Union{P4estMesh{3}, T8codeMesh{3}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::False, equations, - surface_integral, dg::DG, cache, + surface_integral, solverT::Type{<:DG}, + u_interface, interface_index, normal_direction, primary_i_node_index, primary_j_node_index, primary_direction_index, 
primary_element_index, secondary_i_node_index, secondary_j_node_index, secondary_direction_index, secondary_element_index) - @unpack u = cache.interfaces surface_flux, nonconservative_flux = surface_integral.surface_flux - - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_i_node_index, + u_ll, u_rr = get_surface_node_vars(u_interface, equations, solverT, + primary_i_node_index, primary_j_node_index, interface_index) flux_ = surface_flux(u_ll, u_rr, normal_direction, equations) @@ -343,21 +429,22 @@ end end @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + ::Type{<:Union{P4estMesh{3}, T8codeMesh{3}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::True, equations, - surface_integral, dg::DG, cache, + surface_integral, solverT::Type{<:DG}, + u_interface, interface_index, normal_direction, primary_i_node_index, primary_j_node_index, primary_direction_index, primary_element_index, secondary_i_node_index, secondary_j_node_index, secondary_direction_index, secondary_element_index) - @unpack u = cache.interfaces @unpack surface_flux = surface_integral - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_i_node_index, - primary_j_node_index, interface_index) + u_ll, u_rr = get_surface_node_vars(u_interface, equations, solverT, + primary_i_node_index, primary_j_node_index, + interface_index) flux_left, flux_right = surface_flux(u_ll, u_rr, normal_direction, equations) @@ -463,7 +550,7 @@ end # inlined version of the boundary flux calculation along a physical interface @inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, - nonconservative_terms::False, equations, + have_nonconservative_terms::False, equations, surface_integral, dg::DG, cache, i_index, j_index, k_index, i_node_index, j_node_index, direction_index, @@ -499,13 +586,13 @@ end # inlined version of the boundary flux calculation along 
a physical interface @inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, - nonconservative_terms::True, equations, + have_nonconservative_terms::True, equations, surface_integral, dg::DG, cache, i_index, j_index, k_index, i_node_index, j_node_index, direction_index, element_index, boundary_index) calc_boundary_flux!(surface_flux_values, t, boundary_condition, mesh, - nonconservative_terms, + have_nonconservative_terms, combine_conservative_and_nonconservative_fluxes(surface_integral.surface_flux, equations), equations, @@ -517,7 +604,7 @@ end @inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, - nonconservative_terms::True, + have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::False, equations, surface_integral, dg::DG, cache, i_index, j_index, @@ -560,7 +647,7 @@ end @inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, - nonconservative_terms::True, + have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::True, equations, surface_integral, dg::DG, cache, i_index, j_index, @@ -922,13 +1009,54 @@ end return nothing end -function calc_surface_integral!(du, u, +function calc_surface_integral!(backend::Nothing, du, u, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, equations, surface_integral::SurfaceIntegralWeakForm, dg::DGSEM, cache) @unpack inverse_weights = dg.basis @unpack surface_flux_values = cache.elements + @threaded for element in eachelement(dg, cache) + calc_surface_integral_per_element!(du, typeof(mesh), + equations, surface_integral, + dg, inverse_weights[1], + surface_flux_values, + element) + end + return nothing +end + +function calc_surface_integral!(backend::Backend, du, u, + mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, 
cache) + @unpack inverse_weights = dg.basis + @unpack surface_flux_values = cache.elements + + kernel! = calc_surface_integral_KAkernel!(backend) + kernel!(du, typeof(mesh), equations, surface_integral, dg, inverse_weights[1], + surface_flux_values, ndrange = nelements(cache.elements)) + return nothing +end + +@kernel function calc_surface_integral_KAkernel!(du, MeshT, equations, + surface_integral, dg, factor, + surface_flux_values) + element = @index(Global) + calc_surface_integral_per_element!(du, MeshT, + equations, surface_integral, dg, factor, + surface_flux_values, element) +end + +@inline function calc_surface_integral_per_element!(du, + ::Type{<:Union{P4estMesh{3}, + T8codeMesh{3}}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, factor, + surface_flux_values, + element) # Note that all fluxes have been computed with outward-pointing normal vectors. # This computes the **negative** surface integral contribution, # i.e., M^{-1} * boundary_interpolation^T (which is for Gauss-Lobatto DGSEM just M^{-1} * B) @@ -936,49 +1064,48 @@ function calc_surface_integral!(du, u, # # We also use explicit assignments instead of `+=` to let `@muladd` turn these # into FMAs (see comment at the top of the file). 
- factor = inverse_weights[1] # For LGL basis: Identical to weighted boundary interpolation at x = ±1 - @threaded for element in eachelement(dg, cache) - for m in eachnode(dg), l in eachnode(dg) - for v in eachvariable(equations) - # surface at -x - du[v, 1, l, m, element] = (du[v, 1, l, m, element] + - surface_flux_values[v, l, m, 1, - element] * - factor) - - # surface at +x - du[v, nnodes(dg), l, m, element] = (du[v, nnodes(dg), l, m, element] + - surface_flux_values[v, l, m, 2, - element] * - factor) - - # surface at -y - du[v, l, 1, m, element] = (du[v, l, 1, m, element] + - surface_flux_values[v, l, m, 3, - element] * - factor) - - # surface at +y - du[v, l, nnodes(dg), m, element] = (du[v, l, nnodes(dg), m, element] + - surface_flux_values[v, l, m, 4, - element] * - factor) - - # surface at -z - du[v, l, m, 1, element] = (du[v, l, m, 1, element] + - surface_flux_values[v, l, m, 5, - element] * - factor) - - # surface at +z - du[v, l, m, nnodes(dg), element] = (du[v, l, m, nnodes(dg), element] + - surface_flux_values[v, l, m, 6, - element] * - factor) - end + # + # factor = inverse_weights[1] + # For LGL basis: Identical to weighted boundary interpolation at x = ±1 + for m in eachnode(dg), l in eachnode(dg) + for v in eachvariable(equations) + # surface at -x + du[v, 1, l, m, element] = (du[v, 1, l, m, element] + + surface_flux_values[v, l, m, 1, + element] * + factor) + + # surface at +x + du[v, nnodes(dg), l, m, element] = (du[v, nnodes(dg), l, m, element] + + surface_flux_values[v, l, m, 2, + element] * + factor) + + # surface at -y + du[v, l, 1, m, element] = (du[v, l, 1, m, element] + + surface_flux_values[v, l, m, 3, + element] * + factor) + + # surface at +y + du[v, l, nnodes(dg), m, element] = (du[v, l, nnodes(dg), m, element] + + surface_flux_values[v, l, m, 4, + element] * + factor) + + # surface at -z + du[v, l, m, 1, element] = (du[v, l, m, 1, element] + + surface_flux_values[v, l, m, 5, + element] * + factor) + + # surface at +z + du[v, l, m, 
nnodes(dg), element] = (du[v, l, m, nnodes(dg), element] + + surface_flux_values[v, l, m, 6, + element] * + factor) end end - return nothing end end # @muladd diff --git a/src/solvers/dgsem_p4est/dg_3d_parabolic.jl b/src/solvers/dgsem_p4est/dg_3d_parabolic.jl index 635fb9b72bc..4dd650e5c4f 100644 --- a/src/solvers/dgsem_p4est/dg_3d_parabolic.jl +++ b/src/solvers/dgsem_p4est/dg_3d_parabolic.jl @@ -195,24 +195,24 @@ end return nothing end -# This is the version used when calculating the divergence of the viscous fluxes. +# This is the version used when calculating the divergence of the parabolic fluxes. # Identical to weak-form volume integral/kernel for the purely hyperbolic case, -# except that the fluxes are here already precomputed in `calc_viscous_fluxes!` -function calc_volume_integral!(du, flux_viscous, mesh::P4estMesh{3}, +# except that the fluxes are here already precomputed in `calc_parabolic_fluxes!` +function calc_volume_integral!(du, flux_parabolic, mesh::P4estMesh{3}, equations_parabolic::AbstractEquationsParabolic, dg::DGSEM, cache) (; derivative_hat) = dg.basis (; contravariant_vectors) = cache.elements - flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous + flux_parabolic_x, flux_parabolic_y, flux_parabolic_z = flux_parabolic @threaded for element in eachelement(dg, cache) # Calculate volume terms in one element for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - flux1 = get_node_vars(flux_viscous_x, equations_parabolic, dg, + flux1 = get_node_vars(flux_parabolic_x, equations_parabolic, dg, i, j, k, element) - flux2 = get_node_vars(flux_viscous_y, equations_parabolic, dg, + flux2 = get_node_vars(flux_parabolic_y, equations_parabolic, dg, i, j, k, element) - flux3 = get_node_vars(flux_viscous_z, equations_parabolic, dg, + flux3 = get_node_vars(flux_parabolic_z, equations_parabolic, dg, i, j, k, element) # Compute the contravariant flux by taking the scalar product of the @@ -253,18 +253,18 @@ function calc_volume_integral!(du, 
flux_viscous, mesh::P4estMesh{3}, return nothing end -# This is the version used when calculating the divergence of the viscous fluxes. -# Specialization `flux_viscous::Tuple` needed to +# This is the version used when calculating the divergence of the parabolic fluxes. +# Specialization `flux_parabolic::Tuple` needed to # avoid amibiguity with the hyperbolic version of `prolong2interfaces!` in dg_3d.jl # which is for the variables itself, i.e., `u::Array{uEltype, 5}`. -function prolong2interfaces!(cache, flux_viscous::Tuple, +function prolong2interfaces!(cache, flux_parabolic::Tuple, mesh::P4estMesh{3}, equations_parabolic::AbstractEquationsParabolic, dg::DG) (; interfaces) = cache (; contravariant_vectors) = cache.elements index_range = eachnode(dg) - flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous + flux_parabolic_x, flux_parabolic_y, flux_parabolic_z = flux_parabolic @threaded for interface in eachinterface(dg, cache) # Copy solution data from the primary element using "delayed indexing" with @@ -298,14 +298,23 @@ function prolong2interfaces!(cache, flux_viscous::Tuple, for v in eachvariable(equations_parabolic) # OBS! `interfaces.u` stores the interpolated *fluxes* and *not the solution*! 
- flux_viscous = SVector(flux_viscous_x[v, i_primary, j_primary, - k_primary, primary_element], - flux_viscous_y[v, i_primary, j_primary, - k_primary, primary_element], - flux_viscous_z[v, i_primary, j_primary, - k_primary, primary_element]) - - interfaces.u[1, v, i, j, interface] = dot(flux_viscous, + flux_parabolic = SVector(flux_parabolic_x[v, + i_primary, + j_primary, + k_primary, + primary_element], + flux_parabolic_y[v, + i_primary, + j_primary, + k_primary, + primary_element], + flux_parabolic_z[v, + i_primary, + j_primary, + k_primary, + primary_element]) + + interfaces.u[1, v, i, j, interface] = dot(flux_parabolic, normal_direction) end i_primary += i_primary_step_i @@ -346,18 +355,24 @@ function prolong2interfaces!(cache, flux_viscous::Tuple, for v in eachvariable(equations_parabolic) # OBS! `interfaces.u` stores the interpolated *fluxes* and *not the solution*! - flux_viscous = SVector(flux_viscous_x[v, i_secondary, j_secondary, - k_secondary, - secondary_element], - flux_viscous_y[v, i_secondary, j_secondary, - k_secondary, - secondary_element], - flux_viscous_z[v, i_secondary, j_secondary, - k_secondary, - secondary_element]) + flux_parabolic = SVector(flux_parabolic_x[v, + i_secondary, + j_secondary, + k_secondary, + secondary_element], + flux_parabolic_y[v, + i_secondary, + j_secondary, + k_secondary, + secondary_element], + flux_parabolic_z[v, + i_secondary, + j_secondary, + k_secondary, + secondary_element]) # store the normal flux with respect to the primary normal direction, # which is the negative of the secondary normal direction - interfaces.u[2, v, i, j, interface] = -dot(flux_viscous, + interfaces.u[2, v, i, j, interface] = -dot(flux_parabolic, normal_direction) end i_secondary += i_secondary_step_i @@ -423,16 +438,17 @@ function calc_interface_flux!(surface_flux_values, mesh::P4estMesh{3}, contravariant_vectors, i_primary, j_primary, k_primary, primary_element) - # We prolong the viscous flux dotted with respect the outward normal on the + 
# We prolong the parabolic flux dotted with respect the outward normal on the # primary element. - viscous_flux_normal_ll, viscous_flux_normal_rr = get_surface_node_vars(cache.interfaces.u, - equations_parabolic, - dg, - i, - j, - interface) - - flux_ = flux_parabolic(viscous_flux_normal_ll, viscous_flux_normal_rr, + parabolic_flux_normal_ll, parabolic_flux_normal_rr = get_surface_node_vars(cache.interfaces.u, + equations_parabolic, + dg, + i, + j, + interface) + + flux_ = flux_parabolic(parabolic_flux_normal_ll, + parabolic_flux_normal_rr, normal_direction, Divergence(), equations_parabolic, parabolic_scheme) @@ -464,7 +480,7 @@ function calc_interface_flux!(surface_flux_values, mesh::P4estMesh{3}, return nothing end -function prolong2mortars_divergence!(cache, flux_viscous, +function prolong2mortars_divergence!(cache, flux_parabolic, mesh::P4estMesh{3}, equations_parabolic, mortar_l2::LobattoLegendreMortarL2, dg::DGSEM) @@ -473,7 +489,7 @@ function prolong2mortars_divergence!(cache, flux_viscous, @unpack contravariant_vectors = cache.elements index_range = eachnode(dg) - flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous + flux_parabolic_x, flux_parabolic_y, flux_parabolic_z = flux_parabolic @threaded for mortar in eachmortar(dg, cache) # Copy solution data from the small elements using "delayed indexing" with @@ -501,14 +517,14 @@ function prolong2mortars_divergence!(cache, flux_viscous, element) for v in eachvariable(equations_parabolic) - flux_viscous = SVector(flux_viscous_x[v, i_small, j_small, - k_small, element], - flux_viscous_y[v, i_small, j_small, - k_small, element], - flux_viscous_z[v, i_small, j_small, - k_small, element]) - - cache.mortars.u[1, v, position, i, j, mortar] = dot(flux_viscous, + flux_parabolic = SVector(flux_parabolic_x[v, i_small, j_small, + k_small, element], + flux_parabolic_y[v, i_small, j_small, + k_small, element], + flux_parabolic_z[v, i_small, j_small, + k_small, element]) + + cache.mortars.u[1, v, position, i, j, 
mortar] = dot(flux_parabolic, normal_direction) end i_small += i_small_step_i @@ -551,18 +567,27 @@ function prolong2mortars_divergence!(cache, flux_viscous, element) for v in eachvariable(equations_parabolic) - flux_viscous = SVector(flux_viscous_x[v, i_large, j_large, k_large, - element], - flux_viscous_y[v, i_large, j_large, k_large, - element], - flux_viscous_z[v, i_large, j_large, k_large, - element]) - - # We prolong the viscous flux dotted with respect the outward normal + flux_parabolic = SVector(flux_parabolic_x[v, + i_large, + j_large, + k_large, + element], + flux_parabolic_y[v, + i_large, + j_large, + k_large, + element], + flux_parabolic_z[v, + i_large, + j_large, + k_large, + element]) + + # We prolong the parabolic flux dotted with respect the outward normal # on the small element. We scale by -1/2 here because the normal # direction on the large element is negative 2x that of the small # element (these normal directions are "scaled" by the surface Jacobian) - u_buffer[v, i, j] = -0.5f0 * dot(flux_viscous, normal_direction) + u_buffer[v, i, j] = -0.5f0 * dot(flux_parabolic, normal_direction) end i_large += i_large_step_i j_large += j_large_step_i @@ -634,13 +659,13 @@ function calc_mortar_flux_divergence!(surface_flux_values, element) for v in eachvariable(equations_parabolic) - viscous_flux_normal_ll = cache.mortars.u[1, v, position, i, j, - mortar] - viscous_flux_normal_rr = cache.mortars.u[2, v, position, i, j, - mortar] + parabolic_flux_normal_ll = cache.mortars.u[1, v, position, i, j, + mortar] + parabolic_flux_normal_rr = cache.mortars.u[2, v, position, i, j, + mortar] - flux_ = flux_parabolic(viscous_flux_normal_ll, - viscous_flux_normal_rr, + flux_ = flux_parabolic(parabolic_flux_normal_ll, + parabolic_flux_normal_rr, normal_direction, Divergence(), equations_parabolic, parabolic_scheme) @@ -748,7 +773,7 @@ end # We structure `calc_mortar_flux_gradient!` similarly to "calc_mortar_flux!" 
for # hyperbolic equations with no nonconservative terms. # The reasoning is that parabolic fluxes are treated like conservative -# terms (e.g., we compute a viscous conservative "flux") and thus no +# terms (e.g., we compute a parabolic conservative "flux") and thus no # non-conservative terms are present. @inline function calc_mortar_flux_gradient!(fstar_primary, fstar_secondary, mesh::P4estMesh{3}, @@ -852,10 +877,10 @@ function calc_volume_integral_gradient!(gradients, u_transformed, return nothing end -# Specialization `flux_viscous::Tuple` needed to +# Specialization `flux_parabolic::Tuple` needed to # avoid amibiguity with the hyperbolic version of `prolong2boundaries!` in dg_3d.jl # which is for the variables itself, i.e., `u::Array{uEltype, 5}`. -function prolong2boundaries!(cache, flux_viscous::Tuple, +function prolong2boundaries!(cache, flux_parabolic::Tuple, mesh::P4estMesh{3}, equations_parabolic::AbstractEquationsParabolic, dg::DG) @@ -863,7 +888,7 @@ function prolong2boundaries!(cache, flux_viscous::Tuple, (; contravariant_vectors) = cache.elements index_range = eachnode(dg) - flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous + flux_parabolic_x, flux_parabolic_y, flux_parabolic_z = flux_parabolic @threaded for boundary in eachboundary(dg, cache) # Copy solution data from the element using "delayed indexing" with @@ -891,14 +916,14 @@ function prolong2boundaries!(cache, flux_viscous::Tuple, i_node, j_node, k_node, element) for v in eachvariable(equations_parabolic) - flux_viscous = SVector(flux_viscous_x[v, i_node, j_node, k_node, - element], - flux_viscous_y[v, i_node, j_node, k_node, - element], - flux_viscous_z[v, i_node, j_node, k_node, - element]) - - boundaries.u[v, i, j, boundary] = dot(flux_viscous, + flux_parabolic = SVector(flux_parabolic_x[v, i_node, j_node, k_node, + element], + flux_parabolic_y[v, i_node, j_node, k_node, + element], + flux_parabolic_z[v, i_node, j_node, k_node, + element]) + + boundaries.u[v, i, j, boundary] = 
dot(flux_parabolic, normal_direction) end i_node += i_node_step_i @@ -1092,7 +1117,7 @@ end # Needed to *not* flip the sign of the inverse Jacobian. # This is because the parabolic fluxes are assumed to be of the form # `du/dt + df/dx = dg/dx + source(x,t)`, -# where f(u) is the inviscid flux and g(u) is the viscous flux. +# where f(u) is the inviscid flux and g(u) is the parabolic flux. function apply_jacobian_parabolic!(du::AbstractArray, mesh::P4estMesh{3}, equations_parabolic::AbstractEquationsParabolic, dg::DG, cache) diff --git a/src/solvers/dgsem_p4est/dg_3d_parallel.jl b/src/solvers/dgsem_p4est/dg_3d_parallel.jl index bb5ee8e9196..91a700a847c 100644 --- a/src/solvers/dgsem_p4est/dg_3d_parallel.jl +++ b/src/solvers/dgsem_p4est/dg_3d_parallel.jl @@ -9,6 +9,8 @@ function rhs!(du, u, t, mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} + backend = trixi_backend(u) + # Start to receive MPI data @trixi_timeit timer() "start MPI receive" start_mpi_receive!(cache.mpi_cache) @@ -33,19 +35,19 @@ function rhs!(du, u, t, # Calculate volume integral @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, u, mesh, + calc_volume_integral!(backend, du, u, mesh, have_nonconservative_terms(equations), equations, dg.volume_integral, dg, cache) end # Prolong solution to interfaces @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, u, mesh, equations, dg) + prolong2interfaces!(backend, cache, u, mesh, equations, dg) end # Calculate interface fluxes @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache.elements.surface_flux_values, mesh, + calc_interface_flux!(backend, cache.elements.surface_flux_values, mesh, have_nonconservative_terms(equations), equations, dg.surface_integral, dg, cache) end @@ -95,11 +97,13 @@ function rhs!(du, u, t, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - 
calc_surface_integral!(du, u, mesh, equations, dg.surface_integral, dg, cache) + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, + cache) end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin diff --git a/src/solvers/dgsem_p4est/dg_3d_subcell_limiters.jl b/src/solvers/dgsem_p4est/dg_3d_subcell_limiters.jl index 40f3d4c5159..f998c9989eb 100644 --- a/src/solvers/dgsem_p4est/dg_3d_subcell_limiters.jl +++ b/src/solvers/dgsem_p4est/dg_3d_subcell_limiters.jl @@ -5,122 +5,12 @@ @muladd begin #! format: noindent -function create_cache(mesh::P4estMesh{3}, - equations, volume_integral::VolumeIntegralSubcellLimiting, - dg::DG, cache_containers, uEltype) - cache = create_cache(mesh, equations, - VolumeIntegralPureLGLFiniteVolume(volume_integral.volume_flux_fv), - dg, cache_containers, uEltype) - - fhat1_L_threaded, fhat1_R_threaded, - fhat2_L_threaded, fhat2_R_threaded, - fhat3_L_threaded, fhat3_R_threaded = create_f_threaded(mesh, equations, dg, uEltype) - - A4d = Array{uEltype, 4} - flux_temp_threaded = A4d[A4d(undef, nvariables(equations), - nnodes(dg), nnodes(dg), nnodes(dg)) - for _ in 1:Threads.maxthreadid()] - fhat_temp_threaded = A4d[A4d(undef, nvariables(equations), - nnodes(dg), nnodes(dg), nnodes(dg)) - for _ in 1:Threads.maxthreadid()] - - antidiffusive_fluxes = ContainerAntidiffusiveFlux3D{uEltype}(0, - nvariables(equations), - nnodes(dg)) - - if have_nonconservative_terms(equations) == true - A5d = Array{uEltype, 5} - # Extract the nonconservative flux as a dispatch argument for `n_nonconservative_terms` - _, volume_flux_noncons = volume_integral.volume_flux_dg - - flux_nonconservative_temp_threaded = A5d[A5d(undef, nvariables(equations), - n_nonconservative_terms(volume_flux_noncons), - 
nnodes(dg), nnodes(dg), - nnodes(dg)) - for _ in 1:Threads.maxthreadid()] - fhat_nonconservative_temp_threaded = A5d[A5d(undef, nvariables(equations), - n_nonconservative_terms(volume_flux_noncons), - nnodes(dg), nnodes(dg), - nnodes(dg)) - for _ in 1:Threads.maxthreadid()] - phi_threaded = A5d[A5d(undef, nvariables(equations), - n_nonconservative_terms(volume_flux_noncons), - nnodes(dg), nnodes(dg), nnodes(dg)) - for _ in 1:Threads.maxthreadid()] - cache = (; cache..., flux_nonconservative_temp_threaded, - fhat_nonconservative_temp_threaded, phi_threaded) - end - - return (; cache..., antidiffusive_fluxes, - fhat1_L_threaded, fhat1_R_threaded, - fhat2_L_threaded, fhat2_R_threaded, - fhat3_L_threaded, fhat3_R_threaded, - flux_temp_threaded, fhat_temp_threaded) -end - -# Subcell limiting currently only implemented for certain mesh types -@inline function volume_integral_kernel!(du, u, element, - mesh::P4estMesh{3}, - nonconservative_terms, equations, - volume_integral::VolumeIntegralSubcellLimiting, - limiter::SubcellLimiterIDP, - dg::DGSEM, cache) - @unpack inverse_weights = dg.basis # Plays role of DG subcell sizes - @unpack volume_flux_dg, volume_flux_fv = volume_integral - - # high-order DG fluxes - @unpack fhat1_L_threaded, fhat1_R_threaded, fhat2_L_threaded, fhat2_R_threaded, fhat3_L_threaded, fhat3_R_threaded = cache - - fhat1_L = fhat1_L_threaded[Threads.threadid()] - fhat1_R = fhat1_R_threaded[Threads.threadid()] - fhat2_L = fhat2_L_threaded[Threads.threadid()] - fhat2_R = fhat2_R_threaded[Threads.threadid()] - fhat3_L = fhat3_L_threaded[Threads.threadid()] - fhat3_R = fhat3_R_threaded[Threads.threadid()] - calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R, - u, mesh, nonconservative_terms, equations, volume_flux_dg, - dg, element, cache) - - # low-order FV fluxes - @unpack fstar1_L_threaded, fstar1_R_threaded, fstar2_L_threaded, fstar2_R_threaded, fstar3_L_threaded, fstar3_R_threaded = cache - - fstar1_L = 
fstar1_L_threaded[Threads.threadid()] - fstar1_R = fstar1_R_threaded[Threads.threadid()] - fstar2_L = fstar2_L_threaded[Threads.threadid()] - fstar2_R = fstar2_R_threaded[Threads.threadid()] - fstar3_L = fstar3_L_threaded[Threads.threadid()] - fstar3_R = fstar3_R_threaded[Threads.threadid()] - calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, - u, mesh, nonconservative_terms, equations, volume_flux_fv, - dg, element, cache) - - # antidiffusive flux - calcflux_antidiffusive!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R, - fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, - u, mesh, nonconservative_terms, equations, limiter, - dg, element, cache) - - # Calculate volume integral contribution of low-order FV flux - for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - for v in eachvariable(equations) - du[v, i, j, k, element] += inverse_weights[i] * - (fstar1_L[v, i + 1, j, k] - fstar1_R[v, i, j, k]) + - inverse_weights[j] * - (fstar2_L[v, i, j + 1, k] - fstar2_R[v, i, j, k]) + - inverse_weights[k] * - (fstar3_L[v, i, j, k + 1] - fstar3_R[v, i, j, k]) - end - end - - return nothing -end - # Calculate the DG staggered volume fluxes `fhat` in subcell FV-form inside the element # (**without non-conservative terms**). # # See also `flux_differencing_kernel!`. 
@inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R, - u, mesh::P4estMesh{3}, + u, ::Type{<:P4estMesh{3}}, nonconservative_terms::False, equations, volume_flux, dg::DGSEM, element, cache) (; contravariant_vectors) = cache.elements @@ -171,12 +61,12 @@ end end # FV-form flux `fhat` in x direction - for k in eachnode(dg), j in eachnode(dg), i in 1:(nnodes(dg) - 1), - v in eachvariable(equations) - - fhat1_L[v, i + 1, j, k] = fhat1_L[v, i, j, k] + - weights[i] * flux_temp[v, i, j, k] - fhat1_R[v, i + 1, j, k] = fhat1_L[v, i + 1, j, k] + for k in eachnode(dg), j in eachnode(dg), i in 1:(nnodes(dg) - 1) + for v in eachvariable(equations) + fhat1_L[v, i + 1, j, k] = fhat1_L[v, i, j, k] + + weights[i] * flux_temp[v, i, j, k] + fhat1_R[v, i + 1, j, k] = fhat1_L[v, i + 1, j, k] + end end # Split form volume flux in orientation 2: y direction @@ -206,12 +96,12 @@ end end # FV-form flux `fhat` in y direction - for k in eachnode(dg), j in 1:(nnodes(dg) - 1), i in eachnode(dg), - v in eachvariable(equations) - - fhat2_L[v, i, j + 1, k] = fhat2_L[v, i, j, k] + - weights[j] * flux_temp[v, i, j, k] - fhat2_R[v, i, j + 1, k] = fhat2_L[v, i, j + 1, k] + for k in eachnode(dg), j in 1:(nnodes(dg) - 1), i in eachnode(dg) + for v in eachvariable(equations) + fhat2_L[v, i, j + 1, k] = fhat2_L[v, i, j, k] + + weights[j] * flux_temp[v, i, j, k] + fhat2_R[v, i, j + 1, k] = fhat2_L[v, i, j + 1, k] + end end # Split form volume flux in orientation 3: z direction @@ -241,12 +131,12 @@ end end # FV-form flux `fhat` in z direction - for k in 1:(nnodes(dg) - 1), j in eachnode(dg), i in eachnode(dg), - v in eachvariable(equations) - - fhat3_L[v, i, j, k + 1] = fhat3_L[v, i, j, k] + - weights[k] * flux_temp[v, i, j, k] - fhat3_R[v, i, j, k + 1] = fhat3_L[v, i, j, k + 1] + for k in 1:(nnodes(dg) - 1), j in eachnode(dg), i in eachnode(dg) + for v in eachvariable(equations) + fhat3_L[v, i, j, k + 1] = fhat3_L[v, i, j, k] + + weights[k] * flux_temp[v, i, j, k] + 
fhat3_R[v, i, j, k + 1] = fhat3_L[v, i, j, k + 1] + end end return nothing @@ -264,7 +154,7 @@ end # Discretizations of Non-Conservative Systems. https://arxiv.org/pdf/2211.14009.pdf. # @inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R, - u, mesh::P4estMesh{3}, + u, ::Type{<:P4estMesh{3}}, nonconservative_terms::True, equations, volume_flux::Tuple{F_CONS, F_NONCONS}, dg::DGSEM, element, @@ -544,106 +434,7 @@ end return nothing end -# Calculate the antidiffusive flux `antidiffusive_flux` as the subtraction between `fhat` and `fstar` for conservative systems. -@inline function calcflux_antidiffusive!(fhat1_L, fhat1_R, - fhat2_L, fhat2_R, - fhat3_L, fhat3_R, - fstar1_L, fstar1_R, - fstar2_L, fstar2_R, - fstar3_L, fstar3_R, - u, mesh::P4estMesh{3}, - nonconservative_terms::False, equations, - limiter::SubcellLimiterIDP, dg, element, cache) - @unpack antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R = cache.antidiffusive_fluxes - - # Due to the use of LGL nodes, the DG staggered fluxes `fhat` and FV fluxes `fstar` are equal - # on the element interfaces. So, they are not computed in the volume integral and set to zero - # in their respective computation. - # The antidiffusive fluxes are therefore zero on the element interfaces and don't need to be - # computed either. They are set to zero directly after resizing the container. - # This applies to the indices `i=1` and `i=nnodes(dg)+1` for `antidiffusive_flux1_L` and - # `antidiffusive_flux1_R` and analogously for the other two directions. 
- - for k in eachnode(dg), j in eachnode(dg), i in 2:nnodes(dg) - for v in eachvariable(equations) - antidiffusive_flux1_L[v, i, j, k, element] = fhat1_L[v, i, j, k] - - fstar1_L[v, i, j, k] - antidiffusive_flux1_R[v, i, j, k, element] = antidiffusive_flux1_L[v, - i, j, k, - element] - end - end - for k in eachnode(dg), j in 2:nnodes(dg), i in eachnode(dg) - for v in eachvariable(equations) - antidiffusive_flux2_L[v, i, j, k, element] = fhat2_L[v, i, j, k] - - fstar2_L[v, i, j, k] - antidiffusive_flux2_R[v, i, j, k, element] = antidiffusive_flux2_L[v, - i, j, k, - element] - end - end - for k in 2:nnodes(dg), j in eachnode(dg), i in eachnode(dg) - for v in eachvariable(equations) - antidiffusive_flux3_L[v, i, j, k, element] = fhat3_L[v, i, j, k] - - fstar3_L[v, i, j, k] - antidiffusive_flux3_R[v, i, j, k, element] = antidiffusive_flux3_L[v, - i, j, k, - element] - end - end - - return nothing -end - -# Calculate the antidiffusive flux `antidiffusive_flux` as the subtraction between `fhat` and `fstar` for conservative systems. -@inline function calcflux_antidiffusive!(fhat1_L, fhat1_R, - fhat2_L, fhat2_R, - fhat3_L, fhat3_R, - fstar1_L, fstar1_R, - fstar2_L, fstar2_R, - fstar3_L, fstar3_R, - u, mesh::P4estMesh{3}, - nonconservative_terms::True, equations, - limiter::SubcellLimiterIDP, dg, element, cache) - @unpack antidiffusive_flux1_L, antidiffusive_flux2_L, antidiffusive_flux1_R, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R = cache.antidiffusive_fluxes - - # Due to the use of LGL nodes, the DG staggered fluxes `fhat` and FV fluxes `fstar` are equal - # on the element interfaces. So, they are not computed in the volume integral and set to zero - # in their respective computation. - # The antidiffusive fluxes are therefore zero on the element interfaces and don't need to be - # computed either. They are set to zero directly after resizing the container. 
- # This applies to the indices `i=1` and `i=nnodes(dg)+1` for `antidiffusive_flux1_L` and - # `antidiffusive_flux1_R` and analogously for the other two directions. - - for k in eachnode(dg), j in eachnode(dg), i in 2:nnodes(dg) - for v in eachvariable(equations) - antidiffusive_flux1_L[v, i, j, k, element] = fhat1_L[v, i, j, k] - - fstar1_L[v, i, j, k] - antidiffusive_flux1_R[v, i, j, k, element] = fhat1_R[v, i, j, k] - - fstar1_R[v, i, j, k] - end - end - for k in eachnode(dg), j in 2:nnodes(dg), i in eachnode(dg) - for v in eachvariable(equations) - antidiffusive_flux2_L[v, i, j, k, element] = fhat2_L[v, i, j, k] - - fstar2_L[v, i, j, k] - antidiffusive_flux2_R[v, i, j, k, element] = fhat2_R[v, i, j, k] - - fstar2_R[v, i, j, k] - end - end - for k in 2:nnodes(dg), j in eachnode(dg), i in eachnode(dg) - for v in eachvariable(equations) - antidiffusive_flux3_L[v, i, j, k, element] = fhat3_L[v, i, j, k] - - fstar3_L[v, i, j, k] - antidiffusive_flux3_R[v, i, j, k, element] = fhat3_R[v, i, j, k] - - fstar3_R[v, i, j, k] - end - end - - return nothing -end - -@inline function calc_lambdas_bar_states!(u, t, mesh::P4estMesh{3}, +@inline function calc_lambdas_bar_states!(u, t, mesh::Union{TreeMesh{3}, P4estMesh{3}}, have_nonconservative_terms, equations, limiter, dg, cache, boundary_conditions; calc_bar_states = true) diff --git a/src/solvers/dgsem_p4est/subcell_limiters.jl b/src/solvers/dgsem_p4est/subcell_limiters.jl new file mode 100644 index 00000000000..82df1571b36 --- /dev/null +++ b/src/solvers/dgsem_p4est/subcell_limiters.jl @@ -0,0 +1,53 @@ +# By default, Julia/LLVM does not use fused multiply-add operations (FMAs). +# Since these FMAs can increase the performance of many numerical algorithms, +# we need to opt-in explicitly. +# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. +@muladd begin +#! 
format: noindent + +############################################################################### +# Auxiliary routine `get_boundary_outer_state` for non-periodic domains + +""" + get_boundary_outer_state(u_inner, t, + boundary_condition::BoundaryConditionDirichlet, + normal_direction + mesh, equations, dg, cache, indices...) +For subcell limiting, the calculation of local bounds for non-periodic domains requires the boundary +outer state. This function returns the boundary value for [`BoundaryConditionDirichlet`](@ref) at +time `t` and for node with spatial indices `indices` at the boundary with `normal_direction`. + +Should be used together with [`P4estMesh`](@ref). + +!!! warning "Experimental implementation" + This is an experimental feature and may change in future releases. +""" +@inline function get_boundary_outer_state(u_inner, t, + boundary_condition::BoundaryConditionDirichlet, + normal_direction, + mesh::P4estMesh, + equations, dg, cache, indices...) + (; node_coordinates) = cache.elements + + x = get_node_coords(node_coordinates, equations, dg, indices...) + u_outer = boundary_condition.boundary_value_function(x, t, equations) + + return u_outer +end + +@inline function get_boundary_outer_state(u_inner, t, + boundary_condition::BoundaryConditionCharacteristic, + normal_direction, + mesh::P4estMesh, equations, dg, cache, + indices...) + (; node_coordinates) = cache.elements + + x = get_node_coords(node_coordinates, equations, dg, indices...) + u_outer = boundary_condition.boundary_value_function(boundary_condition.outer_boundary_value_function, + u_inner, + normal_direction, + x, t, equations) + + return u_outer +end +end # @muladd diff --git a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl index b4c2dda6f26..4b5d3f1a0ce 100644 --- a/src/solvers/dgsem_p4est/subcell_limiters_3d.jl +++ b/src/solvers/dgsem_p4est/subcell_limiters_3d.jl @@ -5,62 +5,175 @@ @muladd begin #! 
format: noindent -############################################################################### -# IDP Limiting -############################################################################### - -############################################################################### -# Calculation of local bounds using low-order FV solution - -@inline function calc_bounds_onesided!(var_minmax, min_or_max, variable, - u::AbstractArray{<:Any, 5}, t, semi) - mesh, equations, dg, cache = mesh_equations_solver_cache(semi) - # Calc bounds inside elements - @threaded for element in eachelement(dg, cache) - # Reset bounds - for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - if min_or_max === max - var_minmax[i, j, k, element] = typemin(eltype(var_minmax)) - else - var_minmax[i, j, k, element] = typemax(eltype(var_minmax)) +function calc_bounds_twosided_interface!(var_min, var_max, variable, + u, t, semi, mesh::P4estMesh{3}, equations) + _, _, dg, cache = mesh_equations_solver_cache(semi) + (; boundary_conditions) = semi + + (; neighbor_ids, node_indices) = cache.interfaces + index_range = eachnode(dg) + + # Calc bounds at interfaces and periodic boundaries + for interface in eachinterface(dg, cache) + # Get element and side index information on the primary element + primary_element = neighbor_ids[1, interface] + primary_indices = node_indices[1, interface] + + # Get element and side index information on the secondary element + secondary_element = neighbor_ids[2, interface] + secondary_indices = node_indices[2, interface] + + # Create the local i,j,k indexing + i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], + index_range) + j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], + index_range) + k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3], + index_range) + + i_primary = i_primary_start + j_primary = j_primary_start + k_primary = 
k_primary_start + + i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_indices[1], + index_range) + j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_indices[2], + index_range) + k_secondary_start, k_secondary_step_i, k_secondary_step_j = index_to_start_step_3d(secondary_indices[3], + index_range) + + i_secondary = i_secondary_start + j_secondary = j_secondary_start + k_secondary = k_secondary_start + + for j in eachnode(dg) + for i in eachnode(dg) + var_primary = u[variable, i_primary, j_primary, k_primary, + primary_element] + var_secondary = u[variable, i_secondary, j_secondary, k_secondary, + secondary_element] + + var_min[i_primary, j_primary, k_primary, primary_element] = min(var_min[i_primary, + j_primary, + k_primary, + primary_element], + var_secondary) + var_max[i_primary, j_primary, k_primary, primary_element] = max(var_max[i_primary, + j_primary, + k_primary, + primary_element], + var_secondary) + + var_min[i_secondary, j_secondary, k_secondary, secondary_element] = min(var_min[i_secondary, + j_secondary, + k_secondary, + secondary_element], + var_primary) + var_max[i_secondary, j_secondary, k_secondary, secondary_element] = max(var_max[i_secondary, + j_secondary, + k_secondary, + secondary_element], + var_primary) + + # Increment the primary element indices + i_primary += i_primary_step_i + j_primary += j_primary_step_i + k_primary += k_primary_step_i + # Increment the secondary element surface indices + i_secondary += i_secondary_step_i + j_secondary += j_secondary_step_i + k_secondary += k_secondary_step_i end + # Increment the primary element indices + i_primary += i_primary_step_j + j_primary += j_primary_step_j + k_primary += k_primary_step_j + # Increment the secondary element surface indices + i_secondary += i_secondary_step_j + j_secondary += j_secondary_step_j + k_secondary += k_secondary_step_j end + end - # Calculate bounds at Gauss-Lobatto nodes - for k in 
eachnode(dg), j in eachnode(dg), i in eachnode(dg) - var = variable(get_node_vars(u, equations, dg, i, j, k, element), equations) - var_minmax[i, j, k, element] = min_or_max(var_minmax[i, j, k, element], var) + # Calc bounds at physical boundaries + calc_bounds_twosided_boundary!(var_min, var_max, variable, u, t, + boundary_conditions, + mesh, equations, dg, cache) - if i > 1 - var_minmax[i - 1, j, k, element] = min_or_max(var_minmax[i - 1, j, k, - element], var) - end - if i < nnodes(dg) - var_minmax[i + 1, j, k, element] = min_or_max(var_minmax[i + 1, j, k, - element], var) - end - if j > 1 - var_minmax[i, j - 1, k, element] = min_or_max(var_minmax[i, j - 1, k, - element], var) - end - if j < nnodes(dg) - var_minmax[i, j + 1, k, element] = min_or_max(var_minmax[i, j + 1, k, - element], var) - end - if k > 1 - var_minmax[i, j, k - 1, element] = min_or_max(var_minmax[i, j, k - 1, - element], var) - end - if k < nnodes(dg) - var_minmax[i, j, k + 1, element] = min_or_max(var_minmax[i, j, k + 1, - element], var) + return nothing +end + +@inline function calc_bounds_twosided_boundary!(var_min, var_max, variable, u, t, + boundary_conditions::BoundaryConditionPeriodic, + mesh::P4estMesh{3}, + equations, dg, cache) + return nothing +end + +@inline function calc_bounds_twosided_boundary!(var_min, var_max, variable, u, t, + boundary_conditions, + mesh::P4estMesh{3}, + equations, dg, cache) + (; boundary_condition_types, boundary_indices) = boundary_conditions + (; contravariant_vectors) = cache.elements + + (; boundaries) = cache + index_range = eachnode(dg) + + foreach_enumerate(boundary_condition_types) do (i, boundary_condition) + for boundary in boundary_indices[i] + element = boundaries.neighbor_ids[boundary] + node_indices = boundaries.node_indices[boundary] + direction = indices2direction(node_indices) + + i_node_start, i_node_step_i, i_node_step_j = index_to_start_step_3d(node_indices[1], + index_range) + j_node_start, j_node_step_i, j_node_step_j = 
index_to_start_step_3d(node_indices[2], + index_range) + k_node_start, k_node_step_i, k_node_step_j = index_to_start_step_3d(node_indices[3], + index_range) + + i_node = i_node_start + j_node = j_node_start + k_node = k_node_start + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(direction, + contravariant_vectors, + i_node, j_node, k_node, + element) + + u_inner = get_node_vars(u, equations, dg, i_node, j_node, k_node, + element) + + u_outer = get_boundary_outer_state(u_inner, t, boundary_condition, + normal_direction, + mesh, equations, dg, cache, + i_node, j_node, k_node, element) + var_outer = u_outer[variable] + + var_min[i_node, j_node, k_node, element] = min(var_min[i_node, + j_node, + k_node, + element], + var_outer) + var_max[i_node, j_node, k_node, element] = max(var_max[i_node, + j_node, + k_node, + element], + var_outer) + + i_node += i_node_step_i + j_node += j_node_step_i + k_node += k_node_step_i + end + i_node += i_node_step_j + j_node += j_node_step_j + k_node += k_node_step_j end end end - # Values at element boundary - calc_bounds_onesided_interface!(var_minmax, min_or_max, variable, u, t, semi, mesh) - return nothing end @@ -161,201 +274,65 @@ end return nothing end -############################################################################## -# Local one-sided limiting of nonlinear variables - -@inline function idp_local_onesided!(alpha, limiter, u::AbstractArray{<:Real, 5}, - t, dt, semi, elements, - variable, min_or_max) - mesh, equations, dg, cache = mesh_equations_solver_cache(semi) - (; variable_bounds) = limiter.cache.subcell_limiter_coefficients - var_minmax = variable_bounds[Symbol(string(variable), "_", string(min_or_max))] - calc_bounds_onesided!(var_minmax, min_or_max, variable, u, t, semi) - - # Perform Newton's bisection method to find new alpha - @threaded for element in elements - for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - inverse_jacobian = 
get_inverse_jacobian(cache.elements.inverse_jacobian, - mesh, i, j, k, element) - u_local = get_node_vars(u, equations, dg, i, j, k, element) - newton_loops_alpha!(alpha, var_minmax[i, j, k, element], - u_local, i, j, k, element, - variable, min_or_max, - initial_check_local_onesided_newton_idp, - final_check_local_onesided_newton_idp, - inverse_jacobian, dt, equations, dg, cache, limiter) - end - end - - return nothing -end - -############################################################################### -# Global positivity limiting of conservative variables - -@inline function idp_positivity_conservative!(alpha, limiter, - u::AbstractArray{<:Real, 5}, dt, semi, - elements, variable) - mesh, _, dg, cache = mesh_equations_solver_cache(semi) - (; antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R) = cache.antidiffusive_fluxes - (; inverse_weights) = dg.basis # Plays role of DG subcell sizes - (; positivity_correction_factor) = limiter - - (; variable_bounds) = limiter.cache.subcell_limiter_coefficients - var_min = variable_bounds[Symbol(string(variable), "_min")] - - @threaded for element in elements - for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - inverse_jacobian = get_inverse_jacobian(cache.elements.inverse_jacobian, - mesh, i, j, k, element) - var = u[variable, i, j, k, element] - if var < 0 - error("Safe low-order method produces negative value for conservative variable $variable. Try a smaller time step.") - end - - # Compute bound - var_min[i, j, k, element] = positivity_correction_factor * var - - # Real one-sided Zalesak-type limiter - # * Zalesak (1979). "Fully multidimensional flux-corrected transport algorithms for fluids" - # * Kuzmin et al. (2010). 
"Failsafe flux limiting and constrained data projections for equations of gas dynamics" - # Note: The Zalesak limiter has to be computed, even if the state is valid, because the correction is - # for each interface, not each node - Qm = min(0, (var_min[i, j, k, element] - var) / dt) - - # Calculate Pm - # Note: Boundaries of antidiffusive_flux1/2/3 are constant 0, so they make no difference here. - val_flux1_local = inverse_weights[i] * - antidiffusive_flux1_R[variable, i, j, k, element] - val_flux1_local_ip1 = -inverse_weights[i] * - antidiffusive_flux1_L[variable, i + 1, j, k, element] - val_flux2_local = inverse_weights[j] * - antidiffusive_flux2_R[variable, i, j, k, element] - val_flux2_local_jp1 = -inverse_weights[j] * - antidiffusive_flux2_L[variable, i, j + 1, k, element] - val_flux3_local = inverse_weights[k] * - antidiffusive_flux3_R[variable, i, j, k, element] - val_flux3_local_jp1 = -inverse_weights[k] * - antidiffusive_flux3_L[variable, i, j, k + 1, element] - - Pm = min(0, val_flux1_local) + min(0, val_flux1_local_ip1) + - min(0, val_flux2_local) + min(0, val_flux2_local_jp1) + - min(0, val_flux3_local) + min(0, val_flux3_local_jp1) - Pm = inverse_jacobian * Pm - - # Compute blending coefficient avoiding division by zero - # (as in paper of [Guermond, Nazarov, Popov, Thomas] (4.8)) - Qm = abs(Qm) / (abs(Pm) + eps(typeof(Qm)) * 100) - - # Calculate alpha - alpha[i, j, k, element] = max(alpha[i, j, k, element], 1 - Qm) - end - end - - return nothing -end - -############################################################################### -# Global positivity limiting of nonlinear variables - -@inline function idp_positivity_nonlinear!(alpha, limiter, - u::AbstractArray{<:Real, 5}, - dt, semi, elements, variable) - mesh, equations, dg, cache = mesh_equations_solver_cache(semi) - (; positivity_correction_factor) = limiter - - (; variable_bounds) = limiter.cache.subcell_limiter_coefficients - var_min = variable_bounds[Symbol(string(variable), "_min")] 
+@inline function calc_bounds_onesided_boundary!(var_minmax, minmax, variable, u, t, + boundary_conditions, + mesh::P4estMesh{3}, + equations, dg, cache) + (; boundary_condition_types, boundary_indices) = boundary_conditions + (; contravariant_vectors) = cache.elements - @threaded for element in elements - for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - inverse_jacobian = get_inverse_jacobian(cache.elements.inverse_jacobian, - mesh, i, j, k, element) + (; boundaries) = cache + index_range = eachnode(dg) - # Compute bound - u_local = get_node_vars(u, equations, dg, i, j, k, element) - var = variable(u_local, equations) - if var < 0 - error("Safe low-order method produces negative value for variable $variable. Try a smaller time step.") + foreach_enumerate(boundary_condition_types) do (i, boundary_condition) + for boundary in boundary_indices[i] + element = boundaries.neighbor_ids[boundary] + node_indices = boundaries.node_indices[boundary] + direction = indices2direction(node_indices) + + i_node_start, i_node_step_i, i_node_step_j = index_to_start_step_3d(node_indices[1], + index_range) + j_node_start, j_node_step_i, j_node_step_j = index_to_start_step_3d(node_indices[2], + index_range) + k_node_start, k_node_step_i, k_node_step_j = index_to_start_step_3d(node_indices[3], + index_range) + + i_node = i_node_start + j_node = j_node_start + k_node = k_node_start + for j in eachnode(dg) + for i in eachnode(dg) + normal_direction = get_normal_direction(direction, + contravariant_vectors, + i_node, j_node, k_node, + element) + + u_inner = get_node_vars(u, equations, dg, i_node, j_node, k_node, + element) + + u_outer = get_boundary_outer_state(u_inner, t, boundary_condition, + normal_direction, + mesh, equations, dg, cache, + i_node, j_node, k_node, element) + var_outer = variable(u_outer, equations) + + var_minmax[i_node, j_node, k_node, element] = minmax(var_minmax[i_node, + j_node, + k_node, + element], + var_outer) + + i_node += i_node_step_i + j_node += 
j_node_step_i + k_node += k_node_step_i + end + i_node += i_node_step_j + j_node += j_node_step_j + k_node += k_node_step_j end - var_min[i, j, k, element] = positivity_correction_factor * var - - # Perform Newton's bisection method to find new alpha - newton_loops_alpha!(alpha, var_min[i, j, k, element], - u_local, i, j, k, element, - variable, min, - initial_check_nonnegative_newton_idp, - final_check_nonnegative_newton_idp, - inverse_jacobian, dt, equations, dg, cache, limiter) end end return nothing end - -############################################################################### -# Newton-bisection method - -@inline function newton_loops_alpha!(alpha, bound, u, i, j, k, element, - variable, min_or_max, - initial_check, final_check, - inverse_jacobian, dt, - equations, dg, cache, limiter) - (; inverse_weights) = dg.basis # Plays role of inverse DG-subcell sizes - (; antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R) = cache.antidiffusive_fluxes - - (; gamma_constant_newton) = limiter - - indices = (i, j, k, element) - - # negative xi direction - antidiffusive_flux = gamma_constant_newton * inverse_jacobian * - inverse_weights[i] * - get_node_vars(antidiffusive_flux1_R, equations, dg, - i, j, k, element) - newton_loop!(alpha, bound, u, indices, variable, min_or_max, - initial_check, final_check, equations, dt, limiter, antidiffusive_flux) - - # positive xi direction - antidiffusive_flux = -gamma_constant_newton * inverse_jacobian * - inverse_weights[i] * - get_node_vars(antidiffusive_flux1_L, equations, dg, - i + 1, j, k, element) - newton_loop!(alpha, bound, u, indices, variable, min_or_max, - initial_check, final_check, equations, dt, limiter, antidiffusive_flux) - - # negative eta direction - antidiffusive_flux = gamma_constant_newton * inverse_jacobian * - inverse_weights[j] * - get_node_vars(antidiffusive_flux2_R, equations, dg, - i, j, k, element) - 
newton_loop!(alpha, bound, u, indices, variable, min_or_max, - initial_check, final_check, equations, dt, limiter, antidiffusive_flux) - - # positive eta direction - antidiffusive_flux = -gamma_constant_newton * inverse_jacobian * - inverse_weights[j] * - get_node_vars(antidiffusive_flux2_L, equations, dg, - i, j + 1, k, element) - newton_loop!(alpha, bound, u, indices, variable, min_or_max, - initial_check, final_check, equations, dt, limiter, antidiffusive_flux) - - # negative zeta direction - antidiffusive_flux = gamma_constant_newton * inverse_jacobian * - inverse_weights[k] * - get_node_vars(antidiffusive_flux3_R, equations, dg, - i, j, k, element) - newton_loop!(alpha, bound, u, indices, variable, min_or_max, - initial_check, final_check, equations, dt, limiter, antidiffusive_flux) - - # positive zeta direction - antidiffusive_flux = -gamma_constant_newton * inverse_jacobian * - inverse_weights[k] * - get_node_vars(antidiffusive_flux3_L, equations, dg, - i, j, k + 1, element) - newton_loop!(alpha, bound, u, indices, variable, min_or_max, - initial_check, final_check, equations, dt, limiter, antidiffusive_flux) - - return nothing -end end # @muladd diff --git a/src/solvers/dgsem_structured/containers.jl b/src/solvers/dgsem_structured/containers.jl index 30e51aad1aa..dd6adfbee5b 100644 --- a/src/solvers/dgsem_structured/containers.jl +++ b/src/solvers/dgsem_structured/containers.jl @@ -9,6 +9,10 @@ struct StructuredElementContainer{NDIMS, RealT <: Real, uEltype <: Real, NDIMSP1, NDIMSP2, NDIMSP3} <: AbstractElementContainer # Physical coordinates at each node node_coordinates::Array{RealT, NDIMSP2} # [orientation, node_i, node_j, node_k, element] + + # Physical coordinates at boundary nodes + boundary_node_coordinates::Array{RealT, NDIMSP1} # [orientation, node_i, node_j, direction/face] + # ID of neighbor element in negative direction in orientation left_neighbors::Array{Int, 2} # [orientation, elements] @@ -22,6 +26,9 @@ struct 
StructuredElementContainer{NDIMS, RealT <: Real, uEltype <: Real, # 1/J where J is the Jacobian determinant (determinant of Jacobian matrix) inverse_jacobian::Array{RealT, NDIMSP1} # [node_i, node_j, node_k, element] + # Buffer for solution values at interfaces (filled by `prolong2interfaces!`) + interfaces_u::Array{uEltype, NDIMSP2} # [variable, i, j, direction, element] + # Buffer for calculated surface flux surface_flux_values::Array{uEltype, NDIMSP2} # [variable, i, j, direction, element] end @@ -36,6 +43,10 @@ function init_elements(mesh::Union{StructuredMesh{NDIMS, RealT}, node_coordinates = Array{RealT, NDIMS + 2}(undef, NDIMS, ntuple(_ -> nnodes(basis), NDIMS)..., nelements) + boundary_node_coordinates = Array{RealT, NDIMS + 1}(undef, NDIMS, + ntuple(_ -> nnodes(basis), + NDIMS - 1)..., + NDIMS * 2) left_neighbors = Array{Int, 2}(undef, NDIMS, nelements) jacobian_matrix = Array{RealT, NDIMS + 3}(undef, NDIMS, NDIMS, ntuple(_ -> nnodes(basis), NDIMS)..., @@ -44,6 +55,10 @@ function init_elements(mesh::Union{StructuredMesh{NDIMS, RealT}, inverse_jacobian = Array{RealT, NDIMS + 1}(undef, ntuple(_ -> nnodes(basis), NDIMS)..., nelements) + interfaces_u = Array{uEltype, NDIMS + 2}(undef, nvariables(equations), + ntuple(_ -> nnodes(basis), + NDIMS - 1)..., NDIMS * 2, + nelements) surface_flux_values = Array{uEltype, NDIMS + 2}(undef, nvariables(equations), ntuple(_ -> nnodes(basis), NDIMS - 1)..., NDIMS * 2, @@ -51,10 +66,12 @@ function init_elements(mesh::Union{StructuredMesh{NDIMS, RealT}, elements = StructuredElementContainer{NDIMS, RealT, uEltype, NDIMS + 1, NDIMS + 2, NDIMS + 3}(node_coordinates, + boundary_node_coordinates, left_neighbors, jacobian_matrix, contravariant_vectors, inverse_jacobian, + interfaces_u, surface_flux_values) init_elements!(elements, mesh, basis) diff --git a/src/solvers/dgsem_structured/containers_1d.jl b/src/solvers/dgsem_structured/containers_1d.jl index 803ed9fd055..42922059c97 100644 --- 
a/src/solvers/dgsem_structured/containers_1d.jl +++ b/src/solvers/dgsem_structured/containers_1d.jl @@ -6,8 +6,8 @@ #! format: noindent # Initialize data structures in element container -function init_elements!(elements, mesh::StructuredMesh{1}, basis::LobattoLegendreBasis) - @unpack node_coordinates, left_neighbors, +function init_elements!(elements, mesh::StructuredMesh{1}, basis::AbstractBasisSBP) + @unpack node_coordinates, boundary_node_coordinates, left_neighbors, jacobian_matrix, contravariant_vectors, inverse_jacobian = elements # Calculate node coordinates, Jacobian matrix, and inverse Jacobian determinant @@ -23,6 +23,39 @@ function init_elements!(elements, mesh::StructuredMesh{1}, basis::LobattoLegendr fill!(contravariant_vectors, NaN) initialize_left_neighbor_connectivity!(left_neighbors, mesh) + calc_boundary_node_coordinates!(boundary_node_coordinates, node_coordinates, + mesh, basis) + + return nothing +end + +function calc_boundary_node_coordinates!(boundary_node_coordinates, + node_coordinates, + mesh::StructuredMesh{1}, + basis::LobattoLegendreBasis) + nelements = size(mesh, 1) + + dim = 1 # spatial dimension + boundary_node_coordinates[dim, 1] = node_coordinates[dim, 1, 1] + boundary_node_coordinates[dim, 2] = node_coordinates[dim, nnodes(basis), nelements] + + return nothing +end + +function calc_boundary_node_coordinates!(boundary_node_coordinates, + node_coordinates, + mesh::StructuredMesh{1}, + basis::GaussLegendreBasis) + nelements = size(mesh, 1) + boundary_matrix = basis.boundary_interpolation + + dim = 1 # spatial dimension + # For structured mesh: + # Left/right boundaries are really left(-1)/right(+1) [first/second column of boundary matrix] + @views boundary_node_coordinates[dim, 1] = dot(boundary_matrix[:, 1], + node_coordinates[dim, :, 1]) + @views boundary_node_coordinates[dim, 2] = dot(boundary_matrix[:, 2], + node_coordinates[dim, :, nelements]) return nothing end @@ -31,7 +64,7 @@ end # `mesh.mapping` is passed as an additional 
argument for type stability (function barrier) function calc_node_coordinates!(node_coordinates, cell_x, mapping, mesh::StructuredMesh{1}, - basis::LobattoLegendreBasis) + basis::AbstractBasisSBP) @unpack nodes = basis # Get cell length in reference mesh @@ -51,7 +84,7 @@ end # Calculate Jacobian matrix of the mapping from the reference element to the element in the physical domain function calc_jacobian_matrix!(jacobian_matrix, element, node_coordinates::AbstractArray{<:Any, 3}, - basis::LobattoLegendreBasis) + basis::AbstractBasisSBP) @views mul!(jacobian_matrix[1, 1, :, element], basis.derivative_matrix, node_coordinates[1, :, element]) # x_ξ diff --git a/src/solvers/dgsem_structured/dg.jl b/src/solvers/dgsem_structured/dg.jl index 73075381d4f..b52044af393 100644 --- a/src/solvers/dgsem_structured/dg.jl +++ b/src/solvers/dgsem_structured/dg.jl @@ -30,7 +30,7 @@ end end # Dimension agnostic, i.e., valid for all 1D, 2D, and 3D `StructuredMesh`es. -function calc_boundary_flux!(cache, u, t, boundary_condition::BoundaryConditionPeriodic, +function calc_boundary_flux!(cache, t, boundary_condition::BoundaryConditionPeriodic, mesh::StructuredMesh, equations, surface_integral, dg::DG) @assert isperiodic(mesh) @@ -42,38 +42,50 @@ function rhs!(du, u, t, mesh::Union{StructuredMesh, StructuredMeshView{2}}, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} + backend = trixi_backend(u) + # Reset du @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache) # Calculate volume integral @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, u, mesh, + calc_volume_integral!(backend, du, u, mesh, have_nonconservative_terms(equations), equations, dg.volume_integral, dg, cache, t, boundary_conditions) end - # Calculate interface and boundary fluxes + # Prolong solution to interfaces + @trixi_timeit timer() "prolong2interfaces" begin + prolong2interfaces!(cache, u, mesh, equations, dg) + end + + # Calculate interface fluxes 
@trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache, u, mesh, + calc_interface_flux!(cache.elements.surface_flux_values, mesh, have_nonconservative_terms(equations), equations, - dg.surface_integral, dg) + dg.surface_integral, dg, cache) end + # `prolong2boundaries!` is not required for `StructuredMesh` since boundary values + # are stored in the interface datastructure (`interfaces_u`), + # so we can directly calculate the boundary fluxes without prolongation. + # Calculate boundary fluxes @trixi_timeit timer() "boundary flux" begin - calc_boundary_flux!(cache, u, t, boundary_conditions, mesh, equations, + calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, dg.surface_integral, dg) end # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, cache) end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin @@ -83,7 +95,7 @@ function rhs!(du, u, t, return nothing end -@inline function calc_boundary_flux_by_direction!(surface_flux_values, u, t, +@inline function calc_boundary_flux_by_direction!(surface_flux_values, t, orientation, boundary_condition::BoundaryConditionPeriodic, mesh::Union{StructuredMesh, @@ -97,7 +109,7 @@ end return nothing end -@inline function calc_boundary_flux_by_direction!(surface_flux_values, u, t, +@inline function calc_boundary_flux_by_direction!(surface_flux_values, t, orientation, boundary_condition::BoundaryConditionPeriodic, mesh::Union{StructuredMesh, @@ -111,7 +123,7 @@ end return nothing end -@inline function calc_boundary_flux_by_direction!(surface_flux_values, u, t, +@inline function 
calc_boundary_flux_by_direction!(surface_flux_values, t, orientation, boundary_condition, mesh::Union{StructuredMesh, @@ -121,10 +133,13 @@ end surface_integral, dg::DG, cache, direction, node_indices, surface_node_indices, element) - @unpack node_coordinates, contravariant_vectors, inverse_jacobian = cache.elements + @unpack node_coordinates, contravariant_vectors, inverse_jacobian, interfaces_u = cache.elements + # Boundary values are for `StructuredMesh` stored in the interface datastructure + boundaries_u = interfaces_u @unpack surface_flux = surface_integral - u_inner = get_node_vars(u, equations, dg, node_indices..., element) + u_inner = get_node_vars(boundaries_u, equations, dg, surface_node_indices..., + direction, element) x = get_node_coords(node_coordinates, equations, dg, node_indices..., element) # If the mapping is orientation-reversing, the contravariant vectors' orientation @@ -143,6 +158,7 @@ end flux = sign_jacobian * boundary_condition(u_inner, normal, direction, x, t, surface_flux, equations) + # Only flux contribution for boundary element, boundary face is the boundary flux for v in eachvariable(equations) surface_flux_values[v, surface_node_indices..., direction, element] = flux[v] end @@ -150,7 +166,7 @@ end return nothing end -@inline function calc_boundary_flux_by_direction!(surface_flux_values, u, t, +@inline function calc_boundary_flux_by_direction!(surface_flux_values, t, orientation, boundary_condition, mesh::Union{StructuredMesh, @@ -160,10 +176,13 @@ end surface_integral, dg::DG, cache, direction, node_indices, surface_node_indices, element) - @unpack node_coordinates, contravariant_vectors, inverse_jacobian = cache.elements + @unpack node_coordinates, contravariant_vectors, inverse_jacobian, interfaces_u = cache.elements + # Boundary values are for `StructuredMesh` stored in the interface datastructure + boundaries_u = interfaces_u @unpack surface_flux = surface_integral - u_inner = get_node_vars(u, equations, dg, node_indices..., 
element) + u_inner = get_node_vars(boundaries_u, equations, dg, surface_node_indices..., + direction, element) x = get_node_coords(node_coordinates, equations, dg, node_indices..., element) # If the mapping is orientation-reversing, the contravariant vectors' orientation @@ -182,6 +201,7 @@ end flux, noncons_flux = boundary_condition(u_inner, normal, direction, x, t, surface_flux, equations) + # Only flux contribution for boundary element, boundary face is the boundary flux for v in eachvariable(equations) surface_flux_values[v, surface_node_indices..., direction, element] = sign_jacobian * (flux[v] + diff --git a/src/solvers/dgsem_structured/dg_1d.jl b/src/solvers/dgsem_structured/dg_1d.jl index 672dbe65ebf..8732a98f949 100644 --- a/src/solvers/dgsem_structured/dg_1d.jl +++ b/src/solvers/dgsem_structured/dg_1d.jl @@ -5,24 +5,68 @@ @muladd begin #! format: noindent -function calc_interface_flux!(cache, u, mesh::StructuredMesh{1}, +function prolong2interfaces!(cache, u, mesh::StructuredMesh{1}, equations, dg::DG) + @unpack interfaces_u = cache.elements + + @threaded for element in eachelement(dg, cache) + # Negative side (direction 1, left/negative x face) + for v in eachvariable(equations) + interfaces_u[v, 1, element] = u[v, 1, element] + end + # Positive side (direction 2, right/positive x face) + for v in eachvariable(equations) + interfaces_u[v, 2, element] = u[v, nnodes(dg), element] + end + end + + return nothing +end + +function prolong2interfaces!(cache, u, mesh::StructuredMesh{1}, equations, + dg::DGSEM{<:GaussLegendreBasis}) + @unpack interfaces_u = cache.elements + @unpack boundary_interpolation = dg.basis + + @threaded for element in eachelement(dg, cache) + for v in eachvariable(equations) + interface_u_1 = zero(eltype(interfaces_u)) + interface_u_2 = zero(eltype(interfaces_u)) + for i in eachnode(dg) + # Left/negative x face + interface_u_1 = interface_u_1 + + u[v, i, element] * boundary_interpolation[i, 1] + + # Right/positive x face + interface_u_2 
= interface_u_2 + + u[v, i, element] * boundary_interpolation[i, 2] + end + interfaces_u[v, 1, element] = interface_u_1 + interfaces_u[v, 2, element] = interface_u_2 + end + end + + return nothing +end + +function calc_interface_flux!(surface_flux_values, mesh::StructuredMesh{1}, nonconservative_terms, # can be True/False - equations, surface_integral, dg::DG) + equations, surface_integral, dg::DG, cache) @unpack surface_flux = surface_integral + @unpack interfaces_u = cache.elements @threaded for element in eachelement(dg, cache) left_element = cache.elements.left_neighbors[1, element] # => `element` is the right element of the interface if left_element > 0 # left_element = 0 at boundaries - u_ll = get_node_vars(u, equations, dg, nnodes(dg), left_element) - u_rr = get_node_vars(u, equations, dg, 1, element) + u_ll = get_node_vars(interfaces_u, equations, dg, 2, left_element) + u_rr = get_node_vars(interfaces_u, equations, dg, 1, element) f1 = surface_flux(u_ll, u_rr, 1, equations) for v in eachvariable(equations) - cache.elements.surface_flux_values[v, 2, left_element] = f1[v] - cache.elements.surface_flux_values[v, 1, element] = f1[v] + surface_flux_values[v, 2, left_element] = f1[v] + surface_flux_values[v, 1, element] = f1[v] end end end @@ -30,23 +74,26 @@ function calc_interface_flux!(cache, u, mesh::StructuredMesh{1}, return nothing end -function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, +function calc_boundary_flux!(cache, t, boundary_conditions::NamedTuple, mesh::StructuredMesh{1}, equations, surface_integral, dg::DG) @unpack surface_flux = surface_integral - @unpack surface_flux_values, node_coordinates = cache.elements + @unpack surface_flux_values, boundary_node_coordinates, interfaces_u = cache.elements + # Boundary values are for `StructuredMesh` stored in the interface datastructure + boundaries_u = interfaces_u orientation = 1 # Negative x-direction direction = 1 - u_rr = get_node_vars(u, equations, dg, 1, 1) - x = 
get_node_coords(node_coordinates, equations, dg, 1, 1) + u_rr = get_node_vars(boundaries_u, equations, dg, direction, 1) + x = get_node_coords(boundary_node_coordinates, equations, dg, direction) flux = boundary_conditions[direction](u_rr, orientation, direction, x, t, surface_flux, equations) + # Only flux contribution for left element, left face is the boundary flux for v in eachvariable(equations) surface_flux_values[v, direction, 1] = flux[v] end @@ -54,14 +101,13 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, # Positive x-direction direction = 2 - u_rr = get_node_vars(u, equations, dg, nnodes(dg), nelements(dg, cache)) - x = get_node_coords(node_coordinates, equations, dg, nnodes(dg), - nelements(dg, cache)) + u_rr = get_node_vars(boundaries_u, equations, dg, direction, nelements(dg, cache)) + x = get_node_coords(boundary_node_coordinates, equations, dg, direction) flux = boundary_conditions[direction](u_rr, orientation, direction, x, t, surface_flux, equations) - # Copy flux to left and right element storage + # Only flux contribution for right element, right face is the boundary flux for v in eachvariable(equations) surface_flux_values[v, direction, nelements(dg, cache)] = flux[v] end @@ -69,7 +115,7 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, return nothing end -function apply_jacobian!(du, mesh::StructuredMesh{1}, +function apply_jacobian!(backend::Nothing, du, mesh::StructuredMesh{1}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements diff --git a/src/solvers/dgsem_structured/dg_2d.jl b/src/solvers/dgsem_structured/dg_2d.jl index b92944fb338..685395c9739 100644 --- a/src/solvers/dgsem_structured/dg_2d.jl +++ b/src/solvers/dgsem_structured/dg_2d.jl @@ -29,9 +29,10 @@ See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-17 =# @inline function weak_form_kernel!(du, u, element, - mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, - UnstructuredMesh2D, 
P4estMesh{2}, - P4estMeshView{2}, T8codeMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, P4estMesh{2}, + P4estMeshView{2}, T8codeMesh{2}}}, have_nonconservative_terms::False, equations, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -70,10 +71,11 @@ See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-17 end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{2}, - StructuredMeshView{2}, - UnstructuredMesh2D, P4estMesh{2}, - T8codeMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, + T8codeMesh{2}}}, have_nonconservative_terms::False, equations, volume_flux, dg::DGSEM, cache, alpha = true) @unpack derivative_split = dg.basis @@ -133,13 +135,14 @@ end end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{2}, - StructuredMeshView{2}, - UnstructuredMesh2D, P4estMesh{2}, - T8codeMesh{2}}, + MeshT::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, + T8codeMesh{2}}}, have_nonconservative_terms::True, equations, volume_flux, dg::DGSEM, cache, alpha = true) - flux_differencing_kernel!(du, u, element, mesh, have_nonconservative_terms, + flux_differencing_kernel!(du, u, element, MeshT, have_nonconservative_terms, combine_conservative_and_nonconservative_fluxes(volume_flux, equations), equations, @@ -149,10 +152,11 @@ end end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{2}, - StructuredMeshView{2}, - UnstructuredMesh2D, P4estMesh{2}, - T8codeMesh{2}}, + MeshT::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, + T8codeMesh{2}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::False, equations, @@ -162,7 +166,7 @@ end symmetric_flux, 
nonconservative_flux = volume_flux # Apply the symmetric flux as usual - flux_differencing_kernel!(du, u, element, mesh, False(), equations, symmetric_flux, + flux_differencing_kernel!(du, u, element, MeshT, False(), equations, symmetric_flux, dg, cache, alpha) # Calculate the remaining volume terms using the nonsymmetric generalized flux @@ -222,10 +226,11 @@ end end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{2}, - StructuredMeshView{2}, - UnstructuredMesh2D, P4estMesh{2}, - T8codeMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, + T8codeMesh{2}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::True, equations, @@ -293,9 +298,9 @@ end end @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, - UnstructuredMesh2D, - P4estMesh{2}, T8codeMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, T8codeMesh{2}}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache) @unpack normal_vectors_1, normal_vectors_2 = cache.normal_vectors @@ -335,9 +340,9 @@ end end @inline function calcflux_fvO2!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, - UnstructuredMesh2D, - P4estMesh{2}, T8codeMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, T8codeMesh{2}}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -416,9 +421,9 @@ end end @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::Union{StructuredMesh{2}, StructuredMesh{2}, - UnstructuredMesh2D, - P4estMesh{2}, T8codeMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, StructuredMesh{2}, + UnstructuredMesh2D, + 
P4estMesh{2}, T8codeMesh{2}}}, have_nonconservative_terms::True, equations, volume_flux_fv, dg::DGSEM, element, cache) @unpack normal_vectors_1, normal_vectors_2 = cache.normal_vectors @@ -482,10 +487,39 @@ end return nothing end -function calc_interface_flux!(cache, u, +function prolong2interfaces!(cache, u, + mesh::Union{StructuredMesh{2}, StructuredMeshView{2}}, + equations, dg::DG) + @unpack interfaces_u = cache.elements + + @threaded for element in eachelement(dg, cache) + for i in eachnode(dg) + # Negative x-direction (direction 1, left/negative x face) + for v in eachvariable(equations) + interfaces_u[v, i, 1, element] = u[v, 1, i, element] + end + # Positive x-direction (direction 2, right/positive x face) + for v in eachvariable(equations) + interfaces_u[v, i, 2, element] = u[v, nnodes(dg), i, element] + end + # Negative y-direction (direction 3, bottom/negative y face) + for v in eachvariable(equations) + interfaces_u[v, i, 3, element] = u[v, i, 1, element] + end + # Positive y-direction (direction 4, top/positive y face) + for v in eachvariable(equations) + interfaces_u[v, i, 4, element] = u[v, i, nnodes(dg), element] + end + end + end + + return nothing +end + +function calc_interface_flux!(surface_flux_values, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}}, have_nonconservative_terms, # can be True/False - equations, surface_integral, dg::DG) + equations, surface_integral, dg::DG, cache) @unpack elements = cache @threaded for element in eachelement(dg, cache) @@ -495,14 +529,14 @@ function calc_interface_flux!(cache, u, # Interfaces in x-direction (`orientation` = 1) calc_interface_flux!(elements.surface_flux_values, elements.left_neighbors[1, element], - element, 1, u, mesh, + element, 1, mesh, have_nonconservative_terms, equations, surface_integral, dg, cache) # Interfaces in y-direction (`orientation` = 2) calc_interface_flux!(elements.surface_flux_values, elements.left_neighbors[2, element], - element, 2, u, mesh, + element, 2, mesh, 
have_nonconservative_terms, equations, surface_integral, dg, cache) end @@ -511,7 +545,7 @@ function calc_interface_flux!(cache, u, end @inline function calc_interface_flux!(surface_flux_values, left_element, right_element, - orientation, u, + orientation, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}}, have_nonconservative_terms::False, equations, @@ -522,16 +556,18 @@ end end @unpack surface_flux = surface_integral - @unpack contravariant_vectors, inverse_jacobian = cache.elements + @unpack interfaces_u, contravariant_vectors, inverse_jacobian = cache.elements right_direction = 2 * orientation left_direction = right_direction - 1 for i in eachnode(dg) - if orientation == 1 - u_ll = get_node_vars(u, equations, dg, nnodes(dg), i, left_element) - u_rr = get_node_vars(u, equations, dg, 1, i, right_element) + u_ll = get_node_vars(interfaces_u, equations, dg, i, right_direction, + left_element) + u_rr = get_node_vars(interfaces_u, equations, dg, i, left_direction, + right_element) + if orientation == 1 # If the mapping is orientation-reversing, the contravariant vectors' orientation # is reversed as well. 
The normal vector must be oriented in the direction # from `left_element` to `right_element`, or the numerical flux will be computed @@ -543,9 +579,6 @@ end get_contravariant_vector(1, contravariant_vectors, 1, i, right_element) else # orientation == 2 - u_ll = get_node_vars(u, equations, dg, i, nnodes(dg), left_element) - u_rr = get_node_vars(u, equations, dg, i, 1, right_element) - # See above sign_jacobian = sign(inverse_jacobian[i, 1, right_element]) @@ -569,7 +602,7 @@ end end @inline function calc_interface_flux!(surface_flux_values, left_element, right_element, - orientation, u, + orientation, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}}, have_nonconservative_terms::True, equations, @@ -580,16 +613,18 @@ end end surface_flux, nonconservative_flux = surface_integral.surface_flux - @unpack contravariant_vectors, inverse_jacobian = cache.elements + @unpack interfaces_u, contravariant_vectors, inverse_jacobian = cache.elements right_direction = 2 * orientation left_direction = right_direction - 1 for i in eachnode(dg) - if orientation == 1 - u_ll = get_node_vars(u, equations, dg, nnodes(dg), i, left_element) - u_rr = get_node_vars(u, equations, dg, 1, i, right_element) + u_ll = get_node_vars(interfaces_u, equations, dg, i, right_direction, + left_element) + u_rr = get_node_vars(interfaces_u, equations, dg, i, left_direction, + right_element) + if orientation == 1 # If the mapping is orientation-reversing, the contravariant vectors' orientation # is reversed as well. 
The normal vector must be oriented in the direction # from `left_element` to `right_element`, or the numerical flux will be computed @@ -601,9 +636,6 @@ end get_contravariant_vector(1, contravariant_vectors, 1, i, right_element) else # orientation == 2 - u_ll = get_node_vars(u, equations, dg, i, nnodes(dg), left_element) - u_rr = get_node_vars(u, equations, dg, i, 1, right_element) - # See above sign_jacobian = sign(inverse_jacobian[i, 1, right_element]) @@ -641,7 +673,7 @@ end return nothing end -function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, +function calc_boundary_flux!(cache, t, boundary_conditions::NamedTuple, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}}, equations, surface_integral, dg::DG) @@ -654,7 +686,7 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, element = linear_indices[begin, cell_y] for j in eachnode(dg) - calc_boundary_flux_by_direction!(surface_flux_values, u, t, 1, + calc_boundary_flux_by_direction!(surface_flux_values, t, 1, boundary_conditions[direction], mesh, have_nonconservative_terms(equations), @@ -668,7 +700,7 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, element = linear_indices[end, cell_y] for j in eachnode(dg) - calc_boundary_flux_by_direction!(surface_flux_values, u, t, 1, + calc_boundary_flux_by_direction!(surface_flux_values, t, 1, boundary_conditions[direction], mesh, have_nonconservative_terms(equations), @@ -684,7 +716,7 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, element = linear_indices[cell_x, begin] for i in eachnode(dg) - calc_boundary_flux_by_direction!(surface_flux_values, u, t, 2, + calc_boundary_flux_by_direction!(surface_flux_values, t, 2, boundary_conditions[direction], mesh, have_nonconservative_terms(equations), @@ -698,7 +730,7 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, element = linear_indices[cell_x, end] for i in eachnode(dg) - 
calc_boundary_flux_by_direction!(surface_flux_values, u, t, 2, + calc_boundary_flux_by_direction!(surface_flux_values, t, 2, boundary_conditions[direction], mesh, have_nonconservative_terms(equations), @@ -711,26 +743,61 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, return nothing end -function apply_jacobian!(du, +function apply_jacobian!(backend::Nothing, du, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements - @threaded for element in eachelement(dg, cache) - for j in eachnode(dg), i in eachnode(dg) - # Negative sign included to account for the negated surface and volume terms, - # see e.g. the computation of `derivative_hat` in the basis setup and - # the comment in `calc_surface_integral!`. - factor = -inverse_jacobian[i, j, element] + apply_jacobian_per_element!(du, typeof(mesh), equations, dg, inverse_jacobian, + element) + end +end - for v in eachvariable(equations) - du[v, i, j, element] *= factor - end +function apply_jacobian!(backend::Backend, du, + mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + equations, dg::DG, cache) + nelements(dg, cache) == 0 && return nothing + @unpack inverse_jacobian = cache.elements + kernel! 
= apply_jacobian_KAkernel!(backend) + kernel!(du, typeof(mesh), equations, dg, inverse_jacobian, + ndrange = nelements(dg, cache)) +end + +@kernel function apply_jacobian_KAkernel!(du, + mT::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + equations, dg::DG, inverse_jacobian) + element = @index(Global) + apply_jacobian_per_element!(du, mT, equations, dg, inverse_jacobian, element) +end + +@inline function apply_jacobian_per_element!(du, + ::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + equations, dg::DG, inverse_jacobian, + element) + for j in eachnode(dg), i in eachnode(dg) + # Negative sign included to account for the negated surface and volume terms, + # see e.g. the computation of `derivative_hat` in the basis setup and + # the comment in `calc_surface_integral!`. + factor = -inverse_jacobian[i, j, element] + + for v in eachvariable(equations) + du[v, i, j, element] *= factor end end - return nothing end end # @muladd diff --git a/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl b/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl index c2956d027b8..77b5bc51a09 100644 --- a/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl +++ b/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl @@ -19,8 +19,9 @@ # works efficiently here. @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, - mesh::Union{StructuredMesh{2}, - UnstructuredMesh2D, P4estMesh{2}}, + MeshT::Type{<:Union{StructuredMesh{2}, + UnstructuredMesh2D, + P4estMesh{2}}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_shima_etal_turbo), @@ -32,13 +33,13 @@ # indices `[i, j, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. 
du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims(MeshT))..., StaticInt(nvariables(equations)))) @turbo for j in eachnode(dg), i in eachnode(dg) @@ -82,7 +83,7 @@ contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims(MeshT)))) @turbo for j in eachnode(dg), i in eachnode(dg) contravariant_vectors_x[j, i, 1] = contravariant_vectors[1, 1, i, j, element] @@ -155,7 +156,7 @@ contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims(MeshT)))) @turbo for j in eachnode(dg), i in eachnode(dg) contravariant_vectors_y[i, j, 1] = contravariant_vectors[1, 2, i, j, element] @@ -226,8 +227,9 @@ end @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, - mesh::Union{StructuredMesh{2}, - UnstructuredMesh2D, P4estMesh{2}}, + MeshT::Type{<:Union{StructuredMesh{2}, + UnstructuredMesh2D, + P4estMesh{2}}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_ranocha_turbo), @@ -239,7 +241,7 @@ end # indices `[i, j, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. In addition @@ -248,7 +250,7 @@ end # values. 
u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims(MeshT))..., StaticInt(nvariables(equations) + 2))) # We also compute "+ 2" logs @turbo for j in eachnode(dg), i in eachnode(dg) @@ -294,7 +296,7 @@ end contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims(MeshT)))) @turbo for j in eachnode(dg), i in eachnode(dg) contravariant_vectors_x[j, i, 1] = contravariant_vectors[1, 1, i, j, element] @@ -400,7 +402,7 @@ end contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims(MeshT)))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) contravariant_vectors_y[i, j, 1] = contravariant_vectors[1, 2, i, j, element] diff --git a/src/solvers/dgsem_structured/dg_2d_subcell_limiters.jl b/src/solvers/dgsem_structured/dg_2d_subcell_limiters.jl index 65cdf6796c2..841fc90684d 100644 --- a/src/solvers/dgsem_structured/dg_2d_subcell_limiters.jl +++ b/src/solvers/dgsem_structured/dg_2d_subcell_limiters.jl @@ -10,7 +10,7 @@ # # See also `flux_differencing_kernel!`. 
@inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, - mesh::Union{StructuredMesh{2}, P4estMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, P4estMesh{2}}}, have_nonconservative_terms::False, equations, volume_flux, dg::DGSEM, element, cache) (; contravariant_vectors) = cache.elements @@ -60,9 +60,11 @@ end # FV-form flux `fhat` in x direction - for j in eachnode(dg), i in 1:(nnodes(dg) - 1), v in eachvariable(equations) - fhat1_L[v, i + 1, j] = fhat1_L[v, i, j] + weights[i] * flux_temp[v, i, j] - fhat1_R[v, i + 1, j] = fhat1_L[v, i + 1, j] + for j in eachnode(dg), i in 1:(nnodes(dg) - 1) + for v in eachvariable(equations) + fhat1_L[v, i + 1, j] = fhat1_L[v, i, j] + weights[i] * flux_temp[v, i, j] + fhat1_R[v, i + 1, j] = fhat1_L[v, i + 1, j] + end end # Split form volume flux in orientation 2: y direction @@ -91,9 +93,11 @@ end # FV-form flux `fhat` in y direction - for j in 1:(nnodes(dg) - 1), i in eachnode(dg), v in eachvariable(equations) - fhat2_L[v, i, j + 1] = fhat2_L[v, i, j] + weights[j] * flux_temp[v, i, j] - fhat2_R[v, i, j + 1] = fhat2_L[v, i, j + 1] + for j in 1:(nnodes(dg) - 1), i in eachnode(dg) + for v in eachvariable(equations) + fhat2_L[v, i, j + 1] = fhat2_L[v, i, j] + weights[j] * flux_temp[v, i, j] + fhat2_R[v, i, j + 1] = fhat2_L[v, i, j + 1] + end end return nothing @@ -111,8 +115,8 @@ end # Discretizations of Non-Conservative Systems. https://arxiv.org/pdf/2211.14009.pdf. # @inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, - mesh::Union{StructuredMesh{2}, P4estMesh{2}}, - nonconservative_terms::True, equations, + ::Type{<:Union{StructuredMesh{2}, P4estMesh{2}}}, + have_nonconservative_terms::True, equations, volume_flux::Tuple{F_CONS, F_NONCONS}, dg::DGSEM, element, cache) where { @@ -315,8 +319,8 @@ end # The calculation of the non-conservative staggered "fluxes" requires non-conservative # terms that can be written as a product of local and jump contributions. 
@inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, - mesh::Union{StructuredMesh{2}, P4estMesh{2}}, - nonconservative_terms::True, equations, + ::Type{<:Union{StructuredMesh{2}, P4estMesh{2}}}, + have_nonconservative_terms::True, equations, volume_flux::Tuple{F_CONS, F_NONCONS}, dg::DGSEM, element, cache) where { diff --git a/src/solvers/dgsem_structured/dg_3d.jl b/src/solvers/dgsem_structured/dg_3d.jl index 7b0414f66fb..f5bba91f44c 100644 --- a/src/solvers/dgsem_structured/dg_3d.jl +++ b/src/solvers/dgsem_structured/dg_3d.jl @@ -4,7 +4,6 @@ # See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. @muladd begin #! format: noindent - function create_cache(mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, @@ -32,8 +31,8 @@ See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-17 =# @inline function weak_form_kernel!(du, u, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms::False, equations, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -89,8 +88,9 @@ end # mapping terms, stored in `contravariant_vectors`, is peeled apart from the evaluation of # the physical fluxes in each Cartesian direction @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms::False, equations, volume_flux, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -171,11 +171,12 @@ end end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + MeshT::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, 
have_nonconservative_terms::True, equations, volume_flux, dg::DGSEM, cache, alpha = true) - flux_differencing_kernel!(du, u, element, mesh, have_nonconservative_terms, + flux_differencing_kernel!(du, u, element, MeshT, have_nonconservative_terms, combine_conservative_and_nonconservative_fluxes(volume_flux, equations), equations, volume_flux, dg, cache, alpha) @@ -184,8 +185,9 @@ end end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + MeshT::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::False, equations, @@ -195,7 +197,7 @@ end symmetric_flux, nonconservative_flux = volume_flux # Apply the symmetric flux as usual - flux_differencing_kernel!(du, u, element, mesh, False(), equations, symmetric_flux, + flux_differencing_kernel!(du, u, element, MeshT, False(), equations, symmetric_flux, dg, cache, alpha) # Calculate the remaining volume terms using the nonsymmetric generalized flux @@ -274,8 +276,9 @@ end end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::True, equations, @@ -369,8 +372,8 @@ end # [arXiv: 2008.12044v2](https://arxiv.org/pdf/2008.12044) @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache) @unpack contravariant_vectors = cache.elements @@ -430,8 +433,8 @@ end @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, - mesh::Union{StructuredMesh{3}, 
P4estMesh{3}, - T8codeMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms::True, equations, volume_flux_fv, dg::DGSEM, element, cache) @unpack contravariant_vectors = cache.elements @@ -526,8 +529,8 @@ end @inline function calcflux_fvO2!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -609,9 +612,47 @@ end return nothing end -function calc_interface_flux!(cache, u, mesh::StructuredMesh{3}, +function prolong2interfaces!(cache, u, mesh::StructuredMesh{3}, equations, dg::DG) + @unpack interfaces_u = cache.elements + + @threaded for element in eachelement(dg, cache) + for j in eachnode(dg), i in eachnode(dg) + # Negative x-direction (direction 1, left/negative x face) + # Face nodes (i, j) correspond to (y, z) directions + for v in eachvariable(equations) + interfaces_u[v, i, j, 1, element] = u[v, 1, i, j, element] + end + # Positive x-direction (direction 2, right/positive x face) + for v in eachvariable(equations) + interfaces_u[v, i, j, 2, element] = u[v, nnodes(dg), i, j, element] + end + # Negative y-direction (direction 3, bottom/negative y face) + # Face nodes (i, j) correspond to (x, z) directions + for v in eachvariable(equations) + interfaces_u[v, i, j, 3, element] = u[v, i, 1, j, element] + end + # Positive y-direction (direction 4, top/positive y face) + for v in eachvariable(equations) + interfaces_u[v, i, j, 4, element] = u[v, i, nnodes(dg), j, element] + end + # Negative z-direction (direction 5, back/negative z face) + # Face nodes (i, j) correspond to (x, y) directions + for v in eachvariable(equations) + interfaces_u[v, i, j, 5, element] = u[v, i, j, 1, element] + end + # Positive z-direction (direction 6, 
front/positive z face) + for v in eachvariable(equations) + interfaces_u[v, i, j, 6, element] = u[v, i, j, nnodes(dg), element] + end + end + end + + return nothing +end + +function calc_interface_flux!(surface_flux_values, mesh::StructuredMesh{3}, have_nonconservative_terms, # can be True/False - equations, surface_integral, dg::DG) + equations, surface_integral, dg::DG, cache) @unpack elements = cache @threaded for element in eachelement(dg, cache) @@ -621,21 +662,21 @@ function calc_interface_flux!(cache, u, mesh::StructuredMesh{3}, # Interfaces in x-direction (`orientation` = 1) calc_interface_flux!(elements.surface_flux_values, elements.left_neighbors[1, element], - element, 1, u, mesh, + element, 1, mesh, have_nonconservative_terms, equations, surface_integral, dg, cache) # Interfaces in y-direction (`orientation` = 2) calc_interface_flux!(elements.surface_flux_values, elements.left_neighbors[2, element], - element, 2, u, mesh, + element, 2, mesh, have_nonconservative_terms, equations, surface_integral, dg, cache) # Interfaces in z-direction (`orientation` = 3) calc_interface_flux!(elements.surface_flux_values, elements.left_neighbors[3, element], - element, 3, u, mesh, + element, 3, mesh, have_nonconservative_terms, equations, surface_integral, dg, cache) end @@ -644,7 +685,7 @@ function calc_interface_flux!(cache, u, mesh::StructuredMesh{3}, end @inline function calc_interface_flux!(surface_flux_values, left_element, right_element, - orientation, u, + orientation, mesh::StructuredMesh{3}, have_nonconservative_terms::False, equations, surface_integral, dg::DG, cache) @@ -654,16 +695,18 @@ end end @unpack surface_flux = surface_integral - @unpack contravariant_vectors, inverse_jacobian = cache.elements + @unpack interfaces_u, contravariant_vectors, inverse_jacobian = cache.elements right_direction = 2 * orientation left_direction = right_direction - 1 for j in eachnode(dg), i in eachnode(dg) - if orientation == 1 - u_ll = get_node_vars(u, equations, dg, 
nnodes(dg), i, j, left_element) - u_rr = get_node_vars(u, equations, dg, 1, i, j, right_element) + u_ll = get_node_vars(interfaces_u, equations, dg, i, j, right_direction, + left_element) + u_rr = get_node_vars(interfaces_u, equations, dg, i, j, left_direction, + right_element) + if orientation == 1 # If the mapping is orientation-reversing, the contravariant vectors' orientation # is reversed as well. The normal vector must be oriented in the direction # from `left_element` to `right_element`, or the numerical flux will be computed @@ -675,9 +718,6 @@ end get_contravariant_vector(1, contravariant_vectors, 1, i, j, right_element) elseif orientation == 2 - u_ll = get_node_vars(u, equations, dg, i, nnodes(dg), j, left_element) - u_rr = get_node_vars(u, equations, dg, i, 1, j, right_element) - # See above sign_jacobian = sign(inverse_jacobian[i, 1, j, right_element]) @@ -686,9 +726,6 @@ end get_contravariant_vector(2, contravariant_vectors, i, 1, j, right_element) else # orientation == 3 - u_ll = get_node_vars(u, equations, dg, i, j, nnodes(dg), left_element) - u_rr = get_node_vars(u, equations, dg, i, j, 1, right_element) - # See above sign_jacobian = sign(inverse_jacobian[i, j, 1, right_element]) @@ -712,7 +749,7 @@ end end @inline function calc_interface_flux!(surface_flux_values, left_element, right_element, - orientation, u, + orientation, mesh::StructuredMesh{3}, have_nonconservative_terms::True, equations, surface_integral, dg::DG, cache) @@ -722,16 +759,18 @@ end end surface_flux, nonconservative_flux = surface_integral.surface_flux - @unpack contravariant_vectors, inverse_jacobian = cache.elements + @unpack interfaces_u, contravariant_vectors, inverse_jacobian = cache.elements right_direction = 2 * orientation left_direction = right_direction - 1 for j in eachnode(dg), i in eachnode(dg) - if orientation == 1 - u_ll = get_node_vars(u, equations, dg, nnodes(dg), i, j, left_element) - u_rr = get_node_vars(u, equations, dg, 1, i, j, right_element) + u_ll = 
get_node_vars(interfaces_u, equations, dg, i, j, right_direction, + left_element) + u_rr = get_node_vars(interfaces_u, equations, dg, i, j, left_direction, + right_element) + if orientation == 1 # If the mapping is orientation-reversing, the contravariant vectors' orientation # is reversed as well. The normal vector must be oriented in the direction # from `left_element` to `right_element`, or the numerical flux will be computed @@ -743,9 +782,6 @@ end get_contravariant_vector(1, contravariant_vectors, 1, i, j, right_element) elseif orientation == 2 - u_ll = get_node_vars(u, equations, dg, i, nnodes(dg), j, left_element) - u_rr = get_node_vars(u, equations, dg, i, 1, j, right_element) - # See above sign_jacobian = sign(inverse_jacobian[i, 1, j, right_element]) @@ -754,9 +790,6 @@ end get_contravariant_vector(2, contravariant_vectors, i, 1, j, right_element) else # orientation == 3 - u_ll = get_node_vars(u, equations, dg, i, j, nnodes(dg), left_element) - u_rr = get_node_vars(u, equations, dg, i, j, 1, right_element) - # See above sign_jacobian = sign(inverse_jacobian[i, j, 1, right_element]) @@ -794,7 +827,7 @@ end return nothing end -function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, +function calc_boundary_flux!(cache, t, boundary_conditions::NamedTuple, mesh::StructuredMesh{3}, equations, surface_integral, dg::DG) @unpack surface_flux_values = cache.elements @@ -806,7 +839,7 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, element = linear_indices[begin, cell_y, cell_z] for k in eachnode(dg), j in eachnode(dg) - calc_boundary_flux_by_direction!(surface_flux_values, u, t, 1, + calc_boundary_flux_by_direction!(surface_flux_values, t, 1, boundary_conditions[direction], mesh, have_nonconservative_terms(equations), @@ -820,7 +853,7 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, element = linear_indices[end, cell_y, cell_z] for k in eachnode(dg), j in eachnode(dg) - 
calc_boundary_flux_by_direction!(surface_flux_values, u, t, 1, + calc_boundary_flux_by_direction!(surface_flux_values, t, 1, boundary_conditions[direction], mesh, have_nonconservative_terms(equations), @@ -837,7 +870,7 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, element = linear_indices[cell_x, begin, cell_z] for k in eachnode(dg), i in eachnode(dg) - calc_boundary_flux_by_direction!(surface_flux_values, u, t, 2, + calc_boundary_flux_by_direction!(surface_flux_values, t, 2, boundary_conditions[direction], mesh, have_nonconservative_terms(equations), @@ -851,7 +884,7 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, element = linear_indices[cell_x, end, cell_z] for k in eachnode(dg), i in eachnode(dg) - calc_boundary_flux_by_direction!(surface_flux_values, u, t, 2, + calc_boundary_flux_by_direction!(surface_flux_values, t, 2, boundary_conditions[direction], mesh, have_nonconservative_terms(equations), @@ -868,7 +901,7 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, element = linear_indices[cell_x, cell_y, begin] for j in eachnode(dg), i in eachnode(dg) - calc_boundary_flux_by_direction!(surface_flux_values, u, t, 3, + calc_boundary_flux_by_direction!(surface_flux_values, t, 3, boundary_conditions[direction], mesh, have_nonconservative_terms(equations), @@ -882,7 +915,7 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, element = linear_indices[cell_x, cell_y, end] for j in eachnode(dg), i in eachnode(dg) - calc_boundary_flux_by_direction!(surface_flux_values, u, t, 3, + calc_boundary_flux_by_direction!(surface_flux_values, t, 3, boundary_conditions[direction], mesh, have_nonconservative_terms(equations), @@ -896,24 +929,49 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, return nothing end -function apply_jacobian!(du, +function apply_jacobian!(backend::Nothing, du, mesh::Union{StructuredMesh{3}, P4estMesh{3}, 
T8codeMesh{3}}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements - @threaded for element in eachelement(dg, cache) - for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - # Negative sign included to account for the negated surface and volume terms, - # see e.g. the computation of `derivative_hat` in the basis setup and - # the comment in `calc_surface_integral!`. - factor = -inverse_jacobian[i, j, k, element] + apply_jacobian_per_element!(du, typeof(mesh), equations, dg, inverse_jacobian, + element) + end + return nothing +end - for v in eachvariable(equations) - du[v, i, j, k, element] *= factor - end +function apply_jacobian!(backend::Backend, du, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, + equations, dg::DG, cache) + @unpack inverse_jacobian = cache.elements + + kernel! = apply_jacobian_KAkernel!(backend) + kernel!(du, typeof(mesh), equations, dg, inverse_jacobian, + ndrange = nelements(cache.elements)) + return nothing +end + +@kernel function apply_jacobian_KAkernel!(du, MeshT, equations, dg::DG, + inverse_jacobian) + element = @index(Global) + apply_jacobian_per_element!(du, MeshT, equations, dg, inverse_jacobian, element) +end + +@inline function apply_jacobian_per_element!(du, + ::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, + equations, dg, inverse_jacobian, element) + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + # Negative sign included to account for the negated surface and volume terms, + # see e.g. the computation of `derivative_hat` in the basis setup and + # the comment in `calc_surface_integral!`. 
+ factor = -inverse_jacobian[i, j, k, element] + + for v in eachvariable(equations) + du[v, i, j, k, element] *= factor end end - return nothing end end # @muladd diff --git a/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl b/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl index 8b710417ff7..2022eb9f3e6 100644 --- a/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl +++ b/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl @@ -19,7 +19,8 @@ # works efficiently here. @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}}, + MeshT::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations3D, volume_flux::typeof(flux_shima_etal_turbo), @@ -31,13 +32,13 @@ # indices `[i, j, k, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. 
u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims(MeshT))..., StaticInt(nvariables(equations)))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) @@ -88,7 +89,7 @@ contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims(MeshT)))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) jk = j + nnodes(dg) * (k - 1) @@ -176,7 +177,7 @@ (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims(MeshT)))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k, element] @@ -264,7 +265,7 @@ contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims(MeshT)))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) ij = i + nnodes(dg) * (j - 1) @@ -351,7 +352,8 @@ end @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}}, + MeshT::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations3D, volume_flux::typeof(flux_ranocha_turbo), @@ -363,7 +365,7 @@ end # indices `[i, j, k, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. In addition @@ -372,7 +374,7 @@ end # values. 
u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims(MeshT))..., StaticInt(nvariables(equations) + 2))) # We also compute "+ 2" logs @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) @@ -425,7 +427,7 @@ end contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims(MeshT)))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) jk = j + nnodes(dg) * (k - 1) @@ -546,7 +548,7 @@ end (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims(MeshT)))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k, element] @@ -667,7 +669,7 @@ end contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims(MeshT)))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) ij = i + nnodes(dg) * (j - 1) diff --git a/src/solvers/dgsem_tree/container_parabolic_1d.jl b/src/solvers/dgsem_tree/container_parabolic_1d.jl new file mode 100644 index 00000000000..af9a6841636 --- /dev/null +++ b/src/solvers/dgsem_tree/container_parabolic_1d.jl @@ -0,0 +1,58 @@ +mutable struct ParabolicContainer1D{uEltype <: Real} + u_transformed::Array{uEltype, 3} + gradients::Array{uEltype, 3} + flux_parabolic::Array{uEltype, 3} + + # internal `resize!`able storage + _u_transformed::Vector{uEltype} + _gradients::Vector{uEltype} + _flux_parabolic::Vector{uEltype} + + function ParabolicContainer1D{uEltype}(n_vars::Integer, n_nodes::Integer, + n_elements::Integer) where {uEltype <: Real} + return new(Array{uEltype, 3}(undef, n_vars, n_nodes, n_elements), + Array{uEltype, 3}(undef, n_vars, n_nodes, n_elements), + Array{uEltype, 3}(undef, n_vars, n_nodes, 
n_elements), + Vector{uEltype}(undef, n_vars * n_nodes * n_elements), + Vector{uEltype}(undef, n_vars * n_nodes * n_elements), + Vector{uEltype}(undef, n_vars * n_nodes * n_elements)) + end +end + +function init_parabolic_container_1d(n_vars::Integer, n_nodes::Integer, + n_elements::Integer, + ::Type{uEltype}) where {uEltype <: Real} + return ParabolicContainer1D{uEltype}(n_vars, n_nodes, n_elements) +end + +# Only one-dimensional `Array`s are `resize!`able in Julia. +# Hence, we use `Vector`s as internal storage and `resize!` +# them whenever needed. Then, we reuse the same memory by +# `unsafe_wrap`ping multi-dimensional `Array`s around the +# internal storage. +function Base.resize!(parabolic_container::ParabolicContainer1D, equations, dg, cache) + capacity = nvariables(equations) * nnodes(dg) * nelements(dg, cache) + resize!(parabolic_container._u_transformed, capacity) + resize!(parabolic_container._gradients, capacity) + resize!(parabolic_container._flux_parabolic, capacity) + + parabolic_container.u_transformed = unsafe_wrap(Array, + pointer(parabolic_container._u_transformed), + (nvariables(equations), + nnodes(dg), + nelements(dg, cache))) + + parabolic_container.gradients = unsafe_wrap(Array, + pointer(parabolic_container._gradients), + (nvariables(equations), + nnodes(dg), + nelements(dg, cache))) + + parabolic_container.flux_parabolic = unsafe_wrap(Array, + pointer(parabolic_container._flux_parabolic), + (nvariables(equations), + nnodes(dg), + nelements(dg, cache))) + + return nothing +end diff --git a/src/solvers/dgsem_tree/container_parabolic_2d.jl b/src/solvers/dgsem_tree/container_parabolic_2d.jl new file mode 100644 index 00000000000..7eee9c1302d --- /dev/null +++ b/src/solvers/dgsem_tree/container_parabolic_2d.jl @@ -0,0 +1,84 @@ +mutable struct ParabolicContainer2D{uEltype <: Real} + u_transformed::Array{uEltype, 4} + gradients::NTuple{2, Array{uEltype, 4}} + flux_parabolic::NTuple{2, Array{uEltype, 4}} + + # internal `resize!`able storage + 
_u_transformed::Vector{uEltype} + # Use Tuple for outer, fixed-size datastructure + _gradients::Tuple{Vector{uEltype}, Vector{uEltype}} + _flux_parabolic::Tuple{Vector{uEltype}, Vector{uEltype}} + + function ParabolicContainer2D{uEltype}(n_vars::Integer, n_nodes::Integer, + n_elements::Integer) where {uEltype <: Real} + return new(Array{uEltype, 4}(undef, n_vars, n_nodes, n_nodes, n_elements), # `u_transformed` + # `gradients` + (Array{uEltype, 4}(undef, n_vars, n_nodes, n_nodes, n_elements), + Array{uEltype, 4}(undef, n_vars, n_nodes, n_nodes, n_elements)), + # `flux_parabolic` + (Array{uEltype, 4}(undef, n_vars, n_nodes, n_nodes, n_elements), + Array{uEltype, 4}(undef, n_vars, n_nodes, n_nodes, n_elements)), + # `_u_transformed` + Vector{uEltype}(undef, n_vars * n_nodes^2 * n_elements), + # `_gradients` + (Vector{uEltype}(undef, n_vars * n_nodes^2 * n_elements), + Vector{uEltype}(undef, n_vars * n_nodes^2 * n_elements)), + # `_flux_parabolic` + (Vector{uEltype}(undef, n_vars * n_nodes^2 * n_elements), + Vector{uEltype}(undef, n_vars * n_nodes^2 * n_elements))) + end +end + +function init_parabolic_container_2d(n_vars::Integer, n_nodes::Integer, + n_elements::Integer, + ::Type{uEltype}) where {uEltype <: Real} + return ParabolicContainer2D{uEltype}(n_vars, n_nodes, n_elements) +end + +# Only one-dimensional `Array`s are `resize!`able in Julia. +# Hence, we use `Vector`s as internal storage and `resize!` +# them whenever needed. Then, we reuse the same memory by +# `unsafe_wrap`ping multi-dimensional `Array`s around the +# internal storage. 
+function Base.resize!(parabolic_container::ParabolicContainer2D, equations, dg, cache) + capacity = nvariables(equations) * nnodes(dg)^2 * nelements(dg, cache) + resize!(parabolic_container._u_transformed, capacity) + for dim in 1:2 + resize!(parabolic_container._gradients[dim], capacity) + resize!(parabolic_container._flux_parabolic[dim], capacity) + end + + parabolic_container.u_transformed = unsafe_wrap(Array, + pointer(parabolic_container._u_transformed), + (nvariables(equations), + nnodes(dg), nnodes(dg), + nelements(dg, cache))) + + gradients_1 = unsafe_wrap(Array, + pointer(parabolic_container._gradients[1]), + (nvariables(equations), + nnodes(dg), nnodes(dg), + nelements(dg, cache))) + gradients_2 = unsafe_wrap(Array, + pointer(parabolic_container._gradients[2]), + (nvariables(equations), + nnodes(dg), nnodes(dg), + nelements(dg, cache))) + + parabolic_container.gradients = (gradients_1, gradients_2) + + flux_parabolic_1 = unsafe_wrap(Array, + pointer(parabolic_container._flux_parabolic[1]), + (nvariables(equations), + nnodes(dg), nnodes(dg), + nelements(dg, cache))) + flux_parabolic_2 = unsafe_wrap(Array, + pointer(parabolic_container._flux_parabolic[2]), + (nvariables(equations), + nnodes(dg), nnodes(dg), + nelements(dg, cache))) + + parabolic_container.flux_parabolic = (flux_parabolic_1, flux_parabolic_2) + + return nothing +end diff --git a/src/solvers/dgsem_tree/container_parabolic_3d.jl b/src/solvers/dgsem_tree/container_parabolic_3d.jl new file mode 100644 index 00000000000..07a1403b438 --- /dev/null +++ b/src/solvers/dgsem_tree/container_parabolic_3d.jl @@ -0,0 +1,99 @@ +mutable struct ParabolicContainer3D{uEltype <: Real} + u_transformed::Array{uEltype, 5} + gradients::NTuple{3, Array{uEltype, 5}} + flux_parabolic::NTuple{3, Array{uEltype, 5}} + + # internal `resize!`able storage + _u_transformed::Vector{uEltype} + # Use Tuple for outer, fixed-size datastructure + _gradients::Tuple{Vector{uEltype}, Vector{uEltype}, Vector{uEltype}} + 
_flux_parabolic::Tuple{Vector{uEltype}, Vector{uEltype}, Vector{uEltype}} + + function ParabolicContainer3D{uEltype}(n_vars::Integer, n_nodes::Integer, + n_elements::Integer) where {uEltype <: Real} + return new(Array{uEltype, 5}(undef, n_vars, n_nodes, n_nodes, n_nodes, n_elements), # `u_transformed` + # `gradients` + (Array{uEltype, 5}(undef, n_vars, n_nodes, n_nodes, n_nodes, n_elements), + Array{uEltype, 5}(undef, n_vars, n_nodes, n_nodes, n_nodes, n_elements), + Array{uEltype, 5}(undef, n_vars, n_nodes, n_nodes, n_nodes, n_elements)), + # `flux_parabolic` + (Array{uEltype, 5}(undef, n_vars, n_nodes, n_nodes, n_nodes, n_elements), + Array{uEltype, 5}(undef, n_vars, n_nodes, n_nodes, n_nodes, n_elements), + Array{uEltype, 5}(undef, n_vars, n_nodes, n_nodes, n_nodes, n_elements)), + # `u_transformed` + Vector{uEltype}(undef, n_vars * n_nodes^3 * n_elements), + # `_gradients` + (Vector{uEltype}(undef, n_vars * n_nodes^3 * n_elements), + Vector{uEltype}(undef, n_vars * n_nodes^3 * n_elements), + Vector{uEltype}(undef, n_vars * n_nodes^3 * n_elements)), + # `_flux_parabolic` + (Vector{uEltype}(undef, n_vars * n_nodes^3 * n_elements), + Vector{uEltype}(undef, n_vars * n_nodes^3 * n_elements), + Vector{uEltype}(undef, n_vars * n_nodes^3 * n_elements))) + end +end + +function init_parabolic_container_3d(n_vars::Integer, n_nodes::Integer, + n_elements::Integer, + ::Type{uEltype}) where {uEltype <: Real} + return ParabolicContainer3D{uEltype}(n_vars, n_nodes, n_elements) +end + +# Only one-dimensional `Array`s are `resize!`able in Julia. +# Hence, we use `Vector`s as internal storage and `resize!` +# them whenever needed. Then, we reuse the same memory by +# `unsafe_wrap`ping multi-dimensional `Array`s around the +# internal storage. 
+function Base.resize!(parabolic_container::ParabolicContainer3D, equations, dg, cache) + capacity = nvariables(equations) * nnodes(dg)^3 * nelements(dg, cache) + resize!(parabolic_container._u_transformed, capacity) + for dim in 1:3 + resize!(parabolic_container._gradients[dim], capacity) + resize!(parabolic_container._flux_parabolic[dim], capacity) + end + + parabolic_container.u_transformed = unsafe_wrap(Array, + pointer(parabolic_container._u_transformed), + (nvariables(equations), + nnodes(dg), nnodes(dg), nnodes(dg), + nelements(dg, cache))) + + gradients_1 = unsafe_wrap(Array, + pointer(parabolic_container._gradients[1]), + (nvariables(equations), + nnodes(dg), nnodes(dg), nnodes(dg), + nelements(dg, cache))) + gradients_2 = unsafe_wrap(Array, + pointer(parabolic_container._gradients[2]), + (nvariables(equations), + nnodes(dg), nnodes(dg), nnodes(dg), + nelements(dg, cache))) + gradients_3 = unsafe_wrap(Array, + pointer(parabolic_container._gradients[3]), + (nvariables(equations), + nnodes(dg), nnodes(dg), nnodes(dg), + nelements(dg, cache))) + + parabolic_container.gradients = (gradients_1, gradients_2, gradients_3) + + flux_parabolic_1 = unsafe_wrap(Array, + pointer(parabolic_container._flux_parabolic[1]), + (nvariables(equations), + nnodes(dg), nnodes(dg), nnodes(dg), + nelements(dg, cache))) + flux_parabolic_2 = unsafe_wrap(Array, + pointer(parabolic_container._flux_parabolic[2]), + (nvariables(equations), + nnodes(dg), nnodes(dg), nnodes(dg), + nelements(dg, cache))) + flux_parabolic_3 = unsafe_wrap(Array, + pointer(parabolic_container._flux_parabolic[3]), + (nvariables(equations), + nnodes(dg), nnodes(dg), nnodes(dg), + nelements(dg, cache))) + + parabolic_container.flux_parabolic = (flux_parabolic_1, flux_parabolic_2, + flux_parabolic_3) + + return nothing +end diff --git a/src/solvers/dgsem_tree/container_viscous_1d.jl b/src/solvers/dgsem_tree/container_viscous_1d.jl deleted file mode 100644 index 661fbfb237f..00000000000 --- 
a/src/solvers/dgsem_tree/container_viscous_1d.jl +++ /dev/null @@ -1,58 +0,0 @@ -mutable struct ViscousContainer1D{uEltype <: Real} - u_transformed::Array{uEltype, 3} - gradients::Array{uEltype, 3} - flux_viscous::Array{uEltype, 3} - - # internal `resize!`able storage - _u_transformed::Vector{uEltype} - _gradients::Vector{uEltype} - _flux_viscous::Vector{uEltype} - - function ViscousContainer1D{uEltype}(n_vars::Integer, n_nodes::Integer, - n_elements::Integer) where {uEltype <: Real} - return new(Array{uEltype, 3}(undef, n_vars, n_nodes, n_elements), - Array{uEltype, 3}(undef, n_vars, n_nodes, n_elements), - Array{uEltype, 3}(undef, n_vars, n_nodes, n_elements), - Vector{uEltype}(undef, n_vars * n_nodes * n_elements), - Vector{uEltype}(undef, n_vars * n_nodes * n_elements), - Vector{uEltype}(undef, n_vars * n_nodes * n_elements)) - end -end - -function init_viscous_container_1d(n_vars::Integer, n_nodes::Integer, - n_elements::Integer, - ::Type{uEltype}) where {uEltype <: Real} - return ViscousContainer1D{uEltype}(n_vars, n_nodes, n_elements) -end - -# Only one-dimensional `Array`s are `resize!`able in Julia. -# Hence, we use `Vector`s as internal storage and `resize!` -# them whenever needed. Then, we reuse the same memory by -# `unsafe_wrap`ping multi-dimensional `Array`s around the -# internal storage. 
-function Base.resize!(viscous_container::ViscousContainer1D, equations, dg, cache) - capacity = nvariables(equations) * nnodes(dg) * nelements(dg, cache) - resize!(viscous_container._u_transformed, capacity) - resize!(viscous_container._gradients, capacity) - resize!(viscous_container._flux_viscous, capacity) - - viscous_container.u_transformed = unsafe_wrap(Array, - pointer(viscous_container._u_transformed), - (nvariables(equations), - nnodes(dg), - nelements(dg, cache))) - - viscous_container.gradients = unsafe_wrap(Array, - pointer(viscous_container._gradients), - (nvariables(equations), - nnodes(dg), - nelements(dg, cache))) - - viscous_container.flux_viscous = unsafe_wrap(Array, - pointer(viscous_container._flux_viscous), - (nvariables(equations), - nnodes(dg), - nelements(dg, cache))) - - return nothing -end diff --git a/src/solvers/dgsem_tree/container_viscous_2d.jl b/src/solvers/dgsem_tree/container_viscous_2d.jl deleted file mode 100644 index a4e69643d3f..00000000000 --- a/src/solvers/dgsem_tree/container_viscous_2d.jl +++ /dev/null @@ -1,84 +0,0 @@ -mutable struct ViscousContainer2D{uEltype <: Real} - u_transformed::Array{uEltype, 4} - gradients::NTuple{2, Array{uEltype, 4}} - flux_viscous::NTuple{2, Array{uEltype, 4}} - - # internal `resize!`able storage - _u_transformed::Vector{uEltype} - # Use Tuple for outer, fixed-size datastructure - _gradients::Tuple{Vector{uEltype}, Vector{uEltype}} - _flux_viscous::Tuple{Vector{uEltype}, Vector{uEltype}} - - function ViscousContainer2D{uEltype}(n_vars::Integer, n_nodes::Integer, - n_elements::Integer) where {uEltype <: Real} - return new(Array{uEltype, 4}(undef, n_vars, n_nodes, n_nodes, n_elements), # `u_transformed` - # `gradients` - (Array{uEltype, 4}(undef, n_vars, n_nodes, n_nodes, n_elements), - Array{uEltype, 4}(undef, n_vars, n_nodes, n_nodes, n_elements)), - # `flux_viscous` - (Array{uEltype, 4}(undef, n_vars, n_nodes, n_nodes, n_elements), - Array{uEltype, 4}(undef, n_vars, n_nodes, n_nodes, 
n_elements)), - # `_u_transformed` - Vector{uEltype}(undef, n_vars * n_nodes^2 * n_elements), - # `_gradients` - (Vector{uEltype}(undef, n_vars * n_nodes^2 * n_elements), - Vector{uEltype}(undef, n_vars * n_nodes^2 * n_elements)), - # `_flux_viscous` - (Vector{uEltype}(undef, n_vars * n_nodes^2 * n_elements), - Vector{uEltype}(undef, n_vars * n_nodes^2 * n_elements))) - end -end - -function init_viscous_container_2d(n_vars::Integer, n_nodes::Integer, - n_elements::Integer, - ::Type{uEltype}) where {uEltype <: Real} - return ViscousContainer2D{uEltype}(n_vars, n_nodes, n_elements) -end - -# Only one-dimensional `Array`s are `resize!`able in Julia. -# Hence, we use `Vector`s as internal storage and `resize!` -# them whenever needed. Then, we reuse the same memory by -# `unsafe_wrap`ping multi-dimensional `Array`s around the -# internal storage. -function Base.resize!(viscous_container::ViscousContainer2D, equations, dg, cache) - capacity = nvariables(equations) * nnodes(dg)^2 * nelements(dg, cache) - resize!(viscous_container._u_transformed, capacity) - for dim in 1:2 - resize!(viscous_container._gradients[dim], capacity) - resize!(viscous_container._flux_viscous[dim], capacity) - end - - viscous_container.u_transformed = unsafe_wrap(Array, - pointer(viscous_container._u_transformed), - (nvariables(equations), - nnodes(dg), nnodes(dg), - nelements(dg, cache))) - - gradients_1 = unsafe_wrap(Array, - pointer(viscous_container._gradients[1]), - (nvariables(equations), - nnodes(dg), nnodes(dg), - nelements(dg, cache))) - gradients_2 = unsafe_wrap(Array, - pointer(viscous_container._gradients[2]), - (nvariables(equations), - nnodes(dg), nnodes(dg), - nelements(dg, cache))) - - viscous_container.gradients = (gradients_1, gradients_2) - - flux_viscous_1 = unsafe_wrap(Array, - pointer(viscous_container._flux_viscous[1]), - (nvariables(equations), - nnodes(dg), nnodes(dg), - nelements(dg, cache))) - flux_viscous_2 = unsafe_wrap(Array, - 
pointer(viscous_container._flux_viscous[2]), - (nvariables(equations), - nnodes(dg), nnodes(dg), - nelements(dg, cache))) - - viscous_container.flux_viscous = (flux_viscous_1, flux_viscous_2) - - return nothing -end diff --git a/src/solvers/dgsem_tree/container_viscous_3d.jl b/src/solvers/dgsem_tree/container_viscous_3d.jl deleted file mode 100644 index a55fc5147d9..00000000000 --- a/src/solvers/dgsem_tree/container_viscous_3d.jl +++ /dev/null @@ -1,98 +0,0 @@ -mutable struct ViscousContainer3D{uEltype <: Real} - u_transformed::Array{uEltype, 5} - gradients::NTuple{3, Array{uEltype, 5}} - flux_viscous::NTuple{3, Array{uEltype, 5}} - - # internal `resize!`able storage - _u_transformed::Vector{uEltype} - # Use Tuple for outer, fixed-size datastructure - _gradients::Tuple{Vector{uEltype}, Vector{uEltype}, Vector{uEltype}} - _flux_viscous::Tuple{Vector{uEltype}, Vector{uEltype}, Vector{uEltype}} - - function ViscousContainer3D{uEltype}(n_vars::Integer, n_nodes::Integer, - n_elements::Integer) where {uEltype <: Real} - return new(Array{uEltype, 5}(undef, n_vars, n_nodes, n_nodes, n_nodes, n_elements), # `u_transformed` - # `gradients` - (Array{uEltype, 5}(undef, n_vars, n_nodes, n_nodes, n_nodes, n_elements), - Array{uEltype, 5}(undef, n_vars, n_nodes, n_nodes, n_nodes, n_elements), - Array{uEltype, 5}(undef, n_vars, n_nodes, n_nodes, n_nodes, n_elements)), - # `flux_viscous` - (Array{uEltype, 5}(undef, n_vars, n_nodes, n_nodes, n_nodes, n_elements), - Array{uEltype, 5}(undef, n_vars, n_nodes, n_nodes, n_nodes, n_elements), - Array{uEltype, 5}(undef, n_vars, n_nodes, n_nodes, n_nodes, n_elements)), - # `u_transformed` - Vector{uEltype}(undef, n_vars * n_nodes^3 * n_elements), - # `_gradients` - (Vector{uEltype}(undef, n_vars * n_nodes^3 * n_elements), - Vector{uEltype}(undef, n_vars * n_nodes^3 * n_elements), - Vector{uEltype}(undef, n_vars * n_nodes^3 * n_elements)), - # `_flux_viscous` - (Vector{uEltype}(undef, n_vars * n_nodes^3 * n_elements), - 
Vector{uEltype}(undef, n_vars * n_nodes^3 * n_elements), - Vector{uEltype}(undef, n_vars * n_nodes^3 * n_elements))) - end -end - -function init_viscous_container_3d(n_vars::Integer, n_nodes::Integer, - n_elements::Integer, - ::Type{uEltype}) where {uEltype <: Real} - return ViscousContainer3D{uEltype}(n_vars, n_nodes, n_elements) -end - -# Only one-dimensional `Array`s are `resize!`able in Julia. -# Hence, we use `Vector`s as internal storage and `resize!` -# them whenever needed. Then, we reuse the same memory by -# `unsafe_wrap`ping multi-dimensional `Array`s around the -# internal storage. -function Base.resize!(viscous_container::ViscousContainer3D, equations, dg, cache) - capacity = nvariables(equations) * nnodes(dg)^3 * nelements(dg, cache) - resize!(viscous_container._u_transformed, capacity) - for dim in 1:3 - resize!(viscous_container._gradients[dim], capacity) - resize!(viscous_container._flux_viscous[dim], capacity) - end - - viscous_container.u_transformed = unsafe_wrap(Array, - pointer(viscous_container._u_transformed), - (nvariables(equations), - nnodes(dg), nnodes(dg), nnodes(dg), - nelements(dg, cache))) - - gradients_1 = unsafe_wrap(Array, - pointer(viscous_container._gradients[1]), - (nvariables(equations), - nnodes(dg), nnodes(dg), nnodes(dg), - nelements(dg, cache))) - gradients_2 = unsafe_wrap(Array, - pointer(viscous_container._gradients[2]), - (nvariables(equations), - nnodes(dg), nnodes(dg), nnodes(dg), - nelements(dg, cache))) - gradients_3 = unsafe_wrap(Array, - pointer(viscous_container._gradients[3]), - (nvariables(equations), - nnodes(dg), nnodes(dg), nnodes(dg), - nelements(dg, cache))) - - viscous_container.gradients = (gradients_1, gradients_2, gradients_3) - - flux_viscous_1 = unsafe_wrap(Array, - pointer(viscous_container._flux_viscous[1]), - (nvariables(equations), - nnodes(dg), nnodes(dg), nnodes(dg), - nelements(dg, cache))) - flux_viscous_2 = unsafe_wrap(Array, - pointer(viscous_container._flux_viscous[2]), - 
(nvariables(equations), - nnodes(dg), nnodes(dg), nnodes(dg), - nelements(dg, cache))) - flux_viscous_3 = unsafe_wrap(Array, - pointer(viscous_container._flux_viscous[3]), - (nvariables(equations), - nnodes(dg), nnodes(dg), nnodes(dg), - nelements(dg, cache))) - - viscous_container.flux_viscous = (flux_viscous_1, flux_viscous_2, flux_viscous_3) - - return nothing -end diff --git a/src/solvers/dgsem_tree/containers_1d.jl b/src/solvers/dgsem_tree/containers_1d.jl index 0f562670756..15ddff369be 100644 --- a/src/solvers/dgsem_tree/containers_1d.jl +++ b/src/solvers/dgsem_tree/containers_1d.jl @@ -337,14 +337,14 @@ end # Create boundaries container and initialize boundary data in `elements`. function init_boundaries(cell_ids, mesh::TreeMesh1D, - elements::TreeElementContainer1D) + elements::TreeElementContainer1D, basis) # Initialize container n_boundaries = count_required_boundaries(mesh, cell_ids) boundaries = TreeBoundaryContainer1D{real(elements), eltype(elements)}(n_boundaries, nvariables(elements)) # Connect elements with boundaries - init_boundaries!(boundaries, elements, mesh) + init_boundaries!(boundaries, elements, mesh, basis) return boundaries end @@ -373,8 +373,55 @@ function count_required_boundaries(mesh::TreeMesh1D, cell_ids) return count end +# For Lobtto points, we can simply use the outer nodes of the elements as boundary nodes. +function calc_boundary_node_coordinates!(boundaries, element, count, direction, + elements, mesh::TreeMesh1D, + basis::LobattoLegendreBasis) + el_node_coords = elements.node_coordinates + bnd_node_coords = boundaries.node_coordinates + + orientation = 1 # always 1 in 1D + if direction == 1 + bnd_node_coords[orientation, count] = el_node_coords[orientation, 1, + element] + elseif direction == 2 + bnd_node_coords[orientation, count] = el_node_coords[orientation, end, + element] + else + error("should not happen") + end + + return nothing +end + +# For Gauss points, we need to interpolate the boundary node coordinates. 
+function calc_boundary_node_coordinates!(boundaries, element, count, direction, + elements, mesh::TreeMesh1D, + basis::GaussLegendreBasis) + boundary_matrix = basis.boundary_interpolation + el_node_coords = elements.node_coordinates + bnd_node_coords = boundaries.node_coordinates + + orientation = 1 # always 1 in 1D + if direction == 1 + @views x_interpolated_left = dot(boundary_matrix[:, 1], + el_node_coords[orientation, :, + element]) + bnd_node_coords[orientation, count] = x_interpolated_left + elseif direction == 2 + @views x_interpolated_right = dot(boundary_matrix[:, 2], + el_node_coords[orientation, :, + element]) + bnd_node_coords[orientation, count] = x_interpolated_right + else + error("should not happen") + end + + return nothing +end + # Initialize connectivity between elements and boundaries -function init_boundaries!(boundaries, elements, mesh::TreeMesh1D) +function init_boundaries!(boundaries, elements, mesh::TreeMesh1D, basis) # Reset boundaries count count = 0 @@ -418,15 +465,9 @@ function init_boundaries!(boundaries, elements, mesh::TreeMesh1D) # Set orientation (x -> 1) boundaries.orientations[count] = 1 - # Store node coordinates - enc = elements.node_coordinates - if direction == 1 # -x direction - boundaries.node_coordinates[:, count] .= enc[:, 1, element] - elseif direction == 2 # +x direction - boundaries.node_coordinates[:, count] .= enc[:, end, element] - else - error("should not happen") - end + # Calculate node coordinates + calc_boundary_node_coordinates!(boundaries, element, count, direction, + elements, mesh, basis) end end @@ -462,7 +503,7 @@ function reinitialize_containers!(mesh::TreeMesh{1}, equations, dg::DGSEM, cache # re-initialize boundaries container @unpack boundaries = cache resize!(boundaries, count_required_boundaries(mesh, leaf_cell_ids)) - init_boundaries!(boundaries, elements, mesh) + init_boundaries!(boundaries, elements, mesh, dg.basis) return nothing end diff --git a/src/solvers/dgsem_tree/containers_2d.jl 
b/src/solvers/dgsem_tree/containers_2d.jl index d55e4a52c85..bf1d2434702 100644 --- a/src/solvers/dgsem_tree/containers_2d.jl +++ b/src/solvers/dgsem_tree/containers_2d.jl @@ -353,7 +353,7 @@ end # Create boundaries container and initialize boundary data in `elements`. function init_boundaries(cell_ids, mesh::TreeMesh2D, - elements::TreeElementContainer2D) + elements::TreeElementContainer2D, basis) # Initialize container n_boundaries = count_required_boundaries(mesh, cell_ids) boundaries = TreeBoundaryContainer2D{real(elements), eltype(elements)}(n_boundaries, @@ -361,7 +361,7 @@ function init_boundaries(cell_ids, mesh::TreeMesh2D, nnodes(elements)) # Connect elements with boundaries - init_boundaries!(boundaries, elements, mesh) + init_boundaries!(boundaries, elements, mesh, basis) return boundaries end @@ -390,8 +390,89 @@ function count_required_boundaries(mesh::TreeMesh2D, cell_ids) return count end +# For Lobatto points, we can simply use the outer nodes of the elements as boundary nodes. +function calc_boundary_node_coordinates!(boundaries, element, count, direction, + elements, mesh::TreeMesh2D, + basis::LobattoLegendreBasis) + el_node_coords = elements.node_coordinates + bnd_node_coords = boundaries.node_coordinates + + if direction == 1 # -x direction + @views bnd_node_coords[:, :, count] .= el_node_coords[:, 1, :, element] + elseif direction == 2 # +x direction + @views bnd_node_coords[:, :, count] .= el_node_coords[:, end, :, element] + elseif direction == 3 # -y direction + @views bnd_node_coords[:, :, count] .= el_node_coords[:, :, 1, element] + elseif direction == 4 # +y direction + @views bnd_node_coords[:, :, count] .= el_node_coords[:, :, end, element] + else + error("should not happen") + end + + return nothing +end + +# For Gauss points, we need to interpolate the boundary node coordinates. 
+function calc_boundary_node_coordinates!(boundaries, element, count, direction, + elements, mesh::TreeMesh2D, + basis::GaussLegendreBasis) + boundary_matrix = basis.boundary_interpolation + el_node_coords = elements.node_coordinates + bnd_node_coords = boundaries.node_coordinates + + if direction == 1 # -x direction: interpolate in x for each y node j + for j in eachnode(basis) + for orientation in 1:2 # Need to set both x and y coordinate of boundary node + @views bnd_node_coords[orientation, j, count] = dot(boundary_matrix[:, + 1], + el_node_coords[orientation, + :, + j, + element]) + end + end + elseif direction == 2 # +x direction: interpolate in x for each y node j + for j in eachnode(basis) + for orientation in 1:2 # Need to set both x and y coordinate of boundary node + @views bnd_node_coords[orientation, j, count] = dot(boundary_matrix[:, + 2], + el_node_coords[orientation, + :, + j, + element]) + end + end + elseif direction == 3 # -y direction: interpolate in y for each x node i + for i in eachnode(basis) + for orientation in 1:2 # Need to set both x and y coordinate of boundary node + @views bnd_node_coords[orientation, i, count] = dot(boundary_matrix[:, + 1], + el_node_coords[orientation, + i, + :, + element]) + end + end + elseif direction == 4 # +y direction: interpolate in y for each x node i + for i in eachnode(basis) + for orientation in 1:2 # Need to set both x and y coordinate of boundary node + @views bnd_node_coords[orientation, i, count] = dot(boundary_matrix[:, + 2], + el_node_coords[orientation, + i, + :, + element]) + end + end + else + error("should not happen") + end + + return nothing +end + # Initialize connectivity between elements and boundaries -function init_boundaries!(boundaries, elements, mesh::TreeMesh2D) +function init_boundaries!(boundaries, elements, mesh::TreeMesh2D, basis) # Exit early if there are no boundaries to initialize if nboundaries(boundaries) == 0 # In this case n_boundaries_per_direction still needs to be reset! 
@@ -441,24 +522,14 @@ function init_boundaries!(boundaries, elements, mesh::TreeMesh2D) # Set orientation (x -> 1, y -> 2) if direction in (1, 2) - boundaries.orientations[count] = 1 + boundaries.orientations[count] = 1 # x direction else - boundaries.orientations[count] = 2 + boundaries.orientations[count] = 2 # y direction end - # Store node coordinates - enc = elements.node_coordinates - if direction == 1 # -x direction - boundaries.node_coordinates[:, :, count] .= enc[:, 1, :, element] - elseif direction == 2 # +x direction - boundaries.node_coordinates[:, :, count] .= enc[:, end, :, element] - elseif direction == 3 # -y direction - boundaries.node_coordinates[:, :, count] .= enc[:, :, 1, element] - elseif direction == 4 # +y direction - boundaries.node_coordinates[:, :, count] .= enc[:, :, end, element] - else - error("should not happen") - end + # Calculate node coordinates + calc_boundary_node_coordinates!(boundaries, element, count, direction, + elements, mesh, basis) end end @@ -1271,6 +1342,9 @@ function ContainerAntidiffusiveFlux2D{uEltype}(capacity::Integer, n_variables, antidiffusive_flux2_R = unsafe_wrap(Array, pointer(_antidiffusive_flux2_R), (n_variables, n_nodes, n_nodes + 1, capacity)) + reset_antidiffusive_fluxes!(antidiffusive_flux1_L, antidiffusive_flux1_R, + antidiffusive_flux2_L, antidiffusive_flux2_R) + return ContainerAntidiffusiveFlux2D{uEltype}(antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, @@ -1309,17 +1383,22 @@ function Base.resize!(fluxes::ContainerAntidiffusiveFlux2D, capacity) (n_variables, n_nodes, n_nodes + 1, capacity)) - uEltype = eltype(fluxes.antidiffusive_flux1_L) - @threaded for element in axes(fluxes.antidiffusive_flux1_L, 4) - fluxes.antidiffusive_flux1_L[:, 1, :, element] .= zero(uEltype) - fluxes.antidiffusive_flux1_L[:, n_nodes + 1, :, element] .= zero(uEltype) - fluxes.antidiffusive_flux1_R[:, 1, :, element] .= zero(uEltype) - fluxes.antidiffusive_flux1_R[:, n_nodes + 1, :, element] .= 
zero(uEltype) - - fluxes.antidiffusive_flux2_L[:, :, 1, element] .= zero(uEltype) - fluxes.antidiffusive_flux2_L[:, :, n_nodes + 1, element] .= zero(uEltype) - fluxes.antidiffusive_flux2_R[:, :, 1, element] .= zero(uEltype) - fluxes.antidiffusive_flux2_R[:, :, n_nodes + 1, element] .= zero(uEltype) + return nothing +end + +function reset_antidiffusive_fluxes!(antidiffusive_flux1_L, antidiffusive_flux1_R, + antidiffusive_flux2_L, antidiffusive_flux2_R) + uEltype = eltype(antidiffusive_flux1_L) + @threaded for element in axes(antidiffusive_flux1_L, 4) + antidiffusive_flux1_L[:, 1, :, element] .= zero(uEltype) + antidiffusive_flux1_L[:, end, :, element] .= zero(uEltype) + antidiffusive_flux1_R[:, 1, :, element] .= zero(uEltype) + antidiffusive_flux1_R[:, end, :, element] .= zero(uEltype) + + antidiffusive_flux2_L[:, :, 1, element] .= zero(uEltype) + antidiffusive_flux2_L[:, :, end, element] .= zero(uEltype) + antidiffusive_flux2_R[:, :, 1, element] .= zero(uEltype) + antidiffusive_flux2_R[:, :, end, element] .= zero(uEltype) end return nothing @@ -1349,7 +1428,7 @@ function reinitialize_containers!(mesh::Union{TreeMesh{2}, TreeMesh{3}}, equatio # re-initialize boundaries container @unpack boundaries = cache resize!(boundaries, count_required_boundaries(mesh, leaf_cell_ids)) - init_boundaries!(boundaries, elements, mesh) + init_boundaries!(boundaries, elements, mesh, dg.basis) # re-initialize mortars container @unpack mortars = cache diff --git a/src/solvers/dgsem_tree/containers_3d.jl b/src/solvers/dgsem_tree/containers_3d.jl index 1b6f5de4118..09aebe049b5 100644 --- a/src/solvers/dgsem_tree/containers_3d.jl +++ b/src/solvers/dgsem_tree/containers_3d.jl @@ -350,7 +350,7 @@ end # Create boundaries container and initialize boundary data in `elements`. 
function init_boundaries(cell_ids, mesh::TreeMesh3D, - elements::TreeElementContainer3D) + elements::TreeElementContainer3D, basis) # Initialize container n_boundaries = count_required_boundaries(mesh, cell_ids) boundaries = TreeBoundaryContainer3D{real(elements), eltype(elements)}(n_boundaries, @@ -358,7 +358,7 @@ function init_boundaries(cell_ids, mesh::TreeMesh3D, nnodes(elements)) # Connect elements with boundaries - init_boundaries!(boundaries, elements, mesh) + init_boundaries!(boundaries, elements, mesh, basis) return boundaries end @@ -388,7 +388,7 @@ function count_required_boundaries(mesh::TreeMesh3D, cell_ids) end # Initialize connectivity between elements and boundaries -function init_boundaries!(boundaries, elements, mesh::TreeMesh3D) +function init_boundaries!(boundaries, elements, mesh::TreeMesh3D, basis) # Reset boundaries count count = 0 @@ -845,6 +845,11 @@ function ContainerAntidiffusiveFlux3D{uEltype}(capacity::Integer, n_variables, antidiffusive_flux3_R = unsafe_wrap(Array, pointer(_antidiffusive_flux3_R), (n_variables, n_nodes, n_nodes, n_nodes + 1, capacity)) + + reset_antidiffusive_fluxes!(antidiffusive_flux1_L, antidiffusive_flux1_R, + antidiffusive_flux2_L, antidiffusive_flux2_R, + antidiffusive_flux3_L, antidiffusive_flux3_R) + return ContainerAntidiffusiveFlux3D{uEltype}(antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, @@ -908,22 +913,28 @@ function Base.resize!(fluxes::ContainerAntidiffusiveFlux3D, capacity) n_nodes, n_nodes, n_nodes + 1, capacity)) - uEltype = eltype(fluxes.antidiffusive_flux1_L) - @threaded for element in axes(fluxes.antidiffusive_flux1_L, 5) - fluxes.antidiffusive_flux1_L[:, 1, :, :, element] .= zero(uEltype) - fluxes.antidiffusive_flux1_L[:, n_nodes + 1, :, :, element] .= zero(uEltype) - fluxes.antidiffusive_flux1_R[:, 1, :, :, element] .= zero(uEltype) - fluxes.antidiffusive_flux1_R[:, n_nodes + 1, :, :, element] .= zero(uEltype) - - fluxes.antidiffusive_flux2_L[:, :, 1, :, element] .= 
zero(uEltype) - fluxes.antidiffusive_flux2_L[:, :, n_nodes + 1, :, element] .= zero(uEltype) - fluxes.antidiffusive_flux2_R[:, :, 1, :, element] .= zero(uEltype) - fluxes.antidiffusive_flux2_R[:, :, n_nodes + 1, :, element] .= zero(uEltype) - - fluxes.antidiffusive_flux3_L[:, :, :, 1, element] .= zero(uEltype) - fluxes.antidiffusive_flux3_L[:, :, :, n_nodes + 1, element] .= zero(uEltype) - fluxes.antidiffusive_flux3_R[:, :, :, 1, element] .= zero(uEltype) - fluxes.antidiffusive_flux3_R[:, :, :, n_nodes + 1, element] .= zero(uEltype) + return nothing +end + +function reset_antidiffusive_fluxes!(antidiffusive_flux1_L, antidiffusive_flux1_R, + antidiffusive_flux2_L, antidiffusive_flux2_R, + antidiffusive_flux3_L, antidiffusive_flux3_R) + uEltype = eltype(antidiffusive_flux1_L) + @threaded for element in axes(antidiffusive_flux1_L, 5) + antidiffusive_flux1_L[:, 1, :, :, element] .= zero(uEltype) + antidiffusive_flux1_L[:, end, :, :, element] .= zero(uEltype) + antidiffusive_flux1_R[:, 1, :, :, element] .= zero(uEltype) + antidiffusive_flux1_R[:, end, :, :, element] .= zero(uEltype) + + antidiffusive_flux2_L[:, :, 1, :, element] .= zero(uEltype) + antidiffusive_flux2_L[:, :, end, :, element] .= zero(uEltype) + antidiffusive_flux2_R[:, :, 1, :, element] .= zero(uEltype) + antidiffusive_flux2_R[:, :, end, :, element] .= zero(uEltype) + + antidiffusive_flux3_L[:, :, :, 1, element] .= zero(uEltype) + antidiffusive_flux3_L[:, :, :, end, element] .= zero(uEltype) + antidiffusive_flux3_R[:, :, :, 1, element] .= zero(uEltype) + antidiffusive_flux3_R[:, :, :, end, element] .= zero(uEltype) end return nothing diff --git a/src/solvers/dgsem_tree/containers_parabolic.jl b/src/solvers/dgsem_tree/containers_parabolic.jl new file mode 100644 index 00000000000..f90eac9dcb0 --- /dev/null +++ b/src/solvers/dgsem_tree/containers_parabolic.jl @@ -0,0 +1,4 @@ +# Dimension-specific implementations +include("container_parabolic_1d.jl") +include("container_parabolic_2d.jl") 
+include("container_parabolic_3d.jl") diff --git a/src/solvers/dgsem_tree/containers_viscous.jl b/src/solvers/dgsem_tree/containers_viscous.jl deleted file mode 100644 index 444f2cb7303..00000000000 --- a/src/solvers/dgsem_tree/containers_viscous.jl +++ /dev/null @@ -1,4 +0,0 @@ -# Dimension-specific implementations -include("container_viscous_1d.jl") -include("container_viscous_2d.jl") -include("container_viscous_3d.jl") diff --git a/src/solvers/dgsem_tree/dg.jl b/src/solvers/dgsem_tree/dg.jl index 89c7e9a659c..55a71b8e88e 100644 --- a/src/solvers/dgsem_tree/dg.jl +++ b/src/solvers/dgsem_tree/dg.jl @@ -60,7 +60,7 @@ include("containers.jl") include("dg_parallel.jl") # Helper structs for parabolic AMR -include("containers_viscous.jl") +include("containers_parabolic.jl") # Some functions for a second-order Finite-Volume (MUSCL) alike # scheme on DG-subcells. @@ -87,5 +87,7 @@ include("dg_3d_compressible_euler.jl") # Subcell limiters include("subcell_limiters.jl") include("subcell_limiters_2d.jl") +include("subcell_limiters_3d.jl") include("dg_2d_subcell_limiters.jl") +include("dg_3d_subcell_limiters.jl") end # @muladd diff --git a/src/solvers/dgsem_tree/dg_1d.jl b/src/solvers/dgsem_tree/dg_1d.jl index c472e071ab3..65383168835 100644 --- a/src/solvers/dgsem_tree/dg_1d.jl +++ b/src/solvers/dgsem_tree/dg_1d.jl @@ -20,7 +20,7 @@ function create_cache(mesh::TreeMesh{1}, equations, interfaces = init_interfaces(leaf_cell_ids, mesh, elements) - boundaries = init_boundaries(leaf_cell_ids, mesh, elements) + boundaries = init_boundaries(leaf_cell_ids, mesh, elements, dg.basis) # Container cache cache = (; elements, interfaces, boundaries) @@ -55,16 +55,23 @@ end # TODO: Taal discuss/refactor timer, allowing users to pass a custom timer? +# This function is valid for all conforming mesh types (except for `StructuredMesh`), i.e., +# all meshes that do not involve mortar operations. +# Thus, we can use it for 1D `TreeMesh` and `UnstructuredMesh2D`. 
function rhs!(du, u, t, - mesh::TreeMesh{1}, equations, + mesh::Union{TreeMesh{1}, + UnstructuredMesh2D}, + equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} + backend = trixi_backend(u) + # Reset du @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache) # Calculate volume integral @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, u, mesh, + calc_volume_integral!(backend, du, u, mesh, have_nonconservative_terms(equations), equations, dg.volume_integral, dg, cache) end @@ -94,12 +101,13 @@ function rhs!(du, u, t, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, cache) end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin @@ -117,7 +125,8 @@ This treatment is required to achieve, e.g., entropy-stability or well-balancedn See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-1765644064 =# @inline function weak_form_kernel!(du, u, - element, mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + element, + ::Type{<:Union{TreeMesh{1}, StructuredMesh{1}}}, have_nonconservative_terms::False, equations, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -138,7 +147,8 @@ See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-17 end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + ::Type{<:Union{TreeMesh{1}, + StructuredMesh{1}}}, have_nonconservative_terms::False, equations, volume_flux, dg::DGSEM, cache, alpha = true) # true * [some floating point 
value] == [exactly the same floating point value] @@ -167,7 +177,8 @@ end end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + MeshT::Type{<:Union{TreeMesh{1}, + StructuredMesh{1}}}, have_nonconservative_terms::True, equations, volume_flux, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -176,7 +187,7 @@ end symmetric_flux, nonconservative_flux = volume_flux # Apply the symmetric flux as usual - flux_differencing_kernel!(du, u, element, mesh, False(), equations, symmetric_flux, + flux_differencing_kernel!(du, u, element, MeshT, False(), equations, symmetric_flux, dg, cache, alpha) # Calculate the remaining volume terms using the nonsymmetric generalized flux @@ -202,7 +213,7 @@ end end @inline function fv_kernel!(du, u, - mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + MeshT::Type{<:Union{TreeMesh{1}, StructuredMesh{1}}}, have_nonconservative_terms, equations, volume_flux_fv, dg::DGSEM, cache, element, alpha = true) @unpack fstar1_L_threaded, fstar1_R_threaded = cache @@ -211,7 +222,7 @@ end # Calculate FV two-point fluxes fstar1_L = fstar1_L_threaded[Threads.threadid()] fstar1_R = fstar1_R_threaded[Threads.threadid()] - calcflux_fv!(fstar1_L, fstar1_R, u, mesh, + calcflux_fv!(fstar1_L, fstar1_R, u, MeshT, have_nonconservative_terms, equations, volume_flux_fv, dg, element, cache) @@ -228,7 +239,7 @@ end end @inline function fvO2_kernel!(du, u, - mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + MeshT::Type{<:Union{TreeMesh{1}, StructuredMesh{1}}}, nonconservative_terms, equations, volume_flux_fv, dg::DGSEM, cache, element, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -240,7 +251,7 @@ end # Calculate FV two-point fluxes fstar1_L = fstar1_L_threaded[Threads.threadid()] fstar1_R = fstar1_R_threaded[Threads.threadid()] - calcflux_fvO2!(fstar1_L, fstar1_R, u, mesh, nonconservative_terms, equations, + calcflux_fvO2!(fstar1_L, fstar1_R, 
u, MeshT, nonconservative_terms, equations, volume_flux_fv, dg, element, cache, sc_interface_coords, reconstruction_mode, slope_limiter, cons2recon, recon2cons) @@ -262,7 +273,7 @@ end # "A provably entropy stable subcell shock capturing approach for high order split form DG for the compressible Euler equations" # [arXiv: 2008.12044v2](https://arxiv.org/pdf/2008.12044) @inline function calcflux_fv!(fstar1_L, fstar1_R, u, - mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + ::Type{<:Union{TreeMesh{1}, StructuredMesh{1}}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache) for i in 2:nnodes(dg) @@ -277,7 +288,7 @@ end end @inline function calcflux_fv!(fstar1_L, fstar1_R, u, - mesh::TreeMesh{1}, + ::Type{<:TreeMesh{1}}, have_nonconservative_terms::True, equations, volume_flux_fv, dg::DGSEM, element, cache) volume_flux, nonconservative_flux = volume_flux_fv @@ -308,7 +319,7 @@ end # "An entropy stable nodal discontinuous Galerkin method for the resistive MHD equations. Part II: Subcell finite volume shock capturing" # [JCP: 2021.110580](https://doi.org/10.1016/j.jcp.2021.110580) @inline function calcflux_fvO2!(fstar1_L, fstar1_R, u, - mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + ::Type{<:Union{TreeMesh{1}, StructuredMesh{1}}}, nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -374,8 +385,8 @@ end end # Used for both the purely hyperbolic conserved variables `u` -# and the viscous flux in x-direction in the 1D parabolic case. -function prolong2interfaces!(cache, u_or_flux_viscous, +# and the parabolic flux in x-direction in the 1D parabolic case. 
+function prolong2interfaces!(cache, u_or_flux_parabolic, mesh::TreeMesh{1}, equations, dg::DG) @unpack interfaces = cache @unpack neighbor_ids = interfaces @@ -387,18 +398,18 @@ function prolong2interfaces!(cache, u_or_flux_viscous, # interface in x-direction for v in eachvariable(equations) - interfaces_u[1, v, interface] = u_or_flux_viscous[v, nnodes(dg), - left_element] - interfaces_u[2, v, interface] = u_or_flux_viscous[v, 1, right_element] + interfaces_u[1, v, interface] = u_or_flux_parabolic[v, nnodes(dg), + left_element] + interfaces_u[2, v, interface] = u_or_flux_parabolic[v, 1, right_element] end end return nothing end -function prolong2interfaces!(cache, u_or_flux_viscous, +function prolong2interfaces!(cache, u_or_flux_parabolic, mesh::TreeMesh{1}, equations, - dg::DG{<:GaussLegendreBasis}) + dg::DGSEM{<:GaussLegendreBasis}) @unpack interfaces = cache @unpack neighbor_ids = interfaces @unpack boundary_interpolation = dg.basis @@ -410,16 +421,24 @@ function prolong2interfaces!(cache, u_or_flux_viscous, # interface in x-direction for v in eachvariable(equations) - interfaces_u[1, v, interface] = zero(eltype(interfaces_u)) - interfaces_u[2, v, interface] = zero(eltype(interfaces_u)) + # Interpolate to the interfaces using a local variable for + # the accumulation of values (to reduce global memory operations). 
+ interface_u_1 = zero(eltype(interfaces_u)) + interface_u_2 = zero(eltype(interfaces_u)) for ii in eachnode(dg) - interfaces_u[1, v, interface] += (u_or_flux_viscous[v, ii, - left_element] * - boundary_interpolation[ii, 2]) - interfaces_u[2, v, interface] += (u_or_flux_viscous[v, ii, - right_element] * - boundary_interpolation[ii, 1]) + # Not += to allow `@muladd` to turn these into FMAs + # (see comment at the top of the file) + # Need `boundary_interpolation` at right (+1) node for left element + interface_u_1 = (interface_u_1 + + u_or_flux_parabolic[v, ii, left_element] * + boundary_interpolation[ii, 2]) + # Need `boundary_interpolation` at left (-1) node for right element + interface_u_2 = (interface_u_2 + + u_or_flux_parabolic[v, ii, right_element] * + boundary_interpolation[ii, 1]) end + interfaces_u[1, v, interface] = interface_u_1 + interfaces_u[2, v, interface] = interface_u_2 end end @@ -500,8 +519,8 @@ function calc_interface_flux!(surface_flux_values, end # Used for both the purely hyperbolic conserved variables `u` -# and the viscous flux in x-direction in the 1D parabolic case. -function prolong2boundaries!(cache, u_or_flux_viscous, +# and the parabolic flux in x-direction in the 1D parabolic case. 
+function prolong2boundaries!(cache, u_or_flux_parabolic, mesh::TreeMesh{1}, equations, dg::DG) @unpack boundaries = cache @unpack neighbor_sides = boundaries @@ -513,11 +532,54 @@ function prolong2boundaries!(cache, u_or_flux_viscous, if neighbor_sides[boundary] == 1 # element in -x direction of boundary for v in eachvariable(equations) - boundaries.u[1, v, boundary] = u_or_flux_viscous[v, nnodes(dg), element] + boundaries.u[1, v, boundary] = u_or_flux_parabolic[v, nnodes(dg), + element] end else # Element in +x direction of boundary for v in eachvariable(equations) - boundaries.u[2, v, boundary] = u_or_flux_viscous[v, 1, element] + boundaries.u[2, v, boundary] = u_or_flux_parabolic[v, 1, element] + end + end + end + + return nothing +end + +function prolong2boundaries!(cache, u_or_flux_parabolic, + mesh::TreeMesh{1}, equations, + dg::DGSEM{<:GaussLegendreBasis}) + @unpack boundaries = cache + @unpack neighbor_sides = boundaries + @unpack boundary_interpolation = dg.basis + + @threaded for boundary in eachboundary(dg, cache) + element = boundaries.neighbor_ids[boundary] + + # boundary in x-direction + if neighbor_sides[boundary] == 1 + # element in -x direction of boundary => need to evaluate at right boundary node (+1) + for v in eachvariable(equations) + # Interpolate to the boundaries using a local variable for + # the accumulation of values (to reduce global memory operations). 
+ boundary_u_1 = zero(eltype(boundaries.u)) + for ii in eachnode(dg) + # Not += to allow `@muladd` to turn these into FMAs + # (see comment at the top of the file) + boundary_u_1 = (boundary_u_1 + + u_or_flux_parabolic[v, ii, element] * + boundary_interpolation[ii, 2]) + end + boundaries.u[1, v, boundary] = boundary_u_1 + end + else # Element in +x direction of boundary => need to evaluate at left boundary node (-1) + for v in eachvariable(equations) + boundary_u_2 = zero(eltype(boundaries.u)) + for ii in eachnode(dg) + boundary_u_2 = (boundary_u_2 + + u_or_flux_parabolic[v, ii, element] * + boundary_interpolation[ii, 1]) + end + boundaries.u[2, v, boundary] = boundary_u_2 end end end @@ -616,7 +678,8 @@ function calc_boundary_flux_by_direction!(surface_flux_values::AbstractArray{<:A return nothing end -function calc_surface_integral!(du, u, mesh::Union{TreeMesh{1}, StructuredMesh{1}}, +function calc_surface_integral!(backend::Nothing, du, u, + mesh::Union{TreeMesh{1}, StructuredMesh{1}}, equations, surface_integral::SurfaceIntegralWeakForm, dg::DGSEM, cache) @unpack inverse_weights = dg.basis @@ -646,9 +709,10 @@ function calc_surface_integral!(du, u, mesh::Union{TreeMesh{1}, StructuredMesh{1 return nothing end -function calc_surface_integral!(du, u, mesh::Union{TreeMesh{1}, StructuredMesh{1}}, +function calc_surface_integral!(backend::Nothing, du, u, + mesh::Union{TreeMesh{1}, StructuredMesh{1}}, equations, surface_integral::SurfaceIntegralWeakForm, - dg::DG{<:GaussLegendreBasis}, cache) + dg::DGSEM{<:GaussLegendreBasis}, cache) @unpack boundary_interpolation_inverse_weights = dg.basis @unpack surface_flux_values = cache.elements @@ -660,15 +724,18 @@ function calc_surface_integral!(du, u, mesh::Union{TreeMesh{1}, StructuredMesh{1 # into FMAs (see comment at the top of the file). 
@threaded for element in eachelement(dg, cache) for v in eachvariable(equations) + # Aliases for repeatedly accessed variables + surface_flux_minus = surface_flux_values[v, 1, element] + surface_flux_plus = surface_flux_values[v, 2, element] for ii in eachnode(dg) # surface at -x du[v, ii, element] = (du[v, ii, element] - - surface_flux_values[v, 1, element] * + surface_flux_minus * boundary_interpolation_inverse_weights[ii, 1]) # surface at +x du[v, ii, element] = (du[v, ii, element] + - surface_flux_values[v, 2, element] * + surface_flux_plus * boundary_interpolation_inverse_weights[ii, 2]) end end @@ -677,13 +744,13 @@ function calc_surface_integral!(du, u, mesh::Union{TreeMesh{1}, StructuredMesh{1 return nothing end -function apply_jacobian!(du, mesh::TreeMesh{1}, +function apply_jacobian!(backend::Nothing, du, mesh::TreeMesh{1}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements @threaded for element in eachelement(dg, cache) # Negative sign included to account for the negated surface and volume terms, - # see e.g. the computation of `derivative_hat` in the basis setup and + # see e.g. the computation of `derivative_hat` in the basis setup and # the comment in `calc_surface_integral!`. 
factor = -inverse_jacobian[element] diff --git a/src/solvers/dgsem_tree/dg_1d_parabolic.jl b/src/solvers/dgsem_tree/dg_1d_parabolic.jl index ac06ec4dfac..f234b23eb70 100644 --- a/src/solvers/dgsem_tree/dg_1d_parabolic.jl +++ b/src/solvers/dgsem_tree/dg_1d_parabolic.jl @@ -11,11 +11,11 @@ function create_cache_parabolic(mesh::TreeMesh{1}, equations_hyperbolic::AbstractEquations, dg::DG, n_elements, uEltype) - viscous_container = init_viscous_container_1d(nvariables(equations_hyperbolic), - nnodes(dg), n_elements, - uEltype) + parabolic_container = init_parabolic_container_1d(nvariables(equations_hyperbolic), + nnodes(dg), n_elements, + uEltype) - cache_parabolic = (; viscous_container) + cache_parabolic = (; parabolic_container) return cache_parabolic end @@ -32,10 +32,10 @@ function rhs_parabolic!(du, u, t, mesh::TreeMesh{1}, equations_parabolic::AbstractEquationsParabolic, boundary_conditions_parabolic, source_terms_parabolic, dg::DG, parabolic_scheme, cache, cache_parabolic) - @unpack viscous_container = cache_parabolic - @unpack u_transformed, gradients, flux_viscous = viscous_container + @unpack parabolic_container = cache_parabolic + @unpack u_transformed, gradients, flux_parabolic = parabolic_container - # Convert conservative variables to a form more suitable for viscous flux calculations + # Convert conservative variables to a form more suitable for parabolic flux calculations @trixi_timeit timer() "transform variables" begin transform_variables!(u_transformed, u, mesh, equations_parabolic, dg, cache) @@ -48,20 +48,20 @@ function rhs_parabolic!(du, u, t, mesh::TreeMesh{1}, parabolic_scheme, cache) end - # Compute and store the viscous fluxes - @trixi_timeit timer() "calculate viscous fluxes" begin - calc_viscous_fluxes!(flux_viscous, gradients, u_transformed, mesh, - equations_parabolic, dg, cache) + # Compute and store the parabolic fluxes + @trixi_timeit timer() "calculate parabolic fluxes" begin + calc_parabolic_fluxes!(flux_parabolic, gradients, 
u_transformed, mesh, + equations_parabolic, dg, cache) end # The remainder of this function is essentially a regular rhs! for - # parabolic equations (i.e., it computes the divergence of the viscous fluxes) + # parabolic equations (i.e., it computes the divergence of the parabolic fluxes) # - # OBS! In `calc_viscous_fluxes!`, the viscous flux values at the volume nodes of each element have - # been computed and stored in `fluxes_viscous`. In the following, we *reuse* (abuse) the + # OBS! In `calc_parabolic_fluxes!`, the parabolic flux values at the volume nodes of each element have + # been computed and stored in `flux_parabolic`. In the following, we *reuse* (abuse) the # `interfaces` and `boundaries` containers in `cache` to interpolate and store the # *fluxes* at the element surfaces, as opposed to interpolating and storing the *solution* (as it - # is done in the hyperbolic operator). That is, `interfaces.u`/`boundaries.u` store *viscous flux values* + # is done in the hyperbolic operator). That is, `interfaces.u`/`boundaries.u` store *parabolic flux values* # and *not the solution*. The advantage is that a) we do not need to allocate more storage, b) we # do not need to recreate the existing data structure only with a different name, and c) we do not # need to interpolate solutions *and* gradients to the surfaces. @@ -70,19 +70,19 @@ function rhs_parabolic!(du, u, t, mesh::TreeMesh{1}, @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache) # Calculate volume integral - # This calls the specialized version for the viscous flux. + # This calls the specialized version for the parabolic flux. @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, flux_viscous, mesh, equations_parabolic, dg, cache) + calc_volume_integral!(du, flux_parabolic, mesh, equations_parabolic, dg, cache) end # Prolong solution to interfaces # This reuses `prolong2interfaces!` for the purely hyperbolic case. 
@trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, flux_viscous, mesh, equations_parabolic, dg) + prolong2interfaces!(cache, flux_parabolic, mesh, equations_parabolic, dg) end # Calculate interface fluxes. - # This calls the specialized version for the viscous flux. + # This calls the specialized version for the parabolic flux. @trixi_timeit timer() "interface flux" begin calc_interface_flux!(cache.elements.surface_flux_values, mesh, equations_parabolic, dg, @@ -92,7 +92,7 @@ function rhs_parabolic!(du, u, t, mesh::TreeMesh{1}, # Prolong solution to boundaries. # This reuses `prolong2boundaries!` for the purely hyperbolic case. @trixi_timeit timer() "prolong2boundaries" begin - prolong2boundaries!(cache, flux_viscous, mesh, equations_parabolic, dg) + prolong2boundaries!(cache, flux_parabolic, mesh, equations_parabolic, dg) end # Calculate boundary fluxes. @@ -107,7 +107,7 @@ function rhs_parabolic!(du, u, t, mesh::TreeMesh{1}, # Calculate surface integrals. # This reuses `calc_surface_integral!` for the purely hyperbolic case. @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations_parabolic, + calc_surface_integral!(nothing, du, u, mesh, equations_parabolic, dg.surface_integral, dg, cache) end @@ -146,10 +146,10 @@ function transform_variables!(u_transformed, u, mesh::TreeMesh{1}, return nothing end -# This is the version used when calculating the divergence of the viscous fluxes. +# This is the version used when calculating the divergence of the parabolic fluxes. 
# Identical to weak-form volume integral/kernel for the purely hyperbolic case, -# except that the fluxes are here already precomputed in `calc_viscous_fluxes!` -function calc_volume_integral!(du, flux_viscous, mesh::TreeMesh{1}, +# except that the fluxes are here already precomputed in `calc_parabolic_fluxes!` +function calc_volume_integral!(du, flux_parabolic, mesh::TreeMesh{1}, equations_parabolic::AbstractEquationsParabolic, dg::DGSEM, cache) @unpack derivative_hat = dg.basis @@ -157,7 +157,7 @@ function calc_volume_integral!(du, flux_viscous, mesh::TreeMesh{1}, @threaded for element in eachelement(dg, cache) # Calculate volume terms in one element for i in eachnode(dg) - flux_1_node = get_node_vars(flux_viscous, equations_parabolic, dg, i, + flux_1_node = get_node_vars(flux_parabolic, equations_parabolic, dg, i, element) for ii in eachnode(dg) @@ -170,7 +170,7 @@ function calc_volume_integral!(du, flux_viscous, mesh::TreeMesh{1}, return nothing end -# This is the version used when calculating the divergence of the viscous fluxes +# This is the version used when calculating the divergence of the parabolic fluxes function calc_interface_flux!(surface_flux_values, mesh::TreeMesh{1}, equations_parabolic, dg::DG, parabolic_scheme, cache) @@ -205,9 +205,10 @@ function calc_interface_flux!(surface_flux_values, mesh::TreeMesh{1}, return nothing end -function calc_viscous_fluxes!(flux_viscous, gradients, u_transformed, mesh::TreeMesh{1}, - equations_parabolic::AbstractEquationsParabolic, - dg::DG, cache) +function calc_parabolic_fluxes!(flux_parabolic, gradients, u_transformed, + mesh::TreeMesh{1}, + equations_parabolic::AbstractEquationsParabolic, + dg::DG, cache) @threaded for element in eachelement(dg, cache) for i in eachnode(dg) # Get solution and gradients @@ -215,10 +216,10 @@ function calc_viscous_fluxes!(flux_viscous, gradients, u_transformed, mesh::Tree gradients_1_node = get_node_vars(gradients, equations_parabolic, dg, i, element) - # Calculate viscous flux 
and store each component for later use - flux_viscous_node = flux(u_node, (gradients_1_node,), 1, - equations_parabolic) - set_node_vars!(flux_viscous, flux_viscous_node, equations_parabolic, dg, + # Calculate parabolic flux and store each component for later use + flux_parabolic_node = flux(u_node, (gradients_1_node,), 1, + equations_parabolic) + set_node_vars!(flux_parabolic, flux_parabolic_node, equations_parabolic, dg, i, element) end end @@ -348,14 +349,14 @@ function calc_boundary_flux_by_direction_divergence!(surface_flux_values::Abstra @unpack surface_flux = surface_integral # Note: cache.boundaries.u contains the unsigned normal component (using "orientation", not "direction") - # of the viscous flux, as computed in `prolong2boundaries!` + # of the parabolic flux, as computed in `prolong2boundaries!` @unpack u, neighbor_ids, neighbor_sides, node_coordinates, orientations = cache.boundaries @threaded for boundary in first_boundary:last_boundary # Get neighboring element neighbor = neighbor_ids[boundary] - # Get viscous boundary fluxes + # Get parabolic boundary fluxes flux_ll, flux_rr = get_surface_node_vars(u, equations_parabolic, dg, boundary) if neighbor_sides[boundary] == 1 # Element is on the left, boundary on the right flux_inner = flux_ll @@ -470,6 +471,40 @@ function calc_surface_integral_gradient!(gradients, return nothing end +function calc_surface_integral_gradient!(gradients, + mesh::TreeMesh{1}, # for dispatch only + equations_parabolic::AbstractEquationsParabolic, + dg::DGSEM{<:GaussLegendreBasis}, cache) + @unpack boundary_interpolation_inverse_weights = dg.basis + @unpack surface_flux_values = cache.elements + + # Note that all fluxes have been computed with outward-pointing normal vectors. + # We also use explicit assignments instead of `+=` to let `@muladd` turn these + # into FMAs (see comment at the top of the file). 
+ @threaded for element in eachelement(dg, cache) + for v in eachvariable(equations_parabolic) + # Aliases for repeatedly accessed variables + surface_flux_minus = surface_flux_values[v, 1, element] + surface_flux_plus = surface_flux_values[v, 2, element] + for ii in eachnode(dg) + # surface at -x + gradients[v, ii, element] = (gradients[v, ii, element] - + surface_flux_minus * + boundary_interpolation_inverse_weights[ii, + 1]) + + # surface at +x + gradients[v, ii, element] = (gradients[v, ii, element] + + surface_flux_plus * + boundary_interpolation_inverse_weights[ii, + 2]) + end + end + end + + return nothing +end + # Calculate the gradient of the transformed variables function calc_gradient!(gradients, u_transformed, t, mesh::TreeMesh{1}, equations_parabolic, boundary_conditions_parabolic, @@ -532,7 +567,7 @@ end # Needed to *not* flip the sign of the inverse Jacobian. # This is because the parabolic fluxes are assumed to be of the form # `du/dt + df/dx = dg/dx + source(x,t)`, -# where f(u) is the inviscid flux and g(u) is the viscous flux. +# where f(u) is the inviscid flux and g(u) is the parabolic flux. function apply_jacobian_parabolic!(du::AbstractArray, mesh::TreeMesh{1}, equations_parabolic::AbstractEquationsParabolic, dg::DG, cache) diff --git a/src/solvers/dgsem_tree/dg_2d.jl b/src/solvers/dgsem_tree/dg_2d.jl index 0ae1bd3a37d..2cb37ac4457 100644 --- a/src/solvers/dgsem_tree/dg_2d.jl +++ b/src/solvers/dgsem_tree/dg_2d.jl @@ -20,7 +20,7 @@ function create_cache(mesh::Union{TreeMesh{2}, TreeMesh{3}}, equations, interfaces = init_interfaces(leaf_cell_ids, mesh, elements) - boundaries = init_boundaries(leaf_cell_ids, mesh, elements) + boundaries = init_boundaries(leaf_cell_ids, mesh, elements, dg.basis) mortars = init_mortars(leaf_cell_ids, mesh, elements, dg.mortar) @@ -100,81 +100,91 @@ end # TODO: Taal discuss/refactor timer, allowing users to pass a custom timer? 
+# This function is valid for all non-conforming mesh types, i.e., +# all meshes that do involve mortar operations. +# Thus, we can use it for the serial (i.e., non-distributed memory parallelized) +# 2D/3D `TreeMesh`es, `P4estMesh`es, and `T8codeMesh`es. function rhs!(du, u, t, mesh::Union{TreeMesh{2}, P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}, TreeMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} + backend = trixi_backend(u) + # Reset du - @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache) + @trixi_timeit_ext backend timer() "reset ∂u/∂t" begin + set_zero!(du, dg, cache) + end # Calculate volume integral - @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, u, mesh, + @trixi_timeit_ext backend timer() "volume integral" begin + calc_volume_integral!(backend, du, u, mesh, have_nonconservative_terms(equations), equations, dg.volume_integral, dg, cache, t, boundary_conditions) end # Prolong solution to interfaces - @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, u, mesh, equations, dg) + @trixi_timeit_ext backend timer() "prolong2interfaces" begin + prolong2interfaces!(backend, cache, u, mesh, equations, dg) end # Calculate interface fluxes - @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache.elements.surface_flux_values, mesh, + @trixi_timeit_ext backend timer() "interface flux" begin + calc_interface_flux!(backend, cache.elements.surface_flux_values, mesh, have_nonconservative_terms(equations), equations, dg.surface_integral, dg, cache) end # Prolong solution to boundaries - @trixi_timeit timer() "prolong2boundaries" begin + @trixi_timeit_ext backend timer() "prolong2boundaries" begin prolong2boundaries!(cache, u, mesh, equations, dg) end # Calculate boundary fluxes - @trixi_timeit timer() "boundary flux" begin + @trixi_timeit_ext backend timer() "boundary flux" begin calc_boundary_flux!(cache, t, 
boundary_conditions, mesh, equations, dg.surface_integral, dg) end # Prolong solution to mortars - @trixi_timeit timer() "prolong2mortars" begin + @trixi_timeit_ext backend timer() "prolong2mortars" begin prolong2mortars!(cache, u, mesh, equations, dg.mortar, dg) end # Calculate mortar fluxes - @trixi_timeit timer() "mortar flux" begin + @trixi_timeit_ext backend timer() "mortar flux" begin calc_mortar_flux!(cache.elements.surface_flux_values, mesh, have_nonconservative_terms(equations), equations, dg.mortar, dg.surface_integral, dg, cache) end # Calculate surface integrals - @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, + @trixi_timeit_ext backend timer() "surface integral" begin + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, cache) end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit_ext backend timer() "Jacobian" begin + apply_jacobian!(backend, du, mesh, equations, dg, cache) + end # Calculate source terms - @trixi_timeit timer() "source terms" begin + @trixi_timeit_ext backend timer() "source terms" begin calc_sources!(du, u, t, source_terms, equations, dg, cache) end return nothing end -function calc_volume_integral!(du, u, mesh, +function calc_volume_integral!(backend, du, u, mesh, nonconservative_terms, equations, volume_integral::AbstractVolumeIntegral, dg, cache, t, boundary_conditions) - calc_volume_integral!(du, u, mesh, + calc_volume_integral!(backend, du, u, mesh, nonconservative_terms, equations, volume_integral, dg, cache) @@ -189,7 +199,7 @@ This treatment is required to achieve, e.g., entropy-stability or well-balancedn See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-1765644064 =# @inline function weak_form_kernel!(du, u, - element, mesh::TreeMesh{2}, + element, ::Type{<:TreeMesh{2}}, have_nonconservative_terms::False, equations, 
dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -216,7 +226,7 @@ See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-17 return nothing end -@inline function flux_differencing_kernel!(du, u, element, mesh::TreeMesh{2}, +@inline function flux_differencing_kernel!(du, u, element, ::Type{<:TreeMesh{2}}, have_nonconservative_terms::False, equations, volume_flux, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -254,7 +264,7 @@ end end end -@inline function flux_differencing_kernel!(du, u, element, mesh::TreeMesh{2}, +@inline function flux_differencing_kernel!(du, u, element, MeshT::Type{<:TreeMesh{2}}, have_nonconservative_terms::True, equations, volume_flux, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -263,7 +273,7 @@ end symmetric_flux, nonconservative_flux = volume_flux # Apply the symmetric flux as usual - flux_differencing_kernel!(du, u, element, mesh, False(), equations, symmetric_flux, + flux_differencing_kernel!(du, u, element, MeshT, False(), equations, symmetric_flux, dg, cache, alpha) # Calculate the remaining volume terms using the nonsymmetric generalized flux @@ -297,9 +307,9 @@ end end @inline function fvO2_kernel!(du, u, - mesh::Union{TreeMesh{2}, StructuredMesh{2}, - UnstructuredMesh2D, P4estMesh{2}, - T8codeMesh{2}}, + MeshT::Type{<:Union{TreeMesh{2}, StructuredMesh{2}, + UnstructuredMesh2D, P4estMesh{2}, + T8codeMesh{2}}}, have_nonconservative_terms, equations, volume_flux_fv, dg::DGSEM, cache, element, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -313,7 +323,7 @@ end fstar2_L = fstar2_L_threaded[Threads.threadid()] fstar1_R = fstar1_R_threaded[Threads.threadid()] fstar2_R = fstar2_R_threaded[Threads.threadid()] - calcflux_fvO2!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, mesh, + calcflux_fvO2!(fstar1_L, 
fstar1_R, fstar2_L, fstar2_R, u, MeshT, have_nonconservative_terms, equations, volume_flux_fv, dg, element, cache, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -334,7 +344,7 @@ end end @inline function calcflux_fvO2!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::TreeMesh{2}, + ::Type{<:TreeMesh{2}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -401,9 +411,9 @@ end end @inline function fv_kernel!(du, u, - mesh::Union{TreeMesh{2}, StructuredMesh{2}, - UnstructuredMesh2D, P4estMesh{2}, - T8codeMesh{2}}, + MeshT::Type{<:Union{TreeMesh{2}, StructuredMesh{2}, + UnstructuredMesh2D, P4estMesh{2}, + T8codeMesh{2}}}, have_nonconservative_terms, equations, volume_flux_fv, dg::DGSEM, cache, element, alpha = true) @unpack fstar1_L_threaded, fstar1_R_threaded, fstar2_L_threaded, fstar2_R_threaded = cache @@ -414,7 +424,7 @@ end fstar2_L = fstar2_L_threaded[Threads.threadid()] fstar1_R = fstar1_R_threaded[Threads.threadid()] fstar2_R = fstar2_R_threaded[Threads.threadid()] - calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, mesh, + calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, MeshT, have_nonconservative_terms, equations, volume_flux_fv, dg, element, cache) @@ -437,7 +447,7 @@ end # "A provably entropy stable subcell shock capturing approach for high order split form DG for the compressible Euler equations" # [arXiv: 2008.12044v2](https://arxiv.org/pdf/2008.12044) @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::TreeMesh{2}, + ::Type{<:TreeMesh{2}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache) for j in eachnode(dg), i in 2:nnodes(dg) @@ -460,7 +470,7 @@ end end @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::TreeMesh{2}, + ::Type{<:TreeMesh{2}}, have_nonconservative_terms::True, equations, volume_flux_fv, dg::DGSEM, element, cache) 
volume_flux, nonconservative_flux = volume_flux_fv @@ -508,7 +518,8 @@ end return nothing end -function prolong2interfaces!(cache, u, mesh::TreeMesh{2}, equations, dg::DG) +function prolong2interfaces!(backend::Nothing, cache, u, mesh::TreeMesh{2}, equations, + dg::DG) @unpack interfaces = cache @unpack orientations, neighbor_ids = interfaces interfaces_u = interfaces.u @@ -535,8 +546,8 @@ function prolong2interfaces!(cache, u, mesh::TreeMesh{2}, equations, dg::DG) return nothing end -function prolong2interfaces!(cache, u, mesh::TreeMesh{2}, equations, - dg::DG{<:GaussLegendreBasis}) +function prolong2interfaces!(backend::Nothing, cache, u, mesh::TreeMesh{2}, equations, + dg::DGSEM{<:GaussLegendreBasis}) @unpack interfaces = cache @unpack orientations, neighbor_ids = interfaces @unpack boundary_interpolation = dg.basis @@ -548,26 +559,42 @@ function prolong2interfaces!(cache, u, mesh::TreeMesh{2}, equations, if orientations[interface] == 1 # interface in x-direction - for j in eachnode(dg), v in eachvariable(equations) - interfaces_u[1, v, j, interface] = zero(eltype(interfaces_u)) - interfaces_u[2, v, j, interface] = zero(eltype(interfaces_u)) - for ii in eachnode(dg) - interfaces_u[1, v, j, interface] += (u[v, ii, j, left_element] * - boundary_interpolation[ii, 2]) - interfaces_u[2, v, j, interface] += (u[v, ii, j, right_element] * - boundary_interpolation[ii, 1]) + for j in eachnode(dg) + for v in eachvariable(equations) + # Interpolate to the interfaces using a local variable for + # the accumulation of values (to reduce global memory operations). 
+ interface_u_1 = zero(eltype(interfaces_u)) + interface_u_2 = zero(eltype(interfaces_u)) + for ii in eachnode(dg) + # Not += to allow `@muladd` to turn these into FMAs + # (see comment at the top of the file) + interface_u_1 = (interface_u_1 + + u[v, ii, j, left_element] * + boundary_interpolation[ii, 2]) + interface_u_2 = (interface_u_2 + + u[v, ii, j, right_element] * + boundary_interpolation[ii, 1]) + end + interfaces_u[1, v, j, interface] = interface_u_1 + interfaces_u[2, v, j, interface] = interface_u_2 end end else # if orientations[interface] == 2 # interface in y-direction - for i in eachnode(dg), v in eachvariable(equations) - interfaces_u[1, v, i, interface] = zero(eltype(interfaces_u)) - interfaces_u[2, v, i, interface] = zero(eltype(interfaces_u)) - for jj in eachnode(dg) - interfaces_u[1, v, i, interface] += (u[v, i, jj, left_element] * - boundary_interpolation[jj, 2]) - interfaces_u[2, v, i, interface] += (u[v, i, jj, right_element] * - boundary_interpolation[jj, 1]) + for i in eachnode(dg) + for v in eachvariable(equations) + interface_u_1 = zero(eltype(interfaces_u)) + interface_u_2 = zero(eltype(interfaces_u)) + for jj in eachnode(dg) + interface_u_1 = (interface_u_1 + + u[v, i, jj, left_element] * + boundary_interpolation[jj, 2]) + interface_u_2 = (interface_u_2 + + u[v, i, jj, right_element] * + boundary_interpolation[jj, 1]) + end + interfaces_u[1, v, i, interface] = interface_u_1 + interfaces_u[2, v, i, interface] = interface_u_2 end end end @@ -576,7 +603,7 @@ function prolong2interfaces!(cache, u, mesh::TreeMesh{2}, equations, return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{2}, have_nonconservative_terms::False, equations, surface_integral, dg::DG, cache) @@ -610,7 +637,7 @@ function calc_interface_flux!(surface_flux_values, return nothing end -function calc_interface_flux!(surface_flux_values, +function 
calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{2}, have_nonconservative_terms::True, equations, surface_integral, dg::DG, cache) @@ -695,6 +722,82 @@ function prolong2boundaries!(cache, u, return nothing end +function prolong2boundaries!(cache, u, + mesh::TreeMesh{2}, equations, + dg::DGSEM{<:GaussLegendreBasis}) + @unpack boundaries = cache + @unpack orientations, neighbor_sides = boundaries + @unpack boundary_interpolation = dg.basis + + @threaded for boundary in eachboundary(dg, cache) + element = boundaries.neighbor_ids[boundary] + + if orientations[boundary] == 1 + # boundary in x-direction + if neighbor_sides[boundary] == 1 + # element in -x direction of boundary => interpolate to right boundary node (+1) + for l in eachnode(dg) + for v in eachvariable(equations) + # Interpolate to the boundaries using a local variable for + # the accumulation of values (to reduce global memory operations). + boundary_u = zero(eltype(boundaries.u)) + for ii in eachnode(dg) + # Not += to allow `@muladd` to turn these into FMAs + # (see comment at the top of the file) + boundary_u = (boundary_u + + u[v, ii, l, element] * + boundary_interpolation[ii, 2]) + end + boundaries.u[1, v, l, boundary] = boundary_u + end + end + else # element in +x direction of boundary => interpolate to left boundary node (-1) + for l in eachnode(dg) + for v in eachvariable(equations) + boundary_u = zero(eltype(boundaries.u)) + for ii in eachnode(dg) + boundary_u = (boundary_u + + u[v, ii, l, element] * + boundary_interpolation[ii, 1]) + end + boundaries.u[2, v, l, boundary] = boundary_u + end + end + end + else # if orientations[boundary] == 2 + # boundary in y-direction + if neighbor_sides[boundary] == 1 + # element in -y direction of boundary => interpolate to right boundary node (+1) + for l in eachnode(dg) + for v in eachvariable(equations) + boundary_u = zero(eltype(boundaries.u)) + for jj in eachnode(dg) + boundary_u = (boundary_u + + u[v, l, jj, element] * + 
boundary_interpolation[jj, 2]) + end + boundaries.u[1, v, l, boundary] = boundary_u + end + end + else # element in +y direction of boundary => interpolate to left boundary node (-1) + for l in eachnode(dg) + for v in eachvariable(equations) + boundary_u = zero(eltype(boundaries.u)) + for jj in eachnode(dg) + boundary_u = (boundary_u + + u[v, l, jj, element] * + boundary_interpolation[jj, 1]) + end + boundaries.u[2, v, l, boundary] = boundary_u + end + end + end + end + end + + return nothing +end + function calc_boundary_flux!(cache, t, boundary_conditions::NamedTuple, mesh::TreeMesh{2}, equations, surface_integral, dg::DG) @unpack surface_flux_values = cache.elements @@ -1041,6 +1144,16 @@ function calc_mortar_flux!(surface_flux_values, return nothing end +# For Gauss-Legendre DGSEM mortars are not yet implemented +function calc_mortar_flux!(surface_flux_values, + mesh::TreeMesh{2}, + have_nonconservative_terms, equations, + mortar::Nothing, surface_integral, + dg::DGSEM{<:GaussLegendreBasis}, cache) + @assert isempty(eachmortar(dg, cache)) + return nothing +end + @inline function calc_fstar!(destination::AbstractArray{<:Any, 2}, equations, surface_flux, dg::DGSEM, u_interfaces, interface, orientation) @@ -1132,7 +1245,7 @@ end return nothing end -function calc_surface_integral!(du, u, +function calc_surface_integral!(backend::Nothing, du, u, mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}}, equations, surface_integral::SurfaceIntegralWeakForm, @@ -1176,11 +1289,11 @@ function calc_surface_integral!(du, u, return nothing end -function calc_surface_integral!(du, u, +function calc_surface_integral!(backend::Nothing, du, u, mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}}, equations, surface_integral::SurfaceIntegralWeakForm, - dg::DG{<:GaussLegendreBasis}, cache) + dg::DGSEM{<:GaussLegendreBasis}, cache) @unpack boundary_interpolation_inverse_weights = dg.basis @unpack surface_flux_values = cache.elements @@ -1193,30 +1306,35 @@ 
function calc_surface_integral!(du, u, @threaded for element in eachelement(dg, cache) for l in eachnode(dg) for v in eachvariable(equations) + # Aliases for repeatedly accessed variables + surface_flux_minus = surface_flux_values[v, l, 1, element] + surface_flux_plus = surface_flux_values[v, l, 2, element] for ii in eachnode(dg) # surface at -x du[v, ii, l, element] = (du[v, ii, l, element] - - surface_flux_values[v, l, 1, element] * + surface_flux_minus * boundary_interpolation_inverse_weights[ii, 1]) # surface at +x du[v, ii, l, element] = (du[v, ii, l, element] + - surface_flux_values[v, l, 2, element] * + surface_flux_plus * boundary_interpolation_inverse_weights[ii, 2]) end + surface_flux_minus = surface_flux_values[v, l, 3, element] + surface_flux_plus = surface_flux_values[v, l, 4, element] for jj in eachnode(dg) # surface at -y du[v, l, jj, element] = (du[v, l, jj, element] - - surface_flux_values[v, l, 3, element] * + surface_flux_minus * boundary_interpolation_inverse_weights[jj, 1]) # surface at +y du[v, l, jj, element] = (du[v, l, jj, element] + - surface_flux_values[v, l, 4, element] * + surface_flux_plus * boundary_interpolation_inverse_weights[jj, 2]) end @@ -1227,13 +1345,13 @@ function calc_surface_integral!(du, u, return nothing end -function apply_jacobian!(du, mesh::TreeMesh{2}, +function apply_jacobian!(backend::Nothing, du, mesh::TreeMesh{2}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements @threaded for element in eachelement(dg, cache) # Negative sign included to account for the negated surface and volume terms, - # see e.g. the computation of `derivative_hat` in the basis setup and + # see e.g. the computation of `derivative_hat` in the basis setup and # the comment in `calc_surface_integral!`. 
factor = -inverse_jacobian[element] diff --git a/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl b/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl index 51a5897b065..507b48b20ea 100644 --- a/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl +++ b/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl @@ -65,7 +65,7 @@ end # muladd # if LoopVectorization.jl can handle the array types. This ensures that `@turbo` # works efficiently here. @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, - element, mesh::TreeMesh{2}, + element, MeshT::Type{<:TreeMesh{2}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_shima_etal_turbo), @@ -76,13 +76,13 @@ end # muladd # indices `[i, j, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims(MeshT))..., StaticInt(nvariables(equations)))) @turbo for j in eachnode(dg), i in eachnode(dg) @@ -227,7 +227,7 @@ end # muladd end @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, - element, mesh::TreeMesh{2}, + element, MeshT::Type{<:TreeMesh{2}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_ranocha_turbo), @@ -238,7 +238,7 @@ end # indices `[i, j, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. 
du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. In addition @@ -247,7 +247,7 @@ end # values. u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims(MeshT))..., StaticInt(nvariables(equations) + 2))) # We also compute "+ 2" logs @turbo for j in eachnode(dg), i in eachnode(dg) diff --git a/src/solvers/dgsem_tree/dg_2d_parabolic.jl b/src/solvers/dgsem_tree/dg_2d_parabolic.jl index d7e452a97db..bd2d4e5173b 100644 --- a/src/solvers/dgsem_tree/dg_2d_parabolic.jl +++ b/src/solvers/dgsem_tree/dg_2d_parabolic.jl @@ -11,11 +11,11 @@ function create_cache_parabolic(mesh::Union{TreeMesh{2}, P4estMesh{2}}, equations_hyperbolic::AbstractEquations, dg::DG, n_elements, uEltype) - viscous_container = init_viscous_container_2d(nvariables(equations_hyperbolic), - nnodes(dg), n_elements, - uEltype) + parabolic_container = init_parabolic_container_2d(nvariables(equations_hyperbolic), + nnodes(dg), n_elements, + uEltype) - cache_parabolic = (; viscous_container) + cache_parabolic = (; parabolic_container) return cache_parabolic end @@ -32,10 +32,10 @@ function rhs_parabolic!(du, u, t, mesh::Union{TreeMesh{2}, TreeMesh{3}}, equations_parabolic::AbstractEquationsParabolic, boundary_conditions_parabolic, source_terms_parabolic, dg::DG, parabolic_scheme, cache, cache_parabolic) - @unpack viscous_container = cache_parabolic - @unpack u_transformed, gradients, flux_viscous = viscous_container + @unpack parabolic_container = cache_parabolic + @unpack u_transformed, gradients, flux_parabolic = parabolic_container - # Convert conservative variables to a form more suitable for viscous flux calculations + # Convert conservative variables to a form more suitable for parabolic flux calculations @trixi_timeit timer() "transform variables" begin 
transform_variables!(u_transformed, u, mesh, equations_parabolic, dg, cache) @@ -48,20 +48,20 @@ function rhs_parabolic!(du, u, t, mesh::Union{TreeMesh{2}, TreeMesh{3}}, dg, parabolic_scheme, cache) end - # Compute and store the viscous fluxes - @trixi_timeit timer() "calculate viscous fluxes" begin - calc_viscous_fluxes!(flux_viscous, gradients, u_transformed, mesh, - equations_parabolic, dg, cache) + # Compute and store the parabolic fluxes + @trixi_timeit timer() "calculate parabolic fluxes" begin + calc_parabolic_fluxes!(flux_parabolic, gradients, u_transformed, mesh, + equations_parabolic, dg, cache) end # The remainder of this function is essentially a regular rhs! for parabolic - # equations (i.e., it computes the divergence of the viscous fluxes) + # equations (i.e., it computes the divergence of the parabolic fluxes) # - # OBS! In `calc_viscous_fluxes!`, the viscous flux values at the volume nodes of each element have - # been computed and stored in `fluxes_viscous`. In the following, we *reuse* (abuse) the + # OBS! In `calc_parabolic_fluxes!`, the parabolic flux values at the volume nodes of each element have + # been computed and stored in `flux_parabolic`. In the following, we *reuse* (abuse) the # `interfaces` and `boundaries` containers in `cache` to interpolate and store the # *fluxes* at the element surfaces, as opposed to interpolating and storing the *solution* (as it - # is done in the hyperbolic operator). That is, `interfaces.u`/`boundaries.u` store *viscous flux values* + # is done in the hyperbolic operator). That is, `interfaces.u`/`boundaries.u` store *parabolic flux values* # and *not the solution*. The advantage is that a) we do not need to allocate more storage, b) we # do not need to recreate the existing data structure only with a different name, and c) we do not # need to interpolate solutions *and* gradients to the surfaces. 
@@ -70,21 +70,21 @@ function rhs_parabolic!(du, u, t, mesh::Union{TreeMesh{2}, TreeMesh{3}}, @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache) # Calculate volume integral. - # This calls the specialized version for the viscous fluxes from + # This calls the specialized version for the parabolic fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, flux_viscous, mesh, equations_parabolic, dg, cache) + calc_volume_integral!(du, flux_parabolic, mesh, equations_parabolic, dg, cache) end # Prolong solution to interfaces. - # This calls the specialized version for the viscous fluxes from + # This calls the specialized version for the parabolic fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, flux_viscous, mesh, equations_parabolic, dg) + prolong2interfaces!(cache, flux_parabolic, mesh, equations_parabolic, dg) end # Calculate interface fluxes - # This calls the specialized version for the viscous fluxes from + # This calls the specialized version for the parabolic fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "interface flux" begin calc_interface_flux!(cache.elements.surface_flux_values, @@ -92,11 +92,11 @@ function rhs_parabolic!(du, u, t, mesh::Union{TreeMesh{2}, TreeMesh{3}}, parabolic_scheme, cache) end - # Prolong viscous fluxes to boundaries. - # This calls the specialized version for the viscous fluxes from + # Prolong parabolic fluxes to boundaries. + # This calls the specialized version for the parabolic fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "prolong2boundaries" begin - prolong2boundaries!(cache, flux_viscous, mesh, equations_parabolic, dg) + prolong2boundaries!(cache, flux_parabolic, mesh, equations_parabolic, dg) end # Calculate boundary fluxes. 
@@ -108,11 +108,11 @@ function rhs_parabolic!(du, u, t, mesh::Union{TreeMesh{2}, TreeMesh{3}}, dg.surface_integral, dg) end - # Prolong viscous fluxes to mortars. - # This calls the specialized version for the viscous fluxes from + # Prolong parabolic fluxes to mortars. + # This calls the specialized version for the parabolic fluxes from # `dg_2d_parabolic.jl` or `dg_3d_parabolic.jl`. @trixi_timeit timer() "prolong2mortars" begin - prolong2mortars!(cache, flux_viscous, mesh, equations_parabolic, + prolong2mortars!(cache, flux_parabolic, mesh, equations_parabolic, dg.mortar, dg) end @@ -128,7 +128,7 @@ function rhs_parabolic!(du, u, t, mesh::Union{TreeMesh{2}, TreeMesh{3}}, # Calculate surface integrals. # This reuses `calc_surface_integral!` for the purely hyperbolic case. @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations_parabolic, + calc_surface_integral!(nothing, du, u, mesh, equations_parabolic, dg.surface_integral, dg, cache) end @@ -166,21 +166,21 @@ function transform_variables!(u_transformed, u, mesh::Union{TreeMesh{2}, P4estMe return nothing end -# This is the version used when calculating the divergence of the viscous fluxes. +# This is the version used when calculating the divergence of the parabolic fluxes. 
# Identical to weak-form volume integral/kernel for the purely hyperbolic case, -# except that the fluxes are here already precomputed in `calc_viscous_fluxes!` -function calc_volume_integral!(du, flux_viscous, mesh::TreeMesh{2}, +# except that the fluxes are here already precomputed in `calc_parabolic_fluxes!` +function calc_volume_integral!(du, flux_parabolic, mesh::TreeMesh{2}, equations_parabolic::AbstractEquationsParabolic, dg::DGSEM, cache) @unpack derivative_hat = dg.basis - flux_viscous_x, flux_viscous_y = flux_viscous + flux_parabolic_x, flux_parabolic_y = flux_parabolic @threaded for element in eachelement(dg, cache) # Calculate volume terms in one element for j in eachnode(dg), i in eachnode(dg) - flux_1_node = get_node_vars(flux_viscous_x, equations_parabolic, dg, + flux_1_node = get_node_vars(flux_parabolic_x, equations_parabolic, dg, i, j, element) - flux_2_node = get_node_vars(flux_viscous_y, equations_parabolic, dg, + flux_2_node = get_node_vars(flux_parabolic_y, equations_parabolic, dg, i, j, element) for ii in eachnode(dg) @@ -198,19 +198,21 @@ function calc_volume_integral!(du, flux_viscous, mesh::TreeMesh{2}, return nothing end -# This is the version used when calculating the divergence of the viscous fluxes. -# Specialization `flux_viscous::Tuple` needed to +# This is the version used when calculating the divergence of the parabolic fluxes. +# Specialization `flux_parabolic::Tuple` needed to # avoid amibiguity with the hyperbolic version of `prolong2interfaces!` in dg_2d.jl # which is for the variables itself, i.e., `u::Array{uEltype, 4}`. -function prolong2interfaces!(cache, flux_viscous::Tuple, +function prolong2interfaces!(cache, flux_parabolic::Tuple, mesh::TreeMesh{2}, equations_parabolic::AbstractEquationsParabolic, dg::DG) @unpack interfaces = cache @unpack orientations, neighbor_ids = interfaces + + # OBS! `interfaces_u` stores the interpolated *fluxes* and *not the solution*! 
interfaces_u = interfaces.u - flux_viscous_x, flux_viscous_y = flux_viscous + flux_parabolic_x, flux_parabolic_y = flux_parabolic @threaded for interface in eachinterface(dg, cache) left_element = neighbor_ids[1, interface] @@ -219,20 +221,18 @@ function prolong2interfaces!(cache, flux_viscous::Tuple, if orientations[interface] == 1 # interface in x-direction for j in eachnode(dg), v in eachvariable(equations_parabolic) - # OBS! `interfaces_u` stores the interpolated *fluxes* and *not the solution*! - interfaces_u[1, v, j, interface] = flux_viscous_x[v, nnodes(dg), j, - left_element] - interfaces_u[2, v, j, interface] = flux_viscous_x[v, 1, j, - right_element] + interfaces_u[1, v, j, interface] = flux_parabolic_x[v, nnodes(dg), j, + left_element] + interfaces_u[2, v, j, interface] = flux_parabolic_x[v, 1, j, + right_element] end else # if orientations[interface] == 2 # interface in y-direction for i in eachnode(dg), v in eachvariable(equations_parabolic) - # OBS! `interfaces_u` stores the interpolated *fluxes* and *not the solution*! - interfaces_u[1, v, i, interface] = flux_viscous_y[v, i, nnodes(dg), - left_element] - interfaces_u[2, v, i, interface] = flux_viscous_y[v, i, 1, - right_element] + interfaces_u[1, v, i, interface] = flux_parabolic_y[v, i, nnodes(dg), + left_element] + interfaces_u[2, v, i, interface] = flux_parabolic_y[v, i, 1, + right_element] end end end @@ -240,7 +240,78 @@ function prolong2interfaces!(cache, flux_viscous::Tuple, return nothing end -# This is the version used when calculating the divergence of the viscous fluxes +# This is the version used when calculating the divergence of the parabolic fluxes. +# Specialization `flux_parabolic::Tuple` needed to +# avoid amibiguity with the hyperbolic version of `prolong2interfaces!` in dg_2d.jl +# which is for the variables itself, i.e., `u::Array{uEltype, 4}`. 
+function prolong2interfaces!(cache, flux_parabolic::Tuple, + mesh::TreeMesh{2}, + equations_parabolic::AbstractEquationsParabolic, + dg::DGSEM{<:GaussLegendreBasis}) + @unpack interfaces = cache + @unpack orientations, neighbor_ids = interfaces + @unpack boundary_interpolation = dg.basis + + # OBS! `interfaces_u` stores the interpolated *fluxes* and *not the solution*! + interfaces_u = interfaces.u + + flux_parabolic_x, flux_parabolic_y = flux_parabolic + + @threaded for interface in eachinterface(dg, cache) + left_element = neighbor_ids[1, interface] + right_element = neighbor_ids[2, interface] + + if orientations[interface] == 1 + # interface in x-direction + for j in eachnode(dg) + for v in eachvariable(equations_parabolic) + # Interpolate to the interfaces using a local variable for + # the accumulation of values (to reduce global memory operations). + interface_u_1 = zero(eltype(interfaces_u)) + interface_u_2 = zero(eltype(interfaces_u)) + for ii in eachnode(dg) + # Not += to allow `@muladd` to turn these into FMAs + # (see comment at the top of the file) + # Need `boundary_interpolation` at right (+1) node for left element + interface_u_1 = (interface_u_1 + + flux_parabolic_x[v, ii, j, left_element] * + boundary_interpolation[ii, 2]) + # Need `boundary_interpolation` at left (-1) node for right element + interface_u_2 = (interface_u_2 + + flux_parabolic_x[v, ii, j, right_element] * + boundary_interpolation[ii, 1]) + end + interfaces_u[1, v, j, interface] = interface_u_1 + interfaces_u[2, v, j, interface] = interface_u_2 + end + end + else # if orientations[interface] == 2 + # interface in y-direction + for i in eachnode(dg) + for v in eachvariable(equations_parabolic) + interface_u_1 = zero(eltype(interfaces_u)) + interface_u_2 = zero(eltype(interfaces_u)) + for jj in eachnode(dg) + # Need `boundary_interpolation` at right (+1) node for left element + interface_u_1 = (interface_u_1 + + flux_parabolic_y[v, i, jj, left_element] * + boundary_interpolation[jj, 
2]) + # Need `boundary_interpolation` at left (-1) node for right element + interface_u_2 = (interface_u_2 + + flux_parabolic_y[v, i, jj, right_element] * + boundary_interpolation[jj, 1]) + end + interfaces_u[1, v, i, interface] = interface_u_1 + interfaces_u[2, v, i, interface] = interface_u_2 + end + end + end + end + + return nothing +end + +# This is the version used when calculating the divergence of the parabolic fluxes function calc_interface_flux!(surface_flux_values, mesh::TreeMesh{2}, equations_parabolic, dg::DG, parabolic_scheme, cache) @@ -277,18 +348,20 @@ function calc_interface_flux!(surface_flux_values, mesh::TreeMesh{2}, return nothing end -# This is the version used when calculating the divergence of the viscous fluxes. -# Specialization `flux_viscous::Tuple` needed to +# This is the version used when calculating the divergence of the parabolic fluxes. +# Specialization `flux_parabolic::Tuple` needed to # avoid amibiguity with the hyperbolic version of `prolong2boundaries!` in dg_2d.jl # which is for the variables itself, i.e., `u::Array{uEltype, 4}`. -function prolong2boundaries!(cache, flux_viscous::Tuple, +function prolong2boundaries!(cache, flux_parabolic::Tuple, mesh::TreeMesh{2}, equations_parabolic::AbstractEquationsParabolic, dg::DG) @unpack boundaries = cache @unpack orientations, neighbor_sides, neighbor_ids = boundaries + + # OBS! `boundaries_u` stores the "interpolated" *fluxes* and *not the solution*! boundaries_u = boundaries.u - flux_viscous_x, flux_viscous_y = flux_viscous + flux_parabolic_x, flux_parabolic_y = flux_parabolic @threaded for boundary in eachboundary(dg, cache) element = neighbor_ids[boundary] @@ -298,15 +371,13 @@ function prolong2boundaries!(cache, flux_viscous::Tuple, if neighbor_sides[boundary] == 1 # element in -x direction of boundary for l in eachnode(dg), v in eachvariable(equations_parabolic) - # OBS! `boundaries_u` stores the interpolated *fluxes* and *not the solution*! 
- boundaries_u[1, v, l, boundary] = flux_viscous_x[v, nnodes(dg), l, - element] + boundaries_u[1, v, l, boundary] = flux_parabolic_x[v, nnodes(dg), l, + element] end else # Element in +x direction of boundary for l in eachnode(dg), v in eachvariable(equations_parabolic) - # OBS! `boundaries_u` stores the interpolated *fluxes* and *not the solution*! - boundaries_u[2, v, l, boundary] = flux_viscous_x[v, 1, l, - element] + boundaries_u[2, v, l, boundary] = flux_parabolic_x[v, 1, l, + element] end end else # if orientations[boundary] == 2 @@ -314,16 +385,14 @@ function prolong2boundaries!(cache, flux_viscous::Tuple, if neighbor_sides[boundary] == 1 # element in -y direction of boundary for l in eachnode(dg), v in eachvariable(equations_parabolic) - # OBS! `boundaries_u` stores the interpolated *fluxes* and *not the solution*! - boundaries_u[1, v, l, boundary] = flux_viscous_y[v, l, nnodes(dg), - element] + boundaries_u[1, v, l, boundary] = flux_parabolic_y[v, l, nnodes(dg), + element] end else # element in +y direction of boundary for l in eachnode(dg), v in eachvariable(equations_parabolic) - # OBS! `boundaries_u` stores the interpolated *fluxes* and *not the solution*! - boundaries_u[2, v, l, boundary] = flux_viscous_y[v, l, 1, - element] + boundaries_u[2, v, l, boundary] = flux_parabolic_y[v, l, 1, + element] end end end @@ -332,13 +401,98 @@ function prolong2boundaries!(cache, flux_viscous::Tuple, return nothing end -function calc_viscous_fluxes!(flux_viscous, - gradients, u_transformed, - mesh::Union{TreeMesh{2}, P4estMesh{2}}, - equations_parabolic::AbstractEquationsParabolic, - dg::DG, cache) +# This is the version used when calculating the divergence of the parabolic fluxes. +# Specialization `flux_parabolic::Tuple` needed to +# avoid amibiguity with the hyperbolic version of `prolong2boundaries!` in dg_2d.jl +# which is for the variables itself, i.e., `u::Array{uEltype, 4}`. 
+function prolong2boundaries!(cache, flux_parabolic::Tuple, + mesh::TreeMesh{2}, + equations_parabolic::AbstractEquationsParabolic, + dg::DGSEM{<:GaussLegendreBasis}) + @unpack boundaries = cache + @unpack orientations, neighbor_sides, neighbor_ids = boundaries + @unpack boundary_interpolation = dg.basis + + # OBS! `boundaries_u` stores the interpolated *fluxes* and *not the solution*! + boundaries_u = boundaries.u + flux_parabolic_x, flux_parabolic_y = flux_parabolic + + @threaded for boundary in eachboundary(dg, cache) + element = neighbor_ids[boundary] + + if orientations[boundary] == 1 + # boundary in x-direction + if neighbor_sides[boundary] == 1 + # element in -x direction of boundary => interpolate to right boundary node (+1) + for l in eachnode(dg) + for v in eachvariable(equations_parabolic) + # Interpolate to the boundaries using a local variable for + # the accumulation of values (to reduce global memory operations). + boundary_u = zero(eltype(boundaries_u)) + for ii in eachnode(dg) + # Not += to allow `@muladd` to turn these into FMAs + # (see comment at the top of the file) + boundary_u = (boundary_u + + flux_parabolic_x[v, ii, l, element] * + boundary_interpolation[ii, 2]) + end + boundaries_u[1, v, l, boundary] = boundary_u + end + end + else # element in +x direction of boundary => interpolate to left boundary node (-1) + for l in eachnode(dg) + for v in eachvariable(equations_parabolic) + boundary_u = zero(eltype(boundaries_u)) + for ii in eachnode(dg) + boundary_u = (boundary_u + + flux_parabolic_x[v, ii, l, element] * + boundary_interpolation[ii, 1]) + end + boundaries_u[2, v, l, boundary] = boundary_u + end + end + end + else # if orientations[boundary] == 2 + # boundary in y-direction + if neighbor_sides[boundary] == 1 + # element in -y direction of boundary => interpolate to right boundary node (+1) + for l in eachnode(dg) + for v in eachvariable(equations_parabolic) + boundary_u = zero(eltype(boundaries_u)) + for jj in eachnode(dg) + 
boundary_u = (boundary_u + + flux_parabolic_y[v, l, jj, element] * + boundary_interpolation[jj, 2]) + end + boundaries_u[1, v, l, boundary] = boundary_u + end + end + else # element in +y direction of boundary => interpolate to left boundary node (-1) + for l in eachnode(dg) + for v in eachvariable(equations_parabolic) + boundary_u = zero(eltype(boundaries_u)) + for jj in eachnode(dg) + boundary_u = (boundary_u + + flux_parabolic_y[v, l, jj, element] * + boundary_interpolation[jj, 1]) + end + boundaries_u[2, v, l, boundary] = boundary_u + end + end + end + end + end + + return nothing +end + +function calc_parabolic_fluxes!(flux_parabolic, + gradients, u_transformed, + mesh::Union{TreeMesh{2}, P4estMesh{2}}, + equations_parabolic::AbstractEquationsParabolic, + dg::DG, cache) gradients_x, gradients_y = gradients - flux_viscous_x, flux_viscous_y = flux_viscous # output arrays + flux_parabolic_x, flux_parabolic_y = flux_parabolic # output arrays @threaded for element in eachelement(dg, cache) for j in eachnode(dg), i in eachnode(dg) @@ -350,14 +504,16 @@ function calc_viscous_fluxes!(flux_viscous, gradients_2_node = get_node_vars(gradients_y, equations_parabolic, dg, i, j, element) - # Calculate viscous flux and store each component for later use - flux_viscous_node_x = flux(u_node, (gradients_1_node, gradients_2_node), 1, - equations_parabolic) - flux_viscous_node_y = flux(u_node, (gradients_1_node, gradients_2_node), 2, - equations_parabolic) - set_node_vars!(flux_viscous_x, flux_viscous_node_x, equations_parabolic, dg, + # Calculate parabolic flux and store each component for later use + flux_parabolic_node_x = flux(u_node, (gradients_1_node, gradients_2_node), + 1, equations_parabolic) + flux_parabolic_node_y = flux(u_node, (gradients_1_node, gradients_2_node), + 2, equations_parabolic) + set_node_vars!(flux_parabolic_x, flux_parabolic_node_x, + equations_parabolic, dg, i, j, element) - set_node_vars!(flux_viscous_y, flux_viscous_node_y, equations_parabolic, dg, + 
set_node_vars!(flux_parabolic_y, flux_parabolic_node_y, + equations_parabolic, dg, i, j, element) end end @@ -520,7 +676,7 @@ function calc_boundary_flux_by_direction_divergence!(surface_flux_values::Abstra @unpack surface_flux = surface_integral # Note: cache.boundaries.u contains the unsigned normal component (using "orientation", not "direction") - # of the viscous flux, as computed in `prolong2boundaries!` + # of the parabolic flux, as computed in `prolong2boundaries!` @unpack u, neighbor_ids, neighbor_sides, node_coordinates, orientations = cache.boundaries @threaded for boundary in first_boundary:last_boundary @@ -528,7 +684,7 @@ function calc_boundary_flux_by_direction_divergence!(surface_flux_values::Abstra neighbor = neighbor_ids[boundary] for i in eachnode(dg) - # Get viscous boundary fluxes + # Get parabolic boundary fluxes flux_ll, flux_rr = get_surface_node_vars(u, equations_parabolic, dg, i, boundary) if neighbor_sides[boundary] == 1 # Element is on the left, boundary on the right @@ -559,15 +715,15 @@ function calc_boundary_flux_by_direction_divergence!(surface_flux_values::Abstra return nothing end -# Specialization `flux_viscous::Tuple` needed to +# Specialization `flux_parabolic::Tuple` needed to # avoid amibiguity with the hyperbolic version of `prolong2mortars!` in dg_2d.jl # which is for the variables itself, i.e., `u::Array{uEltype, 4}`. 
-function prolong2mortars!(cache, flux_viscous::Tuple, +function prolong2mortars!(cache, flux_parabolic::Tuple, mesh::TreeMesh{2}, equations_parabolic::AbstractEquationsParabolic, mortar_l2::LobattoLegendreMortarL2, dg::DGSEM) - flux_viscous_x, flux_viscous_y = flux_viscous + flux_parabolic_x, flux_parabolic_y = flux_parabolic @threaded for mortar in eachmortar(dg, cache) large_element = cache.mortars.neighbor_ids[3, mortar] upper_element = cache.mortars.neighbor_ids[2, mortar] @@ -579,28 +735,28 @@ function prolong2mortars!(cache, flux_viscous::Tuple, # L2 mortars in x-direction for l in eachnode(dg) for v in eachvariable(equations_parabolic) - cache.mortars.u_upper[2, v, l, mortar] = flux_viscous_x[v, - 1, - l, - upper_element] - cache.mortars.u_lower[2, v, l, mortar] = flux_viscous_x[v, - 1, - l, - lower_element] + cache.mortars.u_upper[2, v, l, mortar] = flux_parabolic_x[v, + 1, + l, + upper_element] + cache.mortars.u_lower[2, v, l, mortar] = flux_parabolic_x[v, + 1, + l, + lower_element] end end else # L2 mortars in y-direction for l in eachnode(dg) for v in eachvariable(equations_parabolic) - cache.mortars.u_upper[2, v, l, mortar] = flux_viscous_y[v, - l, - 1, - upper_element] - cache.mortars.u_lower[2, v, l, mortar] = flux_viscous_y[v, - l, - 1, - lower_element] + cache.mortars.u_upper[2, v, l, mortar] = flux_parabolic_y[v, + l, + 1, + upper_element] + cache.mortars.u_lower[2, v, l, mortar] = flux_parabolic_y[v, + l, + 1, + lower_element] end end end @@ -609,28 +765,28 @@ function prolong2mortars!(cache, flux_viscous::Tuple, # L2 mortars in x-direction for l in eachnode(dg) for v in eachvariable(equations_parabolic) - cache.mortars.u_upper[1, v, l, mortar] = flux_viscous_x[v, - nnodes(dg), - l, - upper_element] - cache.mortars.u_lower[1, v, l, mortar] = flux_viscous_x[v, - nnodes(dg), - l, - lower_element] + cache.mortars.u_upper[1, v, l, mortar] = flux_parabolic_x[v, + nnodes(dg), + l, + upper_element] + cache.mortars.u_lower[1, v, l, mortar] = 
flux_parabolic_x[v, + nnodes(dg), + l, + lower_element] end end else # L2 mortars in y-direction for l in eachnode(dg) for v in eachvariable(equations_parabolic) - cache.mortars.u_upper[1, v, l, mortar] = flux_viscous_y[v, - l, - nnodes(dg), - upper_element] - cache.mortars.u_lower[1, v, l, mortar] = flux_viscous_y[v, - l, - nnodes(dg), - lower_element] + cache.mortars.u_upper[1, v, l, mortar] = flux_parabolic_y[v, + l, + nnodes(dg), + upper_element] + cache.mortars.u_lower[1, v, l, mortar] = flux_parabolic_y[v, + l, + nnodes(dg), + lower_element] end end end @@ -641,12 +797,12 @@ function prolong2mortars!(cache, flux_viscous::Tuple, leftright = 1 if cache.mortars.orientations[mortar] == 1 # L2 mortars in x-direction - u_large = view(flux_viscous_x, :, nnodes(dg), :, large_element) + u_large = view(flux_parabolic_x, :, nnodes(dg), :, large_element) element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright, mortar, u_large) else # L2 mortars in y-direction - u_large = view(flux_viscous_y, :, :, nnodes(dg), large_element) + u_large = view(flux_parabolic_y, :, :, nnodes(dg), large_element) element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright, mortar, u_large) end @@ -654,12 +810,12 @@ function prolong2mortars!(cache, flux_viscous::Tuple, leftright = 2 if cache.mortars.orientations[mortar] == 1 # L2 mortars in x-direction - u_large = view(flux_viscous_x, :, 1, :, large_element) + u_large = view(flux_parabolic_x, :, 1, :, large_element) element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright, mortar, u_large) else # L2 mortars in y-direction - u_large = view(flux_viscous_y, :, :, 1, large_element) + u_large = view(flux_parabolic_y, :, :, 1, large_element) element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright, mortar, u_large) end @@ -704,6 +860,17 @@ function calc_mortar_flux!(surface_flux_values, mesh::TreeMesh{2}, return nothing end +# For Gauss-Legendre DGSEM mortars are not yet implemented +function 
calc_mortar_flux!(surface_flux_values, mesh::TreeMesh{2}, + equations_parabolic::AbstractEquationsParabolic, + mortar::Nothing, + surface_integral, dg::DGSEM{<:GaussLegendreBasis}, + parabolic_scheme, gradient_or_divergence, + cache) + @assert isempty(eachmortar(dg, cache)) + return nothing +end + @inline function calc_fstar!(destination::AbstractArray{<:Any, 2}, mesh, equations_parabolic::AbstractEquationsParabolic, surface_flux, dg::DGSEM, @@ -924,6 +1091,60 @@ function calc_surface_integral_gradient!(gradients, return nothing end +function calc_surface_integral_gradient!(gradients, + mesh::TreeMesh{2}, # for dispatch only + equations_parabolic::AbstractEquationsParabolic, + dg::DGSEM{<:GaussLegendreBasis}, cache) + @unpack boundary_interpolation_inverse_weights = dg.basis + @unpack surface_flux_values = cache.elements + + gradients_x, gradients_y = gradients + + # Note that all fluxes have been computed with outward-pointing normal vectors. + # We also use explicit assignments instead of `+=` to let `@muladd` turn these + # into FMAs (see comment at the top of the file). 
+ @threaded for element in eachelement(dg, cache) + for l in eachnode(dg) + for v in eachvariable(equations_parabolic) + # Aliases for repeatedly accessed variables + surface_flux_minus = surface_flux_values[v, l, 1, element] + surface_flux_plus = surface_flux_values[v, l, 2, element] + for ii in eachnode(dg) + # surface at -x + gradients_x[v, ii, l, element] = (gradients_x[v, ii, l, element] - + surface_flux_minus * + boundary_interpolation_inverse_weights[ii, + 1]) + + # surface at +x + gradients_x[v, ii, l, element] = (gradients_x[v, ii, l, element] + + surface_flux_plus * + boundary_interpolation_inverse_weights[ii, + 2]) + end + + surface_flux_minus = surface_flux_values[v, l, 3, element] + surface_flux_plus = surface_flux_values[v, l, 4, element] + for jj in eachnode(dg) + # surface at -y + gradients_y[v, l, jj, element] = (gradients_y[v, l, jj, element] - + surface_flux_minus * + boundary_interpolation_inverse_weights[jj, + 1]) + + # surface at +y + gradients_y[v, l, jj, element] = (gradients_y[v, l, jj, element] + + surface_flux_plus * + boundary_interpolation_inverse_weights[jj, + 2]) + end + end + end + end + + return nothing +end + function reset_gradients!(gradients::NTuple{2}, dg::DG, cache) gradients_x, gradients_y = gradients @@ -952,7 +1173,7 @@ function calc_gradient!(gradients, u_transformed, t, # Prolong solution to interfaces # This reuses `prolong2interfaces!` for the purely hyperbolic case. @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, u_transformed, mesh, + prolong2interfaces!(nothing, cache, u_transformed, mesh, equations_parabolic, dg) end @@ -1022,7 +1243,7 @@ end # Needed to *not* flip the sign of the inverse Jacobian. # This is because the parabolic fluxes are assumed to be of the form # `du/dt + df/dx = dg/dx + source(x,t)`, -# where f(u) is the inviscid flux and g(u) is the viscous flux. +# where f(u) is the inviscid flux and g(u) is the parabolic flux. 
function apply_jacobian_parabolic!(du::AbstractArray, mesh::TreeMesh{2}, equations_parabolic::AbstractEquationsParabolic, dg::DG, cache) diff --git a/src/solvers/dgsem_tree/dg_2d_parallel.jl b/src/solvers/dgsem_tree/dg_2d_parallel.jl index 37ba51e15bc..d4b6192a4df 100644 --- a/src/solvers/dgsem_tree/dg_2d_parallel.jl +++ b/src/solvers/dgsem_tree/dg_2d_parallel.jl @@ -253,7 +253,7 @@ function create_cache(mesh::TreeMeshParallel{2}, equations, mpi_interfaces = init_mpi_interfaces(leaf_cell_ids, mesh, elements) - boundaries = init_boundaries(leaf_cell_ids, mesh, elements) + boundaries = init_boundaries(leaf_cell_ids, mesh, elements, dg.basis) mortars = init_mortars(leaf_cell_ids, mesh, elements, dg.mortar) @@ -455,6 +455,8 @@ function rhs!(du, u, t, T8codeMeshParallel{2}}, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} + backend = trixi_backend(u) + # Start to receive MPI data @trixi_timeit timer() "start MPI receive" start_mpi_receive!(cache.mpi_cache) @@ -479,7 +481,7 @@ function rhs!(du, u, t, # Calculate volume integral @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, u, mesh, + calc_volume_integral!(backend, du, u, mesh, have_nonconservative_terms(equations), equations, dg.volume_integral, dg, cache) end @@ -487,12 +489,12 @@ function rhs!(du, u, t, # Prolong solution to interfaces # TODO: Taal decide order of arguments, consistent vs. modified cache first? 
@trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, u, mesh, equations, dg) + prolong2interfaces!(backend, cache, u, mesh, equations, dg) end # Calculate interface fluxes @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache.elements.surface_flux_values, mesh, + calc_interface_flux!(backend, cache.elements.surface_flux_values, mesh, have_nonconservative_terms(equations), equations, dg.surface_integral, dg, cache) end @@ -542,12 +544,13 @@ function rhs!(du, u, t, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, cache) end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin diff --git a/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl b/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl index 9c92d75eb8a..e4d609c6758 100644 --- a/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl +++ b/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl @@ -29,7 +29,8 @@ function create_cache(mesh::Union{TreeMesh{2}, StructuredMesh{2}, P4estMesh{2}}, nnodes(dg), nnodes(dg)) for _ in 1:Threads.maxthreadid()] - antidiffusive_fluxes = ContainerAntidiffusiveFlux2D{uEltype}(0, + n_elements = nelements(cache_containers.elements) + antidiffusive_fluxes = ContainerAntidiffusiveFlux2D{uEltype}(n_elements, nvariables(equations), nnodes(dg)) @@ -62,9 +63,9 @@ function create_cache(mesh::Union{TreeMesh{2}, StructuredMesh{2}, P4estMesh{2}}, end # Subcell limiting currently only implemented for certain mesh types -function calc_volume_integral!(du, u, +function calc_volume_integral!(backend::Nothing, du, u, mesh::Union{TreeMesh{2}, StructuredMesh{2}, - 
P4estMesh{2}, P4estMesh{3}}, + P4estMesh{2}, TreeMesh{3}, P4estMesh{3}}, have_nonconservative_terms, equations, volume_integral::VolumeIntegralSubcellLimiting, dg::DGSEM, cache, t, boundary_conditions) @@ -101,7 +102,7 @@ function calc_volume_integral!(du, u, # Loop over pure DG elements @trixi_timeit timer() "pure DG" @threaded for idx_element in eachindex(element_ids_dg) element = element_ids_dg[idx_element] - flux_differencing_kernel!(du, u, element, mesh, + flux_differencing_kernel!(du, u, element, typeof(mesh), have_nonconservative_terms, equations, volume_integral.volume_flux_dg, dg, cache) end @@ -109,7 +110,7 @@ function calc_volume_integral!(du, u, # Loop over blended DG-FV elements @trixi_timeit timer() "subcell-wise blended DG-FV" @threaded for idx_element in eachindex(element_ids_dgfv) element = element_ids_dgfv[idx_element] - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, typeof(mesh), have_nonconservative_terms, equations, volume_integral, limiter, dg, cache) @@ -118,7 +119,7 @@ function calc_volume_integral!(du, u, # Loop over all elements @trixi_timeit timer() "subcell-wise blended DG-FV" @threaded for element in eachelement(dg, cache) - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, typeof(mesh), have_nonconservative_terms, equations, volume_integral, limiter, dg, cache) @@ -129,8 +130,9 @@ function calc_volume_integral!(du, u, end @inline function volume_integral_kernel!(du, u, element, - mesh::Union{TreeMesh{2}, StructuredMesh{2}, - P4estMesh{2}}, + MeshT::Type{<:Union{TreeMesh{2}, + StructuredMesh{2}, + P4estMesh{2}}}, have_nonconservative_terms, equations, volume_integral::VolumeIntegralSubcellLimiting, limiter::SubcellLimiterIDP, @@ -145,7 +147,7 @@ end fhat1_R = fhat1_R_threaded[Threads.threadid()] fhat2_L = fhat2_L_threaded[Threads.threadid()] fhat2_R = fhat2_R_threaded[Threads.threadid()] - calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, mesh, + 
calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, MeshT, have_nonconservative_terms, equations, volume_flux_dg, dg, element, cache) @@ -156,14 +158,15 @@ end fstar2_L = fstar2_L_threaded[Threads.threadid()] fstar1_R = fstar1_R_threaded[Threads.threadid()] fstar2_R = fstar2_R_threaded[Threads.threadid()] - calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, mesh, + calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, MeshT, have_nonconservative_terms, equations, volume_flux_fv, dg, element, cache) # antidiffusive flux calcflux_antidiffusive!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fstar1_L, fstar1_R, fstar2_L, fstar2_R, - u, mesh, have_nonconservative_terms, equations, limiter, dg, + u, MeshT, have_nonconservative_terms, equations, limiter, + dg, element, cache) # Calculate volume integral contribution of low-order FV flux @@ -180,8 +183,9 @@ end end @inline function volume_integral_kernel!(du, u, element, - mesh::Union{TreeMesh{2}, StructuredMesh{2}, - P4estMesh{2}}, + MeshT::Type{<:Union{TreeMesh{2}, + StructuredMesh{2}, + P4estMesh{2}}}, nonconservative_terms::False, equations, volume_integral::VolumeIntegralSubcellLimiting, limiter::SubcellLimiterMCL, @@ -195,7 +199,7 @@ end fhat1_R = fhat1_R_threaded[Threads.threadid()] fhat2_L = fhat2_L_threaded[Threads.threadid()] fhat2_R = fhat2_R_threaded[Threads.threadid()] - calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, mesh, + calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, MeshT, nonconservative_terms, equations, volume_flux_dg, dg, element, cache) @@ -205,18 +209,18 @@ end fstar2_L = fstar2_L_threaded[Threads.threadid()] fstar1_R = fstar1_R_threaded[Threads.threadid()] fstar2_R = fstar2_R_threaded[Threads.threadid()] - calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, mesh, + calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, MeshT, nonconservative_terms, equations, volume_flux_fv, dg, element, cache) # antidiffusive flux calcflux_antidiffusive!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, 
fstar1_L, fstar1_R, fstar2_L, fstar2_R, - u, mesh, nonconservative_terms, equations, limiter, dg, + u, MeshT, nonconservative_terms, equations, limiter, dg, element, cache) # limit antidiffusive flux - calcflux_antidiffusive_limited!(u, mesh, nonconservative_terms, equations, + calcflux_antidiffusive_limited!(u, MeshT, nonconservative_terms, equations, limiter, dg, element, cache, fstar1_L, fstar2_L) @@ -245,7 +249,8 @@ end # # See also `flux_differencing_kernel!`. @inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, - mesh::TreeMesh{2}, have_nonconservative_terms::False, + ::Type{<:TreeMesh{2}}, + have_nonconservative_terms::False, equations, volume_flux, dg::DGSEM, element, cache) @unpack weights, derivative_split = dg.basis @@ -283,9 +288,11 @@ end end # FV-form flux `fhat` in x direction - for j in eachnode(dg), i in 1:(nnodes(dg) - 1), v in eachvariable(equations) - fhat1_L[v, i + 1, j] = fhat1_L[v, i, j] + weights[i] * flux_temp[v, i, j] - fhat1_R[v, i + 1, j] = fhat1_L[v, i + 1, j] + for j in eachnode(dg), i in 1:(nnodes(dg) - 1) + for v in eachvariable(equations) + fhat1_L[v, i + 1, j] = fhat1_L[v, i, j] + weights[i] * flux_temp[v, i, j] + fhat1_R[v, i + 1, j] = fhat1_L[v, i + 1, j] + end end # Split form volume flux in orientation 2: y direction @@ -304,9 +311,11 @@ end end # FV-form flux `fhat` in y direction - for j in 1:(nnodes(dg) - 1), i in eachnode(dg), v in eachvariable(equations) - fhat2_L[v, i, j + 1] = fhat2_L[v, i, j] + weights[j] * flux_temp[v, i, j] - fhat2_R[v, i, j + 1] = fhat2_L[v, i, j + 1] + for j in 1:(nnodes(dg) - 1), i in eachnode(dg) + for v in eachvariable(equations) + fhat2_L[v, i, j + 1] = fhat2_L[v, i, j] + weights[j] * flux_temp[v, i, j] + fhat2_R[v, i, j + 1] = fhat2_L[v, i, j + 1] + end end return nothing @@ -324,7 +333,7 @@ end # Discretizations of Non-Conservative Systems. https://arxiv.org/pdf/2211.14009.pdf. 
# @inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, - mesh::TreeMesh{2}, have_nonconservative_terms::True, + ::Type{<:TreeMesh{2}}, have_nonconservative_terms::True, equations, volume_flux::Tuple{F_CONS, F_NONCONS}, dg::DGSEM, element, @@ -503,7 +512,7 @@ end # The calculation of the non-conservative staggered "fluxes" requires non-conservative # terms that can be written as a product of local and jump contributions. @inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, - mesh::TreeMesh{2}, nonconservative_terms::True, + ::Type{<:TreeMesh{2}}, nonconservative_terms::True, equations, volume_flux::Tuple{F_CONS, F_NONCONS}, dg::DGSEM, element, @@ -741,8 +750,8 @@ end # Calculate the antidiffusive flux `antidiffusive_flux` as the subtraction between `fhat` and `fstar` for conservative systems. @inline function calcflux_antidiffusive!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::Union{TreeMesh{2}, StructuredMesh{2}, - P4estMesh{2}}, + ::Type{<:Union{TreeMesh{2}, StructuredMesh{2}, + P4estMesh{2}}}, have_nonconservative_terms::False, equations, limiter::SubcellLimiterIDP, dg, element, cache) @unpack antidiffusive_flux1_L, antidiffusive_flux2_L, antidiffusive_flux1_R, antidiffusive_flux2_R = cache.antidiffusive_fluxes @@ -778,8 +787,8 @@ end # Calculate the antidiffusive flux `antidiffusive_flux` as the subtraction between `fhat` and `fstar` for conservative systems. 
@inline function calcflux_antidiffusive!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::Union{TreeMesh{2}, StructuredMesh{2}, - P4estMesh{2}}, + ::Type{<:Union{TreeMesh{2}, StructuredMesh{2}, + P4estMesh{2}}}, have_nonconservative_terms::True, equations, limiter::SubcellLimiterIDP, dg, element, cache) @unpack antidiffusive_flux1_L, antidiffusive_flux2_L, antidiffusive_flux1_R, antidiffusive_flux2_R = cache.antidiffusive_fluxes @@ -2055,65 +2064,4 @@ end return nothing end - -""" - get_boundary_outer_state(u_inner, t, - boundary_condition::BoundaryConditionDirichlet, - orientation_or_normal, direction, - mesh, equations, dg, cache, indices...) -For subcell limiting, the calculation of local bounds for non-periodic domains requires the boundary -outer state. This function returns the boundary value for [`BoundaryConditionDirichlet`](@ref) at -time `t` and for node with spatial indices `indices` at the boundary with `orientation_or_normal` -and `direction`. - -Should be used together with [`TreeMesh`](@ref) or [`StructuredMesh`](@ref). - -!!! warning "Experimental implementation" - This is an experimental feature and may change in future releases. -""" -@inline function get_boundary_outer_state(u_inner, t, - boundary_condition::BoundaryConditionDirichlet, - orientation_or_normal, direction, - mesh::Union{TreeMesh, StructuredMesh}, - equations, dg, cache, indices...) - (; node_coordinates) = cache.elements - - x = get_node_coords(node_coordinates, equations, dg, indices...) - u_outer = boundary_condition.boundary_value_function(x, t, equations) - - return u_outer -end - -@inline function get_boundary_outer_state(u_inner, t, - boundary_condition::BoundaryConditionCharacteristic, - orientation_or_normal, direction, - mesh::Union{TreeMesh, StructuredMesh}, - equations, - dg, cache, indices...) - (; node_coordinates) = cache.elements - - x = get_node_coords(node_coordinates, equations, dg, indices...) 
- u_outer = boundary_condition.boundary_value_function(boundary_condition.outer_boundary_value_function, - u_inner, orientation_or_normal, - direction, x, t, equations) - - return u_outer -end - -@inline function get_boundary_outer_state(u_inner, t, - boundary_condition::BoundaryConditionCharacteristic, - normal_direction::AbstractVector, - mesh::P4estMesh, equations, dg, cache, - indices...) - (; node_coordinates) = cache.elements - - x = get_node_coords(node_coordinates, equations, dg, indices...) - - u_outer = boundary_condition.boundary_value_function(boundary_condition.outer_boundary_value_function, - u_inner, - normal_direction, - x, t, equations) - - return u_outer -end end # @muladd diff --git a/src/solvers/dgsem_tree/dg_3d.jl b/src/solvers/dgsem_tree/dg_3d.jl index 41b46f58f95..5626cb62a6d 100644 --- a/src/solvers/dgsem_tree/dg_3d.jl +++ b/src/solvers/dgsem_tree/dg_3d.jl @@ -126,7 +126,7 @@ This treatment is required to achieve, e.g., entropy-stability or well-balancedn See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-1765644064 =# @inline function weak_form_kernel!(du, u, - element, mesh::TreeMesh{3}, + element, ::Type{<:TreeMesh{3}}, have_nonconservative_terms::False, equations, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -158,7 +158,7 @@ See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-17 return nothing end -@inline function flux_differencing_kernel!(du, u, element, mesh::TreeMesh{3}, +@inline function flux_differencing_kernel!(du, u, element, ::Type{<:TreeMesh{3}}, have_nonconservative_terms::False, equations, volume_flux, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -208,7 +208,7 @@ end return nothing end -@inline function flux_differencing_kernel!(du, u, element, mesh::TreeMesh{3}, +@inline function flux_differencing_kernel!(du, u, element, 
MeshT::Type{<:TreeMesh{3}}, have_nonconservative_terms::True, equations, volume_flux, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -217,7 +217,7 @@ end symmetric_flux, nonconservative_flux = volume_flux # Apply the symmetric flux as usual - flux_differencing_kernel!(du, u, element, mesh, False(), equations, symmetric_flux, + flux_differencing_kernel!(du, u, element, MeshT, False(), equations, symmetric_flux, dg, cache, alpha) # Calculate the remaining volume terms using the nonsymmetric generalized flux @@ -261,8 +261,9 @@ end end @inline function fv_kernel!(du, u, - mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + MeshT::Type{<:Union{TreeMesh{3}, StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms, equations, volume_flux_fv, dg::DGSEM, cache, element, alpha = true) @unpack fstar1_L_threaded, fstar1_R_threaded, fstar2_L_threaded, fstar2_R_threaded, fstar3_L_threaded, fstar3_R_threaded = cache @@ -277,7 +278,7 @@ end fstar3_R = fstar3_R_threaded[Threads.threadid()] calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, - mesh, have_nonconservative_terms, equations, + MeshT, have_nonconservative_terms, equations, volume_flux_fv, dg, element, cache) # Calculate FV volume integral contribution @@ -300,8 +301,9 @@ end end @inline function fvO2_kernel!(du, u, - mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + MeshT::Type{<:Union{TreeMesh{3}, StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms, equations, volume_flux_fv, dg::DGSEM, cache, element, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -321,7 +323,7 @@ end fstar3_R = fstar3_R_threaded[Threads.threadid()] calcflux_fvO2!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, - mesh, have_nonconservative_terms, equations, + MeshT, have_nonconservative_terms, equations, volume_flux_fv, dg, 
element, cache, sc_interface_coords, reconstruction_mode, slope_limiter, cons2recon, recon2cons) @@ -351,7 +353,7 @@ end # [arXiv: 2008.12044v2](https://arxiv.org/pdf/2008.12044) @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, - mesh::TreeMesh{3}, have_nonconservative_terms::False, + ::Type{<:TreeMesh{3}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache) for k in eachnode(dg), j in eachnode(dg), i in 2:nnodes(dg) @@ -383,7 +385,7 @@ end @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, - mesh::TreeMesh{3}, + ::Type{<:TreeMesh{3}}, have_nonconservative_terms::True, equations, volume_flux_fv, dg::DGSEM, element, cache) volume_flux, nonconservative_flux = volume_flux_fv @@ -447,7 +449,8 @@ end @inline function calcflux_fvO2!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, - mesh::TreeMesh{3}, have_nonconservative_terms::False, + ::Type{<:TreeMesh{3}}, + have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -519,7 +522,8 @@ end return nothing end -function prolong2interfaces!(cache, u, mesh::TreeMesh{3}, equations, dg::DG) +function prolong2interfaces!(backend::Nothing, cache, u, mesh::TreeMesh{3}, equations, + dg::DG) @unpack interfaces = cache @unpack orientations, neighbor_ids = interfaces interfaces_u = interfaces.u @@ -557,7 +561,7 @@ function prolong2interfaces!(cache, u, mesh::TreeMesh{3}, equations, dg::DG) return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, surface_integral, dg::DG, cache) @@ -592,7 +596,7 @@ function calc_interface_flux!(surface_flux_values, return nothing end -function calc_interface_flux!(surface_flux_values, +function 
calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{3}, have_nonconservative_terms::True, equations, surface_integral, dg::DG, cache) @@ -1333,7 +1337,8 @@ end return nothing end -function calc_surface_integral!(du, u, mesh::Union{TreeMesh{3}, StructuredMesh{3}}, +function calc_surface_integral!(backend::Nothing, du, u, + mesh::Union{TreeMesh{3}, StructuredMesh{3}}, equations, surface_integral::SurfaceIntegralWeakForm, dg::DGSEM, cache) @unpack inverse_weights = dg.basis @@ -1391,13 +1396,13 @@ function calc_surface_integral!(du, u, mesh::Union{TreeMesh{3}, StructuredMesh{3 return nothing end -function apply_jacobian!(du, mesh::TreeMesh{3}, +function apply_jacobian!(backend::Nothing, du, mesh::TreeMesh{3}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements @threaded for element in eachelement(dg, cache) # Negative sign included to account for the negated surface and volume terms, - # see e.g. the computation of `derivative_hat` in the basis setup and + # see e.g. the computation of `derivative_hat` in the basis setup and # the comment in `calc_surface_integral!`. factor = -inverse_jacobian[element] diff --git a/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl b/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl index b2c48c9f00a..1cdf1ca07e6 100644 --- a/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl +++ b/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl @@ -17,7 +17,7 @@ # if LoopVectorization.jl can handle the array types. This ensures that `@turbo` # works efficiently here. @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, - element, mesh::TreeMesh{3}, + element, MeshT::Type{<:TreeMesh{3}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations3D, volume_flux::typeof(flux_shima_etal_turbo), @@ -28,13 +28,13 @@ # indices `[i, j, k, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. 
du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims(MeshT))..., StaticInt(nvariables(equations)))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) @@ -263,7 +263,7 @@ end @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, - element, mesh::TreeMesh{3}, + element, MeshT::Type{<:TreeMesh{3}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations3D, volume_flux::typeof(flux_ranocha_turbo), @@ -274,7 +274,7 @@ end # indices `[i, j, k, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. In addition @@ -283,7 +283,7 @@ end # values. 
u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims(MeshT))..., StaticInt(nvariables(equations) + 2))) # We also compute "+ 2" logs @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) diff --git a/src/solvers/dgsem_tree/dg_3d_parabolic.jl b/src/solvers/dgsem_tree/dg_3d_parabolic.jl index e3e763f0829..986778085ec 100644 --- a/src/solvers/dgsem_tree/dg_3d_parabolic.jl +++ b/src/solvers/dgsem_tree/dg_3d_parabolic.jl @@ -11,11 +11,11 @@ function create_cache_parabolic(mesh::Union{TreeMesh{3}, P4estMesh{3}}, equations_hyperbolic::AbstractEquations, dg::DG, n_elements, uEltype) - viscous_container = init_viscous_container_3d(nvariables(equations_hyperbolic), - nnodes(dg), n_elements, - uEltype) + parabolic_container = init_parabolic_container_3d(nvariables(equations_hyperbolic), + nnodes(dg), n_elements, + uEltype) - cache_parabolic = (; viscous_container) + cache_parabolic = (; parabolic_container) return cache_parabolic end @@ -51,23 +51,23 @@ function reset_gradients!(gradients::NTuple{3}, dg::DG, cache) return nothing end -# This is the version used when calculating the divergence of the viscous fluxes. +# This is the version used when calculating the divergence of the parabolic fluxes. 
# Identical to weak-form volume integral/kernel for the purely hyperbolic case, -# except that the fluxes are here already precomputed in `calc_viscous_fluxes!` -function calc_volume_integral!(du, flux_viscous, mesh::TreeMesh{3}, +# except that the fluxes are here already precomputed in `calc_parabolic_fluxes!` +function calc_volume_integral!(du, flux_parabolic, mesh::TreeMesh{3}, equations_parabolic::AbstractEquationsParabolic, dg::DGSEM, cache) @unpack derivative_hat = dg.basis - flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous + flux_parabolic_x, flux_parabolic_y, flux_parabolic_z = flux_parabolic @threaded for element in eachelement(dg, cache) # Calculate volume terms in one element for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - flux_1_node = get_node_vars(flux_viscous_x, equations_parabolic, dg, + flux_1_node = get_node_vars(flux_parabolic_x, equations_parabolic, dg, i, j, k, element) - flux_2_node = get_node_vars(flux_viscous_y, equations_parabolic, dg, + flux_2_node = get_node_vars(flux_parabolic_y, equations_parabolic, dg, i, j, k, element) - flux_3_node = get_node_vars(flux_viscous_z, equations_parabolic, dg, + flux_3_node = get_node_vars(flux_parabolic_z, equations_parabolic, dg, i, j, k, element) for ii in eachnode(dg) @@ -90,19 +90,21 @@ function calc_volume_integral!(du, flux_viscous, mesh::TreeMesh{3}, return nothing end -# This is the version used when calculating the divergence of the viscous fluxes. -# Specialization `flux_viscous::Tuple` needed to +# This is the version used when calculating the divergence of the parabolic fluxes. +# Specialization `flux_parabolic::Tuple` needed to # avoid amibiguity with the hyperbolic version of `prolong2interfaces!` in dg_3d.jl # which is for the variables itself, i.e., `u::Array{uEltype, 5}`. 
-function prolong2interfaces!(cache, flux_viscous::Tuple, +function prolong2interfaces!(cache, flux_parabolic::Tuple, mesh::TreeMesh{3}, equations_parabolic::AbstractEquationsParabolic, dg::DG) @unpack interfaces = cache @unpack orientations, neighbor_ids = interfaces + + # OBS! `interfaces_u` stores the interpolated *fluxes* and *not the solution*! interfaces_u = interfaces.u - flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous + flux_parabolic_x, flux_parabolic_y, flux_parabolic_z = flux_parabolic @threaded for interface in eachinterface(dg, cache) left_element = neighbor_ids[1, interface] @@ -112,37 +114,37 @@ function prolong2interfaces!(cache, flux_viscous::Tuple, # interface in x-direction for k in eachnode(dg), j in eachnode(dg), v in eachvariable(equations_parabolic) - # OBS! `interfaces_u` stores the interpolated *fluxes* and *not the solution*! - interfaces_u[1, v, j, k, interface] = flux_viscous_x[v, - nnodes(dg), j, k, - left_element] - interfaces_u[2, v, j, k, interface] = flux_viscous_x[v, - 1, j, k, - right_element] + + interfaces_u[1, v, j, k, interface] = flux_parabolic_x[v, + nnodes(dg), j, k, + left_element] + interfaces_u[2, v, j, k, interface] = flux_parabolic_x[v, + 1, j, k, + right_element] end elseif orientations[interface] == 2 # interface in y-direction for k in eachnode(dg), i in eachnode(dg), v in eachvariable(equations_parabolic) - # OBS! `interfaces_u` stores the interpolated *fluxes* and *not the solution*! - interfaces_u[1, v, i, k, interface] = flux_viscous_y[v, - i, nnodes(dg), k, - left_element] - interfaces_u[2, v, i, k, interface] = flux_viscous_y[v, - i, 1, k, - right_element] + + interfaces_u[1, v, i, k, interface] = flux_parabolic_y[v, + i, nnodes(dg), k, + left_element] + interfaces_u[2, v, i, k, interface] = flux_parabolic_y[v, + i, 1, k, + right_element] end else # if orientations[interface] == 3 # interface in z-direction for j in eachnode(dg), i in eachnode(dg), v in eachvariable(equations_parabolic) - # OBS! 
`interfaces_u` stores the interpolated *fluxes* and *not the solution*! - interfaces_u[1, v, i, j, interface] = flux_viscous_z[v, - i, j, nnodes(dg), - left_element] - interfaces_u[2, v, i, j, interface] = flux_viscous_z[v, - i, j, 1, - right_element] + + interfaces_u[1, v, i, j, interface] = flux_parabolic_z[v, + i, j, nnodes(dg), + left_element] + interfaces_u[2, v, i, j, interface] = flux_parabolic_z[v, + i, j, 1, + right_element] end end end @@ -150,7 +152,7 @@ function prolong2interfaces!(cache, flux_viscous::Tuple, return nothing end -# This is the version used when calculating the divergence of the viscous fluxes +# This is the version used when calculating the divergence of the parabolic fluxes function calc_interface_flux!(surface_flux_values, mesh::TreeMesh{3}, equations_parabolic, dg::DG, parabolic_scheme, cache) @@ -188,18 +190,20 @@ function calc_interface_flux!(surface_flux_values, mesh::TreeMesh{3}, return nothing end -# This is the version used when calculating the divergence of the viscous fluxes. -# Specialization `flux_viscous::Tuple` needed to +# This is the version used when calculating the divergence of the parabolic fluxes. +# Specialization `flux_parabolic::Tuple` needed to # avoid amibiguity with the hyperbolic version of `prolong2boundaries!` in dg_3d.jl # which is for the variables itself, i.e., `u::Array{uEltype, 5}`. -function prolong2boundaries!(cache, flux_viscous::Tuple, +function prolong2boundaries!(cache, flux_parabolic::Tuple, mesh::TreeMesh{3}, equations_parabolic::AbstractEquationsParabolic, dg::DG) @unpack boundaries = cache @unpack orientations, neighbor_sides, neighbor_ids = boundaries + + # OBS! `boundaries_u` stores the "interpolated" *fluxes* and *not the solution*! 
boundaries_u = boundaries.u - flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous + flux_parabolic_x, flux_parabolic_y, flux_parabolic_z = flux_parabolic @threaded for boundary in eachboundary(dg, cache) element = neighbor_ids[boundary] @@ -210,22 +214,22 @@ function prolong2boundaries!(cache, flux_viscous::Tuple, # element in -x direction of boundary for k in eachnode(dg), j in eachnode(dg), v in eachvariable(equations_parabolic) - # OBS! `boundaries_u` stores the interpolated *fluxes* and *not the solution*! - boundaries_u[1, v, j, k, boundary] = flux_viscous_x[v, - nnodes(dg), - j, - k, - element] + + boundaries_u[1, v, j, k, boundary] = flux_parabolic_x[v, + nnodes(dg), + j, + k, + element] end else # Element in +x direction of boundary for k in eachnode(dg), j in eachnode(dg), v in eachvariable(equations_parabolic) - # OBS! `boundaries_u` stores the interpolated *fluxes* and *not the solution*! - boundaries_u[2, v, j, k, boundary] = flux_viscous_x[v, - 1, - j, - k, - element] + + boundaries_u[2, v, j, k, boundary] = flux_parabolic_x[v, + 1, + j, + k, + element] end end elseif orientations[boundary] == 2 @@ -234,23 +238,23 @@ function prolong2boundaries!(cache, flux_viscous::Tuple, # element in -y direction of boundary for k in eachnode(dg), i in eachnode(dg), v in eachvariable(equations_parabolic) - # OBS! `boundaries_u` stores the interpolated *fluxes* and *not the solution*! - boundaries_u[1, v, i, k, boundary] = flux_viscous_y[v, - i, - nnodes(dg), - k, - element] + + boundaries_u[1, v, i, k, boundary] = flux_parabolic_y[v, + i, + nnodes(dg), + k, + element] end else # element in +y direction of boundary for k in eachnode(dg), i in eachnode(dg), v in eachvariable(equations_parabolic) - # OBS! `boundaries_u` stores the interpolated *fluxes* and *not the solution*! 
- boundaries_u[2, v, i, k, boundary] = flux_viscous_y[v, - i, - 1, - k, - element] + + boundaries_u[2, v, i, k, boundary] = flux_parabolic_y[v, + i, + 1, + k, + element] end end else # if orientations[boundary] == 3 @@ -259,23 +263,23 @@ function prolong2boundaries!(cache, flux_viscous::Tuple, # element in -z direction of boundary for j in eachnode(dg), i in eachnode(dg), v in eachvariable(equations_parabolic) - # OBS! `boundaries_u` stores the interpolated *fluxes* and *not the solution*! - boundaries_u[1, v, i, j, boundary] = flux_viscous_z[v, - i, - j, - nnodes(dg), - element] + + boundaries_u[1, v, i, j, boundary] = flux_parabolic_z[v, + i, + j, + nnodes(dg), + element] end else # element in +z direction of boundary for j in eachnode(dg), i in eachnode(dg), v in eachvariable(equations_parabolic) - # OBS! `boundaries_u` stores the interpolated *fluxes* and *not the solution*! - boundaries_u[2, v, i, j, boundary] = flux_viscous_z[v, - i, - j, - 1, - element] + + boundaries_u[2, v, i, j, boundary] = flux_parabolic_z[v, + i, + j, + 1, + element] end end end @@ -284,13 +288,13 @@ function prolong2boundaries!(cache, flux_viscous::Tuple, return nothing end -function calc_viscous_fluxes!(flux_viscous, - gradients, u_transformed, - mesh::Union{TreeMesh{3}, P4estMesh{3}}, - equations_parabolic::AbstractEquationsParabolic, - dg::DG, cache) +function calc_parabolic_fluxes!(flux_parabolic, + gradients, u_transformed, + mesh::Union{TreeMesh{3}, P4estMesh{3}}, + equations_parabolic::AbstractEquationsParabolic, + dg::DG, cache) gradients_x, gradients_y, gradients_z = gradients - flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous # output arrays + flux_parabolic_x, flux_parabolic_y, flux_parabolic_z = flux_parabolic # output arrays @threaded for element in eachelement(dg, cache) for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) @@ -304,21 +308,24 @@ function calc_viscous_fluxes!(flux_viscous, gradients_3_node = get_node_vars(gradients_z, 
equations_parabolic, dg, i, j, k, element) - # Calculate viscous flux and store each component for later use - flux_viscous_node_x = flux(u_node, - (gradients_1_node, gradients_2_node, - gradients_3_node), 1, equations_parabolic) - flux_viscous_node_y = flux(u_node, - (gradients_1_node, gradients_2_node, - gradients_3_node), 2, equations_parabolic) - flux_viscous_node_z = flux(u_node, - (gradients_1_node, gradients_2_node, - gradients_3_node), 3, equations_parabolic) - set_node_vars!(flux_viscous_x, flux_viscous_node_x, equations_parabolic, dg, + # Calculate parabolic flux and store each component for later use + flux_parabolic_node_x = flux(u_node, + (gradients_1_node, gradients_2_node, + gradients_3_node), 1, equations_parabolic) + flux_parabolic_node_y = flux(u_node, + (gradients_1_node, gradients_2_node, + gradients_3_node), 2, equations_parabolic) + flux_parabolic_node_z = flux(u_node, + (gradients_1_node, gradients_2_node, + gradients_3_node), 3, equations_parabolic) + set_node_vars!(flux_parabolic_x, flux_parabolic_node_x, + equations_parabolic, dg, i, j, k, element) - set_node_vars!(flux_viscous_y, flux_viscous_node_y, equations_parabolic, dg, + set_node_vars!(flux_parabolic_y, flux_parabolic_node_y, + equations_parabolic, dg, i, j, k, element) - set_node_vars!(flux_viscous_z, flux_viscous_node_z, equations_parabolic, dg, + set_node_vars!(flux_parabolic_z, flux_parabolic_node_z, + equations_parabolic, dg, i, j, k, element) end end @@ -506,7 +513,7 @@ function calc_boundary_flux_by_direction_divergence!(surface_flux_values::Abstra @unpack surface_flux = surface_integral # Note: cache.boundaries.u contains the unsigned normal component (using "orientation", not "direction") - # of the viscous flux, as computed in `prolong2boundaries!` + # of the parabolic flux, as computed in `prolong2boundaries!` @unpack u, neighbor_ids, neighbor_sides, node_coordinates, orientations = cache.boundaries @threaded for boundary in first_boundary:last_boundary @@ -514,7 +521,7 
@@ function calc_boundary_flux_by_direction_divergence!(surface_flux_values::Abstra neighbor = neighbor_ids[boundary] for j in eachnode(dg), i in eachnode(dg) - # Get viscous boundary fluxes + # Get parabolic boundary fluxes flux_ll, flux_rr = get_surface_node_vars(u, equations_parabolic, dg, i, j, boundary) if neighbor_sides[boundary] == 1 # Element is on the left, boundary on the right @@ -546,17 +553,17 @@ function calc_boundary_flux_by_direction_divergence!(surface_flux_values::Abstra return nothing end -# Specialization `flux_viscous::Tuple` needed to +# Specialization `flux_parabolic::Tuple` needed to # avoid amibiguity with the hyperbolic version of `prolong2mortars!` in dg_3d.jl # which is for the variables itself, i.e., `u::Array{uEltype, 5}`. -function prolong2mortars!(cache, flux_viscous::Tuple, +function prolong2mortars!(cache, flux_parabolic::Tuple, mesh::TreeMesh{3}, equations_parabolic::AbstractEquationsParabolic, mortar_l2::LobattoLegendreMortarL2, dg::DGSEM) # temporary buffer for projections @unpack fstar_tmp1_threaded = cache - flux_viscous_x, flux_viscous_y, flux_viscous_z = flux_viscous + flux_parabolic_x, flux_parabolic_y, flux_parabolic_z = flux_parabolic @threaded for mortar in eachmortar(dg, cache) fstar_tmp1 = fstar_tmp1_threaded[Threads.threadid()] @@ -572,78 +579,78 @@ function prolong2mortars!(cache, flux_viscous::Tuple, # L2 mortars in x-direction for k in eachnode(dg), j in eachnode(dg) for v in eachvariable(equations_parabolic) - cache.mortars.u_upper_left[2, v, j, k, mortar] = flux_viscous_x[v, - 1, - j, - k, - upper_left_element] - cache.mortars.u_upper_right[2, v, j, k, mortar] = flux_viscous_x[v, - 1, - j, - k, - upper_right_element] - cache.mortars.u_lower_left[2, v, j, k, mortar] = flux_viscous_x[v, - 1, - j, - k, - lower_left_element] - cache.mortars.u_lower_right[2, v, j, k, mortar] = flux_viscous_x[v, - 1, - j, - k, - lower_right_element] + cache.mortars.u_upper_left[2, v, j, k, mortar] = flux_parabolic_x[v, + 1, + j, + k, + 
upper_left_element] + cache.mortars.u_upper_right[2, v, j, k, mortar] = flux_parabolic_x[v, + 1, + j, + k, + upper_right_element] + cache.mortars.u_lower_left[2, v, j, k, mortar] = flux_parabolic_x[v, + 1, + j, + k, + lower_left_element] + cache.mortars.u_lower_right[2, v, j, k, mortar] = flux_parabolic_x[v, + 1, + j, + k, + lower_right_element] end end elseif cache.mortars.orientations[mortar] == 2 # L2 mortars in y-direction for k in eachnode(dg), i in eachnode(dg) for v in eachvariable(equations_parabolic) - cache.mortars.u_upper_left[2, v, i, k, mortar] = flux_viscous_y[v, - i, - 1, - k, - upper_left_element] - cache.mortars.u_upper_right[2, v, i, k, mortar] = flux_viscous_y[v, - i, - 1, - k, - upper_right_element] - cache.mortars.u_lower_left[2, v, i, k, mortar] = flux_viscous_y[v, - i, - 1, - k, - lower_left_element] - cache.mortars.u_lower_right[2, v, i, k, mortar] = flux_viscous_y[v, - i, - 1, - k, - lower_right_element] + cache.mortars.u_upper_left[2, v, i, k, mortar] = flux_parabolic_y[v, + i, + 1, + k, + upper_left_element] + cache.mortars.u_upper_right[2, v, i, k, mortar] = flux_parabolic_y[v, + i, + 1, + k, + upper_right_element] + cache.mortars.u_lower_left[2, v, i, k, mortar] = flux_parabolic_y[v, + i, + 1, + k, + lower_left_element] + cache.mortars.u_lower_right[2, v, i, k, mortar] = flux_parabolic_y[v, + i, + 1, + k, + lower_right_element] end end else # orientations[mortar] == 3 # L2 mortars in z-direction for j in eachnode(dg), i in eachnode(dg) for v in eachvariable(equations_parabolic) - cache.mortars.u_upper_left[2, v, i, j, mortar] = flux_viscous_z[v, - i, - j, - 1, - upper_left_element] - cache.mortars.u_upper_right[2, v, i, j, mortar] = flux_viscous_z[v, - i, - j, - 1, - upper_right_element] - cache.mortars.u_lower_left[2, v, i, j, mortar] = flux_viscous_z[v, - i, - j, - 1, - lower_left_element] - cache.mortars.u_lower_right[2, v, i, j, mortar] = flux_viscous_z[v, - i, - j, - 1, - lower_right_element] + cache.mortars.u_upper_left[2, v, i, 
j, mortar] = flux_parabolic_z[v, + i, + j, + 1, + upper_left_element] + cache.mortars.u_upper_right[2, v, i, j, mortar] = flux_parabolic_z[v, + i, + j, + 1, + upper_right_element] + cache.mortars.u_lower_left[2, v, i, j, mortar] = flux_parabolic_z[v, + i, + j, + 1, + lower_left_element] + cache.mortars.u_lower_right[2, v, i, j, mortar] = flux_parabolic_z[v, + i, + j, + 1, + lower_right_element] end end end @@ -652,78 +659,78 @@ function prolong2mortars!(cache, flux_viscous::Tuple, # L2 mortars in x-direction for k in eachnode(dg), j in eachnode(dg) for v in eachvariable(equations_parabolic) - cache.mortars.u_upper_left[1, v, j, k, mortar] = flux_viscous_x[v, - nnodes(dg), - j, - k, - upper_left_element] - cache.mortars.u_upper_right[1, v, j, k, mortar] = flux_viscous_x[v, - nnodes(dg), - j, - k, - upper_right_element] - cache.mortars.u_lower_left[1, v, j, k, mortar] = flux_viscous_x[v, - nnodes(dg), - j, - k, - lower_left_element] - cache.mortars.u_lower_right[1, v, j, k, mortar] = flux_viscous_x[v, - nnodes(dg), - j, - k, - lower_right_element] + cache.mortars.u_upper_left[1, v, j, k, mortar] = flux_parabolic_x[v, + nnodes(dg), + j, + k, + upper_left_element] + cache.mortars.u_upper_right[1, v, j, k, mortar] = flux_parabolic_x[v, + nnodes(dg), + j, + k, + upper_right_element] + cache.mortars.u_lower_left[1, v, j, k, mortar] = flux_parabolic_x[v, + nnodes(dg), + j, + k, + lower_left_element] + cache.mortars.u_lower_right[1, v, j, k, mortar] = flux_parabolic_x[v, + nnodes(dg), + j, + k, + lower_right_element] end end elseif cache.mortars.orientations[mortar] == 2 # L2 mortars in y-direction for k in eachnode(dg), i in eachnode(dg) for v in eachvariable(equations_parabolic) - cache.mortars.u_upper_left[1, v, i, k, mortar] = flux_viscous_y[v, - i, - nnodes(dg), - k, - upper_left_element] - cache.mortars.u_upper_right[1, v, i, k, mortar] = flux_viscous_y[v, - i, - nnodes(dg), - k, - upper_right_element] - cache.mortars.u_lower_left[1, v, i, k, mortar] = 
flux_viscous_y[v, - i, - nnodes(dg), - k, - lower_left_element] - cache.mortars.u_lower_right[1, v, i, k, mortar] = flux_viscous_y[v, - i, - nnodes(dg), - k, - lower_right_element] + cache.mortars.u_upper_left[1, v, i, k, mortar] = flux_parabolic_y[v, + i, + nnodes(dg), + k, + upper_left_element] + cache.mortars.u_upper_right[1, v, i, k, mortar] = flux_parabolic_y[v, + i, + nnodes(dg), + k, + upper_right_element] + cache.mortars.u_lower_left[1, v, i, k, mortar] = flux_parabolic_y[v, + i, + nnodes(dg), + k, + lower_left_element] + cache.mortars.u_lower_right[1, v, i, k, mortar] = flux_parabolic_y[v, + i, + nnodes(dg), + k, + lower_right_element] end end else # if cache.mortars.orientations[mortar] == 3 # L2 mortars in z-direction for j in eachnode(dg), i in eachnode(dg) for v in eachvariable(equations_parabolic) - cache.mortars.u_upper_left[1, v, i, j, mortar] = flux_viscous_z[v, - i, - j, - nnodes(dg), - upper_left_element] - cache.mortars.u_upper_right[1, v, i, j, mortar] = flux_viscous_z[v, - i, - j, - nnodes(dg), - upper_right_element] - cache.mortars.u_lower_left[1, v, i, j, mortar] = flux_viscous_z[v, - i, - j, - nnodes(dg), - lower_left_element] - cache.mortars.u_lower_right[1, v, i, j, mortar] = flux_viscous_z[v, - i, - j, - nnodes(dg), - lower_right_element] + cache.mortars.u_upper_left[1, v, i, j, mortar] = flux_parabolic_z[v, + i, + j, + nnodes(dg), + upper_left_element] + cache.mortars.u_upper_right[1, v, i, j, mortar] = flux_parabolic_z[v, + i, + j, + nnodes(dg), + upper_right_element] + cache.mortars.u_lower_left[1, v, i, j, mortar] = flux_parabolic_z[v, + i, + j, + nnodes(dg), + lower_left_element] + cache.mortars.u_lower_right[1, v, i, j, mortar] = flux_parabolic_z[v, + i, + j, + nnodes(dg), + lower_right_element] end end end @@ -734,17 +741,17 @@ function prolong2mortars!(cache, flux_viscous::Tuple, leftright = 1 if cache.mortars.orientations[mortar] == 1 # L2 mortars in x-direction - u_large = view(flux_viscous_x, :, nnodes(dg), :, :, 
large_element) + u_large = view(flux_parabolic_x, :, nnodes(dg), :, :, large_element) element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright, mortar, u_large, fstar_tmp1) elseif cache.mortars.orientations[mortar] == 2 # L2 mortars in y-direction - u_large = view(flux_viscous_y, :, :, nnodes(dg), :, large_element) + u_large = view(flux_parabolic_y, :, :, nnodes(dg), :, large_element) element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright, mortar, u_large, fstar_tmp1) else # cache.mortars.orientations[mortar] == 3 # L2 mortars in z-direction - u_large = view(flux_viscous_z, :, :, :, nnodes(dg), large_element) + u_large = view(flux_parabolic_z, :, :, :, nnodes(dg), large_element) element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright, mortar, u_large, fstar_tmp1) end @@ -752,17 +759,17 @@ function prolong2mortars!(cache, flux_viscous::Tuple, leftright = 2 if cache.mortars.orientations[mortar] == 1 # L2 mortars in x-direction - u_large = view(flux_viscous_x, :, 1, :, :, large_element) + u_large = view(flux_parabolic_x, :, 1, :, :, large_element) element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright, mortar, u_large, fstar_tmp1) elseif cache.mortars.orientations[mortar] == 2 # L2 mortars in y-direction - u_large = view(flux_viscous_y, :, :, 1, :, large_element) + u_large = view(flux_parabolic_y, :, :, 1, :, large_element) element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright, mortar, u_large, fstar_tmp1) else # cache.mortars.orientations[mortar] == 3 # L2 mortars in z-direction - u_large = view(flux_viscous_z, :, :, :, 1, large_element) + u_large = view(flux_parabolic_z, :, :, :, 1, large_element) element_solutions_to_mortars!(cache.mortars, mortar_l2, leftright, mortar, u_large, fstar_tmp1) end @@ -1124,7 +1131,7 @@ end # Needed to *not* flip the sign of the inverse Jacobian. 
# This is because the parabolic fluxes are assumed to be of the form # `du/dt + df/dx = dg/dx + source(x,t)`, -# where f(u) is the inviscid flux and g(u) is the viscous flux. +# where f(u) is the inviscid flux and g(u) is the parabolic flux. function apply_jacobian_parabolic!(du::AbstractArray, mesh::TreeMesh{3}, equations_parabolic::AbstractEquationsParabolic, dg::DG, cache) diff --git a/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl b/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl new file mode 100644 index 00000000000..203fa69b89c --- /dev/null +++ b/src/solvers/dgsem_tree/dg_3d_subcell_limiters.jl @@ -0,0 +1,319 @@ +# By default, Julia/LLVM does not use fused multiply-add operations (FMAs). +# Since these FMAs can increase the performance of many numerical algorithms, +# we need to opt-in explicitly. +# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. +@muladd begin +#! format: noindent + +function create_cache(mesh::Union{TreeMesh{3}, P4estMesh{3}}, + equations, volume_integral::VolumeIntegralSubcellLimiting, + dg::DG, cache_containers, uEltype) + cache = create_cache(mesh, equations, + VolumeIntegralPureLGLFiniteVolume(volume_integral.volume_flux_fv), + dg, cache_containers, uEltype) + + fhat1_L_threaded, fhat1_R_threaded, + fhat2_L_threaded, fhat2_R_threaded, + fhat3_L_threaded, fhat3_R_threaded = create_f_threaded(mesh, equations, dg, uEltype) + + A4d = Array{uEltype, 4} + flux_temp_threaded = A4d[A4d(undef, nvariables(equations), + nnodes(dg), nnodes(dg), nnodes(dg)) + for _ in 1:Threads.maxthreadid()] + fhat_temp_threaded = A4d[A4d(undef, nvariables(equations), + nnodes(dg), nnodes(dg), nnodes(dg)) + for _ in 1:Threads.maxthreadid()] + + n_elements = nelements(cache_containers.elements) + antidiffusive_fluxes = ContainerAntidiffusiveFlux3D{uEltype}(n_elements, + nvariables(equations), + nnodes(dg)) + + if have_nonconservative_terms(equations) == true + A5d = Array{uEltype, 5} + # Extract the nonconservative flux as a dispatch 
argument for `n_nonconservative_terms` + _, volume_flux_noncons = volume_integral.volume_flux_dg + + flux_nonconservative_temp_threaded = A5d[A5d(undef, nvariables(equations), + n_nonconservative_terms(volume_flux_noncons), + nnodes(dg), nnodes(dg), + nnodes(dg)) + for _ in 1:Threads.maxthreadid()] + fhat_nonconservative_temp_threaded = A5d[A5d(undef, nvariables(equations), + n_nonconservative_terms(volume_flux_noncons), + nnodes(dg), nnodes(dg), + nnodes(dg)) + for _ in 1:Threads.maxthreadid()] + phi_threaded = A5d[A5d(undef, nvariables(equations), + n_nonconservative_terms(volume_flux_noncons), + nnodes(dg), nnodes(dg), nnodes(dg)) + for _ in 1:Threads.maxthreadid()] + cache = (; cache..., flux_nonconservative_temp_threaded, + fhat_nonconservative_temp_threaded, phi_threaded) + end + + return (; cache..., antidiffusive_fluxes, + fhat1_L_threaded, fhat1_R_threaded, + fhat2_L_threaded, fhat2_R_threaded, + fhat3_L_threaded, fhat3_R_threaded, + flux_temp_threaded, fhat_temp_threaded) +end + +# Subcell limiting currently only implemented for certain mesh types +@inline function volume_integral_kernel!(du, u, element, + MeshT::Type{<:Union{TreeMesh{3}, P4estMesh{3}}}, + nonconservative_terms, equations, + volume_integral::VolumeIntegralSubcellLimiting, + limiter::SubcellLimiterIDP, + dg::DGSEM, cache) + @unpack inverse_weights = dg.basis # Plays role of DG subcell sizes + @unpack volume_flux_dg, volume_flux_fv, limiter = volume_integral + + # high-order DG fluxes + @unpack fhat1_L_threaded, fhat1_R_threaded, fhat2_L_threaded, fhat2_R_threaded, fhat3_L_threaded, fhat3_R_threaded = cache + + fhat1_L = fhat1_L_threaded[Threads.threadid()] + fhat1_R = fhat1_R_threaded[Threads.threadid()] + fhat2_L = fhat2_L_threaded[Threads.threadid()] + fhat2_R = fhat2_R_threaded[Threads.threadid()] + fhat3_L = fhat3_L_threaded[Threads.threadid()] + fhat3_R = fhat3_R_threaded[Threads.threadid()] + calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R, + u, MeshT, 
nonconservative_terms, equations, volume_flux_dg, + dg, element, cache) + + # low-order FV fluxes + @unpack fstar1_L_threaded, fstar1_R_threaded, fstar2_L_threaded, fstar2_R_threaded, fstar3_L_threaded, fstar3_R_threaded = cache + + fstar1_L = fstar1_L_threaded[Threads.threadid()] + fstar1_R = fstar1_R_threaded[Threads.threadid()] + fstar2_L = fstar2_L_threaded[Threads.threadid()] + fstar2_R = fstar2_R_threaded[Threads.threadid()] + fstar3_L = fstar3_L_threaded[Threads.threadid()] + fstar3_R = fstar3_R_threaded[Threads.threadid()] + calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, + u, MeshT, nonconservative_terms, equations, volume_flux_fv, + dg, element, cache) + + # antidiffusive flux + calcflux_antidiffusive!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R, + fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, + u, MeshT, nonconservative_terms, equations, limiter, + dg, element, cache) + + # Calculate volume integral contribution of low-order FV flux + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + for v in eachvariable(equations) + du[v, i, j, k, element] += inverse_weights[i] * + (fstar1_L[v, i + 1, j, k] - fstar1_R[v, i, j, k]) + + inverse_weights[j] * + (fstar2_L[v, i, j + 1, k] - fstar2_R[v, i, j, k]) + + inverse_weights[k] * + (fstar3_L[v, i, j, k + 1] - fstar3_R[v, i, j, k]) + end + end + + return nothing +end + +# Calculate the DG staggered volume fluxes `fhat` in subcell FV-form inside the element +# (**without non-conservative terms**). +# +# See also `flux_differencing_kernel!`. 
+@inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R, + u, ::Type{<:TreeMesh{3}}, + have_nonconservative_terms::False, equations, + volume_flux, dg::DGSEM, element, cache) + @unpack weights, derivative_split = dg.basis + @unpack flux_temp_threaded = cache + + flux_temp = flux_temp_threaded[Threads.threadid()] + + # The FV-form fluxes are calculated in a recursive manner, i.e.: + # fhat_(0,1) = w_0 * FVol_0, + # fhat_(j,j+1) = fhat_(j-1,j) + w_j * FVol_j, for j=1,...,N-1, + # with the split form volume fluxes FVol_j = -2 * sum_i=0^N D_ji f*_(j,i). + + # To use the symmetry of the `volume_flux`, the split form volume flux is precalculated + # like in `calc_volume_integral!` for the `VolumeIntegralFluxDifferencing` + # and saved in `flux_temp`. + + # Split form volume flux in orientation 1: x direction + flux_temp .= zero(eltype(flux_temp)) + + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + u_node = get_node_vars(u, equations, dg, i, j, k, element) + + # All diagonal entries of `derivative_split` are zero. Thus, we can skip + # the computation of the diagonal terms. In addition, we use the symmetry + # of the `volume_flux` to save half of the possible two-point flux + # computations.
+ for ii in (i + 1):nnodes(dg) + u_node_ii = get_node_vars(u, equations, dg, ii, j, k, element) + flux1 = volume_flux(u_node, u_node_ii, 1, equations) + multiply_add_to_node_vars!(flux_temp, derivative_split[i, ii], flux1, + equations, dg, i, j, k) + multiply_add_to_node_vars!(flux_temp, derivative_split[ii, i], flux1, + equations, dg, ii, j, k) + end + end + + # FV-form flux `fhat` in x direction + for k in eachnode(dg), j in eachnode(dg), i in 1:(nnodes(dg) - 1) + for v in eachvariable(equations) + fhat1_L[v, i + 1, j, k] = fhat1_L[v, i, j, k] + + weights[i] * flux_temp[v, i, j, k] + fhat1_R[v, i + 1, j, k] = fhat1_L[v, i + 1, j, k] + end + end + + # Split form volume flux in orientation 2: y direction + flux_temp .= zero(eltype(flux_temp)) + + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + u_node = get_node_vars(u, equations, dg, i, j, k, element) + for jj in (j + 1):nnodes(dg) + u_node_jj = get_node_vars(u, equations, dg, i, jj, k, element) + flux2 = volume_flux(u_node, u_node_jj, 2, equations) + multiply_add_to_node_vars!(flux_temp, derivative_split[j, jj], flux2, + equations, dg, i, j, k) + multiply_add_to_node_vars!(flux_temp, derivative_split[jj, j], flux2, + equations, dg, i, jj, k) + end + end + + # FV-form flux `fhat` in y direction + for k in eachnode(dg), j in 1:(nnodes(dg) - 1), i in eachnode(dg) + for v in eachvariable(equations) + fhat2_L[v, i, j + 1, k] = fhat2_L[v, i, j, k] + + weights[j] * flux_temp[v, i, j, k] + fhat2_R[v, i, j + 1, k] = fhat2_L[v, i, j + 1, k] + end + end + + # Split form volume flux in orientation 3: z direction + flux_temp .= zero(eltype(flux_temp)) + + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + u_node = get_node_vars(u, equations, dg, i, j, k, element) + for kk in (k + 1):nnodes(dg) + u_node_kk = get_node_vars(u, equations, dg, i, j, kk, element) + flux3 = volume_flux(u_node, u_node_kk, 3, equations) + multiply_add_to_node_vars!(flux_temp, derivative_split[k, kk], flux3, + equations, dg, i, 
j, k) + multiply_add_to_node_vars!(flux_temp, derivative_split[kk, k], flux3, + equations, dg, i, j, kk) + end + end + + # FV-form flux `fhat` in z direction + for k in 1:(nnodes(dg) - 1), j in eachnode(dg), i in eachnode(dg) + for v in eachvariable(equations) + fhat3_L[v, i, j, k + 1] = fhat3_L[v, i, j, k] + + weights[k] * flux_temp[v, i, j, k] + fhat3_R[v, i, j, k + 1] = fhat3_L[v, i, j, k + 1] + end + end + + return nothing +end + +# Calculate the antidiffusive flux `antidiffusive_flux` as the subtraction between `fhat` and `fstar` for conservative systems. +@inline function calcflux_antidiffusive!(fhat1_L, fhat1_R, + fhat2_L, fhat2_R, + fhat3_L, fhat3_R, + fstar1_L, fstar1_R, + fstar2_L, fstar2_R, + fstar3_L, fstar3_R, + u, ::Type{<:Union{TreeMesh{3}, P4estMesh{3}}}, + nonconservative_terms::False, equations, + limiter::SubcellLimiterIDP, dg, element, cache) + @unpack antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R = cache.antidiffusive_fluxes + + # Due to the use of LGL nodes, the DG staggered fluxes `fhat` and FV fluxes `fstar` are equal + # on the element interfaces. So, they are not computed in the volume integral and set to zero + # in their respective computation. + # The antidiffusive fluxes are therefore zero on the element interfaces and don't need to be + # computed either. They are set to zero directly after resizing the container. + # This applies to the indices `i=1` and `i=nnodes(dg)+1` for `antidiffusive_flux1_L` and + # `antidiffusive_flux1_R` and analogously for the other two directions. 
+ + for k in eachnode(dg), j in eachnode(dg), i in 2:nnodes(dg) + for v in eachvariable(equations) + antidiffusive_flux1_L[v, i, j, k, element] = fhat1_L[v, i, j, k] - + fstar1_L[v, i, j, k] + antidiffusive_flux1_R[v, i, j, k, element] = antidiffusive_flux1_L[v, + i, j, k, + element] + end + end + for k in eachnode(dg), j in 2:nnodes(dg), i in eachnode(dg) + for v in eachvariable(equations) + antidiffusive_flux2_L[v, i, j, k, element] = fhat2_L[v, i, j, k] - + fstar2_L[v, i, j, k] + antidiffusive_flux2_R[v, i, j, k, element] = antidiffusive_flux2_L[v, + i, j, k, + element] + end + end + for k in 2:nnodes(dg), j in eachnode(dg), i in eachnode(dg) + for v in eachvariable(equations) + antidiffusive_flux3_L[v, i, j, k, element] = fhat3_L[v, i, j, k] - + fstar3_L[v, i, j, k] + antidiffusive_flux3_R[v, i, j, k, element] = antidiffusive_flux3_L[v, + i, j, k, + element] + end + end + + return nothing +end + +# Calculate the antidiffusive flux `antidiffusive_flux` as the subtraction between `fhat` and `fstar` for conservative systems. +@inline function calcflux_antidiffusive!(fhat1_L, fhat1_R, + fhat2_L, fhat2_R, + fhat3_L, fhat3_R, + fstar1_L, fstar1_R, + fstar2_L, fstar2_R, + fstar3_L, fstar3_R, + u, ::Type{<:Union{TreeMesh{3}, P4estMesh{3}}}, + nonconservative_terms::True, equations, + limiter::SubcellLimiterIDP, dg, element, cache) + @unpack antidiffusive_flux1_L, antidiffusive_flux2_L, antidiffusive_flux1_R, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R = cache.antidiffusive_fluxes + + # Due to the use of LGL nodes, the DG staggered fluxes `fhat` and FV fluxes `fstar` are equal + # on the element interfaces. So, they are not computed in the volume integral and set to zero + # in their respective computation. + # The antidiffusive fluxes are therefore zero on the element interfaces and don't need to be + # computed either. They are set to zero directly after resizing the container. 
+ # This applies to the indices `i=1` and `i=nnodes(dg)+1` for `antidiffusive_flux1_L` and + # `antidiffusive_flux1_R` and analogously for the other two directions. + + for k in eachnode(dg), j in eachnode(dg), i in 2:nnodes(dg) + for v in eachvariable(equations) + antidiffusive_flux1_L[v, i, j, k, element] = fhat1_L[v, i, j, k] - + fstar1_L[v, i, j, k] + antidiffusive_flux1_R[v, i, j, k, element] = fhat1_R[v, i, j, k] - + fstar1_R[v, i, j, k] + end + end + for k in eachnode(dg), j in 2:nnodes(dg), i in eachnode(dg) + for v in eachvariable(equations) + antidiffusive_flux2_L[v, i, j, k, element] = fhat2_L[v, i, j, k] - + fstar2_L[v, i, j, k] + antidiffusive_flux2_R[v, i, j, k, element] = fhat2_R[v, i, j, k] - + fstar2_R[v, i, j, k] + end + end + for k in 2:nnodes(dg), j in eachnode(dg), i in eachnode(dg) + for v in eachvariable(equations) + antidiffusive_flux3_L[v, i, j, k, element] = fhat3_L[v, i, j, k] - + fstar3_L[v, i, j, k] + antidiffusive_flux3_R[v, i, j, k, element] = fhat3_R[v, i, j, k] - + fstar3_R[v, i, j, k] + end + end + + return nothing +end +end # @muladd diff --git a/src/solvers/dgsem_tree/subcell_limiters.jl b/src/solvers/dgsem_tree/subcell_limiters.jl index 1a921bc7d80..9046d69bea0 100644 --- a/src/solvers/dgsem_tree/subcell_limiters.jl +++ b/src/solvers/dgsem_tree/subcell_limiters.jl @@ -274,6 +274,8 @@ end function create_cache(limiter::Type{SubcellLimiterIDP}, equations::AbstractEquations{NDIMS}, basis::LobattoLegendreBasis, bound_keys, bar_states) where {NDIMS} + # The number of elements is not yet known here. So, we initialize the container with 0 elements + # and resize it later while initializing the time integration method in `methods_SSP.jl`. 
subcell_limiter_coefficients = Trixi.ContainerSubcellLimiterIDP{NDIMS, real(basis)}(0, nnodes(basis), bound_keys) @@ -800,4 +802,51 @@ function get_node_variable(::Val{:limiting_coefficient_mean_entropy}, u, (; alpha_mean_entropy) = limiter.cache.subcell_limiter_coefficients return alpha_mean_entropy end + +############################################################################### +# Auxiliary routine `get_boundary_outer_state` for non-periodic domains + +""" + get_boundary_outer_state(u_inner, t, + boundary_condition::BoundaryConditionDirichlet, + orientation_or_normal, direction, + mesh, equations, dg, cache, indices...) +For subcell limiting, the calculation of local bounds for non-periodic domains requires the boundary +outer state. This function returns the boundary value for [`BoundaryConditionDirichlet`](@ref) at +time `t` and for node with spatial indices `indices` at the boundary with `orientation_or_normal` +and `direction`. + +Should be used together with [`TreeMesh`](@ref) or [`StructuredMesh`](@ref). + +!!! warning "Experimental implementation" + This is an experimental feature and may change in future releases. +""" +@inline function get_boundary_outer_state(u_inner, t, + boundary_condition::BoundaryConditionDirichlet, + orientation_or_normal, direction, + mesh::Union{TreeMesh, StructuredMesh}, + equations, dg, cache, indices...) + (; node_coordinates) = cache.elements + + x = get_node_coords(node_coordinates, equations, dg, indices...) + u_outer = boundary_condition.boundary_value_function(x, t, equations) + + return u_outer +end + +@inline function get_boundary_outer_state(u_inner, t, + boundary_condition::BoundaryConditionCharacteristic, + orientation_or_normal, direction, + mesh::Union{TreeMesh, StructuredMesh}, + equations, + dg, cache, indices...) + (; node_coordinates) = cache.elements + + x = get_node_coords(node_coordinates, equations, dg, indices...) 
+ u_outer = boundary_condition.boundary_value_function(boundary_condition.outer_boundary_value_function, + u_inner, orientation_or_normal, + direction, x, t, equations) + + return u_outer +end end # @muladd diff --git a/src/solvers/dgsem_tree/subcell_limiters_2d.jl b/src/solvers/dgsem_tree/subcell_limiters_2d.jl index 44e7fc61a25..0c2330953c9 100644 --- a/src/solvers/dgsem_tree/subcell_limiters_2d.jl +++ b/src/solvers/dgsem_tree/subcell_limiters_2d.jl @@ -61,28 +61,34 @@ end # Calc bounds at interfaces and periodic boundaries for interface in eachinterface(dg, cache) # Get neighboring element ids - left = cache.interfaces.neighbor_ids[1, interface] - right = cache.interfaces.neighbor_ids[2, interface] + left_element = cache.interfaces.neighbor_ids[1, interface] + right_element = cache.interfaces.neighbor_ids[2, interface] orientation = cache.interfaces.orientations[interface] for i in eachnode(dg) - index_left = (nnodes(dg), i) - index_right = (1, i) - if orientation == 2 - index_left = reverse(index_left) - index_right = reverse(index_right) + # Define node indices for left and right element based on the interface orientation + if orientation == 1 + index_left = (nnodes(dg), i) + index_right = (1, i) + else # if orientation == 2 + index_left = (i, nnodes(dg)) + index_right = (i, 1) end - var_left = u[variable, index_left..., left] - var_right = u[variable, index_right..., right] - - var_min[index_right..., right] = min(var_min[index_right..., right], - var_left) - var_max[index_right..., right] = max(var_max[index_right..., right], - var_left) - - var_min[index_left..., left] = min(var_min[index_left..., left], var_right) - var_max[index_left..., left] = max(var_max[index_left..., left], var_right) + var_left = u[variable, index_left..., left_element] + var_right = u[variable, index_right..., right_element] + + var_min[index_right..., right_element] = min(var_min[index_right..., + right_element], + var_left) + var_max[index_right..., right_element] = 
max(var_max[index_right..., + right_element], + var_left) + + var_min[index_left..., left_element] = min(var_min[index_left..., + left_element], var_right) + var_max[index_left..., left_element] = max(var_max[index_left..., + left_element], var_right) end end @@ -94,26 +100,28 @@ end for i in eachnode(dg) if neighbor_side == 2 # Element is on the right, boundary on the left - index = (1, i) + node_index = (1, i) boundary_index = 1 else # Element is on the left, boundary on the right - index = (nnodes(dg), i) + node_index = (nnodes(dg), i) boundary_index = 2 end if orientation == 2 - index = reverse(index) + node_index = reverse(node_index) boundary_index += 2 end - u_inner = get_node_vars(u, equations, dg, index..., element) + u_inner = get_node_vars(u, equations, dg, node_index..., element) u_outer = get_boundary_outer_state(u_inner, t, boundary_conditions[boundary_index], orientation, boundary_index, mesh, equations, dg, cache, - index..., element) + node_index..., element) var_outer = u_outer[variable] - var_min[index..., element] = min(var_min[index..., element], var_outer) - var_max[index..., element] = max(var_max[index..., element], var_outer) + var_min[node_index..., element] = min(var_min[node_index..., element], + var_outer) + var_max[node_index..., element] = max(var_max[node_index..., element], + var_outer) end end @@ -175,26 +183,32 @@ end # Calc bounds at interfaces and periodic boundaries for interface in eachinterface(dg, cache) # Get neighboring element ids - left = cache.interfaces.neighbor_ids[1, interface] - right = cache.interfaces.neighbor_ids[2, interface] + left_element = cache.interfaces.neighbor_ids[1, interface] + right_element = cache.interfaces.neighbor_ids[2, interface] orientation = cache.interfaces.orientations[interface] for i in eachnode(dg) - index_left = (nnodes(dg), i) - index_right = (1, i) - if orientation == 2 - index_left = reverse(index_left) - index_right = reverse(index_right) + # Define node indices for left and right 
element based on the interface orientation + if orientation == 1 + index_left = (nnodes(dg), i) + index_right = (1, i) + else # if orientation == 2 + index_left = (i, nnodes(dg)) + index_right = (i, 1) end - var_left = variable(get_node_vars(u, equations, dg, index_left..., left), + var_left = variable(get_node_vars(u, equations, dg, index_left..., + left_element), equations) - var_right = variable(get_node_vars(u, equations, dg, index_right..., right), + var_right = variable(get_node_vars(u, equations, dg, index_right..., + right_element), equations) - var_minmax[index_right..., right] = min_or_max(var_minmax[index_right..., - right], var_left) - var_minmax[index_left..., left] = min_or_max(var_minmax[index_left..., - left], var_right) + var_minmax[index_right..., right_element] = min_or_max(var_minmax[index_right..., + right_element], + var_left) + var_minmax[index_left..., left_element] = min_or_max(var_minmax[index_left..., + left_element], + var_right) end end @@ -206,26 +220,27 @@ end for i in eachnode(dg) if neighbor_side == 2 # Element is on the right, boundary on the left - index = (1, i) + node_index = (1, i) boundary_index = 1 else # Element is on the left, boundary on the right - index = (nnodes(dg), i) + node_index = (nnodes(dg), i) boundary_index = 2 end if orientation == 2 - index = reverse(index) + node_index = reverse(node_index) boundary_index += 2 end - u_inner = get_node_vars(u, equations, dg, index..., element) + u_inner = get_node_vars(u, equations, dg, node_index..., element) u_outer = get_boundary_outer_state(u_inner, t, boundary_conditions[boundary_index], orientation, boundary_index, mesh, equations, dg, cache, - index..., element) + node_index..., element) var_outer = variable(u_outer, equations) - var_minmax[index..., element] = min_or_max(var_minmax[index..., element], - var_outer) + var_minmax[node_index..., element] = min_or_max(var_minmax[node_index..., + element], + var_outer) end end @@ -432,13 +447,13 @@ end 
############################################################################### # Newton-bisection method -# 2D version @inline function newton_loops_alpha!(alpha, bound, u, i, j, element, variable, min_or_max, initial_check, final_check, inverse_jacobian, dt, - equations, dg, cache, limiter) - (; inverse_weights) = dg.basis + equations::AbstractEquations{2}, + dg, cache, limiter) + (; inverse_weights) = dg.basis # Plays role of inverse DG-subcell sizes (; antidiffusive_flux1_L, antidiffusive_flux2_L, antidiffusive_flux1_R, antidiffusive_flux2_R) = cache.antidiffusive_fluxes (; gamma_constant_newton) = limiter diff --git a/src/solvers/dgsem_tree/subcell_limiters_3d.jl b/src/solvers/dgsem_tree/subcell_limiters_3d.jl new file mode 100644 index 00000000000..81b7b32c997 --- /dev/null +++ b/src/solvers/dgsem_tree/subcell_limiters_3d.jl @@ -0,0 +1,491 @@ +# By default, Julia/LLVM does not use fused multiply-add operations (FMAs). +# Since these FMAs can increase the performance of many numerical algorithms, +# we need to opt-in explicitly. +# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. +@muladd begin +#! 
format: noindent + +############################################################################### +# IDP Limiting +############################################################################### + +############################################################################### +# Calculation of local bounds using low-order FV solution + +@inline function calc_bounds_twosided!(var_min, var_max, variable, + u::AbstractArray{<:Any, 5}, t, semi, equations) + mesh, _, dg, cache = mesh_equations_solver_cache(semi) + # Calc bounds inside elements + @threaded for element in eachelement(dg, cache) + # Calculate bounds at Gauss-Lobatto nodes + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + var = u[variable, i, j, k, element] + var_min[i, j, k, element] = var + var_max[i, j, k, element] = var + end + + # Apply values in x direction + for k in eachnode(dg), j in eachnode(dg), i in 2:nnodes(dg) + var = u[variable, i - 1, j, k, element] + var_min[i, j, k, element] = min(var_min[i, j, k, element], var) + var_max[i, j, k, element] = max(var_max[i, j, k, element], var) + + var = u[variable, i, j, k, element] + var_min[i - 1, j, k, element] = min(var_min[i - 1, j, k, element], var) + var_max[i - 1, j, k, element] = max(var_max[i - 1, j, k, element], var) + end + + # Apply values in y direction + for k in eachnode(dg), j in 2:nnodes(dg), i in eachnode(dg) + var = u[variable, i, j - 1, k, element] + var_min[i, j, k, element] = min(var_min[i, j, k, element], var) + var_max[i, j, k, element] = max(var_max[i, j, k, element], var) + + var = u[variable, i, j, k, element] + var_min[i, j - 1, k, element] = min(var_min[i, j - 1, k, element], var) + var_max[i, j - 1, k, element] = max(var_max[i, j - 1, k, element], var) + end + + # Apply values in z direction + for k in 2:nnodes(dg), j in eachnode(dg), i in eachnode(dg) + var = u[variable, i, j, k - 1, element] + var_min[i, j, k, element] = min(var_min[i, j, k, element], var) + var_max[i, j, k, element] = max(var_max[i, j, k, 
element], var) + + var = u[variable, i, j, k, element] + var_min[i, j, k - 1, element] = min(var_min[i, j, k - 1, element], var) + var_max[i, j, k - 1, element] = max(var_max[i, j, k - 1, element], var) + end + end + + # Values at element boundary + calc_bounds_twosided_interface!(var_min, var_max, variable, + u, t, semi, mesh, equations) + return nothing +end + +@inline function calc_bounds_twosided_interface!(var_min, var_max, variable, + u, t, semi, mesh::TreeMesh3D, + equations) + _, _, dg, cache = mesh_equations_solver_cache(semi) + + # Calc bounds at interfaces and periodic boundaries + for interface in eachinterface(dg, cache) + # Get neighboring element ids + left_element = cache.interfaces.neighbor_ids[1, interface] + right_element = cache.interfaces.neighbor_ids[2, interface] + + orientation = cache.interfaces.orientations[interface] + + for j in eachnode(dg), i in eachnode(dg) + # Define node indices for left and right element based on the interface orientation + if orientation == 1 + # interface in x-direction + index_left = (nnodes(dg), i, j) + index_right = (1, i, j) + elseif orientation == 2 + # interface in y-direction + index_left = (i, nnodes(dg), j) + index_right = (i, 1, j) + else # if orientation == 3 + # interface in z-direction + index_left = (i, j, nnodes(dg)) + index_right = (i, j, 1) + end + var_left = u[variable, index_left..., left_element] + var_right = u[variable, index_right..., right_element] + + var_min[index_right..., right_element] = min(var_min[index_right..., + right_element], + var_left) + var_max[index_right..., right_element] = max(var_max[index_right..., + right_element], + var_left) + + var_min[index_left..., left_element] = min(var_min[index_left..., + left_element], var_right) + var_max[index_left..., left_element] = max(var_max[index_left..., + left_element], var_right) + end + end + + return nothing +end + +@inline function calc_bounds_onesided!(var_minmax, min_or_max, variable, + u::AbstractArray{<:Any, 5}, t, semi) + 
mesh, equations, dg, cache = mesh_equations_solver_cache(semi) + # Calc bounds inside elements + + # The approach used in `calc_bounds_twosided!` is not used here because it requires more + # evaluations of the variable and is therefore slower. + + @threaded for element in eachelement(dg, cache) + # Reset bounds + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + if min_or_max === max + var_minmax[i, j, k, element] = typemin(eltype(var_minmax)) + else + var_minmax[i, j, k, element] = typemax(eltype(var_minmax)) + end + end + + # Calculate bounds at Gauss-Lobatto nodes + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + var = variable(get_node_vars(u, equations, dg, i, j, k, element), equations) + var_minmax[i, j, k, element] = min_or_max(var_minmax[i, j, k, element], var) + + if i > 1 + var_minmax[i - 1, j, k, element] = min_or_max(var_minmax[i - 1, j, k, + element], var) + end + if i < nnodes(dg) + var_minmax[i + 1, j, k, element] = min_or_max(var_minmax[i + 1, j, k, + element], var) + end + if j > 1 + var_minmax[i, j - 1, k, element] = min_or_max(var_minmax[i, j - 1, k, + element], var) + end + if j < nnodes(dg) + var_minmax[i, j + 1, k, element] = min_or_max(var_minmax[i, j + 1, k, + element], var) + end + if k > 1 + var_minmax[i, j, k - 1, element] = min_or_max(var_minmax[i, j, k - 1, + element], var) + end + if k < nnodes(dg) + var_minmax[i, j, k + 1, element] = min_or_max(var_minmax[i, j, k + 1, + element], var) + end + end + end + + # Values at element boundary + calc_bounds_onesided_interface!(var_minmax, min_or_max, variable, u, t, semi, mesh) + + return nothing +end + +@inline function calc_bounds_onesided_interface!(var_minmax, min_or_max, variable, u, t, + semi, mesh::TreeMesh{3}) + _, equations, dg, cache = mesh_equations_solver_cache(semi) + + # Calc bounds at interfaces and periodic boundaries + for interface in eachinterface(dg, cache) + # Get neighboring element ids + left_element = cache.interfaces.neighbor_ids[1, 
interface] + right_element = cache.interfaces.neighbor_ids[2, interface] + + orientation = cache.interfaces.orientations[interface] + + for j in eachnode(dg), i in eachnode(dg) + # Define node indices for left and right element based on the interface orientation + if orientation == 1 + # interface in x-direction + index_left = (nnodes(dg), i, j) + index_right = (1, i, j) + elseif orientation == 2 + # interface in y-direction + index_left = (i, nnodes(dg), j) + index_right = (i, 1, j) + else # if orientation == 3 + # interface in z-direction + index_left = (i, j, nnodes(dg)) + index_right = (i, j, 1) + end + var_left = variable(get_node_vars(u, equations, dg, index_left..., + left_element), + equations) + var_right = variable(get_node_vars(u, equations, dg, index_right..., + right_element), + equations) + + var_minmax[index_right..., right_element] = min_or_max(var_minmax[index_right..., + right_element], + var_left) + var_minmax[index_left..., left_element] = min_or_max(var_minmax[index_left..., + left_element], + var_right) + end + end + + return nothing +end + +############################################################################### +# Local minimum and maximum limiting of conservative variables + +@inline function idp_local_twosided!(alpha, limiter, u::AbstractArray{<:Any, 5}, + t, dt, semi, elements, variable) + mesh, equations, dg, cache = mesh_equations_solver_cache(semi) + (; antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R) = cache.antidiffusive_fluxes + (; inverse_weights) = dg.basis + + (; variable_bounds) = limiter.cache.subcell_limiter_coefficients + variable_string = string(variable) + var_min = variable_bounds[Symbol(variable_string, "_min")] + var_max = variable_bounds[Symbol(variable_string, "_max")] + calc_bounds_twosided!(var_min, var_max, variable, u, t, semi, equations) + + @threaded for element in elements + for k in eachnode(dg), j in eachnode(dg), i 
in eachnode(dg)
+ inverse_jacobian = get_inverse_jacobian(cache.elements.inverse_jacobian,
+ mesh, i, j, k, element)
+ var = u[variable, i, j, k, element]
+ # Real Zalesak type limiter
+ # * Zalesak (1979). "Fully multidimensional flux-corrected transport algorithms for fluids"
+ # * Kuzmin et al. (2010). "Failsafe flux limiting and constrained data projections for equations of gas dynamics"
+ # Note: The Zalesak limiter has to be computed, even if the state is valid, because the correction is
+ # for each interface, not each node
+
+ Qp = max(0, (var_max[i, j, k, element] - var) / dt)
+ Qm = min(0, (var_min[i, j, k, element] - var) / dt)
+
+ # Calculate Pp and Pm
+ # Note: Boundaries of antidiffusive_flux1/2/3 are constant 0, so they make no difference here.
+ val_flux1_local = inverse_weights[i] *
+ antidiffusive_flux1_R[variable, i, j, k, element]
+ val_flux1_local_ip1 = -inverse_weights[i] *
+ antidiffusive_flux1_L[variable, i + 1, j, k, element]
+ val_flux2_local = inverse_weights[j] *
+ antidiffusive_flux2_R[variable, i, j, k, element]
+ val_flux2_local_jp1 = -inverse_weights[j] *
+ antidiffusive_flux2_L[variable, i, j + 1, k, element]
+ val_flux3_local = inverse_weights[k] *
+ antidiffusive_flux3_R[variable, i, j, k, element]
+ val_flux3_local_jp1 = -inverse_weights[k] *
+ antidiffusive_flux3_L[variable, i, j, k + 1, element]
+
+ Pp = max(0, val_flux1_local) + max(0, val_flux1_local_ip1) +
+ max(0, val_flux2_local) + max(0, val_flux2_local_jp1) +
+ max(0, val_flux3_local) + max(0, val_flux3_local_jp1)
+ Pm = min(0, val_flux1_local) + min(0, val_flux1_local_ip1) +
+ min(0, val_flux2_local) + min(0, val_flux2_local_jp1) +
+ min(0, val_flux3_local) + min(0, val_flux3_local_jp1)
+
+ Pp = inverse_jacobian * Pp
+ Pm = inverse_jacobian * Pm
+
+ # Compute blending coefficient avoiding division by zero
+ # (as in paper of [Guermond, Nazarov, Popov, Thomas] (4.8))
+ Qp = abs(Qp) /
+ (abs(Pp) + eps(typeof(Qp)) * 100 * abs(var_max[i, j, k, element]))
+ Qm = abs(Qm) /
+ 
(abs(Pm) + eps(typeof(Qm)) * 100 * abs(var_max[i, j, k, element])) + + # Calculate alpha at nodes + alpha[i, j, k, element] = max(alpha[i, j, k, element], 1 - min(1, Qp, Qm)) + end + end + + return nothing +end + +############################################################################## +# Local one-sided limiting of nonlinear variables + +@inline function idp_local_onesided!(alpha, limiter, u::AbstractArray{<:Real, 5}, + t, dt, semi, elements, + variable, min_or_max) + mesh, equations, dg, cache = mesh_equations_solver_cache(semi) + (; variable_bounds) = limiter.cache.subcell_limiter_coefficients + var_minmax = variable_bounds[Symbol(string(variable), "_", string(min_or_max))] + calc_bounds_onesided!(var_minmax, min_or_max, variable, u, t, semi) + + # Perform Newton's bisection method to find new alpha + @threaded for element in elements + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + inverse_jacobian = get_inverse_jacobian(cache.elements.inverse_jacobian, + mesh, i, j, k, element) + u_local = get_node_vars(u, equations, dg, i, j, k, element) + newton_loops_alpha!(alpha, var_minmax[i, j, k, element], + u_local, i, j, k, element, + variable, min_or_max, + initial_check_local_onesided_newton_idp, + final_check_local_onesided_newton_idp, + inverse_jacobian, dt, equations, dg, cache, limiter) + end + end + + return nothing +end + +############################################################################### +# Global positivity limiting of conservative variables + +@inline function idp_positivity_conservative!(alpha, limiter, + u::AbstractArray{<:Real, 5}, + dt, semi, elements, variable) + mesh, _, dg, cache = mesh_equations_solver_cache(semi) + (; antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R) = cache.antidiffusive_fluxes + (; inverse_weights) = dg.basis # Plays role of DG subcell sizes + (; positivity_correction_factor) = limiter + + (; variable_bounds) 
= limiter.cache.subcell_limiter_coefficients
+ var_min = variable_bounds[Symbol(string(variable), "_min")]
+
+ @threaded for element in elements
+ for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+ inverse_jacobian = get_inverse_jacobian(cache.elements.inverse_jacobian,
+ mesh, i, j, k, element)
+ var = u[variable, i, j, k, element]
+ if var < 0
+ error("Safe low-order method produces negative value for conservative variable $variable. Try a smaller time step.")
+ end
+
+ # Compute bound
+ if limiter.local_twosided &&
+ (variable in limiter.local_twosided_variables_cons) &&
+ (var_min[i, j, k, element] >= positivity_correction_factor * var)
+ # Local limiting is more restrictive than positivity limiting
+ # => Skip positivity limiting for this node
+ continue
+ end
+ var_min[i, j, k, element] = positivity_correction_factor * var
+
+ # Real one-sided Zalesak-type limiter
+ # * Zalesak (1979). "Fully multidimensional flux-corrected transport algorithms for fluids"
+ # * Kuzmin et al. (2010). "Failsafe flux limiting and constrained data projections for equations of gas dynamics"
+ # Note: The Zalesak limiter has to be computed, even if the state is valid, because the correction is
+ # for each interface, not each node
+ Qm = min(0, (var_min[i, j, k, element] - var) / dt)
+
+ # Calculate Pm
+ # Note: Boundaries of antidiffusive_flux1/2/3 are constant 0, so they make no difference here.
+ val_flux1_local = inverse_weights[i] * + antidiffusive_flux1_R[variable, i, j, k, element] + val_flux1_local_ip1 = -inverse_weights[i] * + antidiffusive_flux1_L[variable, i + 1, j, k, element] + val_flux2_local = inverse_weights[j] * + antidiffusive_flux2_R[variable, i, j, k, element] + val_flux2_local_jp1 = -inverse_weights[j] * + antidiffusive_flux2_L[variable, i, j + 1, k, element] + val_flux3_local = inverse_weights[k] * + antidiffusive_flux3_R[variable, i, j, k, element] + val_flux3_local_jp1 = -inverse_weights[k] * + antidiffusive_flux3_L[variable, i, j, k + 1, element] + + Pm = min(0, val_flux1_local) + min(0, val_flux1_local_ip1) + + min(0, val_flux2_local) + min(0, val_flux2_local_jp1) + + min(0, val_flux3_local) + min(0, val_flux3_local_jp1) + Pm = inverse_jacobian * Pm + + # Compute blending coefficient avoiding division by zero + # (as in paper of [Guermond, Nazarov, Popov, Thomas] (4.8)) + Qm = abs(Qm) / (abs(Pm) + eps(typeof(Qm)) * 100) + + # Calculate alpha + alpha[i, j, k, element] = max(alpha[i, j, k, element], 1 - Qm) + end + end + + return nothing +end + +############################################################################### +# Global positivity limiting of nonlinear variables + +@inline function idp_positivity_nonlinear!(alpha, limiter, + u::AbstractArray{<:Real, 5}, + dt, semi, elements, variable) + mesh, equations, dg, cache = mesh_equations_solver_cache(semi) + (; positivity_correction_factor) = limiter + + (; variable_bounds) = limiter.cache.subcell_limiter_coefficients + var_min = variable_bounds[Symbol(string(variable), "_min")] + + @threaded for element in elements + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + inverse_jacobian = get_inverse_jacobian(cache.elements.inverse_jacobian, + mesh, i, j, k, element) + + # Compute bound + u_local = get_node_vars(u, equations, dg, i, j, k, element) + var = variable(u_local, equations) + if var < 0 + error("Safe low-order method produces negative value for variable 
$variable. Try a smaller time step.") + end + var_min[i, j, k, element] = positivity_correction_factor * var + + # Perform Newton's bisection method to find new alpha + newton_loops_alpha!(alpha, var_min[i, j, k, element], + u_local, i, j, k, element, + variable, min, + initial_check_nonnegative_newton_idp, + final_check_nonnegative_newton_idp, + inverse_jacobian, dt, equations, dg, cache, limiter) + end + end + + return nothing +end + +############################################################################### +# Newton-bisection method + +@inline function newton_loops_alpha!(alpha, bound, u, i, j, k, element, + variable, min_or_max, + initial_check, final_check, + inverse_jacobian, dt, + equations::AbstractEquations{3}, + dg, cache, limiter) + (; inverse_weights) = dg.basis # Plays role of inverse DG-subcell sizes + (; antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R) = cache.antidiffusive_fluxes + + (; gamma_constant_newton) = limiter + + indices = (i, j, k, element) + + # negative xi direction + antidiffusive_flux = gamma_constant_newton * inverse_jacobian * + inverse_weights[i] * + get_node_vars(antidiffusive_flux1_R, equations, dg, + i, j, k, element) + newton_loop!(alpha, bound, u, indices, variable, min_or_max, + initial_check, final_check, equations, dt, limiter, antidiffusive_flux) + + # positive xi direction + antidiffusive_flux = -gamma_constant_newton * inverse_jacobian * + inverse_weights[i] * + get_node_vars(antidiffusive_flux1_L, equations, dg, + i + 1, j, k, element) + newton_loop!(alpha, bound, u, indices, variable, min_or_max, + initial_check, final_check, equations, dt, limiter, antidiffusive_flux) + + # negative eta direction + antidiffusive_flux = gamma_constant_newton * inverse_jacobian * + inverse_weights[j] * + get_node_vars(antidiffusive_flux2_R, equations, dg, + i, j, k, element) + newton_loop!(alpha, bound, u, indices, variable, min_or_max, + 
initial_check, final_check, equations, dt, limiter, antidiffusive_flux) + + # positive eta direction + antidiffusive_flux = -gamma_constant_newton * inverse_jacobian * + inverse_weights[j] * + get_node_vars(antidiffusive_flux2_L, equations, dg, + i, j + 1, k, element) + newton_loop!(alpha, bound, u, indices, variable, min_or_max, + initial_check, final_check, equations, dt, limiter, antidiffusive_flux) + + # negative zeta direction + antidiffusive_flux = gamma_constant_newton * inverse_jacobian * + inverse_weights[k] * + get_node_vars(antidiffusive_flux3_R, equations, dg, + i, j, k, element) + newton_loop!(alpha, bound, u, indices, variable, min_or_max, + initial_check, final_check, equations, dt, limiter, antidiffusive_flux) + + # positive zeta direction + antidiffusive_flux = -gamma_constant_newton * inverse_jacobian * + inverse_weights[k] * + get_node_vars(antidiffusive_flux3_L, equations, dg, + i, j, k + 1, element) + newton_loop!(alpha, bound, u, indices, variable, min_or_max, + initial_check, final_check, equations, dt, limiter, antidiffusive_flux) + + return nothing +end +end # @muladd diff --git a/src/solvers/dgsem_unstructured/dg_2d.jl b/src/solvers/dgsem_unstructured/dg_2d.jl index 76a186b9fa6..760e7e40405 100644 --- a/src/solvers/dgsem_unstructured/dg_2d.jl +++ b/src/solvers/dgsem_unstructured/dg_2d.jl @@ -35,61 +35,6 @@ function create_cache(mesh::UnstructuredMesh2D, equations, return cache end -function rhs!(du, u, t, - mesh::UnstructuredMesh2D, equations, - boundary_conditions, source_terms::Source, - dg::DG, cache) where {Source} - # Reset du - @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache) - - # Calculate volume integral - @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, u, mesh, - have_nonconservative_terms(equations), equations, - dg.volume_integral, dg, cache) - end - - # Prolong solution to interfaces - @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, u, mesh, equations, dg) - 
end - - # Calculate interface fluxes - @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache.elements.surface_flux_values, mesh, - have_nonconservative_terms(equations), equations, - dg.surface_integral, dg, cache) - end - - # Prolong solution to boundaries - @trixi_timeit timer() "prolong2boundaries" begin - prolong2boundaries!(cache, u, mesh, equations, dg) - end - - # Calculate boundary fluxes - @trixi_timeit timer() "boundary flux" begin - calc_boundary_flux!(cache, t, boundary_conditions, mesh, equations, - dg.surface_integral, dg) - end - - # Calculate surface integrals - @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, - dg.surface_integral, dg, cache) - end - - # Apply Jacobian from mapping to reference element - # Note! this routine is reused from dgsem_structured/dg_2d.jl - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) - - # Calculate source terms - @trixi_timeit timer() "source terms" begin - calc_sources!(du, u, t, source_terms, equations, dg, cache) - end - - return nothing -end - # prolong the solution into the convenience array in the interior interface container # Note! 
this routine is for quadrilateral elements with "right-handed" orientation function prolong2interfaces!(cache, u, mesh::UnstructuredMesh2D, equations, dg::DG) @@ -309,8 +254,7 @@ function prolong2boundaries!(cache, u, end function calc_boundary_flux!(cache, t, boundary_condition::BoundaryConditionPeriodic, - mesh::Union{UnstructuredMesh2D, P4estMesh, P4estMeshView, - T8codeMesh}, + mesh::Union{UnstructuredMesh2D, P4estMesh, T8codeMesh}, equations, surface_integral, dg::DG) @assert isempty(eachboundary(dg, cache)) @@ -319,8 +263,7 @@ end # Function barrier for type stability function calc_boundary_flux!(cache, t, boundary_conditions, - mesh::Union{UnstructuredMesh2D, P4estMesh, P4estMeshView, - T8codeMesh}, + mesh::Union{UnstructuredMesh2D, P4estMesh, T8codeMesh}, equations, surface_integral, dg::DG) @unpack boundary_condition_types, boundary_indices = boundary_conditions @@ -477,7 +420,7 @@ end # ----------------- ----------------- # 3 1 # Therefore, we require a different surface integral routine here despite their similar structure. -function calc_surface_integral!(du, u, mesh::UnstructuredMesh2D, +function calc_surface_integral!(backend, du, u, mesh::UnstructuredMesh2D, equations, surface_integral, dg::DGSEM, cache) @unpack inverse_weights = dg.basis @unpack surface_flux_values = cache.elements diff --git a/src/solvers/fdsbp_tree/fdsbp_1d.jl b/src/solvers/fdsbp_tree/fdsbp_1d.jl index 004e6b95c98..002093121a7 100644 --- a/src/solvers/fdsbp_tree/fdsbp_1d.jl +++ b/src/solvers/fdsbp_tree/fdsbp_1d.jl @@ -42,7 +42,8 @@ function create_cache(mesh::TreeMesh{1}, equations, end # 2D volume integral contributions for `VolumeIntegralStrongForm` -function calc_volume_integral!(du, u, mesh::TreeMesh{1}, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::TreeMesh{1}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralStrongForm, dg::FDSBP, cache) @@ -88,7 +89,8 @@ end # the finite difference stencils. 
Thus, the D^- operator acts on the positive # part of the flux splitting f^+ and the D^+ operator acts on the negative part # of the flux splitting f^-. -function calc_volume_integral!(du, u, mesh::TreeMesh{1}, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::TreeMesh{1}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralUpwind, dg::FDSBP, cache) @@ -139,7 +141,7 @@ function calc_volume_integral!(du, u, mesh::TreeMesh{1}, return nothing end -function calc_surface_integral!(du, u, mesh::TreeMesh{1}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{1}, equations, surface_integral::SurfaceIntegralStrongForm, dg::DG, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -166,7 +168,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{1}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh1D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh1D, equations, surface_integral::SurfaceIntegralStrongForm, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 @@ -220,7 +222,7 @@ end # in the specialized `calc_interface_flux` routine. These SATs are still of # a strong form penalty type, except that the interior flux at a particular # side of the element are computed in the upwind direction. 
-function calc_surface_integral!(du, u, mesh::TreeMesh{1}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{1}, equations, surface_integral::SurfaceIntegralUpwind, dg::FDSBP, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -248,7 +250,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{1}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh1D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh1D, equations, surface_integral::SurfaceIntegralUpwind, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 diff --git a/src/solvers/fdsbp_tree/fdsbp_2d.jl b/src/solvers/fdsbp_tree/fdsbp_2d.jl index d370d29aff8..5d30157a28c 100644 --- a/src/solvers/fdsbp_tree/fdsbp_2d.jl +++ b/src/solvers/fdsbp_tree/fdsbp_2d.jl @@ -42,7 +42,8 @@ function create_cache(mesh::Union{TreeMesh{2}, UnstructuredMesh2D}, equations, end # 2D volume integral contributions for `VolumeIntegralStrongForm` -function calc_volume_integral!(du, u, mesh::TreeMesh{2}, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::TreeMesh{2}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralStrongForm, dg::FDSBP, cache) @@ -97,7 +98,8 @@ end # the finite difference stencils. Thus, the D^- operator acts on the positive # part of the flux splitting f^+ and the D^+ operator acts on the negative part # of the flux splitting f^-. 
-function calc_volume_integral!(du, u, mesh::TreeMesh{2}, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::TreeMesh{2}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralUpwind, dg::FDSBP, cache) @@ -159,7 +161,7 @@ function calc_volume_integral!(du, u, mesh::TreeMesh{2}, return nothing end -function calc_surface_integral!(du, u, mesh::TreeMesh{2}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{2}, equations, surface_integral::SurfaceIntegralStrongForm, dg::DG, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -202,7 +204,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{2}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh2D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh2D, equations, surface_integral::SurfaceIntegralStrongForm, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 @@ -214,7 +216,7 @@ end # already separates the solution information into right-traveling and # left-traveling information. So we only need to compute the appropriate # flux information at each side of an interface. -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{2}, have_nonconservative_terms::False, equations, surface_integral::SurfaceIntegralUpwind, @@ -260,7 +262,7 @@ end # in the specialized `calc_interface_flux` routine. These SATs are still of # a strong form penalty type, except that the interior flux at a particular # side of the element are computed in the upwind direction. 
-function calc_surface_integral!(du, u, mesh::TreeMesh{2}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{2}, equations, surface_integral::SurfaceIntegralUpwind, dg::FDSBP, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -304,7 +306,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{2}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh2D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh2D, equations, surface_integral::SurfaceIntegralUpwind, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 diff --git a/src/solvers/fdsbp_tree/fdsbp_3d.jl b/src/solvers/fdsbp_tree/fdsbp_3d.jl index 719ef4f7c96..e737a8f2137 100644 --- a/src/solvers/fdsbp_tree/fdsbp_3d.jl +++ b/src/solvers/fdsbp_tree/fdsbp_3d.jl @@ -42,7 +42,8 @@ function create_cache(mesh::TreeMesh{3}, equations, end # 3D volume integral contributions for `VolumeIntegralStrongForm` -function calc_volume_integral!(du, u, mesh::TreeMesh{3}, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralStrongForm, dg::FDSBP, cache) @@ -104,7 +105,8 @@ end # the finite difference stencils. Thus, the D^- operator acts on the positive # part of the flux splitting f^+ and the D^+ operator acts on the negative part # of the flux splitting f^-. 
-function calc_volume_integral!(du, u, mesh::TreeMesh{3}, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralUpwind, dg::FDSBP, cache) @@ -181,7 +183,7 @@ function calc_volume_integral!(du, u, mesh::TreeMesh{3}, return nothing end -function calc_surface_integral!(du, u, mesh::TreeMesh{3}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{3}, equations, surface_integral::SurfaceIntegralStrongForm, dg::DG, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -238,7 +240,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{3}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh3D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh3D, equations, surface_integral::SurfaceIntegralStrongForm, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 @@ -250,7 +252,7 @@ end # already separates the solution information into right-traveling and # left-traveling information. So we only need to compute the appropriate # flux information at each side of an interface. -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, surface_integral::SurfaceIntegralUpwind, @@ -297,7 +299,7 @@ end # in the specialized `calc_interface_flux` routine. These SATs are still of # a strong form penalty type, except that the interior flux at a particular # side of the element are computed in the upwind direction. 
-function calc_surface_integral!(du, u, mesh::TreeMesh{3}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{3}, equations, surface_integral::SurfaceIntegralUpwind, dg::FDSBP, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -355,7 +357,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{3}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh3D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh3D, equations, surface_integral::SurfaceIntegralUpwind, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 diff --git a/src/solvers/fdsbp_unstructured/fdsbp_2d.jl b/src/solvers/fdsbp_unstructured/fdsbp_2d.jl index 2a10d0a6cea..2d7058b9957 100644 --- a/src/solvers/fdsbp_unstructured/fdsbp_2d.jl +++ b/src/solvers/fdsbp_unstructured/fdsbp_2d.jl @@ -29,7 +29,8 @@ end # 2D volume integral contributions for `VolumeIntegralStrongForm` # OBS! This is the standard (not de-aliased) form of the volume integral. # So it is not provably stable for variable coefficients due to the the metric terms. -function calc_volume_integral!(du, u, mesh::UnstructuredMesh2D, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::UnstructuredMesh2D, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralStrongForm, dg::FDSBP, cache) @@ -91,7 +92,8 @@ end # the finite difference stencils. Thus, the D^- operator acts on the positive # part of the flux splitting f^+ and the D^+ operator acts on the negative part # of the flux splitting f^-. -function calc_volume_integral!(du, u, mesh::UnstructuredMesh2D, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::UnstructuredMesh2D, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralUpwind, dg::FDSBP, cache) @@ -184,7 +186,7 @@ end # Therefore, we require a different surface integral routine here despite their similar structure. 
# Also, the normal directions are already outward pointing for `UnstructuredMesh2D` so all the # surface contributions are added. -function calc_surface_integral!(du, u, mesh::UnstructuredMesh2D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::UnstructuredMesh2D, equations, surface_integral::SurfaceIntegralStrongForm, dg::DG, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) diff --git a/src/solvers/solvers.jl b/src/solvers/solvers.jl index 8988a42bdab..0716477be78 100644 --- a/src/solvers/solvers.jl +++ b/src/solvers/solvers.jl @@ -5,8 +5,14 @@ @muladd begin #! format: noindent -# Used by both `dg::DGSEM` and `dg::FDSBP` function set_zero!(du, dg, cache) + set_zero!(trixi_backend(du), du, dg, cache) + + return nothing +end + +# Used by both `dg::DGSEM` and `dg::FDSBP` +function set_zero!(::Nothing, du, dg, cache) # du .= zero(eltype(du)) doesn't scale when using multiple threads. # See https://github.com/trixi-framework/Trixi.jl/pull/924 for a performance comparison. @threaded for element in eachelement(dg, cache) @@ -16,9 +22,15 @@ function set_zero!(du, dg, cache) return nothing end +function set_zero!(::Backend, du, dg, cache) + # Broadcasting is parallel on the GPU + du .= zero(eltype(du)) + return nothing +end + # define types for parabolic solvers include("solvers_parabolic.jl") include("dg.jl") -include("dgmulti.jl") +include("dgmulti/dgmulti.jl") end # @muladd diff --git a/src/solvers/solvers_parabolic.jl b/src/solvers/solvers_parabolic.jl index 77ee2c64c9b..d183c1f04f3 100644 --- a/src/solvers/solvers_parabolic.jl +++ b/src/solvers/solvers_parabolic.jl @@ -1,5 +1,5 @@ """ - ViscousFormulationBassiRebay1() + ParabolicFormulationBassiRebay1() The classical BR1 flux from @@ -15,59 +15,59 @@ A more detailed study of the BR1 scheme for the DGSEM can be found in The BR1 scheme works well for convection-dominated problems, but may cause instabilities or reduced convergence for diffusion-dominated problems. 
-In the latter case, the [`ViscousFormulationLocalDG`](@ref) scheme is recommended. +In the latter case, the [`ParabolicFormulationLocalDG`](@ref) scheme is recommended. """ -struct ViscousFormulationBassiRebay1 end +struct ParabolicFormulationBassiRebay1 end """ flux_parabolic(u_ll, u_rr, gradient_or_divergence, equations_parabolic, - parabolic_scheme::ViscousFormulationBassiRebay1) + parabolic_scheme::ParabolicFormulationBassiRebay1) flux_parabolic(u_ll, u_rr, normal_direction::AbstractVector, gradient_or_divergence, equations_parabolic, - parabolic_scheme::ViscousFormulationBassiRebay1) + parabolic_scheme::ParabolicFormulationBassiRebay1) This computes the classical BR1 flux. Since the interface flux for both the DG gradient and DG divergence under BR1 are identical, this function does not need to be specialized for `Gradient` and `Divergence`. `normal_direction` is not used in the BR1 flux, -but is included as an argument for consistency with the [`ViscousFormulationLocalDG`](@ref) flux, +but is included as an argument for consistency with the [`ParabolicFormulationLocalDG`](@ref) flux, which does use the `normal_direction` to compute the LDG "switch" on the generally non-Cartesian [`P4estMesh`](@ref). """ function flux_parabolic(u_ll, u_rr, # Version for `TreeMesh` gradient_or_divergence, equations_parabolic, - parabolic_scheme::ViscousFormulationBassiRebay1) + parabolic_scheme::ParabolicFormulationBassiRebay1) return 0.5f0 * (u_ll + u_rr) end # Version for `P4estMesh` function flux_parabolic(u_ll, u_rr, normal_direction::AbstractVector, gradient_or_divergence, equations_parabolic, - parabolic_scheme::ViscousFormulationBassiRebay1) + parabolic_scheme::ParabolicFormulationBassiRebay1) return 0.5f0 * (u_ll + u_rr) end """ - ViscousFormulationLocalDG(penalty_parameter) + ParabolicFormulationLocalDG(penalty_parameter) The local DG (LDG) flux from "The Local Discontinuous Galerkin Method for Time-Dependent Convection-Diffusion Systems" by Cockburn and Shu (1998). 
The parabolic "upwinding" vector is currently implemented for `TreeMesh`; for all other mesh types, -the LDG solver is equivalent to [`ViscousFormulationBassiRebay1`](@ref) with an LDG-type penalization. +the LDG solver is equivalent to [`ParabolicFormulationBassiRebay1`](@ref) with an LDG-type penalization. - Cockburn and Shu (1998). The Local Discontinuous Galerkin Method for Time-Dependent Convection-Diffusion Systems [DOI: 10.1137/S0036142997316712](https://doi.org/10.1137/S0036142997316712) """ -struct ViscousFormulationLocalDG{P} +struct ParabolicFormulationLocalDG{P} penalty_parameter::P end """ - ViscousFormulationLocalDG() + ParabolicFormulationLocalDG() The minimum dissipation local DG (LDG) flux from "An Analysis of the Minimal Dissipation Local Discontinuous Galerkin Method for Convection–Diffusion Problems" by Cockburn and Dong (2007). @@ -79,16 +79,16 @@ Cockburn and Dong proved that this scheme is still stable despite the zero penal Galerkin Method for Convection–Diffusion Problems. [DOI: 10.1007/s10915-007-9130-3](https://doi.org/10.1007/s10915-007-9130-3) """ -ViscousFormulationLocalDG() = ViscousFormulationLocalDG(nothing) +ParabolicFormulationLocalDG() = ParabolicFormulationLocalDG(nothing) @doc raw""" flux_parabolic(u_ll, u_rr, ::Gradient, equations_parabolic, - parabolic_scheme::ViscousFormulationLocalDG) + parabolic_scheme::ParabolicFormulationLocalDG) flux_parabolic(u_ll, u_rr, normal_direction, ::Gradient, equations_parabolic, - parabolic_scheme::ViscousFormulationLocalDG) + parabolic_scheme::ParabolicFormulationLocalDG) These fluxes computes the gradient and divergence interface fluxes for the local DG method. The local DG method uses an "upwind/downwind" flux for the @@ -100,16 +100,17 @@ f_{\text{gradient}} = u_{L} ``` on the Cartesian [`TreeMesh`](@ref). -For the [`P4estMesh`](@ref), the `normal_direction` is used to compute the LDG "switch" ``\sigma`` for the upwinding/downwinding. 
-This is realized by taking the sign of the dot product of the normal and positive-coordinate direction vector: +For the [`P4estMesh`](@ref), the `normal_direction` is used to compute the LDG "switch" ``\sigma`` for the upwinding. +This is realized by selecting the sign of the maximum (in absolute value sense) normal direction component, +which corresponds to the "dominant" direction of the interface normal. ```math -\sigma = \text{sign}(\vec{n} \cdot \vec{1}) -f = \frac{1}{2}\Big(f(u_{L}) + f(u_{R}) - \sigma \big[f(u_{R}) - f(u_{L})\big]\Big) +i = \text{argmax} \{ \begin{pmatrix} \vert n_1 \vert \\ \vert n_2 \vert \\ \dots \end{pmatrix} \} +\sigma = \text{sign} (n_i) ``` """ function flux_parabolic(u_ll, u_rr, # Version for `TreeMesh` ::Gradient, equations_parabolic, - parabolic_scheme::ViscousFormulationLocalDG) + parabolic_scheme::ParabolicFormulationLocalDG) # The LDG flux is {{f}} + beta * [[f]], where beta is the LDG "switch", # which we set to -1 on the left and +1 on the right in 1D. The sign of the # jump term should be opposite that of the sign used in the divergence flux. 
@@ -120,19 +121,21 @@ end # Version for `P4estMesh` function flux_parabolic(u_ll, u_rr, normal_direction, ::Gradient, equations_parabolic, - parabolic_scheme::ViscousFormulationLocalDG) - ldg_switch = sign(sum(normal_direction)) # equivalent to sign(dot(normal_direction, ones)) + parabolic_scheme::ParabolicFormulationLocalDG) + # Use "Upwind in dominant direction" for LDG switch + abs_max_dir = argmax(abs.(normal_direction)) + ldg_switch = sign(normal_direction[abs_max_dir]) return 0.5f0 * (u_ll + u_rr - ldg_switch * (u_rr - u_ll)) end @doc raw""" flux_parabolic(u_ll, u_rr, ::Divergence, equations_parabolic, - parabolic_scheme::ViscousFormulationLocalDG) + parabolic_scheme::ParabolicFormulationLocalDG) flux_parabolic(u_ll, u_rr, normal_direction, ::Divergence, equations_parabolic, - parabolic_scheme::ViscousFormulationLocalDG) + parabolic_scheme::ParabolicFormulationLocalDG) These fluxes computes the gradient and divergence interface fluxes for the local DG method. The local DG method uses an "upwind/downwind" flux for the @@ -145,24 +148,27 @@ f_{\text{divergence}} = u_{R} ``` on the Cartesian [`TreeMesh`](@ref). -For the [`P4estMesh`](@ref), the `normal_direction` is used to compute the LDG "switch" ``\sigma`` for the upwinding/downwinding. -This is realized by taking the sign of the dot product of the normal and positive-coordinate direction vector: +For the [`P4estMesh`](@ref), the `normal_direction` is used to compute the LDG "switch" ``\sigma`` for the downwinding. +This is realized by selecting the sign of the maximum (in absolute value sense) normal direction component, +which corresponds to the "dominant" direction of the interface normal. 
```math -\sigma = \text{sign}(\vec{n} \cdot \vec{1}) -f = \frac{1}{2}\Big(f(u_{L}) + f(u_{R}) + \sigma \big[f(u_{R}) - f(u_{L})\big]\Big) +i = \text{argmax} \{ \begin{pmatrix} \vert n_1 \vert \\ \vert n_2 \vert \\ \dots \end{pmatrix} \} +\sigma = -\text{sign} (n_i) ``` """ function flux_parabolic(u_ll, u_rr, # Version for `TreeMesh` ::Divergence, equations_parabolic, - parabolic_scheme::ViscousFormulationLocalDG) + parabolic_scheme::ParabolicFormulationLocalDG) return u_rr # Use the downwind value for the divergence interface flux end -# Version or `P4estMesh` +# Version for `P4estMesh` function flux_parabolic(u_ll, u_rr, normal_direction, ::Divergence, equations_parabolic, - parabolic_scheme::ViscousFormulationLocalDG) - ldg_switch = sign(sum(normal_direction)) # equivalent to sign(dot(normal_direction, ones)) - return 0.5f0 * (u_ll + u_rr + ldg_switch * (u_rr - u_ll)) + parabolic_scheme::ParabolicFormulationLocalDG) + # Use "Downwind in dominant direction" for LDG switch + abs_max_dir = argmax(abs.(normal_direction)) + ldg_switch = -sign(normal_direction[abs_max_dir]) + return 0.5f0 * (u_ll + u_rr - ldg_switch * (u_rr - u_ll)) end -default_parabolic_solver() = ViscousFormulationBassiRebay1() +default_parabolic_solver() = ParabolicFormulationBassiRebay1() diff --git a/src/time_integration/methods_SSP.jl b/src/time_integration/methods_SSP.jl index db7cf9021f8..7a53cfeef1d 100644 --- a/src/time_integration/methods_SSP.jl +++ b/src/time_integration/methods_SSP.jl @@ -132,9 +132,20 @@ function init(ode::ODEProblem, alg::SimpleAlgorithmSSP; kwargs...), false, true, false) - # resize container - resize!(integrator.p, integrator.p.solver.volume_integral, - nelements(integrator.p.solver, integrator.p.cache)) + # Resize container of volume integral for subcell limiting + _, _, dg, cache = mesh_equations_solver_cache(integrator.p) + if dg.volume_integral isa VolumeIntegralSubcellLimiting + # `subcell_limiter_coefficients` was created with 0 elements + 
resize!(dg.volume_integral.limiter.cache.subcell_limiter_coefficients, + nelements(dg, cache)) + if dg.volume_integral.limiter isa SubcellLimiterMCL || + (dg.volume_integral.limiter isa SubcellLimiterIDP && + dg.volume_integral.limiter.bar_states) + # `container_bar_states` was created with 0 elements + resize!(dg.volume_integral.limiter.cache.container_bar_states, + nelements(dg, cache)) + end + end # Standard callbacks initialize_callbacks!(callback, integrator) @@ -258,11 +269,6 @@ function Base.resize!(integrator::SimpleIntegratorSSP, new_size) resize!(integrator.du, new_size) resize!(integrator.u_tmp, new_size) - # Resize container - # new_size = n_variables * n_nodes^n_dims * n_elements - n_elements = nelements(integrator.p.solver, integrator.p.cache) - resize!(integrator.p, integrator.p.solver.volume_integral, n_elements) - return nothing end end # @muladd diff --git a/test/Project.toml b/test/Project.toml index 48e1efa86ee..d9cfc69117a 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -20,6 +20,7 @@ MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" OrdinaryDiffEqBDF = "6ad6398a-0878-4a85-9266-38940aa047c8" OrdinaryDiffEqCore = "bbf590c4-e513-4bbe-9b18-05decba2e5d8" +OrdinaryDiffEqDifferentiation = "4302a76b-040a-498a-8c04-15b101fed76b" OrdinaryDiffEqFeagin = "101fe9f7-ebb6-4678-b671-3a81e7194747" OrdinaryDiffEqHighOrderRK = "d28bc4f8-55e1-4f49-af69-84c1a99f0f58" OrdinaryDiffEqLowOrderRK = "1344f307-1e59-4825-a18e-ace9aa3fa4c6" @@ -42,12 +43,12 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TrixiTest = "0a316866-cbd0-4425-8bcb-08103b2c1f26" [compat] -Accessors = "0.1.36" -ADTypes = "1.14" -Adapt = "4.1" +Accessors = "0.1.42" +ADTypes = "1.16" +Adapt = "4.3" Aqua = "0.8" CUDA = "5.8.2" -CairoMakie = "0.12, 0.13, 0.14, 0.15" +CairoMakie = "0.13, 0.14, 0.15" Convex = "0.16" DelimitedFiles = "1" DoubleFloats = "1.4.0" @@ -55,28 +56,29 @@ Downloads = "1" ECOS = "1.1.2" ExplicitImports = "1.0.1" FiniteDiff = 
"2.27.0" -ForwardDiff = "0.10.36, 1" +ForwardDiff = "0.10.38, 1" Krylov = "0.10" LinearAlgebra = "1" -LinearSolve = "3.13" +LinearSolve = "3.54" MPI = "0.20.22" NLsolve = "4.5.1" -OrdinaryDiffEqBDF = "1.1" -OrdinaryDiffEqCore = "1.26, 2, 3" -OrdinaryDiffEqFeagin = "1" -OrdinaryDiffEqHighOrderRK = "1.1" -OrdinaryDiffEqLowOrderRK = "1.2" -OrdinaryDiffEqLowStorageRK = "1.2" -OrdinaryDiffEqSDIRK = "1.1" -OrdinaryDiffEqSSPRK = "1.2" -OrdinaryDiffEqStabilizedRK = "1.3" -OrdinaryDiffEqTsit5 = "1.1" -Plots = "1.38.9" +OrdinaryDiffEqBDF = "1.15" +OrdinaryDiffEqCore = "3.8" +OrdinaryDiffEqDifferentiation = "2" +OrdinaryDiffEqFeagin = "1.8" +OrdinaryDiffEqHighOrderRK = "1.9" +OrdinaryDiffEqLowOrderRK = "1.10" +OrdinaryDiffEqLowStorageRK = "1.11" +OrdinaryDiffEqSDIRK = "1.12" +OrdinaryDiffEqSSPRK = "1.11" +OrdinaryDiffEqStabilizedRK = "1.8" +OrdinaryDiffEqTsit5 = "1.9" +Plots = "1.38.13" Printf = "1" Quadmath = "0.5.10" Random = "1" -SciMLBase = "2.92.0" -SciMLOperators = "1" +SciMLBase = "2.141" +SciMLOperators = "1.15" SparseArrays = "1" SparseConnectivityTracer = "1.0.1" SparseMatrixColorings = "0.4.21" diff --git a/test/runtests.jl b/test/runtests.jl index 8b524a09df8..faacce41a27 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,6 @@ using Test using MPI: mpiexec +import Trixi # We run tests in parallel with CI jobs setting the `TRIXI_TEST` environment # variable to determine the subset of tests to execute. @@ -22,7 +23,7 @@ const TRIXI_NTHREADS = clamp(Sys.CPU_THREADS, 2, 3) # We provide a `--heap-size-hint` to avoid/reduce out-of-memory errors during CI testing mpiexec() do cmd - run(`$cmd -n $TRIXI_MPI_NPROCS $(Base.julia_cmd()) --threads=1 --check-bounds=yes --heap-size-hint=0.5G $(abspath("test_mpi.jl"))`) + run(`$cmd -n $TRIXI_MPI_NPROCS $(Base.julia_cmd()) --threads=1 --check-bounds=yes --heap-size-hint=0.5G $(joinpath(@__DIR__, "test_mpi.jl"))`) return nothing end end @@ -34,89 +35,110 @@ const TRIXI_NTHREADS = clamp(Sys.CPU_THREADS, 2, 3) # cf. 
https://github.com/JuliaParallel/MPI.jl/pull/391 @test true - run(`$(Base.julia_cmd()) --threads=$TRIXI_NTHREADS --check-bounds=yes --code-coverage=none $(abspath("test_threaded.jl"))`) + run(`$(Base.julia_cmd()) --threads=$TRIXI_NTHREADS --check-bounds=yes --code-coverage=none $(joinpath(@__DIR__, "test_threaded.jl"))`) + end + + # Downgrade CI currently has issues with running julia processes via `run`, see + # https://github.com/trixi-framework/Trixi.jl/pull/2507#issuecomment-3990318366 + # So we run test_threaded.jl serially. + # For `TRIXI_TEST = "all"`, test_threaded.jl is already covered by the threaded run, so we don't need to run it again. + @time if TRIXI_TEST == "downgrade" + include(joinpath(@__DIR__, "test_threaded.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "tree_part1" - include("test_tree_1d.jl") - include("test_tree_2d_part1.jl") + include(joinpath(@__DIR__, "test_tree_1d.jl")) + include(joinpath(@__DIR__, "test_tree_2d_part1.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "tree_part2" - include("test_tree_2d_part2.jl") + include(joinpath(@__DIR__, "test_tree_2d_part2.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "tree_part3" - include("test_tree_2d_part3.jl") + include(joinpath(@__DIR__, "test_tree_2d_part3.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "tree_part4" - include("test_tree_3d_part1.jl") + include(joinpath(@__DIR__, "test_tree_3d_part1.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "tree_part5" - include("test_tree_3d_part2.jl") + include(joinpath(@__DIR__, "test_tree_3d_part2.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "tree_part6" - include("test_tree_3d_part3.jl") + include(joinpath(@__DIR__, "test_tree_3d_part3.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "structured" - include("test_structured_1d.jl") - include("test_structured_2d.jl") - include("test_structured_3d.jl") + include(joinpath(@__DIR__, "test_structured_1d.jl")) + include(joinpath(@__DIR__, 
"test_structured_2d.jl")) + include(joinpath(@__DIR__, "test_structured_3d.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "p4est_part1" - include("test_p4est_2d.jl") + include(joinpath(@__DIR__, "test_p4est_2d.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "p4est_part2" - include("test_p4est_3d.jl") + include(joinpath(@__DIR__, "test_p4est_3d.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "t8code_part1" - include("test_t8code_2d.jl") + include(joinpath(@__DIR__, "test_t8code_2d.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "t8code_part2" - include("test_t8code_3d.jl") + include(joinpath(@__DIR__, "test_t8code_3d.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "unstructured_dgmulti" - include("test_unstructured_2d.jl") - include("test_dgmulti_1d.jl") - include("test_dgmulti_2d.jl") - include("test_dgmulti_3d.jl") + include(joinpath(@__DIR__, "test_unstructured_2d.jl")) + include(joinpath(@__DIR__, "test_dgmulti_1d.jl")) + include(joinpath(@__DIR__, "test_dgmulti_2d.jl")) + include(joinpath(@__DIR__, "test_dgmulti_3d.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "parabolic_part1" - include("test_parabolic_1d.jl") - include("test_parabolic_2d.jl") + include(joinpath(@__DIR__, "test_parabolic_1d.jl")) + include(joinpath(@__DIR__, "test_parabolic_2d.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "parabolic_part2" - include("test_parabolic_3d.jl") + include(joinpath(@__DIR__, "test_parabolic_3d.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "misc_part1" - include("test_unit.jl") - include("test_type.jl") - include("test_visualization.jl") + include(joinpath(@__DIR__, "test_unit.jl")) + include(joinpath(@__DIR__, "test_type.jl")) + include(joinpath(@__DIR__, "test_visualization.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "misc_part2" - include("test_special_elixirs.jl") - include("test_aqua.jl") + include(joinpath(@__DIR__, "test_special_elixirs.jl")) + include(joinpath(@__DIR__, 
"test_aqua.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "performance_specializations" - include("test_performance_specializations_2d.jl") - include("test_performance_specializations_3d.jl") + include(joinpath(@__DIR__, "test_performance_specializations_2d.jl")) + include(joinpath(@__DIR__, "test_performance_specializations_3d.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "paper_self_gravitating_gas_dynamics" - include("test_paper_self_gravitating_gas_dynamics.jl") + include(joinpath(@__DIR__, "test_paper_self_gravitating_gas_dynamics.jl")) end @time if TRIXI_TEST == "all" || TRIXI_TEST == "CUDA" import CUDA if CUDA.functional() - include("test_cuda.jl") + include(joinpath(@__DIR__, "test_cuda_2d.jl")) + include(joinpath(@__DIR__, "test_cuda_3d.jl")) else @warn "Unable to run CUDA tests on this machine" end end + + @time if TRIXI_TEST == "all" || TRIXI_TEST == "kernelabstractions" + previous_backend = Trixi._PREFERENCE_THREADING + Trixi.set_threading_backend!(:kernelabstractions) + # relaunching julia + try + run(`$(Base.julia_cmd()) --threads=$TRIXI_NTHREADS --check-bounds=yes $(abspath("test_kernelabstractions.jl"))`) + finally + # Restore previous threading backend for later tests + Trixi.set_threading_backend!(Symbol(previous_backend)) + end + end end diff --git a/test/test_cuda.jl b/test/test_cuda_2d.jl similarity index 80% rename from test/test_cuda.jl rename to test/test_cuda_2d.jl index fd0adaeece1..85fcb75139f 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda_2d.jl @@ -1,15 +1,18 @@ -module TestCUDA +module TestCUDA2D using Test using Trixi include("test_trixi.jl") +EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") + # Start with a clean environment: remove Trixi.jl output directory if it exists outdir = "out" isdir(outdir) && rm(outdir, recursive = true) -EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") +@testset "CUDA 2D" begin +#! 
format: noindent @trixi_testset "elixir_advection_basic_gpu.jl native" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), @@ -18,6 +21,7 @@ EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") linf=6.627000273229378e-5) # Ensure that we do not have excessive memory allocations # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. @test_allocations(Trixi.rhs!, semi, sol, 1000) @test real(ode.p.solver) == Float64 @test real(ode.p.solver.basis) == Float64 @@ -39,15 +43,15 @@ end using CUDA @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), # Expected errors are exactly the same as with TreeMesh! - l2=nothing, # TODO: GPU. [Float32(8.311947673061856e-6)], - linf=nothing, # TODO: GPU. [Float32(6.627000273229378e-5)], + l2=[Float32(8.311947673061856e-6)], + linf=[Float32(6.627000273229378e-5)], RealT_for_test_tolerances=Float32, real_type=Float32, - storage_type=CuArray, - sol=nothing,) # TODO: GPU. Remove this once we can run the simulation on the GPU - # # Ensure that we do not have excessive memory allocations - # # (e.g., from type instabilities) - # @test_allocations(Trixi.rhs!, semi, sol, 1000) + storage_type=CuArray) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. 
+ @test_allocations(Trixi.rhs!, semi, sol, 700_000) @test real(ode.p.solver) == Float32 @test real(ode.p.solver.basis) == Float32 @test real(ode.p.solver.mortar) == Float32 @@ -65,5 +69,5 @@ end # Clean up afterwards: delete Trixi.jl output directory @test_nowarn isdir(outdir) && rm(outdir, recursive = true) - +end end # module diff --git a/test/test_cuda_3d.jl b/test/test_cuda_3d.jl new file mode 100644 index 00000000000..6c590332555 --- /dev/null +++ b/test/test_cuda_3d.jl @@ -0,0 +1,73 @@ +module TestCUDA3D + +using Test +using Trixi + +include("test_trixi.jl") + +EXAMPLES_DIR = joinpath(examples_dir(), "p4est_3d_dgsem") + +# Start with a clean environment: remove Trixi.jl output directory if it exists +outdir = "out" +isdir(outdir) && rm(outdir, recursive = true) + +@testset "CUDA 3D" begin +#! format: noindent + +@trixi_testset "elixir_advection_basic_gpu.jl native" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! + l2=[0.00016263963870641478], + linf=[0.0014537194925779984]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. 
+ @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test real(ode.p.solver) == Float64 + @test real(ode.p.solver.basis) == Float64 + @test real(ode.p.solver.mortar) == Float64 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa Array + @test ode.p.solver.basis.derivative_matrix isa Array + + @test Trixi.storage_type(ode.p.cache.elements) === Array + @test Trixi.storage_type(ode.p.cache.interfaces) === Array + @test Trixi.storage_type(ode.p.cache.boundaries) === Array + @test Trixi.storage_type(ode.p.cache.mortars) === Array +end + +@trixi_testset "elixir_advection_basic_gpu.jl Float32 / CUDA" begin + # Using CUDA inside the testset since otherwise the bindings are hidden by the anonymous modules + using CUDA + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors similar to reference on CPU + l2=[Float32(0.00016263963870641478)], + linf=[Float32(0.0014537194925779984)], + RealT_for_test_tolerances=Float32, + real_type=Float32, + storage_type=CuArray) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. 
+ @test_allocations(Trixi.rhs!, semi, sol, 1_700_000) + @test real(ode.p.solver) == Float32 + @test real(ode.p.solver.basis) == Float32 + @test real(ode.p.solver.mortar) == Float32 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa CuArray + @test ode.p.solver.basis.derivative_matrix isa CuArray + + @test Trixi.storage_type(ode.p.cache.elements) === CuArray + @test Trixi.storage_type(ode.p.cache.interfaces) === CuArray + @test Trixi.storage_type(ode.p.cache.boundaries) === CuArray + @test Trixi.storage_type(ode.p.cache.mortars) === CuArray +end + +# Clean up afterwards: delete Trixi.jl output directory +@test_nowarn isdir(outdir) && rm(outdir, recursive = true) +end +end # module diff --git a/test/test_dgmulti_2d.jl b/test/test_dgmulti_2d.jl index 4e00dd4859f..251b003522f 100644 --- a/test/test_dgmulti_2d.jl +++ b/test/test_dgmulti_2d.jl @@ -354,6 +354,28 @@ end @test_allocations(Trixi.rhs!, semi, sol, 1000) end +@trixi_testset "elixir_euler_kelvin_helmholtz_instability_adaptive_vol_int.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, + "elixir_euler_kelvin_helmholtz_instability_adaptive_vol_int.jl"), + maximum_entropy_increase=0.0, + tspan=(0.0, 0.2), + l2=[ + 0.05570371489805444, + 0.03299286402646503, + 0.05224508023471742, + 0.08011545946002244 + ], + linf=[ + 0.24323216643032874, + 0.1685158282708948, + 0.12357902305982191, + 0.26981068435988087 + ]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) +end + @trixi_testset "elixir_euler_rayleigh_taylor_instability.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_rayleigh_taylor_instability.jl"), diff --git a/test/test_kernelabstractions.jl b/test/test_kernelabstractions.jl new file mode 100644 index 00000000000..a1a771ee402 --- /dev/null +++ b/test/test_kernelabstractions.jl @@ -0,0 +1,83 @@ +module TestExamplesKernelAbstractions + +using 
Test +using Trixi + +include("test_trixi.jl") + +EXAMPLES_DIR = examples_dir() + +# Start with a clean environment: remove Trixi.jl output directory if it exists +outdir = "out" +Trixi.mpi_isroot() && isdir(outdir) && rm(outdir, recursive = true) +Trixi.MPI.Barrier(Trixi.mpi_comm()) + +@testset "basic" begin + @test Trixi._PREFERENCE_THREADING == :kernelabstractions +end + +@testset "KernelAbstractions CPU 2D" begin +#! format: noindent + +@trixi_testset "elixir_advection_basic_gpu.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_2d_dgsem", + "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! + l2=8.311947673061856e-6, + linf=6.627000273229378e-5) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. + @test_allocations(Trixi.rhs!, ode.p, sol, 75_000) +end + +@trixi_testset "elixir_advection_basic_gpu.jl Float32" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_2d_dgsem", + "elixir_advection_basic_gpu.jl"), + # Expected errors similar to reference on CPU + l2=[Float32(8.311947673061856e-6)], + linf=[Float32(6.627000273229378e-5)], + RealT_for_test_tolerances=Float32, + real_type=Float32) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. + @test_allocations(Trixi.rhs!, ode.p, sol, 60_000) +end +end + +@testset "KernelAbstractions CPU 3D" begin +#! format: noindent + +@trixi_testset "elixir_advection_basic_gpu.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_3d_dgsem", + "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! 
+ l2=[0.00016263963870641478], + linf=[0.0014537194925779984]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. + @test_allocations(Trixi.rhs!, semi, sol, 450_000) +end + +@trixi_testset "elixir_advection_basic_gpu.jl Float32" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_3d_dgsem", + "elixir_advection_basic_gpu.jl"), + # Expected errors similar to reference on CPU + l2=[Float32(0.00016263963870641478)], + linf=[Float32(0.0014537194925779984)], + RealT_for_test_tolerances=Float32, + real_type=Float32) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. + @test_allocations(Trixi.rhs!, semi, sol, 370_000) +end +end + +# Clean up afterwards: delete Trixi.jl output directory +Trixi.mpi_isroot() && isdir(outdir) && @test_nowarn rm(outdir, recursive = true) +Trixi.MPI.Barrier(Trixi.mpi_comm()) + +end # module diff --git a/test/test_mpi_tree.jl b/test/test_mpi_tree.jl index 7152438e7f2..62c74e2fd3c 100644 --- a/test/test_mpi_tree.jl +++ b/test/test_mpi_tree.jl @@ -74,7 +74,7 @@ CI_ON_WINDOWS = (get(ENV, "GITHUB_ACTIONS", false) == "true") && Sys.iswindows() @trixi_testset "elixir_advection_restart_amr.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_restart_amr.jl"), - l2=[8.018498574373939e-5], + l2=[8.018497923389368e-5], linf=[0.0007307237754662355]) end diff --git a/test/test_p4est_2d.jl b/test/test_p4est_2d.jl index f15e0f4a261..81e0c9153ca 100644 --- a/test/test_p4est_2d.jl +++ b/test/test_p4est_2d.jl @@ -110,22 +110,19 @@ end @test_allocations(Trixi.rhs!, semi, sol, 1000) end -@trixi_testset "elixir_advection_meshview.jl" begin - @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_meshview.jl"), - l2=[0.00013773915040249946], - 
linf=[0.0010140184322192658]) +@trixi_testset "elixir_advection_coupled.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_coupled.jl"), + l2=[0.00013318279010717573, 0.00013318279010712838], + linf=[0.0009605782290112996, 0.0009605782290100784]) # Ensure that we do not have excessive memory allocations # (e.g., from type instabilities) - @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test_broken (@allocated Trixi.rhs!(du_ode, u_ode, semi, t)) < 1000 # Ensure we cover the calculation of the node coordinates node_coordinates = typeof(parent_mesh.tree_node_coordinates)(undef, 2, ntuple(_ -> length(parent_mesh.nodes), 2)..., - length(mesh.cell_ids)) - result = Trixi.calc_node_coordinates!(node_coordinates, mesh, parent_mesh.nodes) - @test parent_mesh.tree_node_coordinates == result - + length(mesh1.cell_ids)) # Load the mesh file for code coverage. loaded_mesh = Trixi.load_mesh_serial(joinpath("out", "mesh.h5"); n_cells_max = 0, RealT = typeof(parent_mesh).parameters[3]) @@ -679,17 +676,18 @@ end @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_supersonic_cylinder_scO2.jl"), l2=[ - 0.029314031292054992, - 0.053506886526450186, - 0.03543104168310674, - 0.21538892425489486 + 0.02952388632922144, + 0.05371261793410487, + 0.035384060637794805, + 0.21588602773829588 ], linf=[ - 4.159114155336756, - 4.200427029096135, - 7.397166897133932, - 33.18602863132469 + 4.163159992186843, + 4.2267168297270725, + 7.332852278485849, + 34.243826868270645 ], + adaptive=false, dt=1e-5, tspan=(0.0, 0.001)) # Ensure that we do not have excessive memory allocations # (e.g., from type instabilities) diff --git a/test/test_p4est_3d.jl b/test/test_p4est_3d.jl index f29256f056b..12b2fa92e91 100644 --- a/test/test_p4est_3d.jl +++ b/test/test_p4est_3d.jl @@ -387,7 +387,7 @@ end 1.888627709320322, 4.971280431903264 ], - tspan=(0.0, 0.3),) + tspan=(0.0, 0.3)) # Ensure that we do not have excessive memory allocations # (e.g., from type instabilities) # Larger values 
for allowed allocations due to usage of custom @@ -399,22 +399,55 @@ end @trixi_testset "elixir_euler_sedov_sc_subcell.jl (local bounds)" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_sedov_sc_subcell.jl"), + local_twosided_variables_cons=["rho"], + local_onesided_variables_nonlinear=[(entropy_guermond_etal, + min)], + max_iterations_newton=30, + l2=[ + 0.16504564013491585, + 0.06461384162458203, + 0.06461384162461223, + 0.06461384162461678, + 0.36193245790622036 + ], + linf=[ + 0.9138327077620716, + 0.5707102472596818, + 0.5707102472739252, + 0.5707102472781822, + 4.777595503303726 + ], + tspan=(0.0, 0.3)) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + # Larger values for allowed allocations due to usage of custom + # integrator which are not *recorded* for the methods from + # OrdinaryDiffEq.jl + # Corresponding issue: https://github.com/trixi-framework/Trixi.jl/issues/1877 + @test_allocations(Trixi.rhs!, semi, sol, 15_000) +end + +@trixi_testset "elixir_euler_sedov_sc_subcell.jl (local bounds, nonperiodic)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_sedov_sc_subcell.jl"), + local_twosided_variables_cons=["rho"], local_onesided_variables_nonlinear=[(entropy_guermond_etal, min)], - max_iterations_newton=40, + max_iterations_newton=50, + periodicity=false, + boundary_conditions=BoundaryConditionDirichlet(initial_condition), l2=[ - 0.19153085678321066, - 0.07411109384422779, - 0.07411109384410808, - 0.07411109384406232, - 0.36714268468314665 + 0.16504564013491585, + 0.06461384162458203, + 0.06461384162461223, + 0.06461384162461678, + 0.36193245790622036 ], linf=[ - 1.4037775549639524, - 1.339590863739464, - 1.3395908637591605, - 1.3395908637371077, - 4.824252924073932 + 0.9138327077620716, + 0.5707102472596818, + 0.5707102472739252, + 0.5707102472781822, + 4.777595503303726 ], tspan=(0.0, 0.3)) # Ensure that we do not have excessive memory allocations diff --git 
a/test/test_parabolic_1d.jl b/test/test_parabolic_1d.jl index dac1035f8a0..9fef07fed7a 100644 --- a/test/test_parabolic_1d.jl +++ b/test/test_parabolic_1d.jl @@ -37,6 +37,19 @@ end @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) end +@trixi_testset "TreeMesh1D: elixir_advection_diffusion_ldg.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_1d_dgsem", + "elixir_advection_diffusion_ldg.jl"), + solver=DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs, + basis_type = GaussLegendreBasis), + tspan=(0.0, 0.4), + l2=[4.126471023759558e-6], linf=[1.4470099431229677e-5]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) +end + @trixi_testset "TreeMesh1D: elixir_advection_diffusion_gradient_source_terms.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_advection_diffusion_gradient_source_terms.jl"), @@ -276,6 +289,29 @@ end @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) end +@trixi_testset "TreeMesh1D: elixir_navierstokes_convergence_walls.jl (Gauss-Legendre)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_1d_dgsem", + "elixir_navierstokes_convergence_walls.jl"), + solver=DGSEM(polydeg = 3, surface_flux = flux_hll, + basis_type = GaussLegendreBasis), + time_int_tol=1e-10, + l2=[ + 4.201445769104007e-5, + 9.758279535510314e-5, + 0.0004199990641561288 + ], + linf=[ + 0.00015356293659607445, + 0.0004198436005902785, + 0.0016946745322332646 + ], + atol=1e-10) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) +end + @trixi_testset "TreeMesh1D: elixir_navierstokes_convergence_walls_amr.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_navierstokes_convergence_walls_amr.jl"), @@ 
-342,6 +378,27 @@ end @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) end +@trixi_testset "TreeMesh1D: elixir_navierstokes_viscous_shock.jl (Gauss-Legendre)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_1d_dgsem", + "elixir_navierstokes_viscous_shock.jl"), + solver=DGSEM(polydeg = 3, surface_flux = flux_hlle, + basis_type = GaussLegendreBasis), + l2=[ + 0.00010415910094963455, + 7.569570282227496e-5, + 8.643799824895884e-5 + ], + linf=[ + 0.0004795456761867989, + 0.0003525509032139551, + 0.0004044657250887873 + ]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) +end + @trixi_testset "TreeMesh1D: elixir_navierstokes_viscous_shock_imex.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_navierstokes_viscous_shock_imex.jl"), @@ -430,7 +487,7 @@ end callbacks=CallbackSet(summary_callback, alive_callback, analysis_callback, StepsizeCallback(cfl = 0.5, - cfl_diffusive = 0.1)), + cfl_parabolic = 0.1)), adaptive=false, l2=[ 3.804624387087144e-5, diff --git a/test/test_parabolic_2d.jl b/test/test_parabolic_2d.jl index dd2051cf7f4..a0ae0d0864a 100644 --- a/test/test_parabolic_2d.jl +++ b/test/test_parabolic_2d.jl @@ -68,9 +68,9 @@ isdir(outdir) && rm(outdir, recursive = true) @test getindex.(gradients[2], 1) ≈ xq .^ 2 u_flux = similar.(gradients) - Trixi.calc_viscous_fluxes!(u_flux, u0, gradients, mesh, - equations_parabolic, - dg, cache, cache_parabolic) + Trixi.calc_parabolic_fluxes!(u_flux, u0, gradients, mesh, + equations_parabolic, + dg, cache, cache_parabolic) @test u_flux[1] ≈ gradients[1] @test u_flux[2] ≈ gradients[2] @@ -207,10 +207,23 @@ end @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) end +@trixi_testset "TreeMesh2D: elixir_advection_diffusion.jl (Gauss-Legendre)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_2d_dgsem", + 
"elixir_advection_diffusion.jl"), + solver=DGSEM(polydeg = 5, surface_flux = flux_lax_friedrichs, + basis_type = GaussLegendreBasis), + initial_refinement_level=2, tspan=(0.0, 0.4), + l2=[2.8254621369070895e-6], linf=[6.914648264633172e-6]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) +end + @trixi_testset "TreeMesh2D: elixir_advection_diffusion.jl (LDG)" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_2d_dgsem", "elixir_advection_diffusion.jl"), - solver_parabolic=ViscousFormulationLocalDG(), + solver_parabolic=ParabolicFormulationLocalDG(), initial_refinement_level=2, tspan=(0.0, 0.4), polydeg=5, l2=[6.193056910594806e-6], linf=[4.918855889635143e-5]) # Ensure that we do not have excessive memory allocations @@ -234,7 +247,7 @@ end @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_2d_dgsem", "elixir_advection_diffusion_gradient_source_terms.jl"), initial_refinement_level=2, tspan=(0.0, 0.4), - solver_parabolic=ViscousFormulationBassiRebay1(), nu=1e-3, + solver_parabolic=ParabolicFormulationBassiRebay1(), nu=1e-3, stepsize_callback=TrivialCallback(), dt=1e-1, l2=[0.0017395186758592685], linf=[0.007481527467476025]) # Ensure that we do not have excessive memory allocations @@ -251,7 +264,7 @@ end coordinates_max = coordinates_max, periodicity = true), tspan=(0.0, 0.4), - solver_parabolic=ViscousFormulationBassiRebay1(), nu=1e-3, + solver_parabolic=ParabolicFormulationBassiRebay1(), nu=1e-3, stepsize_callback=TrivialCallback(), dt=1e-1, l2=[0.0017395186758592685], linf=[0.007481527467476025]) # Ensure that we do not have excessive memory allocations @@ -316,6 +329,20 @@ end @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) end +@trixi_testset "TreeMesh2D: elixir_advection_diffusion_nonperiodic.jl (Gauss-Legendre)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_2d_dgsem", + 
"elixir_advection_diffusion_nonperiodic.jl"), + solver=DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs, + basis_type = GaussLegendreBasis), + initial_refinement_level=2, tspan=(0.0, 0.1), + l2=[0.005916696880764326], + linf=[0.034212013224034776]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) +end + @trixi_testset "TreeMesh2D: elixir_advection_diffusion_nonperiodic_amr.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_2d_dgsem", "elixir_advection_diffusion_nonperiodic_amr.jl"), @@ -331,7 +358,7 @@ end @trixi_testset "TreeMesh2D: elixir_advection_diffusion_nonperiodic_amr.jl (LDG)" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_2d_dgsem", "elixir_advection_diffusion_nonperiodic_amr.jl"), - solver_parabolic=ViscousFormulationLocalDG(), + solver_parabolic=ParabolicFormulationLocalDG(), tspan=(0.0, 0.01), l2=[0.000684755734524055], linf=[0.01141444199847298]) @@ -358,7 +385,7 @@ end @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_2d_dgsem", "elixir_advection_diffusion_nonperiodic.jl"), initial_refinement_level=2, tspan=(0.0, 0.1), - solver_parabolic=ViscousFormulationLocalDG(), + solver_parabolic=ParabolicFormulationLocalDG(), l2=[0.007009146246373517], linf=[0.09535203925012649]) # Ensure that we do not have excessive memory allocations # (e.g., from type instabilities) @@ -610,7 +637,7 @@ end @trixi_testset "TreeMesh2D: elixir_navierstokes_shearlayer_nonconforming.jl (LDG)" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_2d_dgsem", "elixir_navierstokes_shearlayer_nonconforming.jl"), - solver_parabolic=ViscousFormulationLocalDG(), + solver_parabolic=ParabolicFormulationLocalDG(), l2=[ 0.005352370793583371, 0.5969444914287823, @@ -672,6 +699,31 @@ end @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) end +@trixi_testset "TreeMesh2D: elixir_navierstokes_viscous_shock.jl 
(Gauss-Legendre, LDG)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_2d_dgsem", + "elixir_navierstokes_viscous_shock.jl"), + solver=DGSEM(polydeg = 3, surface_flux = flux_hlle, + basis_type = GaussLegendreBasis), + solver_parabolic=ParabolicFormulationLocalDG(), + cfl_parabolic=0.04, + l2=[ + 6.599006355897759e-6, + 4.514805201434994e-6, + 6.54834144833621e-17, + 4.882545625516753e-6 + ], + linf=[ + 3.7580718253771295e-5, + 2.6691756676799905e-5, + 3.560074538214949e-16, + 2.989434893274634e-5 + ]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) +end + @trixi_testset "P4estMesh2D: elixir_advection_diffusion_periodic.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_2d_dgsem", "elixir_advection_diffusion_periodic.jl"), @@ -698,6 +750,16 @@ end @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) end +@trixi_testset "P4estMesh2D: elixir_advection_diffusion_rotated.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_2d_dgsem", + "elixir_advection_diffusion_rotated.jl"), + l2=[4.8533724384822306e-5], linf=[0.0006284491001110615]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) +end + @trixi_testset "P4estMesh2D: elixir_advection_diffusion_periodic_curved.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_2d_dgsem", "elixir_advection_diffusion_periodic_curved.jl"), @@ -734,14 +796,14 @@ end @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) end -@trixi_testset "P4estMesh2D: elixir_advection_diffusion_nonperiodic_amr.jl (Diffusive CFL)" begin +@trixi_testset "P4estMesh2D: elixir_advection_diffusion_nonperiodic_amr.jl (Parabolic CFL)" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_2d_dgsem", 
"elixir_advection_diffusion_nonperiodic_amr.jl"), initial_refinement_level=2, callbacks=CallbackSet(summary_callback, analysis_callback, alive_callback, StepsizeCallback(cfl = 1.6, - cfl_diffusive = 0.2)), + cfl_parabolic = 0.2)), ode_alg=CarpenterKennedy2N54(williamson_condition = false), dt=1.0, # will be overwritten l2=[0.00010850375815619432], @@ -755,7 +817,7 @@ end @trixi_testset "P4estMesh2D: elixir_advection_diffusion_nonperiodic_amr.jl (LDG)" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_2d_dgsem", "elixir_advection_diffusion_nonperiodic_amr.jl"), - solver_parabolic=ViscousFormulationLocalDG(), + solver_parabolic=ParabolicFormulationLocalDG(), tspan=(0.0, 0.01), l2=[0.0006847533999311489], linf=[0.01141430509080712]) @@ -886,7 +948,7 @@ end @trixi_testset "P4estMesh2D: elixir_navierstokes_shearlayer_nonconforming.jl (LDG)" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_2d_dgsem", "elixir_navierstokes_shearlayer_nonconforming.jl"), - solver_parabolic=ViscousFormulationLocalDG(), + solver_parabolic=ParabolicFormulationLocalDG(), l2=[ 0.0053523707935916025, 0.5969444914278867, @@ -1093,7 +1155,7 @@ end callbacks=CallbackSet(summary_callback, analysis_callback, alive_callback, StepsizeCallback(cfl = 2.3, - cfl_diffusive = 1.0)), + cfl_parabolic = 1.0)), adaptive=false, # respect CFL ode_alg=CKLLSRK95_4S(), l2=[ diff --git a/test/test_parabolic_3d.jl b/test/test_parabolic_3d.jl index f8aa7e85afa..8b192e015ce 100644 --- a/test/test_parabolic_3d.jl +++ b/test/test_parabolic_3d.jl @@ -408,7 +408,7 @@ end @test_allocations(Trixi.rhs_parabolic!, semi, sol, 1000) end -@trixi_testset "P4estMesh3D: elixir_navierstokes_taylor_green_vortex.jl (Diffusive CFL)" begin +@trixi_testset "P4estMesh3D: elixir_navierstokes_taylor_green_vortex.jl (Parabolic CFL)" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_3d_dgsem", "elixir_navierstokes_taylor_green_vortex.jl"), tspan=(0.0, 0.1), @@ -416,7 +416,7 @@ end callbacks=CallbackSet(summary_callback, 
analysis_callback, alive_callback, StepsizeCallback(cfl = 2.3, - cfl_diffusive = 0.4)), + cfl_parabolic = 0.4)), adaptive=false, # respect CFL ode_alg=CKLLSRK95_4S(), l2=[ @@ -457,7 +457,7 @@ end @trixi_testset "TreeMesh3D: elixir_advection_diffusion_amr.jl (LDG)" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_3d_dgsem", "elixir_advection_diffusion_amr.jl"), - solver_parabolic=ViscousFormulationLocalDG(), + solver_parabolic=ParabolicFormulationLocalDG(), initial_refinement_level=2, base_level=2, med_level=3, @@ -495,7 +495,7 @@ end @trixi_testset "TreeMesh3D: elixir_advection_diffusion_nonperiodic.jl (LDG)" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "tree_3d_dgsem", "elixir_advection_diffusion_nonperiodic.jl"), - solver_parabolic=ViscousFormulationLocalDG(), + solver_parabolic=ParabolicFormulationLocalDG(), l2=[0.0009432415534931421], linf=[0.016955330290404563]) # Ensure that we do not have excessive memory allocations # (e.g., from type instabilities) @@ -525,9 +525,9 @@ end @trixi_testset "P4estMesh3D: elixir_advection_diffusion_nonperiodic.jl (LDG)" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_3d_dgsem", "elixir_advection_diffusion_nonperiodic.jl"), - solver_parabolic=ViscousFormulationLocalDG(), - cfl_diffusive=0.07, - l2=[0.0041854757843498725], linf=[0.05166356737492643]) + solver_parabolic=ParabolicFormulationLocalDG(), + cfl_parabolic=0.07, + l2=[0.004185076476662267], linf=[0.05166349548111486]) # Ensure that we do not have excessive memory allocations # (e.g., from type instabilities) @test_allocations(Trixi.rhs!, semi, sol, 1000) @@ -547,7 +547,7 @@ end @trixi_testset "P4estMesh3D: elixir_advection_diffusion_amr_curved.jl (LDG)" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_3d_dgsem", "elixir_advection_diffusion_amr_curved.jl"), - solver_parabolic=ViscousFormulationLocalDG(), + solver_parabolic=ParabolicFormulationLocalDG(), l2=[0.0006853004145232737], linf=[0.02352694543085776]) # Ensure that we do not have excessive 
memory allocations # (e.g., from type instabilities) diff --git a/test/test_performance_specializations_2d.jl b/test/test_performance_specializations_2d.jl index b42a1b8f640..7dceea2b6a7 100644 --- a/test/test_performance_specializations_2d.jl +++ b/test/test_performance_specializations_2d.jl @@ -33,7 +33,7 @@ isdir(outdir) && rm(outdir, recursive = true) # Call the optimized default version du .= 0 - Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) @@ -43,10 +43,10 @@ isdir(outdir) && rm(outdir, recursive = true) # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, 1] @@ -72,7 +72,7 @@ end # Call the optimized default version du .= 0 - Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) @@ -82,10 +82,10 @@ end # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, 
semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, 1] @@ -112,7 +112,7 @@ end # Call the optimized default version du .= 0 - Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) @@ -122,10 +122,10 @@ end # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, 1] @@ -151,7 +151,7 @@ end # Call the optimized default version du .= 0 - Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) @@ -161,10 +161,10 @@ end # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, 1] diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index 3b3bd40b2f5..967b0f9cf3e 100644 --- 
a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -33,7 +33,7 @@ isdir(outdir) && rm(outdir, recursive = true) # Call the optimized default version du .= 0 - Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) @@ -43,10 +43,10 @@ isdir(outdir) && rm(outdir, recursive = true) # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, :, 1] @@ -72,7 +72,7 @@ end # Call the optimized default version du .= 0 - Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) @@ -82,10 +82,10 @@ end # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, :, 1] @@ -112,7 +112,7 @@ end # Call the optimized default version du .= 0 - 
Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) @@ -122,10 +122,10 @@ end # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, :, 1] @@ -151,7 +151,7 @@ end # Call the optimized default version du .= 0 - Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) @@ -161,10 +161,10 @@ end # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, :, 1] diff --git a/test/test_structured_1d.jl b/test/test_structured_1d.jl index 2e255e07240..74f12927117 100644 --- a/test/test_structured_1d.jl +++ b/test/test_structured_1d.jl @@ -24,6 +24,19 @@ isdir(outdir) && rm(outdir, recursive = true) @test_allocations(Trixi.rhs!, semi, sol, 1000) end +@trixi_testset "elixir_advection_basic.jl 
(Gauss-Legendre)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic.jl"), + solver=DGSEM(polydeg = 3, basis_type = GaussLegendreBasis, + surface_flux = flux_godunov), + cfl=0.8, + # Expected errors are exactly the same as with TreeMesh! + l2=[2.515203865524688e-6], + linf=[8.660338936650191e-6]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) +end + @trixi_testset "elixir_advection_nonperiodic.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_nonperiodic.jl"), l2=[5.641921365468918e-5], @@ -188,6 +201,27 @@ end @test_allocations(Trixi.rhs!, semi, sol, 1000) end +@trixi_testset "elixir_euler_source_terms_nonperiodic.jl (Gauss-Legendre)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, + "elixir_euler_source_terms_nonperiodic.jl"), + solver=DGSEM(polydeg = 3, basis_type = GaussLegendreBasis, + surface_flux = flux_lax_friedrichs), + # Identical errors as for the `TreeMesh` version of this example + l2=[ + 6.179119971404758e-7, + 6.831335637140733e-7, + 1.8153512648336213e-6 + ], + linf=[ + 2.3035825069683824e-6, + 2.7398314812465685e-6, + 7.132056524916663e-6 + ]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) +end + @trixi_testset "elixir_euler_source_terms_nonperiodic_fvO2.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_source_terms_nonperiodic_fvO2.jl"), diff --git a/test/test_structured_2d.jl b/test/test_structured_2d.jl index a2dfc250371..6a254435eeb 100644 --- a/test/test_structured_2d.jl +++ b/test/test_structured_2d.jl @@ -1309,6 +1309,129 @@ end @test_allocations(Trixi.rhs!, semi, sol, 10000) end +@trixi_testset "elixir_mhdmultiion_ec.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_mhdmultiion_ec.jl"), + l2=[ + 0.001213161200979075, + 0.0012130848294820726, + 0.0015755001554277398, + 
0.0013582923009927254, + 0.0020117645260758414, + 0.002011172963281366, + 4.809766518427324e-5, + 0.017144539884022238, + 0.002725517676213416, + 0.002881115950314307, + 0.002879122244461666, + 0.00019209433660607767, + 0.011925524606599836, + 2.6719402415663762e-8 + ], + linf=[ + 0.097859973877228, + 0.09690728356274181, + 0.13431313472001527, + 0.05311488868916897, + 0.09355072731834056, + 0.09355996145995278, + 0.004795107808363838, + 0.8722675916712932, + 0.10974816636830609, + 0.15983705713358845, + 0.13675344214792837, + 0.019081518305426586, + 0.7126785929162383, + 5.6197760273085075e-6 + ], + tspan=(0.0, 0.002)) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) +end + +@trixi_testset "elixir_mhdmultiion_ec.jl with local Lax-Friedrichs at the surface" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_mhdmultiion_ec.jl"), + l2=[ + 0.0011033979927766748, + 0.0011034179185842633, + 0.0014691729962388591, + 0.0013355107747779773, + 0.0019050117958305927, + 0.0019038291863233438, + 3.6572032347831196e-5, + 0.016664167388106423, + 0.0026895584922699152, + 0.002787640168848639, + 0.0027843010508327975, + 0.00014627111755617758, + 0.01154990855334394, + 6.943248884497632e-7 + ], + linf=[ + 0.06939001371446096, + 0.06926864952622269, + 0.11513905163412463, + 0.045270416449765816, + 0.08204570060964968, + 0.08205454581110574, + 0.002824171274841899, + 0.7319481885201382, + 0.09235891401707053, + 0.16128957341727507, + 0.12439795810457398, + 0.011237062227958462, + 0.5923410602688102, + 0.00011972366162122378 + ], + tspan=(0.0, 0.002), + surface_flux=(FluxLaxFriedrichs(max_abs_speed_naive), + flux_nonconservative_central)) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) +end + +@trixi_testset "elixir_mhdmultiion_convergence_twospecies.jl" begin + 
@test_trixi_include(joinpath(EXAMPLES_DIR, + "elixir_mhdmultiion_convergence_twospecies.jl"), + l2=[ + 0.0010051333701703825, + 0.0010394775468233015, + 0.0002811601603064331, + 0.0011347597481031293, + 0.004308549702423105, + 0.004034566673203751, + 0.00016314810227339212, + 0.009733230503960424, + 0.0016373815502533284, + 0.00830712611315323, + 0.008424476361399211, + 0.0002693869498956917, + 0.01786371557065078, + 0.0010551940921468834 + ], + linf=[ + 0.004331125692400628, + 0.006326095686991051, + 0.001622796413497718, + 0.005898819200413019, + 0.023231022862884698, + 0.02015682661284135, + 0.0007163788637357393, + 0.04864671278045618, + 0.010511130196469765, + 0.0391322779237806, + 0.03334142743633839, + 0.0014513724607740641, + 0.09978672252281795, + 0.005053531087457125 + ], + tspan=(0.0, 0.1)) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) +end + @trixi_testset "elixir_mhd_coupled.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_mhd_coupled.jl"), l2=[ diff --git a/test/test_tree_1d_euler.jl b/test/test_tree_1d_euler.jl index 42889430bf0..c786afb5a52 100644 --- a/test/test_tree_1d_euler.jl +++ b/test/test_tree_1d_euler.jl @@ -147,6 +147,26 @@ end @test_allocations(Trixi.rhs!, semi, sol, 1000) end +@trixi_testset "elixir_euler_source_terms_nonperiodic.jl (Gauss-Legendre)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, + "elixir_euler_source_terms_nonperiodic.jl"), + solver=DGSEM(polydeg = 3, basis_type = GaussLegendreBasis, + surface_flux = flux_lax_friedrichs), + l2=[ + 6.179119971404758e-7, + 6.831335637140733e-7, + 1.8153512648336213e-6 + ], + linf=[ + 2.3035825069683824e-6, + 2.7398314812465685e-6, + 7.132056524916663e-6 + ]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) +end + @trixi_testset "elixir_euler_ec.jl" begin 
@test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_ec.jl"), l2=[ diff --git a/test/test_tree_2d_acoustics.jl b/test/test_tree_2d_acoustics.jl index e38ed62cc07..49e183fdc7a 100644 --- a/test/test_tree_2d_acoustics.jl +++ b/test/test_tree_2d_acoustics.jl @@ -87,19 +87,55 @@ end @trixi_testset "elixir_acoustics_gauss_wall.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_acoustics_gauss_wall.jl"), - l2=[0.019419398248465843, 0.019510701017551826, + l2=[ + 0.019419398248465843, + 0.019510701017551826, 0.04818246051887614, - 7.382060834820337e-17, 0.0, 1.4764121669640674e-16, + 7.382060834820337e-17, + 0.0, + 1.4764121669640674e-16, 1.4764121669640674e-16], - linf=[0.18193631937316496, 0.1877464607867628, + linf=[ + 0.18193631937316496, + 0.1877464607867628, 1.0355388011792845, - 2.220446049250313e-16, 0.0, 4.440892098500626e-16, + 2.220446049250313e-16, + 0.0, + 4.440892098500626e-16, 4.440892098500626e-16]) # Ensure that we do not have excessive memory allocations # (e.g., from type instabilities) @test_allocations(Trixi.rhs!, semi, sol, 1000) end +@trixi_testset "elixir_acoustics_gauss_wall.jl (Gauss Legendre)" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_acoustics_gauss_wall.jl"), + solver=DGSEM(polydeg = 5, surface_flux = flux_lax_friedrichs, + basis_type = GaussLegendreBasis), + cfl=0.6, + l2=[ + 0.01944153623864891, + 0.01952877141847981, + 0.04820571764883919, + 1.1071998298551595e-16, + 0.0, + 2.214399659710319e-16, + 2.214399659710319e-16 + ], + linf=[ + 0.1828989576562236, + 0.18857385551148917, + 1.036543390095062, + 3.3306690738754696e-16, + 0.0, + 6.661338147750939e-16, + 6.661338147750939e-16 + ]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) +end + @trixi_testset "elixir_acoustics_monopole.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_acoustics_monopole.jl"), l2=[0.006816790293009947, 0.0065068948357351625, 
diff --git a/test/test_tree_2d_advection.jl b/test/test_tree_2d_advection.jl index b4d11b0683d..672555c4c11 100644 --- a/test/test_tree_2d_advection.jl +++ b/test/test_tree_2d_advection.jl @@ -42,7 +42,7 @@ end @trixi_testset "elixir_advection_implicit_sparse_jacobian_restart.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_implicit_sparse_jacobian_restart.jl"), - l2=[0.007964280656552015], linf=[0.011267546271397588]) + l2=[0.00972948620504335], linf=[0.013761951552254348]) # Ensure that we do not have excessive memory allocations # (e.g., from type instabilities) @test_allocations(Trixi.rhs!, semi_float_type, sol, 1000) @@ -52,7 +52,7 @@ end @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_implicit_sparse_jacobian_restart.jl"), colorvec=nothing, - l2=[0.007964280656552015], linf=[0.011267546271397588]) + l2=[0.00972948620504335], linf=[0.013761951552254348]) # Ensure that we do not have excessive memory allocations # (e.g., from type instabilities) @test_allocations(Trixi.rhs!, semi_float_type, sol, 1000) diff --git a/test/test_tree_2d_euler.jl b/test/test_tree_2d_euler.jl index fa4928e70e5..b4f7bc35841 100644 --- a/test/test_tree_2d_euler.jl +++ b/test/test_tree_2d_euler.jl @@ -289,6 +289,29 @@ end @test_allocations(Trixi.rhs!, semi, sol, 1000) end +@trixi_testset "elixir_euler_source_terms_nonperiodic.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, + "elixir_euler_source_terms_nonperiodic.jl"), + solver=DGSEM(polydeg = 3, basis_type = GaussLegendreBasis, + surface_flux = flux_lax_friedrichs), + cfl=0.8, + l2=[ + 8.565448573947783e-7, + 9.279921990156959e-7, + 9.279921990210634e-7, + 2.6853435359565158e-6 + ], + linf=[ + 3.699190303185773e-6, + 4.467127135754367e-6, + 4.4671271295371184e-6, + 1.5194716922017903e-5 + ]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) +end + @trixi_testset "elixir_euler_ec.jl" begin 
@test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_euler_ec.jl"), l2=[ diff --git a/test/test_tree_3d_euler.jl b/test/test_tree_3d_euler.jl index 1bbcc7ea558..17866e74f47 100644 --- a/test/test_tree_3d_euler.jl +++ b/test/test_tree_3d_euler.jl @@ -498,6 +498,33 @@ end # (e.g., from type instabilities) @test_allocations(Trixi.rhs!, semi, sol, 1000) end + +@trixi_testset "elixir_euler_sedov_blast_wave_sc_subcell.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, + "elixir_euler_sedov_blast_wave_sc_subcell.jl"), + l2=[ + 0.24806841083939926, + 0.07001337223874464, + 0.07001337223806398, + 0.0700133722383429, + 0.3620366037665587 + ], + linf=[ + 0.9384071822566761, + 0.573009568617271, + 0.5730095685845291, + 0.5730095686063774, + 4.861205850307592 + ], + tspan=(0.0, 0.5)) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + # Larger values for allowed allocations due to usage of custom + # integrator which are not *recorded* for the methods from + # OrdinaryDiffEq.jl + # Corresponding issue: https://github.com/trixi-framework/Trixi.jl/issues/1877 + @test_allocations(Trixi.rhs!, semi, sol, 15_000) +end end end # module diff --git a/test/test_tree_3d_mhd.jl b/test/test_tree_3d_mhd.jl index 2f1dc252409..b2d42eb8b6f 100644 --- a/test/test_tree_3d_mhd.jl +++ b/test/test_tree_3d_mhd.jl @@ -221,6 +221,35 @@ end @test_allocations(Trixi.rhs!, semi, sol, 1000) end +@trixi_testset "elixir_mhd_convergence.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_mhd_convergence.jl"), + l2=[ + 0.009193403522877426, + 0.013723993903671483, + 0.013723993903671291, + 0.00748865999127907, + 0.02205355095416697, + 0.009989774083000539, + 0.009989774083000584, + 0.004353727273126007, + 0.0013769046942994955 + ], + linf=[ + 0.027349064091453767, + 0.04489621477501449, + 0.04489621477501515, + 0.02699974461840463, + 0.15656498361977533, + 0.028682970290997645, + 0.028682970290998533, + 0.0124215159258882, + 0.007286608218242374 + ]) + 
# Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) +end + @trixi_testset "elixir_mhd_ec_shockcapturing.jl" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_mhd_ec_shockcapturing.jl"), l2=[ diff --git a/test/test_type.jl b/test/test_type.jl index 048c608223a..06d9c4b1192 100644 --- a/test/test_type.jl +++ b/test/test_type.jl @@ -1628,6 +1628,7 @@ isdir(outdir) && rm(outdir, recursive = true) one(RealT)) dissipation_es = DissipationLaxFriedrichsEntropyVariables() orientations = [1, 2] + normal_direction = SVector(one(RealT), zero(RealT)) @test eltype(@inferred initial_condition_weak_blast_wave(x, t, equations)) == RealT @@ -1660,6 +1661,21 @@ isdir(outdir) && rm(outdir, recursive = true) RealT end + @test eltype(@inferred flux(u, normal_direction, equations)) == RealT + @test eltype(@inferred flux_nonconservative_ruedaramirez_etal(u_ll, u_rr, + normal_direction, + equations)) == + RealT + @test eltype(@inferred flux_nonconservative_central(u_ll, u_rr, + normal_direction, + equations)) == RealT + @test eltype(@inferred flux_ruedaramirez_etal(u_ll, u_rr, normal_direction, + equations)) == RealT + @test typeof(@inferred max_abs_speed_naive(u_ll, u_rr, normal_direction, + equations)) == RealT + @test typeof(Trixi.calc_fast_wavespeed(cons, normal_direction, equations)) == + RealT + @test eltype(@inferred Trixi.max_abs_speeds(u, equations)) == RealT @test eltype(@inferred cons2prim(u, equations)) == RealT @test eltype(@inferred prim2cons(u, equations)) == RealT @@ -1862,7 +1878,7 @@ isdir(outdir) && rm(outdir, recursive = true) RealT end - parabolic_solver = ViscousFormulationLocalDG(RealT(0.1)) + parabolic_solver = ParabolicFormulationLocalDG(RealT(0.1)) @test eltype(@inferred Trixi.penalty(u_outer, u_inner, inv_h, equations_parabolic, parabolic_solver)) == RealT @@ -1885,7 +1901,7 @@ isdir(outdir) && rm(outdir, recursive = true) RealT end - parabolic_solver = 
ViscousFormulationLocalDG(RealT(0.1)) + parabolic_solver = ParabolicFormulationLocalDG(RealT(0.1)) @test eltype(@inferred Trixi.penalty(u_outer, u_inner, inv_h, equations_parabolic, parabolic_solver)) == RealT diff --git a/test/test_unit.jl b/test/test_unit.jl index e50a5f89696..a79aa5b5117 100644 --- a/test/test_unit.jl +++ b/test/test_unit.jl @@ -2448,6 +2448,12 @@ end @test max_abs_speed_naive(u_ll, u_rr, orientation, equations) ≈ max_abs_speed(u_ll, u_rr, orientation, equations) end + + normal_directions = [SVector(1.0, 0.0), SVector(0.0, 1.0), SVector(0.5, -0.5)] + for normal_direction in normal_directions + @test max_abs_speed_naive(u_ll, u_rr, normal_direction, equations) ≈ + max_abs_speed(u_ll, u_rr, normal_direction, equations) + end end @timed_testset "IdealGlmMhdMultiIonEquations3D" begin diff --git a/test/test_visualization.jl b/test/test_visualization.jl index a5ac5b09f0d..367021928fa 100644 --- a/test/test_visualization.jl +++ b/test/test_visualization.jl @@ -441,6 +441,169 @@ end end end +@testset "PlotData2D Regression Tests" begin + using Trixi + equations = CompressibleEulerEquations2D(1.4) + solver = DGSEM(polydeg = 3, + surface_flux = FluxLaxFriedrichs(max_abs_speed_naive)) + + coordinates_min = (-1.0, -1.0) + coordinates_max = (1.0, 1.0) + initial_refinement_level = 3 + + # Manually initialize meshes + mesh_tree = TreeMesh(coordinates_min, coordinates_max; + n_cells_max = 10^5, + initial_refinement_level, + periodicity = true) + + trees_per_dimension = (1, 1) + mesh_p4est = P4estMesh(trees_per_dimension; polydeg = 3, + coordinates_min, coordinates_max, + initial_refinement_level, + periodicity = true) + + cells_per_dimension = (2, 2) .^ initial_refinement_level + mesh_structured = StructuredMesh(cells_per_dimension, + coordinates_min, coordinates_max, + periodicity = true) + + function initial_condition_taylor_green_vortex(x, t, + equations::CompressibleEulerEquations2D) + A = 1.0 # magnitude of speed + Ms = 0.1 # maximum Mach number + + rho = 
1.0 + v1 = A * sin(x[1]) * cos(x[2]) + v2 = -A * cos(x[1]) * sin(x[2]) + p = (A / Ms)^2 * rho / equations.gamma # scaling to get Ms + p = p + + 1.0 / 16.0 * A^2 * rho * + (cos(2 * x[1]) + 2 * cos(2 * x[2]) + + 2 * cos(2 * x[1]) + cos(2 * x[2])) + + return prim2cons(SVector(rho, v1, v2, p), equations) + end + + @testset "Constant IC (Exact Checks)" begin + ic = initial_condition_constant + + @testset "TreeMesh" begin + semi_tree = SemidiscretizationHyperbolic(mesh_tree, equations, ic, solver; + boundary_conditions = boundary_condition_periodic) + u_ode = compute_coefficients(0.0, semi_tree) + pd = PlotData2D(u_ode, semi_tree, solution_variables = cons2prim) + + ref_cons = Trixi.initial_condition_constant(SVector(0.0, 0.0), 0.0, + semi_tree.equations) + ref_prim = cons2prim(ref_cons, semi_tree.equations) + + @test all(x -> isapprox(x, ref_prim[1]), pd.data[1]) # rho + @test all(x -> isapprox(x, ref_prim[2]), pd.data[2]) # v1 + @test all(x -> isapprox(x, ref_prim[3]), pd.data[3]) # v2 + @test all(x -> isapprox(x, ref_prim[4]), pd.data[4]) # p + end + + @testset "StructuredMesh" begin + semi_struct = SemidiscretizationHyperbolic(mesh_structured, equations, ic, + solver; + boundary_conditions = boundary_condition_periodic) + u_ode = compute_coefficients(0.0, semi_struct) + pd = PlotData2D(u_ode, semi_struct, solution_variables = cons2prim) + + ref_cons = Trixi.initial_condition_constant(SVector(0.0, 0.0), 0.0, + semi_struct.equations) + ref_prim = cons2prim(ref_cons, semi_struct.equations) + + @test all(val -> isapprox(val[1], ref_prim[1]), pd.data) # rho + @test all(val -> isapprox(val[2], ref_prim[2]), pd.data) # v1 + @test all(val -> isapprox(val[3], ref_prim[3]), pd.data) # v2 + @test all(val -> isapprox(val[4], ref_prim[4]), pd.data) # p + end + + @testset "P4estMesh" begin + semi_p4est = SemidiscretizationHyperbolic(mesh_p4est, equations, ic, solver; + boundary_conditions = boundary_condition_periodic) + u_ode = compute_coefficients(0.0, semi_p4est) + pd = 
PlotData2D(u_ode, semi_p4est, solution_variables = cons2prim) + + ref_cons = Trixi.initial_condition_constant(SVector(0.0, 0.0), 0.0, + semi_p4est.equations) + ref_prim = cons2prim(ref_cons, semi_p4est.equations) + + @test all(val -> isapprox(val[1], ref_prim[1]), pd.data) # rho + @test all(val -> isapprox(val[2], ref_prim[2]), pd.data) # v1 + @test all(val -> isapprox(val[3], ref_prim[3]), pd.data) # v2 + @test all(val -> isapprox(val[4], ref_prim[4]), pd.data) # p + end + end + + @testset "Non-Constant IC (Taylor-Green Vortex)" begin + ic = initial_condition_taylor_green_vortex + + @testset "TreeMesh" begin + semi_tree = SemidiscretizationHyperbolic(mesh_tree, equations, ic, solver; + boundary_conditions = boundary_condition_periodic) + u_ode = compute_coefficients(0.0, semi_tree) + pd = PlotData2D(u_ode, semi_tree, solution_variables = cons2prim) + + max_error = 0.0 + for (j, y) in enumerate(pd.y), (i, x) in enumerate(pd.x) + u_exact = ic(SVector(x, y), 0.0, semi_tree.equations) + prim_exact = cons2prim(u_exact, semi_tree.equations) + prim_interp = SVector(pd.data[1][i, j], pd.data[2][i, j], + pd.data[3][i, j], pd.data[4][i, j]) + + current_error = maximum(abs.(prim_interp - prim_exact)) + max_error = max(max_error, current_error) + end + # Note that PlotData2D for TreeMesh uses a different algorithm that interpolates + # the solution onto a uniform Cartesian grid. This is less accurate than the + # exact nodal evaluations above, so we need to use a larger tolerance. 
+ @test max_error < 1.05 + end + + @testset "StructuredMesh" begin + semi_struct = SemidiscretizationHyperbolic(mesh_structured, equations, ic, + solver; + boundary_conditions = boundary_condition_periodic) + u_ode = compute_coefficients(0.0, semi_struct) + pd = PlotData2D(u_ode, semi_struct, solution_variables = cons2prim) + + max_error = 0.0 + for i in eachindex(pd.x) + x = pd.x[i] + y = pd.y[i] + u_exact = ic(SVector(x, y), 0.0, semi_struct.equations) + prim_exact = cons2prim(u_exact, semi_struct.equations) + + current_error = maximum(abs.(pd.data[i] - prim_exact)) + max_error = max(max_error, current_error) + end + @test max_error < 1.0e-5 + end + + @testset "P4estMesh" begin + semi_p4est = SemidiscretizationHyperbolic(mesh_p4est, equations, ic, solver; + boundary_conditions = boundary_condition_periodic) + u_ode = compute_coefficients(0.0, semi_p4est) + pd = PlotData2D(u_ode, semi_p4est, solution_variables = cons2prim) + + max_error = 0.0 + for i in eachindex(pd.x) + x = pd.x[i] + y = pd.y[i] + u_exact = ic(SVector(x, y), 0.0, semi_p4est.equations) + prim_exact = cons2prim(u_exact, semi_p4est.equations) + + current_error = maximum(abs.(pd.data[i] - prim_exact)) + max_error = max(max_error, current_error) + end + @test max_error < 1.0e-5 + end + end +end + @timed_testset "PlotData1D (DGMulti)" begin # Test two different approximation types since these use different memory layouts: # - structure of arrays for `Polynomial()`