diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 344b8eacc3a..77dbdd5ea06 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -19,3 +19,23 @@ steps: timeout_in_minutes: 60 soft_fail: - exit_status: 3 + + - label: "Metal Julia {{matrix.version}}" + matrix: + setup: + version: + - "1.10" + plugins: + - JuliaCI/julia#v1: + version: "{{matrix.version}}" + - JuliaCI/julia-test#v1: ~ + env: + TRIXI_TEST: "Metal" + agents: + queue: "juliaecosystem" + os: "macos" + arch: "aarch64" + if: build.message !~ /\[skip ci\]/ + timeout_in_minutes: 60 + soft_fail: + - exit_status: 3 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6f8e4a3f2e4..ff47696c58e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -85,6 +85,7 @@ jobs: - performance_specializations - mpi - threaded + - kernelabstractions include: - version: '1.11' os: ubuntu-latest diff --git a/Project.toml b/Project.toml index ee7b35463cb..947aebf931b 100644 --- a/Project.toml +++ b/Project.toml @@ -58,6 +58,7 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Convex = "f65535da-76fb-5f13-bab9-19810c17039a" ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" +Metal = "dde4c033-4e86-420c-a63e-0dd931031962" NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" @@ -65,6 +66,7 @@ SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" TrixiCUDAExt = "CUDA" TrixiConvexECOSExt = ["Convex", "ECOS"] TrixiMakieExt = "Makie" +TrixiMetalExt = "Metal" TrixiNLsolveExt = "NLsolve" TrixiSparseConnectivityTracerExt = "SparseConnectivityTracer" @@ -90,6 +92,7 @@ LinearAlgebra = "1" LinearMaps = "2.7, 3.0" LoopVectorization = "0.12.171" MPI = "0.20.22" +Metal = "1.9.2" Makie = "0.21, 0.22, 0.23, 0.24" MuladdMacro = "0.2.4" NLsolve = "4.5.1" diff --git a/benchmark/CUDA/Project.toml b/benchmark/CUDA/Project.toml new file mode 100644 index 
00000000000..221c03a5947 --- /dev/null +++ b/benchmark/CUDA/Project.toml @@ -0,0 +1,6 @@ +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +OrdinaryDiffEqLowStorageRK = "b0944070-b475-4768-8dec-fb6eb410534d" +TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb" diff --git a/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl b/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl new file mode 100644 index 00000000000..de491a3761b --- /dev/null +++ b/benchmark/CUDA/elixir_euler_taylor_green_vortex.jl @@ -0,0 +1,79 @@ +using OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# semidiscretization of the compressible Euler equations + +equations = CompressibleEulerEquations3D(1.4) + +function initial_condition_taylor_green_vortex(x, t, + equations::CompressibleEulerEquations3D) + A = 1.0 # magnitude of speed + Ms = 0.1 # maximum Mach number + + rho = 1.0 + v1 = A * sin(x[1]) * cos(x[2]) * cos(x[3]) + v2 = -A * cos(x[1]) * sin(x[2]) * cos(x[3]) + v3 = 0.0 + p = (A / Ms)^2 * rho / equations.gamma # scaling to get Ms + p = p + + 1.0 / 16.0 * A^2 * rho * + (cos(2 * x[1]) * cos(2 * x[3]) + + 2 * cos(2 * x[2]) + 2 * cos(2 * x[1]) + cos(2 * x[2]) * cos(2 * x[3])) + + return prim2cons(SVector(rho, v1, v2, v3, p), equations) +end + +initial_condition = initial_condition_taylor_green_vortex + +# TODO Undefined external symbol "log" +#volume_flux = flux_ranocha +volume_flux = flux_lax_friedrichs +solver = DGSEM(polydeg = 5, surface_flux = volume_flux) +# TODO flux diff +#volume_integral=VolumeIntegralFluxDifferencing(volume_flux)) + +coordinates_min = (-1.0, -1.0, -1.0) .* pi +coordinates_max = (1.0, 1.0, 1.0) .* pi + +initial_refinement_level = 1 +trees_per_dimension = (4, 4, 4) + +mesh = P4estMesh(trees_per_dimension, polydeg = 1, + coordinates_min = coordinates_min, coordinates_max = coordinates_max, + 
periodicity = true, initial_refinement_level = initial_refinement_level) + +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver) + +############################################################################### +# ODE solvers, callbacks etc. + +tspan = (0.0, 100.0) +ode = semidiscretize(semi, tspan; storage_type = nothing, real_type = nothing) + +summary_callback = SummaryCallback() + +stepsize_callback = StepsizeCallback(cfl = 0.1) + +callbacks = CallbackSet(summary_callback, + stepsize_callback) + +############################################################################### +# run the simulation + +maxiters = 200 +run_profiler = false + +# disable warnings when maxiters is reached +integrator = init(ode, CarpenterKennedy2N54(williamson_condition = false), + dt = 1.0, + save_everystep = false, callback = callbacks, + maxiters = maxiters, verbose = false) +if run_profiler + prof_result = CUDA.@profile solve!(integrator) +else + solve!(integrator) + prof_result = nothing +end + +finalize(mesh) diff --git a/benchmark/CUDA/run.jl b/benchmark/CUDA/run.jl new file mode 100644 index 00000000000..70c840722af --- /dev/null +++ b/benchmark/CUDA/run.jl @@ -0,0 +1,89 @@ +using Trixi +using CUDA +using TimerOutputs +using JSON + +function main(elixir_path) + + # setup + maxiters = 50 + initial_refinement_level = 3 + storage_type = CuArray + real_type = Float64 + + println("Warming up...") + + # start simulation with tiny final time to trigger compilation + duration_compile = @elapsed begin + trixi_include(elixir_path, + tspan = (0.0, 1e-14), + storage_type = storage_type, + real_type = real_type) + trixi_include(elixir_path, + tspan = (0.0, 1e-14), + storage_type = storage_type, + real_type = Float32) + end + + println("Finished warm-up in $duration_compile seconds\n") + println("Starting simulation...") + + # start the real simulation + duration_elixir = @elapsed trixi_include(elixir_path, + maxiters = maxiters, + initial_refinement_level = 
initial_refinement_level, + storage_type = storage_type, + real_type = real_type) + + # store metrics (on every rank!) + metrics = Dict{String, Float64}("elapsed time" => duration_elixir) + + # read TimerOutputs timings + timer = Trixi.timer() + metrics["total time"] = 1.0e-9 * TimerOutputs.tottime(timer) + metrics["rhs! time"] = 1.0e-9 * TimerOutputs.time(timer["rhs!"]) + + # compute performance index + nrhscalls = Trixi.ncalls(semi.performance_counter) + walltime = 1.0e-9 * take!(semi.performance_counter) + metrics["PID"] = walltime * Trixi.mpi_nranks() / (Trixi.ndofsglobal(semi) * nrhscalls) + + # write json file + open("metrics.out", "w") do f + indent = 2 + JSON.print(f, metrics, indent) + end + + # run profiler + maxiters = 5 + initial_refinement_level = 1 + + println("Running profiler (Float64)...") + trixi_include(elixir_path, + maxiters = maxiters, + initial_refinement_level = initial_refinement_level, + storage_type = storage_type, + real_type = Float64, + run_profiler = true) + + open("profile_float64.txt", "w") do io + show(io, prof_result) + end + + println("Running profiler (Float32)...") + trixi_include(elixir_path, + maxiters = maxiters, + initial_refinement_level = initial_refinement_level, + storage_type = storage_type, + real_type = Float32, + run_profiler = true) + + open("profile_float32.txt", "w") do io + show(io, prof_result) + end +end + +# hardcoded elixir +elixir_path = joinpath(@__DIR__(), "elixir_euler_taylor_green_vortex.jl") + +main(elixir_path) diff --git a/docs/src/heterogeneous.md b/docs/src/heterogeneous.md index 9d4dc50c181..70d40dd2f6d 100644 --- a/docs/src/heterogeneous.md +++ b/docs/src/heterogeneous.md @@ -120,9 +120,14 @@ function trixi_rhs_fct(mesh, equations, solver, cache, args) end ``` -1. Put the inner code in a new function `rhs_fct_per_element`. Besides the index - `element`, pass all required fields as arguments, but make sure to `@unpack` them from - their structs in advance. +1. 
Move the inner code into a new inlined function `rhs_fct_per_element`. + ```julia + @inline function rhs_fct_per_element(..., element, ...) + ... + end + ``` + Besides the index `element`, pass all required fields as arguments, but make sure to + `@unpack` them from their structs in advance. 2. Where `trixi_rhs_fct` is called, get the backend, i.e., the hardware we are currently running on via `trixi_backend(x)`. This will, e.g., work with `u_ode`. Internally, KernelAbstractions.jl's `get_backend` diff --git a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl index 4553d4823ed..9e1fc0dba8e 100644 --- a/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl +++ b/examples/p4est_2d_dgsem/elixir_advection_basic_gpu.jl @@ -50,9 +50,8 @@ save_solution = SaveSolutionCallback(interval = 100, stepsize_callback = StepsizeCallback(cfl = 1.6) # Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver -callbacks = CallbackSet(summary_callback, stepsize_callback) -# TODO: GPU. 
The `analysis_callback` needs to be updated for GPU support -# analysis_callback, save_solution, stepsize_callback) +callbacks = CallbackSet(summary_callback, analysis_callback, + save_solution, stepsize_callback) ############################################################################### # run the simulation diff --git a/examples/p4est_3d_dgsem/elixir_advection_basic_gpu.jl b/examples/p4est_3d_dgsem/elixir_advection_basic_gpu.jl new file mode 100644 index 00000000000..c0161c2683a --- /dev/null +++ b/examples/p4est_3d_dgsem/elixir_advection_basic_gpu.jl @@ -0,0 +1,62 @@ +# The same setup as tree_3d_dgsem/elixir_advection_basic.jl +# to verify the P4estMesh implementation against TreeMesh + +using OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# semidiscretization of the linear advection equation + +advection_velocity = (0.2, -0.7, 0.5) +equations = LinearScalarAdvectionEquation3D(advection_velocity) + +# Create DG solver with polynomial degree = 3 and (local) Lax-Friedrichs/Rusanov flux as surface flux +solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs) + +coordinates_min = (-1.0, -1.0, -1.0) # minimum coordinates (min(x), min(y), min(z)) +coordinates_max = (1.0, 1.0, 1.0) # maximum coordinates (max(x), max(y), max(z)) + +# Create P4estMesh with 8 x 8 x 8 elements (note `refinement_level=1`) +trees_per_dimension = (4, 4, 4) +mesh = P4estMesh(trees_per_dimension, polydeg = 3, + coordinates_min = coordinates_min, coordinates_max = coordinates_max, + initial_refinement_level = 1, + periodicity = true) + +# A semidiscretization collects data structures and functions for the spatial discretization +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition_convergence_test, + solver; + boundary_conditions = boundary_condition_periodic) + +############################################################################### +# ODE solvers, callbacks etc. 
+ +# Create ODE problem with time span from 0.0 to 1.0 +tspan = (0.0, 1.0) +ode = semidiscretize(semi, tspan; real_type = nothing, storage_type = nothing) + +# At the beginning of the main loop, the SummaryCallback prints a summary of the simulation setup +# and resets the timers +summary_callback = SummaryCallback() + +# The AnalysisCallback allows to analyse the solution in regular intervals and prints the results +analysis_callback = AnalysisCallback(semi, interval = 100) + +# The SaveSolutionCallback allows to save the solution to a file in regular intervals +save_solution = SaveSolutionCallback(interval = 100, + solution_variables = cons2prim) + +# The StepsizeCallback handles the re-calculation of the maximum Δt after each time step +stepsize_callback = StepsizeCallback(cfl = 1.2) + +# Create a CallbackSet to collect all callbacks such that they can be passed to the ODE solver +callbacks = CallbackSet(summary_callback, analysis_callback, + save_solution, stepsize_callback) + +############################################################################### +# run the simulation + +# OrdinaryDiffEq's `solve` method evolves the solution in time and executes the passed callbacks +sol = solve(ode, CarpenterKennedy2N54(williamson_condition = false); + dt = 0.05, # solve needs some value here but it will be overwritten by the stepsize_callback + ode_default_options()..., callback = callbacks); diff --git a/ext/TrixiCUDAExt.jl b/ext/TrixiCUDAExt.jl index 681d2f53a1e..3326c536a76 100644 --- a/ext/TrixiCUDAExt.jl +++ b/ext/TrixiCUDAExt.jl @@ -1,11 +1,19 @@ # Package extension for adding CUDA-based features to Trixi.jl module TrixiCUDAExt -import CUDA: CuArray +import CUDA: CuArray, CuDeviceArray, KernelAdaptor import Trixi function Trixi.storage_type(::Type{<:CuArray}) return CuArray end +function Trixi.unsafe_wrap_or_alloc(::KernelAdaptor, vec, size) + return Trixi.unsafe_wrap_or_alloc(CuDeviceArray, vec, size) +end + +function 
Trixi.unsafe_wrap_or_alloc(::Type{<:CuDeviceArray}, vec::CuDeviceArray, size) + return reshape(vec, size) +end + end diff --git a/ext/TrixiMetalExt.jl b/ext/TrixiMetalExt.jl new file mode 100644 index 00000000000..b1d2be8b926 --- /dev/null +++ b/ext/TrixiMetalExt.jl @@ -0,0 +1,19 @@ +# Package extension for adding Metal-based features to Trixi.jl +module TrixiMetalExt + +import Metal: MtlArray, MtlDeviceArray, Adaptor +import Trixi + +function Trixi.storage_type(::Type{<:MtlArray}) + return MtlArray +end + +function Trixi.unsafe_wrap_or_alloc(::Adaptor, vec, size) + return Trixi.unsafe_wrap_or_alloc(MtlDeviceArray, vec, size) +end + +function Trixi.unsafe_wrap_or_alloc(::Type{<:MtlDeviceArray}, vec::MtlDeviceArray, size) + return reshape(vec, size) +end + +end diff --git a/src/Trixi.jl b/src/Trixi.jl index 7707b8434bc..6394bd34da2 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -59,7 +59,8 @@ using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect using FillArrays: Ones, Zeros using ForwardDiff: ForwardDiff using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace -using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend +using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend, + allocate using LinearMaps: LinearMap if _PREFERENCE_LOOPVECTORIZATION using LoopVectorization: LoopVectorization, @turbo, indices diff --git a/src/callbacks_step/analysis_dg1d.jl b/src/callbacks_step/analysis_dg1d.jl index e53df1dd4c3..83bd9746848 100644 --- a/src/callbacks_step/analysis_dg1d.jl +++ b/src/callbacks_step/analysis_dg1d.jl @@ -124,7 +124,8 @@ end # This avoids the need to divide the RHS of the DG scheme by the Jacobian when computing # the time derivative of entropy, see `entropy_change_reference_element`. function integrate_reference_element(func::Func, u, element, - mesh::AbstractMesh{1}, equations, dg::DGSEM, cache, + ::Type{<:AbstractMesh{1}}, equations, dg::DGSEM, + cache, args...) 
where {Func} @unpack weights = dg.basis @@ -142,9 +143,9 @@ end # Calculate ∫_e (∂S/∂u ⋅ ∂u/∂t) dΩ_e where the result on element 'e' is kept in reference space # Note that ∂S/∂u = w(u) with entropy variables w function entropy_change_reference_element(du, u, element, - mesh::AbstractMesh{1}, + meshT::Type{<:AbstractMesh{1}}, equations, dg::DGSEM, cache, args...) - return integrate_reference_element(u, element, mesh, equations, dg, cache, + return integrate_reference_element(u, element, meshT, equations, dg, cache, du) do u, i, element, equations, dg, du u_node = get_node_vars(u, equations, dg, i, element) du_node = get_node_vars(du, equations, dg, i, element) @@ -155,7 +156,8 @@ end # calculate surface integral of func(u, equations) * normal on the reference element. function surface_integral_reference_element(func::Func, u, element, - mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + ::Type{<:Union{TreeMesh{1}, + StructuredMesh{1}}}, equations, dg::DGSEM, cache, args...) where {Func} u_left = get_node_vars(u, equations, dg, 1, element) diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl index 5035e831eed..96e640f8d62 100644 --- a/src/callbacks_step/analysis_dg2d.jl +++ b/src/callbacks_step/analysis_dg2d.jl @@ -138,7 +138,7 @@ function calc_error_norms(func, u, t, analyzer, return l2_error, linf_error end -function calc_error_norms(func, u, t, analyzer, +function calc_error_norms(func, _u, t, analyzer, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, @@ -146,9 +146,19 @@ function calc_error_norms(func, u, t, analyzer, equations, initial_condition, dg::DGSEM, cache, cache_analysis) @unpack vandermonde, weights = analyzer - @unpack node_coordinates, inverse_jacobian = cache.elements @unpack u_local, u_tmp1, x_local, x_tmp1, jacobian_local, jacobian_tmp1 = cache_analysis + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(_u) + if backend isa Nothing # 
TODO GPU KA CPU backend + @unpack node_coordinates, inverse_jacobian = cache.elements + u = _u + else + node_coordinates = Array(cache.elements.node_coordinates) + inverse_jacobian = Array(cache.elements.inverse_jacobian) + u = Array(_u) + end + # Set up data structures l2_error = zero(func(get_node_vars(u, equations, dg, 1, 1, 1), equations)) linf_error = copy(l2_error) @@ -190,7 +200,8 @@ end # This avoids the need to divide the RHS of the DG scheme by the Jacobian when computing # the time derivative of entropy, see `entropy_change_reference_element`. function integrate_reference_element(func::Func, u, element, - mesh::AbstractMesh{2}, equations, dg::DGSEM, cache, + ::Type{<:AbstractMesh{2}}, equations, dg::DGSEM, + cache, args...) where {Func} @unpack weights = dg.basis @@ -208,9 +219,9 @@ end # Calculate ∫_e (∂S/∂u ⋅ ∂u/∂t) dΩ_e where the result on element 'e' is kept in reference space # Note that ∂S/∂u = w(u) with entropy variables w function entropy_change_reference_element(du, u, element, - mesh::AbstractMesh{2}, + meshT::Type{<:AbstractMesh{2}}, equations, dg::DGSEM, cache, args...) - return integrate_reference_element(u, element, mesh, equations, dg, cache, + return integrate_reference_element(u, element, meshT, equations, dg, cache, du) do u, i, j, element, equations, dg, du u_node = get_node_vars(u, equations, dg, i, j, element) du_node = get_node_vars(du, equations, dg, i, j, element) @@ -221,7 +232,7 @@ end # calculate surface integral of func(u, equations) * normal on the reference element. function surface_integral_reference_element(func::Func, u, element, - mesh::TreeMesh{2}, equations, dg::DGSEM, + ::Type{<:TreeMesh{2}}, equations, dg::DGSEM, cache, args...) where {Func} @unpack weights = dg.basis @@ -250,11 +261,11 @@ end # Note: `get_normal_direction` already returns an outward-pointing normal for all directions, # thus no +- flips are needed here. 
function surface_integral_reference_element(func::Func, u, element, - mesh::Union{StructuredMesh{2}, - StructuredMeshView{2}, - UnstructuredMesh2D, - P4estMesh{2}, - T8codeMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, + T8codeMesh{2}}}, equations, dg::DGSEM, cache, args...) where {Func} @unpack contravariant_vectors = cache.elements @@ -327,13 +338,23 @@ function integrate_via_indices(func::Func, u, return integral end -function integrate_via_indices(func::Func, u, +function integrate_via_indices(func::Func, _u, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, T8codeMesh{2}}, equations, dg::DGSEM, cache, args...; normalize = true) where {Func} - @unpack weights = dg.basis + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(_u) + if backend isa Nothing # TODO GPU KA CPU backend + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements + u = _u + else + weights = Array(dg.basis.weights) + inverse_jacobian = Array(cache.elements.inverse_jacobian) + u = Array(_u) + end # Initialize integral with zeros of the right shape integral = zero(func(u, 1, 1, 1, equations, dg, args...)) @@ -343,7 +364,7 @@ function integrate_via_indices(func::Func, u, @batch reduction=((+, integral), (+, total_volume)) for element in eachelement(dg, cache) for j in eachnode(dg), i in eachnode(dg) - volume_jacobian = abs(inv(cache.elements.inverse_jacobian[i, j, element])) + volume_jacobian = abs(inv(inverse_jacobian[i, j, element])) integral += volume_jacobian * weights[i] * weights[j] * func(u, i, j, element, equations, dg, args...) 
total_volume += volume_jacobian * weights[i] * weights[j] @@ -388,10 +409,18 @@ function integrate(func::Func, u, end end -function analyze(::typeof(entropy_timederivative), du, u, t, +function analyze(::typeof(entropy_timederivative), _du, u, t, mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, T8codeMesh{2}}, equations, dg::Union{DGSEM, FDSBP}, cache) + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(u) + if backend isa Nothing # TODO GPU KA CPU backend + du = _du + else + du = Array(_du) + end + # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ integrate_via_indices(u, mesh, equations, dg, cache, du) do u, i, j, element, equations, dg, du diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl index 634db48de29..2578fa7174a 100644 --- a/src/callbacks_step/analysis_dg3d.jl +++ b/src/callbacks_step/analysis_dg3d.jl @@ -161,14 +161,24 @@ function calc_error_norms(func, u, t, analyzer, return l2_error, linf_error end -function calc_error_norms(func, u, t, analyzer, +function calc_error_norms(func, _u, t, analyzer, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, initial_condition, dg::DGSEM, cache, cache_analysis) @unpack vandermonde, weights = analyzer - @unpack node_coordinates, inverse_jacobian = cache.elements @unpack u_local, u_tmp1, u_tmp2, x_local, x_tmp1, x_tmp2, jacobian_local, jacobian_tmp1, jacobian_tmp2 = cache_analysis + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(_u) + if backend isa Nothing # TODO GPU KA CPU backend + @unpack node_coordinates, inverse_jacobian = cache.elements + u = _u + else + node_coordinates = Array(cache.elements.node_coordinates) + inverse_jacobian = Array(cache.elements.inverse_jacobian) + u = Array(_u) + end + # Set up data structures l2_error = zero(func(get_node_vars(u, equations, dg, 1, 1, 1, 1), equations)) linf_error = copy(l2_error) @@ -214,7 +224,8 @@ # This 
avoids the need to divide the RHS of the DG scheme by the Jacobian when computing # the time derivative of entropy, see `entropy_change_reference_element`. function integrate_reference_element(func::Func, u, element, - mesh::AbstractMesh{3}, equations, dg::DGSEM, cache, + ::Type{<:AbstractMesh{3}}, equations, dg::DGSEM, + cache, args...; normalize = true) where {Func} @unpack weights = dg.basis @@ -232,9 +243,9 @@ end # Calculate ∫_e (∂S/∂u ⋅ ∂u/∂t) dΩ_e where the result on element 'e' is kept in reference space # Note that ∂S/∂u = w(u) with entropy variables w function entropy_change_reference_element(du, u, element, - mesh::AbstractMesh{3}, + meshT::Type{<:AbstractMesh{3}}, equations, dg::DGSEM, cache, args...) - return integrate_reference_element(u, element, mesh, equations, dg, cache, + return integrate_reference_element(u, element, meshT, equations, dg, cache, du) do u, i, j, k, element, equations, dg, du u_node = get_node_vars(u, equations, dg, i, j, k, element) du_node = get_node_vars(du, equations, dg, i, j, k, element) @@ -245,7 +256,7 @@ end # calculate surface integral of func(u, equations) * normal on the reference element. function surface_integral_reference_element(func::Func, u, element, - mesh::TreeMesh{3}, equations, dg::DGSEM, + ::Type{<:TreeMesh{3}}, equations, dg::DGSEM, cache, args...) where {Func} @unpack weights = dg.basis @@ -281,8 +292,9 @@ end # Note: `get_normal_direction` already returns an outward-pointing normal for all directions, # thus no +- flips are needed here. function surface_integral_reference_element(func::Func, u, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, equations, dg::DGSEM, cache, args...) 
where {Func} @unpack contravariant_vectors = cache.elements @@ -377,12 +389,22 @@ function integrate_via_indices(func::Func, u, return integral end -function integrate_via_indices(func::Func, u, +function integrate_via_indices(func::Func, _u, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, dg::DGSEM, cache, args...; normalize = true) where {Func} - @unpack weights = dg.basis + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(_u) + if backend isa Nothing # TODO GPU KA CPU backend + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements + u = _u + else + weights = Array(dg.basis.weights) + inverse_jacobian = Array(cache.elements.inverse_jacobian) + u = Array(_u) + end # Initialize integral with zeros of the right shape integral = zero(func(u, 1, 1, 1, 1, equations, dg, args...)) @@ -392,7 +414,7 @@ function integrate_via_indices(func::Func, u, @batch reduction=((+, integral), (+, total_volume)) for element in eachelement(dg, cache) for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - volume_jacobian = abs(inv(cache.elements.inverse_jacobian[i, j, k, element])) + volume_jacobian = abs(inv(inverse_jacobian[i, j, k, element])) integral += volume_jacobian * weights[i] * weights[j] * weights[k] * func(u, i, j, k, element, equations, dg, args...) 
total_volume += volume_jacobian * weights[i] * weights[j] * weights[k] @@ -438,10 +460,18 @@ function integrate(func::Func, u, end end -function analyze(::typeof(entropy_timederivative), du, u, t, +function analyze(::typeof(entropy_timederivative), _du, u, t, mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, dg::Union{DGSEM, FDSBP}, cache) + # TODO GPU AnalysisCallback currently lives on CPU + backend = trixi_backend(u) + if backend isa Nothing # TODO GPU KA CPU backend + du = _du + else + du = Array(_du) + end + # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ integrate_via_indices(u, mesh, equations, dg, cache, du) do u, i, j, k, element, equations, dg, du diff --git a/src/callbacks_step/save_solution.jl b/src/callbacks_step/save_solution.jl index 6e002d2eb23..68110a3b257 100644 --- a/src/callbacks_step/save_solution.jl +++ b/src/callbacks_step/save_solution.jl @@ -287,6 +287,11 @@ end element_variables = Dict{Symbol, Any}(), node_variables = Dict{Symbol, Any}(); system = "") + # TODO GPU currently on CPU + backend = trixi_backend(u_ode) + if backend !== nothing + u_ode = Array(u_ode) + end mesh, equations, solver, cache = mesh_equations_solver_cache(semi) u = wrap_array_native(u_ode, mesh, equations, solver, cache) save_solution_file(u, t, dt, iter, mesh, equations, solver, cache, diff --git a/src/callbacks_step/stepsize.jl b/src/callbacks_step/stepsize.jl index febda8252b7..c3d8238ed0a 100644 --- a/src/callbacks_step/stepsize.jl +++ b/src/callbacks_step/stepsize.jl @@ -143,8 +143,9 @@ function calculate_dt(u_ode, t, cfl_advective, cfl_diffusive, semi::AbstractSemidiscretization) mesh, equations, solver, cache = mesh_equations_solver_cache(semi) u = wrap_array(u_ode, mesh, equations, solver, cache) + backend = trixi_backend(u_ode) - return cfl_advective(t) * max_dt(u, t, mesh, + return cfl_advective(t) * max_dt(backend, u, t, mesh, have_constant_speed(equations), equations, solver, cache) end @@ -154,8 +155,9 @@ function calculate_dt(u_ode, t, 
cfl_advective::Real, cfl_diffusive::Real, semi::AbstractSemidiscretization) mesh, equations, solver, cache = mesh_equations_solver_cache(semi) u = wrap_array(u_ode, mesh, equations, solver, cache) + backend = trixi_backend(u_ode) - return cfl_advective * max_dt(u, t, mesh, + return cfl_advective * max_dt(backend, u, t, mesh, have_constant_speed(equations), equations, solver, cache) end @@ -167,14 +169,15 @@ function calculate_dt(u_ode, t, cfl_advective, cfl_diffusive, equations_parabolic = semi.equations_parabolic u = wrap_array(u_ode, mesh, equations, solver, cache) + backend = trixi_backend(u_ode) - dt_advective = cfl_advective(t) * max_dt(u, t, mesh, + dt_advective = cfl_advective(t) * max_dt(backend, u, t, mesh, have_constant_speed(equations), equations, solver, cache) cfl_diff = cfl_diffusive(t) if cfl_diff > 0 # Check if diffusive CFL should be considered - dt_diffusive = cfl_diff * max_dt(u, t, mesh, + dt_diffusive = cfl_diff * max_dt(backend, u, t, mesh, have_constant_diffusivity(equations_parabolic), equations, equations_parabolic, solver, cache) diff --git a/src/callbacks_step/stepsize_dg1d.jl b/src/callbacks_step/stepsize_dg1d.jl index 8a029543575..f445fc79c88 100644 --- a/src/callbacks_step/stepsize_dg1d.jl +++ b/src/callbacks_step/stepsize_dg1d.jl @@ -5,7 +5,7 @@ @muladd begin #! 
format: noindent -function max_dt(u, t, mesh::TreeMesh{1}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{1}, constant_speed::False, equations, dg::DG, cache) # Avoid division by zero if the speed vanishes everywhere @@ -28,7 +28,7 @@ function max_dt(u, t, mesh::TreeMesh{1}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::TreeMesh{1}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{1}, constant_diffusivity::False, equations, equations_parabolic::AbstractEquationsParabolic, dg::DG, cache) @@ -53,7 +53,7 @@ function max_dt(u, t, mesh::TreeMesh{1}, return 4 / (nnodes(dg) * max_scaled_diffusivity) end -function max_dt(u, t, mesh::TreeMesh{1}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{1}, constant_speed::True, equations, dg::DG, cache) # Avoid division by zero if the speed vanishes everywhere, @@ -73,7 +73,7 @@ function max_dt(u, t, mesh::TreeMesh{1}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::TreeMesh, # for all dimensions +function max_dt(backend::Nothing, u, t, mesh::TreeMesh, # for all dimensions constant_diffusivity::True, equations, equations_parabolic::AbstractEquationsParabolic, dg::DG, cache) @@ -95,7 +95,7 @@ function max_dt(u, t, mesh::TreeMesh, # for all dimensions return 4 / (nnodes(dg) * max_scaled_diffusivity) end -function max_dt(u, t, mesh::StructuredMesh{1}, +function max_dt(backend::Nothing, u, t, mesh::StructuredMesh{1}, constant_speed::False, equations, dg::DG, cache) # Avoid division by zero if the speed vanishes everywhere @@ -122,7 +122,7 @@ function max_dt(u, t, mesh::StructuredMesh{1}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::StructuredMesh{1}, +function max_dt(backend::Nothing, u, t, mesh::StructuredMesh{1}, constant_speed::True, equations, dg::DG, cache) # Avoid division by zero if the speed vanishes everywhere, diff --git a/src/callbacks_step/stepsize_dg2d.jl b/src/callbacks_step/stepsize_dg2d.jl index 
db1565d9a16..35374ed7028 100644 --- a/src/callbacks_step/stepsize_dg2d.jl +++ b/src/callbacks_step/stepsize_dg2d.jl @@ -5,7 +5,7 @@ @muladd begin #! format: noindent -function max_dt(u, t, mesh::TreeMesh{2}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{2}, constant_speed::False, equations, dg::DG, cache) # Avoid division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection @@ -29,7 +29,7 @@ function max_dt(u, t, mesh::TreeMesh{2}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::TreeMesh{2}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{2}, constant_diffusivity::False, equations, equations_parabolic::AbstractEquationsParabolic, dg::DG, cache) @@ -53,7 +53,7 @@ function max_dt(u, t, mesh::TreeMesh{2}, return 4 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::TreeMesh{2}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{2}, constant_speed::True, equations, dg::DG, cache) # Avoid division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection @@ -72,34 +72,34 @@ function max_dt(u, t, mesh::TreeMesh{2}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::TreeMeshParallel{2}, +function max_dt(backend::Nothing, u, t, mesh::TreeMeshParallel{2}, constant_speed::False, equations, dg::DG, cache) # call the method accepting a general `mesh::TreeMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. 
dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), TreeMesh{2}, + Tuple{typeof(backend), typeof(u), typeof(t), TreeMesh{2}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] return dt end -function max_dt(u, t, mesh::TreeMeshParallel{2}, +function max_dt(backend::Nothing, u, t, mesh::TreeMeshParallel{2}, constant_speed::True, equations, dg::DG, cache) # call the method accepting a general `mesh::TreeMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), TreeMesh{2}, + Tuple{typeof(backend), typeof(u), typeof(t), TreeMesh{2}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] @@ -110,45 +110,111 @@ end # Thus, there is no `max_dt` function for `TreeMeshParallel{2}` and # `equations_parabolic::AbstractEquationsParabolic` implemented. -function max_dt(u, t, +function max_dt(backend::Nothing, u, t, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}, T8codeMesh{2}, StructuredMeshView{2}}, constant_speed::False, equations, dg::DG, cache) # Avoid division by zero if the speed vanishes everywhere, # e.g. 
for steady-state linear advection max_scaled_speed = nextfloat(zero(t)) - @unpack contravariant_vectors, inverse_jacobian = cache.elements - @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache) - max_lambda1 = max_lambda2 = zero(max_scaled_speed) - for j in eachnode(dg), i in eachnode(dg) - u_node = get_node_vars(u, equations, dg, i, j, element) - lambda1, lambda2 = max_abs_speeds(u_node, equations) + max_lambda = max_scaled_speed_per_element(u, typeof(mesh), constant_speed, + equations, dg, contravariant_vectors, + inverse_jacobian, element) + # Use `Base.max` to prevent silent failures, as `max` from `@fastmath` doesn't propagate + # `NaN`s properly. See https://github.com/trixi-framework/Trixi.jl/pull/2445#discussion_r2336812323 + max_scaled_speed = Base.max(max_scaled_speed, max_lambda) + end + return 2 / (nnodes(dg) * max_scaled_speed) +end - # Local speeds transformed to the reference element - Ja11, Ja12 = get_contravariant_vector(1, contravariant_vectors, - i, j, element) - lambda1_transformed = abs(Ja11 * lambda1 + Ja12 * lambda2) - Ja21, Ja22 = get_contravariant_vector(2, contravariant_vectors, - i, j, element) - lambda2_transformed = abs(Ja21 * lambda1 + Ja22 * lambda2) +function max_dt(backend::Backend, u, t, + mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}, + T8codeMesh{2}, StructuredMeshView{2}}, + constant_speed::False, equations, dg::DG, cache) + @unpack contravariant_vectors, inverse_jacobian = cache.elements + num_elements = nelements(dg, cache) + max_scaled_speeds = allocate(backend, eltype(t), num_elements) + + kernel! = max_scaled_speed_KAkernel!(backend) + kernel!(max_scaled_speeds, u, typeof(mesh), constant_speed, equations, dg, + contravariant_vectors, inverse_jacobian, ndrange = num_elements) + # TODO GPU dt on CPU? 
(time integration happens on CPU) + max_scaled_speed = max(nextfloat(zero(t)), maximum(max_scaled_speeds)) + return 2 / (nnodes(dg) * max_scaled_speed) +end - inv_jacobian = abs(inverse_jacobian[i, j, element]) +# works for both constant and non-constant speed +@kernel function max_scaled_speed_KAkernel!(max_scaled_speeds, u, + mT::Type{<:Union{StructuredMesh{2}, + UnstructuredMesh2D, + P4estMesh{2}, + T8codeMesh{2}, + StructuredMeshView{2}}}, + constant_speed, equations, + dg::DG, contravariant_vectors, + inverse_jacobian) + element = @index(Global) + max_scaled_speeds[element] = max_scaled_speed_per_element(u, mT, constant_speed, + equations, dg, + contravariant_vectors, + inverse_jacobian, + element) +end - max_lambda1 = Base.max(max_lambda1, lambda1_transformed * inv_jacobian) - max_lambda2 = Base.max(max_lambda2, lambda2_transformed * inv_jacobian) - end +@inline function max_scaled_speed_per_element(u, + mT::Type{<:Union{StructuredMesh{2}, + UnstructuredMesh2D, + P4estMesh{2}, + T8codeMesh{2}, + StructuredMeshView{2}}}, + constant_speed::False, equations, dg::DG, + contravariant_vectors, inverse_jacobian, + element) + max_lambda1 = max_lambda2 = zero(eltype(u)) + for j in eachnode(dg), i in eachnode(dg) + u_node = get_node_vars(u, equations, dg, i, j, element) + lambda1, lambda2 = max_abs_speeds(u_node, equations) + + # Local speeds transformed to the reference element + Ja11, Ja12 = get_contravariant_vector(1, contravariant_vectors, + i, j, element) + lambda1_transformed = abs(Ja11 * lambda1 + Ja12 * lambda2) + Ja21, Ja22 = get_contravariant_vector(2, contravariant_vectors, + i, j, element) + lambda2_transformed = abs(Ja21 * lambda1 + Ja22 * lambda2) + + inv_jacobian = abs(inverse_jacobian[i, j, element]) + + max_lambda1 = Base.max(max_lambda1, lambda1_transformed * inv_jacobian) + max_lambda2 = Base.max(max_lambda2, lambda2_transformed * inv_jacobian) + end + return max_lambda1 + max_lambda2 +end + +function max_dt(backend::Nothing, u, t, + 
mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}, + P4estMeshView{2}, T8codeMesh{2}, StructuredMeshView{2}}, + constant_speed::True, equations, dg::DG, cache) + max_scaled_speed = nextfloat(zero(t)) + @unpack contravariant_vectors, inverse_jacobian = cache.elements + @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache) + max_scaled_speed_loc = max_scaled_speed_per_element(u, typeof(mesh), + constant_speed, + equations, dg, + contravariant_vectors, + inverse_jacobian, element) # Use `Base.max` to prevent silent failures, as `max` from `@fastmath` doesn't propagate # `NaN`s properly. See https://github.com/trixi-framework/Trixi.jl/pull/2445#discussion_r2336812323 - max_scaled_speed = Base.max(max_scaled_speed, max_lambda1 + max_lambda2) + max_scaled_speed = Base.max(max_scaled_speed, max_scaled_speed_loc) end return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, +function max_dt(backend::Nothing, u, t, mesh::P4estMesh{2}, # Parabolic terms currently only for `TreeMesh` and `P4estMesh` constant_diffusivity::False, equations, equations_parabolic::AbstractEquationsParabolic, @@ -194,41 +260,54 @@ function max_dt(u, t, return 4 / (nnodes(dg) * max_scaled_diffusivity) end -function max_dt(u, t, +function max_dt(backend::Backend, u, t, mesh::Union{StructuredMesh{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}, StructuredMeshView{2}}, constant_speed::True, equations, dg::DG, cache) @unpack contravariant_vectors, inverse_jacobian = cache.elements + num_elements = nelements(dg, cache) + max_scaled_speeds = allocate(backend, eltype(t), num_elements) + + kernel! = max_scaled_speed_KAkernel!(backend) + kernel!(max_scaled_speeds, u, typeof(mesh), constant_speed, equations, dg, + contravariant_vectors, inverse_jacobian, ndrange = num_elements) + # TODO GPU dt on CPU? 
(time integration happens on CPU) + max_scaled_speed = max(nextfloat(zero(t)), maximum(max_scaled_speeds)) + return 2 / (nnodes(dg) * max_scaled_speed) +end - # Avoid division by zero if the speed vanishes everywhere, - # e.g. for steady-state linear advection - max_scaled_speed = nextfloat(zero(t)) - +function max_scaled_speed_per_element(u, + ::Type{<:Union{StructuredMesh{2}, + UnstructuredMesh2D, + P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}, + StructuredMeshView{2}}}, + constant_speed::True, equations, dg::DG, + contravariant_vectors, inverse_jacobian, + element) + max_scaled_speed = zero(eltype(u)) max_lambda1, max_lambda2 = max_abs_speeds(equations) + for j in eachnode(dg), i in eachnode(dg) + # Local speeds transformed to the reference element + Ja11, Ja12 = get_contravariant_vector(1, contravariant_vectors, + i, j, element) + lambda1_transformed = abs(Ja11 * max_lambda1 + Ja12 * max_lambda2) + Ja21, Ja22 = get_contravariant_vector(2, contravariant_vectors, + i, j, element) + lambda2_transformed = abs(Ja21 * max_lambda1 + Ja22 * max_lambda2) - @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache) - for j in eachnode(dg), i in eachnode(dg) - # Local speeds transformed to the reference element - Ja11, Ja12 = get_contravariant_vector(1, contravariant_vectors, - i, j, element) - lambda1_transformed = abs(Ja11 * max_lambda1 + Ja12 * max_lambda2) - Ja21, Ja22 = get_contravariant_vector(2, contravariant_vectors, - i, j, element) - lambda2_transformed = abs(Ja21 * max_lambda1 + Ja22 * max_lambda2) + inv_jacobian = abs(inverse_jacobian[i, j, element]) - inv_jacobian = abs(inverse_jacobian[i, j, element]) - # Use `Base.max` to prevent silent failures, as `max` from `@fastmath` doesn't propagate - # `NaN`s properly. 
See https://github.com/trixi-framework/Trixi.jl/pull/2445#discussion_r2336812323 - max_scaled_speed = Base.max(max_scaled_speed, - inv_jacobian * - (lambda1_transformed + lambda2_transformed)) - end + max_scaled_speed = Base.max(max_scaled_speed, + inv_jacobian * + (lambda1_transformed + lambda2_transformed)) end - return 2 / (nnodes(dg) * max_scaled_speed) + return max_scaled_speed end -function max_dt(u, t, +function max_dt(backend::Nothing, u, t, mesh::P4estMesh{2}, # Parabolic terms currently only for `TreeMesh` and `P4estMesh` constant_diffusivity::True, equations, equations_parabolic::AbstractEquationsParabolic, @@ -273,68 +352,68 @@ function max_dt(u, t, return 4 / (nnodes(dg) * max_scaled_diffusivity) end -function max_dt(u, t, mesh::P4estMeshParallel{2}, +function max_dt(backend::Nothing, u, t, mesh::P4estMeshParallel{2}, constant_speed::False, equations, dg::DG, cache) # call the method accepting a general `mesh::P4estMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{2}, + Tuple{typeof(backend), typeof(u), typeof(t), P4estMesh{2}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] return dt end -function max_dt(u, t, mesh::P4estMeshParallel{2}, +function max_dt(backend::Nothing, u, t, mesh::P4estMeshParallel{2}, constant_speed::True, equations, dg::DG, cache) # call the method accepting a general `mesh::P4estMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` # and create some MPI array type, overloading broadcasting and mapreduce etc. 
# Then, this specific array type should also work well with DiffEq etc. dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{2}, + Tuple{typeof(backend), typeof(u), typeof(t), P4estMesh{2}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] return dt end -function max_dt(u, t, mesh::T8codeMeshParallel{2}, +function max_dt(backend::Nothing, u, t, mesh::T8codeMeshParallel{2}, constant_speed::False, equations, dg::DG, cache) # call the method accepting a general `mesh::T8codeMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), T8codeMesh{2}, + Tuple{typeof(backend), typeof(u), typeof(t), T8codeMesh{2}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] return dt end -function max_dt(u, t, mesh::T8codeMeshParallel{2}, +function max_dt(backend::Nothing, u, t, mesh::T8codeMeshParallel{2}, constant_speed::True, equations, dg::DG, cache) # call the method accepting a general `mesh::T8codeMesh{2}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. 
dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), T8codeMesh{2}, + Tuple{typeof(backend), typeof(u), typeof(t), T8codeMesh{2}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] diff --git a/src/callbacks_step/stepsize_dg3d.jl b/src/callbacks_step/stepsize_dg3d.jl index d310be1675e..55645aa92b2 100644 --- a/src/callbacks_step/stepsize_dg3d.jl +++ b/src/callbacks_step/stepsize_dg3d.jl @@ -5,7 +5,7 @@ @muladd begin #! format: noindent -function max_dt(u, t, mesh::TreeMesh{3}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{3}, constant_speed::False, equations, dg::DG, cache) # Avoid division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection @@ -31,7 +31,7 @@ function max_dt(u, t, mesh::TreeMesh{3}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::TreeMesh{3}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{3}, constant_diffusivity::False, equations, equations_parabolic::AbstractEquationsParabolic, dg::DG, cache) @@ -56,7 +56,7 @@ function max_dt(u, t, mesh::TreeMesh{3}, return 4 / (nnodes(dg) * max_scaled_diffusivity) end -function max_dt(u, t, mesh::TreeMesh{3}, +function max_dt(backend::Nothing, u, t, mesh::TreeMesh{3}, constant_speed::True, equations, dg::DG, cache) # Avoid division by zero if the speed vanishes everywhere, # e.g. 
for steady-state linear advection @@ -76,7 +76,8 @@ function max_dt(u, t, mesh::TreeMesh{3}, return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, +function max_dt(backend::Nothing, u, t, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, constant_speed::False, equations, dg::DG, cache) # Avoid division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection @@ -85,38 +86,75 @@ function max_dt(u, t, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3} @unpack contravariant_vectors, inverse_jacobian = cache.elements @batch reduction=(max, max_scaled_speed) for element in eachelement(dg, cache) - max_lambda1 = max_lambda2 = max_lambda3 = zero(max_scaled_speed) - for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - u_node = get_node_vars(u, equations, dg, i, j, k, element) - lambda1, lambda2, lambda3 = max_abs_speeds(u_node, equations) + max_lambda = max_scaled_speed_element(u, typeof(mesh), equations, dg, + contravariant_vectors, inverse_jacobian, + element) + # Use `Base.max` to prevent silent failures, as `max` from `@fastmath` doesn't propagate + # `NaN`s properly. 
See https://github.com/trixi-framework/Trixi.jl/pull/2445#discussion_r2336812323 + max_scaled_speed = Base.max(max_scaled_speed, max_lambda) + end - Ja11, Ja12, Ja13 = get_contravariant_vector(1, contravariant_vectors, - i, j, k, element) - lambda1_transformed = abs(Ja11 * lambda1 + Ja12 * lambda2 + Ja13 * lambda3) - Ja21, Ja22, Ja23 = get_contravariant_vector(2, contravariant_vectors, - i, j, k, element) - lambda2_transformed = abs(Ja21 * lambda1 + Ja22 * lambda2 + Ja23 * lambda3) - Ja31, Ja32, Ja33 = get_contravariant_vector(3, contravariant_vectors, - i, j, k, element) - lambda3_transformed = abs(Ja31 * lambda1 + Ja32 * lambda2 + Ja33 * lambda3) + return 2 / (nnodes(dg) * max_scaled_speed) +end - inv_jacobian = abs(inverse_jacobian[i, j, k, element]) +function max_dt(backend::Backend, u, t, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, + constant_speed::False, equations, dg::DG, cache) + @unpack contravariant_vectors, inverse_jacobian = cache.elements + num_elements = nelements(dg, cache) + max_scaled_speeds = allocate(backend, eltype(t), num_elements) - max_lambda1 = Base.max(max_lambda1, inv_jacobian * lambda1_transformed) - max_lambda2 = Base.max(max_lambda2, inv_jacobian * lambda2_transformed) - max_lambda3 = Base.max(max_lambda3, inv_jacobian * lambda3_transformed) - end + kernel! = max_scaled_speed_KAkernel!(backend) + kernel!(max_scaled_speeds, u, typeof(mesh), equations, dg, contravariant_vectors, + inverse_jacobian; + ndrange = num_elements) - # Use `Base.max` to prevent silent failures, as `max` from `@fastmath` doesn't propagate - # `NaN`s properly. See https://github.com/trixi-framework/Trixi.jl/pull/2445#discussion_r2336812323 - max_scaled_speed = Base.max(max_scaled_speed, - max_lambda1 + max_lambda2 + max_lambda3) - end + # TODO GPU dt on CPU? 
(time integration happens on CPU) + max_scaled_speed = max(nextfloat(zero(t)), maximum(max_scaled_speeds)) return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, +@kernel function max_scaled_speed_KAkernel!(max_scaled_speeds, u, meshT, equations, + dg, contravariant_vectors, inverse_jacobian) + element = @index(Global) + max_scaled_speeds[element] = max_scaled_speed_element(u, meshT, equations, dg, + contravariant_vectors, + inverse_jacobian, + element) +end + +@inline function max_scaled_speed_element(u, + ::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, equations, dg, + contravariant_vectors, inverse_jacobian, + element) + max_lambda1 = max_lambda2 = max_lambda3 = zero(eltype(u)) + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + u_node = get_node_vars(u, equations, dg, i, j, k, element) + lambda1, lambda2, lambda3 = max_abs_speeds(u_node, equations) + + Ja11, Ja12, Ja13 = get_contravariant_vector(1, contravariant_vectors, + i, j, k, element) + lambda1_transformed = abs(Ja11 * lambda1 + Ja12 * lambda2 + Ja13 * lambda3) + Ja21, Ja22, Ja23 = get_contravariant_vector(2, contravariant_vectors, + i, j, k, element) + lambda2_transformed = abs(Ja21 * lambda1 + Ja22 * lambda2 + Ja23 * lambda3) + Ja31, Ja32, Ja33 = get_contravariant_vector(3, contravariant_vectors, + i, j, k, element) + lambda3_transformed = abs(Ja31 * lambda1 + Ja32 * lambda2 + Ja33 * lambda3) + + inv_jacobian = abs(inverse_jacobian[i, j, k, element]) + + max_lambda1 = max(max_lambda1, inv_jacobian * lambda1_transformed) + max_lambda2 = max(max_lambda2, inv_jacobian * lambda2_transformed) + max_lambda3 = max(max_lambda3, inv_jacobian * lambda3_transformed) + end + return max_lambda1 + max_lambda2 + max_lambda3 +end + +function max_dt(backend::Nothing, u, t, mesh::P4estMesh{3}, # Parabolic terms currently only for `TreeMesh` and `P4estMesh` constant_diffusivity::False, equations, equations_parabolic::AbstractEquationsParabolic, @@ -168,13 +206,19 @@ function 
max_dt(u, t, return 4 / (nnodes(dg) * max_scaled_diffusivity) end -function max_dt(u, t, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, +function max_dt(backend, u, t, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, constant_speed::True, equations, dg::DG, cache) # Avoid division by zero if the speed vanishes everywhere, # e.g. for steady-state linear advection max_scaled_speed = nextfloat(zero(t)) @unpack contravariant_vectors, inverse_jacobian = cache.elements + if backend !== nothing + # TODO: Port to GPU + contravariant_vectors = Array(cache.elements.contravariant_vectors) + inverse_jacobian = Array(cache.elements.inverse_jacobian) + end max_lambda1, max_lambda2, max_lambda3 = max_abs_speeds(equations) @@ -207,7 +251,7 @@ function max_dt(u, t, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3} return 2 / (nnodes(dg) * max_scaled_speed) end -function max_dt(u, t, +function max_dt(backend::Nothing, u, t, mesh::P4estMesh{3}, # Parabolic terms currently only for `TreeMesh` and `P4estMesh` constant_diffusivity::True, equations, equations_parabolic::AbstractEquationsParabolic, @@ -258,68 +302,68 @@ function max_dt(u, t, return 4 / (nnodes(dg) * max_scaled_diffusivity) end -function max_dt(u, t, mesh::P4estMeshParallel{3}, +function max_dt(backend::Nothing, u, t, mesh::P4estMeshParallel{3}, constant_speed::False, equations, dg::DG, cache) # call the method accepting a general `mesh::P4estMesh{3}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. 
dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{3}, + Tuple{typeof(backend), typeof(u), typeof(t), P4estMesh{3}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] return dt end -function max_dt(u, t, mesh::P4estMeshParallel{3}, +function max_dt(backend::Nothing, u, t, mesh::P4estMeshParallel{3}, constant_speed::True, equations, dg::DG, cache) # call the method accepting a general `mesh::P4estMesh{3}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), P4estMesh{3}, + Tuple{typeof(backend), typeof(u), typeof(t), P4estMesh{3}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] return dt end -function max_dt(u, t, mesh::T8codeMeshParallel{3}, +function max_dt(backend::Nothing, u, t, mesh::T8codeMeshParallel{3}, constant_speed::False, equations, dg::DG, cache) # call the method accepting a general `mesh::T8codeMesh{3}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. 
dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), T8codeMesh{3}, + Tuple{typeof(backend), typeof(u), typeof(t), T8codeMesh{3}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] return dt end -function max_dt(u, t, mesh::T8codeMeshParallel{3}, +function max_dt(backend::Nothing, u, t, mesh::T8codeMeshParallel{3}, constant_speed::True, equations, dg::DG, cache) # call the method accepting a general `mesh::T8codeMesh{3}` # TODO: MPI, we should improve this; maybe we should dispatch on `u` # and create some MPI array type, overloading broadcasting and mapreduce etc. # Then, this specific array type should also work well with DiffEq etc. dt = invoke(max_dt, - Tuple{typeof(u), typeof(t), T8codeMesh{3}, + Tuple{typeof(backend), typeof(u), typeof(t), T8codeMesh{3}, typeof(constant_speed), typeof(equations), typeof(dg), typeof(cache)}, - u, t, mesh, constant_speed, equations, dg, cache) + backend, u, t, mesh, constant_speed, equations, dg, cache) # Base.min instead of min needed, see comment in src/auxiliary/math.jl dt = MPI.Allreduce!(Ref(dt), Base.min, mpi_comm())[] diff --git a/src/semidiscretization/semidiscretization_euler_gravity.jl b/src/semidiscretization/semidiscretization_euler_gravity.jl index 661fcdfd5ad..161af1b079b 100644 --- a/src/semidiscretization/semidiscretization_euler_gravity.jl +++ b/src/semidiscretization/semidiscretization_euler_gravity.jl @@ -301,6 +301,7 @@ function update_gravity!(semi::SemidiscretizationEulerGravity, u_ode) u_euler = wrap_array(u_ode, semi_euler) u_gravity = wrap_array(cache.u_ode, semi_gravity) du_gravity = wrap_array(cache.du_ode, semi_gravity) + backend = trixi_backend(u_ode) # set up main loop finalstep = false @@ -312,7 +313,7 @@ function 
update_gravity!(semi::SemidiscretizationEulerGravity, u_ode) @unpack equations = semi_gravity while !finalstep dtau = @trixi_timeit timer() "calculate dtau" begin - cfl * max_dt(u_gravity, tau, semi_gravity.mesh, + cfl * max_dt(backend, u_gravity, tau, semi_gravity.mesh, have_constant_speed(equations), equations, semi_gravity.solver, semi_gravity.cache) end diff --git a/src/semidiscretization/semidiscretization_hyperbolic.jl b/src/semidiscretization/semidiscretization_hyperbolic.jl index cd20fbf8e3f..17e4ad4ef65 100644 --- a/src/semidiscretization/semidiscretization_hyperbolic.jl +++ b/src/semidiscretization/semidiscretization_hyperbolic.jl @@ -577,10 +577,11 @@ function rhs!(du_ode, u_ode, semi::SemidiscretizationHyperbolic, t) u = wrap_array(u_ode, mesh, equations, solver, cache) du = wrap_array(du_ode, mesh, equations, solver, cache) + backend = trixi_backend(u_ode) # TODO: Taal decide, do we need to pass the mesh? time_start = time_ns() - @trixi_timeit timer() "rhs!" rhs!(du, u, t, mesh, equations, + @trixi_timeit timer() "rhs!" rhs!(backend, du, u, t, mesh, equations, boundary_conditions, source_terms, solver, cache) runtime = time_ns() - time_start put!(semi.performance_counter, runtime) diff --git a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl index c730439017c..cab2626783c 100644 --- a/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl +++ b/src/semidiscretization/semidiscretization_hyperbolic_parabolic.jl @@ -369,10 +369,11 @@ function rhs!(du_ode, u_ode, semi::SemidiscretizationHyperbolicParabolic, t) u = wrap_array(u_ode, mesh, equations, solver, cache) du = wrap_array(du_ode, mesh, equations, solver, cache) + backend = trixi_backend(u_ode) # TODO: Taal decide, do we need to pass the mesh? time_start = time_ns() - @trixi_timeit timer() "rhs!" rhs!(du, u, t, mesh, equations, + @trixi_timeit timer() "rhs!" 
rhs!(backend, du, u, t, mesh, equations, boundary_conditions, source_terms, solver, cache) runtime = time_ns() - time_start put!(semi.performance_counter.counters[1], runtime) diff --git a/src/solvers/dg.jl b/src/solvers/dg.jl index 8a11429766a..978ee5cfe39 100644 --- a/src/solvers/dg.jl +++ b/src/solvers/dg.jl @@ -1056,6 +1056,13 @@ end return u_ll, u_rr end +# As above but dispatches on an type argument +@inline function get_surface_node_vars(u, equations, ::Type{<:DG}, indices...) + u_ll = SVector(ntuple(@inline(v->u[1, v, indices...]), Val(nvariables(equations)))) + u_rr = SVector(ntuple(@inline(v->u[2, v, indices...]), Val(nvariables(equations)))) + return u_ll, u_rr +end + @inline function set_node_vars!(u, u_node, equations, solver::DG, indices...) for v in eachvariable(equations) u[v, indices...] = u_node[v] @@ -1218,55 +1225,46 @@ function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{ return nothing end -function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{2}, +function compute_coefficients!(backend::Nothing, u, func, t, + mesh::Union{AbstractMesh{2}, AbstractMesh{3}}, equations, dg::DG, cache) @unpack node_coordinates = cache.elements + node_indices = CartesianIndices(ntuple(_ -> nnodes(dg), ndims(mesh))) @threaded for element in eachelement(dg, cache) compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, - element) + element, node_indices) end return nothing end -function compute_coefficients!(backend::Backend, u, func, t, mesh::AbstractMesh{2}, +function compute_coefficients!(backend::Backend, u, func, t, + mesh::Union{AbstractMesh{2}, AbstractMesh{3}}, equations, dg::DG, cache) nelements(dg, cache) == 0 && return nothing + @unpack node_coordinates = cache.elements - kernel! = compute_coefficients_kernel!(backend) - kernel!(u, func, t, equations, dg, node_coordinates, + node_indices = CartesianIndices(ntuple(_ -> nnodes(dg), ndims(mesh))) + + kernel! 
= compute_coefficients_KAkernel!(backend) + kernel!(u, func, t, equations, dg, node_coordinates, node_indices, ndrange = nelements(dg, cache)) return nothing end -@kernel function compute_coefficients_kernel!(u, func, t, equations, - dg::DG, node_coordinates) +@kernel function compute_coefficients_KAkernel!(u, func, t, equations, + dg::DG, node_coordinates, node_indices) element = @index(Global) - compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, - element) + compute_coefficients_element!(u, func, t, equations, dg, node_coordinates, element, + node_indices) end -function compute_coefficients_element!(u, func, t, equations, dg::DG, - node_coordinates, element) - for j in eachnode(dg), i in eachnode(dg) - x_node = get_node_coords(node_coordinates, equations, dg, i, - j, element) +@inline function compute_coefficients_element!(u, func, t, equations, dg::DG, + node_coordinates, element, node_indices) + for indices in node_indices + x_node = get_node_coords(node_coordinates, equations, dg, indices, element) u_node = func(x_node, t, equations) - set_node_vars!(u, u_node, equations, dg, i, j, element) - end - - return nothing -end - -function compute_coefficients!(backend::Nothing, u, func, t, mesh::AbstractMesh{3}, - equations, dg::DG, cache) - @threaded for element in eachelement(dg, cache) - for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - x_node = get_node_coords(cache.elements.node_coordinates, equations, dg, i, - j, k, element) - u_node = func(x_node, t, equations) - set_node_vars!(u, u_node, equations, dg, i, j, k, element) - end + set_node_vars!(u, u_node, equations, dg, indices, element) end return nothing diff --git a/src/solvers/dgmulti/dg.jl b/src/solvers/dgmulti/dg.jl index 4f302a196a9..dddbcb1a4e6 100644 --- a/src/solvers/dgmulti/dg.jl +++ b/src/solvers/dgmulti/dg.jl @@ -239,7 +239,7 @@ function dt_polydeg_scaling(dg::DGMulti{3, <:Wedge, <:TensorProductWedge}) end # for the stepsize callback -function max_dt(u, t, 
mesh::DGMultiMesh, +function max_dt(backend::Nothing, u, t, mesh::DGMultiMesh, constant_diffusivity::False, equations, equations_parabolic::AbstractEquationsParabolic, dg::DGMulti{NDIMS}, @@ -268,7 +268,7 @@ function max_dt(u, t, mesh::DGMultiMesh, return 2 * dt_min * dt_polydeg_scaling(dg) end -function max_dt(u, t, mesh::DGMultiMesh, +function max_dt(backend::Nothing, u, t, mesh::DGMultiMesh, constant_diffusivity::True, equations, equations_parabolic::AbstractEquationsParabolic, dg::DGMulti{NDIMS}, @@ -297,7 +297,7 @@ function max_dt(u, t, mesh::DGMultiMesh, end # for the stepsize callback -function max_dt(u, t, mesh::DGMultiMesh, +function max_dt(backend, u, t, mesh::DGMultiMesh, constant_speed::False, equations, dg::DGMulti{NDIMS}, cache) where {NDIMS} @unpack md = mesh @@ -320,7 +320,7 @@ function max_dt(u, t, mesh::DGMultiMesh, return 2 * dt_min * dt_polydeg_scaling(dg) end -function max_dt(u, t, mesh::DGMultiMesh, +function max_dt(backend, u, t, mesh::DGMultiMesh, constant_speed::True, equations, dg::DGMulti{NDIMS}, cache) where {NDIMS} @unpack md = mesh @@ -727,7 +727,7 @@ function calc_sources!(du, u, t, source_terms, return nothing end -function rhs!(du, u, t, mesh, equations, +function rhs!(backend, du, u, t, mesh, equations, boundary_conditions::BC, source_terms::Source, dg::DGMulti, cache) where {BC, Source} @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache) diff --git a/src/solvers/dgmulti/flux_differencing.jl b/src/solvers/dgmulti/flux_differencing.jl index 56d4e1f3752..e78d0fbee98 100644 --- a/src/solvers/dgmulti/flux_differencing.jl +++ b/src/solvers/dgmulti/flux_differencing.jl @@ -631,7 +631,7 @@ end # an entropy conservative/stable discretization. For modal DG schemes, an extra `entropy_projection!` # is required (see https://doi.org/10.1016/j.jcp.2018.02.033, Section 4.3). # Also called by DGMultiFluxDiff{<:GaussSBP} solvers. 
-function rhs!(du, u, t, mesh, equations, boundary_conditions::BC, +function rhs!(backend, du, u, t, mesh, equations, boundary_conditions::BC, source_terms::Source, dg::DGMultiFluxDiff, cache) where {Source, BC} @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache) @@ -676,7 +676,7 @@ end # integral, e.g., an entropy conservative/stable discretization. The implementation of `rhs!` # for such schemes is very similar to the implementation of `rhs!` for standard DG methods, # but specializes `calc_volume_integral`. -function rhs!(du, u, t, mesh, equations, +function rhs!(backend, du, u, t, mesh, equations, boundary_conditions::BC, source_terms::Source, dg::DGMultiFluxDiffSBP, cache) where {BC, Source} @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache) diff --git a/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl b/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl index 5185a7dbec5..4bd95de7e7a 100644 --- a/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl +++ b/src/solvers/dgmulti/flux_differencing_gauss_sbp.jl @@ -598,7 +598,7 @@ end # Specialize RHS so that we can call `invert_jacobian_and_interpolate!` instead of just `invert_jacobian!`, # since `invert_jacobian!` is also used in other places (e.g., parabolic terms). -function rhs!(du, u, t, mesh, equations, boundary_conditions::BC, +function rhs!(backend, du, u, t, mesh, equations, boundary_conditions::BC, source_terms::Source, dg::DGMultiFluxDiff{<:GaussSBP}, cache) where {Source, BC} @trixi_timeit timer() "reset ∂u/∂t" set_zero!(du, dg, cache) diff --git a/src/solvers/dgsem/calc_volume_integral.jl b/src/solvers/dgsem/calc_volume_integral.jl index 502e31c9f36..9a7865948dc 100644 --- a/src/solvers/dgsem/calc_volume_integral.jl +++ b/src/solvers/dgsem/calc_volume_integral.jl @@ -8,44 +8,44 @@ # The following `volume_integral_kernel!` and `calc_volume_integral!` functions are # dimension and meshtype agnostic, i.e., valid for all 1D, 2D, and 3D meshes. 
-@inline function volume_integral_kernel!(du, u, element, mesh, +@inline function volume_integral_kernel!(du, u, element, meshT, have_nonconservative_terms, equations, volume_integral::VolumeIntegralWeakForm, dg, cache, alpha = true) - weak_form_kernel!(du, u, element, mesh, + weak_form_kernel!(du, u, element, meshT, have_nonconservative_terms, equations, dg, cache, alpha) return nothing end -@inline function volume_integral_kernel!(du, u, element, mesh, +@inline function volume_integral_kernel!(du, u, element, meshT, have_nonconservative_terms, equations, volume_integral::VolumeIntegralFluxDifferencing, dg, cache, alpha = true) @unpack volume_flux = volume_integral # Volume integral specific data - flux_differencing_kernel!(du, u, element, mesh, + flux_differencing_kernel!(du, u, element, meshT, have_nonconservative_terms, equations, volume_flux, dg, cache, alpha) return nothing end -@inline function volume_integral_kernel!(du, u, element, mesh, +@inline function volume_integral_kernel!(du, u, element, meshT, have_nonconservative_terms, equations, volume_integral::VolumeIntegralPureLGLFiniteVolume, dg::DGSEM, cache, alpha = true) @unpack volume_flux_fv = volume_integral # Volume integral specific data - fv_kernel!(du, u, mesh, + fv_kernel!(du, u, meshT, have_nonconservative_terms, equations, volume_flux_fv, dg, cache, element, alpha) return nothing end -@inline function volume_integral_kernel!(du, u, element, mesh, +@inline function volume_integral_kernel!(du, u, element, meshT, have_nonconservative_terms, equations, volume_integral::VolumeIntegralPureLGLFiniteVolumeO2, dg::DGSEM, cache, alpha = true) @@ -53,7 +53,7 @@ end @unpack (sc_interface_coords, volume_flux_fv, reconstruction_mode, slope_limiter, cons2recon, recon2cons) = volume_integral - fvO2_kernel!(du, u, mesh, + fvO2_kernel!(du, u, meshT, have_nonconservative_terms, equations, volume_flux_fv, dg, cache, element, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -63,14 +63,14 @@ end return 
nothing end -@inline function volume_integral_kernel!(du, u, element, mesh, +@inline function volume_integral_kernel!(du, u, element, meshT, have_nonconservative_terms, equations, volume_integral::VolumeIntegralAdaptive{<:IndicatorEntropyChange}, dg::DGSEM, cache) @unpack volume_integral_default, volume_integral_stabilized, indicator = volume_integral @unpack maximum_entropy_increase = indicator - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, meshT, have_nonconservative_terms, equations, volume_integral_default, dg, cache) @@ -79,11 +79,11 @@ end # No scaling by inverse Jacobian here, as there is no Jacobian multiplication # in `integrate_reference_element`. dS_default = -entropy_change_reference_element(du, u, element, - mesh, equations, dg, cache) + meshT, equations, dg, cache) # Compute true entropy change given by surface integral of the entropy potential dS_true = surface_integral_reference_element(entropy_potential, u, element, - mesh, equations, dg, cache) + meshT, equations, dg, cache) entropy_change = dS_default - dS_true if entropy_change > maximum_entropy_increase # Recompute using EC FD volume integral @@ -92,7 +92,7 @@ end # before any surface terms are added. 
du[.., element] .= zero(eltype(du)) - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, meshT, have_nonconservative_terms, equations, volume_integral_stabilized, dg, cache) end @@ -100,7 +100,7 @@ end return nothing end -@inline function volume_integral_kernel!(du, u, element, mesh, +@inline function volume_integral_kernel!(du, u, element, meshT, have_nonconservative_terms, equations, volume_integral::VolumeIntegralEntropyCorrection, dg::DGSEM, cache) @@ -110,7 +110,7 @@ end du_element_threaded = indicator.cache.volume_integral_values_threaded # run default volume integral - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, meshT, have_nonconservative_terms, equations, volume_integral_default, dg, cache) @@ -125,12 +125,12 @@ end # No scaling by inverse Jacobian here, as there is no Jacobian multiplication # in `integrate_reference_element`. dS_volume_integral = -entropy_change_reference_element(du, u, element, - mesh, equations, + meshT, equations, dg, cache) # Compute true entropy change given by surface integral of the entropy potential dS_true = surface_integral_reference_element(entropy_potential, u, element, - mesh, equations, dg, cache) + meshT, equations, dg, cache) # This quantity should be ≤ 0 for an entropy stable volume integral, and # exactly zero for an entropy conservative volume integral. 
@@ -147,13 +147,13 @@ end du[.., element] .= zero(eltype(du)) # Calculate entropy stable volume integral contribution - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, meshT, have_nonconservative_terms, equations, volume_integral_stabilized, dg, cache) dS_volume_integral_stabilized = -entropy_change_reference_element(du, u, element, - mesh, + meshT, equations, dg, cache) @@ -177,11 +177,11 @@ end return nothing end -function calc_volume_integral!(du, u, mesh, +function calc_volume_integral!(backend::Nothing, du, u, mesh, have_nonconservative_terms, equations, volume_integral, dg::DGSEM, cache) @threaded for element in eachelement(dg, cache) - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, typeof(mesh), have_nonconservative_terms, equations, volume_integral, dg, cache) end @@ -189,7 +189,27 @@ function calc_volume_integral!(du, u, mesh, return nothing end -function calc_volume_integral!(du, u, mesh, +function calc_volume_integral!(backend::Backend, du, u, mesh, + have_nonconservative_terms, equations, + volume_integral, dg::DGSEM, cache) + nelements(dg, cache) == 0 && return nothing + kernel! 
= volume_integral_KAkernel!(backend) + kernel_cache = kernel_filter_cache(cache) + kernel!(du, u, typeof(mesh), have_nonconservative_terms, equations, + volume_integral, dg, kernel_cache, + ndrange = nelements(dg, cache)) + return nothing +end + +@kernel function volume_integral_KAkernel!(du, u, meshT, + have_nonconservative_terms, equations, + volume_integral, dg::DGSEM, cache) + element = @index(Global) + volume_integral_kernel!(du, u, element, meshT, have_nonconservative_terms, + equations, volume_integral, dg, cache) +end + +function calc_volume_integral!(backend::Nothing, du, u, mesh, have_nonconservative_terms, equations, volume_integral::VolumeIntegralShockCapturingHGType, dg::DGSEM, cache) @@ -210,18 +230,18 @@ function calc_volume_integral!(du, u, mesh, dg_only = isapprox(alpha_element, 0, atol = atol) if dg_only - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, typeof(mesh), have_nonconservative_terms, equations, volume_integral_default, dg, cache) else # Calculate DG volume integral contribution - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, typeof(mesh), have_nonconservative_terms, equations, volume_integral_blend_high_order, dg, cache, 1 - alpha_element) # Calculate FV volume integral contribution - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, typeof(mesh), have_nonconservative_terms, equations, volume_integral_blend_low_order, dg, cache, alpha_element) @@ -231,7 +251,7 @@ function calc_volume_integral!(du, u, mesh, return nothing end -function calc_volume_integral!(du, u, mesh, +function calc_volume_integral!(backend::Nothing, du, u, mesh, have_nonconservative_terms, equations, volume_integral::VolumeIntegralEntropyCorrectionShockCapturingCombined, dg::DGSEM, cache) @@ -250,7 +270,7 @@ function calc_volume_integral!(du, u, mesh, @threaded for element in eachelement(dg, cache) # run default volume integral - 
volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, typeof(mesh), have_nonconservative_terms, equations, volume_integral_default, dg, cache) @@ -265,12 +285,12 @@ function calc_volume_integral!(du, u, mesh, # No scaling by inverse Jacobian here, as there is no Jacobian multiplication # in `integrate_reference_element`. dS_volume_integral = -entropy_change_reference_element(du, u, element, - mesh, equations, + typeof(mesh), equations, dg, cache) # Compute true entropy change given by surface integral of the entropy potential dS_true = surface_integral_reference_element(entropy_potential, u, element, - mesh, equations, dg, cache) + typeof(mesh), equations, dg, cache) # This quantity should be ≤ 0 for an entropy stable volume integral, and # exactly zero for an entropy conservative volume integral. @@ -287,13 +307,13 @@ function calc_volume_integral!(du, u, mesh, du[.., element] .= zero(eltype(du)) # Calculate entropy stable volume integral contribution - volume_integral_kernel!(du, u, element, mesh, + volume_integral_kernel!(du, u, element, typeof(mesh), have_nonconservative_terms, equations, volume_integral_stabilized, dg, cache) dS_volume_integral_stabilized = -entropy_change_reference_element(du, u, element, - mesh, + typeof(mesh), equations, dg, cache) diff --git a/src/solvers/dgsem_p4est/containers.jl b/src/solvers/dgsem_p4est/containers.jl index 7ac6febf470..3f86fff2bb9 100644 --- a/src/solvers/dgsem_p4est/containers.jl +++ b/src/solvers/dgsem_p4est/containers.jl @@ -936,6 +936,24 @@ end end end +@inline function indices2direction2d(indices) + if indices[1] === :begin + return 1 + elseif indices[1] === :end + return 2 + elseif indices[2] === :begin + return 3 + else # if indices[2] === :end + return 4 + end +end + +# Build a reduced cache which can be passed to GPU kernels +@inline function kernel_filter_cache(cache) + return (; + elements = (; contravariant_vectors = cache.elements.contravariant_vectors)) +end + 
include("containers_2d.jl") include("containers_3d.jl") include("containers_parallel.jl") diff --git a/src/solvers/dgsem_p4est/dg_2d.jl b/src/solvers/dgsem_p4est/dg_2d.jl index d4cd609c6bd..ec48d5d204a 100644 --- a/src/solvers/dgsem_p4est/dg_2d.jl +++ b/src/solvers/dgsem_p4est/dg_2d.jl @@ -62,125 +62,218 @@ end end end -function prolong2interfaces!(cache, u, - mesh::Union{P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, +function prolong2interfaces!(backend::Nothing, cache, u, + mesh::Union{P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, equations, dg::DG) @unpack interfaces = cache + @unpack neighbor_ids, node_indices = cache.interfaces index_range = eachnode(dg) @threaded for interface in eachinterface(dg, cache) - # Copy solution data from the primary element using "delayed indexing" with - # a start value and a step size to get the correct face and orientation. - # Note that in the current implementation, the interface will be - # "aligned at the primary element", i.e., the index of the primary side - # will always run forwards. - primary_element = interfaces.neighbor_ids[1, interface] - primary_indices = interfaces.node_indices[1, interface] + prolong2interfaces_per_interface!(interfaces.u, u, interface, typeof(mesh), + equations, neighbor_ids, node_indices, + index_range) + end + return nothing +end + +function prolong2interfaces!(backend::Backend, cache, u, + mesh::Union{P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + equations, dg::DG) + @unpack interfaces = cache + ninterfaces(interfaces) == 0 && return nothing + @unpack neighbor_ids, node_indices = cache.interfaces + index_range = eachnode(dg) + + kernel! 
= prolong2interfaces_KAkernel!(backend) + kernel!(interfaces.u, u, typeof(mesh), equations, neighbor_ids, node_indices, + index_range, ndrange = ninterfaces(interfaces)) + return nothing +end + +@kernel function prolong2interfaces_KAkernel!(interfaces_u, u, + mT::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + equations, neighbor_ids, + node_indices, index_range) + interface = @index(Global) + prolong2interfaces_per_interface!(interfaces_u, u, interface, mT, equations, + neighbor_ids, node_indices, index_range) +end + +@inline function prolong2interfaces_per_interface!(interfaces_u, u, interface, + ::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + equations, neighbor_ids, + node_indices, + index_range) + primary_element = neighbor_ids[1, interface] + primary_indices = node_indices[1, interface] + + i_primary_start, i_primary_step = index_to_start_step_2d(primary_indices[1], + index_range) + j_primary_start, j_primary_step = index_to_start_step_2d(primary_indices[2], + index_range) + + i_primary = i_primary_start + j_primary = j_primary_start + for i in index_range + for v in eachvariable(equations) + interfaces_u[1, v, i, interface] = u[v, i_primary, j_primary, + primary_element] + end + i_primary += i_primary_step + j_primary += j_primary_step + end - i_primary_start, i_primary_step = index_to_start_step_2d(primary_indices[1], + # Copy solution data from the secondary element using "delayed indexing" with + # a start value and a step size to get the correct face and orientation. 
+ secondary_element = neighbor_ids[2, interface] + secondary_indices = node_indices[2, interface] + + i_secondary_start, i_secondary_step = index_to_start_step_2d(secondary_indices[1], index_range) - j_primary_start, j_primary_step = index_to_start_step_2d(primary_indices[2], + j_secondary_start, j_secondary_step = index_to_start_step_2d(secondary_indices[2], index_range) - i_primary = i_primary_start - j_primary = j_primary_start - for i in eachnode(dg) - for v in eachvariable(equations) - interfaces.u[1, v, i, interface] = u[v, i_primary, j_primary, - primary_element] - end - i_primary += i_primary_step - j_primary += j_primary_step + i_secondary = i_secondary_start + j_secondary = j_secondary_start + for i in index_range + for v in eachvariable(equations) + interfaces_u[2, v, i, interface] = u[v, i_secondary, j_secondary, + secondary_element] end + i_secondary += i_secondary_step + j_secondary += j_secondary_step + end - # Copy solution data from the secondary element using "delayed indexing" with - # a start value and a step size to get the correct face and orientation. 
- secondary_element = interfaces.neighbor_ids[2, interface] - secondary_indices = interfaces.node_indices[2, interface] + return nothing +end - i_secondary_start, i_secondary_step = index_to_start_step_2d(secondary_indices[1], - index_range) - j_secondary_start, j_secondary_step = index_to_start_step_2d(secondary_indices[2], - index_range) +function calc_interface_flux!(backend::Nothing, surface_flux_values, + mesh::Union{P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + have_nonconservative_terms, + equations, surface_integral, dg::DG, cache) + @unpack neighbor_ids, node_indices = cache.interfaces + @unpack contravariant_vectors = cache.elements + index_range = eachnode(dg) - i_secondary = i_secondary_start - j_secondary = j_secondary_start - for i in eachnode(dg) - for v in eachvariable(equations) - interfaces.u[2, v, i, interface] = u[v, i_secondary, j_secondary, - secondary_element] - end - i_secondary += i_secondary_step - j_secondary += j_secondary_step - end + @threaded for interface in eachinterface(dg, cache) + calc_interface_flux_per_interface!(surface_flux_values, typeof(mesh), + have_nonconservative_terms, + equations, surface_integral, typeof(dg), + cache.interfaces.u, interface, + neighbor_ids, node_indices, + contravariant_vectors, index_range) end return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Backend, surface_flux_values, mesh::Union{P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, have_nonconservative_terms, equations, surface_integral, dg::DG, cache) + ninterfaces(cache.interfaces) == 0 && return nothing @unpack neighbor_ids, node_indices = cache.interfaces @unpack contravariant_vectors = cache.elements index_range = eachnode(dg) + + kernel! 
= calc_interface_flux_KAkernel!(backend) + kernel!(surface_flux_values, typeof(mesh), have_nonconservative_terms, + equations, surface_integral, typeof(dg), cache.interfaces.u, + neighbor_ids, node_indices, contravariant_vectors, index_range, + ndrange = ninterfaces(cache.interfaces)) + + return nothing +end + +@kernel function calc_interface_flux_KAkernel!(surface_flux_values, + mt::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + have_nonconservative_terms, + equations, surface_integral, + st::Type{<:DG}, u_interface, + neighbor_ids, node_indices, + contravariant_vectors, index_range) + interface = @index(Global) + calc_interface_flux_per_interface!(surface_flux_values, mt, + have_nonconservative_terms, equations, + surface_integral, st, u_interface, + interface, neighbor_ids, node_indices, + contravariant_vectors, index_range) +end + +@inline function calc_interface_flux_per_interface!(surface_flux_values, + mt::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + have_nonconservative_terms, + equations, surface_integral, + st::Type{<:DG}, + u_interface, interface, + neighbor_ids, + node_indices, contravariant_vectors, + index_range) index_end = last(index_range) - @threaded for interface in eachinterface(dg, cache) - # Get element and side index information on the primary element - primary_element = neighbor_ids[1, interface] - primary_indices = node_indices[1, interface] - primary_direction = indices2direction(primary_indices) + # Get element and side index information on the primary element + primary_element = neighbor_ids[1, interface] + primary_indices = node_indices[1, interface] + primary_direction = indices2direction2d(primary_indices) - # Create the local i,j indexing on the primary element used to pull normal direction information - i_primary_start, i_primary_step = index_to_start_step_2d(primary_indices[1], - index_range) - j_primary_start, j_primary_step = index_to_start_step_2d(primary_indices[2], - index_range) + # 
Create the local i,j indexing on the primary element used to pull normal direction information + i_primary_start, i_primary_step = index_to_start_step_2d(primary_indices[1], + index_range) + j_primary_start, j_primary_step = index_to_start_step_2d(primary_indices[2], + index_range) - i_primary = i_primary_start - j_primary = j_primary_start - - # Get element and side index information on the secondary element - secondary_element = neighbor_ids[2, interface] - secondary_indices = node_indices[2, interface] - secondary_direction = indices2direction(secondary_indices) - - # Initiate the secondary index to be used in the surface for loop. - # This index on the primary side will always run forward but - # the secondary index might need to run backwards for flipped sides. - if :i_backward in secondary_indices - node_secondary = index_end - node_secondary_step = -1 - else - node_secondary = 1 - node_secondary_step = 1 - end + i_primary = i_primary_start + j_primary = j_primary_start - for node in eachnode(dg) - # Get the normal direction on the primary element. - # Contravariant vectors at interfaces in negative coordinate direction - # are pointing inwards. This is handled by `get_normal_direction`. 
- normal_direction = get_normal_direction(primary_direction, - contravariant_vectors, - i_primary, j_primary, - primary_element) - - calc_interface_flux!(surface_flux_values, mesh, have_nonconservative_terms, - equations, - surface_integral, dg, cache, - interface, normal_direction, - node, primary_direction, primary_element, - node_secondary, secondary_direction, secondary_element) - - # Increment primary element indices to pull the normal direction - i_primary += i_primary_step - j_primary += j_primary_step - # Increment the surface node index along the secondary element - node_secondary += node_secondary_step - end + # Get element and side index information on the secondary element + secondary_element = neighbor_ids[2, interface] + secondary_indices = node_indices[2, interface] + secondary_direction = indices2direction2d(secondary_indices) + + # Initiate the secondary index to be used in the surface for loop. + # This index on the primary side will always run forward but + # the secondary index might need to run backwards for flipped sides. + if :i_backward in secondary_indices + node_secondary = index_end + node_secondary_step = -1 + else + node_secondary = 1 + node_secondary_step = 1 + end + + for node in index_range + # Get the normal direction on the primary element. + # Contravariant vectors at interfaces in negative coordinate direction + # are pointing inwards. This is handled by `get_normal_direction`. 
+ normal_direction = get_normal_direction(primary_direction, + contravariant_vectors, + i_primary, j_primary, + primary_element) + + calc_interface_flux!(surface_flux_values, mt, have_nonconservative_terms, + equations, surface_integral, st, u_interface, interface, + normal_direction, node, primary_direction, + primary_element, node_secondary, + secondary_direction, secondary_element) + + # Increment primary element indices to pull the normal direction + i_primary += i_primary_step + j_primary += j_primary_step + # Increment the surface node index along the secondary element + node_secondary += node_secondary_step end return nothing @@ -188,19 +281,22 @@ end # Inlined version of the interface flux computation for conservation laws @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{2}, P4estMeshView{2}, - T8codeMesh{2}}, + ::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, have_nonconservative_terms::False, equations, - surface_integral, dg::DG, cache, - interface_index, normal_direction, - primary_node_index, primary_direction_index, + surface_integral, st::Type{<:DG}, + u_interface, interface_index, + normal_direction, primary_node_index, + primary_direction_index, primary_element_index, - secondary_node_index, secondary_direction_index, + secondary_node_index, + secondary_direction_index, secondary_element_index) - @unpack u = cache.interfaces @unpack surface_flux = surface_integral - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_node_index, + u_ll, u_rr = get_surface_node_vars(u_interface, equations, st, + primary_node_index, interface_index) flux_ = surface_flux(u_ll, u_rr, normal_direction, equations) @@ -215,20 +311,22 @@ end # Inlined version of the interface flux computation for equations with conservative and nonconservative terms @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + meshT::Type{<:Union{P4estMesh{2}, T8codeMesh{2}}}, 
have_nonconservative_terms::True, equations, - surface_integral, dg::DG, cache, - interface_index, normal_direction, - primary_node_index, primary_direction_index, + surface_integral, st::Type{<:DG}, + u_interface, interface_index, + normal_direction, primary_node_index, + primary_direction_index, primary_element_index, - secondary_node_index, secondary_direction_index, + secondary_node_index, + secondary_direction_index, secondary_element_index) @unpack surface_flux = surface_integral - calc_interface_flux!(surface_flux_values, mesh, have_nonconservative_terms, + calc_interface_flux!(surface_flux_values, meshT, have_nonconservative_terms, combine_conservative_and_nonconservative_fluxes(surface_flux, equations), equations, - surface_integral, dg, cache, + surface_integral, st, u_interface, interface_index, normal_direction, primary_node_index, primary_direction_index, primary_element_index, @@ -238,20 +336,19 @@ end end @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + ::Type{<:Union{P4estMesh{2}, T8codeMesh{2}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::False, equations, - surface_integral, dg::DG, cache, + surface_integral, st::Type{<:DG}, u_interface, interface_index, normal_direction, primary_node_index, primary_direction_index, primary_element_index, secondary_node_index, secondary_direction_index, secondary_element_index) - @unpack u = cache.interfaces surface_flux, nonconservative_flux = surface_integral.surface_flux - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_node_index, + u_ll, u_rr = get_surface_node_vars(u_interface, equations, st, primary_node_index, interface_index) flux_ = surface_flux(u_ll, u_rr, normal_direction, equations) @@ -265,32 +362,31 @@ end # Note the factor 0.5 necessary for the nonconservative fluxes based on # the interpretation of global SBP operators coupled discontinuously via # central fluxes/SATs - 
surface_flux_values[v, primary_node_index, primary_direction_index, primary_element_index] = (flux_[v] + - 0.5f0 * - noncons_primary[v]) - surface_flux_values[v, secondary_node_index, secondary_direction_index, secondary_element_index] = -(flux_[v] + + surface_flux_values[v, primary_node_index, primary_direction_index, primary_element_index] = Float64(flux_[v] + 0.5f0 * - noncons_secondary[v]) + noncons_primary[v]) + surface_flux_values[v, secondary_node_index, secondary_direction_index, secondary_element_index] = Float64(-(flux_[v] + + 0.5f0 * + noncons_secondary[v])) end return nothing end @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{2}, T8codeMesh{2}}, + ::Type{<:Union{P4estMesh{2}, T8codeMesh{2}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::True, equations, - surface_integral, dg::DG, cache, + surface_integral, st::Type{<:DG}, u_interface, interface_index, normal_direction, primary_node_index, primary_direction_index, primary_element_index, secondary_node_index, secondary_direction_index, secondary_element_index) - @unpack u = cache.interfaces @unpack surface_flux = surface_integral - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_node_index, + u_ll, u_rr = get_surface_node_vars(u_interface, equations, st, primary_node_index, interface_index) flux_left, flux_right = surface_flux(u_ll, u_rr, normal_direction, equations) @@ -756,7 +852,7 @@ end return nothing end -function calc_surface_integral!(du, u, +function calc_surface_integral!(backend::Nothing, du, u, mesh::Union{P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, equations, surface_integral::SurfaceIntegralWeakForm, @@ -764,6 +860,51 @@ function calc_surface_integral!(du, u, @unpack inverse_weights = dg.basis @unpack surface_flux_values = cache.elements + @threaded for element in eachelement(dg, cache) + calc_surface_integral_per_element!(du, typeof(mesh), equations, + surface_integral, dg, inverse_weights[1], + 
surface_flux_values, element) + end +end + +function calc_surface_integral!(backend::Backend, du, u, + mesh::Union{P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, cache) + nelements(dg, cache) == 0 && return nothing + @unpack inverse_weights = dg.basis + @unpack surface_flux_values = cache.elements + + kernel! = calc_surface_integral_KAkernel!(backend) + kernel!(du, typeof(mesh), equations, surface_integral, dg, inverse_weights[1], + surface_flux_values, ndrange = nelements(dg, cache)) + return nothing +end + +@kernel function calc_surface_integral_KAkernel!(du, + mT::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, factor, + surface_flux_values) + element = @index(Global) + calc_surface_integral_per_element!(du, mT, equations, surface_integral, + dg, factor, surface_flux_values, element) +end + +@inline function calc_surface_integral_per_element!(du, + ::Type{<:Union{P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, factor, + surface_flux_values, + element) # Note that all fluxes have been computed with outward-pointing normal vectors. # This computes the **negative** surface integral contribution, # i.e., M^{-1} * boundary_interpolation^T (which is for Gauss-Lobatto DGSEM just M^{-1} * B) @@ -771,33 +912,32 @@ function calc_surface_integral!(du, u, # # We also use explicit assignments instead of `+=` to let `@muladd` turn these # into FMAs (see comment at the top of the file). 
- factor = inverse_weights[1] # For LGL basis: Identical to weighted boundary interpolation at x = ±1 - @threaded for element in eachelement(dg, cache) - for l in eachnode(dg) - for v in eachvariable(equations) - # surface at -x - du[v, 1, l, element] = (du[v, 1, l, element] + - surface_flux_values[v, l, 1, element] * - factor) - - # surface at +x - du[v, nnodes(dg), l, element] = (du[v, nnodes(dg), l, element] + - surface_flux_values[v, l, 2, element] * - factor) - - # surface at -y - du[v, l, 1, element] = (du[v, l, 1, element] + - surface_flux_values[v, l, 3, element] * - factor) - - # surface at +y - du[v, l, nnodes(dg), element] = (du[v, l, nnodes(dg), element] + - surface_flux_values[v, l, 4, element] * - factor) - end + # + # factor = inverse_weights[1] + # For LGL basis: Identical to weighted boundary interpolation at x = ±1 + for l in eachnode(dg) + for v in eachvariable(equations) + # surface at -x + du[v, 1, l, element] = (du[v, 1, l, element] + + surface_flux_values[v, l, 1, element] * + factor) + + # surface at +x + du[v, nnodes(dg), l, element] = (du[v, nnodes(dg), l, element] + + surface_flux_values[v, l, 2, element] * + factor) + + # surface at -y + du[v, l, 1, element] = (du[v, l, 1, element] + + surface_flux_values[v, l, 3, element] * + factor) + + # surface at +y + du[v, l, nnodes(dg), element] = (du[v, l, nnodes(dg), element] + + surface_flux_values[v, l, 4, element] * + factor) end end - return nothing end end # @muladd diff --git a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl index 35372eecea7..d8565b6d22a 100644 --- a/src/solvers/dgsem_p4est/dg_2d_parabolic.jl +++ b/src/solvers/dgsem_p4est/dg_2d_parabolic.jl @@ -123,7 +123,7 @@ function rhs_parabolic!(du, u, t, mesh::Union{P4estMesh{2}, P4estMesh{3}}, # Calculate surface integrals. # This reuses `calc_surface_integral!` for the purely hyperbolic case. 
@trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations_parabolic, + calc_surface_integral!(nothing, du, u, mesh, equations_parabolic, dg.surface_integral, dg, cache) end @@ -158,7 +158,7 @@ function calc_gradient!(gradients, u_transformed, t, # Prolong solution to interfaces. # This reuses `prolong2interfaces` for the purely hyperbolic case. @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, u_transformed, mesh, + prolong2interfaces!(nothing, cache, u_transformed, mesh, equations_parabolic, dg) end diff --git a/src/solvers/dgsem_p4est/dg_2d_parallel.jl b/src/solvers/dgsem_p4est/dg_2d_parallel.jl index f77d625f57e..2cefabd9539 100644 --- a/src/solvers/dgsem_p4est/dg_2d_parallel.jl +++ b/src/solvers/dgsem_p4est/dg_2d_parallel.jl @@ -140,7 +140,7 @@ end @inline function calc_mpi_interface_flux!(surface_flux_values, mesh::Union{P4estMeshParallel{2}, T8codeMeshParallel{2}}, - nonconservative_terms::True, equations, + have_nonconservative_terms::True, equations, surface_integral, dg::DG, cache, interface_index, normal_direction, interface_node_index, local_side, @@ -321,7 +321,7 @@ end @inline function calc_mpi_mortar_flux!(fstar_primary, fstar_secondary, mesh::Union{P4estMeshParallel{2}, T8codeMeshParallel{2}}, - nonconservative_terms::True, equations, + have_nonconservative_terms::True, equations, surface_integral, dg::DG, cache, mortar_index, position_index, normal_direction, node_index) diff --git a/src/solvers/dgsem_p4est/dg_3d.jl b/src/solvers/dgsem_p4est/dg_3d.jl index b70661c56ba..07c6b1b6a0b 100644 --- a/src/solvers/dgsem_p4est/dg_3d.jl +++ b/src/solvers/dgsem_p4est/dg_3d.jl @@ -91,85 +91,117 @@ end return (i1, i2) end -function prolong2interfaces!(cache, u, +function prolong2interfaces!(backend::Nothing, cache, u, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, equations, dg::DG) @unpack interfaces = cache + @unpack neighbor_ids, node_indices = cache.interfaces index_range = eachnode(dg) @threaded for 
interface in eachinterface(dg, cache) - # Copy solution data from the primary element using "delayed indexing" with - # a start value and two step sizes to get the correct face and orientation. - # Note that in the current implementation, the interface will be - # "aligned at the primary element", i.e., the indices of the primary side - # will always run forwards. - primary_element = interfaces.neighbor_ids[1, interface] - primary_indices = interfaces.node_indices[1, interface] - - i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], - index_range) - j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], - index_range) - k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3], - index_range) - - i_primary = i_primary_start - j_primary = j_primary_start - k_primary = k_primary_start - for j in eachnode(dg) - for i in eachnode(dg) - for v in eachvariable(equations) - interfaces.u[1, v, i, j, interface] = u[v, - i_primary, j_primary, - k_primary, - primary_element] - end - i_primary += i_primary_step_i - j_primary += j_primary_step_i - k_primary += k_primary_step_i + prolong2interfaces_interface!(interfaces.u, u, typeof(mesh), equations, + neighbor_ids, node_indices, index_range, + interface) + end + return nothing +end + +function prolong2interfaces!(backend::Backend, cache, u, + mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + equations, dg::DG) + @unpack interfaces = cache + @unpack neighbor_ids, node_indices = cache.interfaces + index_range = eachnode(dg) + + kernel! 
= prolong2interfaces_KAkernel!(backend) + kernel!(interfaces.u, u, typeof(mesh), equations, neighbor_ids, node_indices, + index_range, + ndrange = ninterfaces(interfaces)) + return nothing +end + +@kernel function prolong2interfaces_KAkernel!(interface_u, u, meshT, equations, + neighbor_ids, node_indices, index_range) + interface = @index(Global) + prolong2interfaces_interface!(interface_u, u, meshT, equations, neighbor_ids, + node_indices, index_range, interface) +end + +@inline function prolong2interfaces_interface!(u_interface, u, + ::Type{<:Union{P4estMesh{3}, + T8codeMesh{3}}}, + equations, neighbor_ids, node_indices, + index_range, interface) + # Copy solution data from the primary element using "delayed indexing" with + # a start value and two step sizes to get the correct face and orientation. + # Note that in the current implementation, the interface will be + # "aligned at the primary element", i.e., the indices of the primary side + # will always run forwards. + primary_element = neighbor_ids[1, interface] + primary_indices = node_indices[1, interface] + + i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], + index_range) + j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], + index_range) + k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3], + index_range) + + i_primary = i_primary_start + j_primary = j_primary_start + k_primary = k_primary_start + for j in index_range + for i in index_range + for v in eachvariable(equations) + u_interface[1, v, i, j, interface] = u[v, + i_primary, j_primary, + k_primary, + primary_element] end - i_primary += i_primary_step_j - j_primary += j_primary_step_j - k_primary += k_primary_step_j + i_primary += i_primary_step_i + j_primary += j_primary_step_i + k_primary += k_primary_step_i end + i_primary += i_primary_step_j + j_primary += j_primary_step_j + k_primary += k_primary_step_j + end 
- # Copy solution data from the secondary element using "delayed indexing" with - # a start value and two step sizes to get the correct face and orientation. - secondary_element = interfaces.neighbor_ids[2, interface] - secondary_indices = interfaces.node_indices[2, interface] - - i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_indices[1], - index_range) - j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_indices[2], - index_range) - k_secondary_start, k_secondary_step_i, k_secondary_step_j = index_to_start_step_3d(secondary_indices[3], - index_range) - - i_secondary = i_secondary_start - j_secondary = j_secondary_start - k_secondary = k_secondary_start - for j in eachnode(dg) - for i in eachnode(dg) - for v in eachvariable(equations) - interfaces.u[2, v, i, j, interface] = u[v, - i_secondary, j_secondary, - k_secondary, - secondary_element] - end - i_secondary += i_secondary_step_i - j_secondary += j_secondary_step_i - k_secondary += k_secondary_step_i + # Copy solution data from the secondary element using "delayed indexing" with + # a start value and two step sizes to get the correct face and orientation. 
+ secondary_element = neighbor_ids[2, interface] + secondary_indices = node_indices[2, interface] + + i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_indices[1], + index_range) + j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_indices[2], + index_range) + k_secondary_start, k_secondary_step_i, k_secondary_step_j = index_to_start_step_3d(secondary_indices[3], + index_range) + + i_secondary = i_secondary_start + j_secondary = j_secondary_start + k_secondary = k_secondary_start + for j in index_range + for i in index_range + for v in eachvariable(equations) + u_interface[2, v, i, j, interface] = u[v, + i_secondary, j_secondary, + k_secondary, + secondary_element] end - i_secondary += i_secondary_step_j - j_secondary += j_secondary_step_j - k_secondary += k_secondary_step_j + i_secondary += i_secondary_step_i + j_secondary += j_secondary_step_i + k_secondary += k_secondary_step_i end + i_secondary += i_secondary_step_j + j_secondary += j_secondary_step_j + k_secondary += k_secondary_step_j end - return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, have_nonconservative_terms, equations, surface_integral, dg::DG, cache) @@ -178,93 +210,140 @@ function calc_interface_flux!(surface_flux_values, index_range = eachnode(dg) @threaded for interface in eachinterface(dg, cache) - # Get element and side information on the primary element - primary_element = neighbor_ids[1, interface] - primary_indices = node_indices[1, interface] - primary_direction = indices2direction(primary_indices) - - i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], - index_range) - j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], - index_range) - k_primary_start, k_primary_step_i, k_primary_step_j 
= index_to_start_step_3d(primary_indices[3], - index_range) - - i_primary = i_primary_start - j_primary = j_primary_start - k_primary = k_primary_start - - # Get element and side information on the secondary element - secondary_element = neighbor_ids[2, interface] - secondary_indices = node_indices[2, interface] - secondary_direction = indices2direction(secondary_indices) - secondary_surface_indices = surface_indices(secondary_indices) - - # Get the surface indexing on the secondary element. - # Note that the indices of the primary side will always run forward but - # the secondary indices might need to run backwards for flipped sides. - i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[1], - index_range) - j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[2], - index_range) - i_secondary = i_secondary_start - j_secondary = j_secondary_start + calc_interface_flux_interface!(surface_flux_values, + typeof(mesh), + have_nonconservative_terms, + equations, surface_integral, typeof(dg), + cache.interfaces.u, neighbor_ids, node_indices, + contravariant_vectors, index_range, interface) + end + return nothing +end + +function calc_interface_flux!(backend::Backend, surface_flux_values, + mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + have_nonconservative_terms, + equations, surface_integral, dg::DG, cache) + @unpack neighbor_ids, node_indices = cache.interfaces + @unpack contravariant_vectors = cache.elements + index_range = eachnode(dg) + + kernel! 
= calc_interface_flux_KAkernel!(backend) + kernel!(surface_flux_values, typeof(mesh), have_nonconservative_terms, equations, + surface_integral, typeof(dg), cache.interfaces.u, + neighbor_ids, node_indices, contravariant_vectors, index_range, + ndrange = ninterfaces(cache.interfaces)) + return nothing +end + +@kernel function calc_interface_flux_KAkernel!(surface_flux_values, meshT, + have_nonconservative_terms, equations, + surface_integral, solverT, u_interface, + neighbor_ids, node_indices, + contravariant_vectors, index_range) + interface = @index(Global) + calc_interface_flux_interface!(surface_flux_values, + meshT, + have_nonconservative_terms, + equations, surface_integral, solverT, u_interface, + neighbor_ids, node_indices, contravariant_vectors, + index_range, interface) +end + +@inline function calc_interface_flux_interface!(surface_flux_values, + meshT::Type{<:Union{P4estMesh{3}, + T8codeMesh{3}}}, + have_nonconservative_terms, + equations, surface_integral, + solverT::Type{<:DG}, u_interface, + neighbor_ids, + node_indices, contravariant_vectors, + index_range, interface) + # Get element and side information on the primary element + primary_element = neighbor_ids[1, interface] + primary_indices = node_indices[1, interface] + primary_direction = indices2direction(primary_indices) + + i_primary_start, i_primary_step_i, i_primary_step_j = index_to_start_step_3d(primary_indices[1], + index_range) + j_primary_start, j_primary_step_i, j_primary_step_j = index_to_start_step_3d(primary_indices[2], + index_range) + k_primary_start, k_primary_step_i, k_primary_step_j = index_to_start_step_3d(primary_indices[3], + index_range) + + i_primary = i_primary_start + j_primary = j_primary_start + k_primary = k_primary_start + + # Get element and side information on the secondary element + secondary_element = neighbor_ids[2, interface] + secondary_indices = node_indices[2, interface] + secondary_direction = indices2direction(secondary_indices) + secondary_surface_indices 
= surface_indices(secondary_indices) + + # Get the surface indexing on the secondary element. + # Note that the indices of the primary side will always run forward but + # the secondary indices might need to run backwards for flipped sides. + i_secondary_start, i_secondary_step_i, i_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[1], + index_range) + j_secondary_start, j_secondary_step_i, j_secondary_step_j = index_to_start_step_3d(secondary_surface_indices[2], + index_range) + i_secondary = i_secondary_start + j_secondary = j_secondary_start + + for j in index_range + for i in index_range + # Get the normal direction from the primary element. + # Note, contravariant vectors at interfaces in negative coordinate direction + # are pointing inwards. This is handled by `get_normal_direction`. + normal_direction = get_normal_direction(primary_direction, + contravariant_vectors, + i_primary, j_primary, k_primary, + primary_element) + + calc_interface_flux!(surface_flux_values, meshT, have_nonconservative_terms, + equations, + surface_integral, solverT, u_interface, + interface, normal_direction, + i, j, primary_direction, primary_element, + i_secondary, j_secondary, secondary_direction, + secondary_element) - for j in eachnode(dg) - for i in eachnode(dg) - # Get the normal direction from the primary element. - # Note, contravariant vectors at interfaces in negative coordinate direction - # are pointing inwards. This is handled by `get_normal_direction`. 
- normal_direction = get_normal_direction(primary_direction, - contravariant_vectors, - i_primary, j_primary, k_primary, - primary_element) - - calc_interface_flux!(surface_flux_values, mesh, - have_nonconservative_terms, - equations, - surface_integral, dg, cache, - interface, normal_direction, - i, j, primary_direction, primary_element, - i_secondary, j_secondary, secondary_direction, - secondary_element) - - # Increment the primary element indices - i_primary += i_primary_step_i - j_primary += j_primary_step_i - k_primary += k_primary_step_i - # Increment the secondary element surface indices - i_secondary += i_secondary_step_i - j_secondary += j_secondary_step_i - end # Increment the primary element indices - i_primary += i_primary_step_j - j_primary += j_primary_step_j - k_primary += k_primary_step_j + i_primary += i_primary_step_i + j_primary += j_primary_step_i + k_primary += k_primary_step_i # Increment the secondary element surface indices - i_secondary += i_secondary_step_j - j_secondary += j_secondary_step_j + i_secondary += i_secondary_step_i + j_secondary += j_secondary_step_i end + # Increment the primary element indices + i_primary += i_primary_step_j + j_primary += j_primary_step_j + k_primary += k_primary_step_j + # Increment the secondary element surface indices + i_secondary += i_secondary_step_j + j_secondary += j_secondary_step_j end - return nothing end # Inlined function for interface flux computation for conservative flux terms @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + ::Type{<:Union{P4estMesh{3}, T8codeMesh{3}}}, have_nonconservative_terms::False, equations, - surface_integral, dg::DG, cache, + surface_integral, solverT::Type{<:DG}, + u_interface, interface_index, normal_direction, primary_i_node_index, primary_j_node_index, primary_direction_index, primary_element_index, secondary_i_node_index, secondary_j_node_index, secondary_direction_index, secondary_element_index) - 
@unpack u = cache.interfaces @unpack surface_flux = surface_integral - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_i_node_index, + u_ll, u_rr = get_surface_node_vars(u_interface, equations, solverT, + primary_i_node_index, primary_j_node_index, interface_index) flux_ = surface_flux(u_ll, u_rr, normal_direction, equations) @@ -281,19 +360,21 @@ end # Inlined function for interface flux computation for flux + nonconservative terms @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + meshT::Type{<:Union{P4estMesh{3}, T8codeMesh{3}}}, have_nonconservative_terms::True, equations, - surface_integral, dg::DG, cache, + surface_integral, solverT::Type{<:DG}, + u_interface, interface_index, normal_direction, primary_i_node_index, primary_j_node_index, primary_direction_index, primary_element_index, secondary_i_node_index, secondary_j_node_index, secondary_direction_index, secondary_element_index) - calc_interface_flux!(surface_flux_values, mesh, have_nonconservative_terms, + calc_interface_flux!(surface_flux_values, meshT, have_nonconservative_terms, combine_conservative_and_nonconservative_fluxes(surface_integral.surface_flux, equations), - equations, surface_integral, dg, cache, interface_index, + equations, surface_integral, solverT, u_interface, + interface_index, normal_direction, primary_i_node_index, primary_j_node_index, primary_direction_index, primary_element_index, secondary_i_node_index, secondary_j_node_index, @@ -302,21 +383,21 @@ end end @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + ::Type{<:Union{P4estMesh{3}, T8codeMesh{3}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::False, equations, - surface_integral, dg::DG, cache, + surface_integral, solverT::Type{<:DG}, + u_interface, interface_index, normal_direction, primary_i_node_index, primary_j_node_index, primary_direction_index, 
primary_element_index, secondary_i_node_index, secondary_j_node_index, secondary_direction_index, secondary_element_index) - @unpack u = cache.interfaces surface_flux, nonconservative_flux = surface_integral.surface_flux - - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_i_node_index, + u_ll, u_rr = get_surface_node_vars(u_interface, equations, solverT, + primary_i_node_index, primary_j_node_index, interface_index) flux_ = surface_flux(u_ll, u_rr, normal_direction, equations) @@ -343,21 +424,22 @@ end end @inline function calc_interface_flux!(surface_flux_values, - mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + ::Type{<:Union{P4estMesh{3}, T8codeMesh{3}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::True, equations, - surface_integral, dg::DG, cache, + surface_integral, solverT::Type{<:DG}, + u_interface, interface_index, normal_direction, primary_i_node_index, primary_j_node_index, primary_direction_index, primary_element_index, secondary_i_node_index, secondary_j_node_index, secondary_direction_index, secondary_element_index) - @unpack u = cache.interfaces @unpack surface_flux = surface_integral - u_ll, u_rr = get_surface_node_vars(u, equations, dg, primary_i_node_index, - primary_j_node_index, interface_index) + u_ll, u_rr = get_surface_node_vars(u_interface, equations, solverT, + primary_i_node_index, primary_j_node_index, + interface_index) flux_left, flux_right = surface_flux(u_ll, u_rr, normal_direction, equations) @@ -463,7 +545,7 @@ end # inlined version of the boundary flux calculation along a physical interface @inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, - nonconservative_terms::False, equations, + have_nonconservative_terms::False, equations, surface_integral, dg::DG, cache, i_index, j_index, k_index, i_node_index, j_node_index, direction_index, @@ -499,13 +581,13 @@ end # inlined version of the boundary flux calculation along 
a physical interface @inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, - nonconservative_terms::True, equations, + have_nonconservative_terms::True, equations, surface_integral, dg::DG, cache, i_index, j_index, k_index, i_node_index, j_node_index, direction_index, element_index, boundary_index) calc_boundary_flux!(surface_flux_values, t, boundary_condition, mesh, - nonconservative_terms, + have_nonconservative_terms, combine_conservative_and_nonconservative_fluxes(surface_integral.surface_flux, equations), equations, @@ -517,7 +599,7 @@ end @inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, - nonconservative_terms::True, + have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::False, equations, surface_integral, dg::DG, cache, i_index, j_index, @@ -560,7 +642,7 @@ end @inline function calc_boundary_flux!(surface_flux_values, t, boundary_condition, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, - nonconservative_terms::True, + have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::True, equations, surface_integral, dg::DG, cache, i_index, j_index, @@ -922,13 +1004,53 @@ end return nothing end -function calc_surface_integral!(du, u, +function calc_surface_integral!(backend::Nothing, du, u, mesh::Union{P4estMesh{3}, T8codeMesh{3}}, equations, surface_integral::SurfaceIntegralWeakForm, dg::DGSEM, cache) @unpack inverse_weights = dg.basis @unpack surface_flux_values = cache.elements + @threaded for element in eachelement(dg, cache) + calc_surface_integral_element!(du, typeof(mesh), + equations, surface_integral, + dg, inverse_weights[1], + surface_flux_values, + element) + end + return nothing +end + +function calc_surface_integral!(backend::Backend, du, u, + mesh::Union{P4estMesh{3}, T8codeMesh{3}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, cache) 
+ @unpack inverse_weights = dg.basis + @unpack surface_flux_values = cache.elements + + kernel! = calc_surface_integral_KAkernel!(backend) + kernel!(du, typeof(mesh), equations, surface_integral, dg, inverse_weights[1], + surface_flux_values, ndrange = nelements(cache.elements)) + return nothing +end + +@kernel function calc_surface_integral_KAkernel!(du, meshT, equations, + surface_integral, dg, factor, + surface_flux_values) + element = @index(Global) + calc_surface_integral_element!(du, meshT, + equations, surface_integral, dg, factor, + surface_flux_values, element) +end + +@inline function calc_surface_integral_element!(du, + ::Type{<:Union{P4estMesh{3}, + T8codeMesh{3}}}, + equations, + surface_integral::SurfaceIntegralWeakForm, + dg::DGSEM, factor, surface_flux_values, + element) # Note that all fluxes have been computed with outward-pointing normal vectors. # This computes the **negative** surface integral contribution, # i.e., M^{-1} * boundary_interpolation^T (which is for Gauss-Lobatto DGSEM just M^{-1} * B) @@ -936,49 +1058,48 @@ function calc_surface_integral!(du, u, # # We also use explicit assignments instead of `+=` to let `@muladd` turn these # into FMAs (see comment at the top of the file). 
- factor = inverse_weights[1] # For LGL basis: Identical to weighted boundary interpolation at x = ±1 - @threaded for element in eachelement(dg, cache) - for m in eachnode(dg), l in eachnode(dg) - for v in eachvariable(equations) - # surface at -x - du[v, 1, l, m, element] = (du[v, 1, l, m, element] + - surface_flux_values[v, l, m, 1, - element] * - factor) - - # surface at +x - du[v, nnodes(dg), l, m, element] = (du[v, nnodes(dg), l, m, element] + - surface_flux_values[v, l, m, 2, - element] * - factor) - - # surface at -y - du[v, l, 1, m, element] = (du[v, l, 1, m, element] + - surface_flux_values[v, l, m, 3, - element] * - factor) - - # surface at +y - du[v, l, nnodes(dg), m, element] = (du[v, l, nnodes(dg), m, element] + - surface_flux_values[v, l, m, 4, - element] * - factor) - - # surface at -z - du[v, l, m, 1, element] = (du[v, l, m, 1, element] + - surface_flux_values[v, l, m, 5, - element] * - factor) - - # surface at +z - du[v, l, m, nnodes(dg), element] = (du[v, l, m, nnodes(dg), element] + - surface_flux_values[v, l, m, 6, - element] * - factor) - end + # + # factor = inverse_weights[1] + # For LGL basis: Identical to weighted boundary interpolation at x = ±1 + for m in eachnode(dg), l in eachnode(dg) + for v in eachvariable(equations) + # surface at -x + du[v, 1, l, m, element] = (du[v, 1, l, m, element] + + surface_flux_values[v, l, m, 1, + element] * + factor) + + # surface at +x + du[v, nnodes(dg), l, m, element] = (du[v, nnodes(dg), l, m, element] + + surface_flux_values[v, l, m, 2, + element] * + factor) + + # surface at -y + du[v, l, 1, m, element] = (du[v, l, 1, m, element] + + surface_flux_values[v, l, m, 3, + element] * + factor) + + # surface at +y + du[v, l, nnodes(dg), m, element] = (du[v, l, nnodes(dg), m, element] + + surface_flux_values[v, l, m, 4, + element] * + factor) + + # surface at -z + du[v, l, m, 1, element] = (du[v, l, m, 1, element] + + surface_flux_values[v, l, m, 5, + element] * + factor) + + # surface at +z + du[v, l, m, 
nnodes(dg), element] = (du[v, l, m, nnodes(dg), element] + + surface_flux_values[v, l, m, 6, + element] * + factor) end end - return nothing end end # @muladd diff --git a/src/solvers/dgsem_p4est/dg_3d_parallel.jl b/src/solvers/dgsem_p4est/dg_3d_parallel.jl index bb5ee8e9196..d062d1fdee4 100644 --- a/src/solvers/dgsem_p4est/dg_3d_parallel.jl +++ b/src/solvers/dgsem_p4est/dg_3d_parallel.jl @@ -5,7 +5,7 @@ @muladd begin #! format: noindent -function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::Union{P4estMeshParallel{3}, T8codeMeshParallel{3}}, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} @@ -33,19 +33,19 @@ function rhs!(du, u, t, # Calculate volume integral @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, u, mesh, + calc_volume_integral!(backend, du, u, mesh, have_nonconservative_terms(equations), equations, dg.volume_integral, dg, cache) end # Prolong solution to interfaces @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, u, mesh, equations, dg) + prolong2interfaces!(backend, cache, u, mesh, equations, dg) end # Calculate interface fluxes @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache.elements.surface_flux_values, mesh, + calc_interface_flux!(backend, cache.elements.surface_flux_values, mesh, have_nonconservative_terms(equations), equations, dg.surface_integral, dg, cache) end @@ -95,11 +95,13 @@ function rhs!(du, u, t, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, dg.surface_integral, dg, cache) + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, + cache) end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit 
timer() "source terms" begin diff --git a/src/solvers/dgsem_p4est/dg_3d_subcell_limiters.jl b/src/solvers/dgsem_p4est/dg_3d_subcell_limiters.jl index ed57b91ce1c..a8036aaf22e 100644 --- a/src/solvers/dgsem_p4est/dg_3d_subcell_limiters.jl +++ b/src/solvers/dgsem_p4est/dg_3d_subcell_limiters.jl @@ -60,7 +60,7 @@ end # Subcell limiting currently only implemented for certain mesh types @inline function volume_integral_kernel!(du, u, element, - mesh::P4estMesh{3}, + meshT::Type{<:P4estMesh{3}}, nonconservative_terms, equations, volume_integral::VolumeIntegralSubcellLimiting, dg::DGSEM, cache) @@ -77,7 +77,7 @@ end fhat3_L = fhat3_L_threaded[Threads.threadid()] fhat3_R = fhat3_R_threaded[Threads.threadid()] calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R, - u, mesh, nonconservative_terms, equations, volume_flux_dg, + u, meshT, nonconservative_terms, equations, volume_flux_dg, dg, element, cache) # low-order FV fluxes @@ -90,13 +90,13 @@ end fstar3_L = fstar3_L_threaded[Threads.threadid()] fstar3_R = fstar3_R_threaded[Threads.threadid()] calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, - u, mesh, nonconservative_terms, equations, volume_flux_fv, + u, meshT, nonconservative_terms, equations, volume_flux_fv, dg, element, cache) # antidiffusive flux calcflux_antidiffusive!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R, fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, - u, mesh, nonconservative_terms, equations, limiter, + u, meshT, nonconservative_terms, equations, limiter, dg, element, cache) # Calculate volume integral contribution of low-order FV flux @@ -119,7 +119,7 @@ end # # See also `flux_differencing_kernel!`. 
@inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R, - u, mesh::P4estMesh{3}, + u, ::Type{<:P4estMesh{3}}, nonconservative_terms::False, equations, volume_flux, dg::DGSEM, element, cache) (; contravariant_vectors) = cache.elements @@ -263,7 +263,7 @@ end # Discretizations of Non-Conservative Systems. https://arxiv.org/pdf/2211.14009.pdf. # @inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fhat3_L, fhat3_R, - u, mesh::P4estMesh{3}, + u, ::Type{<:P4estMesh{3}}, nonconservative_terms::True, equations, volume_flux::Tuple{F_CONS, F_NONCONS}, dg::DGSEM, element, @@ -550,7 +550,7 @@ end fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, - u, mesh::P4estMesh{3}, + u, ::Type{<:P4estMesh{3}}, nonconservative_terms::False, equations, limiter::SubcellLimiterIDP, dg, element, cache) @unpack antidiffusive_flux1_L, antidiffusive_flux1_R, antidiffusive_flux2_L, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R = cache.antidiffusive_fluxes @@ -601,7 +601,7 @@ end fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, - u, mesh::P4estMesh{3}, + u, ::Type{<:P4estMesh{3}}, nonconservative_terms::True, equations, limiter::SubcellLimiterIDP, dg, element, cache) @unpack antidiffusive_flux1_L, antidiffusive_flux2_L, antidiffusive_flux1_R, antidiffusive_flux2_R, antidiffusive_flux3_L, antidiffusive_flux3_R = cache.antidiffusive_fluxes diff --git a/src/solvers/dgsem_structured/dg.jl b/src/solvers/dgsem_structured/dg.jl index cc0243a9989..8a71c7ad884 100644 --- a/src/solvers/dgsem_structured/dg.jl +++ b/src/solvers/dgsem_structured/dg.jl @@ -38,7 +38,7 @@ function calc_boundary_flux!(cache, u, t, boundary_condition::BoundaryConditionP return nothing end -function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::Union{StructuredMesh, StructuredMeshView{2}}, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} @@ -47,7 +47,7 @@ function rhs!(du, u, t, # Calculate volume 
integral @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, u, mesh, + calc_volume_integral!(backend, du, u, mesh, have_nonconservative_terms(equations), equations, dg.volume_integral, dg, cache) end @@ -67,12 +67,13 @@ function rhs!(du, u, t, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, cache) end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin diff --git a/src/solvers/dgsem_structured/dg_1d.jl b/src/solvers/dgsem_structured/dg_1d.jl index 672dbe65ebf..57327fa679d 100644 --- a/src/solvers/dgsem_structured/dg_1d.jl +++ b/src/solvers/dgsem_structured/dg_1d.jl @@ -69,7 +69,7 @@ function calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, return nothing end -function apply_jacobian!(du, mesh::StructuredMesh{1}, +function apply_jacobian!(backend::Nothing, du, mesh::StructuredMesh{1}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements diff --git a/src/solvers/dgsem_structured/dg_2d.jl b/src/solvers/dgsem_structured/dg_2d.jl index b92944fb338..b7cb31c7b0f 100644 --- a/src/solvers/dgsem_structured/dg_2d.jl +++ b/src/solvers/dgsem_structured/dg_2d.jl @@ -29,9 +29,10 @@ See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-17 =# @inline function weak_form_kernel!(du, u, element, - mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, - UnstructuredMesh2D, P4estMesh{2}, - P4estMeshView{2}, T8codeMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, P4estMesh{2}, + P4estMeshView{2}, T8codeMesh{2}}}, have_nonconservative_terms::False, equations, dg::DGSEM, 
cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -70,10 +71,11 @@ See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-17 end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{2}, - StructuredMeshView{2}, - UnstructuredMesh2D, P4estMesh{2}, - T8codeMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, + T8codeMesh{2}}}, have_nonconservative_terms::False, equations, volume_flux, dg::DGSEM, cache, alpha = true) @unpack derivative_split = dg.basis @@ -133,13 +135,14 @@ end end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{2}, - StructuredMeshView{2}, - UnstructuredMesh2D, P4estMesh{2}, - T8codeMesh{2}}, + meshT::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, + T8codeMesh{2}}}, have_nonconservative_terms::True, equations, volume_flux, dg::DGSEM, cache, alpha = true) - flux_differencing_kernel!(du, u, element, mesh, have_nonconservative_terms, + flux_differencing_kernel!(du, u, element, meshT, have_nonconservative_terms, combine_conservative_and_nonconservative_fluxes(volume_flux, equations), equations, @@ -149,10 +152,11 @@ end end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{2}, - StructuredMeshView{2}, - UnstructuredMesh2D, P4estMesh{2}, - T8codeMesh{2}}, + meshT::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, + T8codeMesh{2}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::False, equations, @@ -162,7 +166,7 @@ end symmetric_flux, nonconservative_flux = volume_flux # Apply the symmetric flux as usual - flux_differencing_kernel!(du, u, element, mesh, False(), equations, symmetric_flux, + flux_differencing_kernel!(du, u, element, meshT, False(), equations, symmetric_flux, dg, cache, 
alpha) # Calculate the remaining volume terms using the nonsymmetric generalized flux @@ -222,10 +226,11 @@ end end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{2}, - StructuredMeshView{2}, - UnstructuredMesh2D, P4estMesh{2}, - T8codeMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, + T8codeMesh{2}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::True, equations, @@ -293,9 +298,9 @@ end end @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, - UnstructuredMesh2D, - P4estMesh{2}, T8codeMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, T8codeMesh{2}}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache) @unpack normal_vectors_1, normal_vectors_2 = cache.normal_vectors @@ -335,9 +340,9 @@ end end @inline function calcflux_fvO2!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, - UnstructuredMesh2D, - P4estMesh{2}, T8codeMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, T8codeMesh{2}}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -416,9 +421,9 @@ end end @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::Union{StructuredMesh{2}, StructuredMesh{2}, - UnstructuredMesh2D, - P4estMesh{2}, T8codeMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, StructuredMesh{2}, + UnstructuredMesh2D, + P4estMesh{2}, T8codeMesh{2}}}, have_nonconservative_terms::True, equations, volume_flux_fv, dg::DGSEM, element, cache) @unpack normal_vectors_1, normal_vectors_2 = cache.normal_vectors @@ -711,26 +716,61 @@ function calc_boundary_flux!(cache, u, t, 
boundary_conditions::NamedTuple, return nothing end -function apply_jacobian!(du, +function apply_jacobian!(backend::Nothing, du, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements - @threaded for element in eachelement(dg, cache) - for j in eachnode(dg), i in eachnode(dg) - # Negative sign included to account for the negated surface and volume terms, - # see e.g. the computation of `derivative_hat` in the basis setup and - # the comment in `calc_surface_integral!`. - factor = -inverse_jacobian[i, j, element] - - for v in eachvariable(equations) - du[v, i, j, element] *= factor - end - end + apply_jacobian_per_element!(du, typeof(mesh), equations, dg, inverse_jacobian, + element) end +end +function apply_jacobian!(backend::Backend, du, + mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + equations, dg::DG, cache) + nelements(dg, cache) == 0 && return nothing + @unpack inverse_jacobian = cache.elements + kernel! = apply_jacobian_KAkernel!(backend) + kernel!(du, typeof(mesh), equations, dg, inverse_jacobian, + ndrange = nelements(dg, cache)) +end + +@kernel function apply_jacobian_KAkernel!(du, + mT::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + equations, dg::DG, inverse_jacobian) + element = @index(Global) + apply_jacobian_per_element!(du, mT, equations, dg, inverse_jacobian, element) +end + +@inline function apply_jacobian_per_element!(du, + ::Type{<:Union{StructuredMesh{2}, + StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, + P4estMeshView{2}, + T8codeMesh{2}}}, + equations, dg::DG, inverse_jacobian, + element) + for j in eachnode(dg), i in eachnode(dg) + # Negative sign included to account for the negated surface and volume terms, + # see e.g. 
the computation of `derivative_hat` in the basis setup and + # the comment in `calc_surface_integral!`. + factor = -inverse_jacobian[i, j, element] + + for v in eachvariable(equations) + du[v, i, j, element] *= factor + end + end return nothing end end # @muladd diff --git a/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl b/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl index c2956d027b8..508d3c92d82 100644 --- a/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl +++ b/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl @@ -19,26 +19,27 @@ # works efficiently here. @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, - mesh::Union{StructuredMesh{2}, - UnstructuredMesh2D, P4estMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, + UnstructuredMesh2D, P4estMesh{2}}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_shima_etal_turbo), dg::DGSEM, cache, alpha) @unpack derivative_split = dg.basis @unpack contravariant_vectors = cache.elements + ndims = 2 # Create a temporary array that will be used to store the RHS with permuted # indices `[i, j, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims)..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. 
u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims)..., StaticInt(nvariables(equations)))) @turbo for j in eachnode(dg), i in eachnode(dg) @@ -82,7 +83,7 @@ contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims))) @turbo for j in eachnode(dg), i in eachnode(dg) contravariant_vectors_x[j, i, 1] = contravariant_vectors[1, 1, i, j, element] @@ -155,7 +156,7 @@ contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims))) @turbo for j in eachnode(dg), i in eachnode(dg) contravariant_vectors_y[i, j, 1] = contravariant_vectors[1, 2, i, j, element] @@ -226,20 +227,21 @@ end @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, - mesh::Union{StructuredMesh{2}, - UnstructuredMesh2D, P4estMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, + UnstructuredMesh2D, P4estMesh{2}}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_ranocha_turbo), dg::DGSEM, cache, alpha) @unpack derivative_split = dg.basis @unpack contravariant_vectors = cache.elements + ndims = 2 # Create a temporary array that will be used to store the RHS with permuted # indices `[i, j, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims)..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. In addition @@ -248,7 +250,7 @@ end # values. 
u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims)..., StaticInt(nvariables(equations) + 2))) # We also compute "+ 2" logs @turbo for j in eachnode(dg), i in eachnode(dg) @@ -294,7 +296,7 @@ end contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims))) @turbo for j in eachnode(dg), i in eachnode(dg) contravariant_vectors_x[j, i, 1] = contravariant_vectors[1, 1, i, j, element] @@ -400,7 +402,7 @@ end contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) contravariant_vectors_y[i, j, 1] = contravariant_vectors[1, 2, i, j, element] diff --git a/src/solvers/dgsem_structured/dg_2d_subcell_limiters.jl b/src/solvers/dgsem_structured/dg_2d_subcell_limiters.jl index f29de2d912d..b0c65960f6c 100644 --- a/src/solvers/dgsem_structured/dg_2d_subcell_limiters.jl +++ b/src/solvers/dgsem_structured/dg_2d_subcell_limiters.jl @@ -10,7 +10,7 @@ # # See also `flux_differencing_kernel!`. @inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, - mesh::Union{StructuredMesh{2}, P4estMesh{2}}, + ::Type{<:Union{StructuredMesh{2}, P4estMesh{2}}}, have_nonconservative_terms::False, equations, volume_flux, dg::DGSEM, element, cache) (; contravariant_vectors) = cache.elements @@ -111,8 +111,8 @@ end # Discretizations of Non-Conservative Systems. https://arxiv.org/pdf/2211.14009.pdf. 
# @inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, - mesh::Union{StructuredMesh{2}, P4estMesh{2}}, - nonconservative_terms::True, equations, + ::Type{<:Union{StructuredMesh{2}, P4estMesh{2}}}, + have_nonconservative_terms::True, equations, volume_flux::Tuple{F_CONS, F_NONCONS}, dg::DGSEM, element, cache) where { @@ -315,8 +315,8 @@ end # The calculation of the non-conservative staggered "fluxes" requires non-conservative # terms that can be written as a product of local and jump contributions. @inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, - mesh::Union{StructuredMesh{2}, P4estMesh{2}}, - nonconservative_terms::True, equations, + ::Type{<:Union{StructuredMesh{2}, P4estMesh{2}}}, + have_nonconservative_terms::True, equations, volume_flux::Tuple{F_CONS, F_NONCONS}, dg::DGSEM, element, cache) where { diff --git a/src/solvers/dgsem_structured/dg_3d.jl b/src/solvers/dgsem_structured/dg_3d.jl index 7b0414f66fb..146aba255d1 100644 --- a/src/solvers/dgsem_structured/dg_3d.jl +++ b/src/solvers/dgsem_structured/dg_3d.jl @@ -4,7 +4,6 @@ # See https://ranocha.de/blog/Optimizing_EC_Trixi for further details. @muladd begin #! 
format: noindent - function create_cache(mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, @@ -32,8 +31,8 @@ See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-17 =# @inline function weak_form_kernel!(du, u, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms::False, equations, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -89,8 +88,9 @@ end # mapping terms, stored in `contravariant_vectors`, is peeled apart from the evaluation of # the physical fluxes in each Cartesian direction @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms::False, equations, volume_flux, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -171,11 +171,12 @@ end end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + meshT::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms::True, equations, volume_flux, dg::DGSEM, cache, alpha = true) - flux_differencing_kernel!(du, u, element, mesh, have_nonconservative_terms, + flux_differencing_kernel!(du, u, element, meshT, have_nonconservative_terms, combine_conservative_and_nonconservative_fluxes(volume_flux, equations), equations, volume_flux, dg, cache, alpha) @@ -184,8 +185,9 @@ end end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + meshT::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::False, 
equations, @@ -195,7 +197,7 @@ end symmetric_flux, nonconservative_flux = volume_flux # Apply the symmetric flux as usual - flux_differencing_kernel!(du, u, element, mesh, False(), equations, symmetric_flux, + flux_differencing_kernel!(du, u, element, meshT, False(), equations, symmetric_flux, dg, cache, alpha) # Calculate the remaining volume terms using the nonsymmetric generalized flux @@ -274,8 +276,9 @@ end end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms::True, combine_conservative_and_nonconservative_fluxes::True, equations, @@ -369,8 +372,8 @@ end # [arXiv: 2008.12044v2](https://arxiv.org/pdf/2008.12044) @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache) @unpack contravariant_vectors = cache.elements @@ -430,8 +433,8 @@ end @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms::True, equations, volume_flux_fv, dg::DGSEM, element, cache) @unpack contravariant_vectors = cache.elements @@ -526,8 +529,8 @@ end @inline function calcflux_fvO2!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, - mesh::Union{StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -896,24 +899,48 @@ function 
calc_boundary_flux!(cache, u, t, boundary_conditions::NamedTuple, return nothing end -function apply_jacobian!(du, +function apply_jacobian!(backend::Nothing, du, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements - @threaded for element in eachelement(dg, cache) - for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - # Negative sign included to account for the negated surface and volume terms, - # see e.g. the computation of `derivative_hat` in the basis setup and - # the comment in `calc_surface_integral!`. - factor = -inverse_jacobian[i, j, k, element] - - for v in eachvariable(equations) - du[v, i, j, k, element] *= factor - end - end + apply_jacobian_element!(du, typeof(mesh), equations, dg, inverse_jacobian, + element) end + return nothing +end +function apply_jacobian!(backend::Backend, du, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, + equations, dg::DG, cache) + @unpack inverse_jacobian = cache.elements + + kernel! = apply_jacobian_KAkernel!(backend) + kernel!(du, typeof(mesh), equations, dg, inverse_jacobian, + ndrange = nelements(cache.elements)) + return nothing +end + +@kernel function apply_jacobian_KAkernel!(du, meshT, equations, dg::DG, + inverse_jacobian) + element = @index(Global) + apply_jacobian_element!(du, meshT, equations, dg, inverse_jacobian, element) +end + +@inline function apply_jacobian_element!(du, + ::Type{<:Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}}, + equations, dg, inverse_jacobian, element) + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + # Negative sign included to account for the negated surface and volume terms, + # see e.g. the computation of `derivative_hat` in the basis setup and + # the comment in `calc_surface_integral!`. 
+ factor = -inverse_jacobian[i, j, k, element] + + for v in eachvariable(equations) + du[v, i, j, k, element] *= factor + end + end return nothing end end # @muladd diff --git a/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl b/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl index 8b710417ff7..9143286b88e 100644 --- a/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl +++ b/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl @@ -19,25 +19,26 @@ # works efficiently here. @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, P4estMesh{3}}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations3D, volume_flux::typeof(flux_shima_etal_turbo), dg::DGSEM, cache, alpha) @unpack derivative_split = dg.basis @unpack contravariant_vectors = cache.elements + ndims = 3 # Create a temporary array that will be used to store the RHS with permuted # indices `[i, j, k, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims)..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. 
u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims)..., StaticInt(nvariables(equations)))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) @@ -88,7 +89,7 @@ contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) jk = j + nnodes(dg) * (k - 1) @@ -176,7 +177,7 @@ (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k, element] @@ -264,7 +265,7 @@ contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) ij = i + nnodes(dg) * (j - 1) @@ -351,19 +352,20 @@ end @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, - mesh::Union{StructuredMesh{3}, P4estMesh{3}}, + ::Type{<:Union{StructuredMesh{3}, P4estMesh{3}}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations3D, volume_flux::typeof(flux_ranocha_turbo), dg::DGSEM, cache, alpha) @unpack derivative_split = dg.basis @unpack contravariant_vectors = cache.elements + ndims = 3 # Create a temporary array that will be used to store the RHS with permuted # indices `[i, j, k, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims)..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. 
In addition @@ -372,7 +374,7 @@ end # values. u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims)..., StaticInt(nvariables(equations) + 2))) # We also compute "+ 2" logs @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) @@ -425,7 +427,7 @@ end contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) jk = j + nnodes(dg) * (k - 1) @@ -546,7 +548,7 @@ end (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k, element] @@ -667,7 +669,7 @@ end contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef, (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)), - StaticInt(ndims(mesh)))) + StaticInt(ndims))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) ij = i + nnodes(dg) * (j - 1) diff --git a/src/solvers/dgsem_tree/dg_1d.jl b/src/solvers/dgsem_tree/dg_1d.jl index c472e071ab3..74934a1b4b4 100644 --- a/src/solvers/dgsem_tree/dg_1d.jl +++ b/src/solvers/dgsem_tree/dg_1d.jl @@ -55,7 +55,7 @@ end # TODO: Taal discuss/refactor timer, allowing users to pass a custom timer? 
-function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::TreeMesh{1}, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} @@ -64,7 +64,7 @@ function rhs!(du, u, t, # Calculate volume integral @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, u, mesh, + calc_volume_integral!(backend, du, u, mesh, have_nonconservative_terms(equations), equations, dg.volume_integral, dg, cache) end @@ -94,12 +94,13 @@ function rhs!(du, u, t, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, cache) end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin @@ -117,7 +118,8 @@ This treatment is required to achieve, e.g., entropy-stability or well-balancedn See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-1765644064 =# @inline function weak_form_kernel!(du, u, - element, mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + element, + ::Type{<:Union{TreeMesh{1}, StructuredMesh{1}}}, have_nonconservative_terms::False, equations, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -138,7 +140,8 @@ See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-17 end @inline function flux_differencing_kernel!(du, u, element, - mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + ::Type{<:Union{TreeMesh{1}, + StructuredMesh{1}}}, have_nonconservative_terms::False, equations, volume_flux, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -167,7 +170,8 @@ end end @inline function 
flux_differencing_kernel!(du, u, element, - mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + meshT::Type{<:Union{TreeMesh{1}, + StructuredMesh{1}}}, have_nonconservative_terms::True, equations, volume_flux, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -176,7 +180,7 @@ end symmetric_flux, nonconservative_flux = volume_flux # Apply the symmetric flux as usual - flux_differencing_kernel!(du, u, element, mesh, False(), equations, symmetric_flux, + flux_differencing_kernel!(du, u, element, meshT, False(), equations, symmetric_flux, dg, cache, alpha) # Calculate the remaining volume terms using the nonsymmetric generalized flux @@ -202,7 +206,7 @@ end end @inline function fv_kernel!(du, u, - mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + meshT::Type{<:Union{TreeMesh{1}, StructuredMesh{1}}}, have_nonconservative_terms, equations, volume_flux_fv, dg::DGSEM, cache, element, alpha = true) @unpack fstar1_L_threaded, fstar1_R_threaded = cache @@ -211,7 +215,7 @@ end # Calculate FV two-point fluxes fstar1_L = fstar1_L_threaded[Threads.threadid()] fstar1_R = fstar1_R_threaded[Threads.threadid()] - calcflux_fv!(fstar1_L, fstar1_R, u, mesh, + calcflux_fv!(fstar1_L, fstar1_R, u, meshT, have_nonconservative_terms, equations, volume_flux_fv, dg, element, cache) @@ -228,7 +232,7 @@ end end @inline function fvO2_kernel!(du, u, - mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + meshT::Type{<:Union{TreeMesh{1}, StructuredMesh{1}}}, nonconservative_terms, equations, volume_flux_fv, dg::DGSEM, cache, element, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -240,7 +244,7 @@ end # Calculate FV two-point fluxes fstar1_L = fstar1_L_threaded[Threads.threadid()] fstar1_R = fstar1_R_threaded[Threads.threadid()] - calcflux_fvO2!(fstar1_L, fstar1_R, u, mesh, nonconservative_terms, equations, + calcflux_fvO2!(fstar1_L, fstar1_R, u, meshT, nonconservative_terms, equations, volume_flux_fv, dg, element, cache, 
sc_interface_coords, reconstruction_mode, slope_limiter, cons2recon, recon2cons) @@ -262,7 +266,7 @@ end # "A provably entropy stable subcell shock capturing approach for high order split form DG for the compressible Euler equations" # [arXiv: 2008.12044v2](https://arxiv.org/pdf/2008.12044) @inline function calcflux_fv!(fstar1_L, fstar1_R, u, - mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + ::Type{<:Union{TreeMesh{1}, StructuredMesh{1}}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache) for i in 2:nnodes(dg) @@ -277,7 +281,7 @@ end end @inline function calcflux_fv!(fstar1_L, fstar1_R, u, - mesh::TreeMesh{1}, + ::Type{<:TreeMesh{1}}, have_nonconservative_terms::True, equations, volume_flux_fv, dg::DGSEM, element, cache) volume_flux, nonconservative_flux = volume_flux_fv @@ -308,7 +312,7 @@ end # "An entropy stable nodal discontinuous Galerkin method for the resistive MHD equations. Part II: Subcell finite volume shock capturing" # [JCP: 2021.110580](https://doi.org/10.1016/j.jcp.2021.110580) @inline function calcflux_fvO2!(fstar1_L, fstar1_R, u, - mesh::Union{TreeMesh{1}, StructuredMesh{1}}, + ::Type{<:Union{TreeMesh{1}, StructuredMesh{1}}}, nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -616,7 +620,8 @@ function calc_boundary_flux_by_direction!(surface_flux_values::AbstractArray{<:A return nothing end -function calc_surface_integral!(du, u, mesh::Union{TreeMesh{1}, StructuredMesh{1}}, +function calc_surface_integral!(backend::Nothing, du, u, + mesh::Union{TreeMesh{1}, StructuredMesh{1}}, equations, surface_integral::SurfaceIntegralWeakForm, dg::DGSEM, cache) @unpack inverse_weights = dg.basis @@ -646,7 +651,8 @@ function calc_surface_integral!(du, u, mesh::Union{TreeMesh{1}, StructuredMesh{1 return nothing end -function calc_surface_integral!(du, u, mesh::Union{TreeMesh{1}, StructuredMesh{1}}, +function 
calc_surface_integral!(backend::Nothing, du, u, + mesh::Union{TreeMesh{1}, StructuredMesh{1}}, equations, surface_integral::SurfaceIntegralWeakForm, dg::DG{<:GaussLegendreBasis}, cache) @unpack boundary_interpolation_inverse_weights = dg.basis @@ -677,7 +683,7 @@ function calc_surface_integral!(du, u, mesh::Union{TreeMesh{1}, StructuredMesh{1 return nothing end -function apply_jacobian!(du, mesh::TreeMesh{1}, +function apply_jacobian!(backend::Nothing, du, mesh::TreeMesh{1}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements diff --git a/src/solvers/dgsem_tree/dg_1d_parabolic.jl b/src/solvers/dgsem_tree/dg_1d_parabolic.jl index ac06ec4dfac..a6bfa09fdaa 100644 --- a/src/solvers/dgsem_tree/dg_1d_parabolic.jl +++ b/src/solvers/dgsem_tree/dg_1d_parabolic.jl @@ -107,7 +107,7 @@ function rhs_parabolic!(du, u, t, mesh::TreeMesh{1}, # Calculate surface integrals. # This reuses `calc_surface_integral!` for the purely hyperbolic case. @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations_parabolic, + calc_surface_integral!(nothing, du, u, mesh, equations_parabolic, dg.surface_integral, dg, cache) end diff --git a/src/solvers/dgsem_tree/dg_2d.jl b/src/solvers/dgsem_tree/dg_2d.jl index c1a845ab06d..7b5bc8e7328 100644 --- a/src/solvers/dgsem_tree/dg_2d.jl +++ b/src/solvers/dgsem_tree/dg_2d.jl @@ -100,7 +100,7 @@ end # TODO: Taal discuss/refactor timer, allowing users to pass a custom timer? 
-function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::Union{TreeMesh{2}, P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}, TreeMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, @@ -111,19 +111,19 @@ function rhs!(du, u, t, # Calculate volume integral @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, u, mesh, + calc_volume_integral!(backend, du, u, mesh, have_nonconservative_terms(equations), equations, dg.volume_integral, dg, cache) end # Prolong solution to interfaces @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, u, mesh, equations, dg) + prolong2interfaces!(backend, cache, u, mesh, equations, dg) end # Calculate interface fluxes @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache.elements.surface_flux_values, mesh, + calc_interface_flux!(backend, cache.elements.surface_flux_values, mesh, have_nonconservative_terms(equations), equations, dg.surface_integral, dg, cache) end @@ -154,12 +154,13 @@ function rhs!(du, u, t, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, cache) end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin @@ -177,7 +178,7 @@ This treatment is required to achieve, e.g., entropy-stability or well-balancedn See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-1765644064 =# @inline function weak_form_kernel!(du, u, - element, mesh::TreeMesh{2}, + element, ::Type{<:TreeMesh{2}}, have_nonconservative_terms::False, equations, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ 
-204,7 +205,7 @@ See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-17 return nothing end -@inline function flux_differencing_kernel!(du, u, element, mesh::TreeMesh{2}, +@inline function flux_differencing_kernel!(du, u, element, ::Type{<:TreeMesh{2}}, have_nonconservative_terms::False, equations, volume_flux, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -242,7 +243,7 @@ end end end -@inline function flux_differencing_kernel!(du, u, element, mesh::TreeMesh{2}, +@inline function flux_differencing_kernel!(du, u, element, meshT::Type{<:TreeMesh{2}}, have_nonconservative_terms::True, equations, volume_flux, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -251,7 +252,7 @@ end symmetric_flux, nonconservative_flux = volume_flux # Apply the symmetric flux as usual - flux_differencing_kernel!(du, u, element, mesh, False(), equations, symmetric_flux, + flux_differencing_kernel!(du, u, element, meshT, False(), equations, symmetric_flux, dg, cache, alpha) # Calculate the remaining volume terms using the nonsymmetric generalized flux @@ -285,9 +286,9 @@ end end @inline function fvO2_kernel!(du, u, - mesh::Union{TreeMesh{2}, StructuredMesh{2}, - UnstructuredMesh2D, P4estMesh{2}, - T8codeMesh{2}}, + meshT::Type{<:Union{TreeMesh{2}, StructuredMesh{2}, + UnstructuredMesh2D, P4estMesh{2}, + T8codeMesh{2}}}, have_nonconservative_terms, equations, volume_flux_fv, dg::DGSEM, cache, element, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -301,7 +302,7 @@ end fstar2_L = fstar2_L_threaded[Threads.threadid()] fstar1_R = fstar1_R_threaded[Threads.threadid()] fstar2_R = fstar2_R_threaded[Threads.threadid()] - calcflux_fvO2!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, mesh, + calcflux_fvO2!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, meshT, have_nonconservative_terms, equations, volume_flux_fv, dg, element, cache, 
sc_interface_coords, reconstruction_mode, slope_limiter, @@ -322,7 +323,7 @@ end end @inline function calcflux_fvO2!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::TreeMesh{2}, + ::Type{<:TreeMesh{2}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -389,9 +390,9 @@ end end @inline function fv_kernel!(du, u, - mesh::Union{TreeMesh{2}, StructuredMesh{2}, - UnstructuredMesh2D, P4estMesh{2}, - T8codeMesh{2}}, + meshT::Type{<:Union{TreeMesh{2}, StructuredMesh{2}, + UnstructuredMesh2D, P4estMesh{2}, + T8codeMesh{2}}}, have_nonconservative_terms, equations, volume_flux_fv, dg::DGSEM, cache, element, alpha = true) @unpack fstar1_L_threaded, fstar1_R_threaded, fstar2_L_threaded, fstar2_R_threaded = cache @@ -402,7 +403,7 @@ end fstar2_L = fstar2_L_threaded[Threads.threadid()] fstar1_R = fstar1_R_threaded[Threads.threadid()] fstar2_R = fstar2_R_threaded[Threads.threadid()] - calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, mesh, + calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, meshT, have_nonconservative_terms, equations, volume_flux_fv, dg, element, cache) @@ -425,7 +426,7 @@ end # "A provably entropy stable subcell shock capturing approach for high order split form DG for the compressible Euler equations" # [arXiv: 2008.12044v2](https://arxiv.org/pdf/2008.12044) @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::TreeMesh{2}, + ::Type{<:TreeMesh{2}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache) for j in eachnode(dg), i in 2:nnodes(dg) @@ -448,7 +449,7 @@ end end @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::TreeMesh{2}, + ::Type{<:TreeMesh{2}}, have_nonconservative_terms::True, equations, volume_flux_fv, dg::DGSEM, element, cache) volume_flux, nonconservative_flux = volume_flux_fv @@ -496,7 +497,8 @@ end return nothing end -function 
prolong2interfaces!(cache, u, mesh::TreeMesh{2}, equations, dg::DG) +function prolong2interfaces!(backend::Nothing, cache, u, mesh::TreeMesh{2}, equations, + dg::DG) @unpack interfaces = cache @unpack orientations, neighbor_ids = interfaces interfaces_u = interfaces.u @@ -523,7 +525,7 @@ function prolong2interfaces!(cache, u, mesh::TreeMesh{2}, equations, dg::DG) return nothing end -function prolong2interfaces!(cache, u, mesh::TreeMesh{2}, equations, +function prolong2interfaces!(backend::Nothing, cache, u, mesh::TreeMesh{2}, equations, dg::DG{<:GaussLegendreBasis}) @unpack interfaces = cache @unpack orientations, neighbor_ids = interfaces @@ -564,7 +566,7 @@ function prolong2interfaces!(cache, u, mesh::TreeMesh{2}, equations, return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{2}, have_nonconservative_terms::False, equations, surface_integral, dg::DG, cache) @@ -598,7 +600,7 @@ function calc_interface_flux!(surface_flux_values, return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{2}, have_nonconservative_terms::True, equations, surface_integral, dg::DG, cache) @@ -1120,7 +1122,7 @@ end return nothing end -function calc_surface_integral!(du, u, +function calc_surface_integral!(backend::Nothing, du, u, mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}}, equations, surface_integral::SurfaceIntegralWeakForm, @@ -1164,7 +1166,7 @@ function calc_surface_integral!(du, u, return nothing end -function calc_surface_integral!(du, u, +function calc_surface_integral!(backend::Nothing, du, u, mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}}, equations, surface_integral::SurfaceIntegralWeakForm, @@ -1215,7 +1217,7 @@ function calc_surface_integral!(du, u, return nothing end -function apply_jacobian!(du, mesh::TreeMesh{2}, +function 
apply_jacobian!(backend::Nothing, du, mesh::TreeMesh{2}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements diff --git a/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl b/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl index 51a5897b065..efcb7cc6794 100644 --- a/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl +++ b/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl @@ -65,24 +65,25 @@ end # muladd # if LoopVectorization.jl can handle the array types. This ensures that `@turbo` # works efficiently here. @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, - element, mesh::TreeMesh{2}, + element, ::Type{<:TreeMesh{2}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_shima_etal_turbo), dg::DGSEM, cache, alpha) @unpack derivative_split = dg.basis + ndims_mesh = 2 # Create a temporary array that will be used to store the RHS with permuted # indices `[i, j, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims_mesh)..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. 
u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims_mesh)..., StaticInt(nvariables(equations)))) @turbo for j in eachnode(dg), i in eachnode(dg) @@ -227,18 +228,19 @@ end # muladd end @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, - element, mesh::TreeMesh{2}, + element, ::Type{<:TreeMesh{2}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations2D, volume_flux::typeof(flux_ranocha_turbo), dg::DGSEM, cache, alpha) @unpack derivative_split = dg.basis + ndims_mesh = 2 # Create a temporary array that will be used to store the RHS with permuted # indices `[i, j, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims_mesh)..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. In addition @@ -247,7 +249,7 @@ end # values. u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims_mesh)..., StaticInt(nvariables(equations) + 2))) # We also compute "+ 2" logs @turbo for j in eachnode(dg), i in eachnode(dg) diff --git a/src/solvers/dgsem_tree/dg_2d_parabolic.jl b/src/solvers/dgsem_tree/dg_2d_parabolic.jl index d7e452a97db..05bd3d95fc2 100644 --- a/src/solvers/dgsem_tree/dg_2d_parabolic.jl +++ b/src/solvers/dgsem_tree/dg_2d_parabolic.jl @@ -128,7 +128,7 @@ function rhs_parabolic!(du, u, t, mesh::Union{TreeMesh{2}, TreeMesh{3}}, # Calculate surface integrals. # This reuses `calc_surface_integral!` for the purely hyperbolic case. 
@trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations_parabolic, + calc_surface_integral!(nothing, du, u, mesh, equations_parabolic, dg.surface_integral, dg, cache) end @@ -952,7 +952,7 @@ function calc_gradient!(gradients, u_transformed, t, # Prolong solution to interfaces # This reuses `prolong2interfaces!` for the purely hyperbolic case. @trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, u_transformed, mesh, + prolong2interfaces!(nothing, cache, u_transformed, mesh, equations_parabolic, dg) end diff --git a/src/solvers/dgsem_tree/dg_2d_parallel.jl b/src/solvers/dgsem_tree/dg_2d_parallel.jl index 37ba51e15bc..8a9dc3d58a1 100644 --- a/src/solvers/dgsem_tree/dg_2d_parallel.jl +++ b/src/solvers/dgsem_tree/dg_2d_parallel.jl @@ -450,7 +450,7 @@ function init_mpi_neighbor_connectivity(elements, mpi_interfaces, mpi_mortars, return mpi_neighbor_ranks, mpi_neighbor_interfaces, mpi_neighbor_mortars end -function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::Union{TreeMeshParallel{2}, P4estMeshParallel{2}, T8codeMeshParallel{2}}, equations, boundary_conditions, source_terms::Source, @@ -479,7 +479,7 @@ function rhs!(du, u, t, # Calculate volume integral @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, u, mesh, + calc_volume_integral!(backend, du, u, mesh, have_nonconservative_terms(equations), equations, dg.volume_integral, dg, cache) end @@ -487,12 +487,12 @@ function rhs!(du, u, t, # Prolong solution to interfaces # TODO: Taal decide order of arguments, consistent vs. modified cache first? 
@trixi_timeit timer() "prolong2interfaces" begin - prolong2interfaces!(cache, u, mesh, equations, dg) + prolong2interfaces!(backend, cache, u, mesh, equations, dg) end # Calculate interface fluxes @trixi_timeit timer() "interface flux" begin - calc_interface_flux!(cache.elements.surface_flux_values, mesh, + calc_interface_flux!(backend, cache.elements.surface_flux_values, mesh, have_nonconservative_terms(equations), equations, dg.surface_integral, dg, cache) end @@ -542,12 +542,13 @@ function rhs!(du, u, t, # Calculate surface integrals @trixi_timeit timer() "surface integral" begin - calc_surface_integral!(du, u, mesh, equations, + calc_surface_integral!(backend, du, u, mesh, equations, dg.surface_integral, dg, cache) end # Apply Jacobian from mapping to reference element - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin diff --git a/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl b/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl index bccb96a1404..b9d46ddf164 100644 --- a/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl +++ b/src/solvers/dgsem_tree/dg_2d_subcell_limiters.jl @@ -58,8 +58,9 @@ end # Subcell limiting currently only implemented for certain mesh types @inline function volume_integral_kernel!(du, u, element, - mesh::Union{TreeMesh{2}, StructuredMesh{2}, - P4estMesh{2}}, + meshT::Type{<:Union{TreeMesh{2}, + StructuredMesh{2}, + P4estMesh{2}}}, have_nonconservative_terms, equations, volume_integral::VolumeIntegralSubcellLimiting, dg::DGSEM, cache) @@ -73,7 +74,7 @@ end fhat1_R = fhat1_R_threaded[Threads.threadid()] fhat2_L = fhat2_L_threaded[Threads.threadid()] fhat2_R = fhat2_R_threaded[Threads.threadid()] - calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, mesh, + calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, meshT, have_nonconservative_terms, 
equations, volume_flux_dg, dg, element, cache) @@ -84,14 +85,15 @@ end fstar2_L = fstar2_L_threaded[Threads.threadid()] fstar1_R = fstar1_R_threaded[Threads.threadid()] fstar2_R = fstar2_R_threaded[Threads.threadid()] - calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, mesh, + calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, meshT, have_nonconservative_terms, equations, volume_flux_fv, dg, element, cache) # antidiffusive flux calcflux_antidiffusive!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fstar1_L, fstar1_R, fstar2_L, fstar2_R, - u, mesh, have_nonconservative_terms, equations, limiter, dg, + u, meshT, have_nonconservative_terms, equations, limiter, + dg, element, cache) # Calculate volume integral contribution of low-order FV flux @@ -112,7 +114,8 @@ end # # See also `flux_differencing_kernel!`. @inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, - mesh::TreeMesh{2}, have_nonconservative_terms::False, + ::Type{<:TreeMesh{2}}, + have_nonconservative_terms::False, equations, volume_flux, dg::DGSEM, element, cache) @unpack weights, derivative_split = dg.basis @@ -191,7 +194,7 @@ end # Discretizations of Non-Conservative Systems. https://arxiv.org/pdf/2211.14009.pdf. # @inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, - mesh::TreeMesh{2}, have_nonconservative_terms::True, + ::Type{<:TreeMesh{2}}, have_nonconservative_terms::True, equations, volume_flux::Tuple{F_CONS, F_NONCONS}, dg::DGSEM, element, @@ -370,7 +373,7 @@ end # The calculation of the non-conservative staggered "fluxes" requires non-conservative # terms that can be written as a product of local and jump contributions. 
@inline function calcflux_fhat!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, u, - mesh::TreeMesh{2}, nonconservative_terms::True, + ::Type{<:TreeMesh{2}}, nonconservative_terms::True, equations, volume_flux::Tuple{F_CONS, F_NONCONS}, dg::DGSEM, element, @@ -608,8 +611,8 @@ end # Calculate the antidiffusive flux `antidiffusive_flux` as the subtraction between `fhat` and `fstar` for conservative systems. @inline function calcflux_antidiffusive!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::Union{TreeMesh{2}, StructuredMesh{2}, - P4estMesh{2}}, + ::Type{<:Union{TreeMesh{2}, StructuredMesh{2}, + P4estMesh{2}}}, have_nonconservative_terms::False, equations, limiter::SubcellLimiterIDP, dg, element, cache) @unpack antidiffusive_flux1_L, antidiffusive_flux2_L, antidiffusive_flux1_R, antidiffusive_flux2_R = cache.antidiffusive_fluxes @@ -645,8 +648,8 @@ end # Calculate the antidiffusive flux `antidiffusive_flux` as the subtraction between `fhat` and `fstar` for conservative systems. 
@inline function calcflux_antidiffusive!(fhat1_L, fhat1_R, fhat2_L, fhat2_R, fstar1_L, fstar1_R, fstar2_L, fstar2_R, u, - mesh::Union{TreeMesh{2}, StructuredMesh{2}, - P4estMesh{2}}, + ::Type{<:Union{TreeMesh{2}, StructuredMesh{2}, + P4estMesh{2}}}, have_nonconservative_terms::True, equations, limiter::SubcellLimiterIDP, dg, element, cache) @unpack antidiffusive_flux1_L, antidiffusive_flux2_L, antidiffusive_flux1_R, antidiffusive_flux2_R = cache.antidiffusive_fluxes diff --git a/src/solvers/dgsem_tree/dg_3d.jl b/src/solvers/dgsem_tree/dg_3d.jl index 41b46f58f95..4aabd232d86 100644 --- a/src/solvers/dgsem_tree/dg_3d.jl +++ b/src/solvers/dgsem_tree/dg_3d.jl @@ -126,7 +126,7 @@ This treatment is required to achieve, e.g., entropy-stability or well-balancedn See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-1765644064 =# @inline function weak_form_kernel!(du, u, - element, mesh::TreeMesh{3}, + element, ::Type{<:TreeMesh{3}}, have_nonconservative_terms::False, equations, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -158,7 +158,7 @@ See also https://github.com/trixi-framework/Trixi.jl/issues/1671#issuecomment-17 return nothing end -@inline function flux_differencing_kernel!(du, u, element, mesh::TreeMesh{3}, +@inline function flux_differencing_kernel!(du, u, element, ::Type{<:TreeMesh{3}}, have_nonconservative_terms::False, equations, volume_flux, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -208,7 +208,7 @@ end return nothing end -@inline function flux_differencing_kernel!(du, u, element, mesh::TreeMesh{3}, +@inline function flux_differencing_kernel!(du, u, element, meshT::Type{<:TreeMesh{3}}, have_nonconservative_terms::True, equations, volume_flux, dg::DGSEM, cache, alpha = true) # true * [some floating point value] == [exactly the same floating point value] @@ -217,7 +217,7 @@ end symmetric_flux, 
nonconservative_flux = volume_flux # Apply the symmetric flux as usual - flux_differencing_kernel!(du, u, element, mesh, False(), equations, symmetric_flux, + flux_differencing_kernel!(du, u, element, meshT, False(), equations, symmetric_flux, dg, cache, alpha) # Calculate the remaining volume terms using the nonsymmetric generalized flux @@ -261,8 +261,9 @@ end end @inline function fv_kernel!(du, u, - mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + meshT::Type{<:Union{TreeMesh{3}, StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms, equations, volume_flux_fv, dg::DGSEM, cache, element, alpha = true) @unpack fstar1_L_threaded, fstar1_R_threaded, fstar2_L_threaded, fstar2_R_threaded, fstar3_L_threaded, fstar3_R_threaded = cache @@ -277,7 +278,7 @@ end fstar3_R = fstar3_R_threaded[Threads.threadid()] calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, - mesh, have_nonconservative_terms, equations, + meshT, have_nonconservative_terms, equations, volume_flux_fv, dg, element, cache) # Calculate FV volume integral contribution @@ -300,8 +301,9 @@ end end @inline function fvO2_kernel!(du, u, - mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, - T8codeMesh{3}}, + meshT::Type{<:Union{TreeMesh{3}, StructuredMesh{3}, + P4estMesh{3}, + T8codeMesh{3}}}, have_nonconservative_terms, equations, volume_flux_fv, dg::DGSEM, cache, element, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -321,7 +323,7 @@ end fstar3_R = fstar3_R_threaded[Threads.threadid()] calcflux_fvO2!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, - mesh, have_nonconservative_terms, equations, + meshT, have_nonconservative_terms, equations, volume_flux_fv, dg, element, cache, sc_interface_coords, reconstruction_mode, slope_limiter, cons2recon, recon2cons) @@ -351,7 +353,7 @@ end # [arXiv: 2008.12044v2](https://arxiv.org/pdf/2008.12044) @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, 
fstar2_R, fstar3_L, fstar3_R, u, - mesh::TreeMesh{3}, have_nonconservative_terms::False, + ::Type{<:TreeMesh{3}}, have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache) for k in eachnode(dg), j in eachnode(dg), i in 2:nnodes(dg) @@ -383,7 +385,7 @@ end @inline function calcflux_fv!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, - mesh::TreeMesh{3}, + ::Type{<:TreeMesh{3}}, have_nonconservative_terms::True, equations, volume_flux_fv, dg::DGSEM, element, cache) volume_flux, nonconservative_flux = volume_flux_fv @@ -447,7 +449,8 @@ end @inline function calcflux_fvO2!(fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, u, - mesh::TreeMesh{3}, have_nonconservative_terms::False, + ::Type{<:TreeMesh{3}}, + have_nonconservative_terms::False, equations, volume_flux_fv, dg::DGSEM, element, cache, sc_interface_coords, reconstruction_mode, slope_limiter, @@ -519,7 +522,8 @@ end return nothing end -function prolong2interfaces!(cache, u, mesh::TreeMesh{3}, equations, dg::DG) +function prolong2interfaces!(backend::Nothing, cache, u, mesh::TreeMesh{3}, equations, + dg::DG) @unpack interfaces = cache @unpack orientations, neighbor_ids = interfaces interfaces_u = interfaces.u @@ -557,7 +561,7 @@ function prolong2interfaces!(cache, u, mesh::TreeMesh{3}, equations, dg::DG) return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, surface_integral, dg::DG, cache) @@ -592,7 +596,7 @@ function calc_interface_flux!(surface_flux_values, return nothing end -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{3}, have_nonconservative_terms::True, equations, surface_integral, dg::DG, cache) @@ -1333,7 +1337,8 @@ end return nothing end -function calc_surface_integral!(du, u, mesh::Union{TreeMesh{3}, 
StructuredMesh{3}}, +function calc_surface_integral!(backend::Nothing, du, u, + mesh::Union{TreeMesh{3}, StructuredMesh{3}}, equations, surface_integral::SurfaceIntegralWeakForm, dg::DGSEM, cache) @unpack inverse_weights = dg.basis @@ -1391,7 +1396,7 @@ function calc_surface_integral!(du, u, mesh::Union{TreeMesh{3}, StructuredMesh{3 return nothing end -function apply_jacobian!(du, mesh::TreeMesh{3}, +function apply_jacobian!(backend::Nothing, du, mesh::TreeMesh{3}, equations, dg::DG, cache) @unpack inverse_jacobian = cache.elements diff --git a/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl b/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl index b2c48c9f00a..f1d2573dc79 100644 --- a/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl +++ b/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl @@ -17,24 +17,25 @@ # if LoopVectorization.jl can handle the array types. This ensures that `@turbo` # works efficiently here. @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, - element, mesh::TreeMesh{3}, + element, ::Type{<:TreeMesh{3}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations3D, volume_flux::typeof(flux_shima_etal_turbo), dg::DGSEM, cache, alpha) @unpack derivative_split = dg.basis + ndims_mesh = 3 # Create a temporary array that will be used to store the RHS with permuted # indices `[i, j, k, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims_mesh)..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. 
u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims_mesh)..., StaticInt(nvariables(equations)))) @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) @@ -263,18 +264,19 @@ end @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, - element, mesh::TreeMesh{3}, + element, ::Type{<:TreeMesh{3}}, have_nonconservative_terms::False, equations::CompressibleEulerEquations3D, volume_flux::typeof(flux_ranocha_turbo), dg::DGSEM, cache, alpha) @unpack derivative_split = dg.basis + ndims_mesh = 3 # Create a temporary array that will be used to store the RHS with permuted # indices `[i, j, k, v]` to allow using SIMD instructions. # `StrideArray`s with purely static dimensions do not allocate on the heap. du = StrideArray{eltype(u_cons)}(undef, - (ntuple(_ -> StaticInt(nnodes(dg)), ndims(mesh))..., + (ntuple(_ -> StaticInt(nnodes(dg)), ndims_mesh)..., StaticInt(nvariables(equations)))) # Convert conserved to primitive variables on the given `element`. In addition @@ -283,7 +285,7 @@ end # values. 
u_prim = StrideArray{eltype(u_cons)}(undef, (ntuple(_ -> StaticInt(nnodes(dg)), - ndims(mesh))..., + ndims_mesh)..., StaticInt(nvariables(equations) + 2))) # We also compute "+ 2" logs @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) diff --git a/src/solvers/dgsem_unstructured/dg_2d.jl b/src/solvers/dgsem_unstructured/dg_2d.jl index 76a186b9fa6..74053154497 100644 --- a/src/solvers/dgsem_unstructured/dg_2d.jl +++ b/src/solvers/dgsem_unstructured/dg_2d.jl @@ -35,7 +35,7 @@ function create_cache(mesh::UnstructuredMesh2D, equations, return cache end -function rhs!(du, u, t, +function rhs!(backend, du, u, t, mesh::UnstructuredMesh2D, equations, boundary_conditions, source_terms::Source, dg::DG, cache) where {Source} @@ -44,7 +44,7 @@ function rhs!(du, u, t, # Calculate volume integral @trixi_timeit timer() "volume integral" begin - calc_volume_integral!(du, u, mesh, + calc_volume_integral!(backend, du, u, mesh, have_nonconservative_terms(equations), equations, dg.volume_integral, dg, cache) end @@ -80,7 +80,8 @@ function rhs!(du, u, t, # Apply Jacobian from mapping to reference element # Note! 
this routine is reused from dgsem_structured/dg_2d.jl - @trixi_timeit timer() "Jacobian" apply_jacobian!(du, mesh, equations, dg, cache) + @trixi_timeit timer() "Jacobian" apply_jacobian!(backend, du, mesh, equations, dg, + cache) # Calculate source terms @trixi_timeit timer() "source terms" begin diff --git a/src/solvers/fdsbp_tree/fdsbp_1d.jl b/src/solvers/fdsbp_tree/fdsbp_1d.jl index 004e6b95c98..002093121a7 100644 --- a/src/solvers/fdsbp_tree/fdsbp_1d.jl +++ b/src/solvers/fdsbp_tree/fdsbp_1d.jl @@ -42,7 +42,8 @@ function create_cache(mesh::TreeMesh{1}, equations, end # 2D volume integral contributions for `VolumeIntegralStrongForm` -function calc_volume_integral!(du, u, mesh::TreeMesh{1}, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::TreeMesh{1}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralStrongForm, dg::FDSBP, cache) @@ -88,7 +89,8 @@ end # the finite difference stencils. Thus, the D^- operator acts on the positive # part of the flux splitting f^+ and the D^+ operator acts on the negative part # of the flux splitting f^-. 
-function calc_volume_integral!(du, u, mesh::TreeMesh{1}, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::TreeMesh{1}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralUpwind, dg::FDSBP, cache) @@ -139,7 +141,7 @@ function calc_volume_integral!(du, u, mesh::TreeMesh{1}, return nothing end -function calc_surface_integral!(du, u, mesh::TreeMesh{1}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{1}, equations, surface_integral::SurfaceIntegralStrongForm, dg::DG, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -166,7 +168,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{1}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh1D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh1D, equations, surface_integral::SurfaceIntegralStrongForm, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 @@ -220,7 +222,7 @@ end # in the specialized `calc_interface_flux` routine. These SATs are still of # a strong form penalty type, except that the interior flux at a particular # side of the element are computed in the upwind direction. 
-function calc_surface_integral!(du, u, mesh::TreeMesh{1}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{1}, equations, surface_integral::SurfaceIntegralUpwind, dg::FDSBP, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -248,7 +250,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{1}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh1D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh1D, equations, surface_integral::SurfaceIntegralUpwind, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 diff --git a/src/solvers/fdsbp_tree/fdsbp_2d.jl b/src/solvers/fdsbp_tree/fdsbp_2d.jl index d370d29aff8..5d30157a28c 100644 --- a/src/solvers/fdsbp_tree/fdsbp_2d.jl +++ b/src/solvers/fdsbp_tree/fdsbp_2d.jl @@ -42,7 +42,8 @@ function create_cache(mesh::Union{TreeMesh{2}, UnstructuredMesh2D}, equations, end # 2D volume integral contributions for `VolumeIntegralStrongForm` -function calc_volume_integral!(du, u, mesh::TreeMesh{2}, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::TreeMesh{2}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralStrongForm, dg::FDSBP, cache) @@ -97,7 +98,8 @@ end # the finite difference stencils. Thus, the D^- operator acts on the positive # part of the flux splitting f^+ and the D^+ operator acts on the negative part # of the flux splitting f^-. 
-function calc_volume_integral!(du, u, mesh::TreeMesh{2}, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::TreeMesh{2}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralUpwind, dg::FDSBP, cache) @@ -159,7 +161,7 @@ function calc_volume_integral!(du, u, mesh::TreeMesh{2}, return nothing end -function calc_surface_integral!(du, u, mesh::TreeMesh{2}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{2}, equations, surface_integral::SurfaceIntegralStrongForm, dg::DG, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -202,7 +204,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{2}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh2D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh2D, equations, surface_integral::SurfaceIntegralStrongForm, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 @@ -214,7 +216,7 @@ end # already separates the solution information into right-traveling and # left-traveling information. So we only need to compute the appropriate # flux information at each side of an interface. -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{2}, have_nonconservative_terms::False, equations, surface_integral::SurfaceIntegralUpwind, @@ -260,7 +262,7 @@ end # in the specialized `calc_interface_flux` routine. These SATs are still of # a strong form penalty type, except that the interior flux at a particular # side of the element are computed in the upwind direction. 
-function calc_surface_integral!(du, u, mesh::TreeMesh{2}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{2}, equations, surface_integral::SurfaceIntegralUpwind, dg::FDSBP, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -304,7 +306,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{2}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh2D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh2D, equations, surface_integral::SurfaceIntegralUpwind, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 diff --git a/src/solvers/fdsbp_tree/fdsbp_3d.jl b/src/solvers/fdsbp_tree/fdsbp_3d.jl index 719ef4f7c96..e737a8f2137 100644 --- a/src/solvers/fdsbp_tree/fdsbp_3d.jl +++ b/src/solvers/fdsbp_tree/fdsbp_3d.jl @@ -42,7 +42,8 @@ function create_cache(mesh::TreeMesh{3}, equations, end # 3D volume integral contributions for `VolumeIntegralStrongForm` -function calc_volume_integral!(du, u, mesh::TreeMesh{3}, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralStrongForm, dg::FDSBP, cache) @@ -104,7 +105,8 @@ end # the finite difference stencils. Thus, the D^- operator acts on the positive # part of the flux splitting f^+ and the D^+ operator acts on the negative part # of the flux splitting f^-. 
-function calc_volume_integral!(du, u, mesh::TreeMesh{3}, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralUpwind, dg::FDSBP, cache) @@ -181,7 +183,7 @@ function calc_volume_integral!(du, u, mesh::TreeMesh{3}, return nothing end -function calc_surface_integral!(du, u, mesh::TreeMesh{3}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{3}, equations, surface_integral::SurfaceIntegralStrongForm, dg::DG, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -238,7 +240,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{3}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh3D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh3D, equations, surface_integral::SurfaceIntegralStrongForm, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 @@ -250,7 +252,7 @@ end # already separates the solution information into right-traveling and # left-traveling information. So we only need to compute the appropriate # flux information at each side of an interface. -function calc_interface_flux!(surface_flux_values, +function calc_interface_flux!(backend::Nothing, surface_flux_values, mesh::TreeMesh{3}, have_nonconservative_terms::False, equations, surface_integral::SurfaceIntegralUpwind, @@ -297,7 +299,7 @@ end # in the specialized `calc_interface_flux` routine. These SATs are still of # a strong form penalty type, except that the interior flux at a particular # side of the element are computed in the upwind direction. 
-function calc_surface_integral!(du, u, mesh::TreeMesh{3}, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh{3}, equations, surface_integral::SurfaceIntegralUpwind, dg::FDSBP, cache) inv_weight_left = inv(left_boundary_weight(dg.basis)) @@ -355,7 +357,7 @@ function calc_surface_integral!(du, u, mesh::TreeMesh{3}, end # Periodic FDSBP operators need to use a single element without boundaries -function calc_surface_integral!(du, u, mesh::TreeMesh3D, +function calc_surface_integral!(backend::Nothing, du, u, mesh::TreeMesh3D, equations, surface_integral::SurfaceIntegralUpwind, dg::PeriodicFDSBP, cache) @assert nelements(dg, cache) == 1 diff --git a/src/solvers/fdsbp_unstructured/fdsbp_2d.jl b/src/solvers/fdsbp_unstructured/fdsbp_2d.jl index 2a10d0a6cea..a9239ada1fa 100644 --- a/src/solvers/fdsbp_unstructured/fdsbp_2d.jl +++ b/src/solvers/fdsbp_unstructured/fdsbp_2d.jl @@ -29,7 +29,8 @@ end # 2D volume integral contributions for `VolumeIntegralStrongForm` # OBS! This is the standard (not de-aliased) form of the volume integral. # So it is not provably stable for variable coefficients due to the the metric terms. -function calc_volume_integral!(du, u, mesh::UnstructuredMesh2D, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::UnstructuredMesh2D, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralStrongForm, dg::FDSBP, cache) @@ -91,7 +92,8 @@ end # the finite difference stencils. Thus, the D^- operator acts on the positive # part of the flux splitting f^+ and the D^+ operator acts on the negative part # of the flux splitting f^-. 
-function calc_volume_integral!(du, u, mesh::UnstructuredMesh2D, +function calc_volume_integral!(backend::Nothing, du, u, + mesh::UnstructuredMesh2D, have_nonconservative_terms::False, equations, volume_integral::VolumeIntegralUpwind, dg::FDSBP, cache) diff --git a/src/time_integration/paired_explicit_runge_kutta/paired_explicit_runge_kutta.jl b/src/time_integration/paired_explicit_runge_kutta/paired_explicit_runge_kutta.jl index 5de2164fa3e..414a016bd0d 100644 --- a/src/time_integration/paired_explicit_runge_kutta/paired_explicit_runge_kutta.jl +++ b/src/time_integration/paired_explicit_runge_kutta/paired_explicit_runge_kutta.jl @@ -58,8 +58,9 @@ function calculate_cfl(ode_algorithm::AbstractPairedExplicitRK, ode) mesh, equations, solver, cache = mesh_equations_solver_cache(semi) u = wrap_array(u_ode, mesh, equations, solver, cache) + backend = trixi_backend(u_ode) - cfl_number = dt_opt / max_dt(u, t0, mesh, + cfl_number = dt_opt / max_dt(backend, u, t0, mesh, have_constant_speed(equations), equations, solver, cache) return cfl_number diff --git a/test/Project.toml b/test/Project.toml index 48e1efa86ee..69ba19cdad1 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -17,6 +17,7 @@ Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" +Metal = "dde4c033-4e86-420c-a63e-0dd931031962" NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" OrdinaryDiffEqBDF = "6ad6398a-0878-4a85-9266-38940aa047c8" OrdinaryDiffEqCore = "bbf590c4-e513-4bbe-9b18-05decba2e5d8" @@ -60,6 +61,7 @@ Krylov = "0.10" LinearAlgebra = "1" LinearSolve = "3.13" MPI = "0.20.22" +Metal = "1.9.2" NLsolve = "4.5.1" OrdinaryDiffEqBDF = "1.1" OrdinaryDiffEqCore = "1.26, 2, 3" diff --git a/test/runtests.jl b/test/runtests.jl index 8b524a09df8..f4675429064 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,6 @@ using Test using MPI: mpiexec +import 
Trixi # We run tests in parallel with CI jobs setting the `TRIXI_TEST` environment # variable to determine the subset of tests to execute. @@ -114,9 +115,30 @@ const TRIXI_NTHREADS = clamp(Sys.CPU_THREADS, 2, 3) @time if TRIXI_TEST == "all" || TRIXI_TEST == "CUDA" import CUDA if CUDA.functional() - include("test_cuda.jl") + include("test_cuda_2d.jl") + include("test_cuda_3d.jl") else @warn "Unable to run CUDA tests on this machine" end end + + @time if TRIXI_TEST == "all" || TRIXI_TEST == "Metal" + import Metal + if Metal.functional() + include("test_metal_2d.jl") + else + @warn "Unable to run Metal tests on this machine" + end + end + + @time if TRIXI_TEST == "all" || TRIXI_TEST == "kernelabstractions" + previous_backend = Trixi._PREFERENCE_THREADING + Trixi.set_threading_backend!(:kernelabstractions) + # relaunching julia + try + run(`$(Base.julia_cmd()) --threads=$TRIXI_NTHREADS --check-bounds=yes $(abspath("test_kernelabstractions.jl"))`) + finally + Trixi.set_threading_backend!(Symbol(previous_backend)) + end + end end diff --git a/test/test_cuda.jl b/test/test_cuda_2d.jl similarity index 85% rename from test/test_cuda.jl rename to test/test_cuda_2d.jl index fd0adaeece1..f4b1fa1396e 100644 --- a/test/test_cuda.jl +++ b/test/test_cuda_2d.jl @@ -1,15 +1,18 @@ -module TestCUDA +module TestCUDA2D using Test using Trixi include("test_trixi.jl") +EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") + # Start with a clean environment: remove Trixi.jl output directory if it exists outdir = "out" isdir(outdir) && rm(outdir, recursive = true) -EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") +@testset "CUDA 2D" begin +#! format: noindent @trixi_testset "elixir_advection_basic_gpu.jl native" begin @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), @@ -39,14 +42,13 @@ end using CUDA @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), # Expected errors are exactly the same as with TreeMesh! 
- l2=nothing, # TODO: GPU. [Float32(8.311947673061856e-6)], - linf=nothing, # TODO: GPU. [Float32(6.627000273229378e-5)], + l2=[Float32(8.311947673061856e-6)], + linf=[Float32(6.627000273229378e-5)], RealT_for_test_tolerances=Float32, real_type=Float32, - storage_type=CuArray, - sol=nothing,) # TODO: GPU. Remove this once we can run the simulation on the GPU - # # Ensure that we do not have excessive memory allocations - # # (e.g., from type instabilities) + storage_type=CuArray) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) # @test_allocations(Trixi.rhs!, semi, sol, 1000) @test real(ode.p.solver) == Float32 @test real(ode.p.solver.basis) == Float32 @@ -65,5 +67,5 @@ end # Clean up afterwards: delete Trixi.jl output directory @test_nowarn isdir(outdir) && rm(outdir, recursive = true) - +end end # module diff --git a/test/test_cuda_3d.jl b/test/test_cuda_3d.jl new file mode 100644 index 00000000000..5c6d5a52709 --- /dev/null +++ b/test/test_cuda_3d.jl @@ -0,0 +1,71 @@ +module TestCUDA3D + +using Test +using Trixi + +include("test_trixi.jl") + +EXAMPLES_DIR = joinpath(examples_dir(), "p4est_3d_dgsem") + +# Start with a clean environment: remove Trixi.jl output directory if it exists +outdir = "out" +isdir(outdir) && rm(outdir, recursive = true) + +@testset "CUDA 3D" begin +#! format: noindent + +@trixi_testset "elixir_advection_basic_gpu.jl native" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! 
+ l2=[0.00016263963870641478], + linf=[0.0014537194925779984]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test real(ode.p.solver) == Float64 + @test real(ode.p.solver.basis) == Float64 + @test real(ode.p.solver.mortar) == Float64 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa Array + @test ode.p.solver.basis.derivative_matrix isa Array + + @test Trixi.storage_type(ode.p.cache.elements) === Array + @test Trixi.storage_type(ode.p.cache.interfaces) === Array + @test Trixi.storage_type(ode.p.cache.boundaries) === Array + @test Trixi.storage_type(ode.p.cache.mortars) === Array +end + +@trixi_testset "elixir_advection_basic_gpu.jl Float32 / CUDA" begin + # Using CUDA inside the testset since otherwise the bindings are hidden by the anonymous modules + using CUDA + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors similar to reference on CPU + l2=[Float32(0.00016263963870641478)], + linf=[Float32(0.0014537194925779984)], + RealT_for_test_tolerances=Float32, + real_type=Float32, + storage_type=CuArray) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + # @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test real(ode.p.solver) == Float32 + @test real(ode.p.solver.basis) == Float32 + @test real(ode.p.solver.mortar) == Float32 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa CuArray + @test ode.p.solver.basis.derivative_matrix isa CuArray + + @test Trixi.storage_type(ode.p.cache.elements) === CuArray + @test Trixi.storage_type(ode.p.cache.interfaces) === CuArray + @test Trixi.storage_type(ode.p.cache.boundaries) === CuArray + @test Trixi.storage_type(ode.p.cache.mortars) === CuArray +end + +# Clean up afterwards: delete Trixi.jl output directory +@test_nowarn
isdir(outdir) && rm(outdir, recursive = true) +end +end # module diff --git a/test/test_kernelabstractions.jl b/test/test_kernelabstractions.jl new file mode 100644 index 00000000000..dff169be3bd --- /dev/null +++ b/test/test_kernelabstractions.jl @@ -0,0 +1,79 @@ +module TestExamplesKernelAbstractions + +using Test +using Trixi + +include("test_trixi.jl") + +EXAMPLES_DIR = examples_dir() + +# Start with a clean environment: remove Trixi.jl output directory if it exists +outdir = "out" +Trixi.mpi_isroot() && isdir(outdir) && rm(outdir, recursive = true) +Trixi.MPI.Barrier(Trixi.mpi_comm()) + +@testset "basic" begin + @test Trixi._PREFERENCE_THREADING == :kernelabstractions +end + +@testset "KernelAbstractions CPU 2D" begin +#! format: noindent + +@trixi_testset "elixir_advection_basic_gpu.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_2d_dgsem", + "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! + l2=8.311947673061856e-6, + linf=6.627000273229378e-5) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + # @test_allocations(Trixi.rhs!, semi, sol, 1000) +end + +@trixi_testset "elixir_advection_basic_gpu.jl Float32" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_2d_dgsem", + "elixir_advection_basic_gpu.jl"), + # Expected errors similar to reference on CPU + l2=[Float32(8.311947673061856e-6)], + linf=[Float32(6.627000273229378e-5)], + RealT_for_test_tolerances=Float32, + real_type=Float32) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + # @test_allocations(Trixi.rhs!, semi, sol, 1000) +end +end + +@testset "KernelAbstractions CPU 3D" begin +#! format: noindent + +@trixi_testset "elixir_advection_basic_gpu.jl" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_3d_dgsem", + "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! 
+ l2=[0.00016263963870641478], + linf=[0.0014537194925779984]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + # @test_allocations(Trixi.rhs!, semi, sol, 1000) +end + +@trixi_testset "elixir_advection_basic_gpu.jl Float32" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "p4est_3d_dgsem", + "elixir_advection_basic_gpu.jl"), + # Expected errors similar to reference on CPU + l2=[Float32(0.00016263963870641478)], + linf=[Float32(0.0014537194925779984)], + RealT_for_test_tolerances=Float32, + real_type=Float32) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + # @test_allocations(Trixi.rhs!, semi, sol, 1000) +end +end + +# Clean up afterwards: delete Trixi.jl output directory +Trixi.mpi_isroot() && isdir(outdir) && @test_nowarn rm(outdir, recursive = true) +Trixi.MPI.Barrier(Trixi.mpi_comm()) + +end # module diff --git a/test/test_metal_2d.jl b/test/test_metal_2d.jl new file mode 100644 index 00000000000..7d445712119 --- /dev/null +++ b/test/test_metal_2d.jl @@ -0,0 +1,71 @@ +module TestMetal2D + +using Test +using Trixi + +include("test_trixi.jl") + +EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") + +# Start with a clean environment: remove Trixi.jl output directory if it exists +outdir = "out" +isdir(outdir) && rm(outdir, recursive = true) + +@testset "Metal 2D" begin +#! format: noindent + +@trixi_testset "elixir_advection_basic_gpu.jl native" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! 
+ l2=8.311947673061856e-6, + linf=6.627000273229378e-5) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test real(ode.p.solver) == Float64 + @test real(ode.p.solver.basis) == Float64 + @test real(ode.p.solver.mortar) == Float64 + # TODO: `mesh` is currently not `adapt`ed correctly + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa Array + @test ode.p.solver.basis.derivative_matrix isa Array + + @test Trixi.storage_type(ode.p.cache.elements) === Array + @test Trixi.storage_type(ode.p.cache.interfaces) === Array + @test Trixi.storage_type(ode.p.cache.boundaries) === Array + @test Trixi.storage_type(ode.p.cache.mortars) === Array +end + +@trixi_testset "elixir_advection_basic_gpu.jl Float32 / Metal" begin + # Using Metal inside the testset since otherwise the bindings are hidden by the anonymous modules + using Metal + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh!
+ l2=[Float32(8.311947673061856e-6)], + linf=[Float32(6.627000273229378e-5)], + RealT_for_test_tolerances=Float32, + real_type=Float32, + storage_type=MtlArray) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + # @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test real(ode.p.solver) == Float32 + @test real(ode.p.solver.basis) == Float32 + @test real(ode.p.solver.mortar) == Float32 + # TODO: `mesh` is currently not `adapt`ed correctly + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa MtlArray + @test ode.p.solver.basis.derivative_matrix isa MtlArray + + @test Trixi.storage_type(ode.p.cache.elements) === MtlArray + @test Trixi.storage_type(ode.p.cache.interfaces) === MtlArray + @test Trixi.storage_type(ode.p.cache.boundaries) === MtlArray + @test Trixi.storage_type(ode.p.cache.mortars) === MtlArray +end + +# Clean up afterwards: delete Trixi.jl output directory +@test_nowarn isdir(outdir) && rm(outdir, recursive = true) +end +end # module diff --git a/test/test_performance_specializations_2d.jl b/test/test_performance_specializations_2d.jl index b42a1b8f640..7dceea2b6a7 100644 --- a/test/test_performance_specializations_2d.jl +++ b/test/test_performance_specializations_2d.jl @@ -33,7 +33,7 @@ isdir(outdir) && rm(outdir, recursive = true) # Call the optimized default version du .= 0 - Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) @@ -43,10 +43,10 @@ isdir(outdir) && rm(outdir, recursive = true) # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, 
u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, 1] @@ -72,7 +72,7 @@ end # Call the optimized default version du .= 0 - Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) @@ -82,10 +82,10 @@ end # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, 1] @@ -112,7 +112,7 @@ end # Call the optimized default version du .= 0 - Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) @@ -122,10 +122,10 @@ end # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, 1] @@ -151,7 +151,7 @@ end # Call the optimized default version du .= 0 - 
Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) @@ -161,10 +161,10 @@ end # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, 1] diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index 3b3bd40b2f5..967b0f9cf3e 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -33,7 +33,7 @@ isdir(outdir) && rm(outdir, recursive = true) # Call the optimized default version du .= 0 - Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) @@ -43,10 +43,10 @@ isdir(outdir) && rm(outdir, recursive = true) # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, :, 1] @@ -72,7 +72,7 @@ end 
# Call the optimized default version du .= 0 - Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) @@ -82,10 +82,10 @@ end # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, :, 1] @@ -112,7 +112,7 @@ end # Call the optimized default version du .= 0 - Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) @@ -122,10 +122,10 @@ end # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, :, 1] @@ -151,7 +151,7 @@ end # Call the optimized default version du .= 0 - Trixi.flux_differencing_kernel!(du, u, 1, semi.mesh, + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, 
semi.cache, true) @@ -161,10 +161,10 @@ end # `semi.solver.volume_integral.volume_flux` du .= 0 invoke(Trixi.flux_differencing_kernel!, - Tuple{typeof(du), typeof(u), Integer, typeof(semi.mesh), + Tuple{typeof(du), typeof(u), Integer, Type{typeof(semi.mesh)}, typeof(have_nonconservative_terms), typeof(semi.equations), Function, typeof(semi.solver), typeof(semi.cache), Bool}, - du, u, 1, semi.mesh, + du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) du_baseline = du[:, :, :, :, 1]