diff --git a/Project.toml b/Project.toml index 3f73c968f13..cec98c0008f 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ version = "0.16.2-DEV" authors = ["Michael Schlottke-Lakemper ", "Gregor Gassner ", "Hendrik Ranocha ", "Andrew R. Winters ", "Jesse Chan ", "Andrés Rueda-Ramírez "] [deps] +AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" @@ -72,6 +73,7 @@ TrixiPlotsExt = "Plots" TrixiSparseConnectivityTracerExt = "SparseConnectivityTracer" [compat] +AcceleratedKernels = "0.4.3" Accessors = "0.1.42" AMDGPU = "2.2.1" Adapt = "4.4" diff --git a/src/Trixi.jl b/src/Trixi.jl index 42340659520..e421c18afdf 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -60,8 +60,8 @@ using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect using FillArrays: Ones, Zeros using ForwardDiff: ForwardDiff using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace -using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend, - allocate +using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend +using AcceleratedKernels: AcceleratedKernels using LinearMaps: LinearMap if _PREFERENCE_LOOPVECTORIZATION using LoopVectorization: LoopVectorization, @turbo, indices diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl index 368b7f5e4cc..976954d8359 100644 --- a/src/callbacks_step/analysis_dg2d.jl +++ b/src/callbacks_step/analysis_dg2d.jl @@ -138,7 +138,7 @@ function calc_error_norms(func, u, t, analyzer, return l2_error, linf_error end -function calc_error_norms(func, _u, t, analyzer, +function calc_error_norms(func, u, t, analyzer, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, @@ -147,16 +147,13 @@ function calc_error_norms(func, _u, t, analyzer, 
initial_condition, dg::DGSEM, cache, cache_analysis) @unpack vandermonde, weights = analyzer @unpack u_local, u_tmp1, x_local, x_tmp1, jacobian_local, jacobian_tmp1 = cache_analysis + @unpack node_coordinates, inverse_jacobian = cache.elements - # TODO GPU AnalysisCallback currently lives on CPU - backend = trixi_backend(_u) - if backend isa Nothing # TODO GPU KA CPU backend - @unpack node_coordinates, inverse_jacobian = cache.elements - u = _u - else - node_coordinates = Array(cache.elements.node_coordinates) - inverse_jacobian = Array(cache.elements.inverse_jacobian) - u = Array(_u) + # Calculate error norms on the CPU, to ensure the order of summation is the same. + if trixi_backend(u) !== nothing + node_coordinates = Array(node_coordinates) + inverse_jacobian = Array(inverse_jacobian) + u = Array(u) end # Set up data structures @@ -338,24 +335,26 @@ function integrate_via_indices(func::Func, u, return integral end -function integrate_via_indices(func::Func, _u, +function integrate_via_indices(func::Func, u, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, equations, dg::DGSEM, cache, args...; normalize = true) where {Func} - # TODO GPU AnalysiCallback currently lives on CPU - backend = trixi_backend(_u) - if backend isa Nothing # TODO GPU KA CPU backend - @unpack weights = dg.basis - @unpack inverse_jacobian = cache.elements - u = _u - else - weights = Array(dg.basis.weights) - inverse_jacobian = Array(cache.elements.inverse_jacobian) - u = Array(_u) - end + return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache, + args...; normalize = normalize) +end + +function integrate_via_indices(func::Func, ::Nothing, u, + mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + equations, dg::DGSEM, cache, + args...; normalize = true) where {Func} + @unpack weights = dg.basis + @unpack 
inverse_jacobian = cache.elements # Initialize integral with zeros of the right shape integral = zero(func(u, 1, 1, 1, equations, dg, args...)) @@ -380,6 +379,53 @@ function integrate_via_indices(func::Func, _u, return integral end +function integrate_via_indices(func::Func, backend::Backend, u, + mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + equations, dg::DGSEM, cache, + args...; normalize = true) where {Func} + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements + + function local_plus((integral, total_volume), (integral_element, volume_element)) + return (integral + integral_element, total_volume + volume_element) + end + + # `func(u, 1,1,1, equations, dg, args...)` might access GPU memory + # so we have to rely on the compiler to correctly infer the type of the integral here. + # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU. + integral0 = zero(Base.promote_op(func, typeof(u), Int, Int, Int, typeof(equations), + typeof(dg), map(typeof, args)...)) + init = neutral = (integral0, zero(real(mesh))) + + # Use quadrature to numerically integrate over entire domain + num_elements = nelements(dg, cache) + integral, total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, + backend; init, + neutral) do element + # Initialize integral with zeros of the right shape + local_integral, local_total_volume = neutral + + for j in eachnode(dg), i in eachnode(dg) + volume_jacobian = abs(inv(inverse_jacobian[i, j, element])) + local_integral += volume_jacobian * weights[i] * weights[j] * + func(u, i, j, element, equations, dg, args...) 
+ local_total_volume += volume_jacobian * weights[i] * weights[j] + end + + return (local_integral, local_total_volume) + end + + # Normalize with total volume + if normalize + integral = integral / total_volume + end + + return integral +end + function integrate(func::Func, u, mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, @@ -410,18 +456,10 @@ function integrate(func::Func, u, end end -function analyze(::typeof(entropy_timederivative), _du, u, t, +function analyze(::typeof(entropy_timederivative), du, u, t, mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, T8codeMesh{2}}, equations, dg::Union{DGSEM, FDSBP}, cache) - # TODO GPU AnalysiCallback currently lives on CPU - backend = trixi_backend(u) - if backend isa Nothing # TODO GPU KA CPU backend - du = _du - else - du = Array(_du) - end - # Calculate # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ integrate_via_indices(u, mesh, equations, dg, cache, diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl index 7708a32e6ba..65abd91b827 100644 --- a/src/callbacks_step/analysis_dg3d.jl +++ b/src/callbacks_step/analysis_dg3d.jl @@ -161,22 +161,19 @@ function calc_error_norms(func, u, t, analyzer, return l2_error, linf_error end -function calc_error_norms(func, _u, t, analyzer, +function calc_error_norms(func, u, t, analyzer, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, initial_condition, dg::DGSEM, cache, cache_analysis) @unpack vandermonde, weights = analyzer @unpack u_local, u_tmp1, u_tmp2, x_local, x_tmp1, x_tmp2, jacobian_local, jacobian_tmp1, jacobian_tmp2 = cache_analysis + @unpack node_coordinates, inverse_jacobian = cache.elements - # TODO GPU AnalysisCallback currently lives on CPU - backend = trixi_backend(_u) - if backend isa Nothing # TODO GPU KA CPU backend - @unpack node_coordinates, inverse_jacobian = cache.elements - u = _u - else - 
node_coordinates = Array(cache.elements.node_coordinates) - inverse_jacobian = Array(cache.elements.inverse_jacobian) - u = Array(_u) + # Calculate error norms on the CPU, to ensure the order of summation is the same. + if trixi_backend(u) !== nothing + node_coordinates = Array(node_coordinates) + inverse_jacobian = Array(inverse_jacobian) + u = Array(u) end # Set up data structures @@ -389,22 +386,22 @@ function integrate_via_indices(func::Func, u, return integral end -function integrate_via_indices(func::Func, _u, +function integrate_via_indices(func::Func, u, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, dg::DGSEM, cache, args...; normalize = true) where {Func} - # TODO GPU AnalysiCallback currently lives on CPU - backend = trixi_backend(_u) - if backend isa Nothing # TODO GPU KA CPU backend - @unpack weights = dg.basis - @unpack inverse_jacobian = cache.elements - u = _u - else - weights = Array(dg.basis.weights) - inverse_jacobian = Array(cache.elements.inverse_jacobian) - u = Array(_u) - end + return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache, + args...; normalize = normalize) +end + +function integrate_via_indices(func::Func, ::Nothing, u, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}, + equations, dg::DGSEM, cache, + args...; normalize = true) where {Func} + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements # Initialize integral with zeros of the right shape integral = zero(func(u, 1, 1, 1, 1, equations, dg, args...)) @@ -429,6 +426,51 @@ function integrate_via_indices(func::Func, _u, return integral end +function integrate_via_indices(func::Func, backend::Backend, u, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}, + equations, dg::DGSEM, cache, + args...; normalize = true) where {Func} + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements + + function local_plus((integral, total_volume), (integral_element, volume_element)) + 
return (integral + integral_element, total_volume + volume_element) + end + + # `func(u, 1,1,1,1, equations, dg, args...)` might access GPU memory + # so we have to rely on the compiler to correctly infer the type of the integral here. + # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU. + integral0 = zero(Base.promote_op(func, typeof(u), Int, Int, Int, Int, + typeof(equations), typeof(dg), + map(typeof, args)...)) + init = neutral = (integral0, zero(real(mesh))) + + # Use quadrature to numerically integrate over entire domain + num_elements = nelements(dg, cache) + integral, total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, + backend; init, + neutral) do element + # Initialize integral with zeros of the right shape + local_integral, local_total_volume = neutral + + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + volume_jacobian = abs(inv(inverse_jacobian[i, j, k, element])) + local_integral += volume_jacobian * weights[i] * weights[j] * weights[k] * + func(u, i, j, k, element, equations, dg, args...) 
+ local_total_volume += volume_jacobian * weights[i] * weights[j] * weights[k] + end + return local_integral, local_total_volume + end + + # Normalize with total volume + if normalize + integral = integral / total_volume + end + + return integral +end + function integrate(func::Func, u, mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, @@ -460,18 +502,10 @@ function integrate(func::Func, u, end end -function analyze(::typeof(entropy_timederivative), _du, u, t, +function analyze(::typeof(entropy_timederivative), du, u, t, mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, dg::Union{DGSEM, FDSBP}, cache) - # TODO GPU AnalysiCallback currently lives on CPU - backend = trixi_backend(u) - if backend isa Nothing # TODO GPU KA CPU backend - du = _du - else - du = Array(_du) - end - # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ integrate_via_indices(u, mesh, equations, dg, cache, du) do u, i, j, k, element, equations, dg, du diff --git a/src/callbacks_step/stepsize.jl b/src/callbacks_step/stepsize.jl index ca8d7130f84..9c2b9020cff 100644 --- a/src/callbacks_step/stepsize.jl +++ b/src/callbacks_step/stepsize.jl @@ -205,31 +205,25 @@ function calc_max_scaled_speed(backend::Nothing, u, mesh, constant_speed, equati return max_scaled_speed end -function calc_max_scaled_speed(backend::Backend, u, mesh, constant_speed, equations, dg, - cache) +function calc_max_scaled_speed(backend::Backend, u, ::MeshT, constant_speed, equations, + dg, + cache) where {MeshT} @unpack contravariant_vectors, inverse_jacobian = cache.elements num_elements = nelements(dg, cache) - max_scaled_speeds = allocate(backend, eltype(u), num_elements) - - kernel! 
= max_scaled_speed_KAkernel!(backend) - kernel!(max_scaled_speeds, u, typeof(mesh), constant_speed, equations, dg, - contravariant_vectors, - inverse_jacobian; - ndrange = num_elements) - - return maximum(max_scaled_speeds) -end + init = neutral = AcceleratedKernels.neutral_element(Base.max, eltype(u)) + + # Provide a custom neutral and init element since we "reduce" over 1:num_elements + max_scaled_speed = AcceleratedKernels.mapreduce(Base.max, 1:num_elements, backend; + init, neutral) do element + max_scaled_speed_per_element(u, MeshT, constant_speed, + equations, dg, + contravariant_vectors, + inverse_jacobian, + element) + end -@kernel function max_scaled_speed_KAkernel!(max_scaled_speeds, u, MeshT, constant_speed, - equations, - dg, contravariant_vectors, inverse_jacobian) - element = @index(Global) - max_scaled_speeds[element] = max_scaled_speed_per_element(u, MeshT, constant_speed, - equations, dg, - contravariant_vectors, - inverse_jacobian, - element) + return max_scaled_speed end include("stepsize_dg1d.jl")