From b7039db906488327689a30fd8190b927df0a8da7 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Wed, 25 Mar 2026 10:33:16 +0100
Subject: [PATCH 1/9] use mapreduce from AcceleratedKernels in
 calc_max_scaled_speed

---
 Project.toml                   |  2 ++
 src/Trixi.jl                   |  1 +
 src/callbacks_step/stepsize.jl | 29 ++++++++++++-----------------
 3 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/Project.toml b/Project.toml
index 2564c2deca5..a339638658b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,6 +4,7 @@ version = "0.16.1-DEV"
 authors = ["Michael Schlottke-Lakemper <michael.schlottke-lakemper@uni-a.de>", "Gregor Gassner <ggassner@uni-koeln.de>", "Hendrik Ranocha <mail@ranocha.de>", "Andrew R. Winters <andrew.ross.winters@liu.se>", "Jesse Chan <jesse.chan@rice.edu>", "Andrés Rueda-Ramírez <am.rueda@upm.es>"]
 
 [deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
@@ -70,6 +71,7 @@ TrixiPlotsExt = "Plots"
 TrixiSparseConnectivityTracerExt = "SparseConnectivityTracer"
 
 [compat]
+AcceleratedKernels = "0.4.3"
 Accessors = "0.1.42"
 Adapt = "4.3"
 CUDA = "5.8.2"
diff --git a/src/Trixi.jl b/src/Trixi.jl
index 42340659520..fabd23fc3cd 100644
--- a/src/Trixi.jl
+++ b/src/Trixi.jl
@@ -62,6 +62,7 @@ using ForwardDiff: ForwardDiff
 using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace
 using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend,
                           allocate
+using AcceleratedKernels: AcceleratedKernels
 using LinearMaps: LinearMap
 if _PREFERENCE_LOOPVECTORIZATION
     using LoopVectorization: LoopVectorization, @turbo, indices
diff --git a/src/callbacks_step/stepsize.jl b/src/callbacks_step/stepsize.jl
index aca3098af22..a642b6ee918 100644
--- a/src/callbacks_step/stepsize.jl
+++ b/src/callbacks_step/stepsize.jl
@@ -205,28 +205,23 @@ end
 function calc_max_scaled_speed(backend::Backend, u, mesh, constant_speed, equations, dg,
                                cache)
     @unpack contravariant_vectors, inverse_jacobian = cache.elements
+    meshT = typeof(mesh)
 
     num_elements = nelements(dg, cache)
-    max_scaled_speeds = allocate(backend, eltype(u), num_elements)
+    init = neutral = AcceleratedKernels.neutral_element(Base.max, eltype(u))
 
-    kernel! = max_scaled_speed_KAkernel!(backend)
-    kernel!(max_scaled_speeds, u, typeof(mesh), constant_speed, equations, dg,
-            contravariant_vectors,
-            inverse_jacobian;
-            ndrange = num_elements)
+    # Provide a custom neutral and init element since we "reduce" over 1:num_elements
+    max_scaled_speed =
+        AcceleratedKernels.mapreduce(Base.max, 1:num_elements, backend; init, neutral) do element
 
-    return maximum(max_scaled_speeds)
-end
+        max_scaled_speed_per_element(u, meshT, constant_speed,
+                                     equations, dg,
+                                     contravariant_vectors,
+                                     inverse_jacobian,
+                                     element)
+    end
 
-@kernel function max_scaled_speed_KAkernel!(max_scaled_speeds, u, MeshT, constant_speed,
-                                            equations,
-                                            dg, contravariant_vectors, inverse_jacobian)
-    element = @index(Global)
-    max_scaled_speeds[element] = max_scaled_speed_per_element(u, MeshT, constant_speed,
-                                                              equations, dg,
-                                                              contravariant_vectors,
-                                                              inverse_jacobian,
-                                                              element)
+    return max_scaled_speed
 end
 
 include("stepsize_dg1d.jl")

From c095b471bce622d7d6d90973315f68b82af6b1ac Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Wed, 25 Mar 2026 11:31:02 +0100
Subject: [PATCH 2/9] convince Julia to not store MeshT in a closure

---
 src/callbacks_step/stepsize.jl | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/callbacks_step/stepsize.jl b/src/callbacks_step/stepsize.jl
index a642b6ee918..b6eb403b543 100644
--- a/src/callbacks_step/stepsize.jl
+++ b/src/callbacks_step/stepsize.jl
@@ -202,19 +202,17 @@ function calc_max_scaled_speed(backend::Nothing, u, mesh, constant_speed, equati
     return max_scaled_speed
 end
 
-function calc_max_scaled_speed(backend::Backend, u, mesh, constant_speed, equations, dg,
-                               cache)
+function calc_max_scaled_speed(backend::Backend, u, ::MeshT, constant_speed, equations, dg,
+                        cache) where {MeshT}
     @unpack contravariant_vectors, inverse_jacobian = cache.elements
-    meshT = typeof(mesh)
 
     num_elements = nelements(dg, cache)
     init = neutral = AcceleratedKernels.neutral_element(Base.max, eltype(u))
 
     # Provide a custom neutral and init element since we "reduce" over 1:num_elements
-    max_scaled_speed =
-        AcceleratedKernels.mapreduce(Base.max, 1:num_elements, backend; init, neutral) do element
-
-        max_scaled_speed_per_element(u, meshT, constant_speed,
+    max_scaled_speed = AcceleratedKernels.mapreduce(Base.max, 1:num_elements, backend;
+                                                    init, neutral) do element
+        max_scaled_speed_per_element(u, MeshT, constant_speed,
                                      equations, dg,
                                      contravariant_vectors,
                                      inverse_jacobian,

From 87c40b382c28e1fb0cd2f92dbce2b9d44aa1a8ad Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Wed, 25 Mar 2026 11:35:29 +0100
Subject: [PATCH 3/9] Update src/callbacks_step/stepsize.jl

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 src/callbacks_step/stepsize.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/callbacks_step/stepsize.jl b/src/callbacks_step/stepsize.jl
index b6eb403b543..bc4140eff4c 100644
--- a/src/callbacks_step/stepsize.jl
+++ b/src/callbacks_step/stepsize.jl
@@ -202,8 +202,9 @@ function calc_max_scaled_speed(backend::Nothing, u, mesh, constant_speed, equati
     return max_scaled_speed
 end
 
-function calc_max_scaled_speed(backend::Backend, u, ::MeshT, constant_speed, equations, dg,
-                        cache) where {MeshT}
+function calc_max_scaled_speed(backend::Backend, u, ::MeshT, constant_speed, equations,
+                               dg,
+                               cache) where {MeshT}
     @unpack contravariant_vectors, inverse_jacobian = cache.elements
 
     num_elements = nelements(dg, cache)

From 93b5391e19cf35adfa284911bdd369faa78b6c1f Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Wed, 25 Mar 2026 14:57:22 +0100
Subject: [PATCH 4/9] port integrate_via_indices to the GPU

---
 src/callbacks_step/analysis_dg2d.jl | 79 +++++++++++++++++++++--------
 src/callbacks_step/analysis_dg3d.jl | 74 +++++++++++++++++++--------
 2 files changed, 111 insertions(+), 42 deletions(-)

diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl
index 368b7f5e4cc..d3fa8873934 100644
--- a/src/callbacks_step/analysis_dg2d.jl
+++ b/src/callbacks_step/analysis_dg2d.jl
@@ -338,24 +338,25 @@ function integrate_via_indices(func::Func, u,
     return integral
 end
 
-function integrate_via_indices(func::Func, _u,
+function integrate_via_indices(func::Func, u,
                                mesh::Union{StructuredMesh{2}, StructuredMeshView{2},
                                            UnstructuredMesh2D,
                                            P4estMesh{2}, P4estMeshView{2},
                                            T8codeMesh{2}},
                                equations, dg::DGSEM, cache,
                                args...; normalize = true) where {Func}
-    # TODO GPU AnalysiCallback currently lives on CPU
-    backend = trixi_backend(_u)
-    if backend isa Nothing # TODO GPU KA CPU backend
-        @unpack weights = dg.basis
-        @unpack inverse_jacobian = cache.elements
-        u = _u
-    else
-        weights = Array(dg.basis.weights)
-        inverse_jacobian = Array(cache.elements.inverse_jacobian)
-        u = Array(_u)
-    end
+    return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache, args...; normalize = normalize)
+end
+
+function integrate_via_indices(func::Func, ::Nothing, u,
+                               mesh::Union{StructuredMesh{2}, StructuredMeshView{2},
+                                           UnstructuredMesh2D,
+                                           P4estMesh{2}, P4estMeshView{2},
+                                           T8codeMesh{2}},
+                               equations, dg::DGSEM, cache,
+                               args...; normalize = true) where {Func}
+    @unpack weights = dg.basis
+    @unpack inverse_jacobian = cache.elements
 
     # Initialize integral with zeros of the right shape
     integral = zero(func(u, 1, 1, 1, equations, dg, args...))
@@ -380,6 +381,50 @@ function integrate_via_indices(func::Func, _u,
     return integral
 end
 
+function integrate_via_indices(func::Func, backend::Backend, u,
+                               mesh::Union{StructuredMesh{2}, StructuredMeshView{2},
+                                           UnstructuredMesh2D,
+                                           P4estMesh{2}, P4estMeshView{2},
+                                           T8codeMesh{2}},
+                               equations, dg::DGSEM, cache,
+                               args...; normalize = true) where {Func}
+    @unpack weights = dg.basis
+    @unpack inverse_jacobian = cache.elements
+
+    function local_plus((integral, total_volume), (integral_element, volume_element))
+        return (integral + integral_element, total_volume + volume_element)
+    end
+
+    # `func(u, 1,1,1, equations, dg, args...)` might access GPU memory
+    # so we have to rely on the compiler to correctly infer the type of the integral here.
+    # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU.
+    integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, typeof(equations), typeof(dg), map(typeof, args)...))
+    init = neutral = (integral₀, zero(real(mesh)))
+
+    # Use quadrature to numerically integrate over entire domain
+    num_elements = nelements(dg, cache)
+    _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, backend; init, neutral) do element
+        # Initialize integral with zeros of the right shape
+        integral, total_volume = neutral
+
+        for j in eachnode(dg), i in eachnode(dg)
+            volume_jacobian = abs(inv(inverse_jacobian[i, j, element]))
+            integral += volume_jacobian * weights[i] * weights[j] *
+                        func(u, i, j, element, equations, dg, args...)
+            total_volume += volume_jacobian * weights[i] * weights[j]
+        end
+
+        return (integral, total_volume)
+    end
+
+    # Normalize with total volume
+    if normalize
+        _integral = _integral / _total_volume
+    end
+
+    return _integral
+end
+
 function integrate(func::Func, u,
                    mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2},
                                UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2},
@@ -410,18 +455,10 @@ function integrate(func::Func, u,
     end
 end
 
-function analyze(::typeof(entropy_timederivative), _du, u, t,
+function analyze(::typeof(entropy_timederivative), du, u, t,
                  mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2},
                              UnstructuredMesh2D, P4estMesh{2}, T8codeMesh{2}},
                  equations, dg::Union{DGSEM, FDSBP}, cache)
-    # TODO GPU AnalysiCallback currently lives on CPU
-    backend = trixi_backend(u)
-    if backend isa Nothing # TODO GPU KA CPU backend
-        du = _du
-    else
-        du = Array(_du)
-    end
-
     # Calculate
     # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ
     integrate_via_indices(u, mesh, equations, dg, cache,
diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl
index 7708a32e6ba..67fd46f9e46 100644
--- a/src/callbacks_step/analysis_dg3d.jl
+++ b/src/callbacks_step/analysis_dg3d.jl
@@ -389,22 +389,21 @@ function integrate_via_indices(func::Func, u,
     return integral
 end
 
-function integrate_via_indices(func::Func, _u,
+function integrate_via_indices(func::Func, u,
                                mesh::Union{StructuredMesh{3}, P4estMesh{3},
                                            T8codeMesh{3}},
                                equations, dg::DGSEM, cache,
                                args...; normalize = true) where {Func}
-    # TODO GPU AnalysiCallback currently lives on CPU
-    backend = trixi_backend(_u)
-    if backend isa Nothing # TODO GPU KA CPU backend
-        @unpack weights = dg.basis
-        @unpack inverse_jacobian = cache.elements
-        u = _u
-    else
-        weights = Array(dg.basis.weights)
-        inverse_jacobian = Array(cache.elements.inverse_jacobian)
-        u = Array(_u)
-    end
+    return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache, args...; normalize = normalize)
+end
+
+function integrate_via_indices(func::Func, ::Nothing, u,
+                               mesh::Union{StructuredMesh{3}, P4estMesh{3},
+                                           T8codeMesh{3}},
+                               equations, dg::DGSEM, cache,
+                               args...; normalize = true) where {Func}
+    @unpack weights = dg.basis
+    @unpack inverse_jacobian = cache.elements
 
     # Initialize integral with zeros of the right shape
     integral = zero(func(u, 1, 1, 1, 1, equations, dg, args...))
@@ -429,6 +428,47 @@ function integrate_via_indices(func::Func, _u,
     return integral
 end
 
+function integrate_via_indices(func::Func, backend::Backend, u,
+                               mesh::Union{StructuredMesh{3}, P4estMesh{3},
+                                           T8codeMesh{3}},
+                               equations, dg::DGSEM, cache,
+                               args...; normalize = true) where {Func}
+    @unpack weights = dg.basis
+    @unpack inverse_jacobian = cache.elements
+
+    function local_plus((integral, total_volume), (integral_element, volume_element))
+        return (integral + integral_element, total_volume + volume_element)
+    end
+
+    # `func(u, 1,1,1,1, equations, dg, args...)` might access GPU memory
+    # so we have to rely on the compiler to correctly infer the type of the integral here.
+    # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU.
+    integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, Int, typeof(equations), typeof(dg), map(typeof, args)...))
+    init = neutral = (integral₀, zero(real(mesh)))
+
+    # Use quadrature to numerically integrate over entire domain
+    num_elements = nelements(dg, cache)
+    _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, backend; init, neutral) do element
+        # Initialize integral with zeros of the right shape
+        integral, total_volume = neutral
+
+        for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            volume_jacobian = abs(inv(inverse_jacobian[i, j, k, element]))
+            integral += volume_jacobian * weights[i] * weights[j] * weights[k] *
+                        func(u, i, j, k, element, equations, dg, args...)
+            total_volume += volume_jacobian * weights[i] * weights[j] * weights[k]
+        end
+        return integral, total_volume
+    end
+
+    # Normalize with total volume
+    if normalize
+        _integral = _integral / _total_volume
+    end
+
+    return _integral
+end
+
 function integrate(func::Func, u,
                    mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3},
                                T8codeMesh{3}},
@@ -460,18 +500,10 @@ function integrate(func::Func, u,
     end
 end
 
-function analyze(::typeof(entropy_timederivative), _du, u, t,
+function analyze(::typeof(entropy_timederivative), du, u, t,
                  mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3},
                              T8codeMesh{3}},
                  equations, dg::Union{DGSEM, FDSBP}, cache)
-    # TODO GPU AnalysiCallback currently lives on CPU
-    backend = trixi_backend(u)
-    if backend isa Nothing # TODO GPU KA CPU backend
-        du = _du
-    else
-        du = Array(_du)
-    end
-
     # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ
     integrate_via_indices(u, mesh, equations, dg, cache,
                           du) do u, i, j, k, element, equations, dg, du

From 8b61f8106b09e37605a45ac698f3be2627b0c474 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Wed, 25 Mar 2026 15:06:46 +0100
Subject: [PATCH 5/9] Apply suggestions from code review

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 src/callbacks_step/analysis_dg2d.jl | 10 +++++++---
 src/callbacks_step/analysis_dg3d.jl | 11 ++++++++---
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl
index d3fa8873934..2a0a1b7288a 100644
--- a/src/callbacks_step/analysis_dg2d.jl
+++ b/src/callbacks_step/analysis_dg2d.jl
@@ -345,7 +345,8 @@ function integrate_via_indices(func::Func, u,
                                            T8codeMesh{2}},
                                equations, dg::DGSEM, cache,
                                args...; normalize = true) where {Func}
-    return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache, args...; normalize = normalize)
+    return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache,
+                                 args...; normalize = normalize)
 end
 
 function integrate_via_indices(func::Func, ::Nothing, u,
@@ -398,12 +399,15 @@ function integrate_via_indices(func::Func, backend::Backend, u,
     # `func(u, 1,1,1, equations, dg, args...)` might access GPU memory
     # so we have to rely on the compiler to correctly infer the type of the integral here.
     # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU.
-    integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, typeof(equations), typeof(dg), map(typeof, args)...))
+    integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, typeof(equations),
+                                     typeof(dg), map(typeof, args)...))
     init = neutral = (integral₀, zero(real(mesh)))
 
     # Use quadrature to numerically integrate over entire domain
     num_elements = nelements(dg, cache)
-    _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, backend; init, neutral) do element
+    _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements,
+                                                            backend; init,
+                                                            neutral) do element
         # Initialize integral with zeros of the right shape
         integral, total_volume = neutral
 
diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl
index 67fd46f9e46..1aa8693f310 100644
--- a/src/callbacks_step/analysis_dg3d.jl
+++ b/src/callbacks_step/analysis_dg3d.jl
@@ -394,7 +394,8 @@ function integrate_via_indices(func::Func, u,
                                            T8codeMesh{3}},
                                equations, dg::DGSEM, cache,
                                args...; normalize = true) where {Func}
-    return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache, args...; normalize = normalize)
+    return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache,
+                                 args...; normalize = normalize)
 end
 
 function integrate_via_indices(func::Func, ::Nothing, u,
@@ -443,12 +444,16 @@ function integrate_via_indices(func::Func, backend::Backend, u,
     # `func(u, 1,1,1,1, equations, dg, args...)` might access GPU memory
     # so we have to rely on the compiler to correctly infer the type of the integral here.
     # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU.
-    integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, Int, typeof(equations), typeof(dg), map(typeof, args)...))
+    integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, Int,
+                                     typeof(equations), typeof(dg),
+                                     map(typeof, args)...))
     init = neutral = (integral₀, zero(real(mesh)))
 
     # Use quadrature to numerically integrate over entire domain
     num_elements = nelements(dg, cache)
-    _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, backend; init, neutral) do element
+    _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements,
+                                                            backend; init,
+                                                            neutral) do element
         # Initialize integral with zeros of the right shape
         integral, total_volume = neutral
 

From ebefe89111dc72690d2138e8c49f8e1ef91f6a2b Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Fri, 27 Mar 2026 08:29:52 +0100
Subject: [PATCH 6/9] remove allocate import

---
 src/Trixi.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/Trixi.jl b/src/Trixi.jl
index fabd23fc3cd..e421c18afdf 100644
--- a/src/Trixi.jl
+++ b/src/Trixi.jl
@@ -60,8 +60,7 @@ using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect
 using FillArrays: Ones, Zeros
 using ForwardDiff: ForwardDiff
 using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace
-using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend,
-                          allocate
+using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend
 using AcceleratedKernels: AcceleratedKernels
 using LinearMaps: LinearMap
 if _PREFERENCE_LOOPVECTORIZATION

From 46bf9443edc9f7aab613e2e1eab8449490413e05 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sun, 29 Mar 2026 17:44:59 +0200
Subject: [PATCH 7/9] cleanup

---
 src/callbacks_step/analysis_dg2d.jl | 39 +++++++++++++----------------
 src/callbacks_step/analysis_dg3d.jl | 37 +++++++++++++--------------
 2 files changed, 35 insertions(+), 41 deletions(-)

diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl
index 2a0a1b7288a..712ca4bf488 100644
--- a/src/callbacks_step/analysis_dg2d.jl
+++ b/src/callbacks_step/analysis_dg2d.jl
@@ -138,7 +138,7 @@ function calc_error_norms(func, u, t, analyzer,
     return l2_error, linf_error
 end
 
-function calc_error_norms(func, _u, t, analyzer,
+function calc_error_norms(func, u, t, analyzer,
                           mesh::Union{StructuredMesh{2}, StructuredMeshView{2},
                                       UnstructuredMesh2D,
                                       P4estMesh{2}, P4estMeshView{2},
@@ -147,16 +147,13 @@ function calc_error_norms(func, _u, t, analyzer,
                           initial_condition, dg::DGSEM, cache, cache_analysis)
     @unpack vandermonde, weights = analyzer
     @unpack u_local, u_tmp1, x_local, x_tmp1, jacobian_local, jacobian_tmp1 = cache_analysis
+    @unpack node_coordinates, inverse_jacobian = cache.elements
 
-    # TODO GPU AnalysisCallback currently lives on CPU
-    backend = trixi_backend(_u)
-    if backend isa Nothing # TODO GPU KA CPU backend
-        @unpack node_coordinates, inverse_jacobian = cache.elements
-        u = _u
-    else
-        node_coordinates = Array(cache.elements.node_coordinates)
-        inverse_jacobian = Array(cache.elements.inverse_jacobian)
-        u = Array(_u)
+    # Calculate error norms on the CPU, to ensure the oder of summation is the same.
+    if trixi_backend(u) !== nothing
+        node_coordinates = Array(node_coordinates)
+        inverse_jacobian = Array(inverse_jacobian)
+        u = Array(u)
     end
 
     # Set up data structures
@@ -399,34 +396,34 @@ function integrate_via_indices(func::Func, backend::Backend, u,
     # `func(u, 1,1,1, equations, dg, args...)` might access GPU memory
     # so we have to rely on the compiler to correctly infer the type of the integral here.
     # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU.
-    integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, typeof(equations),
+    integral0 = zero(Base.promote_op(func, typeof(u), Int, Int, Int, typeof(equations),
                                      typeof(dg), map(typeof, args)...))
-    init = neutral = (integral₀, zero(real(mesh)))
+    init = neutral = (integral0, zero(real(mesh)))
 
     # Use quadrature to numerically integrate over entire domain
     num_elements = nelements(dg, cache)
-    _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements,
+    integral, total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements,
                                                             backend; init,
                                                             neutral) do element
-        # Initialize integral with zeros of the right shape
-        integral, total_volume = neutral
+        # Initialize integral with zeros of the right shapeu
+        local_integral, local_total_volume = neutral
 
         for j in eachnode(dg), i in eachnode(dg)
             volume_jacobian = abs(inv(inverse_jacobian[i, j, element]))
-            integral += volume_jacobian * weights[i] * weights[j] *
-                        func(u, i, j, element, equations, dg, args...)
-            total_volume += volume_jacobian * weights[i] * weights[j]
+            local_integral += volume_jacobian * weights[i] * weights[j] *
+                              func(u, i, j, element, equations, dg, args...)
+            local_total_volume += volume_jacobian * weights[i] * weights[j]
         end
 
-        return (integral, total_volume)
+        return (local_integral, local_total_volume)
     end
 
     # Normalize with total volume
     if normalize
-        _integral = _integral / _total_volume
+        integral = integral / total_volume
     end
 
-    return _integral
+    return integral
 end
 
 function integrate(func::Func, u,
diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl
index 1aa8693f310..2cc7cf6a1fe 100644
--- a/src/callbacks_step/analysis_dg3d.jl
+++ b/src/callbacks_step/analysis_dg3d.jl
@@ -161,22 +161,19 @@ function calc_error_norms(func, u, t, analyzer,
     return l2_error, linf_error
 end
 
-function calc_error_norms(func, _u, t, analyzer,
+function calc_error_norms(func, u, t, analyzer,
                           mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}},
                           equations, initial_condition,
                           dg::DGSEM, cache, cache_analysis)
     @unpack vandermonde, weights = analyzer
     @unpack u_local, u_tmp1, u_tmp2, x_local, x_tmp1, x_tmp2, jacobian_local, jacobian_tmp1, jacobian_tmp2 = cache_analysis
+    @unpack node_coordinates, inverse_jacobian = cache.elements
 
-    # TODO GPU AnalysisCallback currently lives on CPU
-    backend = trixi_backend(_u)
-    if backend isa Nothing # TODO GPU KA CPU backend
-        @unpack node_coordinates, inverse_jacobian = cache.elements
-        u = _u
-    else
-        node_coordinates = Array(cache.elements.node_coordinates)
-        inverse_jacobian = Array(cache.elements.inverse_jacobian)
-        u = Array(_u)
+    # Calculate error norms on the CPU, to ensure the oder of summation is the same.
+    if trixi_backend(u) !== nothing
+        node_coordinates = Array(node_coordinates)
+        inverse_jacobian = Array(inverse_jacobian)
+        u = Array(u)
     end
 
     # Set up data structures
@@ -444,34 +441,34 @@ function integrate_via_indices(func::Func, backend::Backend, u,
     # `func(u, 1,1,1,1, equations, dg, args...)` might access GPU memory
     # so we have to rely on the compiler to correctly infer the type of the integral here.
     # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU.
-    integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, Int,
+    integral0 = zero(Base.promote_op(func, typeof(u), Int, Int, Int, Int,
                                      typeof(equations), typeof(dg),
                                      map(typeof, args)...))
-    init = neutral = (integral₀, zero(real(mesh)))
+    init = neutral = (integral0, zero(real(mesh)))
 
     # Use quadrature to numerically integrate over entire domain
     num_elements = nelements(dg, cache)
-    _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements,
+    integral, total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements,
                                                             backend; init,
                                                             neutral) do element
         # Initialize integral with zeros of the right shape
-        integral, total_volume = neutral
+        local_integral, local_total_volume = neutral
 
         for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
             volume_jacobian = abs(inv(inverse_jacobian[i, j, k, element]))
-            integral += volume_jacobian * weights[i] * weights[j] * weights[k] *
-                        func(u, i, j, k, element, equations, dg, args...)
-            total_volume += volume_jacobian * weights[i] * weights[j] * weights[k]
+            local_integral += volume_jacobian * weights[i] * weights[j] * weights[k] *
+                              func(u, i, j, k, element, equations, dg, args...)
+            local_total_volume += volume_jacobian * weights[i] * weights[j] * weights[k]
         end
-        return integral, total_volume
+        return local_integral, local_total_volume
     end
 
     # Normalize with total volume
     if normalize
-        _integral = _integral / _total_volume
+        integral = integral / total_volume
     end
 
-    return _integral
+    return integral
 end
 
 function integrate(func::Func, u,

From f2cfe81be288d577028f583aae61043ec044a126 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Sun, 29 Mar 2026 17:45:57 +0200
Subject: [PATCH 8/9] Apply suggestions from code review

Co-authored-by: Valentin Churavy <v.churavy@gmail.com>
---
 src/callbacks_step/analysis_dg2d.jl | 2 +-
 src/callbacks_step/analysis_dg3d.jl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl
index 712ca4bf488..fd55f2536a7 100644
--- a/src/callbacks_step/analysis_dg2d.jl
+++ b/src/callbacks_step/analysis_dg2d.jl
@@ -149,7 +149,7 @@ function calc_error_norms(func, u, t, analyzer,
     @unpack u_local, u_tmp1, x_local, x_tmp1, jacobian_local, jacobian_tmp1 = cache_analysis
     @unpack node_coordinates, inverse_jacobian = cache.elements
 
-    # Calculate error norms on the CPU, to ensure the oder of summation is the same.
+    # Calculate error norms on the CPU, to ensure the order of summation is the same.
     if trixi_backend(u) !== nothing
         node_coordinates = Array(node_coordinates)
         inverse_jacobian = Array(inverse_jacobian)
diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl
index 2cc7cf6a1fe..56727c5d546 100644
--- a/src/callbacks_step/analysis_dg3d.jl
+++ b/src/callbacks_step/analysis_dg3d.jl
@@ -169,7 +169,7 @@ function calc_error_norms(func, u, t, analyzer,
     @unpack u_local, u_tmp1, u_tmp2, x_local, x_tmp1, x_tmp2, jacobian_local, jacobian_tmp1, jacobian_tmp2 = cache_analysis
     @unpack node_coordinates, inverse_jacobian = cache.elements
 
-    # Calculate error norms on the CPU, to ensure the oder of summation is the same.
+    # Calculate error norms on the CPU, to ensure the order of summation is the same.
     if trixi_backend(u) !== nothing
         node_coordinates = Array(node_coordinates)
         inverse_jacobian = Array(inverse_jacobian)

From c9c4898c96588bfc759b7520f675cd6b847e3824 Mon Sep 17 00:00:00 2001
From: Benedict Geihe <bgeihe@uni-koeln.de>
Date: Tue, 31 Mar 2026 16:03:39 +0200
Subject: [PATCH 9/9] fmt

---
 src/callbacks_step/analysis_dg2d.jl | 4 ++--
 src/callbacks_step/analysis_dg3d.jl | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl
index fd55f2536a7..976954d8359 100644
--- a/src/callbacks_step/analysis_dg2d.jl
+++ b/src/callbacks_step/analysis_dg2d.jl
@@ -403,8 +403,8 @@ function integrate_via_indices(func::Func, backend::Backend, u,
     # Use quadrature to numerically integrate over entire domain
     num_elements = nelements(dg, cache)
     integral, total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements,
-                                                            backend; init,
-                                                            neutral) do element
+                                                          backend; init,
+                                                          neutral) do element
         # Initialize integral with zeros of the right shapeu
         local_integral, local_total_volume = neutral
 
diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl
index 56727c5d546..65abd91b827 100644
--- a/src/callbacks_step/analysis_dg3d.jl
+++ b/src/callbacks_step/analysis_dg3d.jl
@@ -449,8 +449,8 @@ function integrate_via_indices(func::Func, backend::Backend, u,
     # Use quadrature to numerically integrate over entire domain
     num_elements = nelements(dg, cache)
     integral, total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements,
-                                                            backend; init,
-                                                            neutral) do element
+                                                          backend; init,
+                                                          neutral) do element
         # Initialize integral with zeros of the right shape
         local_integral, local_total_volume = neutral