From b7039db906488327689a30fd8190b927df0a8da7 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 25 Mar 2026 10:33:16 +0100 Subject: [PATCH 1/9] use mapreduce from AcceleratedKernels in calc_max_scaled_speed --- Project.toml | 2 ++ src/Trixi.jl | 1 + src/callbacks_step/stepsize.jl | 29 ++++++++++++----------------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/Project.toml b/Project.toml index 2564c2deca5..a339638658b 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ version = "0.16.1-DEV" authors = ["Michael Schlottke-Lakemper ", "Gregor Gassner ", "Hendrik Ranocha ", "Andrew R. Winters ", "Jesse Chan ", "Andrés Rueda-Ramírez "] [deps] +AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" @@ -70,6 +71,7 @@ TrixiPlotsExt = "Plots" TrixiSparseConnectivityTracerExt = "SparseConnectivityTracer" [compat] +AcceleratedKernels = "0.4.3" Accessors = "0.1.42" Adapt = "4.3" CUDA = "5.8.2" diff --git a/src/Trixi.jl b/src/Trixi.jl index 42340659520..fabd23fc3cd 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -62,6 +62,7 @@ using ForwardDiff: ForwardDiff using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend, allocate +using AcceleratedKernels: AcceleratedKernels using LinearMaps: LinearMap if _PREFERENCE_LOOPVECTORIZATION using LoopVectorization: LoopVectorization, @turbo, indices diff --git a/src/callbacks_step/stepsize.jl b/src/callbacks_step/stepsize.jl index aca3098af22..a642b6ee918 100644 --- a/src/callbacks_step/stepsize.jl +++ b/src/callbacks_step/stepsize.jl @@ -205,28 +205,23 @@ end function calc_max_scaled_speed(backend::Backend, u, mesh, constant_speed, equations, dg, cache) @unpack contravariant_vectors, inverse_jacobian = cache.elements + meshT = typeof(mesh) num_elements = nelements(dg, cache) - max_scaled_speeds = allocate(backend, eltype(u), num_elements) + init = neutral = AcceleratedKernels.neutral_element(Base.max, eltype(u)) - kernel! = max_scaled_speed_KAkernel!(backend) - kernel!(max_scaled_speeds, u, typeof(mesh), constant_speed, equations, dg, - contravariant_vectors, - inverse_jacobian; - ndrange = num_elements) + # Provide a custom neutral and init element since we "reduce" over 1:num_elements + max_scaled_speed = + AcceleratedKernels.mapreduce(Base.max, 1:num_elements, backend; init, neutral) do element - return maximum(max_scaled_speeds) -end + max_scaled_speed_per_element(u, meshT, constant_speed, + equations, dg, + contravariant_vectors, + inverse_jacobian, + element) + end -@kernel function max_scaled_speed_KAkernel!(max_scaled_speeds, u, MeshT, constant_speed, - equations, - dg, contravariant_vectors, inverse_jacobian) - element = @index(Global) - max_scaled_speeds[element] = max_scaled_speed_per_element(u, MeshT, constant_speed, - equations, dg, - contravariant_vectors, - inverse_jacobian, - element) + return max_scaled_speed end include("stepsize_dg1d.jl") From c095b471bce622d7d6d90973315f68b82af6b1ac Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 25 Mar 2026 11:31:02 +0100 Subject: [PATCH 2/9] convince Julia to not store MeshT in a closure --- src/callbacks_step/stepsize.jl | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/callbacks_step/stepsize.jl b/src/callbacks_step/stepsize.jl index a642b6ee918..b6eb403b543 100644 --- a/src/callbacks_step/stepsize.jl +++ b/src/callbacks_step/stepsize.jl @@ -202,19 +202,17 @@ function calc_max_scaled_speed(backend::Nothing, u, mesh, constant_speed, equati return max_scaled_speed end -function calc_max_scaled_speed(backend::Backend, u, mesh, constant_speed, equations, dg, - cache) +function calc_max_scaled_speed(backend::Backend, u, ::MeshT, constant_speed, equations, dg, + cache) where {MeshT} @unpack contravariant_vectors, inverse_jacobian = cache.elements - meshT = typeof(mesh) num_elements = nelements(dg, cache) init = neutral = AcceleratedKernels.neutral_element(Base.max, eltype(u)) # Provide a custom neutral and init element since we "reduce" over 1:num_elements - max_scaled_speed = - AcceleratedKernels.mapreduce(Base.max, 1:num_elements, backend; init, neutral) do element - - max_scaled_speed_per_element(u, meshT, constant_speed, + max_scaled_speed = AcceleratedKernels.mapreduce(Base.max, 1:num_elements, backend; + init, neutral) do element + max_scaled_speed_per_element(u, MeshT, constant_speed, equations, dg, contravariant_vectors, inverse_jacobian, From 87c40b382c28e1fb0cd2f92dbce2b9d44aa1a8ad Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 25 Mar 2026 11:35:29 +0100 Subject: [PATCH 3/9] Update src/callbacks_step/stepsize.jl Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- src/callbacks_step/stepsize.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/callbacks_step/stepsize.jl b/src/callbacks_step/stepsize.jl index b6eb403b543..bc4140eff4c 100644 --- a/src/callbacks_step/stepsize.jl +++ b/src/callbacks_step/stepsize.jl @@ -202,8 +202,9 @@ function calc_max_scaled_speed(backend::Nothing, u, mesh, constant_speed, equati return max_scaled_speed end -function calc_max_scaled_speed(backend::Backend, u, ::MeshT, constant_speed, equations, dg, - cache) where {MeshT} +function calc_max_scaled_speed(backend::Backend, u, ::MeshT, constant_speed, equations, + dg, + cache) where {MeshT} @unpack contravariant_vectors, inverse_jacobian = cache.elements num_elements = nelements(dg, cache) From 93b5391e19cf35adfa284911bdd369faa78b6c1f Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 25 Mar 2026 14:57:22 +0100 Subject: [PATCH 4/9] port integrate_via_indices to the GPU --- src/callbacks_step/analysis_dg2d.jl | 79 +++++++++++++++++++++-------- src/callbacks_step/analysis_dg3d.jl | 74 +++++++++++++++++++-------- 2 files changed, 111 insertions(+), 42 deletions(-) diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl index 368b7f5e4cc..d3fa8873934 100644 --- a/src/callbacks_step/analysis_dg2d.jl +++ b/src/callbacks_step/analysis_dg2d.jl @@ -338,24 +338,25 @@ function integrate_via_indices(func::Func, u, return integral end -function integrate_via_indices(func::Func, _u, +function integrate_via_indices(func::Func, u, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, equations, dg::DGSEM, cache, args...; normalize = true) where {Func} - # TODO GPU AnalysiCallback currently lives on CPU - backend = trixi_backend(_u) - if backend isa Nothing # TODO GPU KA CPU backend - @unpack weights = dg.basis - @unpack inverse_jacobian = cache.elements - u = _u - else - weights = Array(dg.basis.weights) - inverse_jacobian = Array(cache.elements.inverse_jacobian) - u = Array(_u) - end + return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache, args...; normalize = normalize) +end + +function integrate_via_indices(func::Func, ::Nothing, u, + mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + equations, dg::DGSEM, cache, + args...; normalize = true) where {Func} + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements # Initialize integral with zeros of the right shape integral = zero(func(u, 1, 1, 1, equations, dg, args...)) @@ -380,6 +381,50 @@ function integrate_via_indices(func::Func, _u, return integral end +function integrate_via_indices(func::Func, backend::Backend, u, + mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + equations, dg::DGSEM, cache, + args...; normalize = true) where {Func} + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements + + function local_plus((integral, total_volume), (integral_element, volume_element)) + return (integral + integral_element, total_volume + volume_element) + end + + # `func(u, 1,1,1, equations, dg, args...)` might access GPU memory + # so we have to rely on the compiler to correctly infer the type of the integral here. + # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU. + integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, typeof(equations), typeof(dg), map(typeof, args)...)) + init = neutral = (integral₀, zero(real(mesh))) + + # Use quadrature to numerically integrate over entire domain + num_elements = nelements(dg, cache) + _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, backend; init, neutral) do element + # Initialize integral with zeros of the right shape + integral, total_volume = neutral + + for j in eachnode(dg), i in eachnode(dg) + volume_jacobian = abs(inv(inverse_jacobian[i, j, element])) + integral += volume_jacobian * weights[i] * weights[j] * + func(u, i, j, element, equations, dg, args...) + total_volume += volume_jacobian * weights[i] * weights[j] + end + + return (integral, total_volume) + end + + # Normalize with total volume + if normalize + _integral = _integral / _total_volume + end + + return _integral +end + function integrate(func::Func, u, mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, @@ -410,18 +455,10 @@ function integrate(func::Func, u, end end -function analyze(::typeof(entropy_timederivative), _du, u, t, +function analyze(::typeof(entropy_timederivative), du, u, t, mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, T8codeMesh{2}}, equations, dg::Union{DGSEM, FDSBP}, cache) - # TODO GPU AnalysiCallback currently lives on CPU - backend = trixi_backend(u) - if backend isa Nothing # TODO GPU KA CPU backend - du = _du - else - du = Array(_du) - end - # Calculate # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ integrate_via_indices(u, mesh, equations, dg, cache, diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl index 7708a32e6ba..67fd46f9e46 100644 --- a/src/callbacks_step/analysis_dg3d.jl +++ b/src/callbacks_step/analysis_dg3d.jl @@ -389,22 +389,21 @@ function integrate_via_indices(func::Func, u, return integral end -function integrate_via_indices(func::Func, _u, +function integrate_via_indices(func::Func, u, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, dg::DGSEM, cache, args...; normalize = true) where {Func} - # TODO GPU AnalysiCallback currently lives on CPU - backend = trixi_backend(_u) - if backend isa Nothing # TODO GPU KA CPU backend - @unpack weights = dg.basis - @unpack inverse_jacobian = cache.elements - u = _u - else - weights = Array(dg.basis.weights) - inverse_jacobian = Array(cache.elements.inverse_jacobian) - u = Array(_u) - end + return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache, args...; normalize = normalize) +end + +function integrate_via_indices(func::Func, ::Nothing, u, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}, + equations, dg::DGSEM, cache, + args...; normalize = true) where {Func} + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements # Initialize integral with zeros of the right shape integral = zero(func(u, 1, 1, 1, 1, equations, dg, args...)) @@ -429,6 +428,47 @@ function integrate_via_indices(func::Func, _u, return integral end +function integrate_via_indices(func::Func, backend::Backend, u, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}, + equations, dg::DGSEM, cache, + args...; normalize = true) where {Func} + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements + + function local_plus((integral, total_volume), (integral_element, volume_element)) + return (integral + integral_element, total_volume + volume_element) + end + + # `func(u, 1,1,1,1, equations, dg, args...)` might access GPU memory + # so we have to rely on the compiler to correctly infer the type of the integral here. + # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU. + integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, Int, typeof(equations), typeof(dg), map(typeof, args)...)) + init = neutral = (integral₀, zero(real(mesh))) + + # Use quadrature to numerically integrate over entire domain + num_elements = nelements(dg, cache) + _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, backend; init, neutral) do element + # Initialize integral with zeros of the right shape + integral, total_volume = neutral + + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + volume_jacobian = abs(inv(inverse_jacobian[i, j, k, element])) + integral += volume_jacobian * weights[i] * weights[j] * weights[k] * + func(u, i, j, k, element, equations, dg, args...) + total_volume += volume_jacobian * weights[i] * weights[j] * weights[k] + end + return integral, total_volume + end + + # Normalize with total volume + if normalize + _integral = _integral / _total_volume + end + + return _integral +end + function integrate(func::Func, u, mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, @@ -460,18 +500,10 @@ function integrate(func::Func, u, end end -function analyze(::typeof(entropy_timederivative), _du, u, t, +function analyze(::typeof(entropy_timederivative), du, u, t, mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, dg::Union{DGSEM, FDSBP}, cache) - # TODO GPU AnalysiCallback currently lives on CPU - backend = trixi_backend(u) - if backend isa Nothing # TODO GPU KA CPU backend - du = _du - else - du = Array(_du) - end - # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ integrate_via_indices(u, mesh, equations, dg, cache, du) do u, i, j, k, element, equations, dg, du From 8b61f8106b09e37605a45ac698f3be2627b0c474 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 25 Mar 2026 15:06:46 +0100 Subject: [PATCH 5/9] Apply suggestions from code review Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- src/callbacks_step/analysis_dg2d.jl | 10 +++++++--- src/callbacks_step/analysis_dg3d.jl | 11 ++++++++--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl index d3fa8873934..2a0a1b7288a 100644 --- a/src/callbacks_step/analysis_dg2d.jl +++ b/src/callbacks_step/analysis_dg2d.jl @@ -345,7 +345,8 @@ function integrate_via_indices(func::Func, u, T8codeMesh{2}}, equations, dg::DGSEM, cache, args...; normalize = true) where {Func} - return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache, args...; normalize = normalize) + return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache, + args...; normalize = normalize) end function integrate_via_indices(func::Func, ::Nothing, u, @@ -398,12 +399,15 @@ function integrate_via_indices(func::Func, backend::Backend, u, # `func(u, 1,1,1, equations, dg, args...)` might access GPU memory # so we have to rely on the compiler to correctly infer the type of the integral here. # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU. - integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, typeof(equations), typeof(dg), map(typeof, args)...)) + integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, typeof(equations), + typeof(dg), map(typeof, args)...)) init = neutral = (integral₀, zero(real(mesh))) # Use quadrature to numerically integrate over entire domain num_elements = nelements(dg, cache) - _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, backend; init, neutral) do element + _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, + backend; init, + neutral) do element # Initialize integral with zeros of the right shape integral, total_volume = neutral diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl index 67fd46f9e46..1aa8693f310 100644 --- a/src/callbacks_step/analysis_dg3d.jl +++ b/src/callbacks_step/analysis_dg3d.jl @@ -394,7 +394,8 @@ function integrate_via_indices(func::Func, u, T8codeMesh{3}}, equations, dg::DGSEM, cache, args...; normalize = true) where {Func} - return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache, args...; normalize = normalize) + return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache, + args...; normalize = normalize) end function integrate_via_indices(func::Func, ::Nothing, u, @@ -443,12 +444,16 @@ function integrate_via_indices(func::Func, backend::Backend, u, # `func(u, 1,1,1,1, equations, dg, args...)` might access GPU memory # so we have to rely on the compiler to correctly infer the type of the integral here. # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU. - integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, Int, typeof(equations), typeof(dg), map(typeof, args)...)) + integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, Int, + typeof(equations), typeof(dg), + map(typeof, args)...)) init = neutral = (integral₀, zero(real(mesh))) # Use quadrature to numerically integrate over entire domain num_elements = nelements(dg, cache) - _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, backend; init, neutral) do element + _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, + backend; init, + neutral) do element # Initialize integral with zeros of the right shape integral, total_volume = neutral From ebefe89111dc72690d2138e8c49f8e1ef91f6a2b Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 27 Mar 2026 08:29:52 +0100 Subject: [PATCH 6/9] remove allocate import --- src/Trixi.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Trixi.jl b/src/Trixi.jl index fabd23fc3cd..e421c18afdf 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -60,8 +60,7 @@ using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect using FillArrays: Ones, Zeros using ForwardDiff: ForwardDiff using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace -using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend, - allocate +using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend using AcceleratedKernels: AcceleratedKernels using LinearMaps: LinearMap if _PREFERENCE_LOOPVECTORIZATION From 46bf9443edc9f7aab613e2e1eab8449490413e05 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sun, 29 Mar 2026 17:44:59 +0200 Subject: [PATCH 7/9] cleanup --- src/callbacks_step/analysis_dg2d.jl | 39 +++++++++++++---------------- src/callbacks_step/analysis_dg3d.jl | 37 +++++++++++++-------------- 2 files changed, 35 insertions(+), 41 deletions(-) diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl index 2a0a1b7288a..712ca4bf488 100644 --- a/src/callbacks_step/analysis_dg2d.jl +++ b/src/callbacks_step/analysis_dg2d.jl @@ -138,7 +138,7 @@ function calc_error_norms(func, u, t, analyzer, return l2_error, linf_error end -function calc_error_norms(func, _u, t, analyzer, +function calc_error_norms(func, u, t, analyzer, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, @@ -147,16 +147,13 @@ function calc_error_norms(func, _u, t, analyzer, initial_condition, dg::DGSEM, cache, cache_analysis) @unpack vandermonde, weights = analyzer @unpack u_local, u_tmp1, x_local, x_tmp1, jacobian_local, jacobian_tmp1 = cache_analysis + @unpack node_coordinates, inverse_jacobian = cache.elements - # TODO GPU AnalysisCallback currently lives on CPU - backend = trixi_backend(_u) - if backend isa Nothing # TODO GPU KA CPU backend - @unpack node_coordinates, inverse_jacobian = cache.elements - u = _u - else - node_coordinates = Array(cache.elements.node_coordinates) - inverse_jacobian = Array(cache.elements.inverse_jacobian) - u = Array(_u) + # Calculate error norms on the CPU, to ensure the oder of summation is the same. + if trixi_backend(u) !== nothing + node_coordinates = Array(node_coordinates) + inverse_jacobian = Array(inverse_jacobian) + u = Array(u) end # Set up data structures @@ -399,34 +396,34 @@ function integrate_via_indices(func::Func, backend::Backend, u, # `func(u, 1,1,1, equations, dg, args...)` might access GPU memory # so we have to rely on the compiler to correctly infer the type of the integral here. # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU. - integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, typeof(equations), + integral0 = zero(Base.promote_op(func, typeof(u), Int, Int, Int, typeof(equations), typeof(dg), map(typeof, args)...)) - init = neutral = (integral₀, zero(real(mesh))) + init = neutral = (integral0, zero(real(mesh))) # Use quadrature to numerically integrate over entire domain num_elements = nelements(dg, cache) - _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, + integral, total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, backend; init, neutral) do element - # Initialize integral with zeros of the right shape - integral, total_volume = neutral + # Initialize integral with zeros of the right shapeu + local_integral, local_total_volume = neutral for j in eachnode(dg), i in eachnode(dg) volume_jacobian = abs(inv(inverse_jacobian[i, j, element])) - integral += volume_jacobian * weights[i] * weights[j] * - func(u, i, j, element, equations, dg, args...) - total_volume += volume_jacobian * weights[i] * weights[j] + local_integral += volume_jacobian * weights[i] * weights[j] * + func(u, i, j, element, equations, dg, args...) + local_total_volume += volume_jacobian * weights[i] * weights[j] end - return (integral, total_volume) + return (local_integral, local_total_volume) end # Normalize with total volume if normalize - _integral = _integral / _total_volume + integral = integral / total_volume end - return _integral + return integral end function integrate(func::Func, u, diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl index 1aa8693f310..2cc7cf6a1fe 100644 --- a/src/callbacks_step/analysis_dg3d.jl +++ b/src/callbacks_step/analysis_dg3d.jl @@ -161,22 +161,19 @@ function calc_error_norms(func, u, t, analyzer, return l2_error, linf_error end -function calc_error_norms(func, _u, t, analyzer, +function calc_error_norms(func, u, t, analyzer, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, initial_condition, dg::DGSEM, cache, cache_analysis) @unpack vandermonde, weights = analyzer @unpack u_local, u_tmp1, u_tmp2, x_local, x_tmp1, x_tmp2, jacobian_local, jacobian_tmp1, jacobian_tmp2 = cache_analysis + @unpack node_coordinates, inverse_jacobian = cache.elements - # TODO GPU AnalysisCallback currently lives on CPU - backend = trixi_backend(_u) - if backend isa Nothing # TODO GPU KA CPU backend - @unpack node_coordinates, inverse_jacobian = cache.elements - u = _u - else - node_coordinates = Array(cache.elements.node_coordinates) - inverse_jacobian = Array(cache.elements.inverse_jacobian) - u = Array(_u) + # Calculate error norms on the CPU, to ensure the oder of summation is the same. + if trixi_backend(u) !== nothing + node_coordinates = Array(node_coordinates) + inverse_jacobian = Array(inverse_jacobian) + u = Array(u) end # Set up data structures @@ -444,34 +441,34 @@ function integrate_via_indices(func::Func, backend::Backend, u, # `func(u, 1,1,1,1, equations, dg, args...)` might access GPU memory # so we have to rely on the compiler to correctly infer the type of the integral here. # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU. - integral₀ = zero(Base.promote_op(func, typeof(u), Int, Int, Int, Int, + integral0 = zero(Base.promote_op(func, typeof(u), Int, Int, Int, Int, typeof(equations), typeof(dg), map(typeof, args)...)) - init = neutral = (integral₀, zero(real(mesh))) + init = neutral = (integral0, zero(real(mesh))) # Use quadrature to numerically integrate over entire domain num_elements = nelements(dg, cache) - _integral, _total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, + integral, total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, backend; init, neutral) do element # Initialize integral with zeros of the right shape - integral, total_volume = neutral + local_integral, local_total_volume = neutral for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) volume_jacobian = abs(inv(inverse_jacobian[i, j, k, element])) - integral += volume_jacobian * weights[i] * weights[j] * weights[k] * - func(u, i, j, k, element, equations, dg, args...) - total_volume += volume_jacobian * weights[i] * weights[j] * weights[k] + local_integral += volume_jacobian * weights[i] * weights[j] * weights[k] * + func(u, i, j, k, element, equations, dg, args...) + local_total_volume += volume_jacobian * weights[i] * weights[j] * weights[k] end - return integral, total_volume + return local_integral, local_total_volume end # Normalize with total volume if normalize - _integral = _integral / _total_volume + integral = integral / total_volume end - return _integral + return integral end function integrate(func::Func, u, From f2cfe81be288d577028f583aae61043ec044a126 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sun, 29 Mar 2026 17:45:57 +0200 Subject: [PATCH 8/9] Apply suggestions from code review Co-authored-by: Valentin Churavy --- src/callbacks_step/analysis_dg2d.jl | 2 +- src/callbacks_step/analysis_dg3d.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl index 712ca4bf488..fd55f2536a7 100644 --- a/src/callbacks_step/analysis_dg2d.jl +++ b/src/callbacks_step/analysis_dg2d.jl @@ -149,7 +149,7 @@ function calc_error_norms(func, u, t, analyzer, @unpack u_local, u_tmp1, x_local, x_tmp1, jacobian_local, jacobian_tmp1 = cache_analysis @unpack node_coordinates, inverse_jacobian = cache.elements - # Calculate error norms on the CPU, to ensure the oder of summation is the same. + # Calculate error norms on the CPU, to ensure the order of summation is the same. if trixi_backend(u) !== nothing node_coordinates = Array(node_coordinates) inverse_jacobian = Array(inverse_jacobian) diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl index 2cc7cf6a1fe..56727c5d546 100644 --- a/src/callbacks_step/analysis_dg3d.jl +++ b/src/callbacks_step/analysis_dg3d.jl @@ -169,7 +169,7 @@ function calc_error_norms(func, u, t, analyzer, @unpack u_local, u_tmp1, u_tmp2, x_local, x_tmp1, x_tmp2, jacobian_local, jacobian_tmp1, jacobian_tmp2 = cache_analysis @unpack node_coordinates, inverse_jacobian = cache.elements - # Calculate error norms on the CPU, to ensure the oder of summation is the same. + # Calculate error norms on the CPU, to ensure the order of summation is the same. if trixi_backend(u) !== nothing node_coordinates = Array(node_coordinates) inverse_jacobian = Array(inverse_jacobian) From c9c4898c96588bfc759b7520f675cd6b847e3824 Mon Sep 17 00:00:00 2001 From: Benedict Geihe Date: Tue, 31 Mar 2026 16:03:39 +0200 Subject: [PATCH 9/9] fmt --- src/callbacks_step/analysis_dg2d.jl | 4 ++-- src/callbacks_step/analysis_dg3d.jl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl index fd55f2536a7..976954d8359 100644 --- a/src/callbacks_step/analysis_dg2d.jl +++ b/src/callbacks_step/analysis_dg2d.jl @@ -403,8 +403,8 @@ function integrate_via_indices(func::Func, backend::Backend, u, # Use quadrature to numerically integrate over entire domain num_elements = nelements(dg, cache) integral, total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, - backend; init, - neutral) do element + backend; init, + neutral) do element # Initialize integral with zeros of the right shapeu local_integral, local_total_volume = neutral diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl index 56727c5d546..65abd91b827 100644 --- a/src/callbacks_step/analysis_dg3d.jl +++ b/src/callbacks_step/analysis_dg3d.jl @@ -449,8 +449,8 @@ function integrate_via_indices(func::Func, backend::Backend, u, # Use quadrature to numerically integrate over entire domain num_elements = nelements(dg, cache) integral, total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, - backend; init, - neutral) do element + backend; init, + neutral) do element # Initialize integral with zeros of the right shape local_integral, local_total_volume = neutral