diff --git a/Project.toml b/Project.toml index 3f73c968f13..cec98c0008f 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ version = "0.16.2-DEV" authors = ["Michael Schlottke-Lakemper ", "Gregor Gassner ", "Hendrik Ranocha ", "Andrew R. Winters ", "Jesse Chan ", "Andrés Rueda-Ramírez "] [deps] +AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" @@ -72,6 +73,7 @@ TrixiPlotsExt = "Plots" TrixiSparseConnectivityTracerExt = "SparseConnectivityTracer" [compat] +AcceleratedKernels = "0.4.3" Accessors = "0.1.42" AMDGPU = "2.2.1" Adapt = "4.4" diff --git a/src/Trixi.jl b/src/Trixi.jl index 42340659520..e421c18afdf 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -60,8 +60,8 @@ using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect using FillArrays: Ones, Zeros using ForwardDiff: ForwardDiff using HDF5: HDF5, h5open, attributes, create_dataset, datatype, dataspace -using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend, - allocate +using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Backend +using AcceleratedKernels: AcceleratedKernels using LinearMaps: LinearMap if _PREFERENCE_LOOPVECTORIZATION using LoopVectorization: LoopVectorization, @turbo, indices diff --git a/src/callbacks_step/analysis_dg2d.jl b/src/callbacks_step/analysis_dg2d.jl index 368b7f5e4cc..976954d8359 100644 --- a/src/callbacks_step/analysis_dg2d.jl +++ b/src/callbacks_step/analysis_dg2d.jl @@ -138,7 +138,7 @@ function calc_error_norms(func, u, t, analyzer, return l2_error, linf_error end -function calc_error_norms(func, _u, t, analyzer, +function calc_error_norms(func, u, t, analyzer, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, @@ -147,16 +147,13 @@ function calc_error_norms(func, _u, t, analyzer, 
initial_condition, dg::DGSEM, cache, cache_analysis) @unpack vandermonde, weights = analyzer @unpack u_local, u_tmp1, x_local, x_tmp1, jacobian_local, jacobian_tmp1 = cache_analysis + @unpack node_coordinates, inverse_jacobian = cache.elements - # TODO GPU AnalysisCallback currently lives on CPU - backend = trixi_backend(_u) - if backend isa Nothing # TODO GPU KA CPU backend - @unpack node_coordinates, inverse_jacobian = cache.elements - u = _u - else - node_coordinates = Array(cache.elements.node_coordinates) - inverse_jacobian = Array(cache.elements.inverse_jacobian) - u = Array(_u) + # Calculate error norms on the CPU, to ensure the order of summation is the same. + if trixi_backend(u) !== nothing + node_coordinates = Array(node_coordinates) + inverse_jacobian = Array(inverse_jacobian) + u = Array(u) end # Set up data structures @@ -338,24 +335,26 @@ function integrate_via_indices(func::Func, u, return integral end -function integrate_via_indices(func::Func, _u, +function integrate_via_indices(func::Func, u, mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, T8codeMesh{2}}, equations, dg::DGSEM, cache, args...; normalize = true) where {Func} - # TODO GPU AnalysiCallback currently lives on CPU - backend = trixi_backend(_u) - if backend isa Nothing # TODO GPU KA CPU backend - @unpack weights = dg.basis - @unpack inverse_jacobian = cache.elements - u = _u - else - weights = Array(dg.basis.weights) - inverse_jacobian = Array(cache.elements.inverse_jacobian) - u = Array(_u) - end + return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache, + args...; normalize = normalize) +end + +function integrate_via_indices(func::Func, ::Nothing, u, + mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + equations, dg::DGSEM, cache, + args...; normalize = true) where {Func} + @unpack weights = dg.basis + @unpack 
inverse_jacobian = cache.elements # Initialize integral with zeros of the right shape integral = zero(func(u, 1, 1, 1, equations, dg, args...)) @@ -380,6 +379,53 @@ function integrate_via_indices(func::Func, _u, return integral end +function integrate_via_indices(func::Func, backend::Backend, u, + mesh::Union{StructuredMesh{2}, StructuredMeshView{2}, + UnstructuredMesh2D, + P4estMesh{2}, P4estMeshView{2}, + T8codeMesh{2}}, + equations, dg::DGSEM, cache, + args...; normalize = true) where {Func} + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements + + function local_plus((integral, total_volume), (integral_element, volume_element)) + return (integral + integral_element, total_volume + volume_element) + end + + # `func(u, 1,1,1, equations, dg, args...)` might access GPU memory + # so we have to rely on the compiler to correctly infer the type of the integral here. + # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU. + integral0 = zero(Base.promote_op(func, typeof(u), Int, Int, Int, typeof(equations), + typeof(dg), map(typeof, args)...)) + init = neutral = (integral0, zero(real(mesh))) + + # Use quadrature to numerically integrate over entire domain + num_elements = nelements(dg, cache) + integral, total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, + backend; init, + neutral) do element + # Initialize integral with zeros of the right shape + local_integral, local_total_volume = neutral + + for j in eachnode(dg), i in eachnode(dg) + volume_jacobian = abs(inv(inverse_jacobian[i, j, element])) + local_integral += volume_jacobian * weights[i] * weights[j] * + func(u, i, j, element, equations, dg, args...) 
+ local_total_volume += volume_jacobian * weights[i] * weights[j] + end + + return (local_integral, local_total_volume) + end + + # Normalize with total volume + if normalize + integral = integral / total_volume + end + + return integral +end + function integrate(func::Func, u, mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, P4estMeshView{2}, @@ -410,18 +456,10 @@ function integrate(func::Func, u, end end -function analyze(::typeof(entropy_timederivative), _du, u, t, +function analyze(::typeof(entropy_timederivative), du, u, t, mesh::Union{TreeMesh{2}, StructuredMesh{2}, StructuredMeshView{2}, UnstructuredMesh2D, P4estMesh{2}, T8codeMesh{2}}, equations, dg::Union{DGSEM, FDSBP}, cache) - # TODO GPU AnalysiCallback currently lives on CPU - backend = trixi_backend(u) - if backend isa Nothing # TODO GPU KA CPU backend - du = _du - else - du = Array(_du) - end - # Calculate # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ integrate_via_indices(u, mesh, equations, dg, cache, diff --git a/src/callbacks_step/analysis_dg3d.jl b/src/callbacks_step/analysis_dg3d.jl index 7708a32e6ba..65abd91b827 100644 --- a/src/callbacks_step/analysis_dg3d.jl +++ b/src/callbacks_step/analysis_dg3d.jl @@ -161,22 +161,19 @@ function calc_error_norms(func, u, t, analyzer, return l2_error, linf_error end -function calc_error_norms(func, _u, t, analyzer, +function calc_error_norms(func, u, t, analyzer, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, initial_condition, dg::DGSEM, cache, cache_analysis) @unpack vandermonde, weights = analyzer @unpack u_local, u_tmp1, u_tmp2, x_local, x_tmp1, x_tmp2, jacobian_local, jacobian_tmp1, jacobian_tmp2 = cache_analysis + @unpack node_coordinates, inverse_jacobian = cache.elements - # TODO GPU AnalysisCallback currently lives on CPU - backend = trixi_backend(_u) - if backend isa Nothing # TODO GPU KA CPU backend - @unpack node_coordinates, inverse_jacobian = cache.elements - u = _u - else - 
node_coordinates = Array(cache.elements.node_coordinates) - inverse_jacobian = Array(cache.elements.inverse_jacobian) - u = Array(_u) + # Calculate error norms on the CPU, to ensure the order of summation is the same. + if trixi_backend(u) !== nothing + node_coordinates = Array(node_coordinates) + inverse_jacobian = Array(inverse_jacobian) + u = Array(u) end # Set up data structures @@ -389,22 +386,22 @@ function integrate_via_indices(func::Func, u, return integral end -function integrate_via_indices(func::Func, _u, +function integrate_via_indices(func::Func, u, mesh::Union{StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, dg::DGSEM, cache, args...; normalize = true) where {Func} - # TODO GPU AnalysiCallback currently lives on CPU - backend = trixi_backend(_u) - if backend isa Nothing # TODO GPU KA CPU backend - @unpack weights = dg.basis - @unpack inverse_jacobian = cache.elements - u = _u - else - weights = Array(dg.basis.weights) - inverse_jacobian = Array(cache.elements.inverse_jacobian) - u = Array(_u) - end + return integrate_via_indices(func, trixi_backend(u), u, mesh, equations, dg, cache, + args...; normalize = normalize) +end + +function integrate_via_indices(func::Func, ::Nothing, u, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}, + equations, dg::DGSEM, cache, + args...; normalize = true) where {Func} + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements # Initialize integral with zeros of the right shape integral = zero(func(u, 1, 1, 1, 1, equations, dg, args...)) @@ -429,6 +426,51 @@ function integrate_via_indices(func::Func, _u, return integral end +function integrate_via_indices(func::Func, backend::Backend, u, + mesh::Union{StructuredMesh{3}, P4estMesh{3}, + T8codeMesh{3}}, + equations, dg::DGSEM, cache, + args...; normalize = true) where {Func} + @unpack weights = dg.basis + @unpack inverse_jacobian = cache.elements + + function local_plus((integral, total_volume), (integral_element, volume_element)) + 
return (integral + integral_element, total_volume + volume_element) + end + + # `func(u, 1,1,1,1, equations, dg, args...)` might access GPU memory + # so we have to rely on the compiler to correctly infer the type of the integral here. + # TODO: Technically we need device_promote_op here that "infers" the function within the context of the GPU. + integral0 = zero(Base.promote_op(func, typeof(u), Int, Int, Int, Int, + typeof(equations), typeof(dg), + map(typeof, args)...)) + init = neutral = (integral0, zero(real(mesh))) + + # Use quadrature to numerically integrate over entire domain + num_elements = nelements(dg, cache) + integral, total_volume = AcceleratedKernels.mapreduce(local_plus, 1:num_elements, + backend; init, + neutral) do element + # Initialize integral with zeros of the right shape + local_integral, local_total_volume = neutral + + for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + volume_jacobian = abs(inv(inverse_jacobian[i, j, k, element])) + local_integral += volume_jacobian * weights[i] * weights[j] * weights[k] * + func(u, i, j, k, element, equations, dg, args...) 
+ local_total_volume += volume_jacobian * weights[i] * weights[j] * weights[k] + end + return local_integral, local_total_volume + end + + # Normalize with total volume + if normalize + integral = integral / total_volume + end + + return integral +end + function integrate(func::Func, u, mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, @@ -460,18 +502,10 @@ function integrate(func::Func, u, end end -function analyze(::typeof(entropy_timederivative), _du, u, t, +function analyze(::typeof(entropy_timederivative), du, u, t, mesh::Union{TreeMesh{3}, StructuredMesh{3}, P4estMesh{3}, T8codeMesh{3}}, equations, dg::Union{DGSEM, FDSBP}, cache) - # TODO GPU AnalysiCallback currently lives on CPU - backend = trixi_backend(u) - if backend isa Nothing # TODO GPU KA CPU backend - du = _du - else - du = Array(_du) - end - # Calculate ∫(∂S/∂u ⋅ ∂u/∂t)dΩ integrate_via_indices(u, mesh, equations, dg, cache, du) do u, i, j, k, element, equations, dg, du diff --git a/src/callbacks_step/stepsize.jl b/src/callbacks_step/stepsize.jl index ca8d7130f84..9c2b9020cff 100644 --- a/src/callbacks_step/stepsize.jl +++ b/src/callbacks_step/stepsize.jl @@ -205,31 +205,25 @@ function calc_max_scaled_speed(backend::Nothing, u, mesh, constant_speed, equati return max_scaled_speed end -function calc_max_scaled_speed(backend::Backend, u, mesh, constant_speed, equations, dg, - cache) +function calc_max_scaled_speed(backend::Backend, u, ::MeshT, constant_speed, equations, + dg, + cache) where {MeshT} @unpack contravariant_vectors, inverse_jacobian = cache.elements num_elements = nelements(dg, cache) - max_scaled_speeds = allocate(backend, eltype(u), num_elements) - - kernel! 
= max_scaled_speed_KAkernel!(backend) - kernel!(max_scaled_speeds, u, typeof(mesh), constant_speed, equations, dg, - contravariant_vectors, - inverse_jacobian; - ndrange = num_elements) - - return maximum(max_scaled_speeds) -end + init = neutral = AcceleratedKernels.neutral_element(Base.max, eltype(u)) + + # Provide a custom neutral and init element since we "reduce" over 1:num_elements + max_scaled_speed = AcceleratedKernels.mapreduce(Base.max, 1:num_elements, backend; + init, neutral) do element + max_scaled_speed_per_element(u, MeshT, constant_speed, + equations, dg, + contravariant_vectors, + inverse_jacobian, + element) + end -@kernel function max_scaled_speed_KAkernel!(max_scaled_speeds, u, MeshT, constant_speed, - equations, - dg, contravariant_vectors, inverse_jacobian) - element = @index(Global) - max_scaled_speeds[element] = max_scaled_speed_per_element(u, MeshT, constant_speed, - equations, dg, - contravariant_vectors, - inverse_jacobian, - element) + return max_scaled_speed end include("stepsize_dg1d.jl")