diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 344b8eacc3a..c385f9d95b6 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -19,3 +19,21 @@ steps: timeout_in_minutes: 60 soft_fail: - exit_status: 3 + - label: "AMDGPU Julia {{matrix.version}}" + matrix: + setup: + version: + - "1.10" + plugins: + - JuliaCI/julia#v1: + version: "{{matrix.version}}" + - JuliaCI/julia-test#v1: ~ + env: + TRIXI_TEST: "AMDGPU" + agents: + queue: "juliagpu" + rocm: "*" + if: build.message !~ /\[skip ci\]/ + timeout_in_minutes: 60 + soft_fail: + - exit_status: 3 diff --git a/NEWS.md b/NEWS.md index 58b3783f533..fc572df5bf1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,11 @@ Trixi.jl follows the interpretation of used in the Julia ecosystem. Notable changes will be documented in this file for human readability. +## Changes in the v0.16 lifecycle + +#### Added +- GPU support extended to include AMD GPU with a buildkite workflow using `TRIXI_TEST=AMDGPU` ([#2834]). + ## Changes when updating to v0.16 from v0.15.x #### Changed diff --git a/Project.toml b/Project.toml index 2564c2deca5..6ebd27dcf8a 100644 --- a/Project.toml +++ b/Project.toml @@ -53,6 +53,7 @@ TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [weakdeps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Convex = "f65535da-76fb-5f13-bab9-19810c17039a" ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199" @@ -62,6 +63,7 @@ Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" [extensions] +TrixiAMDGPUExt = "AMDGPU" TrixiCUDAExt = "CUDA" TrixiConvexECOSExt = ["Convex", "ECOS"] TrixiMakieExt = "Makie" @@ -71,8 +73,9 @@ TrixiSparseConnectivityTracerExt = "SparseConnectivityTracer" [compat] Accessors = "0.1.42" -Adapt = "4.3" -CUDA = "5.8.2" +AMDGPU = "2.2.1" +Adapt = "4.4" +CUDA = "5.9.1" CodeTracking = "1.0.5, 2, 3" ConstructionBase = "1.5.8" Convex = 
"0.16" @@ -90,7 +93,7 @@ KernelAbstractions = "0.9.38" LinearAlgebra = "1" LinearMaps = "2.7, 3.0" LoopVectorization = "0.12.171" -MPI = "0.20.22" +MPI = "0.20.23" Makie = "0.22, 0.23, 0.24" MuladdMacro = "0.2.4" NLsolve = "4.5.1" diff --git a/benchmark/AMDGPU/Project.toml b/benchmark/AMDGPU/Project.toml new file mode 100644 index 00000000000..294e9c6ebd2 --- /dev/null +++ b/benchmark/AMDGPU/Project.toml @@ -0,0 +1,16 @@ +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +OrdinaryDiffEqLowStorageRK = "b0944070-b475-4768-8dec-fb6eb410534d" +TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb" + +[sources] +Trixi = {path = "../.."} + +[compat] +AMDGPU = "2.3" +JSON = "1.4.0" +OrdinaryDiffEqLowStorageRK = "1.12.0" +TimerOutputs = "0.5.25" +Trixi = "0.16" diff --git a/benchmark/AMDGPU/elixir_euler_taylor_green_vortex.jl b/benchmark/AMDGPU/elixir_euler_taylor_green_vortex.jl new file mode 100644 index 00000000000..02b8beda396 --- /dev/null +++ b/benchmark/AMDGPU/elixir_euler_taylor_green_vortex.jl @@ -0,0 +1,71 @@ +using OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# semidiscretization of the compressible Euler equations + +equations = CompressibleEulerEquations3D(1.4) + +function initial_condition_taylor_green_vortex(x, t, + equations::CompressibleEulerEquations3D) + A = 1.0 # magnitude of speed + Ms = 0.1 # maximum Mach number + + rho = 1.0 + v1 = A * sin(x[1]) * cos(x[2]) * cos(x[3]) + v2 = -A * cos(x[1]) * sin(x[2]) * cos(x[3]) + v3 = 0.0 + p = (A / Ms)^2 * rho / equations.gamma # scaling to get Ms + p = p + + 1.0 / 16.0 * A^2 * rho * + (cos(2 * x[1]) * cos(2 * x[3]) + + 2 * cos(2 * x[2]) + 2 * cos(2 * x[1]) + cos(2 * x[2]) * cos(2 * x[3])) + + return prim2cons(SVector(rho, v1, v2, v3, p), equations) +end + +initial_condition = initial_condition_taylor_green_vortex + +volume_flux = 
flux_ranocha +surface_flux = flux_lax_friedrichs +volume_integral = VolumeIntegralFluxDifferencing(volume_flux) +solver = DGSEM(polydeg = 5, surface_flux = surface_flux, volume_integral = volume_integral) + +coordinates_min = (-1.0, -1.0, -1.0) .* pi +coordinates_max = (1.0, 1.0, 1.0) .* pi + +initial_refinement_level = 1 +trees_per_dimension = (4, 4, 4) + +mesh = P4estMesh(trees_per_dimension, polydeg = 1, + coordinates_min = coordinates_min, coordinates_max = coordinates_max, + periodicity = true, initial_refinement_level = initial_refinement_level) + +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver; + boundary_conditions = boundary_condition_periodic) + +############################################################################### +# ODE solvers, callbacks etc. + +tspan = (0.0, 100.0) +ode = semidiscretize(semi, tspan; storage_type = nothing, real_type = nothing) + +summary_callback = SummaryCallback() + +stepsize_callback = StepsizeCallback(cfl = 0.1) + +callbacks = CallbackSet(summary_callback, + stepsize_callback) + +############################################################################### +# run the simulation + +maxiters = 200 + +# disable warnings when maxiters is reached +integrator = init(ode, CarpenterKennedy2N54(williamson_condition = false), + dt = 1.0, + save_everystep = false, callback = callbacks, + maxiters = maxiters, verbose = false) + +solve!(integrator) diff --git a/benchmark/AMDGPU/run.jl b/benchmark/AMDGPU/run.jl new file mode 100644 index 00000000000..76bdb62004d --- /dev/null +++ b/benchmark/AMDGPU/run.jl @@ -0,0 +1,59 @@ +using Trixi +using AMDGPU +using TimerOutputs +using JSON + +function main(elixir_path) + + # setup + maxiters = 50 + initial_refinement_level = 3 + storage_type = ROCArray + real_type = Float64 + + println("Warming up...") + + # start simulation with tiny final time to trigger compilation + duration_compile = @elapsed begin + trixi_include(elixir_path, + tspan = (0.0, 1e-14), + 
storage_type = storage_type, + real_type = real_type) + end + + println("Finished warm-up in $duration_compile seconds\n") + println("Starting simulation...") + + # start the real simulation + duration_elixir = @elapsed trixi_include(elixir_path, + maxiters = maxiters, + initial_refinement_level = initial_refinement_level, + storage_type = storage_type, + real_type = real_type) + + # store metrics (on every rank!) + metrics = Dict{String, Float64}("elapsed time" => duration_elixir) + + # read TimerOutputs timings + timer = Trixi.timer() + metrics["total time"] = 1.0e-9 * TimerOutputs.tottime(timer) + metrics["rhs! time"] = 1.0e-9 * TimerOutputs.time(timer["rhs!"]) + + # compute performance index + latest_semi = @invokelatest (@__MODULE__).semi + nrhscalls = Trixi.ncalls(latest_semi.performance_counter) + walltime = 1.0e-9 * take!(latest_semi.performance_counter) + metrics["PID"] = walltime * Trixi.mpi_nranks() / + (Trixi.ndofsglobal(latest_semi) * nrhscalls) + + # write json file + open("metrics.out", "w") do f + indent = 2 + JSON.print(f, metrics, indent) + end +end + +# hardcoded elixir +elixir_path = joinpath(@__DIR__(), "elixir_euler_taylor_green_vortex.jl") + +main(elixir_path) diff --git a/ext/TrixiAMDGPUExt.jl b/ext/TrixiAMDGPUExt.jl new file mode 100644 index 00000000000..ad427df3495 --- /dev/null +++ b/ext/TrixiAMDGPUExt.jl @@ -0,0 +1,31 @@ +# Package extension for adding AMDGPU-based features to Trixi.jl +module TrixiAMDGPUExt + +using AMDGPU: AMDGPU, ROCArray, ROCDeviceArray +import AMDGPU.Device: @device_override +import AMDGPU.Runtime: Adaptor +import Trixi + +function Trixi.storage_type(::Type{<:ROCArray}) + return ROCArray +end + +function Trixi.unsafe_wrap_or_alloc(::Adaptor, vec, size) + return Trixi.unsafe_wrap_or_alloc(ROCDeviceArray, vec, size) +end + +function Trixi.unsafe_wrap_or_alloc(::Type{<:ROCDeviceArray}, vec::ROCDeviceArray, size) + return reshape(vec, size) +end + +@static if Trixi._PREFERENCE_LOG == "log_Trixi_NaN" + @device_override 
Trixi.log(x::Float64) = ccall("extern __ocml_log_f64", llvmcall, + Cdouble, + (Cdouble,), x) + @device_override Trixi.log(x::Float32) = ccall("extern __ocml_log_f32", llvmcall, + Cfloat, + (Cfloat,), x) + # TODO: Trixi.log(x::Float16) +end + +end diff --git a/test/Project.toml b/test/Project.toml index d9cfc69117a..1b2f8c34a60 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,7 +1,8 @@ [deps] -Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" +Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" @@ -45,9 +46,10 @@ TrixiTest = "0a316866-cbd0-4425-8bcb-08103b2c1f26" [compat] Accessors = "0.1.42" ADTypes = "1.16" -Adapt = "4.3" +AMDGPU = "2.2.1" +Adapt = "4.4" Aqua = "0.8" -CUDA = "5.8.2" +CUDA = "5.9.1" CairoMakie = "0.13, 0.14, 0.15" Convex = "0.16" DelimitedFiles = "1" @@ -60,7 +62,7 @@ ForwardDiff = "0.10.38, 1" Krylov = "0.10" LinearAlgebra = "1" LinearSolve = "3.54" -MPI = "0.20.22" +MPI = "0.20.23" NLsolve = "4.5.1" OrdinaryDiffEqBDF = "1.15" OrdinaryDiffEqCore = "3.8" diff --git a/test/runtests.jl b/test/runtests.jl index faacce41a27..16f581c1032 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -130,6 +130,16 @@ const TRIXI_NTHREADS = clamp(Sys.CPU_THREADS, 2, 3) end end + @time if TRIXI_TEST == "all" || TRIXI_TEST == "AMDGPU" + import AMDGPU + if AMDGPU.functional() + include(joinpath(@__DIR__, "test_amdgpu_2d.jl")) + include(joinpath(@__DIR__, "test_amdgpu_3d.jl")) + else + @warn "Unable to run AMDGPU tests on this machine" + end + end + @time if TRIXI_TEST == "all" || TRIXI_TEST == "kernelabstractions" previous_backend = Trixi._PREFERENCE_THREADING Trixi.set_threading_backend!(:kernelabstractions) diff --git a/test/test_amdgpu_2d.jl b/test/test_amdgpu_2d.jl 
new file mode 100644 index 00000000000..eb752ca12b8 --- /dev/null +++ b/test/test_amdgpu_2d.jl @@ -0,0 +1,73 @@ +module TestAMDGPU2D + +using Test +using Trixi + +include("test_trixi.jl") + +EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") + +# Start with a clean environment: remove Trixi.jl output directory if it exists +outdir = "out" +isdir(outdir) && rm(outdir, recursive = true) + +@testset "AMDGPU 2D" begin +#! format: noindent + +@trixi_testset "elixir_advection_basic_gpu.jl native" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! + l2=8.311947673061856e-6, + linf=6.627000273229378e-5) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test real(ode.p.solver) == Float64 + @test real(ode.p.solver.basis) == Float64 + @test real(ode.p.solver.mortar) == Float64 + # TODO: `mesh` is currently not `adapt`ed correctly + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa Array + @test ode.p.solver.basis.derivative_matrix isa Array + + @test Trixi.storage_type(ode.p.cache.elements) === Array + @test Trixi.storage_type(ode.p.cache.interfaces) === Array + @test Trixi.storage_type(ode.p.cache.boundaries) === Array + @test Trixi.storage_type(ode.p.cache.mortars) === Array +end + +@trixi_testset "elixir_advection_basic_gpu.jl Float32 / AMDGPU" begin + # Using AMDGPU inside the testset since otherwise the bindings are hidden by the anonymous modules + using AMDGPU + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! 
+ l2=[Float32(8.311947673061856e-6)], + linf=[Float32(6.627000273229378e-5)], + RealT_for_test_tolerances=Float32, + real_type=Float32, + storage_type=ROCArray) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. + @test_allocations(Trixi.rhs!, semi, sol, 50_000) + @test real(ode.p.solver) == Float32 + @test real(ode.p.solver.basis) == Float32 + @test real(ode.p.solver.mortar) == Float32 + # TODO: `mesh` is currently not `adapt`ed correctly + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa ROCArray + @test ode.p.solver.basis.derivative_matrix isa ROCArray + + @test Trixi.storage_type(ode.p.cache.elements) === ROCArray + @test Trixi.storage_type(ode.p.cache.interfaces) === ROCArray + @test Trixi.storage_type(ode.p.cache.boundaries) === ROCArray + @test Trixi.storage_type(ode.p.cache.mortars) === ROCArray +end + +# Clean up afterwards: delete Trixi.jl output directory +@test_nowarn isdir(outdir) && rm(outdir, recursive = true) +end +end # module diff --git a/test/test_amdgpu_3d.jl b/test/test_amdgpu_3d.jl new file mode 100644 index 00000000000..121e537fbad --- /dev/null +++ b/test/test_amdgpu_3d.jl @@ -0,0 +1,73 @@ +module TestAMDGPU3D + +using Test +using Trixi + +include("test_trixi.jl") + +EXAMPLES_DIR = joinpath(examples_dir(), "p4est_3d_dgsem") + +# Start with a clean environment: remove Trixi.jl output directory if it exists +outdir = "out" +isdir(outdir) && rm(outdir, recursive = true) + +@testset "AMDGPU 3D" begin +#! format: noindent + +@trixi_testset "elixir_advection_basic_gpu.jl native" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! 
+ l2=[0.00016263963870641478], + linf=[0.0014537194925779984]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test real(ode.p.solver) == Float64 + @test real(ode.p.solver.basis) == Float64 + @test real(ode.p.solver.mortar) == Float64 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa Array + @test ode.p.solver.basis.derivative_matrix isa Array + + @test Trixi.storage_type(ode.p.cache.elements) === Array + @test Trixi.storage_type(ode.p.cache.interfaces) === Array + @test Trixi.storage_type(ode.p.cache.boundaries) === Array + @test Trixi.storage_type(ode.p.cache.mortars) === Array +end + +@trixi_testset "elixir_advection_basic_gpu.jl Float32 / AMDGPU" begin + # Using AMDGPU inside the testset since otherwise the bindings are hidden by the anonymous modules + using AMDGPU + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors similar to reference on CPU + l2=[Float32(0.00016263963870641478)], + linf=[Float32(0.0014537194925779984)], + RealT_for_test_tolerances=Float32, + real_type=Float32, + storage_type=ROCArray) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. 
+ @test_allocations(Trixi.rhs!, semi, sol, 50_000) + @test real(ode.p.solver) == Float32 + @test real(ode.p.solver.basis) == Float32 + @test real(ode.p.solver.mortar) == Float32 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa ROCArray + @test ode.p.solver.basis.derivative_matrix isa ROCArray + + @test Trixi.storage_type(ode.p.cache.elements) === ROCArray + @test Trixi.storage_type(ode.p.cache.interfaces) === ROCArray + @test Trixi.storage_type(ode.p.cache.boundaries) === ROCArray + @test Trixi.storage_type(ode.p.cache.mortars) === ROCArray +end + +# Clean up afterwards: delete Trixi.jl output directory +@test_nowarn isdir(outdir) && rm(outdir, recursive = true) +end +end # module