Skip to content
18 changes: 18 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,21 @@ steps:
timeout_in_minutes: 60
soft_fail:
- exit_status: 3
# AMDGPU test step: uses the JuliaCI/julia and JuliaCI/julia-test plugins with
# TRIXI_TEST=AMDGPU on an agent of the "juliagpu" queue that provides a `rocm` tag.
- label: "AMDGPU Julia {{matrix.version}}"
matrix:
setup:
version:
- "1.10"
plugins:
- JuliaCI/julia#v1:
version: "{{matrix.version}}"
- JuliaCI/julia-test#v1: ~
env:
TRIXI_TEST: "AMDGPU"
agents:
queue: "juliagpu"
rocm: "*"
# Skip this step if the build message contains "[skip ci]".
if: build.message !~ /\[skip ci\]/
timeout_in_minutes: 60
# An exit status of 3 is treated as a soft failure.
soft_fail:
- exit_status: 3
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ Trixi.jl follows the interpretation of
used in the Julia ecosystem. Notable changes will be documented in this file
for human readability.

## Changes in the v0.16 lifecycle

#### Added
- GPU support has been extended to AMD GPUs via AMDGPU.jl, including a Buildkite workflow that runs the tests with `TRIXI_TEST=AMDGPU` ([#2834]).

## Changes when updating to v0.16 from v0.15.x

#### Changed
Expand Down
9 changes: 6 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284"
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Convex = "f65535da-76fb-5f13-bab9-19810c17039a"
ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199"
Expand All @@ -62,6 +63,7 @@ Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5"

[extensions]
TrixiAMDGPUExt = "AMDGPU"
TrixiCUDAExt = "CUDA"
TrixiConvexECOSExt = ["Convex", "ECOS"]
TrixiMakieExt = "Makie"
Expand All @@ -71,8 +73,9 @@ TrixiSparseConnectivityTracerExt = "SparseConnectivityTracer"

[compat]
Accessors = "0.1.42"
Adapt = "4.3"
CUDA = "5.8.2"
AMDGPU = "2.2.1"
Adapt = "4.4"
CUDA = "5.9.1"
CodeTracking = "1.0.5, 2, 3"
ConstructionBase = "1.5.8"
Convex = "0.16"
Expand All @@ -90,7 +93,7 @@ KernelAbstractions = "0.9.38"
LinearAlgebra = "1"
LinearMaps = "2.7, 3.0"
LoopVectorization = "0.12.171"
MPI = "0.20.22"
MPI = "0.20.23"
Makie = "0.22, 0.23, 0.24"
MuladdMacro = "0.2.4"
NLsolve = "4.5.1"
Expand Down
16 changes: 16 additions & 0 deletions benchmark/AMDGPU/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[deps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
OrdinaryDiffEqLowStorageRK = "b0944070-b475-4768-8dec-fb6eb410534d"
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb"

[sources]
Trixi = {path = "../.."}

[compat]
AMDGPU = "2.3"
JSON = "1.4.0"
OrdinaryDiffEqLowStorageRK = "1.12.0"
TimerOutputs = "0.5.25"
Trixi = "0.16"
71 changes: 71 additions & 0 deletions benchmark/AMDGPU/elixir_euler_taylor_green_vortex.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
using OrdinaryDiffEqLowStorageRK
using Trixi

###############################################################################
# semidiscretization of the compressible Euler equations

# NOTE(review): the argument 1.4 is presumably the ratio of specific heats that
# is read as `equations.gamma` in the initial condition - confirm against the
# Trixi.jl documentation of `CompressibleEulerEquations3D`.
equations = CompressibleEulerEquations3D(1.4)

# Taylor-Green vortex initial condition for the 3D compressible Euler equations.
#
# Evaluates density, velocity and pressure at the position `x` and converts them
# with `prim2cons` using `equations`. The time `t` is not used. The constant part
# of the pressure is scaled with the maximum Mach number `mach_max`.
# (A plain comment is used instead of a docstring since this file may be included
# several times into the same module.)
function initial_condition_taylor_green_vortex(x, t,
                                               equations::CompressibleEulerEquations3D)
    speed = 1.0 # magnitude of speed
    mach_max = 0.1 # maximum Mach number

    x1, x2, x3 = x[1], x[2], x[3]

    rho = 1.0
    v1 = speed * sin(x1) * cos(x2) * cos(x3)
    v2 = -speed * cos(x1) * sin(x2) * cos(x3)
    v3 = 0.0

    # constant background pressure: scaling to get `mach_max`
    p_background = (speed / mach_max)^2 * rho / equations.gamma
    # spatially varying part of the pressure
    p_variation = 1.0 / 16.0 * speed^2 * rho *
                  (cos(2 * x1) * cos(2 * x3) +
                   2 * cos(2 * x2) + 2 * cos(2 * x1) + cos(2 * x2) * cos(2 * x3))
    p = p_background + p_variation

    return prim2cons(SVector(rho, v1, v2, v3, p), equations)
end

# NOTE: The benchmark driver `run.jl` in this directory includes this file via
# `trixi_include` and passes `tspan`, `maxiters`, `initial_refinement_level`,
# `storage_type`, and `real_type` as keyword arguments. The assignments and
# keyword arguments with these names below therefore only provide defaults;
# keep their names unchanged.
initial_condition = initial_condition_taylor_green_vortex

# split-form DGSEM: `flux_ranocha` in the volume, `flux_lax_friedrichs` at surfaces
volume_flux = flux_ranocha
surface_flux = flux_lax_friedrichs
volume_integral = VolumeIntegralFluxDifferencing(volume_flux)
solver = DGSEM(polydeg = 5, surface_flux = surface_flux, volume_integral = volume_integral)

# the domain is the cube [-pi, pi]^3
coordinates_min = (-1.0, -1.0, -1.0) .* pi
coordinates_max = (1.0, 1.0, 1.0) .* pi

# default mesh resolution (the benchmark driver passes its own
# `initial_refinement_level`)
initial_refinement_level = 1
trees_per_dimension = (4, 4, 4)

mesh = P4estMesh(trees_per_dimension, polydeg = 1,
coordinates_min = coordinates_min, coordinates_max = coordinates_max,
periodicity = true, initial_refinement_level = initial_refinement_level)

semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver;
boundary_conditions = boundary_condition_periodic)

###############################################################################
# ODE solvers, callbacks etc.

tspan = (0.0, 100.0)
# `storage_type` and `real_type` are `nothing` by default; the benchmark driver
# passes `ROCArray` and `Float64` for them
ode = semidiscretize(semi, tspan; storage_type = nothing, real_type = nothing)

summary_callback = SummaryCallback()

stepsize_callback = StepsizeCallback(cfl = 0.1)

callbacks = CallbackSet(summary_callback,
stepsize_callback)

###############################################################################
# run the simulation

# upper bound for the number of time steps (the final time in `tspan` is
# presumably never reached in the benchmark - TODO confirm)
maxiters = 200

# disable warnings when maxiters is reached
# NOTE(review): `dt = 1.0` is presumably only a placeholder that is replaced by
# the `stepsize_callback` - confirm
integrator = init(ode, CarpenterKennedy2N54(williamson_condition = false),
dt = 1.0,
save_everystep = false, callback = callbacks,
maxiters = maxiters, verbose = false)

solve!(integrator)
59 changes: 59 additions & 0 deletions benchmark/AMDGPU/run.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
using Trixi
using AMDGPU
using TimerOutputs
using JSON

# Benchmark driver for the elixir located at `elixir_path`.
#
# The elixir is included twice via `trixi_include` with `ROCArray` storage and
# `Float64` as real type: first with a tiny time span as warm-up (to trigger
# compilation), then with a fixed number of time steps on a refined mesh. The
# collected timings are written as JSON to the file `metrics.out` in the current
# working directory.
function main(elixir_path)
    # setup: keyword arguments forwarded to the elixir
    gpu_setup = (; storage_type = ROCArray, real_type = Float64)
    run_setup = (; maxiters = 50, initial_refinement_level = 3)

    println("Warming up...")

    # start simulation with tiny final time to trigger compilation
    duration_compile = @elapsed trixi_include(elixir_path;
                                              tspan = (0.0, 1e-14),
                                              gpu_setup...)

    println("Finished warm-up in $duration_compile seconds\n")
    println("Starting simulation...")

    # start the real simulation
    duration_elixir = @elapsed trixi_include(elixir_path;
                                             run_setup...,
                                             gpu_setup...)

    # store metrics (on every rank!)
    metrics = Dict{String, Float64}("elapsed time" => duration_elixir)

    # read TimerOutputs timings; the factor converts nanoseconds to seconds
    ns_to_s = 1.0e-9
    to = Trixi.timer()
    metrics["total time"] = ns_to_s * TimerOutputs.tottime(to)
    metrics["rhs! time"] = ns_to_s * TimerOutputs.time(to["rhs!"])

    # compute performance index
    # `semi` is created by the included elixir after this function was compiled,
    # hence it has to be accessed via `@invokelatest`
    semi_latest = @invokelatest (@__MODULE__).semi
    # the number of calls has to be read before `take!` is applied to the counter
    rhs_calls = Trixi.ncalls(semi_latest.performance_counter)
    walltime = ns_to_s * take!(semi_latest.performance_counter)
    metrics["PID"] = walltime * Trixi.mpi_nranks() /
                     (Trixi.ndofsglobal(semi_latest) * rhs_calls)

    # write json file (indentation of 2)
    open("metrics.out", "w") do io
        JSON.print(io, metrics, 2)
    end
end

# hardcoded elixir: the benchmark setup located in the same directory as this script
elixir_path = joinpath(@__DIR__(), "elixir_euler_taylor_green_vortex.jl")

# run the benchmark when this script is executed
main(elixir_path)
31 changes: 31 additions & 0 deletions ext/TrixiAMDGPUExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Package extension for adding AMDGPU-based features to Trixi.jl
#
# It adds methods to `Trixi.storage_type` and `Trixi.unsafe_wrap_or_alloc` for the
# array types of AMDGPU.jl and, depending on a preference, device overrides for
# `Trixi.log`.
module TrixiAMDGPUExt

using AMDGPU: AMDGPU, ROCArray, ROCDeviceArray
import AMDGPU.Device: @device_override
import AMDGPU.Runtime: Adaptor
import Trixi

# Every `ROCArray` type is mapped to the plain `ROCArray` as storage type.
Trixi.storage_type(::Type{<:ROCArray}) = ROCArray

# With the `Adaptor` of AMDGPU.jl, dispatch to the method for `ROCDeviceArray`s.
Trixi.unsafe_wrap_or_alloc(::Adaptor, vec, size) =
    Trixi.unsafe_wrap_or_alloc(ROCDeviceArray, vec, size)

# A `ROCDeviceArray` is just reshaped to the requested `size`.
Trixi.unsafe_wrap_or_alloc(::Type{<:ROCDeviceArray}, vec::ROCDeviceArray, size) =
    reshape(vec, size)

# If Trixi.jl is configured to use its own `log` ("log_Trixi_NaN"), override
# `Trixi.log` in device code by calls to the external `__ocml_log_*` functions.
@static if Trixi._PREFERENCE_LOG == "log_Trixi_NaN"
    @device_override Trixi.log(x::Float64) = ccall("extern __ocml_log_f64", llvmcall,
                                                   Cdouble,
                                                   (Cdouble,), x)
    @device_override Trixi.log(x::Float32) = ccall("extern __ocml_log_f32", llvmcall,
                                                   Cfloat,
                                                   (Cfloat,), x)
    # TODO: Trixi.log(x::Float16)
end

end
10 changes: 6 additions & 4 deletions test/Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
[deps]
Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
Expand Down Expand Up @@ -45,9 +46,10 @@ TrixiTest = "0a316866-cbd0-4425-8bcb-08103b2c1f26"
[compat]
Accessors = "0.1.42"
ADTypes = "1.16"
Adapt = "4.3"
AMDGPU = "2.2.1"
Adapt = "4.4"
Aqua = "0.8"
CUDA = "5.8.2"
CUDA = "5.9.1"
CairoMakie = "0.13, 0.14, 0.15"
Convex = "0.16"
DelimitedFiles = "1"
Expand All @@ -60,7 +62,7 @@ ForwardDiff = "0.10.38, 1"
Krylov = "0.10"
LinearAlgebra = "1"
LinearSolve = "3.54"
MPI = "0.20.22"
MPI = "0.20.23"
NLsolve = "4.5.1"
OrdinaryDiffEqBDF = "1.15"
OrdinaryDiffEqCore = "3.8"
Expand Down
10 changes: 10 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,16 @@ const TRIXI_NTHREADS = clamp(Sys.CPU_THREADS, 2, 3)
end
end

# AMDGPU tests: selected by `TRIXI_TEST` being "all" or "AMDGPU", but only run
# if AMDGPU.jl reports a functional setup; otherwise, a warning is emitted.
@time if TRIXI_TEST == "AMDGPU" || TRIXI_TEST == "all"
    import AMDGPU
    if !AMDGPU.functional()
        @warn "Unable to run AMDGPU tests on this machine"
    else
        for test_file in ("test_amdgpu_2d.jl", "test_amdgpu_3d.jl")
            include(joinpath(@__DIR__, test_file))
        end
    end
end

@time if TRIXI_TEST == "all" || TRIXI_TEST == "kernelabstractions"
previous_backend = Trixi._PREFERENCE_THREADING
Trixi.set_threading_backend!(:kernelabstractions)
Expand Down
73 changes: 73 additions & 0 deletions test/test_amdgpu_2d.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Tests running the 2D `P4estMesh` GPU advection elixir once with the default
# (CPU) storage and once with `ROCArray` storage from AMDGPU.jl.
module TestAMDGPU2D

using Test
using Trixi

# NOTE(review): presumably provides the helpers `@trixi_testset`,
# `@test_trixi_include`, and `@test_allocations` used below - confirm
include("test_trixi.jl")

EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem")

# Start with a clean environment: remove Trixi.jl output directory if it exists
outdir = "out"
isdir(outdir) && rm(outdir, recursive = true)

@testset "AMDGPU 2D" begin
#! format: noindent

# Reference run without `storage_type`/`real_type`: everything is expected to be
# stored in plain `Array`s using `Float64`
@trixi_testset "elixir_advection_basic_gpu.jl native" begin
@test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
# Expected errors are exactly the same as with TreeMesh!
l2=8.311947673061856e-6,
linf=6.627000273229378e-5)
# Ensure that we do not have excessive memory allocations
# (e.g., from type instabilities)
semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem.
@test_allocations(Trixi.rhs!, semi, sol, 1000)
@test real(ode.p.solver) == Float64
@test real(ode.p.solver.basis) == Float64
@test real(ode.p.solver.mortar) == Float64
# TODO: `mesh` is currently not `adapt`ed correctly
@test real(ode.p.mesh) == Float64

@test ode.u0 isa Array
@test ode.p.solver.basis.derivative_matrix isa Array

@test Trixi.storage_type(ode.p.cache.elements) === Array
@test Trixi.storage_type(ode.p.cache.interfaces) === Array
@test Trixi.storage_type(ode.p.cache.boundaries) === Array
@test Trixi.storage_type(ode.p.cache.mortars) === Array
end

# Same elixir with `real_type=Float32` and `storage_type=ROCArray`: the solver
# data is expected to use `Float32` and to be stored in `ROCArray`s
@trixi_testset "elixir_advection_basic_gpu.jl Float32 / AMDGPU" begin
# Using AMDGPU inside the testset since otherwise the bindings are hidden by the anonymous modules
using AMDGPU
@test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"),
# Expected errors are exactly the same as with TreeMesh!
l2=[Float32(8.311947673061856e-6)],
linf=[Float32(6.627000273229378e-5)],
RealT_for_test_tolerances=Float32,
real_type=Float32,
storage_type=ROCArray)
# Ensure that we do not have excessive memory allocations
# (e.g., from type instabilities)
semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem.
# Note the larger allocation bound compared to the native run above
@test_allocations(Trixi.rhs!, semi, sol, 50_000)
@test real(ode.p.solver) == Float32
@test real(ode.p.solver.basis) == Float32
@test real(ode.p.solver.mortar) == Float32
# TODO: `mesh` is currently not `adapt`ed correctly
# (hence `Float64` is expected here although `real_type=Float32` is requested)
@test real(ode.p.mesh) == Float64

@test ode.u0 isa ROCArray
@test ode.p.solver.basis.derivative_matrix isa ROCArray

@test Trixi.storage_type(ode.p.cache.elements) === ROCArray
@test Trixi.storage_type(ode.p.cache.interfaces) === ROCArray
@test Trixi.storage_type(ode.p.cache.boundaries) === ROCArray
@test Trixi.storage_type(ode.p.cache.mortars) === ROCArray
end

# Clean up afterwards: delete Trixi.jl output directory
@test_nowarn isdir(outdir) && rm(outdir, recursive = true)
end
end # module
Loading
Loading