diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 344b8eacc3a..c385f9d95b6 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -19,3 +19,21 @@ steps: timeout_in_minutes: 60 soft_fail: - exit_status: 3 + - label: "AMDGPU Julia {{matrix.version}}" + matrix: + setup: + version: + - "1.10" + plugins: + - JuliaCI/julia#v1: + version: "{{matrix.version}}" + - JuliaCI/julia-test#v1: ~ + env: + TRIXI_TEST: "AMDGPU" + agents: + queue: "juliagpu" + rocm: "*" + if: build.message !~ /\[skip ci\]/ + timeout_in_minutes: 60 + soft_fail: + - exit_status: 3 diff --git a/NEWS.md b/NEWS.md index 58b3783f533..fc572df5bf1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,11 @@ Trixi.jl follows the interpretation of used in the Julia ecosystem. Notable changes will be documented in this file for human readability. +## Changes in the v0.16 lifecycle + +#### Added +- GPU support extended to include AMD GPU with a buildkite workflow using `TRIXI_TEST=AMDGPU` ([#2834]). + ## Changes when updating to v0.16 from v0.15.x #### Changed diff --git a/Project.toml b/Project.toml index 2564c2deca5..6ebd27dcf8a 100644 --- a/Project.toml +++ b/Project.toml @@ -53,6 +53,7 @@ TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [weakdeps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Convex = "f65535da-76fb-5f13-bab9-19810c17039a" ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199" @@ -62,6 +63,7 @@ Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" [extensions] +TrixiAMDGPUExt = "AMDGPU" TrixiCUDAExt = "CUDA" TrixiConvexECOSExt = ["Convex", "ECOS"] TrixiMakieExt = "Makie" @@ -71,8 +73,9 @@ TrixiSparseConnectivityTracerExt = "SparseConnectivityTracer" [compat] Accessors = "0.1.42" -Adapt = "4.3" -CUDA = "5.8.2" +AMDGPU = "2.2.1" +Adapt = "4.4" +CUDA = "5.9.1" CodeTracking = "1.0.5, 2, 3" ConstructionBase = "1.5.8" Convex = 
"0.16" @@ -90,7 +93,7 @@ KernelAbstractions = "0.9.38" LinearAlgebra = "1" LinearMaps = "2.7, 3.0" LoopVectorization = "0.12.171" -MPI = "0.20.22" +MPI = "0.20.23" Makie = "0.22, 0.23, 0.24" MuladdMacro = "0.2.4" NLsolve = "4.5.1" diff --git a/benchmark/AMDGPU/Project.toml b/benchmark/AMDGPU/Project.toml new file mode 100644 index 00000000000..294e9c6ebd2 --- /dev/null +++ b/benchmark/AMDGPU/Project.toml @@ -0,0 +1,16 @@ +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +OrdinaryDiffEqLowStorageRK = "b0944070-b475-4768-8dec-fb6eb410534d" +TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb" + +[sources] +Trixi = {path = "../.."} + +[compat] +AMDGPU = "2.3" +JSON = "1.4.0" +OrdinaryDiffEqLowStorageRK = "1.12.0" +TimerOutputs = "0.5.25" +Trixi = "0.16" diff --git a/benchmark/AMDGPU/elixir_euler_taylor_green_vortex.jl b/benchmark/AMDGPU/elixir_euler_taylor_green_vortex.jl new file mode 100644 index 00000000000..02b8beda396 --- /dev/null +++ b/benchmark/AMDGPU/elixir_euler_taylor_green_vortex.jl @@ -0,0 +1,71 @@ +using OrdinaryDiffEqLowStorageRK +using Trixi + +############################################################################### +# semidiscretization of the compressible Euler equations + +equations = CompressibleEulerEquations3D(1.4) + +function initial_condition_taylor_green_vortex(x, t, + equations::CompressibleEulerEquations3D) + A = 1.0 # magnitude of speed + Ms = 0.1 # maximum Mach number + + rho = 1.0 + v1 = A * sin(x[1]) * cos(x[2]) * cos(x[3]) + v2 = -A * cos(x[1]) * sin(x[2]) * cos(x[3]) + v3 = 0.0 + p = (A / Ms)^2 * rho / equations.gamma # scaling to get Ms + p = p + + 1.0 / 16.0 * A^2 * rho * + (cos(2 * x[1]) * cos(2 * x[3]) + + 2 * cos(2 * x[2]) + 2 * cos(2 * x[1]) + cos(2 * x[2]) * cos(2 * x[3])) + + return prim2cons(SVector(rho, v1, v2, v3, p), equations) +end + +initial_condition = initial_condition_taylor_green_vortex + +volume_flux = 
flux_ranocha +surface_flux = flux_lax_friedrichs +volume_integral = VolumeIntegralFluxDifferencing(volume_flux) +solver = DGSEM(polydeg = 5, surface_flux = surface_flux, volume_integral = volume_integral) + +coordinates_min = (-1.0, -1.0, -1.0) .* pi +coordinates_max = (1.0, 1.0, 1.0) .* pi + +initial_refinement_level = 1 +trees_per_dimension = (4, 4, 4) + +mesh = P4estMesh(trees_per_dimension, polydeg = 1, + coordinates_min = coordinates_min, coordinates_max = coordinates_max, + periodicity = true, initial_refinement_level = initial_refinement_level) + +semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver; + boundary_conditions = boundary_condition_periodic) + +############################################################################### +# ODE solvers, callbacks etc. + +tspan = (0.0, 100.0) +ode = semidiscretize(semi, tspan; storage_type = nothing, real_type = nothing) + +summary_callback = SummaryCallback() + +stepsize_callback = StepsizeCallback(cfl = 0.1) + +callbacks = CallbackSet(summary_callback, + stepsize_callback) + +############################################################################### +# run the simulation + +maxiters = 200 + +# disable warnings when maxiters is reached +integrator = init(ode, CarpenterKennedy2N54(williamson_condition = false), + dt = 1.0, + save_everystep = false, callback = callbacks, + maxiters = maxiters, verbose = false) + +solve!(integrator) diff --git a/benchmark/AMDGPU/run.jl b/benchmark/AMDGPU/run.jl new file mode 100644 index 00000000000..76bdb62004d --- /dev/null +++ b/benchmark/AMDGPU/run.jl @@ -0,0 +1,59 @@ +using Trixi +using AMDGPU +using TimerOutputs +using JSON + +function main(elixir_path) + + # setup + maxiters = 50 + initial_refinement_level = 3 + storage_type = ROCArray + real_type = Float64 + + println("Warming up...") + + # start simulation with tiny final time to trigger compilation + duration_compile = @elapsed begin + trixi_include(elixir_path, + tspan = (0.0, 1e-14), + 
storage_type = storage_type, + real_type = real_type) + end + + println("Finished warm-up in $duration_compile seconds\n") + println("Starting simulation...") + + # start the real simulation + duration_elixir = @elapsed trixi_include(elixir_path, + maxiters = maxiters, + initial_refinement_level = initial_refinement_level, + storage_type = storage_type, + real_type = real_type) + + # store metrics (on every rank!) + metrics = Dict{String, Float64}("elapsed time" => duration_elixir) + + # read TimerOutputs timings + timer = Trixi.timer() + metrics["total time"] = 1.0e-9 * TimerOutputs.tottime(timer) + metrics["rhs! time"] = 1.0e-9 * TimerOutputs.time(timer["rhs!"]) + + # compute performance index + latest_semi = @invokelatest (@__MODULE__).semi + nrhscalls = Trixi.ncalls(latest_semi.performance_counter) + walltime = 1.0e-9 * take!(latest_semi.performance_counter) + metrics["PID"] = walltime * Trixi.mpi_nranks() / + (Trixi.ndofsglobal(latest_semi) * nrhscalls) + + # write json file + open("metrics.out", "w") do f + indent = 2 + JSON.print(f, metrics, indent) + end +end + +# hardcoded elixir +elixir_path = joinpath(@__DIR__(), "elixir_euler_taylor_green_vortex.jl") + +main(elixir_path) diff --git a/ext/TrixiAMDGPUExt.jl b/ext/TrixiAMDGPUExt.jl new file mode 100644 index 00000000000..ad427df3495 --- /dev/null +++ b/ext/TrixiAMDGPUExt.jl @@ -0,0 +1,31 @@ +# Package extension for adding AMDGPU-based features to Trixi.jl +module TrixiAMDGPUExt + +using AMDGPU: AMDGPU, ROCArray, ROCDeviceArray +import AMDGPU.Device: @device_override +import AMDGPU.Runtime: Adaptor +import Trixi + +function Trixi.storage_type(::Type{<:ROCArray}) + return ROCArray +end + +function Trixi.unsafe_wrap_or_alloc(::Adaptor, vec, size) + return Trixi.unsafe_wrap_or_alloc(ROCDeviceArray, vec, size) +end + +function Trixi.unsafe_wrap_or_alloc(::Type{<:ROCDeviceArray}, vec::ROCDeviceArray, size) + return reshape(vec, size) +end + +@static if Trixi._PREFERENCE_LOG == "log_Trixi_NaN" + @device_override 
Trixi.log(x::Float64) = ccall("extern __ocml_log_f64", llvmcall, + Cdouble, + (Cdouble,), x) + @device_override Trixi.log(x::Float32) = ccall("extern __ocml_log_f32", llvmcall, + Cfloat, + (Cfloat,), x) + # TODO: Trixi.log(x::Float16) +end + +end diff --git a/test/Project.toml b/test/Project.toml index d9cfc69117a..1b2f8c34a60 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,7 +1,8 @@ [deps] -Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" +Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" @@ -45,9 +46,10 @@ TrixiTest = "0a316866-cbd0-4425-8bcb-08103b2c1f26" [compat] Accessors = "0.1.42" ADTypes = "1.16" -Adapt = "4.3" +AMDGPU = "2.2.1" +Adapt = "4.4" Aqua = "0.8" -CUDA = "5.8.2" +CUDA = "5.9.1" CairoMakie = "0.13, 0.14, 0.15" Convex = "0.16" DelimitedFiles = "1" @@ -60,7 +62,7 @@ ForwardDiff = "0.10.38, 1" Krylov = "0.10" LinearAlgebra = "1" LinearSolve = "3.54" -MPI = "0.20.22" +MPI = "0.20.23" NLsolve = "4.5.1" OrdinaryDiffEqBDF = "1.15" OrdinaryDiffEqCore = "3.8" diff --git a/test/runtests.jl b/test/runtests.jl index faacce41a27..16f581c1032 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -130,6 +130,16 @@ const TRIXI_NTHREADS = clamp(Sys.CPU_THREADS, 2, 3) end end + @time if TRIXI_TEST == "all" || TRIXI_TEST == "AMDGPU" + import AMDGPU + if AMDGPU.functional() + include(joinpath(@__DIR__, "test_amdgpu_2d.jl")) + include(joinpath(@__DIR__, "test_amdgpu_3d.jl")) + else + @warn "Unable to run AMDGPU tests on this machine" + end + end + @time if TRIXI_TEST == "all" || TRIXI_TEST == "kernelabstractions" previous_backend = Trixi._PREFERENCE_THREADING Trixi.set_threading_backend!(:kernelabstractions) diff --git a/test/test_amdgpu_2d.jl b/test/test_amdgpu_2d.jl 
new file mode 100644 index 00000000000..eb752ca12b8 --- /dev/null +++ b/test/test_amdgpu_2d.jl @@ -0,0 +1,73 @@ +module TestAMDGPU2D + +using Test +using Trixi + +include("test_trixi.jl") + +EXAMPLES_DIR = joinpath(examples_dir(), "p4est_2d_dgsem") + +# Start with a clean environment: remove Trixi.jl output directory if it exists +outdir = "out" +isdir(outdir) && rm(outdir, recursive = true) + +@testset "AMDGPU 2D" begin +#! format: noindent + +@trixi_testset "elixir_advection_basic_gpu.jl native" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! + l2=8.311947673061856e-6, + linf=6.627000273229378e-5) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test real(ode.p.solver) == Float64 + @test real(ode.p.solver.basis) == Float64 + @test real(ode.p.solver.mortar) == Float64 + # TODO: `mesh` is currently not `adapt`ed correctly + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa Array + @test ode.p.solver.basis.derivative_matrix isa Array + + @test Trixi.storage_type(ode.p.cache.elements) === Array + @test Trixi.storage_type(ode.p.cache.interfaces) === Array + @test Trixi.storage_type(ode.p.cache.boundaries) === Array + @test Trixi.storage_type(ode.p.cache.mortars) === Array +end + +@trixi_testset "elixir_advection_basic_gpu.jl Float32 / AMDGPU" begin + # Using AMDGPU inside the testset since otherwise the bindings are hidden by the anonymous modules + using AMDGPU + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! 
+ l2=[Float32(8.311947673061856e-6)], + linf=[Float32(6.627000273229378e-5)], + RealT_for_test_tolerances=Float32, + real_type=Float32, + storage_type=ROCArray) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. + @test_allocations(Trixi.rhs!, semi, sol, 50_000) + @test real(ode.p.solver) == Float32 + @test real(ode.p.solver.basis) == Float32 + @test real(ode.p.solver.mortar) == Float32 + # TODO: `mesh` is currently not `adapt`ed correctly + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa ROCArray + @test ode.p.solver.basis.derivative_matrix isa ROCArray + + @test Trixi.storage_type(ode.p.cache.elements) === ROCArray + @test Trixi.storage_type(ode.p.cache.interfaces) === ROCArray + @test Trixi.storage_type(ode.p.cache.boundaries) === ROCArray + @test Trixi.storage_type(ode.p.cache.mortars) === ROCArray +end + +# Clean up afterwards: delete Trixi.jl output directory +@test_nowarn isdir(outdir) && rm(outdir, recursive = true) +end +end # module diff --git a/test/test_amdgpu_3d.jl b/test/test_amdgpu_3d.jl new file mode 100644 index 00000000000..121e537fbad --- /dev/null +++ b/test/test_amdgpu_3d.jl @@ -0,0 +1,73 @@ +module TestAMDGPU3D + +using Test +using Trixi + +include("test_trixi.jl") + +EXAMPLES_DIR = joinpath(examples_dir(), "p4est_3d_dgsem") + +# Start with a clean environment: remove Trixi.jl output directory if it exists +outdir = "out" +isdir(outdir) && rm(outdir, recursive = true) + +@testset "AMDGPU 3D" begin +#! format: noindent + +@trixi_testset "elixir_advection_basic_gpu.jl native" begin + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors are exactly the same as with TreeMesh! 
+ l2=[0.00016263963870641478], + linf=[0.0014537194925779984]) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. + @test_allocations(Trixi.rhs!, semi, sol, 1000) + @test real(ode.p.solver) == Float64 + @test real(ode.p.solver.basis) == Float64 + @test real(ode.p.solver.mortar) == Float64 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa Array + @test ode.p.solver.basis.derivative_matrix isa Array + + @test Trixi.storage_type(ode.p.cache.elements) === Array + @test Trixi.storage_type(ode.p.cache.interfaces) === Array + @test Trixi.storage_type(ode.p.cache.boundaries) === Array + @test Trixi.storage_type(ode.p.cache.mortars) === Array +end + +@trixi_testset "elixir_advection_basic_gpu.jl Float32 / AMDGPU" begin + # Using AMDGPU inside the testset since otherwise the bindings are hidden by the anonymous modules + using AMDGPU + @test_trixi_include(joinpath(EXAMPLES_DIR, "elixir_advection_basic_gpu.jl"), + # Expected errors similar to reference on CPU + l2=[Float32(0.00016263963870641478)], + linf=[Float32(0.0014537194925779984)], + RealT_for_test_tolerances=Float32, + real_type=Float32, + storage_type=ROCArray) + # Ensure that we do not have excessive memory allocations + # (e.g., from type instabilities) + semi = ode.p # `semidiscretize` adapts the semi, so we need to obtain it from the ODE problem. 
+ @test_allocations(Trixi.rhs!, semi, sol, 50_000) + @test real(ode.p.solver) == Float32 + @test real(ode.p.solver.basis) == Float32 + @test real(ode.p.solver.mortar) == Float32 + # TODO: remake ignores the mesh itself as well + @test real(ode.p.mesh) == Float64 + + @test ode.u0 isa ROCArray + @test ode.p.solver.basis.derivative_matrix isa ROCArray + + @test Trixi.storage_type(ode.p.cache.elements) === ROCArray + @test Trixi.storage_type(ode.p.cache.interfaces) === ROCArray + @test Trixi.storage_type(ode.p.cache.boundaries) === ROCArray + @test Trixi.storage_type(ode.p.cache.mortars) === ROCArray +end + +# Clean up afterwards: delete Trixi.jl output directory +@test_nowarn isdir(outdir) && rm(outdir, recursive = true) +end +end # module