Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
env:

steps:
- label: "CUDA Julia {{matrix.version}}"
matrix:
setup:
version:
- "1.10"
plugins:
- JuliaCI/julia#v1:
version: "{{matrix.version}}"
- JuliaCI/julia-test#v1: ~
env:
TRIXI_TEST: "CUDA"
agents:
queue: "juliagpu"
cuda: "*"
if: build.message !~ /\[skip ci\]/
timeout_in_minutes: 60
soft_fail:
- exit_status: 3
5 changes: 5 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ version = "0.11.12-DEV"

[deps]
Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CodeTracking = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2"
ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Expand Down Expand Up @@ -56,14 +57,18 @@ Convex = "f65535da-76fb-5f13-bab9-19810c17039a"
ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199"
Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
TrixiConvexECOSExt = ["Convex", "ECOS"]
TrixiMakieExt = "Makie"
TrixiNLsolveExt = "NLsolve"
TrixiCUDAExt = "CUDA"

[compat]
Accessors = "0.1.36"
Adapt = "4"
CUDA = "5"
CodeTracking = "1.0.5"
ConstructionBase = "1.5"
Convex = "0.16"
Expand Down
3 changes: 2 additions & 1 deletion docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,8 @@ makedocs(
"Style guide" => "styleguide.md",
"Testing" => "testing.md",
"Performance" => "performance.md",
"Parallelization" => "parallelization.md"
"Parallelization" => "parallelization.md",
"Heterogeneous" => "heterogeneous.md"
],
"Troubleshooting and FAQ" => "troubleshooting.md",
"Reference" => [
Expand Down
82 changes: 82 additions & 0 deletions docs/src/heterogeneous.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Heterogeneous computing

Support for heterogeneous computing is currently being worked on.

## The use of Adapt.jl

[`Adapt.jl`](https://github.com/JuliaGPU/Adapt.jl) is a package in the JuliaGPU family that allows for
the translation of nested data structures. The primary goal is to allow the substitution of `Array`
at the storage leaves with a GPU array like `CuArray`.

To facilitate this, data structures must be parameterized, so instead of:

```julia
struct Container
data::Array{Float64,2}
end
```

They must be written as:

```julia
struct Container{D<:AbstractArray} <: Trixi.AbstractContainer
data::D
end
```

Furthermore, we need to define a function that allows for the conversion of storage
of our types:

```julia
function Adapt.adapt_structure(to, C::Container)
return Container(adapt(to, C.data))
end
```

or simply

```julia
Adapt.@adapt_structure(Container)
```

Additionally, we must define `Adapt.parent_type`:

```julia
function Adapt.parent_type(::Type{<:Container{D}}) where D
return D
end
```

```julia-repl
julia> C = Container(zeros(3))
Container{Vector{Float64}}([0.0, 0.0, 0.0])

julia> Trixi.storage_type(C)
Array

julia> using CUDA

julia> GPU_C = adapt(CuArray, C)
Container{CuArray{Float64, 1, CUDA.DeviceMemory}}([0.0, 0.0, 0.0])

julia> Trixi.storage_type(GPU_C)
CuArray
```

## Element-type conversion with `Trixi.trixi_adapt`

We can use `Trixi.trixi_adapt` to perform both an element-type and a storage-type adaptation:

```julia-repl
julia> C = Container(zeros(3))
Container{Vector{Float64}}([0.0, 0.0, 0.0])

julia> Trixi.trixi_adapt(Array, Float32, C)
Container{Vector{Float32}}(Float32[0.0, 0.0, 0.0])

julia> Trixi.trixi_adapt(CuArray, Float32, C)
Container{CuArray{Float32, 1, CUDA.DeviceMemory}}(Float32[0.0, 0.0, 0.0])
```

!!! note
`adapt(Array{Float32}, C)` is tempting but will do the wrong thing in the presence of `StaticArrays`.
11 changes: 11 additions & 0 deletions ext/TrixiCUDAExt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Package extension for adding CUDA-based features to Trixi.jl
module TrixiCUDAExt

import CUDA: CuArray
import Trixi

# Report `CuArray` as the storage backend for every `CuArray` subtype,
# irrespective of element type, dimensionality, or memory kind.
Trixi.storage_type(::Type{<:CuArray}) = CuArray

end
2 changes: 2 additions & 0 deletions src/Trixi.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ import SciMLBase: get_du, get_tmp_cache, u_modified!,

using DelimitedFiles: readdlm
using Downloads: Downloads
using Adapt: Adapt, adapt
using CodeTracking: CodeTracking
using ConstructionBase: ConstructionBase
using DiffEqBase: DiffEqBase, get_tstops, get_tstops_array
Expand Down Expand Up @@ -125,6 +126,7 @@ include("basic_types.jl")

# Include all top-level source files
include("auxiliary/auxiliary.jl")
include("auxiliary/vector_of_arrays.jl")
include("auxiliary/mpi.jl")
include("auxiliary/p4est.jl")
include("auxiliary/t8code.jl")
Expand Down
84 changes: 84 additions & 0 deletions src/auxiliary/containers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -314,4 +314,88 @@ end
# Convenience method: copy the element at index `from` to index `destination`
# within the same container `c`, delegating to the two-container,
# five-argument `raw_copy!` method.
function raw_copy!(c::AbstractContainer, from::Int, destination::Int)
    raw_copy!(c, c, from, from, destination)
end

# Trixi storage types must implement these two Adapt.jl methods
# (`Adapt.adapt_structure` and `Adapt.parent_type`). This fallback fails
# loudly for container types that have not opted into the interface.
function Adapt.adapt_structure(to, c::AbstractContainer)
    error("Interface: Must implement Adapt.adapt_structure(to, ::$(typeof(c)))")
end

# Fallback for the required `Adapt.parent_type` interface method; fails
# loudly for container types that have not opted in.
# Fix: the error message previously lacked the closing parenthesis after
# `::Type{$C}`, producing an unbalanced method signature in the message.
function Adapt.parent_type(C::Type{<:AbstractContainer})
    error("Interface: Must implement Adapt.parent_type(::Type{$C})")
end

# Recursively unwrap a container type until the underlying storage array
# type is reached (delegates one level via `Adapt.parent_type`).
function Adapt.unwrap_type(C::Type{<:AbstractContainer})
    Adapt.unwrap_type(Adapt.parent_type(C))
end

# TODO: Upstream to Adapt
"""
    storage_type(x)
    storage_type(T::Type)

Return the array type (e.g. `Array` or, via the `TrixiCUDAExt` extension,
`CuArray`) that backs the storage of `x`. Types must opt in by defining a
method for their own `Type`; the generic `Type` fallback throws an error.
"""
function storage_type(x)
    return storage_type(typeof(x))
end

# Fallback: fail loudly for types that do not declare their storage backend.
# Fix: the error message previously lacked the closing parenthesis after
# `::Type{$T}`.
function storage_type(T::Type)
    error("Interface: Must implement storage_type(::Type{$T})")
end

# Plain CPU `Array`s are their own storage backend.
function storage_type(::Type{<:Array})
    Array
end

# A container reports the storage backend of its innermost parent array.
function storage_type(C::Type{<:AbstractContainer})
    storage_type(Adapt.unwrap_type(C))
end

# Some storage backends (e.g. CUDA.jl) appear to represent empty arrays as
# null pointers, which can make `unsafe_wrap` fail with an `ArgumentError`
# when called via `Adapt.adapt` (see
# https://github.com/JuliaGPU/CUDA.jl/blob/v5.4.2/src/array.jl#L212-L229).
# To circumvent this, allocate a fresh empty array instead of wrapping
# whenever the backing vector has length zero. Since zero-length arrays take
# no part in calculations, it is acceptable for the underlying storage
# vector and the wrapped array to differ, as long as they are properly
# re-wrapped on `resize!` etc.
function unsafe_wrap_or_alloc(to, vector, size)
    if isempty(vector)
        return similar(vector, size)
    end
    return unsafe_wrap(to, pointer(vector), size)
end

# Adaptor carrying both the target storage backend (e.g. `Array`, `CuArray`)
# and the target real/element type as type parameters.
struct TrixiAdaptor{Storage, Real} end

"""
    trixi_adapt(storage, real, x)

Adapt `x` to the given `storage` backend and `real` element type in a
single pass, using the custom `TrixiAdaptor` rules.
"""
function trixi_adapt(storage, real, x)
    return adapt(TrixiAdaptor{storage, real}(), x)
end

# Custom rules
# NOTE: In the methods below, `Real` and `Storage` are type *parameters*
# extracted from the `TrixiAdaptor`; in particular `Real` shadows
# `Base.Real` inside these method bodies.
# 1. handling of StaticArrays
# A bare static array only has its element type converted to the adaptor's
# `Real`; its static size is preserved and no storage conversion applies.
function Adapt.adapt_storage(::TrixiAdaptor{<:Any, Real},
x::StaticArrays.StaticArray{S, T, N}) where {Real, S, T, N}
StaticArrays.similar_type(x, Real)(x)
end

# 2. Handling of Arrays
# Arrays of floating point numbers: convert both the storage backend and
# the element type, e.g. `Vector{Float64} -> CuArray{Float32}`.
function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real},
x::AbstractArray{T}) where {Storage, Real,
T <: AbstractFloat}
adapt(Storage{Real}, x)
end

# Arrays whose elements are themselves static arrays: convert the storage
# backend and the *inner* element type of each static array.
function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real},
x::AbstractArray{T}) where {Storage, Real,
T <: StaticArrays.StaticArray}
adapt(Storage{StaticArrays.similar_type(T, Real)}, x)
end

# All other arrays (e.g. integer index arrays): convert only the storage
# backend and leave the element type untouched.
function Adapt.adapt_storage(::TrixiAdaptor{Storage, Real},
x::AbstractArray) where {Storage, Real}
adapt(Storage, x)
end

# 3. TODO: Should we have a fallback? But that would imply implementing things for NamedTuple again

# Unwrap the adaptor to its storage backend before wrapping/allocating.
function unsafe_wrap_or_alloc(::TrixiAdaptor{Storage}, vector, dims) where {Storage}
    return unsafe_wrap_or_alloc(Storage, vector, dims)
end
end # @muladd
31 changes: 31 additions & 0 deletions src/auxiliary/vector_of_arrays.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
# Since these FMAs can increase the performance of many numerical algorithms,
# we need to opt-in explicitly.
# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
@muladd begin
#! format: noindent

# Wrapper around a `Vector` of arrays that forwards common `Base` queries
# to the wrapped vector. In contrast to a plain `Vector` of arrays, it
# implements `Adapt.adapt_structure`, which makes offloading of the
# contained arrays to the GPU possible.
struct VecOfArrays{T <: AbstractArray}
    arrays::Vector{T}
end

# Forward element access and shape queries to the underlying `Vector`.
Base.getindex(v::VecOfArrays, i::Int) = v.arrays[i]
Base.IndexStyle(v::VecOfArrays) = Base.IndexStyle(v.arrays)
Base.size(v::VecOfArrays) = Base.size(v.arrays)
Base.length(v::VecOfArrays) = Base.length(v.arrays)
Base.eltype(::VecOfArrays{T}) where {T} = T

# Adapt every contained array individually, e.g. `Array -> CuArray`.
function Adapt.adapt_structure(to, v::VecOfArrays)
    return VecOfArrays(map(a -> Adapt.adapt(to, a), v.arrays))
end

# The "parent" storage of a `VecOfArrays` is the contained array type.
Adapt.parent_type(::Type{<:VecOfArrays{T}}) where {T} = T
Adapt.unwrap_type(A::Type{<:VecOfArrays}) = Adapt.unwrap_type(Adapt.parent_type(A))

# Allow implicit construction from a plain `Vector` of arrays.
Base.convert(::Type{<:VecOfArrays}, v::Vector{<:AbstractArray}) = VecOfArrays(v)
end # @muladd
27 changes: 7 additions & 20 deletions src/semidiscretization/semidiscretization_hyperbolic.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,25 +27,6 @@ mutable struct SemidiscretizationHyperbolic{Mesh, Equations, InitialCondition,
solver::Solver
cache::Cache
performance_counter::PerformanceCounter

function SemidiscretizationHyperbolic{Mesh, Equations, InitialCondition,
BoundaryConditions, SourceTerms, Solver,
Cache}(mesh::Mesh, equations::Equations,
initial_condition::InitialCondition,
boundary_conditions::BoundaryConditions,
source_terms::SourceTerms,
solver::Solver,
cache::Cache) where {Mesh, Equations,
InitialCondition,
BoundaryConditions,
SourceTerms,
Solver,
Cache}
performance_counter = PerformanceCounter()

new(mesh, equations, initial_condition, boundary_conditions, source_terms,
solver, cache, performance_counter)
end
end

"""
Expand Down Expand Up @@ -74,16 +55,22 @@ function SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver

check_periodicity_mesh_boundary_conditions(mesh, _boundary_conditions)

performance_counter = PerformanceCounter()

SemidiscretizationHyperbolic{typeof(mesh), typeof(equations),
typeof(initial_condition),
typeof(_boundary_conditions), typeof(source_terms),
typeof(solver), typeof(cache)}(mesh, equations,
initial_condition,
_boundary_conditions,
source_terms, solver,
cache)
cache,
performance_counter)
end

# @eval due to @muladd
@eval Adapt.@adapt_structure(SemidiscretizationHyperbolic)

# Create a new semidiscretization but change some parameters compared to the input.
# `Base.similar` follows a related concept but would require us to `copy` the `mesh`,
# which would impact the performance. Instead, `SciMLBase.remake` has exactly the
Expand Down
3 changes: 3 additions & 0 deletions src/solvers/dg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,9 @@ struct DG{Basis, Mortar, SurfaceIntegral, VolumeIntegral}
volume_integral::VolumeIntegral
end

# @eval due to @muladd
@eval Adapt.@adapt_structure(DG)

function Base.show(io::IO, dg::DG)
@nospecialize dg # reduce precompilation time

Expand Down
Loading
Loading