Make GPUs actually usable #689

Open · wants to merge 29 commits into base: main

Commits (29)
2b4bf0f
Make FSI examples work on GPUs and with FP32
efaulhaber Dec 23, 2024
0d99560
Add macro `trixi_include_changeprecision`
efaulhaber Dec 23, 2024
e75b9b9
Make plotting work on GPUs
efaulhaber Dec 23, 2024
859221e
Update Project.toml
efaulhaber Dec 23, 2024
124f3dd
Update docs
efaulhaber Dec 23, 2024
55c4a8d
Continue docs
efaulhaber Dec 24, 2024
69cf49e
Add test for `trixi_include_changeprecision`
efaulhaber Dec 24, 2024
3e84b8b
Fix doctests
efaulhaber Dec 24, 2024
6500538
Move `trixi_include_changeprecision` to TrixiBase.jl
efaulhaber Jan 28, 2025
8bfe69d
Require new release of PointNeighbors.jl
efaulhaber Jan 29, 2025
2b3fb4c
Merge branch 'main' into ef/more-gpu-support
efaulhaber Jan 29, 2025
659316e
Remove `using ChangePrecision`
efaulhaber Jan 29, 2025
e1e12e8
Reformat
efaulhaber Jan 29, 2025
8666527
Merge branch 'main' into ef/more-gpu-support
efaulhaber Jan 30, 2025
ede9f1a
Change wording
efaulhaber Jan 30, 2025
02baf71
Fix docs
efaulhaber Jan 30, 2025
b94dc73
Merge branch 'ef/more-gpu-support' of github.com:trixi-framework/Trix…
efaulhaber Jan 30, 2025
a5594f9
Update note about GPUCompiler.jl
efaulhaber Jan 30, 2025
c976623
Merge branch 'main' into ef/more-gpu-support
svchb Jan 30, 2025
9bfd0c8
Fix tests
efaulhaber Jan 31, 2025
57cdfdf
Merge branch 'ef/more-gpu-support' of github.com:trixi-framework/Trix…
efaulhaber Feb 10, 2025
a2a22df
Merge branch 'main' into ef/more-gpu-support
efaulhaber Feb 10, 2025
e6922e1
Try to use Metal in macOS CI
efaulhaber Feb 20, 2025
a5ca78d
Implement suggestions
efaulhaber Feb 20, 2025
d35f7ba
Merge branch 'main' into ef/more-gpu-support
efaulhaber Feb 20, 2025
d0353c8
Try more Metal stuff
efaulhaber Feb 20, 2025
f22d3f3
Revert "Try more Metal stuff"
efaulhaber Feb 20, 2025
c7abe36
Revert "Try to use Metal in macOS CI"
efaulhaber Feb 20, 2025
2a07ce2
Add missing example file that I forgot to push
efaulhaber Feb 21, 2025
4 changes: 2 additions & 2 deletions Project.toml
@@ -44,14 +44,14 @@ GPUArraysCore = "0.2"
JSON = "0.21"
KernelAbstractions = "0.9"
MuladdMacro = "0.2"
-PointNeighbors = "0.4.7"
+PointNeighbors = "0.4.8"
Polyester = "0.7.10"
RecipesBase = "1"
Reexport = "1"
SciMLBase = "2"
StaticArrays = "1"
StrideArrays = "0.1"
TimerOutputs = "0.5.25"
TrixiBase = "0.1.3"
TrixiBase = "0.1.5"
WriteVTK = "1"
julia = "1.10"
2 changes: 1 addition & 1 deletion docs/make.jl
@@ -111,7 +111,7 @@ bib = CitationBibliography(joinpath(@__DIR__, "src", "refs.bib"))
makedocs(sitename="TrixiParticles.jl",
plugins=[bib],
# Run doctests and check docs for the following modules
-modules=[TrixiParticles],
+modules=[TrixiParticles, TrixiBase],
format=Documenter.HTML(; assets=Asciicast.assets()),
# Explicitly specify documentation structure
pages=[
81 changes: 67 additions & 14 deletions docs/src/gpu.md
@@ -1,24 +1,25 @@
-# GPU Support
+# [GPU Support](@id gpu_support)

GPU support is still an experimental feature that is actively being worked on.
-As of now, the [`WeaklyCompressibleSPHSystem`](@ref) and the [`BoundarySPHSystem`](@ref)
-are supported on GPUs.
-We have tested this on GPUs by Nvidia and AMD.
+Currently, the [`WeaklyCompressibleSPHSystem`](@ref), [`TotalLagrangianSPHSystem`](@ref)
+and [`BoundarySPHSystem`](@ref) support GPU execution.
+We have tested GPU support on Nvidia, AMD and Apple GPUs.
+Note that most Apple GPUs do not support `Float64`.
+See [below on how to run single precision simulations](@ref single_precision).

-To run a simulation on a GPU, we need to use the [`FullGridCellList`](@ref)
+To run a simulation on a GPU, use the [`FullGridCellList`](@ref)
as cell list for the [`GridNeighborhoodSearch`](@ref).
-This cell list requires a bounding box for the domain, unlike the default cell list, which
-uses an unbounded domain.
-For simulations that are bounded by a closed tank, we can use the boundary of the tank
-to obtain the bounding box as follows.
+Unlike the default cell list, which assumes an unbounded domain,
+this cell list requires a bounding box for the domain.
+For simulations that are bounded by a closed tank, we can simply use the boundary
+of the tank to obtain the bounding box as follows.
```jldoctest gpu; output=false, setup=:(using TrixiParticles; trixi_include(@__MODULE__, joinpath(examples_dir(), "fluid", "hydrostatic_water_column_2d.jl"), sol=nothing))
search_radius = TrixiParticles.compact_support(smoothing_kernel, smoothing_length)
-min_corner = minimum(tank.boundary.coordinates, dims=2) .- search_radius
-max_corner = maximum(tank.boundary.coordinates, dims=2) .+ search_radius
+min_corner = minimum(tank.boundary.coordinates, dims=2)
+max_corner = maximum(tank.boundary.coordinates, dims=2)
cell_list = TrixiParticles.PointNeighbors.FullGridCellList(; min_corner, max_corner)

# output
-PointNeighbors.FullGridCellList{PointNeighbors.DynamicVectorOfVectors{Int32, Matrix{Int32}, Vector{Int32}, Base.RefValue{Int32}}, Nothing, SVector{2, Float64}, SVector{2, Float64}}(Vector{Int32}[], nothing, [-0.24500000000000002, -0.24500000000000002], [1.245, 1.245])
+PointNeighbors.FullGridCellList{PointNeighbors.DynamicVectorOfVectors{Int32, Matrix{Int32}, Vector{Int32}, Base.RefValue{Int32}}, Nothing, SVector{2, Float64}, SVector{2, Float64}}(Vector{Int32}[], nothing, [-0.12500000000000003, -0.12500000000000003], [1.125, 1.125])
```

We then need to pass this cell list to the neighborhood search and the neighborhood search
@@ -55,7 +56,59 @@ On an AMD GPU, we use:
using AMDGPU
ode = semidiscretize(semi, tspan, data_type=ROCArray)
```
-Then, we can run the simulation as usual.
+Now, we can run the simulation as usual.
+All data is transferred to the GPU during initialization and all loops over particles
+and their neighbors will be executed on the GPU as kernels generated by KernelAbstractions.jl.
+Data is only copied to the CPU for saving VTK files via the [`SolutionSavingCallback`](@ref).
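
For orientation, the pieces above combine into a minimal end-to-end sketch like the following (assuming an Nvidia GPU, and assuming that `fluid_system`, `boundary_system`, `tspan` and `callbacks` are defined as in the example files; the time integrator is only an illustrative choice):
```julia
using TrixiParticles, CUDA
using OrdinaryDiffEq

# GPU-compatible neighborhood search with a bounded cell list
min_corner = minimum(tank.boundary.coordinates, dims=2)
max_corner = maximum(tank.boundary.coordinates, dims=2)
cell_list = FullGridCellList(; min_corner, max_corner)
neighborhood_search = GridNeighborhoodSearch{2}(; cell_list)

semi = Semidiscretization(fluid_system, boundary_system,
                          neighborhood_search=neighborhood_search)

# Transfer all data to the GPU by choosing a GPU array type
ode = semidiscretize(semi, tspan, data_type=CuArray)

# Particle loops now run as GPU kernels; data is copied back
# to the CPU only when the callback saves output
sol = solve(ode, RDPK3SpFSAL35(), save_everystep=false, callback=callbacks)
```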

## Run an existing example file on the GPU

The example file `examples/fluid/dam_break_2d_gpu.jl` demonstrates how to run an existing
example file on a GPU.
It first loads the variables from `examples/fluid/dam_break_2d.jl` without executing
the simulation. This is achieved by overwriting the line that starts the simulation
with `trixi_include(..., sol=nothing)`.
Next, a GPU-compatible neighborhood search is defined, and the original example file
is included with the new neighborhood search.
This requires the assignments `neighborhood_search = ...` and `data_type = ...`
to be present in the original example file.
Note that in `examples/fluid/dam_break_2d.jl`, we specifically set `data_type=nothing`, even though
this is the default value, so that we can use `trixi_include` to replace this value.

To run this simulation on a GPU, simply update `data_type` to match the
array type of the installed GPU.
We can run this simulation on an Nvidia GPU as follows.
```julia
using CUDA
trixi_include(joinpath(examples_dir(), "fluid", "dam_break_2d_gpu.jl"), data_type=CuArray)
```
For AMD GPUs, use
```julia
using AMDGPU
trixi_include(joinpath(examples_dir(), "fluid", "dam_break_2d_gpu.jl"), data_type=ROCArray)
```
For Apple GPUs, use
```julia
using Metal
trixi_include(joinpath(examples_dir(), "fluid", "dam_break_2d_gpu.jl"), data_type=MtlArray)
```

## [Single precision simulations](@id single_precision)

All GPU-supported features can also be used with single precision,
which is significantly faster on most GPUs and required for many Apple GPUs.

To run a simulation with single precision, all `Float64` literals in an example file
must be converted to `Float32` (e.g. `0.0` to `0.0f0`).
TrixiParticles provides a function to automate this conversion:
```@docs
trixi_include_changeprecision
```

To run the previous example with single precision, use the following:
```julia
using CUDA
trixi_include_changeprecision(Float32,
joinpath(examples_dir(), "fluid", "dam_break_2d_gpu.jl"),
data_type=CuArray)
```
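
On Apple GPUs, which generally lack `Float64` support, the same two ingredients combine as follows:
```julia
using Metal
trixi_include_changeprecision(Float32,
                              joinpath(examples_dir(), "fluid", "dam_break_2d_gpu.jl"),
                              data_type=MtlArray)
```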
26 changes: 26 additions & 0 deletions examples/fluid/dam_break_2d_gpu.jl
@@ -0,0 +1,26 @@
# This example file demonstrates how to run an existing example file on a GPU.
# We simply define a GPU-compatible neighborhood search and `trixi_include` the example
# file with this neighborhood search.
# To run this example on a GPU, `data_type` needs to be changed to the array type of the
# installed GPU. See the docs on GPU support for more information.

using TrixiParticles

# Load setup from dam break example
trixi_include(@__MODULE__,
joinpath(examples_dir(), "fluid", "dam_break_2d.jl"),
sol=nothing)

# Define a GPU-compatible neighborhood search
min_corner = minimum(tank.boundary.coordinates, dims=2)
max_corner = maximum(tank.boundary.coordinates, dims=2)
cell_list = FullGridCellList(; min_corner, max_corner)
neighborhood_search = GridNeighborhoodSearch{2}(; cell_list)

# Run the dam break simulation with this neighborhood search
trixi_include(@__MODULE__,
joinpath(examples_dir(), "fluid", "dam_break_2d.jl"),
neighborhood_search=neighborhood_search,
fluid_particle_spacing=H / 40,
tspan=(0.0, 5.7 / sqrt(9.81)),
data_type=nothing)
10 changes: 6 additions & 4 deletions src/TrixiParticles.jl
@@ -27,11 +27,13 @@ using SciMLBase: CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!
using StaticArrays: @SMatrix, SMatrix, setindex
using StrideArrays: PtrArray, StaticInt
using TimerOutputs: TimerOutput, TimerOutputs, print_timer, reset_timer!
-using TrixiBase: trixi_include, @trixi_timeit, timer, timeit_debug_enabled,
-                 disable_debug_timings, enable_debug_timings
+using TrixiBase: @trixi_timeit, timer, timeit_debug_enabled,
+                 disable_debug_timings, enable_debug_timings, TrixiBase
+@reexport using TrixiBase: trixi_include, trixi_include_changeprecision
@reexport using PointNeighbors: TrivialNeighborhoodSearch, GridNeighborhoodSearch,
PrecomputedNeighborhoodSearch, PeriodicBox,
-                                ParallelUpdate, SemiParallelUpdate, SerialUpdate
+                                ParallelUpdate, SemiParallelUpdate, SerialUpdate,
+                                FullGridCellList, DictionaryCellList
using PointNeighbors: PointNeighbors, foreach_point_neighbor, copy_neighborhood_search,
@threaded
using WriteVTK: vtk_grid, MeshCell, VTKCellTypes, paraview_collection, vtk_save
@@ -74,7 +76,7 @@ export BoundaryModelMonaghanKajtar, BoundaryModelDummyParticles, AdamiPressureEx
BernoulliPressureExtrapolation

export BoundaryMovement
-export examples_dir, validation_dir, trixi_include
+export examples_dir, validation_dir
export trixi2vtk
export RectangularTank, RectangularShape, SphereShape, ComplexShape
export ParticlePackingSystem, SignedDistanceField
6 changes: 3 additions & 3 deletions src/callbacks/solution_saving.jl
@@ -67,7 +67,7 @@ saving_callback = SolutionSavingCallback(dt=0.1, my_custom_quantity=kinetic_ener
"""
mutable struct SolutionSavingCallback{I, CQ}
interval :: I
-save_times :: Array{Float64, 1}
+save_times :: Vector{Float64}
save_initial_solution :: Bool
save_final_solution :: Bool
write_meta_data :: Bool
@@ -81,7 +81,7 @@ mutable struct SolutionSavingCallback{I, CQ}
end

function SolutionSavingCallback(; interval::Integer=0, dt=0.0,
-save_times=Array{Float64, 1}([]),
+save_times=Float64[],
save_initial_solution=true, save_final_solution=true,
output_directory="out", append_timestamp=false,
prefix="", verbose=false, write_meta_data=true,
@@ -99,7 +99,7 @@ function SolutionSavingCallback(; interval::Integer=0, dt=0.0,
output_directory *= string("_", Dates.format(now(), "YY-mm-ddTHHMMSS"))
end

-solution_callback = SolutionSavingCallback(interval, save_times,
+solution_callback = SolutionSavingCallback(interval, Float64.(save_times),
save_initial_solution, save_final_solution,
write_meta_data, verbose, output_directory,
prefix, max_coordinates, custom_quantities,
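
A side note on the `Float64.(save_times)` conversion: since broadcasting materializes any numeric collection into a `Vector{Float64}`, the keyword now also accepts integer vectors and ranges. A sketch with hypothetical values:
```julia
# Both now yield save_times::Vector{Float64} internally
cb1 = SolutionSavingCallback(save_times=[0, 1, 2])    # integer vector is converted
cb2 = SolutionSavingCallback(save_times=0.0:0.5:2.0)  # range is materialized
```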
2 changes: 1 addition & 1 deletion src/general/corrections.jl
@@ -398,7 +398,7 @@ function correction_matrix_inversion_step!(corr_matrix, system)
# so `L` is singular if and only if the position vectors X_ab don't span the
# full space, i.e., particle a and all neighbors lie on the same line (in 2D)
# or plane (in 3D).
-if abs(det(L)) < 1e-9
+if abs(det(L)) < 1.0f-9
L_inv = I
else
L_inv = inv(L)
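
The reasoning behind the `1.0f-9` literal, sketched outside the diff: a `Float64` literal forces `Float64` arithmetic inside single-precision GPU kernels (which Metal, for example, cannot execute), while a `Float32` literal adapts to the element type:
```julia
det32 = 1.0f-3     # a Float32 determinant, as in a single-precision run
det32 < 1e-9       # comparison promotes to Float64 — not possible on Metal
det32 < 1.0f-9     # stays entirely in Float32
1.0e-3 < 1.0f-9    # with Float64 values the literal promotes instead,
                   # so double-precision behavior is unchanged
```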
26 changes: 13 additions & 13 deletions src/schemes/boundary/dummy_particles/dummy_particles.jl
@@ -84,40 +84,40 @@ function BoundaryModelDummyParticles(initial_density, hydrodynamic_mass,
end

@doc raw"""
-AdamiPressureExtrapolation(; pressure_offset=0.0)
+AdamiPressureExtrapolation(; pressure_offset=0)

`density_calculator` for `BoundaryModelDummyParticles`.

# Keywords
-- `pressure_offset=0.0`: Sometimes it is necessary to artificially increase the boundary pressure
-  to prevent penetration, which is possible by increasing this value.
+- `pressure_offset=0`: Sometimes it is necessary to artificially increase the boundary pressure
+  to prevent penetration, which is possible by increasing this value.

"""
struct AdamiPressureExtrapolation{ELTYPE}
pressure_offset::ELTYPE

-function AdamiPressureExtrapolation(; pressure_offset=0.0)
+function AdamiPressureExtrapolation(; pressure_offset=0)
return new{eltype(pressure_offset)}(pressure_offset)
end
end
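
Because `ELTYPE` is derived from the offset via `eltype`, the integer default stays type-neutral under promotion; a precision can still be pinned explicitly (hypothetical values):
```julia
AdamiPressureExtrapolation()                        # ELTYPE == Int
AdamiPressureExtrapolation(pressure_offset=10.0f0)  # ELTYPE == Float32
```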

@doc raw"""
-BernoulliPressureExtrapolation(; pressure_offset=0.0, factor=1.0)
+BernoulliPressureExtrapolation(; pressure_offset=0, factor=1)

`density_calculator` for `BoundaryModelDummyParticles`.

# Keywords
-- `pressure_offset=0.0`: Sometimes it is necessary to artificially increase the boundary pressure
+- `pressure_offset=0`: Sometimes it is necessary to artificially increase the boundary pressure
to prevent penetration, which is possible by increasing this value.
-- `factor=1.0` : Setting `factor` allows to just increase the strength of the dynamic
+- `factor=1` : Setting `factor` allows to just increase the strength of the dynamic
pressure part.

"""
struct BernoulliPressureExtrapolation{ELTYPE}
pressure_offset :: ELTYPE
factor :: ELTYPE

-function BernoulliPressureExtrapolation(; pressure_offset=0.0, factor=0.0)
+function BernoulliPressureExtrapolation(; pressure_offset=0, factor=1)
return new{eltype(pressure_offset)}(pressure_offset, factor)
end
end
@@ -345,7 +345,7 @@ end
# Otherwise, `@threaded` does not work here with Julia ARM on macOS.
# See https://github.com/JuliaSIMD/Polyester.jl/issues/88.
@inline function apply_state_equation!(boundary_model, density, particle)
-boundary_model.pressure[particle] = max(boundary_model.state_equation(density), 0.0)
+boundary_model.pressure[particle] = max(boundary_model.state_equation(density), 0)
end

function compute_pressure!(boundary_model,
@@ -374,7 +374,7 @@ function compute_pressure!(boundary_model,
# Note: The version iterating neighbors first is not thread parallelizable.
# The factor is based on the achievable speed-up of the thread parallelizable version.
if nparticles(system) >
-ceil(Int, 0.5 * Threads.nthreads()) * nparticles(neighbor_system)
+ceil(Int, Threads.nthreads() / 2) * nparticles(neighbor_system)
nhs = get_neighborhood_search(neighbor_system, system, semi)

# Loop over fluid particles and then the neighboring boundary particles to extrapolate fluid pressure to the boundaries
@@ -395,7 +395,7 @@ function compute_pressure!(boundary_model,
@threaded system for particle in eachparticle(system)
# Limit pressure to be non-negative to avoid attractive forces between fluid and
# boundary particles at free surfaces (sticking artifacts).
-pressure[particle] = max(pressure[particle], 0.0)
+pressure[particle] = max(pressure[particle], 0)
end
end

@@ -529,8 +529,8 @@ end
current_velocity(v_neighbor_system, neighbor_system, neighbor)
normal_velocity = dot(relative_velocity, pos_diff)

-return 0.5 * boundary_density_calculator.factor * density_neighbor *
-       normal_velocity^2 / distance
+return boundary_density_calculator.factor * density_neighbor *
+       normal_velocity^2 / distance / 2
end
return zero(density_neighbor)
end
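
The recurring pattern in this file, sketched outside the diff: `Float64` literals such as `0.0` and `0.5` silently promote `Float32` values to `Float64`, while integer literals and integer divisors preserve the working precision:
```julia
x = 0.5f0            # a Float32 value, as in a single-precision simulation
typeof(max(x, 0.0))  # Float64 — the literal forces promotion
typeof(max(x, 0))    # Float32 — integer literals adapt to the float type
typeof(0.5 * x)      # Float64
typeof(x / 2)        # Float32
```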
2 changes: 1 addition & 1 deletion src/schemes/solid/total_lagrangian_sph/penalty_force.jl
@@ -35,7 +35,7 @@ end
eps_sum = (J_a + J_b) * initial_pos_diff - 2 * current_pos_diff
delta_sum = dot(eps_sum, current_pos_diff) / current_distance

-f = 0.5 * penalty_force.alpha * volume_particle * volume_neighbor *
+f = (penalty_force.alpha / 2) * volume_particle * volume_neighbor *
kernel_weight / initial_distance^2 * young_modulus * delta_sum *
current_pos_diff / current_distance

37 changes: 9 additions & 28 deletions src/schemes/solid/total_lagrangian_sph/system.jl
@@ -111,7 +111,7 @@ function TotalLagrangianSPHSystem(initial_condition,

lame_lambda = young_modulus * poisson_ratio /
((1 + poisson_ratio) * (1 - 2 * poisson_ratio))
-lame_mu = 0.5 * young_modulus / (1 + poisson_ratio)
+lame_mu = (young_modulus / 2) / (1 + poisson_ratio)

return TotalLagrangianSPHSystem(initial_condition, initial_coordinates,
current_coordinates, mass, correction_matrix,
@@ -233,7 +233,7 @@ end
function update_positions!(system::TotalLagrangianSPHSystem, v, u, v_ode, u_ode, semi, t)
(; current_coordinates) = system

-for particle in each_moving_particle(system)
+@threaded system for particle in each_moving_particle(system)
for i in 1:ndims(system)
current_coordinates[i, particle] = u[i, particle]
end
@@ -318,7 +318,7 @@ end
(; lame_lambda, lame_mu) = system

# Compute the Green-Lagrange strain
-E = 0.5 * (transpose(F) * F - I)
+E = (transpose(F) * F - I) / 2

return lame_lambda * tr(E) * I + 2 * lame_mu * E
end
@@ -332,25 +332,19 @@ end
function write_u0!(u0, system::TotalLagrangianSPHSystem)
(; initial_condition) = system

-for particle in each_moving_particle(system)
-    # Write particle coordinates
-    for dim in 1:ndims(system)
-        u0[dim, particle] = initial_condition.coordinates[dim, particle]
-    end
-end
+# This is as fast as a loop with `@inbounds`, but it's GPU-compatible
+indices = CartesianIndices((ndims(system), each_moving_particle(system)))
+copyto!(u0, indices, initial_condition.coordinates, indices)

return u0
end
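
To illustrate why the `copyto!`/`CartesianIndices` idiom is GPU-compatible (a sketch assuming CUDA.jl, with arbitrary sizes): GPU arrays disallow scalar indexing, but this form dispatches to a single bulk copy:
```julia
using CUDA

coords = CUDA.rand(2, 100)  # e.g. particle coordinates (2D, 100 particles)
u0 = CUDA.zeros(2, 100)

# Copy only the first 80 columns (e.g. the moving particles) in one bulk operation
indices = CartesianIndices((2, 1:80))
copyto!(u0, indices, coords, indices)  # no scalar indexing, hence no GPU error
```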

function write_v0!(v0, system::TotalLagrangianSPHSystem)
(; initial_condition, boundary_model) = system

-for particle in each_moving_particle(system)
-    # Write particle velocities
-    for dim in 1:ndims(system)
-        v0[dim, particle] = initial_condition.velocity[dim, particle]
-    end
-end
+# This is as fast as a loop with `@inbounds`, but it's GPU-compatible
+indices = CartesianIndices((ndims(system), each_moving_particle(system)))
+copyto!(v0, indices, initial_condition.velocity, indices)

write_v0!(v0, boundary_model, system)

@@ -361,19 +355,6 @@
return v0
end

-function write_v0!(v0, ::BoundaryModelDummyParticles{ContinuityDensity},
-                   system::TotalLagrangianSPHSystem)
-    (; cache) = system.boundary_model
-    (; initial_density) = cache
-
-    for particle in each_moving_particle(system)
-        # Set particle densities
-        v0[ndims(system) + 1, particle] = initial_density[particle]
-    end
-
-    return v0
-end

function restart_with!(system::TotalLagrangianSPHSystem, v, u)
for particle in each_moving_particle(system)
system.current_coordinates[:, particle] .= u[:, particle]