Bugfix in distributed fill halos #3714

Open: simone-silvestri wants to merge 36 commits into main from ss/branch-for-distributed-omip.

Changes from all commits (36 commits):
- e2b39b4  bugfixxed (simone-silvestri, Aug 16, 2024)
- 3fe8565  some more changes (simone-silvestri, Aug 16, 2024)
- ddc0836  comment (simone-silvestri, Aug 16, 2024)
- 976955a  remove helper shows (simone-silvestri, Aug 16, 2024)
- 4934ee3  comment (simone-silvestri, Aug 16, 2024)
- 45f14b5  another quick bugfix (simone-silvestri, Aug 16, 2024)
- 43571d5  quick change (simone-silvestri, Aug 16, 2024)
- ae9634f  some fixes (simone-silvestri, Aug 16, 2024)
- 27f94e0  add child_arch in another PR (simone-silvestri, Aug 16, 2024)
- 99162c9  improve the explanation (simone-silvestri, Aug 20, 2024)
- c32a2b5  chnage name to field (simone-silvestri, Aug 20, 2024)
- 7dbafb2  chnage back (simone-silvestri, Aug 20, 2024)
- 723ee08  removing all the shows (simone-silvestri, Aug 20, 2024)
- 89130d0  remove the deadlock (simone-silvestri, Aug 23, 2024)
- 1074201  this works!! (simone-silvestri, Aug 28, 2024)
- e355c94  Merge remote-tracking branch 'origin/main' into ss/branch-for-distrib… (simone-silvestri, Sep 16, 2024)
- 92091f1  merge with main (simone-silvestri, Sep 16, 2024)
- 9a51271  Merge remote-tracking branch 'origin/ss/branch-for-distributed-omip' … (simone-silvestri, Sep 16, 2024)
- c901b51  Merge branch 'main' into ss/branch-for-distributed-omip (simone-silvestri, Sep 19, 2024)
- 7e74246  adapt to new syntax (simone-silvestri, Sep 19, 2024)
- f089f6c  a small bug (simone-silvestri, Sep 19, 2024)
- 69cb4cd  Merge branch 'ss/branch-for-distributed-omip' of github.com:CliMA/Oce… (simone-silvestri, Sep 19, 2024)
- 84ded08  bugfix (simone-silvestri, Sep 19, 2024)
- 0adddc6  Merge remote-tracking branch 'origin/main' into ss/branch-for-distrib… (simone-silvestri, Oct 11, 2024)
- f5a7f80  Merge remote-tracking branch 'origin/main' into ss/branch-for-distrib… (simone-silvestri, Nov 12, 2024)
- d22284c  change active cells (simone-silvestri, Nov 12, 2024)
- 8406b39  Merge branch 'main' into ss/branch-for-distributed-omip (simone-silvestri, Nov 15, 2024)
- 38638b5  Merge branch 'main' into ss/branch-for-distributed-omip (simone-silvestri, Dec 12, 2024)
- 3049445  Merge branch 'main' into ss/branch-for-distributed-omip (simone-silvestri, Feb 9, 2025)
- 8a9e17e  Merge branch 'main' into ss/branch-for-distributed-omip (simone-silvestri, Feb 10, 2025)
- 48991ab  add a Nothing free surface (simone-silvestri, Feb 10, 2025)
- c67d697  better (simone-silvestri, Feb 10, 2025)
- e02a6ee  Merge remote-tracking branch 'origin/main' into ss/branch-for-distrib… (simone-silvestri, Feb 17, 2025)
- 39168da  Merge branch 'main' into ss/branch-for-distributed-omip (simone-silvestri, Feb 17, 2025)
- a8c4a1f  Merge branch 'main' into ss/branch-for-distributed-omip (glwagner, Feb 21, 2025)
- c76a7ae  Merge branch 'main' into ss/branch-for-distributed-omip (simone-silvestri, Feb 24, 2025)
src/DistributedComputations/distributed_architectures.jl (2 changes: 1 addition & 1 deletion)

@@ -354,7 +354,7 @@ end
  ##### Rank connectivity graph
  #####

- struct RankConnectivity{E, W, N, S, SW, SE, NW, NE}
+ mutable struct RankConnectivity{E, W, N, S, SW, SE, NW, NE}
      east :: E
      west :: W
      north :: N
src/DistributedComputations/distributed_fields.jl (2 changes: 1 addition & 1 deletion)

@@ -64,7 +64,7 @@ function synchronize_communication!(field)
      cooperative_waitall!(arch.mpi_requests)

      # Reset MPI tag
-     arch.mpi_tag[] -= arch.mpi_tag[]
+     arch.mpi_tag[] = 0

      # Reset MPI requests
      empty!(arch.mpi_requests)
@@ -140,7 +140,7 @@ function map_interior_active_cells(ibg::ImmersedBoundaryGrid{<:Any, <:Any, <:Any
      east_halo_dependent_cells  = ifelse(include_east,  east_halo_dependent_cells,  nothing)
      south_halo_dependent_cells = ifelse(include_south, south_halo_dependent_cells, nothing)
      north_halo_dependent_cells = ifelse(include_north, north_halo_dependent_cells, nothing)

      nx = Rx == 1 ? Nx : (Tx == RightConnected || Tx == LeftConnected ? Nx - Hx : Nx - 2Hx)
      ny = Ry == 1 ? Ny : (Ty == RightConnected || Ty == LeftConnected ? Ny - Hy : Ny - 2Hy)
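
A quick worked example of the nx logic above (with assumed values, not taken from the diff): with Nx = 64, Hx = 3, and Rx > 1, an interior rank connected on both sides gets nx = 64 - 2*3 = 58; a boundary rank with Tx == LeftConnected or Tx == RightConnected gets nx = 64 - 3 = 61; and with Rx == 1 there is no partitioning in x, so nx = 64. The ny logic is analogous.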
src/DistributedComputations/halo_communication.jl (55 changes: 34 additions & 21 deletions)

@@ -41,33 +41,43 @@ opposite_side = Dict(
      :northeast => :southwest,
  )

- # Define functions that return unique send and recv MPI tags for each side.
- # It's an integer where
- #   digit 1-2: an identifier for the field that is reset each timestep
- #   digit 3: an identifier for the field's Z-location
- #   digit 4: the side we send to/recieve from
  ID_DIGITS = 2

- @inline loc_id(::Face)    = 0
- @inline loc_id(::Center)  = 1
- @inline loc_id(::Nothing) = 2
- @inline loc_id(LX, LY, LZ) = loc_id(LZ)
+ # A hashing function which returns a unique integer between 0 and 26
+ # for a combination of three locations, each either Face, Center, or Nothing.
+ location_counter = 0
+ for LX in (:Face, :Center, :Nothing)
+     for LY in (:Face, :Center, :Nothing)
+         for LZ in (:Face, :Center, :Nothing)
+             @eval loc_id(::$LX, ::$LY, ::$LZ) = $location_counter
+             global location_counter += 1
+         end
+     end
+ end
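
A minimal sketch of the methods this loop generates (assuming the loop order above, with LZ varying fastest; these lines are illustrative and not part of the diff):

    loc_id(Face(), Face(), Face())        # 0
    loc_id(Face(), Face(), Center())      # 1
    loc_id(Face(), Face(), nothing)       # 2
    loc_id(Center(), Center(), Center())  # 13, i.e. 1*9 + 1*3 + 1
    loc_id(nothing, nothing, nothing)     # 26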

+ # Functions that return unique send and recv MPI tags for each side and field location,
+ # taking into account the possibility of asynchronous communication.
+ # The MPI tag is an integer whose digits encode:
+ #   digits 1-2: a counter of how many communications are currently live, stored in `arch.mpi_tag`
+ #   digits 3-4: a unique identifier for the field's location, between 0 and 26 (see `loc_id`)
+ #   digit  5:   the side we send to / receive from

  for side in sides
      side_str = string(side)
      send_tag_fn_name = Symbol("$(side)_send_tag")
      recv_tag_fn_name = Symbol("$(side)_recv_tag")
      @eval begin
-         function $send_tag_fn_name(arch, location)
+         function $send_tag_fn_name(arch, grid, location)
              field_id   = string(arch.mpi_tag[], pad=ID_DIGITS)
-             loc_digit  = string(loc_id(location...))
+             loc_digit  = string(loc_id(location...), pad=ID_DIGITS)
              side_digit = string(side_id[Symbol($side_str)])
              return parse(Int, field_id * loc_digit * side_digit)
          end

-         function $recv_tag_fn_name(arch, location)
+         function $recv_tag_fn_name(arch, grid, location)
              field_id   = string(arch.mpi_tag[], pad=ID_DIGITS)
-             loc_digit  = string(loc_id(location...))
+             loc_digit  = string(loc_id(location...), pad=ID_DIGITS)
              side_digit = string(side_id[opposite_side[Symbol($side_str)]])
              return parse(Int, field_id * loc_digit * side_digit)
          end
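
As a worked example of the tag layout (a sketch with assumed values; the actual `side_id` digits are defined elsewhere in this file): with `arch.mpi_tag[] == 3`, a `(Center, Center, Center)` location, and a hypothetical `side_id[:west] == 2`, the generated `west_send_tag` would compute

    field_id   = string(3, pad = 2)   # "03", the live-communication counter
    loc_digit  = string(13, pad = 2)  # "13", since loc_id(Center(), Center(), Center()) == 13
    side_digit = string(2)            # "2", the hypothetical side digit for :west
    parse(Int, field_id * loc_digit * side_digit)  # 3132

so two fields filled before synchronization never share a tag, even at the same location and side.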
@@ -110,18 +120,22 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed

      arch = architecture(grid)
      fill_halos!, bcs = permute_boundary_conditions(bcs)

-     number_of_tasks = length(fill_halos!)
+     number_of_tasks      = length(fill_halos!)
+     outstanding_requests = length(arch.mpi_requests)

      for task = 1:number_of_tasks
          fill_halo_event!(c, fill_halos![task], bcs[task], indices, loc, arch, grid, buffers, args...; kwargs...)
      end

      fill_corners!(c, arch.connectivity, indices, loc, arch, grid, buffers, args...; kwargs...)

-     # Switch to the next field to send
-     arch.mpi_tag[] += 1
+     # We increment the request counter only if we have actually initiated the MPI communication.
+     # This is the case only if at least one of the boundary conditions is a distributed communication
+     # boundary condition (DCBCT) _and_ the `only_local_halos` keyword argument is false.
+     if length(arch.mpi_requests) > outstanding_requests
+         arch.mpi_tag[] += 1
+     end

      return nothing
  end
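
Together with the `arch.mpi_tag[] = 0` reset in `synchronize_communication!` shown above, the intended tag lifecycle is roughly the following (an illustrative sketch, not part of the diff; argument lists abbreviated):

    fill_halo_regions!(u.data, u.boundary_conditions, ...)  # may initiate MPI requests; tag 0 -> 1
    fill_halo_regions!(v.data, v.boundary_conditions, ...)  # tags now differ from u's; tag 1 -> 2
    synchronize_communication!(u)  # waits on arch.mpi_requests, then resets arch.mpi_tag[] = 0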

@@ -313,10 +327,9 @@ for side in sides
      @eval begin
          function $send_side_halo(c, grid, arch, location, local_rank, rank_to_send_to, buffers)
              send_buffer = $get_side_send_buffer(c, grid, buffers, arch)
-             send_tag = $side_send_tag(arch, location)
+             send_tag = $side_send_tag(arch, grid, location)

              @debug "Sending " * $side_str * " halo: local_rank=$local_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag"
-
              send_req = MPI.Isend(send_buffer, rank_to_send_to, send_tag, arch.communicator)

              return send_req
@@ -340,7 +353,7 @@ for side in sides
      @eval begin
          function $recv_and_fill_side_halo!(c, grid, arch, location, local_rank, rank_to_recv_from, buffers)
              recv_buffer = $get_side_recv_buffer(c, grid, buffers, arch)
-             recv_tag = $side_recv_tag(arch, location)
+             recv_tag = $side_recv_tag(arch, grid, location)

              @debug "Receiving " * $side_str * " halo: local_rank=$local_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag"
              recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)
src/ImmersedBoundaries/active_cells_map.jl (2 changes: 1 addition & 1 deletion)

@@ -152,7 +152,7 @@ function findall_active_indices!(active_indices, active_cells_field, ibg, Indice
      for k in 1:size(ibg, 3)
          interior_indices = findall(on_architecture(CPU(), interior(active_cells_field, :, :, k:k)))
          interior_indices = convert_interior_indices(interior_indices, k, IndicesType)
-         active_indices = vcat(active_indices, interior_indices)
+         active_indices   = vcat(active_indices, interior_indices)
          GC.gc()
      end
@@ -233,6 +233,7 @@ end

  validate_free_surface(::Distributed, free_surface::SplitExplicitFreeSurface) = free_surface
  validate_free_surface(::Distributed, free_surface::ExplicitFreeSurface) = free_surface
+ validate_free_surface(::Distributed, free_surface::Nothing) = free_surface
  validate_free_surface(arch::Distributed, free_surface) = error("$(typeof(free_surface)) is not supported with $(typeof(arch))")
  validate_free_surface(arch, free_surface) = free_surface
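
A hedged usage sketch of what the added `Nothing` method permits (the constructor call and grid name are illustrative, not from the diff):

    # A distributed model with no free surface at all now passes validation:
    model = HydrostaticFreeSurfaceModel(; grid = distributed_grid, free_surface = nothing)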
