Updating cuda code to improve performance and to work around compiler issue #14

Merged: 1 commit, Sep 18, 2014
116 changes: 32 additions & 84 deletions models/atm/cam/src/dynamics/se/share/cuda_mod.F90
@@ -96,10 +96,6 @@ module cuda_mod
real (kind=real_kind),device,allocatable,dimension(:,:,:,:) :: dp_star_d
real (kind=real_kind),device,allocatable,dimension(:,:,:) :: qmin_d
real (kind=real_kind),device,allocatable,dimension(:,:,:) :: qmax_d
integer ,device,allocatable,dimension(:) :: send_nelem_d
integer ,device,allocatable,dimension(:) :: recv_nelem_d
integer ,device,allocatable,dimension(:,:) :: send_indices_d
integer ,device,allocatable,dimension(:,:) :: recv_indices_d
integer ,device,allocatable,dimension(:) :: recv_internal_indices_d
integer ,device,allocatable,dimension(:) :: recv_external_indices_d
real (kind=real_kind),device,allocatable,dimension(:,:,:) :: recvbuf_d
@@ -159,7 +155,7 @@ module cuda_mod

type(cudaEvent) :: timer1, timer2
integer, parameter :: gs = 2
real(kind=real_kind), device :: rrearth
real(kind=real_kind), device :: rrearth_d


contains
@@ -176,7 +172,7 @@ subroutine cuda_mod_init(elem,deriv,hvcoord)
use element_mod , only: element_t
use derivative_mod, only: derivative_t
use hybvcoord_mod, only: hvcoord_t
use physical_constants, only: rrearth_mod => rrearth
use physical_constants, only: rrearth
implicit none
type(element_t) , intent(in) :: elem(:)
type(derivative_t), intent(in) :: deriv
@@ -192,7 +188,7 @@ subroutine cuda_mod_init(elem,deriv,hvcoord)
logical,allocatable,dimension(:) :: elem_computed
integer :: total_work

rrearth = rrearth_mod
rrearth_d = rrearth

max_neigh_edges_d = max_neigh_edges
max_corner_elem_d = max_corner_elem
@@ -284,26 +280,20 @@ subroutine cuda_mod_init(elem,deriv,hvcoord)
! The PGI compiler with cuda enabled errors when allocating arrays of zero
! size - here when using only one MPI task
if (nSendCycles > 0) then
allocate( send_nelem_d ( nSendCycles ) , stat = ierr ); _CHECK(__LINE__)
allocate( send_indices_d (nelemd,nSendCycles ) , stat = ierr ); _CHECK(__LINE__)
allocate( sendbuf_h (nlev*qsize_d,mx_send_len,nSendCycles) , stat = ierr ); _CHECK(__LINE__)
allocate( send_elem_mask (nelemd,nSendCycles ) , stat = ierr ); _CHECK(__LINE__)
allocate( send_nelem ( nSendCycles ) , stat = ierr ); _CHECK(__LINE__)
allocate( send_indices (nelemd,nSendCycles ) , stat = ierr ); _CHECK(__LINE__)
allocate( send_elem_mask (nelemd,nSendCycles ) , stat = ierr ); _CHECK(__LINE__)
allocate( d2h_done (nSendCycles ) , stat = ierr ); _CHECK(__LINE__)
allocate( msg_sent (nSendCycles ) , stat = ierr ); _CHECK(__LINE__)
endif

if (nRecvCycles > 0) then
allocate( recv_nelem_d ( nRecvCycles ) , stat = ierr ); _CHECK(__LINE__)
allocate( recv_indices_d (nelemd,nRecvCycles ) , stat = ierr ); _CHECK(__LINE__)
allocate( recvbuf_h (nlev*qsize_d,mx_recv_len,nRecvCycles) , stat = ierr ); _CHECK(__LINE__)
allocate( recvbuf_d (nlev*qsize_d,mx_recv_len,nRecvCycles) , stat = ierr ); _CHECK(__LINE__)
allocate( recv_elem_mask (nelemd,nRecvCycles ) , stat = ierr ); _CHECK(__LINE__)
allocate( recv_nelem ( nRecvCycles ) , stat = ierr ); _CHECK(__LINE__)
allocate( recv_indices (nelemd,nRecvCycles ) , stat = ierr ); _CHECK(__LINE__)
allocate( recv_elem_mask (nelemd,nRecvCycles ) , stat = ierr ); _CHECK(__LINE__)
allocate( h2d_done (nRecvCycles ) , stat = ierr ); _CHECK(__LINE__)
allocate( msg_rcvd (nRecvCycles ) , stat = ierr ); _CHECK(__LINE__)
endif
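
The nSendCycles/nRecvCycles guards above follow the comment at the top of this hunk: with a single MPI task both cycle counts are zero, and the PGI CUDA Fortran compiler errors out on zero-sized allocations, so each cycle-sized buffer is only allocated when its cycle count is positive. A minimal stand-alone sketch of the same guard pattern (module and variable names here are hypothetical, not part of the diff):

    ! Sketch: skip the device allocation entirely when the cycle count is
    ! zero, so a single-task run never requests a zero-sized device array.
    module cyclebuf_sketch
      implicit none
      real(8), device, allocatable :: cyclebuf_d(:,:)
    contains
      subroutine alloc_cyclebuf(nelemd, ncycles)
        integer, intent(in) :: nelemd, ncycles
        integer :: ierr
        if (ncycles > 0) then
          allocate( cyclebuf_d(nelemd, ncycles), stat=ierr )
          if (ierr /= 0) write(*,*) 'device allocation failed:', ierr
        endif
      end subroutine alloc_cyclebuf
    end module cyclebuf_sketch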
@@ -361,15 +351,6 @@ subroutine cuda_mod_init(elem,deriv,hvcoord)
!For efficient MPI, PCI-e, packing, and unpacking, we need to separate out the cycles by dependence. Once one cycle has packed, then stage the PCI-e D2H, MPI, PCI-e H2D, & internal unpack
!We begin by testing what elements contribute to packing in what cycle's MPI data.
do ie = 1,nelemd
send_elem_mask(ie,:) = .false.
do icycle = 1 , nSendCycles
do n = 1 , max_neigh_edges
if ( elem(ie)%desc%putmapP(n) >= pSchedule%SendCycle(icycle)%ptrP .and. &
elem(ie)%desc%putmapP(n) <= pSchedule%SendCycle(icycle)%ptrP + pSchedule%SendCycle(icycle)%lengthP-1 ) then
send_elem_mask(ie,icycle) = .true.
endif
enddo
enddo
recv_elem_mask(ie,:) = .false.
do icycle = 1 , nRecvCycles
do n = 1 , max_neigh_edges
@@ -382,33 +363,6 @@ subroutine cuda_mod_init(elem,deriv,hvcoord)
enddo
edgebuf_d = 0.

elem_computed = .false. !elem_computed tells us whether an element has been touched by a cycle
!This pass accumulates for each cycle indices participating in the MPI_Isend
do icycle = 1 , nSendCycles
send_nelem(icycle) = 0
do ie = 1 , nelemd
if ( send_elem_mask(ie,icycle) ) then
send_nelem(icycle) = send_nelem(icycle) + 1
send_indices(send_nelem(icycle),icycle) = ie
elem_computed(ie) = .true.
endif
enddo
enddo
total_work = sum(send_nelem)
do ie = 1 , nelemd
if (.not. elem_computed(ie)) total_work = total_work + 1
enddo
!This pass adds to each cycle the internal elements not participating in MPI_Isend, so as to evenly distribute them across cycles.
do icycle = 1 , nSendCycles
do ie = 1 , nelemd
if ( .not. elem_computed(ie) .and. send_nelem(icycle) < int(ceiling(total_work/dble(nSendCycles))) ) then
send_nelem(icycle) = send_nelem(icycle) + 1
send_indices(send_nelem(icycle),icycle) = ie
elem_computed(ie) = .true.
endif
enddo
enddo

elem_computed = .false.
!This pass accumulates for each cycle indices participating in the MPI_Irecv
do icycle = 1 , nRecvCycles
@@ -437,30 +391,8 @@ subroutine cuda_mod_init(elem,deriv,hvcoord)
recv_internal_indices(recv_internal_nelem) = ie
endif
enddo
!This pass adds to each cycle the internal elements not participating in MPI_Irecv, so as to evenly distribute them across cycles.
do icycle = 1 , nRecvCycles
do ie = 1 , nelemd
if ( .not. elem_computed(ie) .and. recv_nelem(icycle) < int(ceiling(nelemd/dble(nRecvCycles))) ) then
recv_nelem(icycle) = recv_nelem(icycle) + 1
recv_indices(recv_nelem(icycle),icycle) = ie
elem_computed(ie) = .true.
endif
enddo
enddo

old_peu = .false.
do icycle = 1 , nSendCycles
if (send_nelem(icycle) == 0) then
write(*,*) 'WARNING: ZERO ELEMENT CYCLES EXIST. A BETTER DECOMPOSITION WILL RUN FASTER IN THE PACK-EXCHANGE-UNPACK.'
old_peu = .true.
endif
enddo

write(*,*) "Sending element & cycle informationt to device"
ierr = cudaMemcpy(send_nelem_d ,send_nelem ,size(send_nelem ),cudaMemcpyHostToDevice); _CHECK(__LINE__)
ierr = cudaMemcpy(recv_nelem_d ,recv_nelem ,size(recv_nelem ),cudaMemcpyHostToDevice); _CHECK(__LINE__)
ierr = cudaMemcpy(send_indices_d ,send_indices ,size(send_indices ),cudaMemcpyHostToDevice); _CHECK(__LINE__)
ierr = cudaMemcpy(recv_indices_d ,recv_indices ,size(recv_indices ),cudaMemcpyHostToDevice); _CHECK(__LINE__)
ierr = cudaMemcpy(recv_internal_indices_d,recv_internal_indices,size(recv_internal_indices),cudaMemcpyHostToDevice); _CHECK(__LINE__)
ierr = cudaMemcpy(recv_external_indices_d,recv_external_indices,size(recv_external_indices),cudaMemcpyHostToDevice); _CHECK(__LINE__)
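
With the send-side masks and index lists removed, the remaining host-side preprocessing is the receive-side classification: an element is "external" if any of its edge-buffer offsets fall inside some receive cycle's [ptrP, ptrP+lengthP-1] window, and "internal" otherwise, and the two index lists are then copied to the device just above. A condensed, stand-alone sketch of that split (argument names are illustrative; the mask is assumed to be built by the window test in the truncated loops above):

    ! Sketch: split elements into MPI-external and on-node-internal index
    ! lists from a per-element, per-cycle receive mask.
    subroutine classify_recv_elements(nelemd, ncycles, recv_elem_mask, &
                                      n_external, external_indices,   &
                                      n_internal, internal_indices)
      implicit none
      integer, intent(in)  :: nelemd, ncycles
      logical, intent(in)  :: recv_elem_mask(nelemd, ncycles)
      integer, intent(out) :: n_external, n_internal
      integer, intent(out) :: external_indices(nelemd), internal_indices(nelemd)
      integer :: ie
      n_external = 0
      n_internal = 0
      do ie = 1, nelemd
        if ( any(recv_elem_mask(ie,:)) ) then
          ! this element receives edge data over MPI, so its unpack must
          ! wait for the corresponding receive cycle to complete
          n_external = n_external + 1
          external_indices(n_external) = ie
        else
          ! purely on-node element: it can be packed and unpacked while
          ! MPI and PCI-e traffic for the external elements is in flight
          n_internal = n_internal + 1
          internal_indices(n_internal) = ie
        endif
      enddo
    end subroutine classify_recv_elements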

@@ -568,7 +500,8 @@ subroutine euler_step_cuda( np1_qdp , n0_qdp , dt , elem , hvcoord , hybrid , de
integer :: ierr
type(dim3) :: blockdim , griddim

call t_startf('euler_step')
!call t_startf('euler_step')
call t_startf('euler_step_cuda')

if (limiter_option == 8) then
write(*,*) 'CUDA_MOD IS NOT INTENDED FOR USE WITH LIMITER_OPTION == 8 AT THIS TIME!'
@@ -633,7 +566,11 @@ subroutine euler_step_cuda( np1_qdp , n0_qdp , dt , elem , hvcoord , hybrid , de
enddo
call edgeVpack(edgeAdvDSS,DSSvar(:,:,1:nlev),nlev,0,elem(ie)%desc)
enddo

call t_startf('eus_cuda_bexchV')
call bndry_exchangeV(hybrid,edgeAdvDSS)
call t_stopf('eus_cuda_bexchV')

do ie = nets , nete
if ( DSSopt == DSSeta ) DSSvar => elem(ie)%derived%eta_dot_dpdn(:,:,:)
if ( DSSopt == DSSomega ) DSSvar => elem(ie)%derived%omega_p(:,:,:)
@@ -645,15 +582,20 @@ subroutine euler_step_cuda( np1_qdp , n0_qdp , dt , elem , hvcoord , hybrid , de
enddo
endif
!$OMP MASTER

call t_startf('eus_cuda_peus')
call pack_exchange_unpack_stage(np1_qdp,hybrid,qdp_d,timelevels)
call t_stopf('eus_cuda_peus')

blockdim = dim3( np*np , nlev , 1 )
griddim = dim3( qsize_d , nelemd , 1 )
call euler_hypervis_kernel_last<<<griddim,blockdim>>>( qdp_d , rspheremp_d , 1 , nelemd , np1_qdp ); _CHECK(__LINE__)
ierr = cudaThreadSynchronize()
!$OMP END MASTER
!$OMP BARRIER

call t_stopf('euler_step')
call t_stopf('euler_step_cuda')
!call t_stopf('euler_step')
end subroutine euler_step_cuda
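
For orientation, the launch configuration retained at the end of this routine maps threads to the (GLL point, level) pairs of one element and blocks to (tracer, element) pairs: blockdim = dim3(np*np, nlev, 1) and griddim = dim3(qsize_d, nelemd, 1). Below is a sketch of a kernel written against that thread mapping; the body is only a stand-in (a simple rspheremp scaling), not the actual euler_hypervis_kernel_last, and the parameter values are illustrative:

    ! Sketch: index decomposition for a <<< dim3(qsize_d,nelemd), dim3(np*np,nlev) >>> launch.
    module launch_map_sketch
      use cudafor
      implicit none
      integer, parameter :: np = 4, nlev = 30                        ! illustrative sizes
      integer, parameter :: qsize_d = 25, timelevels = 2, nelemd = 64
    contains
      attributes(global) subroutine scale_by_rspheremp(qdp, rspheremp, np1)
        real(8)        :: qdp(np,np,nlev,qsize_d,timelevels,nelemd)
        real(8)        :: rspheremp(np,np,nelemd)
        integer, value :: np1
        integer :: i, j, k, q, ie, ij
        ij = threadIdx%x               ! 1 .. np*np   : GLL point within the element
        i  = mod(ij-1, np) + 1
        j  = (ij-1)/np + 1
        k  = threadIdx%y               ! 1 .. nlev    : vertical level
        q  = blockIdx%x                ! 1 .. qsize_d : tracer
        ie = blockIdx%y                ! 1 .. nelemd  : element
        qdp(i,j,k,q,np1,ie) = qdp(i,j,k,q,np1,ie) * rspheremp(i,j,ie)
      end subroutine scale_by_rspheremp
    end module launch_map_sketch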


@@ -741,7 +683,10 @@ subroutine advance_hypervis_scalar_cuda( edgeAdv , elem , hvcoord , hybrid , der
deriv_dvv_d , hyai_d , hybi_d , hvcoord%ps0 , nets , nete , dt , nt_qdp , nu_p ); _CHECK(__LINE__)
ierr = cudaThreadSynchronize()

call t_startf('ahs_cuda_peus1')
call pack_exchange_unpack_stage(1,hybrid,qtens_d,1)
call t_stopf('ahs_cuda_peus1')

ierr = cudaThreadSynchronize()

!KERNEL 2
@@ -754,7 +699,10 @@ subroutine advance_hypervis_scalar_cuda( edgeAdv , elem , hvcoord , hybrid , der
call limiter2d_zero_kernel<<<griddim,blockdim,0,streams(1)>>>(qdp_d,nets,nete,nt_qdp); _CHECK(__LINE__)
ierr = cudaThreadSynchronize()

call t_startf('ahs_cuda_peus2')
call pack_exchange_unpack_stage(nt_qdp,hybrid,qdp_d,timelevels)
call t_stopf('ahs_cuda_peus2')

ierr = cudaThreadSynchronize()

!KERNEL 3
@@ -915,7 +863,7 @@ attributes(device) function divergence_sphere(i,j,ie,k,ij,Vstar,qtmp,metdet,rmet
divtemp = divtemp + deriv_dvv((i-1)*np+s) * gv((j-1)*np+s,k,1)
vvtemp = vvtemp + deriv_dvv((j-1)*np+s) * gv((s-1)*np+i,k,2)
end do
dp_star = ( divtemp + vvtemp ) * ( rmetdet(ij) * rrearth )
dp_star = ( divtemp + vvtemp ) * ( rmetdet(ij) * rrearth_d )
end function divergence_sphere
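
The rrearth_d used here is the renamed device copy of the inverse earth radius; with a distinct name, cuda_mod_init can import rrearth from physical_constants directly (no rrearth_mod rename) and assign it straight to the device module variable. A minimal sketch of that host-constant-to-device-variable pattern (the module name and numeric value below are placeholders, not taken from the diff):

    ! Sketch: keep the device copy of a host constant under a distinct name,
    ! then do one implicit host-to-device assignment at initialization.
    module sphere_const_sketch
      use cudafor
      implicit none
      integer, parameter :: real_kind = selected_real_kind(12)
      real(kind=real_kind), parameter :: rearth  = 6.376d6              ! placeholder radius [m]
      real(kind=real_kind), parameter :: rrearth = 1.0_real_kind/rearth
      real(kind=real_kind), device    :: rrearth_d                      ! device-resident copy
    contains
      subroutine sphere_const_init()
        rrearth_d = rrearth      ! scalar assignment performs the host-to-device copy
      end subroutine sphere_const_init
    end module sphere_const_sketch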


@@ -1030,18 +978,18 @@ subroutine pack_exchange_unpack_stage(np1,hybrid,array_in,tl_in)
griddim6 = dim3( qsize_d , recv_external_nelem , 1 )
call edgeVpack_kernel_stage<<<griddim6,blockdim6,0,streams(1)>>>(edgebuf_d,array_in,putmapP_d,reverse_d,nbuf,0,1,nelemd,np1,recv_external_indices_d,tl_in); _CHECK(__LINE__)
endif
do icycle = 1 , nSendCycles
pCycle => pSchedule%SendCycle(icycle)
iptr = pCycle%ptrP
ierr = cudaMemcpyAsync(sendbuf_h(1,1,icycle),edgebuf_d(1,iptr),size(sendbuf_h(1:nlyr,1:pCycle%lengthP,icycle)),cudaMemcpyDeviceToHost,streams(1)); _CHECK(__LINE__)
enddo
if ( recv_internal_nelem > 0 ) then
blockdim6 = dim3( np , np , nlev )
griddim6 = dim3( qsize_d , recv_internal_nelem , 1 )
call edgeVpack_kernel_stage<<<griddim6,blockdim6,0,streams(2)>>>(edgebuf_d,array_in,putmapP_d,reverse_d,nbuf,0,1,nelemd,np1,recv_internal_indices_d,tl_in); _CHECK(__LINE__)
call edgeVunpack_kernel_stage<<<griddim6,blockdim6,0,streams(2)>>>(edgebuf_d,array_in,getmapP_d,nbuf,0,1,nelemd,np1,recv_internal_indices_d,tl_in); _CHECK(__LINE__)
endif
do icycle = 1 , nSendCycles
pCycle => pSchedule%SendCycle(icycle)
iptr = pCycle%ptrP
ierr = cudaMemcpyAsync(sendbuf_h(1,1,icycle),edgebuf_d(1,iptr),size(sendbuf_h(1:nlyr,1:pCycle%lengthP,icycle)),cudaMemcpyDeviceToHost,streams(1)); _CHECK(__LINE__)
enddo
ierr = cudaThreadSynchronize()
ierr = cudaStreamSynchronize(streams(1))
do icycle = 1 , nSendCycles
pCycle => pSchedule%SendCycle(icycle)
dest = pCycle%dest - 1
@@ -1051,13 +999,13 @@ subroutine pack_exchange_unpack_stage(np1,hybrid,array_in,tl_in)
call MPI_Isend(sendbuf_h(1,1,icycle),length,MPIreal_t,dest,tag,hybrid%par%comm,Srequest(icycle),ierr)
enddo
call MPI_WaitAll(nRecvCycles,Rrequest,status,ierr)
call MPI_WaitAll(nSendCycles,Srequest,status,ierr)
!When this cycle's MPI transfer is complete, then call the H2D memcopy asynchronously
do icycle = 1 , nRecvCycles
pCycle => pSchedule%RecvCycle(icycle)
iptr = pCycle%ptrP
ierr = cudaMemcpyAsync(edgebuf_d(1,iptr),recvbuf_h(1,1,icycle),size(recvbuf_h(1:nlyr,1:pCycle%lengthP,icycle)),cudaMemcpyHostToDevice,streams(1)); _CHECK(__LINE__)
enddo
call MPI_WaitAll(nSendCycles,Srequest,status,ierr)
if ( recv_external_nelem > 0 ) then
blockdim6 = dim3( np , np , nlev )
griddim6 = dim3( qsize_d , recv_external_nelem , 1 )
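
The reordering in this hunk is the main performance change to the exchange: the device-to-host copies of the send buffers are queued on stream 1 right after the external pack, so they overlap with the internal pack/unpack kernels running on stream 2; the host then synchronizes only stream 1 (cudaStreamSynchronize instead of cudaThreadSynchronize) before posting the MPI_Isends; and the MPI_WaitAll on the send requests is deferred until after the host-to-device copies of the received data have been queued. A self-contained sketch of the underlying overlap idea, stripped of MPI and of the real buffer layout (all names below are illustrative):

    ! Sketch: overlap a device-to-host copy on stream 1 with independent
    ! device work on stream 2, then wait only on stream 1 before the host
    ! consumes the copied data (e.g. to hand it to MPI_Isend).
    module overlap_sketch_kernels
      use cudafor
      implicit none
    contains
      attributes(global) subroutine double_in_place(n, b)
        integer, value :: n
        real(8)        :: b(n)
        integer :: i
        i = (blockIdx%x - 1)*blockDim%x + threadIdx%x
        if (i <= n) b(i) = 2.0d0 * b(i)      ! stands in for the internal pack/unpack
      end subroutine double_in_place
    end module overlap_sketch_kernels

    program stream_overlap_sketch
      use cudafor
      use overlap_sketch_kernels
      implicit none
      integer, parameter :: n = 1024*1024
      real(8), device :: a_d(n), b_d(n)
      real(8), allocatable, pinned :: a_h(:)
      integer(kind=cuda_stream_kind) :: s1, s2
      integer :: ierr
      allocate(a_h(n))
      a_d = 1.0d0
      b_d = 2.0d0
      ierr = cudaStreamCreate(s1)
      ierr = cudaStreamCreate(s2)
      ! stream 1: start copying the "send buffer" back to pinned host memory
      ierr = cudaMemcpyAsync(a_h, a_d, n, cudaMemcpyDeviceToHost, s1)
      ! stream 2: independent "internal" work overlaps the copy
      call double_in_place<<<(n+255)/256, 256, 0, s2>>>(n, b_d)
      ! wait for the copy only; the stream-2 kernel keeps running
      ierr = cudaStreamSynchronize(s1)
      write(*,*) 'host buffer ready for MPI, sum =', sum(a_h)
      ierr = cudaDeviceSynchronize()
    end program stream_overlap_sketch

In the routine above, stream 2 carries the edgeVpack/edgeVunpack kernels for the internal elements while stream 1 carries the PCI-e traffic and the unpack for the external ones.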
@@ -1466,8 +1414,8 @@ attributes(device) function gradient_sphere(i,j,ie,iz,s,dinv,variable_hypervisco
dsdx00 = dsdx00 + deriv_dvv(l,i)*s(l,j,1,iz)
dsdy00 = dsdy00 + deriv_dvv(l,j)*s(i,l,1,iz)
enddo
ds(1) = ( dinv(i,j,1,1)*dsdx00 + dinv(i,j,2,1)*dsdy00 ) * rrearth * variable_hyperviscosity(i,j)
ds(2) = ( dinv(i,j,1,2)*dsdx00 + dinv(i,j,2,2)*dsdy00 ) * rrearth * variable_hyperviscosity(i,j)
ds(1) = ( dinv(i,j,1,1)*dsdx00 + dinv(i,j,2,1)*dsdy00 ) * rrearth_d * variable_hyperviscosity(i,j)
ds(2) = ( dinv(i,j,1,2)*dsdx00 + dinv(i,j,2,2)*dsdy00 ) * rrearth_d * variable_hyperviscosity(i,j)
end function gradient_sphere


@@ -1487,7 +1435,7 @@ attributes(device) function divergence_sphere_wk(i,j,ie,iz,tmp,s,dinv,spheremp,d
call syncthreads()
lapl = 0.0d0
do l = 1 , np
lapl = lapl - (spheremp(l,j)*s(l,j,1,iz)*deriv_dvv(i,l)+spheremp(i,l)*s(i,l,2,iz)*deriv_dvv(j,l)) * rrearth
lapl = lapl - (spheremp(l,j)*s(l,j,1,iz)*deriv_dvv(i,l)+spheremp(i,l)*s(i,l,2,iz)*deriv_dvv(j,l)) * rrearth_d
enddo
end function divergence_sphere_wk
