From dfead5d147517aa7013fdeaf22c44ba8f31e4464 Mon Sep 17 00:00:00 2001 From: Patrick Worley Date: Sat, 31 Jan 2015 21:52:32 -0500 Subject: [PATCH 1/4] enabling perf_mod to enable/disable GPTL timer calls The perf_mod wrapper on the GPTL timing library controls the enabling/disabling of t_startf/t_stopf calls in a number of ways. However, this control does not currently extend to direct calls of the underlaying GPTLstart/stop routines. This modification corrects this, allowing, for example, GPTL-instrumented versions of Trilinos to have timers enabled/disabled in the same way as in the rest of ACME. --- models/utils/timing/perf_mod.F90 | 40 ++++++++++++++++++-------------- models/utils/timing/private.h | 2 +- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/models/utils/timing/perf_mod.F90 b/models/utils/timing/perf_mod.F90 index 95660635b6d0..c5b6c85d3642 100644 --- a/models/utils/timing/perf_mod.F90 +++ b/models/utils/timing/perf_mod.F90 @@ -622,16 +622,13 @@ subroutine t_startf(event, handle) ! !----------------------------------------------------------------------- ! - if ((timing_initialized) .and. & - (timing_disable_depth .eq. 0) .and. & - (cur_timing_detail .le. timing_detail_limit)) then - - if ( present (handle) ) then - ierr = GPTLstart_handle(event, handle) - else - ierr = GPTLstart(event) - endif + if (.not. timing_initialized) return + if (timing_disable_depth > 0) return + if ( present (handle) ) then + ierr = GPTLstart_handle(event, handle) + else + ierr = GPTLstart(event) endif return @@ -660,16 +657,13 @@ subroutine t_stopf(event, handle) ! !----------------------------------------------------------------------- ! - if ((timing_initialized) .and. & - (timing_disable_depth .eq. 0) .and. & - (cur_timing_detail .le. timing_detail_limit)) then - - if ( present (handle) ) then - ierr = GPTLstop_handle(event, handle) - else - ierr = GPTLstop(event) - endif + if (.not. timing_initialized) return + if (timing_disable_depth > 0) return + if ( present (handle) ) then + ierr = GPTLstop_handle(event, handle) + else + ierr = GPTLstop(event) endif return @@ -774,6 +768,16 @@ subroutine t_adj_detailf(detail_adjustment) if (omp_in_parallel()) return #endif +! using disable/enable to implement timing_detail logic so also control +! direct GPTL calls (such as occur in Trilinos library) + if ((cur_timing_detail <= timing_detail_limit) .and. & + (cur_timing_detail + detail_adjustment > timing_detail_limit)) then + call t_disablef() + elseif ((cur_timing_detail > timing_detail_limit) .and. & + (cur_timing_detail + detail_adjustment <= timing_detail_limit)) then + call t_enablef() + endif + cur_timing_detail = cur_timing_detail + detail_adjustment return diff --git a/models/utils/timing/private.h b/models/utils/timing/private.h index eff11e827fdf..42a71de8ccef 100644 --- a/models/utils/timing/private.h +++ b/models/utils/timing/private.h @@ -32,7 +32,7 @@ #define MAX_STACK 128 /* longest timer name allowed (probably safe to just change) */ -#define MAX_CHARS 63 +#define MAX_CHARS 127 /* ** max allowable number of PAPI counters, or derived events. For convenience, From f8cb1a3717a4bf4fe91203ea48d28e0eeeb870a7 Mon Sep 17 00:00:00 2001 From: Patrick Worley Date: Sat, 31 Jan 2015 21:53:44 -0500 Subject: [PATCH 2/4] updating timing events and associated logic in drv Recent updates changed the logical structure of the driver routines, necessitating corresponding changes in the performance timing logic. --- models/drv/driver/ccsm_comp_mod.F90 | 20 +++++++++----------- models/drv/driver/ccsm_driver.F90 | 21 +++++++++++++++++++++ 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/models/drv/driver/ccsm_comp_mod.F90 b/models/drv/driver/ccsm_comp_mod.F90 index 3b43b83157d7..34fbffcfd337 100644 --- a/models/drv/driver/ccsm_comp_mod.F90 +++ b/models/drv/driver/ccsm_comp_mod.F90 @@ -578,6 +578,12 @@ subroutine ccsm_pre_init1() iam=comp_comm_iam(it)) if (iamroot_CPLID) output_perf = .true. + ! Timer initialization (has to be after mpi init) + call t_initf(NLFileName, LogPrint=.false., mpicom=mpicom_GLOID, & + MasterTask=iamroot_GLOID) + call t_startf('DRIVER_INIT') + call t_startf('ccsm_pre_init1') + if (iamin_CPLID) complist = trim(complist)//' cpl' comp_id(it) = CPLID @@ -725,6 +731,9 @@ subroutine ccsm_pre_init1() write(logunit,'(2A)') subname,' ESMF_INTERFACE is set' #endif endif + + call t_stopf('ccsm_pre_init1') + call t_stopf('DRIVER_INIT') ! ! When using io servers (pio_async_interface=.true.) the server tasks do not return from ! shr_pio_init2 @@ -747,19 +756,10 @@ subroutine ccsm_pre_init2() if (iamroot_CPLID) call seq_ccsm_printlogheader() - !---------------------------------------------------------- - !| Timer initialization (has to be after mpi init) - !---------------------------------------------------------- - - call t_initf(NLFileName, LogPrint=.false., mpicom=mpicom_GLOID, & - MasterTask=iamroot_GLOID) - if (iamin_CPLID) then call seq_io_cpl_init() endif - call t_startf('DRIVER_INIT') - !---------------------------------------------------------- !| Memory test !---------------------------------------------------------- @@ -1931,8 +1931,6 @@ subroutine ccsm_init() call shr_sys_flush(logunit) endif - call t_stopf ('DRIVER_INIT') - end subroutine ccsm_init !=============================================================================== diff --git a/models/drv/driver/ccsm_driver.F90 b/models/drv/driver/ccsm_driver.F90 index 695752d5e693..e81809e68fea 100644 --- a/models/drv/driver/ccsm_driver.F90 +++ b/models/drv/driver/ccsm_driver.F90 @@ -40,19 +40,33 @@ program ccsm_driver !-------------------------------------------------------------------------- ! Setup and initialize the communications and logging. !-------------------------------------------------------------------------- + ! Timer initialization has to be after mpi init, so called inside of + ! ccsm_pre_init1, as are t_startf for first two timers. When using pio async + ! option, servers do not return from ccsm_pre_init1, so t_stopf timers + ! are also called within ccsm_pre_init1. + !-------------------------------------------------------------------------- +! call t_startf('DRIVER_INIT') +! call t_startf('ccsm_pre_init1') call ccsm_pre_init1() +! call t_stopf('ccsm_pre_init1') +! call t_stopf('DRIVER_INIT') + call t_startf('DRIVER_INIT') !-------------------------------------------------------------------------- ! Initialize ESMF. This is done outside of the ESMF_INTERFACE ifdef ! because it is needed for the time manager, even if the ESMF_INTERFACE ! is not used. !-------------------------------------------------------------------------- + call t_startf('ESMF_Initialize') call ESMF_Initialize() + call t_stopf('ESMF_Initialize') !-------------------------------------------------------------------------- ! Read in the configuration information and initialize the time manager. !-------------------------------------------------------------------------- + call t_startf('ccsm_pre_init2') call ccsm_pre_init2() + call t_stopf('ccsm_pre_init2') #ifdef USE_ESMF_LIB @@ -74,8 +88,11 @@ program ccsm_driver ! cap component. !-------------------------------------------------------------------------- + call t_startf('ccsm_init_esmf') call ESMF_CplCompInitialize(drvcomp, rc=localrc) if (localrc /= 0) call shr_sys_abort('failed to esmf initialize') + call t_stopf('ccsm_init_esmf') + call t_stopf('DRIVER_INIT') call ESMF_CplCompRun(drvcomp, rc=localrc) if (localrc /= 0) call shr_sys_abort('failed to esmf run') @@ -89,7 +106,11 @@ program ccsm_driver ! If ESMF is not defined, then just call the initialize, run and finalize ! routines directly. !-------------------------------------------------------------------------- + call t_startf('ccsm_init') call ccsm_init() + call t_stopf('ccsm_init') + call t_stopf('DRIVER_INIT') + call ccsm_run() call ccsm_final() From 1d59b4ee6e9a8f7be35ac80bb94a0fdf99cf733b Mon Sep 17 00:00:00 2001 From: Patrick Worley Date: Sat, 31 Jan 2015 21:54:18 -0500 Subject: [PATCH 3/4] updating timing events and associated logic in pio Performance experiments have identified the utility of some additional timers in the PIO library. --- models/utils/pio/box_rearrange.F90.in | 47 +++++++++++++++++++++++++++ models/utils/pio/piodarray.F90.in | 14 ++++++++ 2 files changed, 61 insertions(+) diff --git a/models/utils/pio/box_rearrange.F90.in b/models/utils/pio/box_rearrange.F90.in index 49f89120108f..129ad466c6e9 100644 --- a/models/utils/pio/box_rearrange.F90.in +++ b/models/utils/pio/box_rearrange.F90.in @@ -81,6 +81,9 @@ module box_rearrange #else use pio_support, only : piodie, Debug, DebugIO, CheckMPIReturn, pio_fc_gather_offset +#endif +#ifdef TIMING + use perf_mod, only : t_startf, t_stopf !_EXTERNAL #endif use alloc_mod, only : alloc_check, dealloc_check use pio_spmd_utils, only : pio_swapm @@ -296,16 +299,31 @@ subroutine box_rearrange_comp2io_{TYPE} (IOsystem, ioDesc, s1, src, niodof, & #ifdef _USE_ALLTOALLW if (pio_option == COLLECTIVE) then +#ifdef TIMING + call t_startf("a2a_box_rear_comp2io_{TYPE}") +#endif call MPI_ALLTOALLW(src, a2a_sendcounts, a2a_displs, a2a_sendtypes, & dest, a2a_recvcounts, a2a_displs, a2a_recvtypes, & IOsystem%union_comm, ierror ) +#ifdef TIMING + call t_stopf("a2a_box_rear_comp2io_{TYPE}") +#endif call CheckMPIReturn('box_rearrange', ierror) + else #endif + +#ifdef TIMING + call t_startf("swapm_box_rear_comp2io_{TYPE}") +#endif call pio_swapm( nprocs, myrank, & src, ndof, a2a_sendcounts, a2a_displs, a2a_sendtypes, & dest, niodof, a2a_recvcounts, a2a_displs, a2a_recvtypes, & IOsystem%union_comm, pio_hs, pio_isend, pio_maxreq ) +#ifdef TIMING + call t_stopf("swapm_box_rear_comp2io_{TYPE}") +#endif + #ifdef _USE_ALLTOALLW endif #endif @@ -324,6 +342,9 @@ subroutine box_rearrange_comp2io_{TYPE} (IOsystem, ioDesc, s1, src, niodof, & endif #endif +#ifdef TIMING + call t_startf("p2p_box_rear_comp2io_{TYPE}") +#endif ! ! send data from comp procs ! @@ -379,6 +400,10 @@ subroutine box_rearrange_comp2io_{TYPE} (IOsystem, ioDesc, s1, src, niodof, & call dealloc_check(sreq, 'send requests') +#ifdef TIMING + call t_stopf("p2p_box_rear_comp2io_{TYPE}") +#endif + #if DEBUG_BARRIER call MPI_BARRIER(IOsystem%union_comm,ierror) call CheckMPIReturn(subName,ierror) @@ -562,16 +587,32 @@ subroutine box_rearrange_io2comp_{TYPE} (IOsystem,ioDesc,s1, iobuf,s2, compbuf, #ifdef _USE_ALLTOALLW if (pio_option == COLLECTIVE) then + +#ifdef TIMING + call t_startf("a2a_box_rear_io2comp_{TYPE}") +#endif call MPI_ALLTOALLW(iobuf, a2a_sendcounts, a2a_displs, a2a_sendtypes, & compbuf, a2a_recvcounts, a2a_displs, a2a_recvtypes, & IOsystem%union_comm, ierror ) +#ifdef TIMING + call t_stopf("a2a_box_rear_io2comp_{TYPE}") +#endif call CheckMPIReturn(subName, ierror) + else #endif + +#ifdef TIMING + call t_startf("swapm_box_rear_io2comp_{TYPE}") +#endif call pio_swapm( nprocs, myrank, & iobuf, niodof, a2a_sendcounts, a2a_displs, a2a_sendtypes, & compbuf, ndof, a2a_recvcounts, a2a_displs, a2a_recvtypes, & IOsystem%union_comm, pio_hs, pio_isend, pio_maxreq ) +#ifdef TIMING + call t_stopf("swapm_box_rear_io2comp_{TYPE}") +#endif + #ifdef _USE_ALLTOALLW endif #endif @@ -583,6 +624,9 @@ subroutine box_rearrange_io2comp_{TYPE} (IOsystem,ioDesc,s1, iobuf,s2, compbuf, else +#ifdef TIMING + call t_startf("p2p_box_rear_io2comp_{TYPE}") +#endif call alloc_check(rreq, num_iotasks, 'recv requests') ! @@ -642,6 +686,9 @@ subroutine box_rearrange_io2comp_{TYPE} (IOsystem,ioDesc,s1, iobuf,s2, compbuf, call dealloc_check(sreq,'send requests') endif +#ifdef TIMING + call t_stopf("p2p_box_rear_io2comp_{TYPE}") +#endif endif ! POINT_TO_POINT #endif /* not _MPISERIAL */ diff --git a/models/utils/pio/piodarray.F90.in b/models/utils/pio/piodarray.F90.in index ba2d15434685..e3451914cd7b 100644 --- a/models/utils/pio/piodarray.F90.in +++ b/models/utils/pio/piodarray.F90.in @@ -545,6 +545,7 @@ contains endif ! if(UseRearranger) #ifdef TIMING call t_stopf("pio_rearrange_write") + call t_startf("pre_pio_write_nf") #endif if (IOproc) then @@ -602,6 +603,7 @@ contains endif #ifdef TIMING + call t_stopf("pre_pio_write_nf") call t_startf("pio_write_nf") #endif ierr = write_nf(File,IOBUF,varDesc,iodesc,start,count, request) @@ -612,11 +614,17 @@ contains call dealloc_check(count) if(IOPROC) then +#ifdef TIMING + call t_startf("post_pio_write_nf") +#endif if(file%iotype==pio_iotype_pnetcdf) then call add_data_to_buffer(File, IOBUF, request) else if(Userearranger) then deallocate(iobuf) end if +#ifdef TIMING + call t_stopf("post_pio_write_nf") +#endif end if ! call MPI_Barrier(File%iosystem%comp_comm,ierr) @@ -1205,7 +1213,13 @@ contains this_buffsize = size(iobuf)*c_sizeof(iobuf(1)) file%buffsize=file%buffsize+this_buffsize total_buffsize = total_buffsize+this_buffsize +#ifdef TIMING + call t_startf("allred_add_data_to_buf") +#endif call MPI_ALLREDUCE(total_buffsize,maxbuffsize,1,MPI_INTEGER,MPI_MAX,file%iosystem%io_comm, mpierr) +#ifdef TIMING + call t_stopf("allred_add_data_to_buf") +#endif if(maxbuffsize > pio_buffer_size_limit) then call darray_write_complete(File) From 339d87d5f5fa6ceeb35e2247bd0c09520cf4f0fc Mon Sep 17 00:00:00 2001 From: Patrick Worley Date: Sat, 31 Jan 2015 21:55:14 -0500 Subject: [PATCH 4/4] adding a timer in the atmosphere Filling another hole in the timer coverage of potentially significant performance events (prim_printstate). --- .../atm/cam/src/dynamics/se/share/prim_state_mod.F90 | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/models/atm/cam/src/dynamics/se/share/prim_state_mod.F90 b/models/atm/cam/src/dynamics/se/share/prim_state_mod.F90 index 182d677b8bc4..071f2a210b00 100644 --- a/models/atm/cam/src/dynamics/se/share/prim_state_mod.F90 +++ b/models/atm/cam/src/dynamics/se/share/prim_state_mod.F90 @@ -35,6 +35,8 @@ module prim_state_mod ! ------------------------------ use reduction_mod, only : parallelmax,parallelmin ! ------------------------------ + use perf_mod, only: t_startf, t_stopf + ! ------------------------------ implicit none private character(len=*), private, parameter :: massfname = "mass.out" @@ -145,6 +147,8 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete, fvm) integer :: n0, nm1, pnm1, np1 integer :: npts,n,q + call t_startf('prim_printstate') + if (hybrid%masterthread) then write(iulog,*) "nstep=",tl%nstep," time=",Time_at(tl%nstep)/(24*3600)," [day]" end if @@ -428,7 +432,10 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete, fvm) endif - if (tl%nstep < tl%nstep0) return + if (tl%nstep < tl%nstep0) then + call t_stopf('prim_printstate') + return + endif ! ==================================================================== ! @@ -682,6 +689,8 @@ subroutine prim_printstate(elem, tl,hybrid,hvcoord,nets,nete, fvm) time0=time1 endif + call t_stopf('prim_printstate') + end subroutine prim_printstate