Merge branch 'worleyph/cime/repro_in_cpldiag' (PR #2407)

a) Use shr_reprosum_calc in seq_diag_avect_mct When BFBFLAG is set to true and when INFO_DBUG > 1, the routine seq_diag_avect_mct uses a reproducible sum algorithm that is not as accurate as the algorithm implemented in shr_reprosum_calc. In particular, when summing a vector of INFs, the current algorithm returns zero. Here we replace the existing algorithm with a call to shr_reprosum_calc. This change is BFB for standard usage (INFO_DBUG == 1). It is not BFB with respect to the associated diagnostic, written to cpl.log, when INFO_DBUG > 1. However, these diagnostics are not used in the simulation, and simulation results will BFB. b) Add option to allow INF or NaN summands in shr_reprosum_calc shr_reprosum_calc aborts if input summands include INF or NaN values. For debugging purposes, it can be useful to allow INF or NaN values, returning the IEEE standard results for such a situation (either NaN, positive INF, or negative INF, depending on the situation). An optional logical parameter, allow_infnan, is being added to the shr_reprosum_calc. When set to .true. the routine determines whether summands for an existing field contain NaN or INF values and returns the appropriate value without going through the reproducible sum algorithm (which is very slow and requires signficant memory when summing these special values). Other fields in a multiple field call to shr_reprosum_calc will be computed in the usual fashion. When allow_infnan == .false. or when the parameter is omitted, then the routine aborts with an informative error message when the input contain INF or NaN values, as is done currently. The default can be changed (from allow_infnan=.false. to allow_infnan=.true.) via a new optional parameter, repro_sum_allow_infnan_in, in shr_reprosum_setopts. A new drv_in namelist parameter, reprosum_allow_infnan, has also been added that will be passed to shr_reprosum_setopts to set the default. This can be set in user_nl_cpl. Since the default is not being changed, this change is BFB. If allow_infnan is set to .true., then runs that failed because of INFs or NaNs would now continue to run (longer), but jobs that did not fail with the original default will be BFB even with the default changed. [BFB] * worleyph/cime/repro_in_cpldiag: Minor cleanup Add option to allow INF or NaN summands in shr_reprosum_calc use shr_reprosum_calc in seq_diag_avect_mct
ESMCI · Jul 16, 2018 · 769b42e · 769b42e
2 parents 32e6a48 + 440db83
commit 769b42e
Show file tree

Hide file tree

Showing 5 changed files with 236 additions and 143 deletions.
diff --git a/src/drivers/mct/cime_config/namelist_definition_drv.xml b/src/drivers/mct/cime_config/namelist_definition_drv.xml
@@ -1470,6 +1470,19 @@
     </values>
   </entry>
 
+  <entry id="reprosum_allow_infnan">
+    <type>logical</type>
+    <category>reprosum</category>
+    <group>seq_infodata_inparm</group>
+    <desc>
+      Allow INF and NaN in summands
+      default: .false.
+    </desc>
+    <values>
+      <value>.false.</value>
+    </values>
+  </entry>
+
   <entry id="reprosum_diffmax">
     <type>real</type>
     <category>reprosum</category>

diff --git a/src/drivers/mct/main/cime_comp_mod.F90 b/src/drivers/mct/main/cime_comp_mod.F90
@@ -417,6 +417,7 @@ module cime_comp_mod
   logical  :: shr_map_dopole         ! logical for dopole in shr_map_mod
   logical  :: domain_check           ! .true.  => check consistency of domains
   logical  :: reprosum_use_ddpdd     ! setup reprosum, use ddpdd
+  logical  :: reprosum_allow_infnan  ! setup reprosum, allow INF and NaN in summands
   real(r8) :: reprosum_diffmax       ! setup reprosum, set rel_diff_max
   logical  :: reprosum_recompute     ! setup reprosum, recompute if tolerance exceeded
 
@@ -935,6 +936,7 @@ subroutine cime_pre_init2()
          wall_time_limit=wall_time_limit           , &
          force_stop_at=force_stop_at               , &
          reprosum_use_ddpdd=reprosum_use_ddpdd     , &
+         reprosum_allow_infnan=reprosum_allow_infnan, &
          reprosum_diffmax=reprosum_diffmax         , &
          reprosum_recompute=reprosum_recompute, &
          max_cplstep_time=max_cplstep_time)
@@ -946,6 +948,7 @@ subroutine cime_pre_init2()
 
     call shr_reprosum_setopts(&
          repro_sum_use_ddpdd_in    = reprosum_use_ddpdd, &
+         repro_sum_allow_infnan_in = reprosum_allow_infnan, &
          repro_sum_rel_diff_max_in = reprosum_diffmax, &
          repro_sum_recompute_in    = reprosum_recompute)
 

diff --git a/src/drivers/mct/main/seq_diag_mct.F90 b/src/drivers/mct/main/seq_diag_mct.F90
@@ -45,6 +45,7 @@ module seq_diag_mct
   use component_type_mod, only : COMPONENT_GET_DOM_CX, COMPONENT_GET_C2X_CX, &
        COMPONENT_GET_X2C_CX, COMPONENT_TYPE
   use seq_infodata_mod, only : seq_infodata_type, seq_infodata_getdata
+  use shr_reprosum_mod, only: shr_reprosum_calc
 
   implicit none
   save
@@ -2237,7 +2238,9 @@ SUBROUTINE seq_diag_avect_mct(infodata, id, av, dom, gsmap, comment)
     integer(in)                      :: iam         ! pe number
     integer(in)                      :: km,ka       ! field indices
     integer(in)                      :: ns          ! size of local AV
+    integer(in)                      :: rcode       ! allocate return code
     real(r8),                pointer :: weight(:)   ! weight
+    real(r8), allocatable            :: weighted_data(:,:) ! weighted data
     type(mct_string)                 :: mstring     ! mct char type
     character(CL)                    :: lcomment    ! should be long enough
     character(CL)                    :: itemc       ! string converted to char
@@ -2250,11 +2253,8 @@ SUBROUTINE seq_diag_avect_mct(infodata, id, av, dom, gsmap, comment)
     ! print instantaneous budget data
     !-------------------------------------------------------------------------------
 
-    call seq_comm_setptrs(ID,&
-         mpicom=mpicom, iam=iam)
-
-    call seq_infodata_GetData(infodata,&
-         bfbflag=bfbflag)
+    call seq_comm_setptrs(ID, mpicom=mpicom, iam=iam)
+    call seq_infodata_GetData(infodata, bfbflag=bfbflag)
 
     lcomment = ''
     if (present(comment)) then
@@ -2267,82 +2267,44 @@ SUBROUTINE seq_diag_avect_mct(infodata, id, av, dom, gsmap, comment)
     km = mct_aVect_indexRA(dom%data,'mask')
     ka = mct_aVect_indexRA(dom%data,afldname)
     kflds = mct_aVect_nRattr(AV)
-    allocate(sumbuf(kflds),sumbufg(kflds))
-
-    sumbuf =       0.0_r8
-
-    if (bfbflag) then
-
-       npts = mct_aVect_lsize(AV)
-       allocate(weight(npts))
-       weight(:) = 1.0_r8
-       do n = 1,npts
-          if (dom%data%rAttr(km,n) <= 1.0e-06_R8) then
-             weight(n) = 0.0_r8
-          else
-             weight(n) = dom%data%rAttr(ka,n)*shr_const_rearth*shr_const_rearth
-          endif
-       enddo
+    allocate(sumbufg(kflds),stat=rcode)
+    if (rcode /= 0) call shr_sys_abort(trim(subname)//' allocate sumbufg')
 
-       allocate(maxbuf(kflds),maxbufg(kflds))
-       maxbuf = 0.0_r8
-
-       do n = 1,npts
-          do k = 1,kflds
-             if (.not. shr_const_isspval(AV%rAttr(k,n))) then
-                maxbuf(k) = max(maxbuf(k),abs(AV%rAttr(k,n)*weight(n)))
-             endif
-          enddo
-       enddo
-
-       call shr_mpi_max(maxbuf,maxbufg,mpicom,subname,all=.true.)
-       call shr_mpi_sum(npts,nptsg,mpicom,subname,all=.true.)
+    npts = mct_aVect_lsize(AV)
+    allocate(weight(npts),stat=rcode)
+    if (rcode /= 0) call shr_sys_abort(trim(subname)//' allocate weight')
 
-       do k = 1,kflds
-          if (maxbufg(k) < 1000.0*TINY(maxbufg(k)) .or. &
-               maxbufg(k) > HUGE(maxbufg(k))/(2.0_r8*nptsg)) then
-             maxbufg(k) = 0.0_r8
-          else
-             maxbufg(k) = (1.1_r8) * maxbufg(k) * nptsg
-          endif
-       enddo
+    weight(:) = 1.0_r8
+    do n = 1,npts
+       if (dom%data%rAttr(km,n) <= 1.0e-06_R8) then
+          weight(n) = 0.0_r8
+       else
+          weight(n) = dom%data%rAttr(ka,n)*shr_const_rearth*shr_const_rearth
+       endif
+    enddo
 
-       allocate(isumbuf(kflds),isumbufg(kflds))
-       isumbuf = 0
-       ihuge = HUGE(isumbuf)
+    if (bfbflag) then
+       allocate(weighted_data(npts,kflds),stat=rcode)
+       if (rcode /= 0) call shr_sys_abort(trim(subname)//' allocate weighted_data')
 
+       weighted_data = 0.0_r8
        do n = 1,npts
           do k = 1,kflds
              if (.not. shr_const_isspval(AV%rAttr(k,n))) then
-                if (abs(maxbufg(k)) > 1000.0_r8 * TINY(maxbufg)) then
-                   isumbuf(k) = isumbuf(k) + int((AV%rAttr(k,n)*weight(n)/maxbufg(k))*ihuge,i8)
-                endif
+                weighted_data(n,k) = AV%rAttr(k,n)*weight(n)
              endif
           enddo
        enddo
 
-       call shr_mpi_sum(isumbuf,isumbufg,mpicom,subname)
+       call shr_reprosum_calc (weighted_data, sumbufg, npts, npts, kflds, &
+                               commid=mpicom)
 
-       do k = 1,kflds
-          sumbufg(k) = isumbufg(k)*maxbufg(k)/ihuge
-       enddo
-
-       deallocate(weight)
-       deallocate(maxbuf,maxbufg)
-       deallocate(isumbuf,isumbufg)
+       deallocate(weighted_data)
 
     else
-
-       npts = mct_aVect_lsize(AV)
-       allocate(weight(npts))
-       weight(:) = 1.0_r8
-       do n = 1,npts
-          if (dom%data%rAttr(km,n) <= 1.0e-06_R8) then
-             weight(n) = 0.0_r8
-          else
-             weight(n) = dom%data%rAttr(ka,n)*shr_const_rearth*shr_const_rearth
-          endif
-       enddo
+       allocate(sumbuf(kflds),stat=rcode)
+       if (rcode /= 0) call shr_sys_abort(trim(subname)//' allocate sumbuf')
+       sumbuf = 0.0_r8
 
        do n = 1,npts
           do k = 1,kflds
@@ -2355,9 +2317,10 @@ SUBROUTINE seq_diag_avect_mct(infodata, id, av, dom, gsmap, comment)
        !--- global reduction ---
        call shr_mpi_sum(sumbuf,sumbufg,mpicom,subname)
 
-       deallocate(weight)
+       deallocate(sumbuf)
 
     endif
+    deallocate(weight)
 
     if (iam == 0) then
        !      write(logunit,*) 'sdAV: *** writing ',trim(lcomment),': k fld min/max/sum ***'
@@ -2374,7 +2337,7 @@ SUBROUTINE seq_diag_avect_mct(infodata, id, av, dom, gsmap, comment)
        call shr_sys_flush(logunit)
     endif
 
-    deallocate(sumbuf,sumbufg)
+    deallocate(sumbufg)
 
 100 format('comm_diag ',a3,1x,a4,1x,i3,es26.19,1x,a,1x,a)
 101 format('comm_diag ',a3,1x,a4,1x,i3,es26.19,1x,a)

diff --git a/src/drivers/mct/shr/seq_infodata_mod.F90 b/src/drivers/mct/shr/seq_infodata_mod.F90
@@ -176,6 +176,7 @@ MODULE seq_infodata_mod
      logical                 :: mct_usevector   ! flag for mct vector
 
      logical                 :: reprosum_use_ddpdd  ! use ddpdd algorithm
+     logical                 :: reprosum_allow_infnan ! allow INF and NaN summands
      real(SHR_KIND_R8)       :: reprosum_diffmax    ! maximum difference tolerance
      logical                 :: reprosum_recompute  ! recompute reprosum with nonscalable algorithm
      ! if reprosum_diffmax is exceeded
@@ -412,6 +413,7 @@ SUBROUTINE seq_infodata_Init( infodata, nmlfile, ID, pioid, cpl_tag)
     real(SHR_KIND_R8)      :: eps_ogrid          ! ocn grid error tolerance
     real(SHR_KIND_R8)      :: eps_oarea          ! ocn area error tolerance
     logical                :: reprosum_use_ddpdd ! use ddpdd algorithm
+    logical                :: reprosum_allow_infnan ! allow INF and NaN summands
     real(SHR_KIND_R8)      :: reprosum_diffmax   ! maximum difference tolerance
     logical                :: reprosum_recompute ! recompute reprosum with nonscalable algorithm
     ! if reprosum_diffmax is exceeded
@@ -452,7 +454,8 @@ SUBROUTINE seq_infodata_Init( infodata, nmlfile, ID, pioid, cpl_tag)
          eps_frac, eps_amask,                   &
          eps_agrid, eps_aarea, eps_omask, eps_ogrid,       &
          eps_oarea, esmf_map_flag,                         &
-         reprosum_use_ddpdd, reprosum_diffmax, reprosum_recompute, &
+         reprosum_use_ddpdd, reprosum_allow_infnan,        &
+         reprosum_diffmax, reprosum_recompute,             &
          mct_usealltoall, mct_usevector, max_cplstep_time, model_doi_url
 
     !-------------------------------------------------------------------------------
@@ -560,6 +563,7 @@ SUBROUTINE seq_infodata_Init( infodata, nmlfile, ID, pioid, cpl_tag)
        eps_ogrid             = 1.0e-02_SHR_KIND_R8
        eps_oarea             = 1.0e-01_SHR_KIND_R8
        reprosum_use_ddpdd    = .false.
+       reprosum_allow_infnan = .false.
        reprosum_diffmax      = -1.0e-8
        reprosum_recompute    = .false.
        mct_usealltoall       = .false.
@@ -685,6 +689,7 @@ SUBROUTINE seq_infodata_Init( infodata, nmlfile, ID, pioid, cpl_tag)
        infodata%eps_ogrid             = eps_ogrid
        infodata%eps_oarea             = eps_oarea
        infodata%reprosum_use_ddpdd    = reprosum_use_ddpdd
+       infodata%reprosum_allow_infnan = reprosum_allow_infnan
        infodata%reprosum_diffmax      = reprosum_diffmax
        infodata%reprosum_recompute    = reprosum_recompute
        infodata%mct_usealltoall       = mct_usealltoall
@@ -977,7 +982,8 @@ SUBROUTINE seq_infodata_GetData_explicit( infodata, cime_model, case_name, case_
        lnd_nx, lnd_ny, rof_nx, rof_ny, ice_nx, ice_ny, ocn_nx, ocn_ny,    &
        glc_nx, glc_ny, eps_frac, eps_amask,                               &
        eps_agrid, eps_aarea, eps_omask, eps_ogrid, eps_oarea,             &
-       reprosum_use_ddpdd, reprosum_diffmax, reprosum_recompute,          &
+       reprosum_use_ddpdd, reprosum_allow_infnan,                         &
+       reprosum_diffmax, reprosum_recompute,                              &
        atm_resume, lnd_resume, ocn_resume, ice_resume,                    &
        glc_resume, rof_resume, wav_resume, cpl_resume,                    &
        mct_usealltoall, mct_usevector, max_cplstep_time, model_doi_url,   &
@@ -1085,6 +1091,7 @@ SUBROUTINE seq_infodata_GetData_explicit( infodata, cime_model, case_name, case_
     real(SHR_KIND_R8),      optional, intent(OUT) :: eps_ogrid               ! ocn grid error tolerance
     real(SHR_KIND_R8),      optional, intent(OUT) :: eps_oarea               ! ocn area error tolerance
     logical,                optional, intent(OUT) :: reprosum_use_ddpdd      ! use ddpdd algorithm
+    logical,                optional, intent(OUT) :: reprosum_allow_infnan   ! allow INF and NaN summands
     real(SHR_KIND_R8),      optional, intent(OUT) :: reprosum_diffmax        ! maximum difference tolerance
     logical,                optional, intent(OUT) :: reprosum_recompute      ! recompute if tolerance exceeded
     logical,                optional, intent(OUT) :: mct_usealltoall         ! flag for mct alltoall
@@ -1261,6 +1268,7 @@ SUBROUTINE seq_infodata_GetData_explicit( infodata, cime_model, case_name, case_
     if ( present(eps_ogrid)      ) eps_ogrid      = infodata%eps_ogrid
     if ( present(eps_oarea)      ) eps_oarea      = infodata%eps_oarea
     if ( present(reprosum_use_ddpdd)) reprosum_use_ddpdd = infodata%reprosum_use_ddpdd
+    if ( present(reprosum_allow_infnan)) reprosum_allow_infnan = infodata%reprosum_allow_infnan
     if ( present(reprosum_diffmax)  ) reprosum_diffmax   = infodata%reprosum_diffmax
     if ( present(reprosum_recompute)) reprosum_recompute = infodata%reprosum_recompute
     if ( present(mct_usealltoall)) mct_usealltoall = infodata%mct_usealltoall
@@ -1555,7 +1563,8 @@ SUBROUTINE seq_infodata_PutData_explicit( infodata, cime_model, case_name, case_
        lnd_nx, lnd_ny, rof_nx, rof_ny, ice_nx, ice_ny, ocn_nx, ocn_ny,    &
        glc_nx, glc_ny, eps_frac, eps_amask,                               &
        eps_agrid, eps_aarea, eps_omask, eps_ogrid, eps_oarea,             &
-       reprosum_use_ddpdd, reprosum_diffmax, reprosum_recompute,          &
+       reprosum_use_ddpdd, reprosum_allow_infnan,                         &
+       reprosum_diffmax, reprosum_recompute,                              &
        atm_resume, lnd_resume, ocn_resume, ice_resume,                    &
        glc_resume, rof_resume, wav_resume, cpl_resume,                    &
        mct_usealltoall, mct_usevector, glc_valid_input)
@@ -1661,6 +1670,7 @@ SUBROUTINE seq_infodata_PutData_explicit( infodata, cime_model, case_name, case_
     real(SHR_KIND_R8),      optional, intent(IN)    :: eps_ogrid          ! ocn grid error tolerance
     real(SHR_KIND_R8),      optional, intent(IN)    :: eps_oarea          ! ocn area error tolerance
     logical,                optional, intent(IN)    :: reprosum_use_ddpdd ! use ddpdd algorithm
+    logical,                optional, intent(IN)    :: reprosum_allow_infnan ! allow INF and NaN summands
     real(SHR_KIND_R8),      optional, intent(IN)    :: reprosum_diffmax   ! maximum difference tolerance
     logical,                optional, intent(IN)    :: reprosum_recompute ! recompute if tolerance exceeded
     logical,                optional, intent(IN)    :: mct_usealltoall    ! flag for mct alltoall
@@ -1835,6 +1845,7 @@ SUBROUTINE seq_infodata_PutData_explicit( infodata, cime_model, case_name, case_
     if ( present(eps_ogrid)      ) infodata%eps_ogrid      = eps_ogrid
     if ( present(eps_oarea)      ) infodata%eps_oarea      = eps_oarea
     if ( present(reprosum_use_ddpdd)) infodata%reprosum_use_ddpdd = reprosum_use_ddpdd
+    if ( present(reprosum_allow_infnan)) infodata%reprosum_allow_infnan = reprosum_allow_infnan
     if ( present(reprosum_diffmax)  ) infodata%reprosum_diffmax   = reprosum_diffmax
     if ( present(reprosum_recompute)) infodata%reprosum_recompute = reprosum_recompute
     if ( present(mct_usealltoall)) infodata%mct_usealltoall = mct_usealltoall
@@ -2257,6 +2268,7 @@ subroutine seq_infodata_bcast(infodata,mpicom)
     call shr_mpi_bcast(infodata%eps_ogrid,               mpicom)
     call shr_mpi_bcast(infodata%eps_oarea,               mpicom)
     call shr_mpi_bcast(infodata%reprosum_use_ddpdd,      mpicom)
+    call shr_mpi_bcast(infodata%reprosum_allow_infnan,   mpicom)
     call shr_mpi_bcast(infodata%reprosum_diffmax,        mpicom)
     call shr_mpi_bcast(infodata%reprosum_recompute,      mpicom)
     call shr_mpi_bcast(infodata%mct_usealltoall,         mpicom)
@@ -2931,6 +2943,7 @@ SUBROUTINE seq_infodata_print( infodata )
     write(logunit,F0R) subname,'eps_oarea                = ', infodata%eps_oarea
 
     write(logunit,F0L) subname,'reprosum_use_ddpdd       = ', infodata%reprosum_use_ddpdd
+    write(logunit,F0L) subname,'reprosum_allow_infnan    = ', infodata%reprosum_allow_infnan
     write(logunit,F0R) subname,'reprosum_diffmax         = ', infodata%reprosum_diffmax
     write(logunit,F0L) subname,'reprosum_recompute       = ', infodata%reprosum_recompute